pytabkit.bench.data package
Submodules
pytabkit.bench.data.common module
- class pytabkit.bench.data.common.SplitType
Bases: object
- DEFAULT = 'default-split'
- RANDOM = 'random-split'
- class pytabkit.bench.data.common.TaskSource
Bases: object
- AUTOML_CLASS_SMALL = 'automl-class-small'
- CUSTOM = 'custom'
- OPENML_CLASS = 'openml-class'
- OPENML_CLASS_BIN_EXTRA = 'openml-class-bin-extra'
- OPENML_REGRESSION = 'openml-reg'
- TABARENA_CLASS = 'tabarena-class'
- TABARENA_REG = 'tabarena-reg'
- UCI_BIN_CLASS = 'uci-bin-class'
- UCI_MULTI_CLASS = 'uci-multi-class'
- UCI_REGRESSION = 'uci-reg'
pytabkit.bench.data.get_uci module
- pytabkit.bench.data.get_uci.get_EEG_steady_state()
- pytabkit.bench.data.get_uci.get_KDC_4007()
- pytabkit.bench.data.get_uci.get_abalone()
- pytabkit.bench.data.get_uci.get_activity_recognition()
- pytabkit.bench.data.get_uci.get_air_quality()
- pytabkit.bench.data.get_uci.get_anuran_calls()
- pytabkit.bench.data.get_uci.get_appliances_energy()
- pytabkit.bench.data.get_uci.get_arabic_digit()
- pytabkit.bench.data.get_uci.get_artificial_characters()
- pytabkit.bench.data.get_uci.get_assamese_characters()
- pytabkit.bench.data.get_uci.get_australian_sign_language()
- pytabkit.bench.data.get_uci.get_avila()
- pytabkit.bench.data.get_uci.get_bach_chorals_harmony()
- pytabkit.bench.data.get_uci.get_bank_marketing()
- pytabkit.bench.data.get_uci.get_bejing_pm25()
- pytabkit.bench.data.get_uci.get_bike_sharing()
- pytabkit.bench.data.get_uci.get_bitcoin_heist()
- pytabkit.bench.data.get_uci.get_ble_rssi_indoor_location()
- pytabkit.bench.data.get_uci.get_blood_pressure()
- pytabkit.bench.data.get_uci.get_carbon_nanotubes()
- pytabkit.bench.data.get_uci.get_cargo_2000()
- pytabkit.bench.data.get_uci.get_census_income()
- pytabkit.bench.data.get_uci.get_character_trajectories()
- pytabkit.bench.data.get_uci.get_chess()
- pytabkit.bench.data.get_uci.get_chess_krvk()
- pytabkit.bench.data.get_uci.get_crop_mapping()
- pytabkit.bench.data.get_uci.get_crowd_sourced_mapping()
- pytabkit.bench.data.get_uci.get_cycle_power_plant()
- pytabkit.bench.data.get_uci.get_default_credit_card()
- pytabkit.bench.data.get_uci.get_eeg_eye_state()
- pytabkit.bench.data.get_uci.get_eeg_steady_state_visual()
- pytabkit.bench.data.get_uci.get_electrical_grid_stability_simulated()
- pytabkit.bench.data.get_uci.get_emg_for_gestures()
- pytabkit.bench.data.get_uci.get_emg_physical_action()
- pytabkit.bench.data.get_uci.get_epileptic_seizure_recognition()
- pytabkit.bench.data.get_uci.get_facebook_comment_volume()
- pytabkit.bench.data.get_uci.get_facebook_live_sellers_thailand()
- pytabkit.bench.data.get_uci.get_firewall()
- pytabkit.bench.data.get_uci.get_firm_teacher_clave()
- pytabkit.bench.data.get_uci.get_first_order_theorem_proving()
- pytabkit.bench.data.get_uci.get_five_cities_pm25()
- pytabkit.bench.data.get_uci.get_gas_sensor_drift()
- pytabkit.bench.data.get_uci.get_gas_turbine()
- pytabkit.bench.data.get_uci.get_gesture_phase_segmentation()
- pytabkit.bench.data.get_uci.get_gnfuv_unmanned_surface_vehicles()
- pytabkit.bench.data.get_uci.get_grammatical_facial_expressions()
- pytabkit.bench.data.get_uci.get_hiv_1_protease()
- pytabkit.bench.data.get_uci.get_htru2()
- pytabkit.bench.data.get_uci.get_human_activity_smartphone()
- pytabkit.bench.data.get_uci.get_indoor_channel_measurements()
- pytabkit.bench.data.get_uci.get_indoor_loc()
- pytabkit.bench.data.get_uci.get_indoor_loc_mag()
- pytabkit.bench.data.get_uci.get_indoor_user_movement_prediction()
- pytabkit.bench.data.get_uci.get_insurance_benchmark()
- pytabkit.bench.data.get_uci.get_isolet()
- pytabkit.bench.data.get_uci.get_landsat_satimage()
- pytabkit.bench.data.get_uci.get_letter_recognition()
- pytabkit.bench.data.get_uci.get_madelon()
- pytabkit.bench.data.get_uci.get_magic_gamma_telescope()
- pytabkit.bench.data.get_uci.get_metro_interstate_traffic_volume()
- pytabkit.bench.data.get_uci.get_meu_mobile_ksd()
- pytabkit.bench.data.get_uci.get_mushroom()
- pytabkit.bench.data.get_uci.get_musk()
- pytabkit.bench.data.get_uci.get_nomao()
- pytabkit.bench.data.get_uci.get_nursery()
- pytabkit.bench.data.get_uci.get_occupancy_detection()
- pytabkit.bench.data.get_uci.get_online_news_popularity()
- pytabkit.bench.data.get_uci.get_online_shoppers_attention()
- pytabkit.bench.data.get_uci.get_opportunity_activity()
- pytabkit.bench.data.get_uci.get_optical_recognition_handwritten_digits()
- pytabkit.bench.data.get_uci.get_oral_toxicity()
- pytabkit.bench.data.get_uci.get_ozone_level()
- pytabkit.bench.data.get_uci.get_page_blocks()
- pytabkit.bench.data.get_uci.get_parking_birmingham()
- pytabkit.bench.data.get_uci.get_parkinson()
- pytabkit.bench.data.get_uci.get_pen_recognition_handwritten_characters()
- pytabkit.bench.data.get_uci.get_phishing()
- pytabkit.bench.data.get_uci.get_pmu_ud()
- pytabkit.bench.data.get_uci.get_polish_companies_bankruptcy()
- pytabkit.bench.data.get_uci.get_protein_tertiary_structure()
- pytabkit.bench.data.get_uci.get_query_analytics()
- pytabkit.bench.data.get_uci.get_real_estate_value()
- pytabkit.bench.data.get_uci.get_seismic_bumps()
- pytabkit.bench.data.get_uci.get_seoul_bike_data()
- pytabkit.bench.data.get_uci.get_shill_bidding()
- pytabkit.bench.data.get_uci.get_simulated_falls()
- pytabkit.bench.data.get_uci.get_skill_craft()
- pytabkit.bench.data.get_uci.get_smartphone_human_activity()
- pytabkit.bench.data.get_uci.get_smartphone_human_activity_postural()
- pytabkit.bench.data.get_uci.get_sml2010()
- pytabkit.bench.data.get_uci.get_south_german_credit()
- pytabkit.bench.data.get_uci.get_spambase()
- pytabkit.bench.data.get_uci.get_superconductivity()
- pytabkit.bench.data.get_uci.get_tamilnadu_electricity()
- pytabkit.bench.data.get_uci.get_tarvel_review_ratings()
- pytabkit.bench.data.get_uci.get_thyroids()
- pytabkit.bench.data.get_uci.get_turkiye_student_evaluation()
- pytabkit.bench.data.get_uci.get_vicon_physical_action()
- pytabkit.bench.data.get_uci.get_wall_following_robot()
- pytabkit.bench.data.get_uci.get_wave_energy()
- pytabkit.bench.data.get_uci.get_waveform()
- pytabkit.bench.data.get_uci.get_wilt()
- pytabkit.bench.data.get_uci.get_wine_quality()
pytabkit.bench.data.import_talent_benchmark module
- pytabkit.bench.data.import_talent_benchmark.import_talent_benchmark(paths, talent_folder, source_name, allow_regression=True, allow_classification=True, normalize_y=False, min_n_samples=1, max_n_classes=100000, min_n_classes=0, remove_missing_cont=True, remove_duplicates=False, max_n_samples=None, ignore_above_n_classes=100000, dry_run=False)
- Parameters:
paths (Paths)
talent_folder (str)
source_name (str)
allow_regression (bool)
allow_classification (bool)
normalize_y (bool)
min_n_samples (int)
max_n_classes (int)
min_n_classes (int)
remove_missing_cont (bool)
remove_duplicates (bool)
max_n_samples (int | None)
ignore_above_n_classes (int)
dry_run (bool)
pytabkit.bench.data.import_tasks module
- class pytabkit.bench.data.import_tasks.PandasTask
Bases: object
- __init__(x_df, y_df, cat_indicator, task_type, more_info)
- Parameters:
x_df (DataFrame)
y_df (Series)
cat_indicator (List[bool])
task_type (str)
more_info (Dict)
- deduplicate()
- static from_openml_task_id(task_id)
- Parameters:
task_id (int)
- get_n_classes()
- get_n_samples()
- get_task(task_desc)
- Parameters:
task_desc (TaskDescription)
- Return type:
- limit_n_classes(max_n_classes)
- Parameters:
max_n_classes (int)
- normalize_regression_y()
- remove_missing_cont()
- subsample(max_size)
- Parameters:
max_size (int)
- pytabkit.bench.data.import_tasks.check_zero_hot(uci_base_path)
- pytabkit.bench.data.import_tasks.convert_to_class_numbers(y)
- pytabkit.bench.data.import_tasks.download_if_not_exists(url, dest)
- Parameters:
url (str)
dest (str)
- pytabkit.bench.data.import_tasks.extract_categories(X)
- pytabkit.bench.data.import_tasks.get_openml_ds_names(task_ids)
- Parameters:
task_ids (List[int])
- pytabkit.bench.data.import_tasks.get_openml_task_ids(suite_id)
- Parameters:
suite_id (str | int)
- Return type:
List[int]
- pytabkit.bench.data.import_tasks.import_from_csv(ds_path, task_type, task_desc, paths, default_split_idx=None, remove_duplicates=False)
- Parameters:
ds_path (Path | str)
task_type (TaskType)
task_desc (TaskDescription)
paths (Paths)
default_split_idx (int | None)
remove_duplicates (bool)
- pytabkit.bench.data.import_tasks.import_openml(task_ids, task_source_name, paths, cache_dir=None, normalize_y=False, min_n_samples=1, max_n_classes=100000, min_n_classes=0, remove_missing_cont=True, remove_duplicates=False, exclude_ds_names=None, max_n_samples=None, include_only_ds_names=None, rerun=False, ignore_above_n_classes=100000)
- Parameters:
task_ids (List[int])
task_source_name (str)
paths (Paths)
cache_dir (str | Path | None)
normalize_y (bool)
min_n_samples (int)
max_n_classes (int)
min_n_classes (int)
remove_missing_cont (bool)
remove_duplicates (bool)
exclude_ds_names (List[str] | None)
max_n_samples (int | None)
include_only_ds_names (List[str] | None)
rerun (bool)
ignore_above_n_classes (int)
- pytabkit.bench.data.import_tasks.import_uci_tasks(paths, remove_duplicates=False, rerun=False)
- Parameters:
paths (Paths)
remove_duplicates (bool)
- pytabkit.bench.data.import_tasks.set_openml_cache_dir(dir_name)
- Parameters:
dir_name (str | Path)
pytabkit.bench.data.paths module
- class pytabkit.bench.data.paths.Paths
Bases: object
This class provides paths where data can be stored. Its base path can be configured. It requires one base folder, which will have several subfolders: algs, tasks, task_collections, results, result_summaries, eval, plots, tmp, … By subclassing this class, specific folders can be relocated (e.g. to put data on an SSD).
- __init__(base_folder, tasks_folder=None, results_folder=None, result_summaries_folder=None, uci_download_folder=None)
- Parameters:
base_folder (str)
tasks_folder (str | None)
results_folder (str | None)
result_summaries_folder (str | None)
uci_download_folder (str | None)
- algs()
- Return type:
Path
- base()
- Return type:
Path
- eval()
- Return type:
Path
- static from_env_variables()
Construct a Paths object from environment variables if they are set. Otherwise, the base folder will either be taken from custom_paths.py, if available, or set to ‘./tab_bench_data’.
- Returns:
Paths object.
- Return type:
Paths
- new_tmp_folder()
- Return type:
- plots()
- Return type:
Path
- resources()
- resources_exp_it(exp_name, iteration)
- Parameters:
exp_name (str)
iteration (int)
- Return type:
Path
- result_summaries()
- Return type:
Path
- results()
- Return type:
Path
- results_alg_task(task_desc, alg_name, n_cv)
- Parameters:
task_desc (TaskDescription)
alg_name (str)
n_cv (int)
- Return type:
Path
- results_alg_task_split(task_desc, alg_name, n_cv, split_type, split_id)
- Parameters:
task_desc (TaskDescription)
alg_name (str)
n_cv (int)
split_type (str)
split_id (int)
- Return type:
Path
- results_task(task_desc)
- Parameters:
task_desc (TaskDescription)
- Return type:
Path
- summary_alg_task(task_desc, alg_name, n_cv)
- Parameters:
task_desc (TaskDescription)
alg_name (str)
n_cv (int)
- Return type:
Path
- task_collections()
- Return type:
Path
- task_source(task_source_name)
- Parameters:
task_source_name (str)
- Return type:
Path
- tasks()
- Return type:
Path
- tasks_task(task_desc)
- Parameters:
task_desc (TaskDescription)
- Return type:
Path
- times()
- Return type:
Path
- times_alg_task(alg_name, task_desc)
- Parameters:
alg_name (str)
task_desc (TaskDescription)
- tmp()
- Return type:
Path
- uci_download()
- Return type:
Path
pytabkit.bench.data.tasks module
- class pytabkit.bench.data.tasks.Task
Bases: object
Task (dataset with defined target variable), consisting of a task info and a dataset.
- __init__(task_info, ds)
- Parameters:
task_info (TaskInfo)
ds (DictDataset)
- class pytabkit.bench.data.tasks.TaskCollection
Bases: object
Collection (list) of TaskDescription objects with its own name (can be the name of the task source).
- __init__(coll_name, task_descs)
- Parameters:
coll_name (str) – Name of the task collection.
task_descs (List[TaskDescription]) – Task descriptions.
- static from_source(task_source, paths)
Create a task collection with all tasks from a given task source (that have been imported/saved with this task source name). The task collection will have the same name as the source.
- Parameters:
task_source (str) – Name of the task source.
paths (Paths) – Path configuration.
- Returns:
TaskCollection object.
- Return type:
TaskCollection
- class pytabkit.bench.data.tasks.TaskDescription
Bases: object
The minimal necessary information to identify a task, consisting of a task source and a task name. A task is a dataset with a specific target variable.
- __init__(task_source, task_name)
- Parameters:
task_source (str) – Name of the source where the task was retrieved from (see data.common.TaskSource).
task_name (str) – Name of the task (dataset).
- exists_task(paths)
Check if the task for this description is stored on disk.
- Parameters:
paths (Paths) – Path configuration.
- Returns:
True iff it exists.
- static from_dict(data)
Create from a dictionary.
- Parameters:
data (Dict) – Dictionary.
- Returns:
TaskDescription object.
- Return type:
TaskDescription
- load_info(paths)
Load the associated TaskInfo object.
- load_task(paths)
Load the associated Task object.
- Parameters:
paths (Paths) – Path configuration.
- Returns:
Task object.
- to_dict()
Convert to a dictionary for saving.
- Returns:
Dictionary with ‘task_source’ and ‘task_name’ entries.
- Return type:
Dict
- class pytabkit.bench.data.tasks.TaskInfo
Bases: object
Information about a task (without containing the dataset itself).
- __init__(task_desc, n_samples, tensor_infos, default_split_idx, more_info_dict, max_n_trainval=None)
- Parameters:
task_desc (TaskDescription) – Task description.
n_samples (int) – Number of samples.
tensor_infos (Dict[str, TensorInfo]) – Information about the tensors (x_cat, x_cont, y).
default_split_idx (int | None) – If the dataset has a default split, this is the index of the first test sample. We assume that in this case, the training part is stored before the test part.
more_info_dict (Dict | None) – Dictionary with more information that can be stored, for example about the original OpenML dataset id.
max_n_trainval (int | None) – Maximum number of samples used for training+validation in random splits. If None (default value), no maximum is imposed.
- static from_ds(task_desc, ds, default_split_idx=None, more_info_dict=None)
- Parameters:
task_desc (TaskDescription)
ds (DictDataset)
default_split_idx (int | None)
more_info_dict (Dict | None)
- Return type:
- get_ds_size_gb()
- Returns:
Dataset size in gigabytes, when stored in torch Tensors (8 bytes for categorical variables, 4 bytes for continuous variables).
- Return type:
float
- get_n_classes()
- Returns:
Number of classes for classification, or 0 for regression.
- Return type:
int
- get_random_splits(n_splits, trainval_fraction=0.8, train_fraction=0.75)
- Parameters:
n_splits (int)
trainval_fraction (float)
train_fraction (float)
- Return type:
List[SplitInfo]
- static load(paths, task_desc)
- Parameters:
paths (Paths)
task_desc (TaskDescription)
- load_task(paths)
Load the associated task.
- Parameters:
paths – Path configuration.
- Returns:
Task object.
- class pytabkit.bench.data.tasks.TaskPackage
Bases: object
Combines information about how to run a task on a benchmark.
pytabkit.bench.data.uci_file_ops module
- class pytabkit.bench.data.uci_file_ops.UCIVars
Bases: object
- binary_classification_data_folder = '../bin-class-data/'
- data_folder = '../data/'
- data_group_id = 0
- multiclass_classification_data_folder = '../multi-class-data/'
- raw_data_folder = '../raw-data/'
- regression_data_folder = '../regression-data/'
- statistics_filename = '../data_statistics.csv'
- pytabkit.bench.data.uci_file_ops.auto_replace_categories_in_mixed_data(data, column, separator, unknown_string='', unknown_replacement_value=0)
- pytabkit.bench.data.uci_file_ops.auto_replace_missing_in_mixed_data(data, unknown_string='?')
- pytabkit.bench.data.uci_file_ops.concat_files(source_filename_pattern, target_filename)
- pytabkit.bench.data.uci_file_ops.convert_replace_string_to_vector(string, separator)
- pytabkit.bench.data.uci_file_ops.convert_time_to_seconds(time, sep)
- pytabkit.bench.data.uci_file_ops.count_bin_columns(data)
- pytabkit.bench.data.uci_file_ops.download_and_save(url, filename)
- pytabkit.bench.data.uci_file_ops.get_categories_in_mixed_data(data, column)
- pytabkit.bench.data.uci_file_ops.get_category_replace_string(category_size, position, separator)
- pytabkit.bench.data.uci_file_ops.is_number(string, german_decimal)
- pytabkit.bench.data.uci_file_ops.load_mixed_raw_data(filename, sep, header=False)
- pytabkit.bench.data.uci_file_ops.load_raw_data(filename, sep, description_columns=0, date_column=-1, date_sep='', date_order='', time_column=-1, time_sep='', german_decimal=False, na_string='---', show_intermediate=False, header=False)
- pytabkit.bench.data.uci_file_ops.make_folder(folder)
- pytabkit.bench.data.uci_file_ops.move_label_in_front(data, label_column)
- pytabkit.bench.data.uci_file_ops.my_decode(x)
- pytabkit.bench.data.uci_file_ops.prepare_new_data_set_group_id()
- pytabkit.bench.data.uci_file_ops.remove_columns(data, columns)
- pytabkit.bench.data.uci_file_ops.remove_empty_columns(data)
- pytabkit.bench.data.uci_file_ops.remove_files(folder, filename_pattern)
- pytabkit.bench.data.uci_file_ops.remove_rows_with_label(data, label)
- pytabkit.bench.data.uci_file_ops.replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0)
- pytabkit.bench.data.uci_file_ops.replace_categories_in_file(filename, categories, separator)
- pytabkit.bench.data.uci_file_ops.replace_categories_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0)
- pytabkit.bench.data.uci_file_ops.replace_chars_in_file(filename, old_char, new_char)
- pytabkit.bench.data.uci_file_ops.replace_circulars_in_mixed_data(data, categories, column, separator, unknown_string='')
- pytabkit.bench.data.uci_file_ops.replace_isodate_by_day_in_mixed_data(data, column)
- pytabkit.bench.data.uci_file_ops.replace_manual_in_mixed_data(data, categories, column, replacement, separator, unknown_string='', unknown_replacement_value=0)
- pytabkit.bench.data.uci_file_ops.replace_ordinals_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0, begin_value=1)
- pytabkit.bench.data.uci_file_ops.replace_time_by_seconds_in_mixed_data(data, column, sep, rounded=1)
- pytabkit.bench.data.uci_file_ops.save_data_stats(data_stats)
- pytabkit.bench.data.uci_file_ops.save_data_to_file(data, filename, is_classification, is_regression=True, min_scale=-1.0, max_scale=1.0)
- pytabkit.bench.data.uci_file_ops.un_z_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.unarff_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.ungz_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.unrar_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.untar_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.unzip_raw_data(filename)
- pytabkit.bench.data.uci_file_ops.write_mixed_raw_data(filename, data, sep)