pytabkit.bench.data package

Submodules

pytabkit.bench.data.common module

class pytabkit.bench.data.common.SplitType

Bases: object

DEFAULT = 'default-split'
RANDOM = 'random-split'
class pytabkit.bench.data.common.TaskSource

Bases: object

AUTOML_CLASS_SMALL = 'automl-class-small'
CUSTOM = 'custom'
OPENML_CLASS = 'openml-class'
OPENML_CLASS_BIN_EXTRA = 'openml-class-bin-extra'
OPENML_REGRESSION = 'openml-reg'
TABARENA_CLASS = 'tabarena-class'
TABARENA_REG = 'tabarena-reg'
UCI_BIN_CLASS = 'uci-bin-class'
UCI_MULTI_CLASS = 'uci-multi-class'
UCI_REGRESSION = 'uci-reg'

pytabkit.bench.data.get_uci module

pytabkit.bench.data.get_uci.download_all_uci(paths)
Parameters:

paths (Paths)

pytabkit.bench.data.get_uci.get_EEG_steady_state()
pytabkit.bench.data.get_uci.get_KDC_4007()
pytabkit.bench.data.get_uci.get_abalone()
pytabkit.bench.data.get_uci.get_activity_recognition()
pytabkit.bench.data.get_uci.get_air_quality()
pytabkit.bench.data.get_uci.get_anuran_calls()
pytabkit.bench.data.get_uci.get_appliances_energy()
pytabkit.bench.data.get_uci.get_arabic_digit()
pytabkit.bench.data.get_uci.get_artificial_characters()
pytabkit.bench.data.get_uci.get_assamese_characters()
pytabkit.bench.data.get_uci.get_australian_sign_language()
pytabkit.bench.data.get_uci.get_avila()
pytabkit.bench.data.get_uci.get_bach_chorals_harmony()
pytabkit.bench.data.get_uci.get_bank_marketing()
pytabkit.bench.data.get_uci.get_bejing_pm25()
pytabkit.bench.data.get_uci.get_bike_sharing()
pytabkit.bench.data.get_uci.get_bitcoin_heist()
pytabkit.bench.data.get_uci.get_ble_rssi_indoor_location()
pytabkit.bench.data.get_uci.get_blood_pressure()
pytabkit.bench.data.get_uci.get_carbon_nanotubes()
pytabkit.bench.data.get_uci.get_cargo_2000()
pytabkit.bench.data.get_uci.get_census_income()
pytabkit.bench.data.get_uci.get_character_trajectories()
pytabkit.bench.data.get_uci.get_chess()
pytabkit.bench.data.get_uci.get_chess_krvk()
pytabkit.bench.data.get_uci.get_crop_mapping()
pytabkit.bench.data.get_uci.get_crowd_sourced_mapping()
pytabkit.bench.data.get_uci.get_cycle_power_plant()
pytabkit.bench.data.get_uci.get_default_credit_card()
pytabkit.bench.data.get_uci.get_eeg_eye_state()
pytabkit.bench.data.get_uci.get_eeg_steady_state_visual()
pytabkit.bench.data.get_uci.get_electrical_grid_stability_simulated()
pytabkit.bench.data.get_uci.get_emg_for_gestures()
pytabkit.bench.data.get_uci.get_emg_physical_action()
pytabkit.bench.data.get_uci.get_epileptic_seizure_recognition()
pytabkit.bench.data.get_uci.get_facebook_comment_volume()
pytabkit.bench.data.get_uci.get_facebook_live_sellers_thailand()
pytabkit.bench.data.get_uci.get_firewall()
pytabkit.bench.data.get_uci.get_firm_teacher_clave()
pytabkit.bench.data.get_uci.get_first_order_theorem_proving()
pytabkit.bench.data.get_uci.get_five_cities_pm25()
pytabkit.bench.data.get_uci.get_gas_sensor_drift()
pytabkit.bench.data.get_uci.get_gas_turbine()
pytabkit.bench.data.get_uci.get_gesture_phase_segmentation()
pytabkit.bench.data.get_uci.get_gnfuv_unmanned_surface_vehicles()
pytabkit.bench.data.get_uci.get_grammatical_facial_expressions()
pytabkit.bench.data.get_uci.get_hiv_1_protease()
pytabkit.bench.data.get_uci.get_htru2()
pytabkit.bench.data.get_uci.get_human_activity_smartphone()
pytabkit.bench.data.get_uci.get_indoor_channel_measurements()
pytabkit.bench.data.get_uci.get_indoor_loc()
pytabkit.bench.data.get_uci.get_indoor_loc_mag()
pytabkit.bench.data.get_uci.get_indoor_user_movement_prediction()
pytabkit.bench.data.get_uci.get_insurance_benchmark()
pytabkit.bench.data.get_uci.get_isolet()
pytabkit.bench.data.get_uci.get_landsat_satimage()
pytabkit.bench.data.get_uci.get_letter_recognition()
pytabkit.bench.data.get_uci.get_madelon()
pytabkit.bench.data.get_uci.get_magic_gamma_telescope()
pytabkit.bench.data.get_uci.get_metro_interstate_traffic_volume()
pytabkit.bench.data.get_uci.get_meu_mobile_ksd()
pytabkit.bench.data.get_uci.get_mushroom()
pytabkit.bench.data.get_uci.get_musk()
pytabkit.bench.data.get_uci.get_naval_propulsion()
pytabkit.bench.data.get_uci.get_nomao()
pytabkit.bench.data.get_uci.get_nursery()
pytabkit.bench.data.get_uci.get_occupancy_detection()
pytabkit.bench.data.get_uci.get_online_news_popularity()
pytabkit.bench.data.get_uci.get_online_shoppers_attention()
pytabkit.bench.data.get_uci.get_opportunity_activity()
pytabkit.bench.data.get_uci.get_optical_recognition_handwritten_digits()
pytabkit.bench.data.get_uci.get_oral_toxicity()
pytabkit.bench.data.get_uci.get_ozone_level()
pytabkit.bench.data.get_uci.get_page_blocks()
pytabkit.bench.data.get_uci.get_parking_birmingham()
pytabkit.bench.data.get_uci.get_parkinson()
pytabkit.bench.data.get_uci.get_pen_recognition_handwritten_characters()
pytabkit.bench.data.get_uci.get_phishing()
pytabkit.bench.data.get_uci.get_pmu_ud()
pytabkit.bench.data.get_uci.get_polish_companies_bankruptcy()
pytabkit.bench.data.get_uci.get_protein_tertiary_structure()
pytabkit.bench.data.get_uci.get_query_analytics()
pytabkit.bench.data.get_uci.get_real_estate_value()
pytabkit.bench.data.get_uci.get_seismic_bumps()
pytabkit.bench.data.get_uci.get_seoul_bike_data()
pytabkit.bench.data.get_uci.get_shill_bidding()
pytabkit.bench.data.get_uci.get_simulated_falls()
pytabkit.bench.data.get_uci.get_skill_craft()
pytabkit.bench.data.get_uci.get_smartphone_human_activity()
pytabkit.bench.data.get_uci.get_smartphone_human_activity_postural()
pytabkit.bench.data.get_uci.get_sml2010()
pytabkit.bench.data.get_uci.get_south_german_credit()
pytabkit.bench.data.get_uci.get_spambase()
pytabkit.bench.data.get_uci.get_superconductivity()
pytabkit.bench.data.get_uci.get_tamilnadu_electricity()
pytabkit.bench.data.get_uci.get_tarvel_review_ratings()
pytabkit.bench.data.get_uci.get_thyroids()
pytabkit.bench.data.get_uci.get_turkiye_student_evaluation()
pytabkit.bench.data.get_uci.get_vicon_physical_action()
pytabkit.bench.data.get_uci.get_wall_following_robot()
pytabkit.bench.data.get_uci.get_wave_energy()
pytabkit.bench.data.get_uci.get_waveform()
pytabkit.bench.data.get_uci.get_wilt()
pytabkit.bench.data.get_uci.get_wine_quality()

pytabkit.bench.data.import_talent_benchmark module

pytabkit.bench.data.import_talent_benchmark.import_talent_benchmark(paths, talent_folder, source_name, allow_regression=True, allow_classification=True, normalize_y=False, min_n_samples=1, max_n_classes=100000, min_n_classes=0, remove_missing_cont=True, remove_duplicates=False, max_n_samples=None, ignore_above_n_classes=100000, dry_run=False)
Parameters:
  • paths (Paths)

  • talent_folder (str)

  • source_name (str)

  • allow_regression (bool)

  • allow_classification (bool)

  • normalize_y (bool)

  • min_n_samples (int)

  • max_n_classes (int)

  • min_n_classes (int)

  • remove_missing_cont (bool)

  • remove_duplicates (bool)

  • max_n_samples (int | None)

  • ignore_above_n_classes (int)

  • dry_run (bool)

pytabkit.bench.data.import_tasks module

class pytabkit.bench.data.import_tasks.PandasTask

Bases: object

__init__(x_df, y_df, cat_indicator, task_type, more_info)
Parameters:
  • x_df (DataFrame)

  • y_df (Series)

  • cat_indicator (List[bool])

  • task_type (str)

  • more_info (Dict)

deduplicate()
static from_openml_task_id(task_id)
Parameters:

task_id (int)

get_n_classes()
get_n_samples()
get_task(task_desc)
Parameters:

task_desc (TaskDescription)

Return type:

Task

limit_n_classes(max_n_classes)
Parameters:

max_n_classes (int)

normalize_regression_y()
remove_missing_cont()
subsample(max_size)
Parameters:

max_size (int)

pytabkit.bench.data.import_tasks.check_zero_hot(uci_base_path)
pytabkit.bench.data.import_tasks.convert_to_class_numbers(y)
pytabkit.bench.data.import_tasks.download_if_not_exists(url, dest)
Parameters:
  • url (str)

  • dest (str)

pytabkit.bench.data.import_tasks.extract_categories(X)
pytabkit.bench.data.import_tasks.get_openml_ds_names(task_ids)
Parameters:

task_ids (List[int])

pytabkit.bench.data.import_tasks.get_openml_task_ids(suite_id)
Parameters:

suite_id (str | int)

Return type:

List[int]

pytabkit.bench.data.import_tasks.import_from_csv(ds_path, task_type, task_desc, paths, default_split_idx=None, remove_duplicates=False)
Parameters:
  • ds_path (Path | str)

  • task_type (TaskType)

  • task_desc (TaskDescription)

  • paths (Paths)

  • default_split_idx (int | None)

  • remove_duplicates (bool)

pytabkit.bench.data.import_tasks.import_openml(task_ids, task_source_name, paths, cache_dir=None, normalize_y=False, min_n_samples=1, max_n_classes=100000, min_n_classes=0, remove_missing_cont=True, remove_duplicates=False, exclude_ds_names=None, max_n_samples=None, include_only_ds_names=None, rerun=False, ignore_above_n_classes=100000)
Parameters:
  • task_ids (List[int])

  • task_source_name (str)

  • paths (Paths)

  • cache_dir (str | Path | None)

  • normalize_y (bool)

  • min_n_samples (int)

  • max_n_classes (int)

  • min_n_classes (int)

  • remove_missing_cont (bool)

  • remove_duplicates (bool)

  • exclude_ds_names (List[str] | None)

  • max_n_samples (int | None)

  • include_only_ds_names (List[str] | None)

  • rerun (bool)

  • ignore_above_n_classes (int)

pytabkit.bench.data.import_tasks.import_uci_tasks(paths, remove_duplicates=False, rerun=False)
Parameters:
  • paths (Paths)

  • remove_duplicates (bool)

pytabkit.bench.data.import_tasks.set_openml_cache_dir(dir_name)
Parameters:

dir_name (str | Path)

pytabkit.bench.data.paths module

class pytabkit.bench.data.paths.Paths

Bases: object

This class provides paths where data can be stored. Its base path can be configured. It requires one base folder, which will have several subfolders: algs, tasks, task_collections, results, result_summaries, eval, plots, tmp, … By subclassing this class, specific folders can be relocated (e.g., to put data on an SSD).

__init__(base_folder, tasks_folder=None, results_folder=None, result_summaries_folder=None, uci_download_folder=None)
Parameters:
  • base_folder (str)

  • tasks_folder (str | None)

  • results_folder (str | None)

  • result_summaries_folder (str | None)

  • uci_download_folder (str | None)

algs()
Return type:

Path

base()
Return type:

Path

eval()
Return type:

Path

static from_env_variables()

Construct a Paths object from environment variables if they are set. Otherwise, the base folder will either be taken from custom_paths.py, if available, or set to ‘./tab_bench_data’.

Returns:

Paths object.

Return type:

Paths

new_tmp_folder()
Return type:

TmpPathContextManager

plots()
Return type:

Path

resources()
resources_exp_it(exp_name, iteration)
Parameters:
  • exp_name (str)

  • iteration (int)

Return type:

Path

result_summaries()
Return type:

Path

results()
Return type:

Path

results_alg_task(task_desc, alg_name, n_cv)
Parameters:
  • task_desc

  • alg_name

  • n_cv

Return type:

Path

results_alg_task_split(task_desc, alg_name, n_cv, split_type, split_id)
Parameters:
  • task_desc (TaskDescription)

  • alg_name (str)

  • n_cv (int)

  • split_type (str)

  • split_id (int)

Return type:

Path

results_task(task_desc)
Parameters:

task_desc (TaskDescription)

Return type:

Path

summary_alg_task(task_desc, alg_name, n_cv)
Parameters:
  • task_desc

  • alg_name

  • n_cv

Return type:

Path

task_collections()
Return type:

Path

task_source(task_source_name)
Parameters:

task_source_name (str)

Return type:

Path

tasks()
Return type:

Path

tasks_task(task_desc)
Parameters:

task_desc (TaskDescription)

Return type:

Path

times()
Return type:

Path

times_alg_task(alg_name, task_desc)
Parameters:
  • alg_name

  • task_desc

tmp()
Return type:

Path

uci_download()
Return type:

Path

class pytabkit.bench.data.paths.TmpPathContextManager

Bases: object

Helper class: Context manager for creating temporary paths.

__init__(path)
Parameters:

path (Path)

pytabkit.bench.data.tasks module

class pytabkit.bench.data.tasks.Task

Bases: object

Task (dataset with defined target variable), consisting of a task info and a dataset.

__init__(task_info, ds)
Parameters:
  • task_info

  • ds

save(paths)
Parameters:

paths (Paths)

class pytabkit.bench.data.tasks.TaskCollection

Bases: object

Collection (list) of TaskDescription objects with its own name (can be the name of the task source).

__init__(coll_name, task_descs)
Parameters:
  • coll_name (str) – Name of the task collection.

  • task_descs (List[TaskDescription]) – Task descriptions.

static from_name(coll_name, paths)
Parameters:
  • coll_name (str)

  • paths (Paths)

Return type:

TaskCollection

static from_source(task_source, paths)

Create a task collection with all tasks from a given task source (that have been imported/saved with this task source name). The task collection will have the same name as the source.

Parameters:
  • task_source (str) – Name of the task source.

  • paths (Paths) – Path configuration.

Returns:

TaskCollection object.

Return type:

TaskCollection

load_infos(paths)
Parameters:

paths (Paths)

Return type:

List[TaskInfo]

save(paths)
Parameters:

paths (Paths)

class pytabkit.bench.data.tasks.TaskDescription

Bases: object

The minimal necessary information to identify a task, consisting of a task source and a task name. A task is a dataset with a specific target variable.

__init__(task_source, task_name)
Parameters:
  • task_source (str) – Name of the source where the task was retrieved from (see data.common.TaskSource).

  • task_name (str) – Name of the task (dataset).

exists_task(paths)

Check if the task for this description is stored on disk.

Parameters:

paths (Paths) – Path configuration.

Returns:

True iff it exists.

static from_dict(data)

Create from a dictionary.

Parameters:

data (Dict) – Dictionary.

Returns:

TaskDescription object.

Return type:

TaskDescription

load_info(paths)

Load the associated TaskInfo object.

Parameters:

paths (Paths) – Path configuration.

Returns:

Task info object.

Return type:

TaskInfo

load_task(paths)

Load the associated Task object.

Parameters:

paths (Paths) – Path configuration.

Returns:

Task object.

to_dict()

Convert to a dictionary for saving.

Returns:

Dictionary with ‘task_source’ and ‘task_name’ entries.

Return type:

Dict

class pytabkit.bench.data.tasks.TaskInfo

Bases: object

Information about a task (without containing the dataset itself).

__init__(task_desc, n_samples, tensor_infos, default_split_idx, more_info_dict, max_n_trainval=None)
Parameters:
  • task_desc (TaskDescription) – Task description.

  • n_samples (int) – Number of samples.

  • tensor_infos (Dict[str, TensorInfo]) – Information about the tensors (x_cat, x_cont, y).

  • default_split_idx (int | None) – If the dataset has a default split, this is the index of the first test sample. We assume that in this case, the training part is stored before the test part.

  • more_info_dict (Dict | None) – Dictionary with more information that can be stored, for example about the original OpenML dataset id.

  • max_n_trainval (int | None) – Maximum number of samples used for training+validation in random splits. If None (default value), no maximum is imposed.

static from_ds(task_desc, ds, default_split_idx=None, more_info_dict=None)
Parameters:
  • task_desc

  • ds

  • default_split_idx

  • more_info_dict

Return type:

TaskInfo

get_default_splits(n_splits)
Return type:

List[SplitInfo]

get_ds_size_gb()
Returns:

Dataset size in gigabytes, when stored in torch Tensors (8 bytes for categorical variables, 4 bytes for continuous variables).

Return type:

float

get_n_classes()
Returns:

Number of classes for classification, or 0 for regression.

Return type:

int

get_random_splits(n_splits, trainval_fraction=0.8, train_fraction=0.75)
Parameters:
  • n_splits (int)

  • trainval_fraction (float)

  • train_fraction (float)

Return type:

List[SplitInfo]

static load(paths, task_desc)
Parameters:
  • paths

  • task_desc

load_task(paths)

Load the associated task.

Parameters:

paths (Paths) – Path configuration.

Returns:

Task object.

Return type:

Task

save(paths)
Parameters:

paths (Paths)

class pytabkit.bench.data.tasks.TaskPackage

Bases: object

Combines information about how to run a task on a benchmark.

__init__(task_info, split_infos, n_cv, n_refit, paths, rerun, alg_name, save_y_pred)
Parameters:
  • task_info (TaskInfo)

  • split_infos (List[SplitInfo])

  • n_cv (int)

  • n_refit (int)

  • paths (Paths)

  • rerun (bool)

  • alg_name (str)

  • save_y_pred (bool)

pytabkit.bench.data.uci_file_ops module

class pytabkit.bench.data.uci_file_ops.UCIVars

Bases: object

binary_classification_data_folder = '../bin-class-data/'
data_folder = '../data/'
data_group_id = 0
multiclass_classification_data_folder = '../multi-class-data/'
raw_data_folder = '../raw-data/'
regression_data_folder = '../regression-data/'
statistics_filename = '../data_statistics.csv'
pytabkit.bench.data.uci_file_ops.auto_replace_categories_in_mixed_data(data, column, separator, unknown_string='', unknown_replacement_value=0)
pytabkit.bench.data.uci_file_ops.auto_replace_missing_in_mixed_data(data, unknown_string='?')
pytabkit.bench.data.uci_file_ops.concat_files(source_filename_pattern, target_filename)
pytabkit.bench.data.uci_file_ops.convert_replace_string_to_vector(string, separator)
pytabkit.bench.data.uci_file_ops.convert_time_to_seconds(time, sep)
pytabkit.bench.data.uci_file_ops.count_bin_columns(data)
pytabkit.bench.data.uci_file_ops.download_and_save(url, filename)
pytabkit.bench.data.uci_file_ops.get_categories_in_mixed_data(data, column)
pytabkit.bench.data.uci_file_ops.get_category_replace_string(category_size, position, separator)
pytabkit.bench.data.uci_file_ops.is_number(string, german_decimal)
pytabkit.bench.data.uci_file_ops.load_mixed_raw_data(filename, sep, header=False)
pytabkit.bench.data.uci_file_ops.load_raw_data(filename, sep, description_columns=0, date_column=-1, date_sep='', date_order='', time_column=-1, time_sep='', german_decimal=False, na_string='---', show_intermediate=False, header=False)
pytabkit.bench.data.uci_file_ops.make_folder(folder)
pytabkit.bench.data.uci_file_ops.move_label_in_front(data, label_column)
pytabkit.bench.data.uci_file_ops.my_decode(x)
pytabkit.bench.data.uci_file_ops.prepare_new_data_set_group_id()
pytabkit.bench.data.uci_file_ops.remove_columns(data, columns)
pytabkit.bench.data.uci_file_ops.remove_empty_columns(data)
pytabkit.bench.data.uci_file_ops.remove_files(folder, filename_pattern)
pytabkit.bench.data.uci_file_ops.remove_rows_with_label(data, label)
pytabkit.bench.data.uci_file_ops.replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0)
pytabkit.bench.data.uci_file_ops.replace_categories_in_file(filename, categories, separator)
pytabkit.bench.data.uci_file_ops.replace_categories_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0)
pytabkit.bench.data.uci_file_ops.replace_chars_in_file(filename, old_char, new_char)
pytabkit.bench.data.uci_file_ops.replace_circulars_in_mixed_data(data, categories, column, separator, unknown_string='')
pytabkit.bench.data.uci_file_ops.replace_isodate_by_day_in_mixed_data(data, column)
pytabkit.bench.data.uci_file_ops.replace_manual_in_mixed_data(data, categories, column, replacement, separator, unknown_string='', unknown_replacement_value=0)
pytabkit.bench.data.uci_file_ops.replace_ordinals_in_mixed_data(data, categories, column, separator, unknown_string='', unknown_replacement_value=0, begin_value=1)
pytabkit.bench.data.uci_file_ops.replace_time_by_seconds_in_mixed_data(data, column, sep, rounded=1)
pytabkit.bench.data.uci_file_ops.save_data_stats(data_stats)
pytabkit.bench.data.uci_file_ops.save_data_to_file(data, filename, is_classification, is_regression=True, min_scale=-1.0, max_scale=1.0)
pytabkit.bench.data.uci_file_ops.un_z_raw_data(filename)
pytabkit.bench.data.uci_file_ops.unarff_raw_data(filename)
pytabkit.bench.data.uci_file_ops.ungz_raw_data(filename)
pytabkit.bench.data.uci_file_ops.unrar_raw_data(filename)
pytabkit.bench.data.uci_file_ops.untar_raw_data(filename)
pytabkit.bench.data.uci_file_ops.unzip_raw_data(filename)
pytabkit.bench.data.uci_file_ops.write_mixed_raw_data(filename, data, sep)

Module contents