Spaces:
No application file
No application file
| # pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py | |
| """ | |
| This file includes the data processing for Tox21. | |
| As an input it takes a list of SMILES and it outputs a nested dictionary with | |
| SMILES and target names as keys. | |
| """ | |
| import os | |
| import argparse | |
| import numpy as np | |
| from src.data import create_descriptors, get_tox21_split | |
| from src.utils import ( | |
| TASKS, | |
| HF_TOKEN, | |
| write_pickle, | |
| create_dir, | |
| ) | |
# Command-line interface of the preprocessing script. The options are declared
# as (flag, keyword-arguments) pairs and registered in this order below.
_ARGUMENT_SPECS = (
    (
        "--data_folder",
        dict(
            type=str,
            default="data/",
            help="Folder containing the tox21_compoundData.csv file.",
        ),
    ),
    (
        "--save_folder",
        dict(
            type=str,
            default="data/",
            help="Folder to which preprocessed the data CSV and NPZ files should be saved.",
        ),
    ),
    (
        "--cv_fold",
        dict(
            type=int,
            default=4,
            help="Select fold used as validation set.",
        ),
    ),
    (
        "--feature_selection",
        dict(
            type=int,
            default=1,
            help="True (=1) to use feature selection.",
        ),
    ),
    (
        "--feature_selection_path",
        dict(
            type=str,
            default="feat_selection.npz",
            help="Filename for saving feature selections.",
        ),
    ),
    (
        "--min_var",
        dict(
            type=float,
            default=0.05,
            help="Minimum variance threshold for selecting features.",
        ),
    ),
    (
        "--max_corr",
        dict(
            type=float,
            default=0.95,
            help="Maximum correlation threshold for selecting features.",
        ),
    ),
    (
        "--ecdfs_path",
        dict(
            type=str,
            default="ecdfs.pkl",
            help="Filename to save ECDFs.",
        ),
    ),
    (
        "--ecfps_radius",
        dict(
            type=int,
            default=3,
            help="Radius used for creating ECFPs.",
        ),
    ),
    (
        "--ecfps_folds",
        dict(
            type=int,
            default=8192,
            help="Folds used for creating ECFPs.",
        ),
    ),
)

parser = argparse.ArgumentParser(
    description="Data preprocessing script for the Tox21 dataset"
)
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)
def main(args):
    """Preprocessing train/val data to use for TabPFN.

    1. Download Tox21 train/val data from HF.
    2. Create descriptors for the train split and persist the feature
       selection and ECDFs returned for it.
    3. Create descriptors for the validation split, reusing the feature
       selection and ECDFs obtained on the train split.
    4. Save features and labels of each split as an NPZ file in
       ``args.save_folder``.

    Args:
        args: Parsed command-line namespace. Reads ``cv_fold``,
            ``ecfps_radius``, ``ecfps_folds``, ``min_var``, ``max_corr``,
            ``feature_selection_path``, ``ecdfs_path`` and ``save_folder``.
    """
    # NOTE(review): args.feature_selection and args.data_folder are parsed but
    # never read here; confirm whether they are still meant to have an effect.
    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)

    feature_creation_kwargs = {
        "radius": args.ecfps_radius,
        "fpsize": args.ecfps_folds,
        "min_var": args.min_var,
        "max_corr": args.max_corr,
    }

    # "train" has to be processed first: the validation branch reuses the
    # feature selection and ECDFs produced for the train split.
    splits = ["train", "validation"]
    for split in splits:
        print(f"Preprocess {split} molecules")
        ds_split = ds[split]
        smiles = list(ds_split["smiles"])

        if split == "train":
            output = create_descriptors(
                smiles,
                return_feature_selection=True,
                return_ecdfs=True,
                **feature_creation_kwargs,
            )
            features = output.pop("features")
            feature_selection = output.pop("feature_selection")
            ecdfs = output.pop("ecdfs")

            np.savez(
                args.feature_selection_path,
                ecfps_selec=feature_selection["ecfps_selec"],
                tox_selec=feature_selection["tox_selec"],
            )
            print(f"Saved feature selection under {args.feature_selection_path}")

            write_pickle(args.ecdfs_path, ecdfs)
            print(f"Saved ECDFs under {args.ecdfs_path}")
        else:
            features = create_descriptors(
                smiles,
                ecdfs=ecdfs,
                feature_selection=feature_selection,
                **feature_creation_kwargs,
            )["features"]

        # One column per task, in the order given by TASKS.
        labels = np.stack(
            [ds_split[task].to_numpy() for task in TASKS],
            axis=1,
        )

        # Name the file after the fold that was actually used instead of a
        # hard-coded "cv4"; for the default --cv_fold=4 the name is unchanged.
        save_path = os.path.join(
            args.save_folder, f"tox21_{split}_cv{args.cv_fold}.npz"
        )
        with open(save_path, "wb") as f:
            np.savez(
                f,
                labels=labels,
                **features,
            )
        print(f"Saved preprocessed {split} split under {save_path}")

    print("Preprocessing finished successfully")
if __name__ == "__main__":
    args = parser.parse_args()

    # The ECDF and feature-selection filenames are joined onto the save folder.
    out_dir = args.save_folder
    args.ecdfs_path = os.path.join(out_dir, args.ecdfs_path)
    args.feature_selection_path = os.path.join(
        out_dir, args.feature_selection_path
    )

    # Prepare the output locations before main() writes to them.
    # NOTE(review): create_dir is a project helper; is_file=True presumably
    # creates the parent directory of the given file path (confirm).
    create_dir(out_dir)
    for artefact_path in (args.ecdfs_path, args.feature_selection_path):
        create_dir(artefact_path, is_file=True)

    main(args)