Spaces:

ml-jku
/

tox21_tabpfn_classifier

No application file

App Files Files Community

antoniaebner commited on Oct 29, 2025

Commit

7c1c2c8

1 Parent(s): 6df6d5b

add preprocessing

Browse files

Files changed (3) hide show

preprocess.py +180 -0
src/{preprocess.py → data.py} +32 -166
src/utils.py +10 -0

preprocess.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
+"""
+This file includes the data processing for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+import os
+import argparse
+import numpy as np
+from src.data import create_descriptors, get_tox21_split
+from src.utils import (
+    TASKS,
+    HF_TOKEN,
+    write_pickle,
+    create_dir,
+)
+parser = argparse.ArgumentParser(  # command-line interface of this preprocessing script
+    description="Data preprocessing script for the Tox21 dataset"
+)
+parser.add_argument(
+    "--data_folder",  # NOTE(review): nothing in this file reads args.data_folder
+    type=str,
+    default="data/",
+    help="Folder containing the tox21_compoundData.csv file.",
+)
+parser.add_argument(
+    "--save_folder",  # prefix of every output path built in main() and the __main__ block
+    type=str,
+    default="data/",
+    help="Folder to which preprocessed the data CSV and NPZ files should be saved.",
+)
+parser.add_argument(
+    "--cv_fold",  # forwarded to get_tox21_split(cvfold=...) in main()
+    type=int,
+    default=4,
+    help="Select fold used as validation set.",
+)
+parser.add_argument(
+    "--feature_selection",  # NOTE(review): parsed but not consulted anywhere in this file
+    type=int,
+    default=1,
+    help="True (=1) to use feature selection.",
+)
+parser.add_argument(
+    "--feature_selection_path",  # joined onto --save_folder in the __main__ block before use
+    type=str,
+    default="feat_selection.npz",
+    help="Filename for saving feature selections.",
+)
+parser.add_argument(
+    "--min_var",  # passed to create_descriptors as "min_var"
+    type=float,
+    default=0.05,
+    help="Minimum variance threshold for selecting features.",
+)
+parser.add_argument(
+    "--max_corr",  # passed to create_descriptors as "max_corr"
+    type=float,
+    default=0.95,
+    help="Maximum correlation threshold for selecting features.",
+)
+parser.add_argument(
+    "--ecdfs_path",  # joined onto --save_folder in the __main__ block before use
+    type=str,
+    default="ecdfs.pkl",
+    help="Filename to save ECDFs.",
+)
+parser.add_argument(
+    "--ecfps_radius",  # passed to create_descriptors as "radius"
+    type=int,
+    default=3,
+    help="Radius used for creating ECFPs.",
+)
+parser.add_argument(
+    "--ecfps_folds",  # passed to create_descriptors as "fpsize"
+    type=int,
+    default=8192,
+    help="Folds used for creating ECFPs.",
+)
+def main(args):
+    """Preprocessing train/val data to use for TabPFN.
+    1. Download Tox21 train/val data from HF
+    2. Preprocess dataset splits
+    """
+    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)
+    feature_creation_kwargs = {
+        "radius": args.ecfps_radius,
+        "fpsize": args.ecfps_folds,
+        "min_var": args.min_var,
+        "max_corr": args.max_corr,
+    }
+    splits = ["train", "validation"]
+    for split in splits:
+        print(f"Preprocess {split} molecules")
+        ds_split = ds[split]
+        smiles = list(ds_split["smiles"])
+        if split == "train":
+            output = create_descriptors(
+                smiles,
+                return_feature_selection=True,
+                return_ecdfs=True,
+                **feature_creation_kwargs,
+            )
+            features = output.pop("features")
+            feature_selection = output.pop("feature_selection")
+            ecdfs = output.pop("ecdfs")
+            np.savez(
+                args.feature_selection_path,
+                ecfps_selec=feature_selection["ecfps_selec"],
+                tox_selec=feature_selection["tox_selec"],
+            )
+            print(f"Saved feature selection under {args.feature_selection_path}")
+            write_pickle(args.ecdfs_path, ecdfs)
+            print(f"Saved ECDFs under {args.ecdfs_path}")
+        else:
+            features = create_descriptors(
+                smiles,
+                ecdfs=ecdfs,
+                feature_selection=feature_selection,
+                **feature_creation_kwargs,
+            )["features"]
+        labels = []
+        for task in TASKS:
+            labels.append(ds_split[task].to_numpy())
+        labels = np.stack(labels, axis=1)
+        save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
+        with open(save_path, "wb") as f:
+            np.savez(
+                f,
+                labels=labels,
+                **features,
+            )
+            print(f"Saved preprocessed {split} split under {save_path}")
+    print("Preprocessing finished successfully")
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.ecdfs_path = os.path.join(args.save_folder, args.ecdfs_path)
+    args.feature_selection_path = os.path.join(
+        args.save_folder, args.feature_selection_path
+    )
+    create_dir(args.save_folder)
+    create_dir(args.ecdfs_path, is_file=True)
+    create_dir(args.feature_selection_path, is_file=True)
+    main(args)

src/{preprocess.py → data.py} RENAMED Viewed

@@ -6,97 +6,23 @@ As an input it takes a list of SMILES and it outputs a nested dictionary with
 SMILES and target names as keys.
 """
-import os
-import argparse
 import json
 import numpy as np
 import pandas as pd
 from sklearn.feature_selection import VarianceThreshold
 from statsmodels.distributions.empirical_distribution import ECDF
-from datasets import load_dataset
 from rdkit import Chem, DataStructs
 from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
 from rdkit.Chem.rdchem import Mol
-from utils import (
-    TASKS,
-    HF_TOKEN,
     USED_200_DESCR,
     Standardizer,
-    write_pickle,
-)
-parser = argparse.ArgumentParser(
-    description="Data preprocessing script for the Tox21 dataset"
-)
-parser.add_argument(
-    "--data_folder",
-    type=str,
-    default="data/",
-)
-parser.add_argument(
-    "--save_folder",
-    type=str,
-    default="data/",
-)
-parser.add_argument(
-    "--use_hf",
-    type=int,
-    default=0,
-)
-parser.add_argument(
-    "--path_ecdfs",
-    type=str,
-    default="ecdfs.pkl",
-)
-parser.add_argument(
-    "--path_feat_selec",
-    type=str,
-    default="feat_selection.npz",
-)
-parser.add_argument(
-    "--tox_smarts_filepath",
-    type=str,
-    default="tox_smarts.json",
-)
-parser.add_argument(
-    "--feature_selection",
-    type=int,
-    default=1,
-)
-parser.add_argument(
-    "--min_var",
-    type=float,
-    default=0.05,
-)
-parser.add_argument(
-    "--max_corr",
-    type=float,
-    default=0.95,
-)
-parser.add_argument(
-    "--ecfps_radius",
-    type=int,
-    default=3,
-)
-parser.add_argument(
-    "--ecfps_folds",
-    type=int,
-    default=8192,
 )
@@ -128,7 +54,7 @@ def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray
     return mols, np.array(clean_mol_mask)
-def create_ecfp_fps(mols: list[Mol], radius=None, fpsize=None) -> np.ndarray:
     """This function ECFP fingerprints for a list of molecules.
     Args:
@@ -139,13 +65,10 @@ def create_ecfp_fps(mols: list[Mol], radius=None, fpsize=None) -> np.ndarray:
     """
     ecfps = list()
-    kwargs = {}
-    if not fpsize is None:
-        kwargs["fpSize"] = fpsize
-    if not radius is None:
-        kwargs["radius"] = radius
     for mol in mols:
-        gen = rdFingerprintGenerator.GetMorganGenerator(countSimulation=True, **kwargs)
         fp_sparse_vec = gen.GetCountFingerprint(mol)
         fp = np.zeros((0,), np.int8)
@@ -283,15 +206,16 @@ def create_descriptors(
     feature_selection=None,
     return_ecdfs=False,
     return_feature_selection=False,
 ):
     # Create cleaned rdkit mol objects
     mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
     print("Cleaned molecules")
-    tox_patterns = get_tox_patterns(args.tox_smarts_filepath)
     # Create fingerprints and descriptors
-    ecfps = create_ecfp_fps(mols, radius=args.ecfps_radius, fpsize=args.ecfps_folds)
     # expand using mol_mask
     ecfps = fill(ecfps, ~clean_mol_mask)
     print("Created ECFP fingerprints")
@@ -303,8 +227,8 @@ def create_descriptors(
     # Create and save feature selection for ecfps and tox
     if feature_selection is None:
         print("Create Feature selection")
-        ecfps_selec = get_feature_selection(ecfps, args.min_var, args.max_corr)
-        tox_selec = get_feature_selection(tox, args.min_var, args.max_corr)
         feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
     else:
@@ -351,7 +275,7 @@ def create_descriptors(
 def get_feature_selection(
-    raw_features: np.ndarray, min_var=0.01, max_corr=0.95
 ) -> np.ndarray:
     # select features with at least min_var variation
     var_thresh = VarianceThreshold(threshold=min_var)
@@ -372,86 +296,28 @@ def get_feature_selection(
     return feature_selection
-def main(args):
-    """Preprocessing train/val data to use for TabPFN.
-    1. Download Tox21 train/val data from HF & CVfolds used in DeepTox
-    2. Combine datasets & re-split data. New validation split is CVfold=4
-    3. Preprocess dataset splits
-    """
-    splits = ["train", "validation", "test"]  # TODO: remove test
-    if args.use_hf:
-        ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
-    else:
-        ds = {}
-        for split in splits:
-            if split == "train":
-                ds[split] = pd.read_csv(
-                    os.path.join(args.data_folder, f"tox21_{split}_cv4.csv")
-                )
-            else:
-                ds[split] = pd.read_csv(
-                    os.path.join(args.data_folder, f"tox21_{split}_cv4.csv")
-                )
-    for split in splits:
-        print(f"Preprocess {split} molecules")
-        smiles = list(ds[split]["smiles"])
-        if split == "train":
-            output = create_descriptors(
-                smiles, return_feature_selection=True, return_ecdfs=True
-            )
-            features = output.pop("features")
-            feature_selection = output.pop("feature_selection")
-            ecdfs = output.pop("ecdfs")
-            np.savez(
-                args.path_feat_selec,
-                ecfps_selec=feature_selection["ecfps_selec"],
-                tox_selec=feature_selection["tox_selec"],
-            )
-            print(f"Saved feature selection under {args.path_feat_selec}")
-            write_pickle(args.path_ecdfs, ecdfs)
-            print(f"Saved ECDFs under {args.path_ecdfs}")
-        else:
-            features = create_descriptors(
-                smiles, ecdfs=ecdfs, feature_selection=feature_selection
-            )["features"]
-        labels = []
-        for task in TASKS:
-            datasplit = ds[split].to_pandas() if args.use_hf else ds[split]
-            labels.append(datasplit[task].to_numpy())
-        labels = np.stack(labels, axis=1)
-        save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
-        with open(save_path, "wb") as f:
-            np.savez(
-                f,
-                labels=labels,
-                **features,
-            )
-            print(f"Saved preprocessed {split} split under {save_path}")
-    print("Preprocessing finished successfully")
-if __name__ == "__main__":
-    args = parser.parse_args()
-    if not os.path.exists(args.save_folder):
-        os.makedirs(args.save_folder)
-    args.path_ecdfs = os.path.join(args.save_folder, args.path_ecdfs)
-    args.path_feat_selec = os.path.join(args.save_folder, args.path_feat_selec)
-    args.tox_smarts_filepath = os.path.join(args.data_folder, args.tox_smarts_filepath)
-    if not os.path.exists(os.path.dirname(args.path_ecdfs)):
-        os.makedirs(os.path.dirname(args.path_ecdfs))
-    main(args)

 SMILES and target names as keys.
 """
 import json
 import numpy as np
 import pandas as pd
+from datasets import load_dataset
 from sklearn.feature_selection import VarianceThreshold
 from statsmodels.distributions.empirical_distribution import ECDF
 from rdkit import Chem, DataStructs
 from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
 from rdkit.Chem.rdchem import Mol
+from .utils import (
     USED_200_DESCR,
+    TOX_SMARTS_PATH,
     Standardizer,
 )
     return mols, np.array(clean_mol_mask)
+def create_ecfp_fps(mols: list[Mol], radius=3, fpsize=2048, **kwargs) -> np.ndarray:
     """This function ECFP fingerprints for a list of molecules.
     Args:
     """
     ecfps = list()
     for mol in mols:
+        gen = rdFingerprintGenerator.GetMorganGenerator(
+            countSimulation=True, fpSize=fpsize, radius=radius
+        )
         fp_sparse_vec = gen.GetCountFingerprint(mol)
         fp = np.zeros((0,), np.int8)
     feature_selection=None,
     return_ecdfs=False,
     return_feature_selection=False,
+    **kwargs,
 ):
     # Create cleaned rdkit mol objects
     mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
     print("Cleaned molecules")
+    tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
     # Create fingerprints and descriptors
+    ecfps = create_ecfp_fps(mols, **kwargs)
     # expand using mol_mask
     ecfps = fill(ecfps, ~clean_mol_mask)
     print("Created ECFP fingerprints")
     # Create and save feature selection for ecfps and tox
     if feature_selection is None:
         print("Create Feature selection")
+        ecfps_selec = get_feature_selection(ecfps, **kwargs)
+        tox_selec = get_feature_selection(tox, **kwargs)
         feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
     else:
 def get_feature_selection(
+    raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
 ) -> np.ndarray:
     # select features with at least min_var variation
     var_thresh = VarianceThreshold(threshold=min_var)
     return feature_selection
+def get_tox21_split(token, cvfold=None):
+    ds = load_dataset("tschouis/tox21", token=token)
+    train_df = ds["train"].to_pandas()
+    val_df = ds["validation"].to_pandas()
+    if cvfold is None:
+        return {"train": train_df, "validation": val_df}
+    combined_df = pd.concat([train_df, val_df], ignore_index=True)
+    cvfold = float(cvfold)
+    # create new splits
+    cvfold = float(cvfold)
+    train_df = combined_df[combined_df.CVfold != cvfold]
+    val_df = combined_df[combined_df.CVfold == cvfold]
+    # exclude train mols that occur in the validation split
+    val_inchikeys = set(val_df["inchikey"])
+    train_df = train_df[~train_df["inchikey"].isin(val_inchikeys)]
+    return {
+        "train": train_df.reset_index(drop=True),
+        "validation": val_df.reset_index(drop=True),
+    }

src/utils.py CHANGED Viewed

@@ -28,6 +28,8 @@ TASKS = [
     "SR-p53",
 ]
 KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
 USED_200_DESCR = [
@@ -441,3 +443,11 @@ def load_pickle(path: str):
 def write_pickle(path: str, obj: object):
     with open(path, "wb") as file:
         pickle.dump(obj, file)

     "SR-p53",
 ]
+TOX_SMARTS_PATH = "data/tox_smarts.json"
 KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
 USED_200_DESCR = [
 def write_pickle(path: str, obj: object):
     with open(path, "wb") as file:
         pickle.dump(obj, file)
+def create_dir(path, is_file=False):
+    """Creates the parent directories if a path to a file is given, else create the given directory"""
+    to_create = os.path.dirname(path) if is_file else path
+    if not os.path.exists(to_create):
+        os.makedirs(to_create)