antoniaebner committed on
Commit
f0ecde9
·
1 Parent(s): 1561c1d

add preprocessing pipeline

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -3
  2. src/preprocess_old.py +513 -0
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
  fastapi
2
  uvicorn[standard]
3
- statsmodels
4
- rdkit
5
  numpy==2.2.6
6
  scikit-learn==1.6.1
7
  joblib
8
  tabulate
9
- datasets
10
  scipy==1.16.1
11
  pandas==2.3.2
12
  tabpfn==2.2.1
 
1
  fastapi
2
  uvicorn[standard]
3
+ statsmodels==0.14.5
4
+ rdkit==2025.03.5
5
  numpy==2.2.6
6
  scikit-learn==1.6.1
7
  joblib
8
  tabulate
9
+ datasets==4.0.0
10
  scipy==1.16.1
11
  pandas==2.3.2
12
  tabpfn==2.2.1
src/preprocess_old.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py

"""
This file implements the data preprocessing for Tox21.

As input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""

import os
import argparse
import json
from typing import Iterable

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from statsmodels.distributions.empirical_distribution import ECDF
from datasets import load_dataset

from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
from rdkit.Chem.rdchem import Mol

from utils import (
    TASKS,
    KNOWN_DESCR,
    HF_TOKEN,
    USED_200_DESCR,
    Standardizer,
    load_pickle,
    write_pickle,
)

# Command-line interface. Note: the path-like flags below are relative names;
# they are joined onto --save_folder / --data_folder in the __main__ block.
parser = argparse.ArgumentParser(
    description="Data preprocessing script for the Tox21 dataset"
)

# Folder containing the raw tox21_<split>_cv4.csv files.
parser.add_argument(
    "--data_folder",
    type=str,
    default="data/",
)

# Folder where the preprocessed .npz files and fitted artifacts are written.
parser.add_argument(
    "--save_folder",
    type=str,
    default="data/",
)

# 1: load splits from the Hugging Face hub; 0: read local CSVs.
parser.add_argument(
    "--use_hf",
    type=int,
    default=0,
)

# Filename for the pickled ECDFs fitted on the train split.
parser.add_argument(
    "--path_ecdfs",
    type=str,
    default="ecdfs.pkl",
)

# Filename for the .npz holding the selected feature indices.
parser.add_argument(
    "--path_feat_selec",
    type=str,
    default="feat_selection.npz",
)

# JSON file of (name, SMARTS) pairs used to build the tox features.
parser.add_argument(
    "--tox_smarts_filepath",
    type=str,
    default="tox_smarts.json",
)

# 1: apply variance/correlation-based feature selection; 0: keep all features.
parser.add_argument(
    "--feature_selection",
    type=int,
    default=1,
)

# Minimum per-feature variance kept by the VarianceThreshold step.
parser.add_argument(
    "--min_var",
    type=float,
    default=0.05,
)

# Maximum allowed pairwise correlation between kept features.
parser.add_argument(
    "--max_corr",
    type=float,
    default=0.95,
)

# Morgan fingerprint radius; None lets RDKit use its default.
parser.add_argument(
    "--ecfps_radius",
    type=int,
    default=None,
)

# Morgan fingerprint length (passed as fpSize to the generator).
parser.add_argument(
    "--ecfps_folds",
    type=int,
    default=8192,
)
107
def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
    """This function creates cleaned RDKit mol objects from a list of SMILES.

    Args:
        smiles (list[str]): list of SMILES

    Returns:
        list[Mol]: list of cleaned molecules
        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
            index `i` could not be cleaned and was removed.
    """
    sm = Standardizer(canon_taut=True)

    clean_mol_mask = []
    mols = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        # RDKit returns None for unparsable SMILES; count those as "not
        # cleaned" instead of handing None to the standardizer.
        if mol is None:
            clean_mol_mask.append(False)
            continue
        standardized_mol, _ = sm.standardize_mol(mol)
        is_cleaned = standardized_mol is not None
        clean_mol_mask.append(is_cleaned)
        if not is_cleaned:
            continue
        # Round-trip through canonical SMILES to obtain a canonical mol object.
        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
        mols.append(can_mol)

    return mols, np.array(clean_mol_mask)
133
+
134
+
135
def create_ecfp_fps(mols: list[Mol], radius=None, fpsize=None) -> np.ndarray:
    """This function creates ECFP (Morgan, count-simulated) fingerprints.

    Args:
        mols (list[Mol]): list of molecules
        radius: Morgan radius; RDKit's default is used when None
        fpsize: fingerprint length (fpSize); RDKit's default is used when None

    Returns:
        np.ndarray: ECFP fingerprints of molecules, one row per molecule
    """
    kwargs = {}
    if fpsize is not None:
        kwargs["fpSize"] = fpsize
    if radius is not None:
        kwargs["radius"] = radius

    # The generator is configuration-only and loop-invariant: build it once
    # rather than once per molecule.
    gen = rdFingerprintGenerator.GetMorganGenerator(countSimulation=True, **kwargs)

    ecfps = []
    for mol in mols:
        fp_sparse_vec = gen.GetCountFingerprint(mol)

        # ConvertToNumpyArray resizes/fills the target array in place.
        fp = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)

        ecfps.append(fp)

    return np.array(ecfps)
161
+
162
+
163
def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
    """Compute MACCS keys for every molecule and stack them into one array."""
    keys = []
    for mol in mols:
        keys.append(MACCSkeys.GenMACCSKeys(mol))
    return np.array(keys)
166
+
167
+
168
def get_tox_patterns(filepath: str):
    """Parse the tox SMARTS definitions from a JSON file into matchable patterns.

    Args:
        filepath (str): path to tox_smarts.json; each entry is a pair whose
            second element (index 1) is the SMARTS expression.

    Returns:
        list[tuple[list, list[bool], bool]]: one entry per SMARTS expression:
            (compiled patterns, per-pattern negation flags, merge_any), where
            merge_any=True means sub-pattern matches are OR-ed, else AND-ed.
    """
    # load patterns
    with open(filepath) as f:
        smarts_list = [s[1] for s in json.load(f)]

    # Code does not work for expressions mixing AND and OR.
    assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0

    # Chem.MolFromSmarts takes a long time so it pays off to parse all the smarts first
    # and then use them for all molecules. This gives a huge speedup over existing code.
    # a list of patterns, whether to negate the match result and how to join them to obtain one boolean value
    all_patterns = []
    for smarts in smarts_list:
        patterns = []  # list of smarts-patterns
        # value for each of the patterns above. Negates the values of the above later.
        negations = []

        if " AND " in smarts:
            smarts = smarts.split(" AND ")
            merge_any = False  # If an ' AND ' is found all 'subsmarts' have to match
        else:
            # If there is an ' OR ' present it's enough if any of the 'subsmarts' match.
            # This also accumulates smarts where neither ' OR ' nor ' AND ' occur
            smarts = smarts.split(" OR ")
            merge_any = True

        # for all subsmarts check if they are preceded by 'NOT '
        for s in smarts:
            neg = s.startswith("NOT ")
            if neg:
                s = s[4:]
            patterns.append(Chem.MolFromSmarts(s))
            negations.append(neg)

        all_patterns.append((patterns, negations, merge_any))
    return all_patterns
209
+
210
+
211
def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
    """Match the parsed tox patterns against every molecule.

    Returns a boolean array with one row per molecule and one column per
    pattern group (as produced by get_tox_patterns).
    """
    rows = []
    for mol in mols:
        row = []
        for patts, negations, merge_any in patterns:
            # XOR each substructure match with its negation flag.
            hits = [mol.HasSubstructMatch(p) != neg for p, neg in zip(patts, negations)]
            # OR-merge for ' OR ' groups, AND-merge for ' AND ' groups.
            row.append(any(hits) if merge_any else all(hits))

        rows.append(np.array(row))

    return np.array(rows)
228
+
229
+
230
def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
    """Compute the RDKit descriptor vector for each molecule.

    Evaluates every descriptor in Descriptors._descList and keeps only the
    entries indexed by USED_200_DESCR.

    Args:
        mols (list[Mol]): list of molecules

    Returns:
        np.ndarray: selected RDKit descriptors, one row per molecule
    """
    all_descrs = []

    for mol in mols:
        values = np.array([fn(mol) for _, fn in Descriptors._descList])
        all_descrs.append(values[USED_200_DESCR])

    return np.array(all_descrs)
251
+
252
+
253
def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """Map raw feature values to quantiles via per-column ECDFs.

    Args:
        raw_features (np.ndarray): 2-D array of raw values
        ecdfs (list): one callable ECDF per column of `raw_features`

    Returns:
        np.ndarray: array of the same shape holding the quantile values
    """
    quantiles = np.zeros_like(raw_features)

    for col in range(raw_features.shape[1]):
        quantiles[:, col] = ecdfs[col](raw_features[:, col].ravel())

    return quantiles
272
+
273
+
274
def fill(features, mask, value=np.nan):
    """Expand `features` to one row per mask entry.

    Rows where `mask` is True are filled with `value`; rows where `mask` is
    False receive the rows of `features` in order. Used to re-align feature
    matrices with the original SMILES list after invalid molecules were dropped.
    """
    data = np.full((len(mask), features.shape[1]), value)
    data[~mask] = features
    return data
282
+
283
+
284
def normalize_features(
    raw_features,
    scaler=None,
    save_scaler_path: str = "",
    verbose=True,
):
    """Standardize feature vectors with a (possibly freshly fitted) scaler.

    Args:
        raw_features: 2-D array-like of raw feature values.
        scaler: already-fitted scaler to reuse; when None, a StandardScaler
            is fitted on `raw_features`.
        save_scaler_path (str): when non-empty and a scaler was fitted here,
            the fitted scaler is pickled to this path.
        verbose: whether to print progress messages.

    Returns:
        tuple: (normalized features, the scaler that was used)
    """
    needs_fit = scaler is None
    if needs_fit:
        scaler = StandardScaler()
        scaler.fit(raw_features)
        if verbose:
            print("Fitted the StandardScaler")
        if save_scaler_path:
            write_pickle(save_scaler_path, scaler)
            if verbose:
                print(f"Saved the StandardScaler under {save_scaler_path}")

    # Normalize feature vectors
    normalized = scaler.transform(raw_features)
    if verbose:
        print("Normalized molecule features")
    return normalized, scaler
305
+
306
+
307
def create_descriptors(
    smiles,
    ecdfs=None,
    scaler=None,
    feature_selection=None,
    descriptors: Iterable = KNOWN_DESCR,
):
    """Build the full (ECFP | MACCS | tox | RDKit-quantile) feature matrix.

    Cleans the SMILES, computes all descriptor families, aligns them back to
    the input order via the clean-molecule mask, concatenates them, and
    standardizes the result.

    Args:
        smiles: list of SMILES strings.
        ecdfs: per-column ECDFs for the RDKit descriptors; fitted on the fly
            from this input when None.
        scaler: fitted scaler to reuse; a new StandardScaler is fitted when None.
        feature_selection: NOTE(review): accepted but never used in this body —
            confirm whether selection was meant to be applied here.
        descriptors (Iterable): NOTE(review): also unused — the per-family
            "if ... in descriptors" guards below are commented out, so every
            family is always computed.

    Returns:
        tuple: (normalized feature matrix with one row per input SMILES,
            boolean mask marking which SMILES could be cleaned)
    """
    # Create cleaned rdkit mol objects
    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
    print("Cleaned molecules")

    features = []
    # if "ecfps" in descriptors:
    # Create fingerprints and descriptors
    ecfps = create_ecfp_fps(mols)
    # expand using mol_mask so rows line up with the input SMILES
    ecfps = fill(ecfps, ~clean_mol_mask)
    features.append(ecfps)
    print("Created ECFP fingerprints")

    # if "maccs" in descriptors:
    maccs = create_maccs_keys(mols)
    maccs = fill(maccs, ~clean_mol_mask)
    features.append(maccs)
    print("Created MACCS keys")

    # if "tox" in descriptors:
    # NOTE(review): SMARTS path is hard-coded here, unlike main() which takes
    # it from the CLI — confirm "assets/tox_smarts.json" exists at runtime.
    tox_patterns = get_tox_patterns("assets/tox_smarts.json")
    tox = create_tox_features(mols, tox_patterns)
    tox = fill(tox, ~clean_mol_mask)
    features.append(tox)
    print("Created Tox features")

    # if "rdkit_descr_quantiles" in descriptors:
    rdkit_descrs = create_rdkit_descriptors(mols)
    print("Created RDKit descriptors")

    # Fit ECDFs on this input when none were supplied.
    if ecdfs is None:
        print("Create ECDFs")
        ecdfs = []
        for column in range(rdkit_descrs.shape[1]):
            raw_values = rdkit_descrs[:, column].reshape(-1)
            ecdfs.append(ECDF(raw_values))

    # Create quantiles
    rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
    # expand using mol_mask
    rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
    features.append(rdkit_descr_quantiles)
    print("Created quantiles of RDKit descriptors")

    # concatenate features column-wise into one matrix
    raw_features = np.concatenate(features, axis=1)

    # normalize with scaler if scaler is passed, else create scaler
    features, _ = normalize_features(
        raw_features,
        scaler=scaler,
        verbose=True,
    )

    return features, clean_mol_mask
370
+
371
+
372
def get_feature_selection(
    raw_features: np.ndarray, min_var=0.01, max_corr=0.95
) -> np.ndarray:
    """Select feature columns by variance and pairwise correlation.

    First keeps columns whose variance is at least `min_var`, then drops any
    column that correlates above `max_corr` with an earlier kept column.

    Args:
        raw_features (np.ndarray): 2-D feature matrix (samples x features)
        min_var (float): minimum variance required by VarianceThreshold
        max_corr (float): maximum allowed correlation with an earlier column

    Returns:
        np.ndarray: indices (into the original columns) of selected features
    """
    # select features with at least `min_var` variation
    var_thresh = VarianceThreshold(threshold=min_var)
    feature_selection = var_thresh.fit(raw_features).get_support(
        indices=True
    )  # array containing selected feature indices

    # Remove highly correlated features. The original nested Python loop was
    # O(n^2) in interpreted code; the vectorized any(axis=0) below drops
    # exactly the same columns: column j is removed iff some i < j has
    # corr(i, j) > max_corr (strictly-upper triangle, reduced over rows i).
    corr_matrix = np.corrcoef(raw_features[:, feature_selection], rowvar=False)
    upper_tri = np.triu(corr_matrix, k=1)
    to_keep = ~(upper_tri > max_corr).any(axis=0)

    return feature_selection[to_keep]
394
+
395
+
396
def main(args):
    """Preprocess all Tox21 splits into feature matrices saved as .npz files.

    For each split: clean the SMILES, compute ECFP / tox / MACCS / RDKit
    quantile features, fit feature selection and ECDFs on the train split
    (persisting them, then reusing them for the other splits), and write one
    ``tox21_<split>_cv4.npz`` with labels and features into ``args.save_folder``.
    """
    splits = ["train", "validation", "test"]  # TODO: remove test
    if args.use_hf:
        ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
    else:
        # The original if/else loaded the exact same CSV in both branches,
        # so a single read per split is sufficient.
        ds = {
            split: pd.read_csv(
                os.path.join(args.data_folder, f"tox21_{split}_cv4.csv")
            )
            for split in splits
        }

    # The SMARTS definitions do not change between splits: parse them once.
    tox_patterns = get_tox_patterns(args.tox_smarts_filepath)

    for split in splits:

        print(f"Preprocess {split} molecules")
        smiles = list(ds[split]["smiles"])

        # Create cleaned rdkit mol objects
        mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
        print("Cleaned molecules")

        # Create fingerprints and descriptors
        ecfps = create_ecfp_fps(mols, radius=args.ecfps_radius, fpsize=args.ecfps_folds)
        # expand using mol_mask so rows line up with the input SMILES
        ecfps = fill(ecfps, ~clean_mol_mask)
        print("Created ECFP fingerprints")

        tox = create_tox_features(mols, tox_patterns)
        tox = fill(tox, ~clean_mol_mask)
        print("Created Tox features")

        # Fit feature selection on train and persist it; reuse it afterwards.
        if args.feature_selection:
            if split == "train":
                print("Create Feature selection")
                ecfps_selec = get_feature_selection(ecfps, args.min_var, args.max_corr)
                tox_selec = get_feature_selection(tox, args.min_var, args.max_corr)
                np.savez(
                    args.path_feat_selec, ecfps_selec=ecfps_selec, tox_selec=tox_selec
                )
            else:
                print(f"Load feature selection from {args.path_feat_selec}")
                feature_selection = np.load(args.path_feat_selec)
                ecfps_selec = feature_selection["ecfps_selec"]
                tox_selec = feature_selection["tox_selec"]

            ecfps = ecfps[:, ecfps_selec]
            tox = tox[:, tox_selec]

        maccs = create_maccs_keys(mols)
        maccs = fill(maccs, ~clean_mol_mask)
        print("Created MACCS keys")

        rdkit_descrs = create_rdkit_descriptors(mols)
        print("Created RDKit descriptors")

        # Fit ECDFs on train and persist them; reuse them for other splits.
        if split == "train":
            print("Create ECDFs")
            ecdfs = []
            for column in range(rdkit_descrs.shape[1]):
                raw_values = rdkit_descrs[:, column].reshape(-1)
                ecdfs.append(ECDF(raw_values))

            write_pickle(args.path_ecdfs, ecdfs)
            print(f"Saved ECDFs under {args.path_ecdfs}")
        else:
            print(f"Load ECDFs from {args.path_ecdfs}")
            ecdfs = load_pickle(args.path_ecdfs)

        # Create quantiles
        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
        # expand using mol_mask
        rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
        print("Created quantiles of RDKit descriptors")

        # Collect labels; HF datasets need an explicit pandas conversion,
        # which is split-invariant and therefore hoisted out of the task loop.
        datasplit = ds[split].to_pandas() if args.use_hf else ds[split]
        labels = np.stack([datasplit[task].to_numpy() for task in TASKS], axis=1)

        save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
        with open(save_path, "wb") as f:
            np.savez(
                f,
                labels=labels,
                ecfps=ecfps,
                tox=tox,
                maccs=maccs,
                rdkit_descr_quantiles=rdkit_descr_quantiles,
            )
        print(f"Saved preprocessed {split} split under {save_path}")

    print("Preprocessing finished successfully")
498
+
499
+
500
+ if __name__ == "__main__":
501
+ args = parser.parse_args()
502
+
503
+ if not os.path.exists(args.save_folder):
504
+ os.makedirs(args.save_folder)
505
+
506
+ args.path_ecdfs = os.path.join(args.save_folder, args.path_ecdfs)
507
+ args.path_feat_selec = os.path.join(args.save_folder, args.path_feat_selec)
508
+ args.tox_smarts_filepath = os.path.join(args.data_folder, args.tox_smarts_filepath)
509
+
510
+ if not os.path.exists(os.path.dirname(args.path_ecdfs)):
511
+ os.makedirs(os.path.dirname(args.path_ecdfs))
512
+
513
+ main(args)