Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Running

App Files Files Community

ribesstefano committed on Apr 22, 2024

Commit

ea572f9

1 Parent(s): 5e01175

started testing package code

Browse files

Files changed (10) hide show

protac_degradation_predictor/__init__.py +11 -3
protac_degradation_predictor/config.py +20 -23
protac_degradation_predictor/data/PROTAC-DB.csv +0 -0
protac_degradation_predictor/data/PROTAC-Degradation-DB.csv +0 -0
protac_degradation_predictor/data_utils.py +81 -13
protac_degradation_predictor/optuna_utils.py +2 -2
protac_degradation_predictor/protac_degradation_predictor.py +5 -3
protac_degradation_predictor/pytorch_models.py +2 -2
protac_degradation_predictor/sklearn_models.py +1 -1
src/main.py +323 -0

protac_degradation_predictor/__init__.py CHANGED Viewed

@@ -1,6 +1,14 @@
-from .protac_degradation_predictor import (
-    PROTAC_Model,
-    train_model,
 )
 __version__ = "0.0.1"

+# from .protac_degradation_predictor.config import config
+# from .protac_degradation_predictor.pytorch_models import train_model
+# from .protac_degradation_predictor.pytorch_models import
+# from .protac_degradation_predictor.pytorch_models import
+from . import (
+    config,
+    pytorch_models,
+    sklearn_models,
+    protac_dataset,
+    data_utils,
+    optuna_utils,
 )
 __version__ = "0.0.1"

protac_degradation_predictor/config.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
 @dataclass(frozen=True)
 class Config:
@@ -11,27 +11,24 @@ class Config:
     # Data information
     dmax_threshold: float = 0.6
     pdc50_threshold: float = 6.0
-    e3_ligase2uniprot: dict = {
-        'VHL': 'P40337',
-        'CRBN': 'Q96SW2',
-        'DCAF11': 'Q8TEB1',
-        'DCAF15': 'Q66K64',
-        'DCAF16': 'Q9NXF7',
-        'MDM2': 'Q00987',
-        'Mdm2': 'Q00987',
-        'XIAP': 'P98170',
-        'cIAP1': 'Q7Z460',
-        'IAP': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
-        'Iap': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
-        'AhR': 'P35869',
-        'RNF4': 'P78317',
-        'RNF114': 'Q9Y508',
-        'FEM1B': 'Q9UK73',
-        'Ubr1': 'Q8IWV7',
-    }
-    def __post_init__(self):
-        self.active_label: str = f'Active (Dmax {self.dmax_threshold}, pDC50 {self.pdc50_threshold})'
 config = Config()

+from dataclasses import dataclass, field
 @dataclass(frozen=True)
 class Config:
     # Data information
     dmax_threshold: float = 0.6
     pdc50_threshold: float = 6.0
+    active_label: str = field(default=f'Active (Dmax {dmax_threshold}, pDC50 {pdc50_threshold})')
+    e3_ligase2uniprot: dict = field(default_factory=lambda: {
+            'VHL': 'P40337',
+            'CRBN': 'Q96SW2',
+            'DCAF11': 'Q8TEB1',
+            'DCAF15': 'Q66K64',
+            'DCAF16': 'Q9NXF7',
+            'MDM2': 'Q00987',
+            'Mdm2': 'Q00987',
+            'XIAP': 'P98170',
+            'cIAP1': 'Q7Z460',
+            'IAP': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
+            'Iap': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
+            'AhR': 'P35869',
+            'RNF4': 'P78317',
+            'RNF114': 'Q9Y508',
+            'FEM1B': 'Q9UK73',
+            'Ubr1': 'Q8IWV7',
+        })
 config = Config()

protac_degradation_predictor/data/PROTAC-DB.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

protac_degradation_predictor/data/PROTAC-Degradation-DB.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

protac_degradation_predictor/data_utils.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import os
 import pkg_resources
 import pickle
-from typing import Dict
-from config import config
 import h5py
 import numpy as np
@@ -19,8 +19,19 @@ memory = Memory(cachedir, verbose=0)
 @memory.cache
-def load_protein2embedding() -> Dict[str, np.ndarray]:
-    embeddings_path = pkg_resources.resource_stream(__name__, 'data/uniprot2embedding.h5')
     protein2embedding = {}
     with h5py.File(embeddings_path, "r") as file:
         for sequence_id in file.keys():
@@ -30,17 +41,74 @@ def load_protein2embedding() -> Dict[str, np.ndarray]:
 @memory.cache
-def load_cell2embedding() -> Dict[str, np.ndarray]:
-    embeddings_path = pkg_resources.resource_stream(__name__, 'data/cell2embedding.pkl')
     with open(embeddings_path, 'rb') as f:
         cell2embedding = pickle.load(f)
     return cell2embedding
-def get_fingerprint(smiles: str) -> np.ndarray:
-    morgan_fpgen = AllChem.GetMorganGenerator(
-        radius=config.morgan_radius,
-        fpSize=config.fingerprint_size,
-        includeChirality=True,
-    )
-    return morgan_fpgen.GetFingerprint(Chem.MolFromSmiles(smiles))

 import os
 import pkg_resources
 import pickle
+from typing import Dict, Optional
+from .config import config
 import h5py
 import numpy as np
 @memory.cache
+def load_protein2embedding(
+    embeddings_path: Optional[str] = None,
+) -> Dict[str, np.ndarray]:
+    """ Load the protein embeddings from a file.
+    Args:
+        embeddings_path (str): The path to the embeddings file.
+    Returns:
+        Dict[str, np.ndarray]: A dictionary of protein embeddings.
+    """
+    if embeddings_path is None:
+        embeddings_path = pkg_resources.resource_stream(__name__, 'data/uniprot2embedding.h5')
     protein2embedding = {}
     with h5py.File(embeddings_path, "r") as file:
         for sequence_id in file.keys():
 @memory.cache
+def load_cell2embedding(
+        embeddings_path: Optional[str] = None,
+) -> Dict[str, np.ndarray]:
+    """ Load the cell line embeddings from a pickle file.
+    Args:
+        embeddings_path (Optional[str]): The path to the pickled embeddings
+            file. If None, the 'data/cell2embedding.pkl' resource shipped
+            with the package is used.
+    Returns:
+        Dict[str, np.ndarray]: A dictionary of cell line embeddings.
+    """
+    if embeddings_path is None:
+        # `open()` below needs a filesystem path, so resolve the packaged
+        # resource to a filename; `resource_stream` would return an
+        # already-open file object that `open()` cannot accept.
+        embeddings_path = pkg_resources.resource_filename(__name__, 'data/cell2embedding.pkl')
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
     with open(embeddings_path, 'rb') as f:
         cell2embedding = pickle.load(f)
     return cell2embedding
+def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
+    """ Get the Morgan fingerprint of a molecule.
+    Args:
+        smiles (str): The SMILES string of the molecule.
+        morgan_fpgen: The Morgan fingerprint generator. If None, a generator
+            is created from `config.morgan_radius` and
+            `config.fingerprint_size`, with chirality included.
+    Returns:
+        np.ndarray: The Morgan fingerprint, exactly as returned by the
+            generator's `GetFingerprint`. NOTE(review): RDKit presumably
+            returns a bit vector here rather than a NumPy array, so callers
+            may need `np.array(...)` on the result - confirm.
+    Raises:
+        ValueError: If `smiles` cannot be parsed into a molecule.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        # RDKit signals a parse failure by returning None; fail early with a
+        # clear message instead of an opaque error inside `GetFingerprint`.
+        raise ValueError(f'Invalid SMILES string: {smiles!r}')
+    if morgan_fpgen is None:
+        morgan_fpgen = AllChem.GetMorganGenerator(
+            radius=config.morgan_radius,
+            fpSize=config.fingerprint_size,
+            includeChirality=True,
+        )
+    return morgan_fpgen.GetFingerprint(mol)
+def is_active(
+        DC50: float,
+        Dmax: float,
+        pDC50_threshold: float = 7.0,
+        Dmax_threshold: float = 0.8,
+        oring: bool = False, # Deprecated
+) -> bool:
+    """ Check if a PROTAC is active based on DC50 and Dmax.
+    DC50 is converted to pDC50 = -log10(DC50 in molar) and Dmax to a fraction
+    before they are compared against the thresholds.
+    Args:
+        DC50 (float): DC50 in nM. Missing (NaN/None) or non-positive values
+            are treated as unavailable.
+        Dmax (float): Dmax in %. NaN when unavailable.
+        pDC50_threshold (float): Minimum pDC50 required to be active.
+        Dmax_threshold (float): Minimum Dmax, as a fraction (Dmax in % / 100),
+            required to be active.
+        oring (bool): Deprecated. If True, a single available measurement
+            meeting its threshold is enough to return True; an available
+            measurement below its threshold still returns False.
+    Returns:
+        bool: False if any available measurement is below its threshold, True
+            if the required measurements are available and meet their
+            thresholds, np.nan otherwise. Both measurements are required
+            unless `oring` is True, in which case one is enough.
+    """
+    # log10 is only defined for positive concentrations: a DC50 of zero would
+    # give pDC50 = +inf (always passing the threshold) and a negative one NaN
+    # plus a RuntimeWarning, so both are treated as missing.
+    if pd.notnull(DC50) and DC50 > 0:
+        pDC50 = -np.log10(DC50 * 1e-9)
+    else:
+        pDC50 = np.nan
+    Dmax = Dmax / 100
+    has_pDC50 = pd.notnull(pDC50)
+    has_Dmax = pd.notnull(Dmax)
+    # Any available measurement below its threshold makes the PROTAC inactive.
+    if has_pDC50 and pDC50 < pDC50_threshold:
+        return False
+    if has_Dmax and Dmax < Dmax_threshold:
+        return False
+    if oring:
+        if has_pDC50:
+            return bool(pDC50 >= pDC50_threshold)
+        if has_Dmax:
+            return bool(Dmax >= Dmax_threshold)
+        return np.nan
+    if has_pDC50 and has_Dmax:
+        return bool(pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold)
+    return np.nan

protac_degradation_predictor/optuna_utils.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import os
 from typing import Literal, List, Tuple, Optional, Dict
-from pytorch_models import train_model
-from sklearn_models import (
     train_sklearn_model,
     suggest_random_forest,
     suggest_logistic_regression,

 import os
 from typing import Literal, List, Tuple, Optional, Dict
+from .pytorch_models import train_model
+from .sklearn_models import (
     train_sklearn_model,
     suggest_random_forest,
     suggest_logistic_regression,

protac_degradation_predictor/protac_degradation_predictor.py CHANGED Viewed

@@ -1,20 +1,22 @@
 import pkg_resources
 import logging
-from pytorch_models import PROTAC_Model, load_model
-from data_utils import (
     load_protein2embedding,
     load_cell2embedding,
     get_fingerprint,
 )
-from config import config
 import numpy as np
 import torch
 from torch import sigmoid
 package_name = 'protac_degradation_predictor'
 def get_protac_active_proba(
         protac_smiles: str,
         e3_ligase: str,

 import pkg_resources
 import logging
+from .pytorch_models import PROTAC_Model, load_model
+from .data_utils import (
     load_protein2embedding,
     load_cell2embedding,
     get_fingerprint,
 )
+from .config import config
 import numpy as np
 import torch
 from torch import sigmoid
 package_name = 'protac_degradation_predictor'
 def get_protac_active_proba(
         protac_smiles: str,
         e3_ligase: str,

protac_degradation_predictor/pytorch_models.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import warnings
 from typing import Literal, List, Tuple, Optional, Dict
-from protac_dataset import PROTAC_Dataset
-from config import Config
 import pandas as pd
 import numpy as np

 import warnings
 from typing import Literal, List, Tuple, Optional, Dict
+from .protac_dataset import PROTAC_Dataset
+from .config import Config
 import pandas as pd
 import numpy as np

protac_degradation_predictor/sklearn_models.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from typing import Literal, List, Tuple, Optional, Dict
-from protac_dataset import PROTAC_Dataset
 import pandas as pd
 from sklearn.base import ClassifierMixin

 from typing import Literal, List, Tuple, Optional, Dict
+from .protac_dataset import PROTAC_Dataset
 import pandas as pd
 from sklearn.base import ClassifierMixin

src/main.py ADDED Viewed

	@@ -0,0 +1,323 @@

+import os
+from collections import defaultdict
+import warnings
+from protac_degradation_predictor.config import config
+from protac_degradation_predictor.data_utils import (
+    load_protein2embedding,
+    load_cell2embedding,
+    is_active,
+)
+from protac_degradation_predictor.pytorch_models import (
+    train_model,
+)
+from protac_degradation_predictor.optuna_utils import (
+    hyperparameter_tuning_and_training,
+)
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
+from jsonargparse import CLI
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.model_selection import (
+    StratifiedKFold,
+    StratifiedGroupKFold,
+)
+# Ignore UserWarning from Matplotlib
+warnings.filterwarnings("ignore", ".*FixedLocator*")
+# Ignore UserWarning from PyTorch Lightning
+warnings.filterwarnings("ignore", ".*does not have many workers.*")
+def main(
+    active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
+    n_trials: int = 50,
+    fast_dev_run: bool = False,
+    test_split: float = 0.2,
+    cv_n_splits: int = 5,
+):
+    """ Build the test splits of the PROTAC dataset and print cross-validation statistics.
+    Four test sets are built (random, E3-ligase-based, Tanimoto-based and
+    target-based). For each of them, the remaining entries are split into
+    cross-validation folds and the statistics of every fold are printed. The
+    model training calls are currently commented out. All file paths are
+    relative to the current working directory.
+    Args:
+        active_col (str): Name of the activity column to predict. It must follow
+            the format 'Active (Dmax <value>, pDC50 <value>)', since the two
+            thresholds are parsed from it.
+        n_trials (int): The number of hyperparameter optimization trials (only
+            referenced by the commented-out training code).
+        fast_dev_run (bool): Whether to run a fast development run (only
+            referenced by the commented-out training code).
+        test_split (float): Target fraction of the labeled entries to place in
+            the random, Tanimoto-based and target-based test sets.
+        cv_n_splits (int): The number of cross-validation splits.
+    """
+    ## Set the Column to Predict
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+    # Get Dmax_threshold and pDC50_threshold from the active_col
+    Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
+    pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
+    ## Load the Data
+    protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
+    # Map E3 Ligase Iap to IAP
+    protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+    protac_df[active_col] = protac_df.apply(
+        lambda x: is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
+    )
+    ## Test Sets
+    test_indeces = {}
+    ### Random Split
+    # Randomly select a `test_split` fraction of the labeled PROTACs (i.e.,
+    # those with a non-NaN activity) as the test set
+    active_df = protac_df[protac_df[active_col].notna()].copy()
+    test_df = active_df.sample(frac=test_split, random_state=42)
+    test_indeces['random'] = test_df.index
+    ### E3-based Split
+    # Every labeled PROTAC whose E3 ligase is neither VHL nor CRBN goes to the test set
+    encoder = OrdinalEncoder()
+    protac_df['E3 Group'] = encoder.fit_transform(protac_df[['E3 Ligase']]).astype(int)
+    active_df = protac_df[protac_df[active_col].notna()].copy()
+    test_df = active_df[(active_df['E3 Ligase'] != 'VHL') & (active_df['E3 Ligase'] != 'CRBN')]
+    test_indeces['e3_ligase'] = test_df.index
+    ### Tanimoto-based Split
+    #### Precompute fingerprints
+    morgan_fpgen = AllChem.GetMorganGenerator(
+        radius=config.morgan_radius,
+        fpSize=config.fingerprint_size,
+        includeChirality=True,
+    )
+    unique_smiles = protac_df['Smiles'].unique().tolist()
+    smiles2fp = {}
+    for smiles in tqdm(unique_smiles, desc='Precomputing fingerprints'):
+        # Get the fingerprint as a bit vector
+        morgan_fp = morgan_fpgen.GetFingerprint(Chem.MolFromSmiles(smiles))
+        smiles2fp[smiles] = morgan_fp
+    # Get the average Tanimoto similarity of each PROTAC to ALL the unique
+    # PROTACs (itself included). Averaging only over the PROTACs that come
+    # later in the dataframe would make the value depend on the row order
+    # (the last PROTAC would always average to 1.0).
+    unique_fps = [smiles2fp[smiles] for smiles in unique_smiles]
+    avg_tanimoto = {}
+    for smiles in tqdm(unique_smiles, desc='Computing Tanimoto similarity'):
+        similarities = DataStructs.BulkTanimotoSimilarity(smiles2fp[smiles], unique_fps)
+        avg_tanimoto[smiles] = np.mean(similarities)
+    protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
+    # Convert the bit vectors to NumPy arrays
+    smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
+    # Make the grouping of the PROTACs based on the Tanimoto similarity
+    n_bins_tanimoto = 200
+    tanimoto_groups = pd.cut(protac_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
+    encoder = OrdinalEncoder()
+    protac_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
+    active_df = protac_df[protac_df[active_col].notna()].copy()
+    # Sort the groups by decreasing average Tanimoto similarity, so that the
+    # groups of PROTACs that are, on average, the most similar to the others
+    # are considered for the test set first.
+    # NOTE(review): the previous comment called these the "less similar" ones;
+    # if the least similar PROTACs are meant to go in the test set, this should
+    # sort in ascending order instead - confirm.
+    tanimoto_groups = active_df.groupby('Tanimoto Group')['Avg Tanimoto'].mean().sort_values(ascending=False).index
+    test_df = []
+    # For each group, get the number of active and inactive entries. Then, add those
+    # entries to the test_df if: 1) the test_df length + the group entries is less
+    # than `test_split` of the active_df length, and 2) neither the active nor the
+    # inactive entries would exceed 60% of the resulting test_df.
+    for group in tanimoto_groups:
+        group_df = active_df[active_df['Tanimoto Group'] == group]
+        if test_df == []:
+            test_df.append(group_df)
+            continue
+        num_entries = len(group_df)
+        num_active_group = group_df[active_col].sum()
+        num_inactive_group = num_entries - num_active_group
+        tmp_test_df = pd.concat(test_df)
+        num_entries_test = len(tmp_test_df)
+        num_active_test = tmp_test_df[active_col].sum()
+        num_inactive_test = num_entries_test - num_active_test
+        # Check if the group entries can be added to the test_df
+        if num_entries_test + num_entries < test_split * len(active_df):
+            # Add anything at the beginning (until half of the target size is reached)
+            if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                test_df.append(group_df)
+                continue
+            # Be more selective and make sure that the percentage of active and
+            # inactive is balanced
+            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                    test_df.append(group_df)
+    test_df = pd.concat(test_df)
+    # Save to the dictionary of test indices
+    test_indeces['tanimoto'] = test_df.index
+    ### Target-based Split
+    encoder = OrdinalEncoder()
+    protac_df['Uniprot Group'] = encoder.fit_transform(protac_df[['Uniprot']]).astype(int)
+    active_df = protac_df[protac_df[active_col].notna()].copy()
+    test_df = []
+    # For each group, get the number of active and inactive entries. Then, add those
+    # entries to the test_df if: 1) the test_df length + the group entries is less
+    # than `test_split` of the active_df length, and 2) neither the active nor the
+    # inactive entries would exceed 60% of the resulting test_df.
+    # Start the loop from the groups containing the smallest number of entries.
+    for group in reversed(active_df['Uniprot'].value_counts().index):
+        group_df = active_df[active_df['Uniprot'] == group]
+        if test_df == []:
+            test_df.append(group_df)
+            continue
+        num_entries = len(group_df)
+        num_active_group = group_df[active_col].sum()
+        num_inactive_group = num_entries - num_active_group
+        tmp_test_df = pd.concat(test_df)
+        num_entries_test = len(tmp_test_df)
+        num_active_test = tmp_test_df[active_col].sum()
+        num_inactive_test = num_entries_test - num_active_test
+        # Check if the group entries can be added to the test_df
+        if num_entries_test + num_entries < test_split * len(active_df):
+            # Add anything at the beginning (until half of the target size is reached)
+            if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                test_df.append(group_df)
+                continue
+            # Be more selective and make sure that the percentage of active and
+            # inactive is balanced
+            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                    test_df.append(group_df)
+    test_df = pd.concat(test_df)
+    # Save to the dictionary of test indices
+    test_indeces['uniprot'] = test_df.index
+    ## Cross-Validation Training
+    # Make directory ../reports if it does not exist
+    if not os.path.exists('../reports'):
+        os.makedirs('../reports')
+    # Load embedding dictionaries
+    protein2embedding = load_protein2embedding('../data/uniprot2embedding.h5')
+    cell2embedding = load_cell2embedding('../data/cell2embedding.pkl')
+    report = []
+    for split_type, indeces in test_indeces.items():
+        active_df = protac_df[protac_df[active_col].notna()].copy()
+        test_df = active_df.loc[indeces]
+        train_val_df = active_df[~active_df.index.isin(test_df.index)]
+        if split_type == 'random':
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = None
+        elif split_type == 'e3_ligase':
+            # NOTE: StratifiedKFold ignores the groups when splitting; `group`
+            # is only used for the per-fold statistics below.
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['E3 Group'].to_numpy()
+        elif split_type == 'tanimoto':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Tanimoto Group'].to_numpy()
+        elif split_type == 'uniprot':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Uniprot Group'].to_numpy()
+        # Start the CV over the folds
+        X = train_val_df.drop(columns=active_col)
+        y = train_val_df[active_col].tolist()
+        for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
+            print('-' * 100)
+            print(f'Starting CV for group type: {split_type}, fold: {k}')
+            print('-' * 100)
+            train_df = train_val_df.iloc[train_index]
+            val_df = train_val_df.iloc[val_index]
+            # Targets and PROTACs appearing in both the train and validation sets
+            leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
+            leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
+            stats = {
+                'fold': k,
+                'split_type': split_type,
+                'train_len': len(train_df),
+                'val_len': len(val_df),
+                'train_perc': len(train_df) / len(train_val_df),
+                'val_perc': len(val_df) / len(train_val_df),
+                'train_active_perc': train_df[active_col].sum() / len(train_df),
+                'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
+                'val_active_perc': val_df[active_col].sum() / len(val_df),
+                'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
+                'test_active_perc': test_df[active_col].sum() / len(test_df),
+                'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
+                'num_leaking_uniprot': len(leaking_uniprot),
+                'num_leaking_smiles': len(leaking_smiles),
+                'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
+                'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
+            }
+            if split_type != 'random':
+                stats['train_unique_groups'] = len(np.unique(group[train_index]))
+                stats['val_unique_groups'] = len(np.unique(group[val_index]))
+            print(stats)
+        #     # Train and evaluate the model
+        #     model, trainer, metrics = hyperparameter_tuning_and_training(
+        #         protein2embedding,
+        #         cell2embedding,
+        #         smiles2fp,
+        #         train_df,
+        #         val_df,
+        #         test_df,
+        #         fast_dev_run=fast_dev_run,
+        #         n_trials=n_trials,
+        #         logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
+        #         active_label=active_col,
+        #         study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
+        #     )
+        #     hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
+        #     stats.update(metrics)
+        #     report.append(stats.copy())
+        #     del model
+        #     del trainer
+        #     # Ablation study: disable embeddings at a time
+        #     for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
+        #         print('-' * 100)
+        #         print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
+        #         print('-' * 100)
+        #         stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
+        #         model, trainer, metrics = train_model(
+        #             protein2embedding,
+        #             cell2embedding,
+        #             smiles2fp,
+        #             train_df,
+        #             val_df,
+        #             test_df,
+        #             fast_dev_run=fast_dev_run,
+        #             logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
+        #             active_label=active_col,
+        #             disabled_embeddings=disabled_embeddings,
+        #             **hparams,
+        #         )
+        #         stats.update(metrics)
+        #         report.append(stats.copy())
+        #         del model
+        #         del trainer
+        # report_df = pd.DataFrame(report)
+        # report_df.to_csv(
+        #     f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}_sklearn.csv',
+        #     index=False,
+        # )
+if __name__ == '__main__':
+    # Let jsonargparse expose the parameters of `main` as command-line
+    # options and call it with the parsed values.
+    cli = CLI(main)