Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Running

App Files Files Community

ribesstefano committed on May 2, 2024

Commit

4e1d3f6

1 Parent(s): 48aea13

Added ablation study with randomly sampled vectors + Started working on LightningDataModule wrapper

Browse files

Files changed (35) hide show

plots/Active_Dmax_0.6_pDC50_6.0_metrics.pdf +0 -0
plots/Active_Dmax_0.6_pDC50_6.0_metrics_majority_vote.pdf +0 -0
plots/ablation_study_random.pdf +0 -0
plots/ablation_study_tanimoto.pdf +0 -0
plots/ablation_study_uniprot.pdf +0 -0
plots/old_Active_Dmax_0.6_pDC50_6.0_metrics.pdf +0 -0
plots/training_metrics_random_best_model_n0.pdf +0 -0
plots/training_metrics_random_best_model_n1.pdf +0 -0
plots/training_metrics_random_best_model_n2.pdf +0 -0
plots/training_metrics_random_cv_model_fold0.pdf +0 -0
plots/training_metrics_random_cv_model_fold1.pdf +0 -0
plots/training_metrics_random_cv_model_fold2.pdf +0 -0
plots/training_metrics_random_cv_model_fold3.pdf +0 -0
plots/training_metrics_random_cv_model_fold4.pdf +0 -0
plots/training_metrics_tanimoto_best_model_n0.pdf +0 -0
plots/training_metrics_tanimoto_best_model_n1.pdf +0 -0
plots/training_metrics_tanimoto_best_model_n2.pdf +0 -0
plots/training_metrics_tanimoto_cv_model_fold0.pdf +0 -0
plots/training_metrics_tanimoto_cv_model_fold1.pdf +0 -0
plots/training_metrics_tanimoto_cv_model_fold2.pdf +0 -0
plots/training_metrics_tanimoto_cv_model_fold3.pdf +0 -0
plots/training_metrics_tanimoto_cv_model_fold4.pdf +0 -0
plots/training_metrics_uniprot_best_model_n0.pdf +0 -0
plots/training_metrics_uniprot_best_model_n1.pdf +0 -0
plots/training_metrics_uniprot_best_model_n2.pdf +0 -0
plots/training_metrics_uniprot_cv_model_fold0.pdf +0 -0
plots/training_metrics_uniprot_cv_model_fold1.pdf +0 -0
plots/training_metrics_uniprot_cv_model_fold2.pdf +0 -0
plots/training_metrics_uniprot_cv_model_fold3.pdf +0 -0
plots/training_metrics_uniprot_cv_model_fold4.pdf +0 -0
protac_degradation_predictor/optuna_utils.py +96 -52
protac_degradation_predictor/protac_dataset.py +383 -14
protac_degradation_predictor/pytorch_models.py +48 -31
src/plot_experiment_results.py +49 -31
src/run_experiments.py +1 -1

plots/Active_Dmax_0.6_pDC50_6.0_metrics.pdf ADDED Viewed

Binary file (16.8 kB). View file

plots/Active_Dmax_0.6_pDC50_6.0_metrics_majority_vote.pdf ADDED Viewed

Binary file (16.7 kB). View file

plots/ablation_study_random.pdf ADDED Viewed

Binary file (15.8 kB). View file

plots/ablation_study_tanimoto.pdf ADDED Viewed

Binary file (15.3 kB). View file

plots/ablation_study_uniprot.pdf ADDED Viewed

Binary file (15.6 kB). View file

plots/old_Active_Dmax_0.6_pDC50_6.0_metrics.pdf ADDED Viewed

Binary file (16.8 kB). View file

plots/training_metrics_random_best_model_n0.pdf ADDED Viewed

Binary file (16.9 kB). View file

plots/training_metrics_random_best_model_n1.pdf ADDED Viewed

Binary file (16.7 kB). View file

plots/training_metrics_random_best_model_n2.pdf ADDED Viewed

Binary file (16.9 kB). View file

plots/training_metrics_random_cv_model_fold0.pdf ADDED Viewed

Binary file (17.4 kB). View file

plots/training_metrics_random_cv_model_fold1.pdf ADDED Viewed

Binary file (17.3 kB). View file

plots/training_metrics_random_cv_model_fold2.pdf ADDED Viewed

Binary file (17.1 kB). View file

plots/training_metrics_random_cv_model_fold3.pdf ADDED Viewed

Binary file (17.3 kB). View file

plots/training_metrics_random_cv_model_fold4.pdf ADDED Viewed

Binary file (17.6 kB). View file

plots/training_metrics_tanimoto_best_model_n0.pdf ADDED Viewed

Binary file (16.9 kB). View file

plots/training_metrics_tanimoto_best_model_n1.pdf ADDED Viewed

Binary file (16.9 kB). View file

plots/training_metrics_tanimoto_best_model_n2.pdf ADDED Viewed

Binary file (16.6 kB). View file

plots/training_metrics_tanimoto_cv_model_fold0.pdf ADDED Viewed

Binary file (17.1 kB). View file

plots/training_metrics_tanimoto_cv_model_fold1.pdf ADDED Viewed

Binary file (16.8 kB). View file

plots/training_metrics_tanimoto_cv_model_fold2.pdf ADDED Viewed

Binary file (17.3 kB). View file

plots/training_metrics_tanimoto_cv_model_fold3.pdf ADDED Viewed

Binary file (17 kB). View file

plots/training_metrics_tanimoto_cv_model_fold4.pdf ADDED Viewed

Binary file (17.1 kB). View file

plots/training_metrics_uniprot_best_model_n0.pdf ADDED Viewed

Binary file (16.3 kB). View file

plots/training_metrics_uniprot_best_model_n1.pdf ADDED Viewed

Binary file (15.9 kB). View file

plots/training_metrics_uniprot_best_model_n2.pdf ADDED Viewed

Binary file (16.1 kB). View file

plots/training_metrics_uniprot_cv_model_fold0.pdf ADDED Viewed

Binary file (17.2 kB). View file

plots/training_metrics_uniprot_cv_model_fold1.pdf ADDED Viewed

Binary file (17.7 kB). View file

plots/training_metrics_uniprot_cv_model_fold2.pdf ADDED Viewed

Binary file (17 kB). View file

plots/training_metrics_uniprot_cv_model_fold3.pdf ADDED Viewed

Binary file (16.5 kB). View file

plots/training_metrics_uniprot_cv_model_fold4.pdf ADDED Viewed

Binary file (17.1 kB). View file

protac_degradation_predictor/optuna_utils.py CHANGED Viewed

@@ -2,7 +2,13 @@ import os
 from typing import Literal, List, Tuple, Optional, Dict
 import logging
-from .pytorch_models import train_model, PROTAC_Model
 from .sklearn_models import (
     train_sklearn_model,
     suggest_random_forest,
@@ -83,6 +89,26 @@ def get_dataframe_stats(
     return stats
 def pytorch_model_objective(
         trial: optuna.Trial,
         protein2embedding: Dict,
@@ -198,18 +224,7 @@ def pytorch_model_objective(
     # Get the majority vote for the test predictions
     if test_df is not None and not fast_dev_run:
-        # Get the majority vote for the test predictions
-        test_preds = torch.stack(test_preds)
-        test_preds, _ = torch.mode(test_preds, dim=0)
-        y = torch.tensor(test_df[active_label].tolist())
-        # Measure the test accuracy and ROC AUC
-        majority_vote_metrics = {
-            'test_acc': Accuracy(task='binary')(test_preds, y).item(),
-            'test_roc_auc': AUROC(task='binary')(test_preds, y).item(),
-            'test_precision': Precision(task='binary')(test_preds, y).item(),
-            'test_recall': Recall(task='binary')(test_preds, y).item(),
-            'test_f1': F1Score(task='binary')(test_preds, y).item(),
-        }
         majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
         trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
         logging.info(f'Majority vote metrics: {majority_vote_metrics}')
@@ -278,6 +293,7 @@ def hyperparameter_tuning_and_training(
             study = joblib.load(study_filename)
             study_loaded = True
             logging.info(f'Loaded study from {study_filename}')
     if not study_loaded or force_study:
         study.optimize(
@@ -333,12 +349,13 @@ def hyperparameter_tuning_and_training(
     )
     # Retrain N models with the best hyperparameters (measure model uncertainty)
     test_report = []
     test_preds = []
     dfs_stats = get_dataframe_stats(train_val_df, test_df=test_df, active_label=active_label)
     for i in range(n_models_for_test):
         pl.seed_everything(42 + i + 1)
-        _, trainer, metrics, test_pred = train_model(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
             smiles2fp=smiles2fp,
@@ -366,22 +383,12 @@ def hyperparameter_tuning_and_training(
         test_report.append(metrics.copy())
         test_preds.append(test_pred)
     test_report = pd.DataFrame(test_report)
     # Get the majority vote for the test predictions
     if not fast_dev_run:
-        test_preds = torch.stack(test_preds)
-        test_preds, _ = torch.mode(test_preds, dim=0)
-        y = torch.tensor(test_df[active_label].tolist())
-        # Measure the test accuracy and ROC AUC
-        majority_vote_metrics = {
-            'cv_models': False,
-            'test_acc': Accuracy(task='binary')(test_preds, y).item(),
-            'test_roc_auc': AUROC(task='binary')(test_preds, y).item(),
-            'test_precision': Precision(task='binary')(test_preds, y).item(),
-            'test_recall': Recall(task='binary')(test_preds, y).item(),
-            'test_f1': F1Score(task='binary')(test_preds, y).item(),
-        }
         majority_vote_metrics.update(get_dataframe_stats(train_val_df, test_df=test_df, active_label=active_label))
         majority_vote_metrics_cv = study.best_trial.user_attrs['majority_vote_metrics']
         majority_vote_metrics_cv['cv_models'] = True
@@ -408,34 +415,71 @@ def hyperparameter_tuning_and_training(
         logging.info('-' * 100)
         logging.info(f'Ablation study with disabled embeddings: {disabled_embeddings}')
         logging.info('-' * 100)
-        _, _, metrics = train_model(
-            protein2embedding=protein2embedding,
-            cell2embedding=cell2embedding,
-            smiles2fp=smiles2fp,
-            train_df=train_val_df,
-            val_df=test_df,
-            fast_dev_run=fast_dev_run,
-            active_label=active_label,
-            max_epochs=max_epochs,
-            use_logger=False,
-            logger_save_dir=logger_save_dir,
-            logger_name=f'{logger_name}_disabled-{"-".join(disabled_embeddings)}',
-            disabled_embeddings=disabled_embeddings,
-            batch_size=128,
-            apply_scaling=True,
-            **study.best_params,
-        )
-        # Rename the keys in the metrics dictionary
-        metrics = {k.replace('val_', 'test_'): v for k, v in metrics.items()}
-        metrics['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
-        metrics['model_type'] = 'Pytorch'
-        metrics.update(dfs_stats)
-        # Add the training metrics
-        train_metrics = {m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m}
-        metrics.update(train_metrics)
-        ablation_report.append(metrics.copy())
     ablation_report = pd.DataFrame(ablation_report)
     # Add a column with the split_type to all reports

 from typing import Literal, List, Tuple, Optional, Dict
 import logging
+from .pytorch_models import (
+    train_model,
+    PROTAC_Model,
+    evaluate_model,
+)
+from .protac_dataset import get_datasets
 from .sklearn_models import (
     train_sklearn_model,
     suggest_random_forest,
     return stats
+def get_majority_vote_metrics(
+        test_preds: List,
+        test_df: pd.DataFrame,
+        active_label: str = 'Active',
+) -> Dict:
+    """ Compute binary classification metrics for majority-voted predictions.
+
+    Args:
+        test_preds (List): The prediction tensors to stack and vote over.
+        test_df (pd.DataFrame): The DataFrame holding the ground-truth labels.
+        active_label (str): The name of the label column in test_df.
+
+    Returns:
+        Dict: The test accuracy, ROC AUC, precision, recall, and F1 score.
+    """
+    # Element-wise most frequent prediction across the stacked tensors
+    voted_preds, _ = torch.mode(torch.stack(test_preds), dim=0)
+    targets = torch.tensor(test_df[active_label].tolist())
+    # Reported key -> metric class, each evaluated as a binary task
+    metric_classes = {
+        'test_acc': Accuracy,
+        'test_roc_auc': AUROC,
+        'test_precision': Precision,
+        'test_recall': Recall,
+        'test_f1': F1Score,
+    }
+    return {
+        key: metric_cls(task='binary')(voted_preds, targets).item()
+        for key, metric_cls in metric_classes.items()
+    }
 def pytorch_model_objective(
         trial: optuna.Trial,
         protein2embedding: Dict,
     # Get the majority vote for the test predictions
     if test_df is not None and not fast_dev_run:
+        majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
         majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
         trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
         logging.info(f'Majority vote metrics: {majority_vote_metrics}')
             study = joblib.load(study_filename)
             study_loaded = True
             logging.info(f'Loaded study from {study_filename}')
+            logging.info(f'Study best params: {study.best_params}')
     if not study_loaded or force_study:
         study.optimize(
     )
     # Retrain N models with the best hyperparameters (measure model uncertainty)
+    best_models = []
     test_report = []
     test_preds = []
     dfs_stats = get_dataframe_stats(train_val_df, test_df=test_df, active_label=active_label)
     for i in range(n_models_for_test):
         pl.seed_everything(42 + i + 1)
+        model, trainer, metrics, test_pred = train_model(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
             smiles2fp=smiles2fp,
         test_report.append(metrics.copy())
         test_preds.append(test_pred)
+        best_models.append({'model': model, 'trainer': trainer})
     test_report = pd.DataFrame(test_report)
     # Get the majority vote for the test predictions
     if not fast_dev_run:
+        majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
         majority_vote_metrics.update(get_dataframe_stats(train_val_df, test_df=test_df, active_label=active_label))
         majority_vote_metrics_cv = study.best_trial.user_attrs['majority_vote_metrics']
         majority_vote_metrics_cv['cv_models'] = True
         logging.info('-' * 100)
         logging.info(f'Ablation study with disabled embeddings: {disabled_embeddings}')
         logging.info('-' * 100)
+        disabled_embeddings_str = 'disabled ' + ' '.join(disabled_embeddings)
+        test_preds = []
+        for i, model_trainer in enumerate(best_models):
+            logging.info(f'Evaluating model n.{i} on {disabled_embeddings_str}.')
+            model = model_trainer['model']
+            trainer = model_trainer['trainer']
+            _, test_ds, _  = get_datasets(
+                protein2embedding=protein2embedding,
+                cell2embedding=cell2embedding,
+                smiles2fp=smiles2fp,
+                train_df=train_val_df,
+                val_df=test_df,
+                disabled_embeddings=disabled_embeddings,
+                active_label=active_label,
+                scaler=model.scalers,
+                use_single_scaler=model.join_embeddings == 'beginning',
+            )
+            ret = evaluate_model(model, trainer, test_ds, batch_size=128)
+            # NOTE: We are passing the test set as the validation set argument
+            # Rename the keys in the metrics dictionary
+            test_preds.append(ret['val_pred'])
+            ret['val_metrics'] = {k.replace('val_', 'test_'): v for k, v in ret['val_metrics'].items()}
+            ret['val_metrics'].update(dfs_stats)
+            ret['val_metrics']['majority_vote'] = False
+            ret['val_metrics']['model_type'] = 'Pytorch'
+            ret['val_metrics']['disabled_embeddings'] = disabled_embeddings_str
+            ablation_report.append(ret['val_metrics'].copy())
+        # Get the majority vote for the test predictions
+        if not fast_dev_run:
+            majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
+            majority_vote_metrics.update(dfs_stats)
+            majority_vote_metrics['majority_vote'] = True
+            majority_vote_metrics['model_type'] = 'Pytorch'
+            majority_vote_metrics['disabled_embeddings'] = disabled_embeddings_str
+            ablation_report.append(majority_vote_metrics.copy())
+        # _, _, metrics = train_model(
+        #     protein2embedding=protein2embedding,
+        #     cell2embedding=cell2embedding,
+        #     smiles2fp=smiles2fp,
+        #     train_df=train_val_df,
+        #     val_df=test_df,
+        #     fast_dev_run=fast_dev_run,
+        #     active_label=active_label,
+        #     max_epochs=max_epochs,
+        #     use_logger=False,
+        #     logger_save_dir=logger_save_dir,
+        #     logger_name=f'{logger_name}_disabled-{"-".join(disabled_embeddings)}',
+        #     disabled_embeddings=disabled_embeddings,
+        #     batch_size=128,
+        #     apply_scaling=True,
+        #     **study.best_params,
+        # )
+        # # Rename the keys in the metrics dictionary
+        # metrics = {k.replace('val_', 'test_'): v for k, v in metrics.items()}
+        # metrics['disabled_embeddings'] = disabled_embeddings_str
+        # metrics['model_type'] = 'Pytorch'
+        # metrics.update(dfs_stats)
+        # # Add the training metrics
+        # train_metrics = {m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m}
+        # metrics.update(train_metrics)
+        # ablation_report.append(metrics.copy())
     ablation_report = pd.DataFrame(ablation_report)
     # Add a column with the split_type to all reports

protac_degradation_predictor/protac_dataset.py CHANGED Viewed

@@ -1,13 +1,26 @@
 from typing import Literal, List, Tuple, Optional, Dict
-from torch.utils.data import Dataset
-import numpy as np
 from imblearn.over_sampling import SMOTE, ADASYN
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
 class PROTAC_Dataset(Dataset):
     def __init__(
         self,
         protac_df: pd.DataFrame,
@@ -17,6 +30,9 @@ class PROTAC_Dataset(Dataset):
         use_smote: bool = False,
         oversampler: Optional[SMOTE | ADASYN] = None,
         active_label: str = 'Active',
     ):
         """ Initialize the PROTAC dataset
@@ -28,13 +44,17 @@ class PROTAC_Dataset(Dataset):
             use_smote (bool): Whether to use SMOTE for oversampling
             use_ored_activity (bool): Whether to use the 'Active - OR' column
         """
-        # Filter out examples with NaN in active_col column
-        self.data = protac_df  # [~protac_df[active_col].isna()]
         self.protein2embedding = protein2embedding
         self.cell2embedding = cell2embedding
         self.smiles2fp = smiles2fp
         self.active_label = active_label
-        self.use_single_scaler = None
         self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
         self.protein_emb_dim = protein2embedding[list(
@@ -115,15 +135,15 @@ class PROTAC_Dataset(Dataset):
         """
         if use_single_scaler:
             self.use_single_scaler = True
-            scaler = StandardScaler(**scaler_kwargs)
             embeddings = np.hstack([
                 np.array(self.data['Smiles'].tolist()),
                 np.array(self.data['Uniprot'].tolist()),
                 np.array(self.data['E3 Ligase Uniprot'].tolist()),
                 np.array(self.data['Cell Line Identifier'].tolist()),
             ])
-            scaler.fit(embeddings)
-            return scaler
         else:
             self.use_single_scaler = False
             scalers = {}
@@ -137,6 +157,7 @@ class PROTAC_Dataset(Dataset):
             scalers['E3 Ligase Uniprot'].fit(np.stack(self.data['E3 Ligase Uniprot'].to_numpy()))
             scalers['Cell Line Identifier'].fit(np.stack(self.data['Cell Line Identifier'].to_numpy()))
             return scalers
     def apply_scaling(self, scalers: dict, use_single_scaler: bool = False):
@@ -190,11 +211,359 @@ class PROTAC_Dataset(Dataset):
         return len(self.data)
     def __getitem__(self, idx):
         elem = {
-            'smiles_emb': self.data['Smiles'].iloc[idx],
-            'poi_emb': self.data['Uniprot'].iloc[idx],
-            'e3_emb': self.data['E3 Ligase Uniprot'].iloc[idx],
-            'cell_emb': self.data['Cell Line Identifier'].iloc[idx],
             'active': self.data[self.active_label].iloc[idx],
         }
-        return elem

 from typing import Literal, List, Tuple, Optional, Dict
+from collections import defaultdict
+from .data_utils import (
+    get_fingerprint,
+    is_active,
+    load_cell2embedding,
+    load_protein2embedding,
+)
+from torch.utils.data import Dataset, DataLoader
 from imblearn.over_sampling import SMOTE, ADASYN
+from sklearn.preprocessing import StandardScaler, OrdinalEncoder
+import numpy as np
 import pandas as pd
+import pytorch_lightning as pl
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
 class PROTAC_Dataset(Dataset):
     def __init__(
         self,
         protac_df: pd.DataFrame,
         use_smote: bool = False,
         oversampler: Optional[SMOTE | ADASYN] = None,
         active_label: str = 'Active',
+        disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
+        scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
+        use_single_scaler: Optional[bool] = None,
     ):
         """ Initialize the PROTAC dataset
             use_smote (bool): Whether to use SMOTE for oversampling
             use_ored_activity (bool): Whether to use the 'Active - OR' column
         """
+        # Filter out examples with NaN in active_label column
+        self.data = protac_df  # [~protac_df[active_label].isna()]
         self.protein2embedding = protein2embedding
         self.cell2embedding = cell2embedding
         self.smiles2fp = smiles2fp
         self.active_label = active_label
+        self.disabled_embeddings = disabled_embeddings
+        # Scaling parameters
+        self.scaler = scaler
+        self.use_single_scaler = use_single_scaler
         self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
         self.protein_emb_dim = protein2embedding[list(
         """
         if use_single_scaler:
             self.use_single_scaler = True
+            self.scaler = StandardScaler(**scaler_kwargs)
             embeddings = np.hstack([
                 np.array(self.data['Smiles'].tolist()),
                 np.array(self.data['Uniprot'].tolist()),
                 np.array(self.data['E3 Ligase Uniprot'].tolist()),
                 np.array(self.data['Cell Line Identifier'].tolist()),
             ])
+            self.scaler.fit(embeddings)
+            return self.scaler
         else:
             self.use_single_scaler = False
             scalers = {}
             scalers['E3 Ligase Uniprot'].fit(np.stack(self.data['E3 Ligase Uniprot'].to_numpy()))
             scalers['Cell Line Identifier'].fit(np.stack(self.data['Cell Line Identifier'].to_numpy()))
+            self.scaler = scalers
             return scalers
     def apply_scaling(self, scalers: dict, use_single_scaler: bool = False):
         return len(self.data)
     def __getitem__(self, idx):
+        """ Return the sample at position idx as a dictionary.
+
+        Embeddings listed in self.disabled_embeddings are replaced by random
+        vectors that are sampled anew on every call; all other embeddings
+        are read from self.data.
+
+        Args:
+            idx: The positional index of the sample (used with .iloc).
+
+        Returns:
+            dict: The 'smiles_emb', 'poi_emb', 'e3_emb', and 'cell_emb'
+                embeddings together with the 'active' label.
+        """
+        # NOTE(review): The random vectors are rescaled only when per-embedding
+        # scalers are used; with a single joint scaler they are returned
+        # unscaled -- confirm that this is intended.
+        if 'smiles' in self.disabled_embeddings:
+            # Uniformly sample a binary vector for the fingerprint
+            smiles_emb = np.random.randint(0, 2, size=self.smiles_emb_dim).astype(np.float32)
+            if not self.use_single_scaler and self.scaler is not None:
+                smiles_emb = smiles_emb[np.newaxis, :]
+                smiles_emb = self.scaler['Smiles'].transform(smiles_emb).flatten()
+        else:
+            smiles_emb = self.data['Smiles'].iloc[idx]
+        if 'poi' in self.disabled_embeddings:
+            # Uniformly sample a vector for the protein
+            poi_emb = np.random.rand(self.protein_emb_dim).astype(np.float32)
+            if not self.use_single_scaler and self.scaler is not None:
+                poi_emb = poi_emb[np.newaxis, :]
+                poi_emb = self.scaler['Uniprot'].transform(poi_emb).flatten()
+        else:
+            poi_emb = self.data['Uniprot'].iloc[idx]
+        if 'e3' in self.disabled_embeddings:
+            # Uniformly sample a vector for the E3 ligase
+            e3_emb = np.random.rand(self.protein_emb_dim).astype(np.float32)
+            if not self.use_single_scaler and self.scaler is not None:
+                # Add extra dimension for compatibility with the scaler
+                e3_emb = e3_emb[np.newaxis, :]
+                e3_emb = self.scaler['E3 Ligase Uniprot'].transform(e3_emb)
+                e3_emb = e3_emb.flatten()
+        else:
+            e3_emb = self.data['E3 Ligase Uniprot'].iloc[idx]
+        if 'cell' in self.disabled_embeddings:
+            # Uniformly sample a vector for the cell line
+            cell_emb = np.random.rand(self.cell_emb_dim).astype(np.float32)
+            if not self.use_single_scaler and self.scaler is not None:
+                cell_emb = cell_emb[np.newaxis, :]
+                cell_emb = self.scaler['Cell Line Identifier'].transform(cell_emb).flatten()
+        else:
+            cell_emb = self.data['Cell Line Identifier'].iloc[idx]
         elem = {
+            'smiles_emb': smiles_emb,
+            'poi_emb': poi_emb,
+            'e3_emb': e3_emb,
+            'cell_emb': cell_emb,
             'active': self.data[self.active_label].iloc[idx],
         }
+        return elem
+def get_datasets(
+        train_df: pd.DataFrame,
+        val_df: pd.DataFrame,
+        test_df: Optional[pd.DataFrame] = None,
+        protein2embedding: Dict = None,
+        cell2embedding: Dict = None,
+        smiles2fp: Dict = None,
+        use_smote: bool = True,
+        smote_k_neighbors: int = 5,
+        active_label: str = 'Active',
+        disabled_embeddings: Optional[List[Literal['smiles', 'poi', 'e3', 'cell']]] = None,
+        scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
+        use_single_scaler: Optional[bool] = None,
+) -> Tuple[PROTAC_Dataset, PROTAC_Dataset, Optional[PROTAC_Dataset]]:
+    """ Get the datasets for training the PROTAC model.
+
+    Args:
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        test_df (pd.DataFrame): The optional test set.
+        protein2embedding (dict): The protein to embedding dictionary.
+        cell2embedding (dict): The cell line to embedding dictionary.
+        smiles2fp (dict): The SMILES to fingerprint dictionary.
+        use_smote (bool): Whether to give the training dataset a SMOTE oversampler.
+        smote_k_neighbors (int): The number of neighbors to use for SMOTE.
+        active_label (str): The column containing the active/inactive information.
+        disabled_embeddings (list): The list of embeddings to disable. None means no embedding is disabled.
+        scaler (StandardScaler | dict): The scaler(s) to hand to the datasets.
+        use_single_scaler (bool): Whether a single scaler is used for all features.
+
+    Returns:
+        tuple: The training, validation, and test datasets. The test dataset is None when test_df is None.
+    """
+    # Avoid sharing one mutable default list across calls
+    if disabled_embeddings is None:
+        disabled_embeddings = []
+    # Only build the oversampler when it is actually requested
+    oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42) if use_smote else None
+    train_ds = PROTAC_Dataset(
+        train_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        use_smote=use_smote,
+        oversampler=oversampler,
+        active_label=active_label,
+        disabled_embeddings=disabled_embeddings,
+        scaler=scaler,
+        use_single_scaler=use_single_scaler,
+    )
+    # The evaluation datasets take the scaling settings of the training
+    # dataset and fall back to the given ones when those are unset
+    eval_scaler = train_ds.scaler if train_ds.scaler is not None else scaler
+    if train_ds.use_single_scaler is not None:
+        eval_use_single_scaler = train_ds.use_single_scaler
+    else:
+        eval_use_single_scaler = use_single_scaler
+    eval_kwargs = dict(
+        active_label=active_label,
+        disabled_embeddings=disabled_embeddings,
+        scaler=eval_scaler,
+        use_single_scaler=eval_use_single_scaler,
+    )
+    val_ds = PROTAC_Dataset(
+        val_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        **eval_kwargs,
+    )
+    test_ds = None
+    if test_df is not None:
+        test_ds = PROTAC_Dataset(
+            test_df,
+            protein2embedding,
+            cell2embedding,
+            smiles2fp,
+            **eval_kwargs,
+        )
+    return train_ds, val_ds, test_ds
+class PROTAC_DataModule(pl.LightningDataModule):
+    """ PyTorch Lightning DataModule for the PROTAC dataset.
+    TODO: Work in progress. It would be nice to wrap all information into a
+    single class, but it is not clear how to do it yet due to cross-validation
+    and the need to split the data into training, validation, and test sets
+    accordingly.
+    Args:
+        protac_csv_filepath (str): The path to the PROTAC CSV file.
+        protein2embedding_filepath (str): The path to the protein to embedding dictionary.
+        cell2embedding_filepath (str): The path to the cell line to embedding dictionary.
+        pDC50_threshold (float): The threshold for the pDC50 value to consider a PROTAC active.
+        Dmax_threshold (float): The threshold for the Dmax value to consider a PROTAC active.
+        use_smote (bool): Whether to use SMOTE for oversampling.
+        smote_k_neighbors (int): The number of neighbors to use for SMOTE.
+        active_label (str): The column containing the active/inactive information.
+        disabled_embeddings (list): The list of embeddings to disable.
+        scaler (StandardScaler | dict): The scaler to use for the embeddings.
+        use_single_scaler (bool): Whether to use a single scaler for all features.
+    """
+    def __init__(
+        self,
+        protac_csv_filepath: str,
+        protein2embedding_filepath: str,
+        cell2embedding_filepath: str,
+        pDC50_threshold: float = 6.0,
+        Dmax_threshold: float = 0.6,
+        use_smote: bool = True,
+        smote_k_neighbors: int = 5,
+        active_label: str = 'Active',
+        disabled_embeddings: Optional[List[Literal['smiles', 'poi', 'e3', 'cell']]] = None,
+        scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
+        use_single_scaler: Optional[bool] = None,
+    ):
+        """ Load the PROTAC data and the embedding dictionaries.
+
+        The CSV file is read from protac_csv_filepath, the activity label is
+        computed from the 'DC50 (nM)' and 'Dmax (%)' columns, and the
+        fingerprints and embedding dictionaries are loaded. The dataset
+        settings are stored on the instance because setup() reads them.
+        """
+        super(PROTAC_DataModule, self).__init__()
+        # Keep the settings that setup() forwards to get_datasets()
+        self.use_smote = use_smote
+        self.smote_k_neighbors = smote_k_neighbors
+        self.active_label = active_label
+        # Copy the list so that no default or caller-owned list is shared
+        self.disabled_embeddings = [] if disabled_embeddings is None else list(disabled_embeddings)
+        self.scaler = scaler
+        self.use_single_scaler = use_single_scaler
+        # Load the PROTAC dataset from the given path (it used to be hard-coded)
+        self.protac_df = pd.read_csv(protac_csv_filepath)
+        # Map E3 Ligase Iap to IAP
+        self.protac_df['E3 Ligase'] = self.protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+        self.protac_df[active_label] = self.protac_df.apply(
+            lambda x: is_active(
+                x['DC50 (nM)'],
+                x['Dmax (%)'],
+                pDC50_threshold=pDC50_threshold,
+                Dmax_threshold=Dmax_threshold,
+            ),
+            axis=1,
+        )
+        self.smiles2fp, self.protac_df = self.get_smiles2fp_and_avg_tanimoto(self.protac_df)
+        # Keep only the rows for which an activity label is available
+        self.active_df = self.protac_df[self.protac_df[active_label].notna()].copy()
+        # Load embedding dictionaries
+        self.protein2embedding = load_protein2embedding(protein2embedding_filepath)
+        self.cell2embedding = load_cell2embedding(cell2embedding_filepath)
+    def setup(self, stage: str):
+        """ Create the training, validation, and test datasets.
+
+        Args:
+            stage (str): The stage name passed by the caller. It is not used here.
+        """
+        # NOTE(review): self.train_df, self.val_df, and self.test_df have to be
+        # assigned before this is called; no assignment is visible in this class.
+        datasets = get_datasets(
+            train_df=self.train_df,
+            val_df=self.val_df,
+            test_df=self.test_df,
+            protein2embedding=self.protein2embedding,
+            cell2embedding=self.cell2embedding,
+            smiles2fp=self.smiles2fp,
+            use_smote=self.use_smote,
+            smote_k_neighbors=self.smote_k_neighbors,
+            active_label=self.active_label,
+            disabled_embeddings=self.disabled_embeddings,
+            scaler=self.scaler,
+            use_single_scaler=self.use_single_scaler,
+        )
+        self.train_ds, self.val_ds, self.test_ds = datasets
+    def train_dataloader(self):
+        """ Return a shuffling DataLoader over self.train_ds with a batch size of 32. """
+        return DataLoader(self.train_ds, batch_size=32, shuffle=True)
+    def val_dataloader(self):
+        """ Return a DataLoader over self.val_ds with a batch size of 32. """
+        return DataLoader(self.val_ds, batch_size=32)
+    def test_dataloader(self):
+        """ Return a DataLoader over self.test_ds with a batch size of 32. """
+        return DataLoader(self.test_ds, batch_size=32)
+    @staticmethod
+    def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
+        """ Randomly pick the rows that form the test set.
+        Args:
+            active_df (pd.DataFrame): The DataFrame to sample the test rows from.
+            test_split (float): The fraction of the rows of active_df to sample.
+        Returns:
+            pd.Index: The index labels of the sampled rows.
+        """
+        # The seed is fixed, so the same rows are drawn for the same input
+        sampled_rows = active_df.sample(frac=test_split, random_state=42)
+        return sampled_rows.index
+    @staticmethod
+    def get_e3_ligase_split_indices(active_df: pd.DataFrame) -> pd.Index:
+        """ Select the test set by E3 ligase.
+        Rows whose 'E3 Ligase' is neither 'VHL' nor 'CRBN' form the test set.
+        Args:
+            active_df (pd.DataFrame): The DataFrame with an 'E3 Ligase' column. An integer 'E3 Group' column is added to it in place.
+        Returns:
+            pd.Index: The index labels of the test rows.
+        """
+        # Ordinal-encode the ligase names into the 'E3 Group' column
+        e3_encoder = OrdinalEncoder()
+        e3_codes = e3_encoder.fit_transform(active_df[['E3 Ligase']])
+        active_df['E3 Group'] = e3_codes.astype(int)
+        # Everything that is neither VHL nor CRBN goes to the test set
+        is_vhl_or_crbn = active_df['E3 Ligase'].isin(['VHL', 'CRBN'])
+        return active_df[~is_vhl_or_crbn].index
+    @staticmethod
+    def get_smiles2fp_and_avg_tanimoto(protac_df: pd.DataFrame) -> tuple:
+        """ Build the SMILES fingerprints and add the average Tanimoto distance.
+        Args:
+            protac_df (pd.DataFrame): The DataFrame with a 'Smiles' column. An 'Avg Tanimoto' column is added to it in place.
+        Returns:
+            tuple: The SMILES to fingerprint dictionary (as NumPy arrays) and protac_df.
+        """
+        smiles_list = protac_df['Smiles'].unique().tolist()
+        fp_by_smiles = {smi: get_fingerprint(smi) for smi in smiles_list}
+        fp_list = list(fp_by_smiles.values())
+        distances = defaultdict(list)
+        # Compare each fingerprint only with itself and the ones after it,
+        # then mirror every off-diagonal distance to the other SMILES
+        for row, smi in enumerate(smiles_list):
+            similarities = DataStructs.BulkTanimotoSimilarity(fp_list[row], fp_list[row:])
+            for offset, similarity in enumerate(similarities):
+                dist = 1 - similarity  # Stored as a distance
+                distances[smi].append(dist)
+                if offset:
+                    # offset == 0 is the self-comparison and has no mirror entry
+                    distances[smiles_list[row + offset]].append(dist)
+        # Average Tanimoto distance of each unique SMILES to all SMILES
+        mean_distance = {smi: np.mean(vals) for smi, vals in distances.items()}
+        protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(mean_distance)
+        fp_arrays = {smi: np.array(fp) for smi, fp in fp_by_smiles.items()}
+        return fp_arrays, protac_df
+    @staticmethod
+    def get_tanimoto_split_indices(
+            active_df: pd.DataFrame,
+            active_label: str,
+            test_split: float,
+            n_bins_tanimoto: int = 200,
+    ) -> pd.Index:
+        """ Get the indices of the test set using the Tanimoto-based split.
+        Args:
+            active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
+            n_bins_tanimoto (int): The number of bins to use for the Tanimoto similarity.
+        Returns:
+            pd.Index: The indices of the test set.
+        """
+        tanimoto_groups = pd.cut(active_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
+        encoder = OrdinalEncoder()
+        active_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
+        # Sort the groups so that samples with the highest tanimoto similarity,
+        # i.e., the "less similar" ones, are placed in the test set first
+        tanimoto_groups = active_df.groupby('Tanimoto Group')['Avg Tanimoto'].mean().sort_values(ascending=False).index
+        test_df = []
+        # For each group, get the number of active and inactive entries. Then, add those
+        # entries to the test_df if: 1) the test_df lenght + the group entries is less
+        # 20% of the active_df lenght, and 2) the percentage of True and False entries
+        # in the active_label in test_df is roughly 50%.
+        for group in tanimoto_groups:
+            group_df = active_df[active_df['Tanimoto Group'] == group]
+            if test_df == []:
+                test_df.append(group_df)
+                continue
+            num_entries = len(group_df)
+            num_active_group = group_df[active_label].sum()
+            num_inactive_group = num_entries - num_active_group
+            tmp_test_df = pd.concat(test_df)
+            num_entries_test = len(tmp_test_df)
+            num_active_test = tmp_test_df[active_label].sum()
+            num_inactive_test = num_entries_test - num_active_test
+            # Check if the group entries can be added to the test_df
+            if num_entries_test + num_entries < test_split * len(active_df):
+                # Add anything at the beggining
+                if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                    test_df.append(group_df)
+                    continue
+                # Be more selective and make sure that the percentage of active and
+                # inactive is balanced
+                if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                    if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                        test_df.append(group_df)
+        test_df = pd.concat(test_df)
+        return test_df.index
+    @staticmethod
+    def get_target_split_indices(active_df: pd.DataFrame, active_label: str, test_split: float) -> pd.Index:
+        """ Get the indices of the test set using the target-based split.
+        Args:
+            active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
+            active_label (str): The column containing the active/inactive information.
+            test_split (float): The percentage of the active PROTACs to use as the test set.
+        Returns:
+            pd.Index: The indices of the test set.
+        """
+        encoder = OrdinalEncoder()
+        active_df['Uniprot Group'] = encoder.fit_transform(active_df[['Uniprot']]).astype(int)
+        test_df = []
+        # For each group, get the number of active and inactive entries. Then, add those
+        # entries to the test_df if: 1) the test_df lenght + the group entries is less
+        # 20% of the active_df lenght, and 2) the percentage of True and False entries
+        # in the active_label in test_df is roughly 50%.
+        # Start the loop from the groups containing the smallest number of entries.
+        for group in reversed(active_df['Uniprot'].value_counts().index):
+            group_df = active_df[active_df['Uniprot'] == group]
+            if test_df == []:
+                test_df.append(group_df)
+                continue
+            num_entries = len(group_df)
+            num_active_group = group_df[active_label].sum()
+            num_inactive_group = num_entries - num_active_group
+            tmp_test_df = pd.concat(test_df)
+            num_entries_test = len(tmp_test_df)
+            num_active_test = tmp_test_df[active_label].sum()
+            num_inactive_test = num_entries_test - num_active_test
+            # Check if the group entries can be added to the test_df
+            if num_entries_test + num_entries < test_split * len(active_df):
+                # Add anything at the beggining
+                if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                    test_df.append(group_df)
+                    continue
+                # Be more selective and make sure that the percentage of active and
+                # inactive is balanced
+                if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                    if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                        test_df.append(group_df)
+        test_df = pd.concat(test_df)
+        return test_df.index

protac_degradation_predictor/pytorch_models.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pickle
 import logging
 from typing import Literal, List, Tuple, Optional, Dict
-from .protac_dataset import PROTAC_Dataset
 from .config import config
 import pandas as pd
@@ -38,7 +38,7 @@ class PROTAC_Predictor(nn.Module):
         dropout: float = 0.2,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         use_batch_norm: bool = False,
-        disabled_embeddings: list = [],
     ):
         """ Initialize the PROTAC model.
@@ -69,17 +69,17 @@ class PROTAC_Predictor(nn.Module):
         # and can be summed on a "similar scale".
         if self.join_embeddings != 'beginning':
             if 'poi' not in self.disabled_embeddings:
-                self.poi_emb = nn.Sequential(
                     nn.Linear(poi_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
             if 'e3' not in self.disabled_embeddings:
-                self.e3_emb = nn.Sequential(
                     nn.Linear(e3_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
             if 'cell' not in self.disabled_embeddings:
-                self.cell_emb = nn.Sequential(
                     nn.Linear(cell_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
@@ -95,12 +95,12 @@ class PROTAC_Predictor(nn.Module):
             joint_dim += poi_emb_dim if 'poi' not in self.disabled_embeddings else 0
             joint_dim += e3_emb_dim if 'e3' not in self.disabled_embeddings else 0
             joint_dim += cell_emb_dim if 'cell' not in self.disabled_embeddings else 0
         elif self.join_embeddings == 'concat':
             joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
         elif self.join_embeddings == 'sum':
             joint_dim = hidden_dim
-        self.fc0 = nn.Linear(joint_dim, joint_dim)
         self.fc1 = nn.Linear(joint_dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim, hidden_dim)
         self.fc3 = nn.Linear(hidden_dim, 1)
@@ -125,11 +125,11 @@ class PROTAC_Predictor(nn.Module):
             x = self.dropout(F.relu(self.fc0(x)))
         else:
             if 'poi' not in self.disabled_embeddings:
-                embeddings.append(self.poi_emb(poi_emb))
             if 'e3' not in self.disabled_embeddings:
-                embeddings.append(self.e3_emb(e3_emb))
             if 'cell' not in self.disabled_embeddings:
-                embeddings.append(self.cell_emb(cell_emb))
             if 'smiles' not in self.disabled_embeddings:
                 embeddings.append(self.smiles_emb(smiles_emb))
             if self.join_embeddings == 'concat':
@@ -163,7 +163,7 @@ class PROTAC_Model(pl.LightningModule):
         train_dataset: PROTAC_Dataset = None,
         val_dataset: PROTAC_Dataset = None,
         test_dataset: PROTAC_Dataset = None,
-        disabled_embeddings: list = [],
         apply_scaling: bool = True,
     ):
         """ Initialize the PROTAC Pytorch Lightning model.
@@ -217,7 +217,7 @@ class PROTAC_Model(pl.LightningModule):
             dropout=dropout,
             join_embeddings=join_embeddings,
             use_batch_norm=use_batch_norm,
-            disabled_embeddings=disabled_embeddings,
         )
         stages = ['train_metrics', 'val_metrics', 'test_metrics']
@@ -429,7 +429,7 @@ def train_model(
         logger_name: str = 'protac',
         enable_checkpointing: bool = False,
         checkpoint_model_name: str = 'protac',
-        disabled_embeddings: List[str] = [],
         return_predictions: bool = False,
 ) -> tuple:
     """ Train a PROTAC model using the given datasets and hyperparameters.
@@ -453,31 +453,18 @@ def train_model(
     Returns:
         tuple: The trained model, the trainer, and the metrics.
     """
-    oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
-    train_ds = PROTAC_Dataset(
         train_df,
-        protein2embedding,
-        cell2embedding,
-        smiles2fp,
-        use_smote=use_smote,
-        oversampler=oversampler if use_smote else None,
-        active_label=active_label,
-    )
-    val_ds = PROTAC_Dataset(
         val_df,
         protein2embedding,
         cell2embedding,
         smiles2fp,
         active_label=active_label,
     )
-    if test_df is not None:
-        test_ds = PROTAC_Dataset(
-            test_df,
-            protein2embedding,
-            cell2embedding,
-            smiles2fp,
-            active_label=active_label,
-        )
     loggers = [
         pl.loggers.TensorBoardLogger(
             save_dir=logger_save_dir,
@@ -505,7 +492,7 @@ def train_model(
         ),
         pl.callbacks.EarlyStopping(
             monitor='val_loss',
-            patience=10, # Original: 5
             mode='min',
             verbose=False,
         ),
@@ -586,6 +573,36 @@ def train_model(
     return model, trainer, metrics
 def load_model(
         ckpt_path: str,
 ) -> PROTAC_Model:

 import logging
 from typing import Literal, List, Tuple, Optional, Dict
+from .protac_dataset import PROTAC_Dataset, get_datasets
 from .config import config
 import pandas as pd
         dropout: float = 0.2,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         use_batch_norm: bool = False,
+        disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
     ):
         """ Initialize the PROTAC model.
         # and can be summed on a "similar scale".
         if self.join_embeddings != 'beginning':
             if 'poi' not in self.disabled_embeddings:
+                self.poi_fc = nn.Sequential(
                     nn.Linear(poi_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
             if 'e3' not in self.disabled_embeddings:
+                self.e3_fc = nn.Sequential(
                     nn.Linear(e3_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
             if 'cell' not in self.disabled_embeddings:
+                self.cell_fc = nn.Sequential(
                     nn.Linear(cell_emb_dim, hidden_dim),
                     nn.Softmax(dim=1),
                 )
             joint_dim += poi_emb_dim if 'poi' not in self.disabled_embeddings else 0
             joint_dim += e3_emb_dim if 'e3' not in self.disabled_embeddings else 0
             joint_dim += cell_emb_dim if 'cell' not in self.disabled_embeddings else 0
+            self.fc0 = nn.Linear(joint_dim, joint_dim)
         elif self.join_embeddings == 'concat':
             joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
         elif self.join_embeddings == 'sum':
             joint_dim = hidden_dim
         self.fc1 = nn.Linear(joint_dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim, hidden_dim)
         self.fc3 = nn.Linear(hidden_dim, 1)
             x = self.dropout(F.relu(self.fc0(x)))
         else:
             if 'poi' not in self.disabled_embeddings:
+                embeddings.append(self.poi_fc(poi_emb))
             if 'e3' not in self.disabled_embeddings:
+                embeddings.append(self.e3_fc(e3_emb))
             if 'cell' not in self.disabled_embeddings:
+                embeddings.append(self.cell_fc(cell_emb))
             if 'smiles' not in self.disabled_embeddings:
                 embeddings.append(self.smiles_emb(smiles_emb))
             if self.join_embeddings == 'concat':
         train_dataset: PROTAC_Dataset = None,
         val_dataset: PROTAC_Dataset = None,
         test_dataset: PROTAC_Dataset = None,
+        disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
         apply_scaling: bool = True,
     ):
         """ Initialize the PROTAC Pytorch Lightning model.
             dropout=dropout,
             join_embeddings=join_embeddings,
             use_batch_norm=use_batch_norm,
+            disabled_embeddings=[], # NOTE: This is handled in the PROTAC_Dataset classes
         )
         stages = ['train_metrics', 'val_metrics', 'test_metrics']
         logger_name: str = 'protac',
         enable_checkpointing: bool = False,
         checkpoint_model_name: str = 'protac',
+        disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
         return_predictions: bool = False,
 ) -> tuple:
     """ Train a PROTAC model using the given datasets and hyperparameters.
     Returns:
         tuple: The trained model, the trainer, and the metrics.
     """
+    train_ds, val_ds, test_ds = get_datasets(
         train_df,
         val_df,
+        test_df,
         protein2embedding,
         cell2embedding,
         smiles2fp,
+        use_smote=use_smote,
+        smote_k_neighbors=smote_k_neighbors,
         active_label=active_label,
+        disabled_embeddings=disabled_embeddings,
     )
     loggers = [
         pl.loggers.TensorBoardLogger(
             save_dir=logger_save_dir,
         ),
         pl.callbacks.EarlyStopping(
             monitor='val_loss',
+            patience=5, # Original: 5
             mode='min',
             verbose=False,
         ),
     return model, trainer, metrics
+def evaluate_model(
+        model: PROTAC_Model,
+        trainer: pl.Trainer,
+        val_ds: PROTAC_Dataset,
+        test_ds: Optional[PROTAC_Dataset] = None,
+        batch_size: int = 128,
+) -> tuple:
+    """ Evaluate a PROTAC model using the given datasets. """
+    ret = {}
+    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
+    val_metrics = trainer.validate(model, val_dl, verbose=False)[0]
+    val_metrics = {m: v for m, v in val_metrics.items() if 'val' in m}
+    # Get predictions on validation set
+    val_pred = torch.cat(trainer.predict(model, val_dl)).squeeze()
+    ret['val_metrics'] = val_metrics
+    ret['val_pred'] = val_pred
+    if test_ds is not None:
+        test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
+        test_metrics = trainer.test(model, test_dl, verbose=False)[0]
+        test_metrics = {m: v for m, v in test_metrics.items() if 'test' in m}
+        # Get predictions on test set
+        test_pred = torch.cat(trainer.predict(model, test_dl)).squeeze()
+        ret['test_metrics'] = test_metrics
+        ret['test_pred'] = test_pred
+    return ret
 def load_model(
         ckpt_path: str,
 ) -> PROTAC_Model:

src/plot_experiment_results.py CHANGED Viewed

@@ -12,7 +12,7 @@ import numpy as np
 palette = ['#83B8FE', '#FFA54C', '#94ED67', '#FF7FFF']
-def plot_training_curves(df, split_type, stage='test'):
     Stage = 'Test' if stage == 'test' else 'Validation'
     # Clean the data
@@ -22,20 +22,28 @@ def plot_training_curves(df, split_type, stage='test'):
     df = df.apply(pd.to_numeric, errors='coerce')
     # Group by 'epoch' and aggregate by mean
-    epoch_data = df.groupby('epoch').mean()
     fig, ax = plt.subplots(3, 1, figsize=(10, 15))
     # Plot training loss
-    ax[0].plot(epoch_data.index, epoch_data['train_loss_epoch'], label='Training Loss')
-    ax[0].plot(epoch_data.index, epoch_data[f'{stage}_loss'], label=f'{Stage} Loss', linestyle='--')
     ax[0].set_ylabel('Loss')
     ax[0].legend(loc='lower right')
     ax[0].grid(axis='both', alpha=0.5)
     # Plot training accuracy
-    ax[1].plot(epoch_data.index, epoch_data['train_acc_epoch'], label='Training Accuracy')
-    ax[1].plot(epoch_data.index, epoch_data[f'{stage}_acc'], label=f'{Stage} Accuracy', linestyle='--')
     ax[1].set_ylabel('Accuracy')
     ax[1].legend(loc='lower right')
     ax[1].grid(axis='both', alpha=0.5)
@@ -45,8 +53,10 @@ def plot_training_curves(df, split_type, stage='test'):
     ax[1].yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))
     # Plot training ROC-AUC
-    ax[2].plot(epoch_data.index, epoch_data['train_roc_auc_epoch'], label='Training ROC-AUC')
-    ax[2].plot(epoch_data.index, epoch_data[f'{stage}_roc_auc'], label=f'{Stage} ROC-AUC', linestyle='--')
     ax[2].set_ylabel('ROC-AUC')
     ax[2].legend(loc='lower right')
     ax[2].grid(axis='both', alpha=0.5)
@@ -167,6 +177,7 @@ def plot_ablation_study(report):
         'disabled poi',
         'disabled e3',
         'disabled cell',
         'disabled poi e3 smiles',
         'disabled poi e3 cell',
     ]
@@ -226,6 +237,7 @@ def plot_ablation_study(report):
             'disabled e3': 'Disabled E3 information',
             'disabled poi': 'Disabled target information',
             'disabled cell': 'Disabled cell information',
             'disabled poi e3 smiles': 'Disabled compound, E3, and target info\n(only cell information left)',
             'disabled poi e3 cell': 'Disabled cell, E3, and target info\n(only compound information left)',
         })
@@ -323,6 +335,7 @@ def main():
     for split_type in ['random', 'tanimoto', 'uniprot']:
         for i in range(n_models_for_test):
             logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
             metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
@@ -330,36 +343,41 @@ def main():
             # Rename 'val_' columns to 'test_' columns
             metrics = metrics.rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
             plot_training_curves(metrics, f'{split_type}_best_model_n{i}')
         for i in range(cv_n_folds):
             # logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
             logs_dir = f'logs_{report_base_name}_{split_type}_{split_type}_cv_model_fold{i}'
             metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
             metrics['fold'] = i
             plot_training_curves(metrics, f'{split_type}_cv_model_fold{i}', stage='val')
-    plot_performance_metrics(
-        reports['cv_train'],
-        reports['test'],
-        title=f'{active_name}_metrics',
-    )
-    plot_performance_metrics(
-        reports['cv_train'],
-        reports['majority_vote'][reports['majority_vote']['cv_models'] == False],
-        title=f'{active_name}_metrics_majority_vote',
-    )
-    plot_majority_voting_performance(reports['majority_vote'])
-    reports['test']['disabled_embeddings'] = pd.NA
-    plot_ablation_study(pd.concat([
-        reports['ablation'],
-        reports['test'],
-    ]))
-    # Plot hyperparameter optimization results to markdown
-    print(reports['hparam'][['split_type', 'hidden_dim', 'learning_rate', 'dropout', 'use_smote', 'smote_k_neighbors']].to_markdown(index=False))
 if __name__ == '__main__':

 palette = ['#83B8FE', '#FFA54C', '#94ED67', '#FF7FFF']
+def plot_training_curves(df, split_type, stage='test', multimodels=False, groupby='model_id'):
     Stage = 'Test' if stage == 'test' else 'Validation'
     # Clean the data
     df = df.apply(pd.to_numeric, errors='coerce')
     # Group by 'epoch' and aggregate by mean
+    if multimodels:
+        epoch_data = df.groupby([groupby, 'epoch']).mean().reset_index()
+    else:
+        epoch_data = df.groupby('epoch').mean().reset_index()
     fig, ax = plt.subplots(3, 1, figsize=(10, 15))
     # Plot training loss
+    # ax[0].plot(epoch_data.index, epoch_data['train_loss_epoch'], label='Training Loss')
+    # ax[0].plot(epoch_data.index, epoch_data[f'{stage}_loss'], label=f'{Stage} Loss', linestyle='--')
+    sns.lineplot(data=epoch_data, x='epoch', y='train_loss_epoch', ax=ax[0], label='Training Loss')
+    sns.lineplot(data=epoch_data, x='epoch', y=f'{stage}_loss', ax=ax[0], label=f'{Stage} Loss', linestyle='--')
     ax[0].set_ylabel('Loss')
     ax[0].legend(loc='lower right')
     ax[0].grid(axis='both', alpha=0.5)
     # Plot training accuracy
+    # ax[1].plot(epoch_data.index, epoch_data['train_acc_epoch'], label='Training Accuracy')
+    # ax[1].plot(epoch_data.index, epoch_data[f'{stage}_acc'], label=f'{Stage} Accuracy', linestyle='--')
+    sns.lineplot(data=epoch_data, x='epoch', y='train_acc_epoch', ax=ax[1], label='Training Accuracy')
+    sns.lineplot(data=epoch_data, x='epoch', y=f'{stage}_acc', ax=ax[1], label=f'{Stage} Accuracy', linestyle='--')
     ax[1].set_ylabel('Accuracy')
     ax[1].legend(loc='lower right')
     ax[1].grid(axis='both', alpha=0.5)
     ax[1].yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))
     # Plot training ROC-AUC
+    # ax[2].plot(epoch_data.index, epoch_data['train_roc_auc_epoch'], label='Training ROC-AUC')
+    # ax[2].plot(epoch_data.index, epoch_data[f'{stage}_roc_auc'], label=f'{Stage} ROC-AUC', linestyle='--')
+    sns.lineplot(data=epoch_data, x='epoch', y='train_roc_auc_epoch', ax=ax[2], label='Training ROC-AUC')
+    sns.lineplot(data=epoch_data, x='epoch', y=f'{stage}_roc_auc', ax=ax[2], label=f'{Stage} ROC-AUC', linestyle='--')
     ax[2].set_ylabel('ROC-AUC')
     ax[2].legend(loc='lower right')
     ax[2].grid(axis='both', alpha=0.5)
         'disabled poi',
         'disabled e3',
         'disabled cell',
+        'disabled poi e3',
         'disabled poi e3 smiles',
         'disabled poi e3 cell',
     ]
             'disabled e3': 'Disabled E3 information',
             'disabled poi': 'Disabled target information',
             'disabled cell': 'Disabled cell information',
+            'disabled poi e3': 'Disabled E3 and target info',
             'disabled poi e3 smiles': 'Disabled compound, E3, and target info\n(only cell information left)',
             'disabled poi e3 cell': 'Disabled cell, E3, and target info\n(only compound information left)',
         })
     for split_type in ['random', 'tanimoto', 'uniprot']:
+        split_metrics = []
         for i in range(n_models_for_test):
             logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
             metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
             # Rename 'val_' columns to 'test_' columns
             metrics = metrics.rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
             plot_training_curves(metrics, f'{split_type}_best_model_n{i}')
+            split_metrics.append(metrics)
+        plot_training_curves(pd.concat(split_metrics), f'{split_type}_best_model', multimodels=True)
+        split_metrics_cv = []
         for i in range(cv_n_folds):
             # logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
             logs_dir = f'logs_{report_base_name}_{split_type}_{split_type}_cv_model_fold{i}'
             metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
             metrics['fold'] = i
             plot_training_curves(metrics, f'{split_type}_cv_model_fold{i}', stage='val')
+            split_metrics_cv.append(metrics)
+        plot_training_curves(pd.concat(split_metrics_cv), f'{split_type}_cv_model', stage='val', multimodels=True, groupby='fold')
+    # plot_performance_metrics(
+    #     reports['cv_train'],
+    #     reports['test'],
+    #     title=f'{active_name}_metrics',
+    # )
+    # plot_performance_metrics(
+    #     reports['cv_train'],
+    #     reports['majority_vote'][reports['majority_vote']['cv_models'] == False],
+    #     title=f'{active_name}_metrics_majority_vote',
+    # )
+    # plot_majority_voting_performance(reports['majority_vote'])
+    # reports['test']['disabled_embeddings'] = pd.NA
+    # plot_ablation_study(pd.concat([
+    #     reports['ablation'],
+    #     reports['test'],
+    # ]))
+    # # Plot hyperparameter optimization results to markdown
+    # print(reports['hparam'][['split_type', 'hidden_dim', 'learning_rate', 'dropout', 'use_smote', 'smote_k_neighbors']].to_markdown(index=False))
 if __name__ == '__main__':

src/run_experiments.py CHANGED Viewed

@@ -305,7 +305,7 @@ def main(
         # Start the experiment
         experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
-        optuna_reports = pdp.hyperparameter_tuning_and_training(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
             smiles2fp=smiles2fp,

         # Start the experiment
         experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
+        optuna_reports = pdp.hyperparameter_tuning_and_training(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
             smiles2fp=smiles2fp,