Defetya
/

simson_base

Safetensors

Model card Files Files and versions

xet

Community

Defetya committed on Jul 22, 2025

Commit

e426db9

verified ·

1 Parent(s): aea539f

Upload moleculenet_eval/eval.py with huggingface_hub

Browse files

Files changed (1) hide show

moleculenet_eval/eval.py +46 -88

moleculenet_eval/eval.py CHANGED Viewed

@@ -17,11 +17,7 @@ from collections import defaultdict
 torch.set_float32_matmul_precision('high')
 # --- 1. Data Loading ---
-# Function to load datasets from their respective URLs.
 def load_lists_from_url(data):
-    """
-    Load SMILES and labels from Moleculenet website.
-    """
     if data == 'bbbp':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
         smiles, labels = df.smiles, df.p_np
@@ -35,7 +31,7 @@ def load_lists_from_url(data):
     elif data == 'sider':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
         smiles = df.smiles
-        labels = df.drop(['smiles'], axis=1)    # (1427, 27)
     elif data == 'esol':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
         smiles = df.smiles
@@ -49,27 +45,20 @@ def load_lists_from_url(data):
         smiles, labels = df.smiles, df['exp']
     elif data == 'tox21':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
-        df = df.dropna(axis=0, how='any').reset_index(drop=True)   # drop nan values
         smiles = df.smiles
-        labels = df.drop(['mol_id', 'smiles'], axis=1)  # 12 cols
     elif data == 'bace':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
         smiles, labels = df.mol, df.Class
-    elif data == 'tox21':
-        df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
-        df = df.dropna(axis=0, how='any').reset_index(drop=True)  # drop nan values
-        smiles = df.smiles
-        labels = df.drop(['mol_id', 'smiles'], axis=1)  # 12 cols
     elif data == 'qm8':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
-        df = df.dropna(axis=0, how='any').reset_index(drop=True)  # drop nan values
         smiles = df.smiles
-        labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)  # 12 tasks
     return smiles, labels
 # --- 2. Scaffold Splitting ---
-# Class to split the dataset based on molecular scaffolds.
 class ScaffoldSplitter:
     def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
         self.data = data
@@ -86,28 +75,20 @@ class ScaffoldSplitter:
     def scaffold_split(self):
         smiles, labels = load_lists_from_url(self.data)
-        # Initialize non_null as False for all samples
         non_null = np.ones(len(smiles)) == 0
-        # Dataset-specific null handling
-        if self.data == 'tox21' or self.data == 'sider' or self.data == 'clintox':
             for i in range(len(smiles)):
-                # Check if molecule is valid AND no missing labels
                 if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
                     non_null[i] = 1
         else:
-            # For single-task datasets, only check molecule validity
             for i in range(len(smiles)):
                 if Chem.MolFromSmiles(smiles[i]):
                     non_null[i] = 1
-        # Extract valid samples with original indices preserved
         smiles_list = list(compress(enumerate(smiles), non_null))
         rng = np.random.RandomState(self.seed)
-        # Group by scaffold
         scaffolds = defaultdict(list)
         for i, sms in smiles_list:
             scaffold = self.generate_scaffold(sms)
@@ -115,13 +96,10 @@ class ScaffoldSplitter:
         scaffold_sets = list(scaffolds.values())
         rng.shuffle(scaffold_sets)
-        # Calculate target sizes for validation and test sets
         n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
         n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
         train_idx, val_idx, test_idx = [], [], []
-        # Assign scaffold groups to splits
         for scaffold_set in scaffold_sets:
             if len(val_idx) + len(scaffold_set) <= n_total_val:
                 val_idx.extend(scaffold_set)
@@ -129,10 +107,20 @@ class ScaffoldSplitter:
                 test_idx.extend(scaffold_set)
             else:
                 train_idx.extend(scaffold_set)
         return train_idx, val_idx, test_idx
 # --- 3. PyTorch Dataset ---
-# Custom Dataset class for handling SMILES data.
 class MoleculeDataset(Dataset):
     def __init__(self, smiles_list, labels, tokenizer, max_len=512):
         self.smiles_list = smiles_list
@@ -154,25 +142,16 @@ class MoleculeDataset(Dataset):
             max_length=self.max_len,
             return_tensors='pt'
         )
         item = {key: val.squeeze(0) for key, val in encoding.items()}
-        # Handle single-task and multi-task labels
         if isinstance(label, pd.Series):
             label_values = label.values.astype(np.float32)
         else:
             label_values = np.array([label], dtype=np.float32)
         item['labels'] = torch.tensor(label_values, dtype=torch.float)
         return item
 # --- 4. Model Architecture ---
 def global_ap(x):
-    """
-    Global Average Pooling
-    Input: [B, max_len, hid_dim]
-    Return: [B, hid_dim]
-    """
     return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
 class SimSonEncoder(nn.Module):
@@ -183,7 +162,6 @@ class SimSonEncoder(nn.Module):
         self.bert = BertModel(config, add_pooling_layer=False)
         self.linear = nn.Linear(config.hidden_size, max_len)
         self.dropout = nn.Dropout(dropout)
     def forward(self, input_ids, attention_mask=None):
         if attention_mask is None:
             attention_mask = input_ids.ne(self.config.pad_token_id)
@@ -199,7 +177,6 @@ class SimSonClassifier(nn.Module):
         self.clf = nn.Linear(encoder.max_len, num_labels)
         self.relu = nn.ReLU()
         self.dropout = nn.Dropout(dropout)
     def forward(self, input_ids, attention_mask=None):
         x = self.encoder(input_ids, attention_mask)
         x = self.relu(self.dropout(x))
@@ -207,13 +184,11 @@ class SimSonClassifier(nn.Module):
         return logits
     def load_encoder_params(self, state_dict_path):
-        """Loads pretrained parameters into the SimSonEncoder."""
         self.encoder.load_state_dict(torch.load(state_dict_path))
         print("Pretrained encoder parameters loaded.")
 # --- 5. Training, Validation, and Testing Loops ---
 def get_criterion(task_type, num_labels):
-    """Select loss function based on task."""
     if task_type == 'classification':
         return nn.BCEWithLogitsLoss()
     elif task_type == 'regression':
@@ -227,14 +202,12 @@ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
     for batch in dataloader:
         inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
         labels = batch['labels'].to(device)
         optimizer.zero_grad()
         outputs = model(**inputs)
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
         scheduler.step()
         total_loss += loss.item()
     return total_loss / len(dataloader)
@@ -258,40 +231,31 @@ def test_model(model, dataloader, device):
             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
             labels = batch['labels']
             outputs = model(**inputs)
-            # Apply sigmoid for classification probabilities
             preds = torch.sigmoid(outputs)
             all_preds.append(preds.cpu().numpy())
             all_labels.append(labels.numpy())
     return np.concatenate(all_preds), np.concatenate(all_labels)
 # --- 6. Main Execution Block ---
 def main():
-    # --- Configuration ---
     DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"Using device: {DEVICE}")
     DATASETS_TO_RUN = {
-        #'esol': {'task_type': 'regression', 'num_labels': 1},
-        #'freesolv': {'task_type': 'regression', 'num_labels':1},
-        #'lipophicility': {'task_type': 'regression', 'num_labels': 1},
-        #'qm8': {'task_type': 'regression', 'num_labels': 12},
-        #'bbbp': {'task_type': 'classification', 'num_labels': 1},
-        'tox21': {'task_type': 'classification', 'num_labels': 12},
-        #'sider': {'task_type': 'classification', 'num_labels': 27},
-        #'clintox': {'task_type': 'classification', 'num_labels': 2},
-        #'hiv': {'task_type': 'classification', 'num_labels': 1},
-        #'bace': {'task_type': 'classification', 'num_labels': 1},
     }
-    PATIENCE = 25
-    EPOCHS = 200
     LEARNING_RATE = 2e-5
     BATCH_SIZE = 128
-    MAX_LEN = 256
-    # --- Tokenizer and Model Config ---
     TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
     ENCODER_CONFIG = BertConfig(
         vocab_size=TOKENIZER.vocab_size,
@@ -305,24 +269,24 @@ def main():
     aggregated_results = {}
     for name, info in DATASETS_TO_RUN.items():
-        print(f"\n{'='*20} Processing Dataset: {name.upper()} {'='*20}")
-        # --- Data Loading and Splitting ---
-        splitter = ScaffoldSplitter(data=name, seed=42)
-        train_idx, val_idx, test_idx = splitter.scaffold_split()
-        # Load data once
         smiles, labels = load_lists_from_url(name)
-        # Extract splits using returned indices
         train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
         train_labels = labels.iloc[train_idx].reset_index(drop=True)
         val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
         val_labels = labels.iloc[val_idx].reset_index(drop=True)
         test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
-        test_labels = labels.iloc[test_idx].reset_index(drop=True)
         print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
         train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
@@ -333,16 +297,14 @@ def main():
         val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
         test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
-        # --- Model, Loss, and Optimizer ---
         encoder = SimSonEncoder(ENCODER_CONFIG, 512)
         encoder = torch.compile(encoder)
         model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
         model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
         criterion = get_criterion(info['task_type'], info['num_labels'])
         optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
         scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS * len(train_loader))
-        # --- Training and Validation ---
         best_val_loss = float('inf')
         best_model_state = None
         current_patience = 0
@@ -362,12 +324,12 @@ def main():
                     print(f'Early stopping at {PATIENCE} epochs')
                     break
-        # --- Testing ---
         print("\nTesting with the best model...")
         model.load_state_dict(best_model_state)
         test_preds, test_true = test_model(model, test_loader, DEVICE)
-        # Store results. For classification, you can now calculate metrics like ROC-AUC.
         aggregated_results[name] = {
             'best_val_loss': best_val_loss,
             'test_predictions': test_preds,
@@ -375,19 +337,15 @@ def main():
         }
         print(f"Finished testing for {name}.")
-    # --- Final Results Aggregation ---
     print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
     for name, result in aggregated_results.items():
-        # Here you would typically calculate and display final metrics from predictions
-        # For example, using scikit-learn's roc_auc_score
-        # from sklearn.metrics import roc_auc_score
         if name in ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']:
             auc = roc_auc_score(result['test_labels'], result['test_predictions'], average='macro')
             print(f'{name} ROC AUC: {auc}')
         if name in ['lipophicility', 'esol', 'qm8']:
             rmse = root_mean_squared_error(result['test_labels'], result['test_predictions'])
-            mae = mean_absolute_error(result['test_labels'], result['test_predictions'])
             print(f'{name} MAE: {mae}')
             print(f'{name} RMSE: {rmse}')

 torch.set_float32_matmul_precision('high')
 # --- 1. Data Loading ---
 def load_lists_from_url(data):
     if data == 'bbbp':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
         smiles, labels = df.smiles, df.p_np
     elif data == 'sider':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
         smiles = df.smiles
+        labels = df.drop(['smiles'], axis=1)
     elif data == 'esol':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
         smiles = df.smiles
         smiles, labels = df.smiles, df['exp']
     elif data == 'tox21':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
+        df = df.dropna(axis=0, how='any').reset_index(drop=True)
         smiles = df.smiles
+        labels = df.drop(['mol_id', 'smiles'], axis=1)
     elif data == 'bace':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
         smiles, labels = df.mol, df.Class
     elif data == 'qm8':
         df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
+        df = df.dropna(axis=0, how='any').reset_index(drop=True)
         smiles = df.smiles
+        labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1)
     return smiles, labels
 # --- 2. Scaffold Splitting ---
 class ScaffoldSplitter:
     def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
         self.data = data
     def scaffold_split(self):
         smiles, labels = load_lists_from_url(self.data)
         non_null = np.ones(len(smiles)) == 0
+        if self.data in {'tox21', 'sider', 'clintox'}:
             for i in range(len(smiles)):
                 if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
                     non_null[i] = 1
         else:
             for i in range(len(smiles)):
                 if Chem.MolFromSmiles(smiles[i]):
                     non_null[i] = 1
         smiles_list = list(compress(enumerate(smiles), non_null))
         rng = np.random.RandomState(self.seed)
         scaffolds = defaultdict(list)
         for i, sms in smiles_list:
             scaffold = self.generate_scaffold(sms)
         scaffold_sets = list(scaffolds.values())
         rng.shuffle(scaffold_sets)
         n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
         n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
         train_idx, val_idx, test_idx = [], [], []
         for scaffold_set in scaffold_sets:
             if len(val_idx) + len(scaffold_set) <= n_total_val:
                 val_idx.extend(scaffold_set)
                 test_idx.extend(scaffold_set)
             else:
                 train_idx.extend(scaffold_set)
         return train_idx, val_idx, test_idx
+# --- 2a. Normal Random Split ---
def random_split_indices(n, seed=42, train_frac=0.8, val_frac=0.1, test_frac=0.1):
    """Return shuffled train/val/test index lists for ``n`` samples.

    The integers ``0..n-1`` are permuted with a generator seeded by ``seed``.
    The first ``int(n * train_frac)`` indices form the training split, the
    next ``int(n * val_frac)`` the validation split, and every remaining
    index goes to the test split.

    Args:
        n: Number of samples to split.
        seed: Seed for the permutation; the same seed gives the same split.
        train_frac: Fraction of samples assigned to the training split.
        val_frac: Fraction of samples assigned to the validation split.
        test_frac: Accepted for interface symmetry but not used; the test
            split always receives whatever is left after train and val.

    Returns:
        A ``(train_idx, val_idx, test_idx)`` tuple of plain Python lists.
    """
    # A private RandomState produces the same permutation as seeding the
    # global NumPy RNG, without resetting global random state for callers.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(n)
    n_train = int(n * train_frac)
    n_val = int(n * val_frac)
    val_end = n_train + n_val
    train_idx = indices[:n_train]
    val_idx = indices[n_train:val_end]
    test_idx = indices[val_end:]
    return train_idx.tolist(), val_idx.tolist(), test_idx.tolist()
 # --- 3. PyTorch Dataset ---
 class MoleculeDataset(Dataset):
     def __init__(self, smiles_list, labels, tokenizer, max_len=512):
         self.smiles_list = smiles_list
             max_length=self.max_len,
             return_tensors='pt'
         )
         item = {key: val.squeeze(0) for key, val in encoding.items()}
         if isinstance(label, pd.Series):
             label_values = label.values.astype(np.float32)
         else:
             label_values = np.array([label], dtype=np.float32)
         item['labels'] = torch.tensor(label_values, dtype=torch.float)
         return item
 # --- 4. Model Architecture ---
 def global_ap(x):
     """Average ``x`` over dimension 1.

     All dimensions after the first two are flattened into a single trailing
     axis before the mean is taken, so the result has shape
     ``(x.size(0), prod(remaining dims))``.
     """
     lead, mid = x.size(0), x.size(1)
     collapsed = x.view(lead, mid, -1)
     return collapsed.mean(dim=1)
 class SimSonEncoder(nn.Module):
         self.bert = BertModel(config, add_pooling_layer=False)
         self.linear = nn.Linear(config.hidden_size, max_len)
         self.dropout = nn.Dropout(dropout)
     def forward(self, input_ids, attention_mask=None):
         if attention_mask is None:
             attention_mask = input_ids.ne(self.config.pad_token_id)
         self.clf = nn.Linear(encoder.max_len, num_labels)
         self.relu = nn.ReLU()
         self.dropout = nn.Dropout(dropout)
     def forward(self, input_ids, attention_mask=None):
         x = self.encoder(input_ids, attention_mask)
         x = self.relu(self.dropout(x))
         return logits
     def load_encoder_params(self, state_dict_path):
         """Load pretrained weights from ``state_dict_path`` into ``self.encoder``.

         Args:
             state_dict_path: Path to a checkpoint written with ``torch.save``
                 that holds a state dict matching ``self.encoder``.

         Raises:
             RuntimeError: Propagated from ``load_state_dict`` when the
                 checkpoint keys or shapes do not match the encoder.
         """
         # NOTE: torch.load unpickles the file; only load trusted checkpoints.
         # map_location='cpu' lets a checkpoint saved from a CUDA device load
         # on a CPU-only host; load_state_dict then copies each tensor into
         # the encoder's parameters on whatever device they already live on.
         state_dict = torch.load(state_dict_path, map_location='cpu')
         self.encoder.load_state_dict(state_dict)
         print("Pretrained encoder parameters loaded.")
 # --- 5. Training, Validation, and Testing Loops ---
 def get_criterion(task_type, num_labels):
     if task_type == 'classification':
         return nn.BCEWithLogitsLoss()
     elif task_type == 'regression':
     for batch in dataloader:
         inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
         labels = batch['labels'].to(device)
         optimizer.zero_grad()
         outputs = model(**inputs)
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
         scheduler.step()
         total_loss += loss.item()
     return total_loss / len(dataloader)
             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
             labels = batch['labels']
             outputs = model(**inputs)
             preds = torch.sigmoid(outputs)
             all_preds.append(preds.cpu().numpy())
             all_labels.append(labels.numpy())
     return np.concatenate(all_preds), np.concatenate(all_labels)
 # --- 6. Main Execution Block ---
 def main():
     DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"Using device: {DEVICE}")
     DATASETS_TO_RUN = {
+        # 'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
+        #'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
+        #'hiv': {'task_type': 'classification', 'num_labels': 27, 'split': 'scaffold'},
+        # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
+        #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
+        #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
+        'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'scaffold'}
     }
+    PATIENCE = 15
+    EPOCHS = 100
     LEARNING_RATE = 2e-5
     BATCH_SIZE = 128
+    MAX_LEN = 512
     TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
     ENCODER_CONFIG = BertConfig(
         vocab_size=TOKENIZER.vocab_size,
     aggregated_results = {}
     for name, info in DATASETS_TO_RUN.items():
+        print(f"\n{'='*20} Processing Dataset: {name.upper()} ({info['split']} split) {'='*20}")
         smiles, labels = load_lists_from_url(name)
+        # Split selection
+        if info.get('split', 'scaffold') == 'scaffold':
+            splitter = ScaffoldSplitter(data=name, seed=42)
+            train_idx, val_idx, test_idx = splitter.scaffold_split()
+        elif info['split'] == 'random':
+            train_idx, val_idx, test_idx = random_split_indices(len(smiles), seed=42)
+        else:
+            raise ValueError(f"Unknown split type for {name}: {info['split']}")
         train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
         train_labels = labels.iloc[train_idx].reset_index(drop=True)
         val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
         val_labels = labels.iloc[val_idx].reset_index(drop=True)
         test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
+        test_labels = labels.iloc[test_idx].reset_index(drop=True)
         print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
         train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
         val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
         test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
         encoder = SimSonEncoder(ENCODER_CONFIG, 512)
         encoder = torch.compile(encoder)
         model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
         model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
         criterion = get_criterion(info['task_type'], info['num_labels'])
         optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
         scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS * len(train_loader))
         best_val_loss = float('inf')
         best_model_state = None
         current_patience = 0
                     print(f'Early stopping at {PATIENCE} epochs')
                     break
         print("\nTesting with the best model...")
         model.load_state_dict(best_model_state)
+        test_loss = eval_epoch(model, test_loader, criterion, DEVICE)
+        print(f'Test loss: {test_loss}')
         test_preds, test_true = test_model(model, test_loader, DEVICE)
         aggregated_results[name] = {
             'best_val_loss': best_val_loss,
             'test_predictions': test_preds,
         }
         print(f"Finished testing for {name}.")
     print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
     for name, result in aggregated_results.items():
         if name in ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']:
             auc = roc_auc_score(result['test_labels'], result['test_predictions'], average='macro')
             print(f'{name} ROC AUC: {auc}')
         if name in ['lipophicility', 'esol', 'qm8']:
             rmse = root_mean_squared_error(result['test_labels'], result['test_predictions'])
+            mae = mean_absolute_error(result['test_labels'], result['test_predictions'])
             print(f'{name} MAE: {mae}')
             print(f'{name} RMSE: {rmse}')