Upload moleculenet_eval/eval.py with huggingface_hub
moleculenet_eval/eval.py CHANGED (+116 -16)
@@ -5,7 +5,7 @@ import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import Dataset, DataLoader
 from transformers import BertConfig, BertModel, AutoTokenizer
-from rdkit import Chem
+from rdkit import Chem, RDLogger
 from rdkit.Chem.Scaffolds import MurckoScaffold
 import copy
 from tqdm import tqdm
@@ -13,9 +13,68 @@ import os
 from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
 from itertools import compress
 from collections import defaultdict
+from sklearn.metrics.pairwise import cosine_similarity
+RDLogger.DisableLog('rdApp.*')
+
 
 torch.set_float32_matmul_precision('high')
 
+# --- 0. SMILES enumeration ---
+class SmilesEnumerator:
+    """Generates randomized SMILES strings for data augmentation."""
+    def randomize_smiles(self, smiles):
+        try:
+            mol = Chem.MolFromSmiles(smiles)
+            return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
+        except Exception:
+            return smiles
+
+
+def compute_embedding_similarity(encoder, smiles_list, tokenizer, device, max_len=256):
+    encoder.eval()
+    enumerator = SmilesEnumerator()
+
+    embeddings_orig = []
+    embeddings_aug = []
+
+    with torch.no_grad():
+        for smi in smiles_list:
+            # Original SMILES encoding
+            encoding_orig = tokenizer(
+                smi,
+                truncation=True,
+                padding='max_length',
+                max_length=max_len,
+                return_tensors='pt'
+            )
+            # Augmented SMILES encoding
+            smi_aug = enumerator.randomize_smiles(smi)
+            encoding_aug = tokenizer(
+                smi_aug,
+                truncation=True,
+                padding='max_length',
+                max_length=max_len,
+                return_tensors='pt'
+            )
+
+            input_ids_orig = encoding_orig.input_ids.to(device)
+            attention_mask_orig = encoding_orig.attention_mask.to(device)
+            input_ids_aug = encoding_aug.input_ids.to(device)
+            attention_mask_aug = encoding_aug.attention_mask.to(device)
+
+            emb_orig = encoder(input_ids_orig, attention_mask_orig).cpu().numpy().flatten()
+            emb_aug = encoder(input_ids_aug, attention_mask_aug).cpu().numpy().flatten()
+
+            embeddings_orig.append(emb_orig)
+            embeddings_aug.append(emb_aug)
+
+    embeddings_orig = np.array(embeddings_orig)
+    embeddings_aug = np.array(embeddings_aug)
+
+    # Cosine similarity between each original and its augmented version
+    similarities = np.array([cosine_similarity([embeddings_orig[i]], [embeddings_aug[i]])[0][0] for i in range(len(embeddings_orig))])
+    return similarities
+
 # --- 1. Data Loading ---
 def load_lists_from_url(data):
     if data == 'bbbp':
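The added `compute_embedding_similarity` helper measures how invariant the encoder is to SMILES enumeration: each molecule is re-encoded under a randomized atom ordering and compared to its original embedding by cosine similarity. The property this relies on is that randomized SMILES are syntactic variants of the same molecule; a standalone sketch using only RDKit (the aspirin SMILES is purely an illustrative input, not from the commit):

```python
from rdkit import Chem

# Two randomized SMILES of one molecule canonicalize identically, so an
# enumeration-invariant encoder should embed them close together.
smi = 'CC(=O)Oc1ccccc1C(=O)O'  # aspirin, an illustrative input only
mol = Chem.MolFromSmiles(smi)
rand1 = Chem.MolToSmiles(mol, doRandom=True, canonical=False)
rand2 = Chem.MolToSmiles(mol, doRandom=True, canonical=False)
assert Chem.CanonSmiles(rand1) == Chem.CanonSmiles(rand2) == Chem.CanonSmiles(smi)
print(rand1, rand2)
```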
@@ -207,7 +266,7 @@ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
-        scheduler.step()
+        #scheduler.step()
         total_loss += loss.item()
     return total_loss / len(dataloader)
 
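With `scheduler.step()` commented out of the per-batch loop, the `StepLR` schedule introduced later in this diff only takes effect if it is stepped once per epoch, and no such call appears in the hunks shown, so the learning rate may effectively stay constant. For reference, a minimal sketch of the per-epoch pattern that `StepLR(step_size=10, gamma=0.59298)` assumes (names illustrative, not from the commit):

```python
import torch
import torch.optim as optim

# StepLR multiplies the LR by gamma every `step_size` calls to step();
# the conventional usage is one step() per epoch, not per batch.
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.59298)
for epoch in range(20):
    optimizer.step()   # stands in for a full training epoch
    scheduler.step()   # per-epoch LR update
print(optimizer.param_groups[0]['lr'])  # 1e-4 * 0.59298**2
```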
@@ -236,6 +295,38 @@ def test_model(model, dataloader, device):
             all_labels.append(labels.numpy())
     return np.concatenate(all_preds), np.concatenate(all_labels)
 
+def calc_val_metrics(model, dataloader, criterion, device, task_type):
+    model.eval()
+    all_labels, all_preds = [], []
+    total_loss = 0
+    with torch.no_grad():
+        for batch in dataloader:
+            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
+            labels = batch['labels'].to(device)
+            outputs = model(**inputs)
+            loss = criterion(outputs, labels)
+            total_loss += loss.item()
+            if task_type == 'classification':
+                pred_probs = torch.sigmoid(outputs).cpu().numpy()
+                all_preds.append(pred_probs)
+                all_labels.append(labels.cpu().numpy())
+            else:
+                # Regression
+                preds = outputs.cpu().numpy()
+                all_preds.append(preds)
+                all_labels.append(labels.cpu().numpy())
+    avg_loss = total_loss / len(dataloader)
+    if task_type == 'classification':
+        y_true = np.concatenate(all_labels)
+        y_pred = np.concatenate(all_preds)
+        try:
+            score = roc_auc_score(y_true, y_pred, average='macro')
+        except Exception:
+            score = 0.0
+        return avg_loss, score
+    else:
+        return avg_loss, None
+
 # --- 6. Main Execution Block ---
 def main():
     DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
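The new `calc_val_metrics` scores multi-label classification with `roc_auc_score(..., average='macro')`: one AUC per task column, then the unweighted mean. A toy check of that behavior (the arrays are made up, purely illustrative):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Macro averaging over a 2-task multi-label problem: per-column AUCs
# are computed independently and averaged without weighting.
y_true = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y_score = np.array([[0.2, 0.8], [0.7, 0.3], [0.9, 0.6], [0.1, 0.4]])
per_task = [roc_auc_score(y_true[:, i], y_score[:, i]) for i in range(2)]
assert np.isclose(roc_auc_score(y_true, y_score, average='macro'), np.mean(per_task))
```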
@@ -244,16 +335,17 @@ def main():
     DATASETS_TO_RUN = {
         # 'esol': {'task_type': 'regression', 'num_labels': 1, 'split': 'random'},
         #'tox21': {'task_type': 'classification', 'num_labels': 12, 'split': 'random'},
-        #'hiv': {'task_type': 'classification', 'num_labels':
+        #'hiv': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'},
         # Add more datasets here, e.g. 'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
         #'sider': {'task_type': 'classification', 'num_labels': 27, 'split': 'random'},
         #'bace': {'task_type': 'classification', 'num_labels': 1, 'split': 'random'},
-        'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': '
+        'clintox': {'task_type': 'classification', 'num_labels': 2, 'split': 'random'},
+        #'bbbp': {'task_type': 'classification', 'num_labels': 1, 'split': 'scaffold'}
     }
     PATIENCE = 15
-    EPOCHS =
-    LEARNING_RATE =
-    BATCH_SIZE =
+    EPOCHS = 50
+    LEARNING_RATE = 1e-4
+    BATCH_SIZE = 16
     MAX_LEN = 512
 
     TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
@@ -302,18 +394,18 @@ def main():
             model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
             model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
             criterion = get_criterion(info['task_type'], info['num_labels'])
-            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
-            scheduler = optim.lr_scheduler.
+            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0024)
+            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.59298)
 
-            best_val_loss = float('inf')
+            best_val_loss = float('-inf')
             best_model_state = None
             current_patience = 0
             for epoch in range(EPOCHS):
                 train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, DEVICE)
-                val_loss =
-                print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
+                val_loss, val_metric = calc_val_metrics(model, val_loader, criterion, DEVICE, info['task_type'])
+                print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ROC AUC: {val_metric:.4f}")
 
-                if
+                if val_metric <= val_loss:
                     best_val_loss = val_loss
                     best_model_state = copy.deepcopy(model.state_dict())
                     print(f" -> New best model saved with validation loss: {best_val_loss:.4f}")
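One caveat in this hunk: the committed condition `if val_metric <= val_loss:` compares the ROC AUC against the loss value, and `best_val_loss` (now initialized to `-inf`) is then assigned the loss. Given the `-inf` initialization, the apparent intent is to checkpoint on the best validation metric; a conventional higher-is-better rule would look like the sketch below (my reading of the intent, with illustrative names and values, not the committed code):

```python
# Assumed intent: track the best validation ROC AUC and checkpoint on
# improvement. Names and the metric sequence are illustrative only.
def improved(val_metric, best_so_far):
    return val_metric >= best_so_far

best_metric = float('-inf')
for val_metric in [0.71, 0.69, 0.75]:  # stand-in per-epoch AUCs
    if improved(val_metric, best_metric):
        best_metric = val_metric
        # best_model_state = copy.deepcopy(model.state_dict())
print(best_metric)  # 0.75
```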
@@ -325,7 +417,8 @@ def main():
                     break
 
             print("\nTesting with the best model...")
-
+            if best_model_state is not None:
+                model.load_state_dict(best_model_state)
             test_loss = eval_epoch(model, test_loader, criterion, DEVICE)
             print(f'Test loss: {test_loss}')
             test_preds, test_true = test_model(model, test_loader, DEVICE)
@@ -336,6 +429,15 @@ def main():
                 'test_labels': test_true
             }
             print(f"Finished testing for {name}.")
+            test_smiles_list = list(test_smiles)
+            similarities = compute_embedding_similarity(
+                model.encoder, test_smiles_list, TOKENIZER, DEVICE, MAX_LEN
+            )
+            print(f"Similarity score: {similarities.mean():.4f}")
+            if name == 'do_not_save':
+                torch.save(model.encoder.state_dict(), 'moleculenet_clintox_encoder.bin')
+
+
 
     print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
     for name, result in aggregated_results.items():
@@ -352,6 +454,4 @@ def main():
     print("\nScript finished.")
 
 if __name__ == '__main__':
-    # Note: This script requires rdkit. You can install it via pip:
-    # pip install rdkit-pypi
     main()