Defetya committed on
Commit aea539f · verified · 1 Parent(s): ce0db25

Upload moleculenet_eval/eval.py with huggingface_hub

Files changed (1)
  1. moleculenet_eval/eval.py +352 -498
moleculenet_eval/eval.py CHANGED
@@ -1,545 +1,399 @@
1
- # ==============================================================================
2
- # 1. IMPORTS
3
- # ==============================================================================
4
- import os
5
- import warnings
6
- import wandb
7
-
8
  import torch
9
  import torch.nn as nn
10
  import torch.optim as optim
11
- import torch.nn.functional as F
12
- from torch.utils.data import DataLoader, Dataset
13
- import numpy as np
 
 
14
  from tqdm import tqdm
15
- from rdkit import Chem, RDLogger
16
- from datasets import load_dataset, load_from_disk
17
- from transformers import AutoTokenizer, BertModel, BertConfig
18
- import pandas as pd
19
 
20
- # ==============================================================================
21
- # 2. INITIAL SETUP
22
- # ==============================================================================
23
- # Suppress RDKit console output
24
- RDLogger.DisableLog('rdApp.*')
25
- # Ignore warnings for cleaner output
26
- warnings.filterwarnings("ignore")
27
-
28
- # ==============================================================================
29
- # 3. MODEL AND LOSS FUNCTION
30
- # ==============================================================================
31
- def global_average_pooling(x):
32
- """Global Average Pooling: from [B, max_len, hid_dim] to [B, hid_dim]"""
33
- return torch.mean(x, dim=1)
34
 
35
- class SimSonEncoder(nn.Module):
36
- """The main encoder model based on BERT."""
37
- def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
38
- super(SimSonEncoder, self).__init__()
39
- self.bert = BertModel(config, add_pooling_layer=False)
40
- self.linear = nn.Linear(config.hidden_size, max_len)
41
- self.dropout = nn.Dropout(dropout)
42
-
43
- def forward(self, input_ids, attention_mask=None):
44
- if attention_mask is None:
45
- attention_mask = input_ids.ne(self.bert.config.pad_token_id)
46
-
47
- outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
48
- hidden_states = self.dropout(outputs.last_hidden_state)
49
- pooled_output = global_average_pooling(hidden_states)
50
- return self.linear(pooled_output)
51
-
52
- class ContrastiveLoss(nn.Module):
53
- """Calculates the contrastive loss for the SimSon model."""
54
- def __init__(self, temperature=0.2):
55
- super(ContrastiveLoss, self).__init__()
56
- self.temperature = temperature
57
- self.similarity_fn = F.cosine_similarity
58
-
59
- def forward(self, proj_1, proj_2):
60
- batch_size = proj_1.shape[0]
61
- device = proj_1.device
62
-
63
- # Normalize projections
64
- z_i = F.normalize(proj_1, p=2, dim=1)
65
- z_j = F.normalize(proj_2, p=2, dim=1)
66
-
67
- # Concatenate for similarity matrix calculation
68
- representations = torch.cat([z_i, z_j], dim=0)
69
-
70
- # Calculate cosine similarity between all pairs
71
- similarity_matrix = self.similarity_fn(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
72
-
73
- # Identify positive pairs (original and its augmentation)
74
- sim_ij = torch.diag(similarity_matrix, batch_size)
75
- sim_ji = torch.diag(similarity_matrix, -batch_size)
76
- positives = torch.cat([sim_ij, sim_ji], dim=0)
77
-
78
- # Create a mask to exclude self-comparisons
79
- nominator = torch.exp(positives / self.temperature)
80
- mask = (~torch.eye(batch_size * 2, batch_size * 2, dtype=torch.bool, device=device)).float()
81
- denominator = mask * torch.exp(similarity_matrix / self.temperature)
 
82
 
83
- # Calculate the final loss
84
- loss = -torch.log(nominator / torch.sum(denominator, dim=1))
85
- return torch.sum(loss) / (2 * batch_size)
86
-
87
- # ==============================================================================
88
- # 4. DATA HANDLING (Keeping your existing classes unchanged)
89
- # ==============================================================================
90
- class SmilesEnumerator:
91
- """Generates randomized SMILES strings for data augmentation."""
92
- def randomize_smiles(self, smiles):
93
- try:
94
- mol = Chem.MolFromSmiles(smiles)
95
- return Chem.MolToSmiles(mol, doRandom=True, canonical=False) if mol else smiles
96
- except:
97
- return smiles
98
-
99
- class ContrastiveSmilesDataset(Dataset):
100
- """Dataset for creating pairs of augmented SMILES for contrastive learning."""
101
- def __init__(self, smiles_list, tokenizer, max_length=512):
102
  self.smiles_list = smiles_list
 
103
  self.tokenizer = tokenizer
104
- self.max_length = max_length
105
- self.enumerator = SmilesEnumerator()
106
 
107
  def __len__(self):
108
  return len(self.smiles_list)
109
 
110
  def __getitem__(self, idx):
111
- original_smiles = self.smiles_list[idx]
112
-
113
- # Create two different augmentations of the same SMILES
114
- smiles_1 = self.enumerator.randomize_smiles(original_smiles)
115
- smiles_2 = self.enumerator.randomize_smiles(original_smiles)
116
 
117
- # Tokenize and pad each view to max_length.
118
- tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
119
- tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
120
-
121
- return {
122
- 'input_ids_1': torch.tensor(tokens_1['input_ids']),
123
- 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
124
- 'input_ids_2': torch.tensor(tokens_2['input_ids']),
125
- 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
126
- }
127
-
128
- class PrecomputedContrastiveSmilesDataset(Dataset):
129
- """
130
- A Dataset class that reads pre-augmented SMILES pairs from a Parquet file.
131
- This is significantly faster as it offloads the expensive SMILES randomization
132
- to a one-time preprocessing step.
133
- """
134
- def __init__(self, tokenizer, file_path: str, max_length: int = 512):
135
- self.tokenizer = tokenizer
136
- self.max_length = max_length
137
-
138
- # Load the entire dataset from the Parquet file into memory.
139
- # This is fast and efficient for subsequent access.
140
- print(f"Loading pre-computed data from {file_path}...")
141
- self.data = pd.read_parquet(file_path)
142
- print("Data loaded successfully.")
143
-
144
- def __len__(self):
145
- """Returns the total number of pairs in the dataset."""
146
- return len(self.data)
147
-
148
- def __getitem__(self, idx):
149
- """
150
- Retrieves a pre-augmented pair, tokenizes it, and returns it
151
- in the format expected by the DataCollator.
152
- """
153
- # Retrieve the pre-augmented pair from the DataFrame
154
- row = self.data.iloc[idx]
155
- smiles_1 = row['smiles_1']
156
- smiles_2 = row['smiles_2']
157
 
158
- # Tokenize the pair. This operation is fast and remains in the data loader.
159
- tokens_1 = self.tokenizer(smiles_1, max_length=self.max_length, truncation=True, padding='max_length')
160
- tokens_2 = self.tokenizer(smiles_2, max_length=self.max_length, truncation=True, padding='max_length')
 
 
161
 
162
- return {
163
- 'input_ids_1': torch.tensor(tokens_1['input_ids']),
164
- 'attention_mask_1': torch.tensor(tokens_1['attention_mask']),
165
- 'input_ids_2': torch.tensor(tokens_2['input_ids']),
166
- 'attention_mask_2': torch.tensor(tokens_2['attention_mask']),
167
- }
168
 
169
- class PreTokenizedSmilesDataset(Dataset):
 
170
  """
171
- A Dataset that loads a pre-tokenized and pre-padded dataset created
172
- by the preprocessing script. It uses memory-mapping for instant loads
173
- and high efficiency.
174
  """
175
- def __init__(self, dataset_path: str):
176
- # Load the dataset from disk. This is very fast due to memory-mapping.
177
- self.dataset = load_from_disk(dataset_path)
178
- # Set the format to PyTorch tensors for direct use in the model
179
- self.dataset.set_format(type='torch', columns=[
180
- 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'
181
- ])
182
- print(f"Successfully loaded pre-tokenized dataset from {dataset_path}.")
183
-
184
- def __len__(self):
185
- """Returns the total number of items in the dataset."""
186
- return len(self.dataset)
187
 
188
- def __getitem__(self, idx):
189
- """Retrieves a single pre-processed item."""
190
- return self.dataset[idx]
191
-
192
- class DataCollatorWithPadding:
193
- """
194
- A collate function that dynamically pads inputs to the longest sequence
195
- across both augmented views in the batch, ensuring consistent tensor shapes.
196
- """
197
- def __init__(self, tokenizer):
198
- self.tokenizer = tokenizer
199
-
200
- def __call__(self, features):
201
- # Create a combined list of features for both views to find the global max length
202
- combined_features = []
203
- for feature in features:
204
- combined_features.append({'input_ids': feature['input_ids_1'], 'attention_mask': feature['attention_mask_1']})
205
- combined_features.append({'input_ids': feature['input_ids_2'], 'attention_mask': feature['attention_mask_2']})
206
-
207
- # Pad the combined batch. This ensures all sequences are padded to the same length.
208
- padded_combined = self.tokenizer.pad(combined_features, padding='longest', return_tensors='pt')
209
-
210
- # Split the padded tensors back into two views
211
- batch_size = len(features)
212
- input_ids_1, input_ids_2 = torch.split(padded_combined['input_ids'], batch_size, dim=0)
213
- attention_mask_1, attention_mask_2 = torch.split(padded_combined['attention_mask'], batch_size, dim=0)
214
 
215
- return {
216
- 'input_ids_1': input_ids_1,
217
- 'attention_mask_1': attention_mask_1,
218
- 'input_ids_2': input_ids_2,
219
- 'attention_mask_2': attention_mask_2,
220
- }
221
-
222
- # ==============================================================================
223
- # 5. CHECKPOINT UTILITIES
224
- # ==============================================================================
225
- def save_checkpoint(model, optimizer, scheduler, global_step, save_path):
226
- """Save complete checkpoint with model, optimizer, scheduler states and step count."""
227
- checkpoint = {
228
- 'model_state_dict': model.state_dict(),
229
- 'optimizer_state_dict': optimizer.state_dict(),
230
- 'scheduler_state_dict': scheduler.state_dict(),
231
- 'global_step': global_step,
232
- }
233
- torch.save(checkpoint, save_path)
234
- print(f"Full checkpoint saved at step {global_step}")
235
-
236
- def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
237
- """Load checkpoint and return the global step to resume from."""
238
- checkpoint = torch.load(checkpoint_path, map_location='cpu')
239
- model.load_state_dict(checkpoint['model_state_dict'])
240
- optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
241
- scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
242
- global_step = checkpoint['global_step']
243
- print(f"Checkpoint loaded from step {global_step}")
244
- return global_step
245
-
246
- # ==============================================================================
247
- # 6. TRAINING AND EVALUATION LOOPS - MODIFIED
248
- # ==============================================================================
249
- def evaluation_step(model, batch, criterion, device):
250
- """Performs a single evaluation step on a batch of data."""
251
- input_ids_1 = batch['input_ids_1'].to(device)
252
- attention_mask_1 = batch['attention_mask_1'].to(device)
253
- input_ids_2 = batch['input_ids_2'].to(device)
254
- attention_mask_2 = batch['attention_mask_2'].to(device)
255
-
256
- combined_input_ids = torch.cat([input_ids_1, input_ids_2], dim=0)
257
- combined_attention_mask = torch.cat([attention_mask_1, attention_mask_2], dim=0)
258
-
259
- with torch.no_grad():
260
- combined_proj = model(combined_input_ids, combined_attention_mask)
261
 
262
- batch_size = input_ids_1.size(0)
263
- proj_1, proj_2 = torch.split(combined_proj, batch_size, dim=0)
264
-
265
- loss = criterion(proj_1, proj_2)
266
- return proj_1, proj_2, loss
267
-
268
- def train_with_step_based_validation(model, train_loader, val_loader, optimizer, criterion, device,
269
- scheduler, checkpoint_path, save_steps, validation_steps,
270
- start_step=0, max_steps=None):
271
- """
272
- Modified training function with step-based validation and checkpointing.
273
- """
274
  model.train()
275
- global_step = start_step
276
- best_val_loss = float('inf')
277
-
278
- # Calculate total steps if max_steps is not provided
279
- if max_steps is None:
280
- max_steps = len(train_loader)
281
-
282
- progress_bar = tqdm(total=max_steps - start_step, desc="Training Steps", initial=start_step)
283
-
284
- # Create iterator that can be resumed from any point
285
- train_iterator = iter(train_loader)
286
-
287
- # Skip batches if resuming from checkpoint
288
- if start_step > 0:
289
- batches_to_skip = start_step % len(train_loader)
290
- for _ in range(batches_to_skip):
291
- try:
292
- next(train_iterator)
293
- except StopIteration:
294
- train_iterator = iter(train_loader)
295
-
296
- while global_step < max_steps:
297
- try:
298
- batch = next(train_iterator)
299
- except StopIteration:
300
- train_iterator = iter(train_loader)
301
- batch = next(train_iterator)
302
-
303
- # Training step
304
- input_ids_1 = batch['input_ids_1'].to(device)
305
- attention_mask_1 = batch['attention_mask_1'].to(device)
306
- input_ids_2 = batch['input_ids_2'].to(device)
307
- attention_mask_2 = batch['attention_mask_2'].to(device)
308
 
309
  optimizer.zero_grad()
310
- with torch.autocast(dtype=torch.float16, device_type="cuda"):
311
- combined_input_ids = torch.cat([input_ids_1, input_ids_2], dim=0)
312
- combined_attention_mask = torch.cat([attention_mask_1, attention_mask_2], dim=0)
313
-
314
- combined_proj = model(combined_input_ids, combined_attention_mask)
315
-
316
- batch_size = input_ids_1.size(0)
317
- proj_1, proj_2 = torch.split(combined_proj, batch_size, dim=0)
318
-
319
- loss = criterion(proj_1, proj_2)
320
-
321
  loss.backward()
322
  optimizer.step()
323
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
324
  scheduler.step()
325
-
326
- global_step += 1
327
-
328
- progress_bar.update(1)
329
- progress_bar.set_postfix(loss=f"{loss.item():.4f}", step=global_step)
330
-
331
- wandb.log({
332
- "train_batch_loss": loss.item(),
333
- "learning_rate": scheduler.get_last_lr()[0],
334
- "global_step": global_step
335
- })
336
-
337
- # Step-based validation
338
- if global_step % validation_steps == 0:
339
- val_loss = validate_epoch(model, val_loader, criterion, device)
340
- wandb.log({
341
- "val_loss": val_loss,
342
- "global_step": global_step
343
- })
344
-
345
- # Save best model (model state only for best checkpoint)
346
- if val_loss < best_val_loss:
347
- best_val_loss = val_loss
348
- model_save_path = checkpoint_path.replace('.pt', '_best_model.bin')
349
- torch.save(model.state_dict(), model_save_path)
350
- progress_bar.write(f"Step {global_step}: New best model saved with val loss {val_loss:.4f}")
351
-
352
- model.train() # Resume training mode after validation
353
-
354
- # Step-based checkpointing (full checkpoint)
355
- if global_step % save_steps == 0:
356
- save_checkpoint(model, optimizer, scheduler, global_step, checkpoint_path)
357
-
358
- progress_bar.close()
359
- return global_step
360
 
361
- def validate_epoch(model, val_loader, criterion, device):
362
- """Validation function - unchanged from original."""
363
- model.eval()
364
- total_loss = 0
365
- progress_bar = tqdm(val_loader, desc="Validating", leave=False)
366
-
367
- for batch in progress_bar:
368
- _, _, loss = evaluation_step(model, batch, criterion, device)
369
  total_loss += loss.item()
370
-
371
- avg_loss = total_loss / len(val_loader)
372
- print(f'Validation loss: {avg_loss:.4f}')
373
- return avg_loss
374
 
375
- def test_model(model, test_loader, criterion, device):
376
- """Test function - unchanged from original."""
377
  model.eval()
378
  total_loss = 0
379
- all_similarities = []
380
- progress_bar = tqdm(test_loader, desc="Testing", leave=False)
381
-
382
- for batch in progress_bar:
383
- proj_1, proj_2, loss = evaluation_step(model, batch, criterion, device)
384
- total_loss += loss.item()
385
-
386
- proj_1_norm = F.normalize(proj_1, p=2, dim=1)
387
- proj_2_norm = F.normalize(proj_2, p=2, dim=1)
388
- batch_similarities = F.cosine_similarity(proj_1_norm, proj_2_norm, dim=1)
389
- all_similarities.extend(batch_similarities.cpu().numpy())
390
-
391
- avg_loss = total_loss / len(test_loader)
392
- avg_sim = np.mean(all_similarities)
393
- std_sim = np.std(all_similarities)
394
-
395
- return avg_loss, avg_sim, std_sim
396
-
397
- # ==============================================================================
398
- # 7. MODIFIED SINGLE-GPU TRAINING
399
- # ==============================================================================
400
- def run_training(model_config, hparams, data_splits):
401
- """The main function to run the training and evaluation process with step-based validation."""
402
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
403
- print(f"Using device: {device}")
404
-
405
- wandb_key = os.getenv("WANDB_API_KEY")
406
- if wandb_key:
407
- wandb.login(key=wandb_key)
408
- wandb.init(
409
- #project="simson-contrastive-learning-single-gpu",
410
- #name=f"run-{wandb.util.generate_id()}",
411
- #config=hparams
412
- )
413
-
414
- train_smiles, val_smiles, test_smiles = data_splits
415
-
416
- tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
417
-
418
- precomputed_train_path = 'data/pubchem_119m_splits/train.parquet'
419
- precomputed_test_path = 'data/pubchem_119m_splits/test.parquet'
420
- precomputed_val_path = 'data/pubchem_119m_splits/validation.parquet'
421
-
422
- train_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_train_path, max_length=hparams['max_length'])
423
- test_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_test_path, max_length=hparams['max_length'])
424
- val_dataset = PrecomputedContrastiveSmilesDataset(tokenizer, file_path=precomputed_val_path, max_length=hparams['max_length'])
425
-
426
- train_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True, num_workers=16, prefetch_factor=128, pin_memory=True)
427
- val_loader = DataLoader(val_dataset, batch_size=hparams['batch_size'], shuffle=False, num_workers=2, pin_memory=True)
428
- test_loader = DataLoader(test_dataset, batch_size=hparams['batch_size'], shuffle=False, num_workers=2, pin_memory=True)
429
-
430
- print('Initialized all data. Compiling the model...')
431
- model = SimSonEncoder(config=model_config, max_len=hparams['max_embeddings']).to(device)
432
- model = torch.compile(model)
433
- model.load_state_dict(torch.load('simson_checkpoints_small/simson_model_single_gpu.bin'))
434
- print(model)
435
-
436
- total_params = sum(p.numel() for p in model.parameters())
437
-
438
- print(f"Total number of parameters: {total_params // 1_000_000} M")
439
- wandb.config.update({"total_params_M": total_params // 1_000_000})
440
-
441
- criterion = ContrastiveLoss(temperature=hparams['temperature']).to(device)
442
- optimizer = optim.AdamW(model.parameters(), lr=hparams['lr'], weight_decay=1e-5, fused=True)
443
-
444
- total_steps = hparams['epochs'] * len(train_loader)
445
- scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_mult=1, T_0=total_steps)
446
-
447
- print("Starting training...")
448
- wandb.watch(model, log='all', log_freq=5000)
449
-
450
- start_step = 0
451
- checkpoint_path = hparams['checkpoint_path']
452
-
453
- # Resume from checkpoint if provided
454
- if hparams.get('resume_checkpoint') and os.path.exists(hparams['resume_checkpoint']):
455
- print(f"Resuming from checkpoint: {hparams['resume_checkpoint']}")
456
- start_step = load_checkpoint(hparams['resume_checkpoint'], model, optimizer, scheduler)
457
-
458
- # Train with step-based validation
459
- final_step = train_with_step_based_validation(
460
- model, train_loader, val_loader, optimizer, criterion, device,
461
- scheduler, checkpoint_path, hparams['save_steps'], hparams['validation_steps'],
462
- start_step=start_step, max_steps=total_steps
463
- )
464
-
465
- print("Training complete. Starting final testing...")
466
-
467
- # Load the best model for testing (model state only)
468
- best_model_path = checkpoint_path.replace('.pt', '_best_model.bin')
469
- if os.path.exists(best_model_path):
470
- model.load_state_dict(torch.load(best_model_path))
471
- print("Loaded best model for testing")
472
-
473
- test_loss, avg_sim, std_sim = test_model(model, test_loader, criterion, device)
474
-
475
- print("\n--- Test Results ---")
476
- print(f"Test Loss: {test_loss:.4f}")
477
- print(f"Average Cosine Similarity: {avg_sim:.4f} ± {std_sim:.4f}")
478
- print("--------------------")
479
-
480
- wandb.log({
481
- "test_loss": test_loss,
482
- "avg_cosine_similarity": avg_sim,
483
- "std_cosine_similarity": std_sim
484
- })
485
-
486
- # Save final model state only
487
- final_model_path = hparams['save_path']
488
- torch.save(model.state_dict(), final_model_path)
489
- print(f"Final model saved to {final_model_path}")
490
-
491
- wandb.finish()
492
 
493
- # ==============================================================================
494
- # 8. MAIN EXECUTION
495
- # ==============================================================================
496
  def main():
497
- """Main function to configure and run the training process."""
498
- hparams = {
499
- 'epochs': 1,
500
- 'lr': 1e-5,
501
- 'temperature': 0.05,
502
- 'batch_size': 64,
503
- 'max_length': 256,
504
- 'save_path': "simson_checkpoints_pubchem/simson_model_single_gpu.bin",
505
- 'checkpoint_path': "simson_checkpoints/checkpoint.pt", # Full checkpoint
506
- 'save_steps': 50_000, # Save a full checkpoint every 50k steps
507
- 'validation_steps': 50_000, # Validate every 50k steps
508
- 'max_embeddings': 512,
509
- 'resume_checkpoint': None, # Set to checkpoint path to resume
510
- }
511
-
512
- dataset = load_dataset('HoangHa/SMILES-250M')['train']
513
- smiles_column_name = 'SMILES'
514
 
515
- total_size = len(dataset)
516
- test_size = int(0.1 * total_size)
517
- val_size = int(0.1 * (total_size - test_size))
518
-
519
- test_smiles = dataset.select(range(test_size))[smiles_column_name]
520
- val_smiles = dataset.select(range(test_size, test_size + val_size))[smiles_column_name]
521
- train_smiles = dataset.select(range(test_size + val_size, total_size))[smiles_column_name]
522
- data_splits = (train_smiles, val_smiles, test_smiles)
523
-
524
- tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
525
- model_config = BertConfig(
526
- vocab_size=tokenizer.vocab_size,
527
  hidden_size=768,
528
  num_hidden_layers=4,
529
  num_attention_heads=12,
530
  intermediate_size=2048,
531
  max_position_embeddings=512
532
  )
533
-
534
- # Create directories
535
- save_dir = os.path.dirname(hparams['save_path'])
536
- checkpoint_dir = os.path.dirname(hparams['checkpoint_path'])
537
- for directory in [save_dir, checkpoint_dir]:
538
- if not os.path.exists(directory):
539
- os.makedirs(directory)
540
 
541
- # Directly call the training function for a single-GPU run
542
- run_training(model_config, hparams, data_splits)
543
 
544
  if __name__ == '__main__':
 
 
545
  main()
 
1
+ import pandas as pd
2
+ import numpy as np
3
  import torch
4
  import torch.nn as nn
5
  import torch.optim as optim
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import BertConfig, BertModel, AutoTokenizer
8
+ from rdkit import Chem
9
+ from rdkit.Chem.Scaffolds import MurckoScaffold
10
+ import copy
11
  from tqdm import tqdm
12
+ import os
13
+ from sklearn.metrics import roc_auc_score, root_mean_squared_error, mean_absolute_error
14
+ from itertools import compress
15
+ from collections import defaultdict
16
 
17
+ torch.set_float32_matmul_precision('high')
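+ # 'high' allows float32 matmuls to use TF32 (or equivalent) kernels on supported GPUs: faster, with slightly reduced precision.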
18
 
19
+ # --- 1. Data Loading ---
20
+ # Function to load datasets from their respective URLs.
21
+ def load_lists_from_url(data):
22
+ """
23
+ Load SMILES and labels from Moleculenet website.
24
+ """
25
+ if data == 'bbbp':
26
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv')
27
+ smiles, labels = df.smiles, df.p_np
28
+ elif data == 'clintox':
29
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz', compression='gzip')
30
+ smiles = df.smiles
31
+ labels = df.drop(['smiles'], axis=1)
32
+ elif data == 'hiv':
33
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv')
34
+ smiles, labels = df.smiles, df.HIV_active
35
+ elif data == 'sider':
36
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz', compression='gzip')
37
+ smiles = df.smiles
38
+ labels = df.drop(['smiles'], axis=1) # (1427, 27)
39
+ elif data == 'esol':
40
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv')
41
+ smiles = df.smiles
42
+ labels = df['ESOL predicted log solubility in mols per litre']
43
+ elif data == 'freesolv':
44
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv')
45
+ smiles = df.smiles
46
+ labels = df.calc
47
+ elif data == 'lipophicility':
48
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv')
49
+ smiles, labels = df.smiles, df['exp']
50
+ elif data == 'tox21':
51
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz', compression='gzip')
52
+ df = df.dropna(axis=0, how='any').reset_index(drop=True) # drop nan values
53
+ smiles = df.smiles
54
+ labels = df.drop(['mol_id', 'smiles'], axis=1) # 12 cols
55
+ elif data == 'bace':
56
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv')
57
+ smiles, labels = df.mol, df.Class
63
+ elif data == 'qm8':
64
+ df = pd.read_csv('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv')
65
+ df = df.dropna(axis=0, how='any').reset_index(drop=True) # drop nan values
66
+ smiles = df.smiles
67
+ labels = df.drop(['smiles', 'E2-PBE0.1', 'E1-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1'], axis=1) # 12 tasks
68
+
69
+ return smiles, labels
70
+
71
+ # --- 2. Scaffold Splitting ---
72
+ # Class to split the dataset based on molecular scaffolds.
73
+ class ScaffoldSplitter:
74
+ def __init__(self, data, seed, train_frac=0.8, val_frac=0.1, test_frac=0.1, include_chirality=True):
75
+ self.data = data
76
+ self.seed = seed
77
+ self.include_chirality = include_chirality
78
+ self.train_frac = train_frac
79
+ self.val_frac = val_frac
80
+ self.test_frac = test_frac
81
+
82
+ def generate_scaffold(self, smiles):
83
+ mol = Chem.MolFromSmiles(smiles)
84
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=self.include_chirality)
85
+ return scaffold
86
+
87
+ def scaffold_split(self):
88
+ smiles, labels = load_lists_from_url(self.data)
89
 
90
+ # Initialize non_null as False for all samples
91
+ non_null = np.ones(len(smiles)) == 0
92
+
93
+ # Dataset-specific null handling
94
+ if self.data == 'tox21' or self.data == 'sider' or self.data == 'clintox':
95
+ for i in range(len(smiles)):
96
+ # Check if molecule is valid AND no missing labels
97
+ if Chem.MolFromSmiles(smiles[i]) and labels.loc[i].isnull().sum() == 0:
98
+ non_null[i] = 1
99
+ else:
100
+ # For single-task datasets, only check molecule validity
101
+ for i in range(len(smiles)):
102
+ if Chem.MolFromSmiles(smiles[i]):
103
+ non_null[i] = 1
104
+
105
+ # Extract valid samples with original indices preserved
106
+ smiles_list = list(compress(enumerate(smiles), non_null))
107
+
108
+ rng = np.random.RandomState(self.seed)
109
+
110
+ # Group by scaffold
111
+ scaffolds = defaultdict(list)
112
+ for i, sms in smiles_list:
113
+ scaffold = self.generate_scaffold(sms)
114
+ scaffolds[scaffold].append(i)
115
+
116
+ scaffold_sets = list(scaffolds.values())
117
+ rng.shuffle(scaffold_sets)
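+ # Each scaffold group is assigned to exactly one split below, so structurally related molecules never leak across train/val/test.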
118
+ # Calculate target sizes for validation and test sets
119
+ n_total_val = int(np.floor(self.val_frac * len(smiles_list)))
120
+ n_total_test = int(np.floor(self.test_frac * len(smiles_list)))
121
+
122
+ train_idx, val_idx, test_idx = [], [], []
123
+
124
+ # Assign scaffold groups to splits
125
+ for scaffold_set in scaffold_sets:
126
+ if len(val_idx) + len(scaffold_set) <= n_total_val:
127
+ val_idx.extend(scaffold_set)
128
+ elif len(test_idx) + len(scaffold_set) <= n_total_test:
129
+ test_idx.extend(scaffold_set)
130
+ else:
131
+ train_idx.extend(scaffold_set)
132
+
133
+ return train_idx, val_idx, test_idx
134
+ # --- 3. PyTorch Dataset ---
135
+ # Custom Dataset class for handling SMILES data.
136
+ class MoleculeDataset(Dataset):
137
+ def __init__(self, smiles_list, labels, tokenizer, max_len=512):
138
  self.smiles_list = smiles_list
139
+ self.labels = labels
140
  self.tokenizer = tokenizer
141
+ self.max_len = max_len
 
142
 
143
  def __len__(self):
144
  return len(self.smiles_list)
145
 
146
  def __getitem__(self, idx):
147
+ smiles = self.smiles_list[idx]
148
+ label = self.labels.iloc[idx]
149
+
150
+ encoding = self.tokenizer(
151
+ smiles,
152
+ truncation=True,
153
+ padding='max_length',
154
+ max_length=self.max_len,
155
+ return_tensors='pt'
156
+ )
157
 
158
+ item = {key: val.squeeze(0) for key, val in encoding.items()}
159
 
160
+ # Handle single-task and multi-task labels
161
+ if isinstance(label, pd.Series):
162
+ label_values = label.values.astype(np.float32)
163
+ else:
164
+ label_values = np.array([label], dtype=np.float32)
165
 
166
+ item['labels'] = torch.tensor(label_values, dtype=torch.float)
167
+ return item
168
 
169
+ # --- 4. Model Architecture ---
170
+ def global_ap(x):
171
  """
172
+ Global Average Pooling
173
+ Input: [B, max_len, hid_dim]
174
+ Return: [B, hid_dim]
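+ Note: the mean is taken over all token positions, padding included.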
175
  """
176
+ return torch.mean(x.view(x.size(0), x.size(1), -1), dim=1)
177
 
178
+ class SimSonEncoder(nn.Module):
179
+ def __init__(self, config: BertConfig, max_len: int, dropout: float = 0.1):
180
+ super(SimSonEncoder, self).__init__()
181
+ self.config = config
182
+ self.max_len = max_len
183
+ self.bert = BertModel(config, add_pooling_layer=False)
184
+ self.linear = nn.Linear(config.hidden_size, max_len)
185
+ self.dropout = nn.Dropout(dropout)
186
 
187
+ def forward(self, input_ids, attention_mask=None):
188
+ if attention_mask is None:
189
+ attention_mask = input_ids.ne(self.config.pad_token_id)
190
+ outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
191
+ hidden_states = self.dropout(outputs.last_hidden_state)
192
+ pooled = global_ap(hidden_states)
193
+ return self.linear(pooled)
194
+
195
+ class SimSonClassifier(nn.Module):
196
+ def __init__(self, encoder: SimSonEncoder, num_labels: int, dropout=0.1):
197
+ super(SimSonClassifier, self).__init__()
198
+ self.encoder = encoder
199
+ self.clf = nn.Linear(encoder.max_len, num_labels)
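+ # The head consumes the encoder's max_len-dimensional projection (the output of encoder.linear), not hidden_size.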
200
+ self.relu = nn.ReLU()
201
+ self.dropout = nn.Dropout(dropout)
202
 
203
+ def forward(self, input_ids, attention_mask=None):
204
+ x = self.encoder(input_ids, attention_mask)
205
+ x = self.relu(self.dropout(x))
206
+ logits = self.clf(x)
207
+ return logits
208
+
209
+ def load_encoder_params(self, state_dict_path):
210
+ """Loads pretrained parameters into the SimSonEncoder."""
211
+ self.encoder.load_state_dict(torch.load(state_dict_path))
212
+ print("Pretrained encoder parameters loaded.")
213
+
214
+ # --- 5. Training, Validation, and Testing Loops ---
215
+ def get_criterion(task_type, num_labels):
216
+ """Select loss function based on task."""
217
+ if task_type == 'classification':
218
+ return nn.BCEWithLogitsLoss()
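+ # BCEWithLogitsLoss applies the sigmoid internally, so the model outputs raw logits during training.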
219
+ elif task_type == 'regression':
220
+ return nn.MSELoss()
221
+ else:
222
+ raise ValueError(f"Unknown task type: {task_type}")
223
+
224
+ def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
225
  model.train()
226
+ total_loss = 0
227
+ for batch in dataloader:
228
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
229
+ labels = batch['labels'].to(device)
230
 
231
  optimizer.zero_grad()
232
+ outputs = model(**inputs)
233
+ loss = criterion(outputs, labels)
234
  loss.backward()
235
  optimizer.step()
 
236
  scheduler.step()
237
 
238
  total_loss += loss.item()
239
+ return total_loss / len(dataloader)
240
 
241
+ def eval_epoch(model, dataloader, criterion, device):
 
242
  model.eval()
243
  total_loss = 0
244
+ with torch.no_grad():
245
+ for batch in dataloader:
246
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
247
+ labels = batch['labels'].to(device)
248
+ outputs = model(**inputs)
249
+ loss = criterion(outputs, labels)
250
+ total_loss += loss.item()
251
+ return total_loss / len(dataloader)
252
+
253
+ def test_model(model, dataloader, device, task_type='classification'):
254
+ model.eval()
255
+ all_preds, all_labels = [], []
256
+ with torch.no_grad():
257
+ for batch in dataloader:
258
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
259
+ labels = batch['labels']
260
+ outputs = model(**inputs)
261
+
262
+ # Apply sigmoid to get probabilities for classification; keep raw outputs for regression
+ preds = torch.sigmoid(outputs) if task_type == 'classification' else outputs
264
+
265
+ all_preds.append(preds.cpu().numpy())
266
+ all_labels.append(labels.numpy())
267
+
268
+ return np.concatenate(all_preds), np.concatenate(all_labels)
269
 
270
+ # --- 6. Main Execution Block ---
 
 
271
  def main():
272
+ # --- Configuration ---
273
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
274
+ print(f"Using device: {DEVICE}")
275
 
276
+ DATASETS_TO_RUN = {
277
+ #'esol': {'task_type': 'regression', 'num_labels': 1},
278
+ #'freesolv': {'task_type': 'regression', 'num_labels':1},
279
+ #'lipophicility': {'task_type': 'regression', 'num_labels': 1},
280
+ #'qm8': {'task_type': 'regression', 'num_labels': 12},
281
+ #'bbbp': {'task_type': 'classification', 'num_labels': 1},
282
+ 'tox21': {'task_type': 'classification', 'num_labels': 12},
283
+ #'sider': {'task_type': 'classification', 'num_labels': 27},
284
+ #'clintox': {'task_type': 'classification', 'num_labels': 2},
285
+ #'hiv': {'task_type': 'classification', 'num_labels': 1},
286
+ #'bace': {'task_type': 'classification', 'num_labels': 1},
287
+ }
288
+ PATIENCE = 25
289
+ EPOCHS = 200
290
+ LEARNING_RATE = 2e-5
291
+ BATCH_SIZE = 128
292
+ MAX_LEN = 256
293
+
294
+ # --- Tokenizer and Model Config ---
295
+ TOKENIZER = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR')
296
+ ENCODER_CONFIG = BertConfig(
297
+ vocab_size=TOKENIZER.vocab_size,
298
  hidden_size=768,
299
  num_hidden_layers=4,
300
  num_attention_heads=12,
301
  intermediate_size=2048,
302
  max_position_embeddings=512
303
  )
304
 
305
+ aggregated_results = {}
306
+
307
+ for name, info in DATASETS_TO_RUN.items():
308
+ print(f"\n{'='*20} Processing Dataset: {name.upper()} {'='*20}")
309
+
310
+ # --- Data Loading and Splitting ---
311
+ splitter = ScaffoldSplitter(data=name, seed=42)
312
+ train_idx, val_idx, test_idx = splitter.scaffold_split()
313
+
314
+ # Load data once
315
+ smiles, labels = load_lists_from_url(name)
316
+
317
+ # Extract splits using returned indices
318
+ train_smiles = smiles.iloc[train_idx].reset_index(drop=True)
319
+ train_labels = labels.iloc[train_idx].reset_index(drop=True)
320
+
321
+ val_smiles = smiles.iloc[val_idx].reset_index(drop=True)
322
+ val_labels = labels.iloc[val_idx].reset_index(drop=True)
323
+
324
+ test_smiles = smiles.iloc[test_idx].reset_index(drop=True)
325
+ test_labels = labels.iloc[test_idx].reset_index(drop=True)
326
+ print(f"Data split - Train: {len(train_smiles)}, Val: {len(val_smiles)}, Test: {len(test_smiles)}")
327
+
328
+ train_dataset = MoleculeDataset(train_smiles, train_labels, TOKENIZER, MAX_LEN)
329
+ val_dataset = MoleculeDataset(val_smiles, val_labels, TOKENIZER, MAX_LEN)
330
+ test_dataset = MoleculeDataset(test_smiles, test_labels, TOKENIZER, MAX_LEN)
331
+
332
+ train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
333
+ val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
334
+ test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
335
+
336
+ # --- Model, Loss, and Optimizer ---
337
+ encoder = SimSonEncoder(ENCODER_CONFIG, 512)
338
+ encoder = torch.compile(encoder)
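+ # torch.compile wraps the module, so its state_dict keys carry an '_orig_mod.' prefix; compiling before loading keeps them compatible with a checkpoint saved from a compiled model.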
339
+ model = SimSonClassifier(encoder, num_labels=info['num_labels']).to(DEVICE)
340
+ model.load_encoder_params('../simson_checkpoints/checkpoint_best_model.bin')
341
+
342
+ criterion = get_criterion(info['task_type'], info['num_labels'])
343
+ optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
344
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS * len(train_loader))
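+ # The scheduler is stepped once per batch in train_epoch, so T_max is the total number of optimization steps.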
345
+ # --- Training and Validation ---
346
+ best_val_loss = float('inf')
347
+ best_model_state = None
348
+ current_patience = 0
349
+ for epoch in range(EPOCHS):
350
+ train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, DEVICE)
351
+ val_loss = eval_epoch(model, val_loader, criterion, DEVICE)
352
+ print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
353
+
354
+ if val_loss < best_val_loss:
355
+ best_val_loss = val_loss
356
+ best_model_state = copy.deepcopy(model.state_dict())
357
+ print(f" -> New best model saved with validation loss: {best_val_loss:.4f}")
358
+ current_patience = 0
359
+ else:
360
+ current_patience += 1
361
+ if current_patience >= PATIENCE:
362
+ print(f'Early stopping after {PATIENCE} epochs without improvement')
363
+ break
364
+
365
+ # --- Testing ---
366
+ print("\nTesting with the best model...")
367
+ model.load_state_dict(best_model_state)
368
+ test_preds, test_true = test_model(model, test_loader, DEVICE, info['task_type'])
369
+
370
+ # Store results. For classification, you can now calculate metrics like ROC-AUC.
371
+ aggregated_results[name] = {
372
+ 'best_val_loss': best_val_loss,
373
+ 'test_predictions': test_preds,
374
+ 'test_labels': test_true
375
+ }
376
+ print(f"Finished testing for {name}.")
377
+
378
+ # --- Final Results Aggregation ---
379
+ print(f"\n{'='*20} AGGREGATED RESULTS {'='*20}")
380
+ for name, result in aggregated_results.items():
381
+ # Here you would typically calculate and display final metrics from predictions
382
+ # For example, using scikit-learn's roc_auc_score
383
+ # from sklearn.metrics import roc_auc_score
384
+ if name in ['bbbp', 'tox21', 'sider', 'clintox', 'hiv', 'bace']:
385
+ auc = roc_auc_score(result['test_labels'], result['test_predictions'], average='macro')
386
+ print(f'{name} ROC AUC: {auc}')
387
+
388
+ if name in ['lipophicility', 'esol', 'freesolv', 'qm8']:
389
+ rmse = root_mean_squared_error(result['test_labels'], result['test_predictions'])
390
+ mae = mean_absolute_error(result['test_labels'], result['test_predictions'])
391
+ print(f'{name} MAE: {mae}')
392
+ print(f'{name} RMSE: {rmse}')
393
+
394
+ print("\nScript finished.")
395
 
396
  if __name__ == '__main__':
397
+ # Note: This script requires rdkit. You can install it via pip:
398
+ # pip install rdkit
399
  main()