Upload 4 files

Browse files

Files changed (4) hide show

README.md +57 -0
best_model_checkpoint.pt +3 -0
esm3bedding.py +86 -0
modules.py +1744 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+license: mit
+tags:
+  - protein
+  - binding-affinity
+  - deep-learning
+  - esm
+  - pytorch
+language:
+  - en
+---
+# 🧬 Protein Binding Affinity Predictor
+Dual-head model for predicting protein-protein binding affinity (ΔG) and mutation effects (ΔΔG).
+## Model Performance
+| Metric | Validation Score |
+|--------|-----------------|
+| dG Pearson | 0.51 |
+| ddG Pearson | 0.70 |
+| Sum PCC | 1.21 |
+## Architecture
+- **Backbone**: ESM-600M (frozen embeddings)
+- **Pooling**: Sliced-Wasserstein Embedding (SWE)
+- **Heads**: Dual-head (dG + ddG)
+- **Input**: Protein sequences (1153-dim = 1152 ESM + 1 mutation channel)
+## Usage
+```python
+from huggingface_hub import hf_hub_download
+import torch
+# Download checkpoint
+ckpt = hf_hub_download(repo_id="supanthadey1/protein-binding-affinity", filename="best_model_checkpoint.pt")
+checkpoint = torch.load(ckpt, map_location='cpu')
+model.load_state_dict(checkpoint['model_state_dict'])
+```
+## Predictions
+- **ΔG (kcal/mol)**: Binding free energy. More negative = stronger binding.
+- **ΔΔG (kcal/mol)**: Mutation effect. Negative = stabilizing, Positive = destabilizing.
+## Training Data
+Trained on multiple datasets including SKEMPI, BindingGym, PDBbind, and others.
+## Citation
+```
+[Citation coming soon]
+```

best_model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ac87dbc506c018fcf8b26f296d595350e8544adc2034da24cdd6cdd03e6b9a6
+size 1603771034

esm3bedding.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# esm3bedding.py
+import os
+import torch
+from esm.models.esmc import ESMC
+from esm.sdk.api import ESMProtein, LogitsConfig
+from huggingface_hub import login
+from utils import get_logger
+from base import Featurizer
+logg = get_logger()
+class ESM3Featurizer(Featurizer):
+    def __init__(self, save_dir: str, api_key: str, per_tok: bool = True):
+        super().__init__("ESM3", 1152, save_dir=save_dir)
+        self.per_tok = per_tok
+        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.client = None
+        self._login(api_key)
+        self._initialize_model()
+    def _login(self, api_key: str):
+        try:
+            login(api_key)
+            logg.info("Successfully logged into Hugging Face Hub.")
+        except Exception as e:
+            logg.error(f"Failed to log in to Hugging Face Hub: {e}")
+            raise RuntimeError("Hugging Face login failed. Check your API key.")
+    def _initialize_model(self):
+        try:
+            logg.info("Initializing ESMC model (esmc_600m)...")
+            # First try normal online loading
+            try:
+                self.client = ESMC.from_pretrained("esmc_600m")
+                self.client.to(self._device)
+                logg.info("ESMC model loaded.")
+                return
+            except Exception as online_error:
+                logg.warning(f"Online model loading failed: {online_error}")
+                logg.info("Attempting offline mode (using local cache)...")
+            # Fallback: Try offline mode using cached files
+            import os
+            os.environ["HF_HUB_OFFLINE"] = "1"
+            os.environ["TRANSFORMERS_OFFLINE"] = "1"
+            try:
+                self.client = ESMC.from_pretrained("esmc_600m", local_files_only=True)
+                self.client.to(self._device)
+                logg.info("ESMC model loaded from local cache (offline mode).")
+            except Exception as offline_error:
+                logg.error(f"Offline loading also failed: {offline_error}")
+                logg.error("="*60)
+                logg.error("ESMC MODEL NOT FOUND IN CACHE!")
+                logg.error("Run this on a node with internet access to cache the model:")
+                logg.error("  python -c \"from esm.models.esmc import ESMC; ESMC.from_pretrained('esmc_600m')\"")
+                logg.error("="*60)
+                raise RuntimeError("ESMC model not available. See error messages above.")
+        except Exception as e:
+            logg.error(f"Failed to load ESMC model: {e}")
+            raise RuntimeError("ESMC model initialization failed.")
+    def _transform(self, sequence: str) -> torch.Tensor:
+        try:
+            # REPLACE (not remove) invalid chars to preserve sequence length
+            valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
+            clean_sequence = ''.join(c if c in valid_aa else 'A' for c in sequence.upper())
+            protein = ESMProtein(sequence=clean_sequence)
+            protein_tensor = self.client.encode(protein)
+            logits_config = LogitsConfig(sequence=True, return_embeddings=True)
+            output = self.client.logits(protein_tensor, logits_config)
+            embeddings = output.embeddings  # shape => [1, L, D] or [L, D]
+            if embeddings.dim() == 3 and embeddings.shape[0] == 1:
+                embeddings = embeddings.squeeze(0)  # => [L, D]
+            if not self.per_tok:
+                embeddings = embeddings.mean(dim=0)  # => [D]
+            return embeddings
+        except Exception as e:
+            logg.error(f"Error generating embeddings for sequence: {e}")
+            return None

modules.py ADDED Viewed

	@@ -0,0 +1,1744 @@

+from typing import Tuple, List, Dict, Optional
+from pathlib import Path
+import os
+import math
+import pandas as pd
+import pytorch_lightning as pl
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torch.nn.utils.rnn import pad_sequence
+import numpy as np
+import hashlib
+import json
+import time
+from esm3bedding import ESM3Featurizer
+from utils import get_logger
+logg = get_logger()
+#########################################
+#        Source Type Mapping            #
+#########################################
+SOURCE_TYPE_MAP = {
+    # Protein complexes (unique structures)
+    'PDBbind': 'protein_complex',
+    'PPIKB': 'protein_complex',
+    'asd_biomap': 'protein_complex',
+    'asd_aae': 'protein_complex',
+    'asd_aatp': 'protein_complex',
+    'asd_osh': 'protein_complex',
+    # True mutations
+    'SKEMPI': 'mutation',
+    'BindingGym': 'mutation',
+    'asd_flab_koenig2017': 'mutation',      # 1-2aa differences
+    'asd_flab_warszawski2019': 'mutation',  # 1-2aa differences
+    'asd_flab_rosace2023': 'mutation',      # 1-5aa differences
+    'PEPBI': 'mutation',
+    # Antibody CDR variants
+    'asd_abbd': 'antibody_cdr',             # 3-14aa CDR differences
+    'abdesign': 'antibody_cdr',
+    'asd_flab_hie2022': 'antibody_cdr',     # 2-17aa differences
+    'asd_flab_shanehsazzadeh2023': 'antibody_cdr',  # 3-18aa differences
+}
+SOURCE_TYPE_TO_ID = {'protein_complex': 0, 'mutation': 1, 'antibody_cdr': 2}
+DEFAULT_SOURCE_TYPE = 'mutation'  # Default for unknown sources
+#########################################
+#        Collate function (Siamese)     #
+#########################################
+def advanced_collate_fn(batch):
+    mut_c1_list, mut_c2_list, mut_y_list = [], [], []
+    wt_c1_list, wt_c2_list, wt_y_list = [], [], []
+    has_valid_wt_list = []  # CRITICAL: Track which samples have REAL WT embeddings (not zeros)
+    meta_list = []
+    for data, meta in batch:
+        (c1, c2, y, cw1, cw2, yw) = data
+        # mutant
+        mut_c1_list.append(c1)
+        mut_c2_list.append(c2)
+        mut_y_list.append(torch.tensor([y], dtype=torch.float32))
+        # wildtype
+        if cw1 is not None and cw2 is not None and yw is not None:
+            wt_c1_list.append(cw1)
+            wt_c2_list.append(cw2)
+            wt_y_list.append(torch.tensor([yw], dtype=torch.float32))
+            has_valid_wt_list.append(True)  # Real WT data available
+        else:
+            # fallback if no known WT - ZEROS corrupt ddG signal!
+            wt_c1_list.append(torch.zeros((1, c1.shape[1])))
+            wt_c2_list.append(torch.zeros((1, c2.shape[1])))
+            wt_y_list.append(torch.tensor([0.0], dtype=torch.float32))
+            has_valid_wt_list.append(False)  # INVALID for ddG - would compute mut-0=mut
+        meta_list.append(meta)
+    # pad mutant
+    c1_padded = pad_sequence(mut_c1_list, batch_first=True)
+    c2_padded = pad_sequence(mut_c2_list, batch_first=True)
+    B = c1_padded.shape[0]
+    N1 = c1_padded.shape[1]
+    N2 = c2_padded.shape[1]
+    c1_mask_list, c2_mask_list = [], []
+    for i in range(B):
+        l1 = mut_c1_list[i].shape[0]
+        l2 = mut_c2_list[i].shape[0]
+        m1 = [True]*l1 + [False]*(N1-l1)
+        m2 = [True]*l2 + [False]*(N2-l2)
+        c1_mask_list.append(torch.tensor(m1, dtype=torch.bool))
+        c2_mask_list.append(torch.tensor(m2, dtype=torch.bool))
+    c1_mask = torch.stack(c1_mask_list, dim=0)
+    c2_mask = torch.stack(c2_mask_list, dim=0)
+    y_mut = torch.cat(mut_y_list, dim=0)
+    # pad wildtype
+    w1_padded = pad_sequence(wt_c1_list, batch_first=True)
+    w2_padded = pad_sequence(wt_c2_list, batch_first=True)
+    N1w = w1_padded.shape[1]
+    N2w = w2_padded.shape[1]
+    w1_mask_list, w2_mask_list = [], []
+    for i in range(B):
+        l1 = wt_c1_list[i].shape[0]
+        l2 = wt_c2_list[i].shape[0]
+        m1 = [True]*l1 + [False]*(N1w-l1)
+        m2 = [True]*l2 + [False]*(N2w-l2)
+        w1_mask_list.append(torch.tensor(m1, dtype=torch.bool))
+        w2_mask_list.append(torch.tensor(m2, dtype=torch.bool))
+    w1_mask = torch.stack(w1_mask_list, dim=0)
+    w2_mask = torch.stack(w2_mask_list, dim=0)
+    y_wt = torch.cat(wt_y_list, dim=0)
+    has_wt_list = []
+    is_wt_list = []  # NEW: Track which samples ARE WT (not just have WT reference)
+    has_dg_list = []
+    has_ddg_list = []  # Track which samples have valid explicit ddG
+    has_inferred_ddg_list = []  # NEW: Track which samples have inferred ddG
+    has_both_list = []
+    ddg_list = []
+    ddg_inferred_list = []  # NEW: Inferred ddG values
+    # DEBUG: Track data consistency
+    n_has_ddg_true = 0
+    n_ddg_zero = 0
+    n_ddg_nan = 0
+    for i in range(B):
+        # from meta - use has_any_wt to include both real and inferred WT sequences
+        has_wt_list.append(meta_list[i].get("has_any_wt", meta_list[i].get("has_real_wt", False)))
+        is_wt_list.append(meta_list[i].get("is_wt", False))  # NEW: Whether sample IS a WT sample (not mutant)
+        has_dg_list.append(meta_list[i].get("has_dg", False))  # Default False to prevent false positives
+        # FIX: Include inferred ddG in has_ddg flag so validation samples with dG_mut and dG_wt are used
+        has_explicit_ddg = meta_list[i].get("has_ddg", False)
+        has_inferred_ddg_flag = meta_list[i].get("has_inferred_ddg", False)
+        # has_ddg should be True if we have EITHER explicit OR inferred ddG
+        has_ddg_flag = has_explicit_ddg or has_inferred_ddg_flag
+        has_ddg_list.append(has_ddg_flag)
+        has_inferred_ddg_list.append(has_inferred_ddg_flag)
+        has_both_list.append(meta_list[i].get("has_both_dg_ddg", False))  # For symmetric consistency
+        # FIX: Use explicit ddG if available, otherwise use inferred ddG (dG_mut - dG_wt)
+        ddg_val = meta_list[i].get("ddg", float('nan'))
+        ddg_inf_val = meta_list[i].get("ddg_inferred", float('nan'))
+        is_explicit_nan = ddg_val != ddg_val
+        is_inferred_nan = ddg_inf_val != ddg_inf_val
+        # DEBUG: Check for data consistency issues
+        if has_explicit_ddg:
+            n_has_ddg_true += 1
+            if is_explicit_nan:
+                n_ddg_nan += 1
+            elif abs(ddg_val) < 1e-8:
+                n_ddg_zero += 1
+        # Priority: explicit ddG > inferred ddG > 0.0 fallback (masked out)
+        if not is_explicit_nan:
+            ddg_list.append(ddg_val)
+        elif not is_inferred_nan:
+            ddg_list.append(ddg_inf_val)  # Use inferred ddG when explicit unavailable
+        else:
+            ddg_list.append(0.0)  # Fallback (will be masked by has_ddg=False)
+        # Collect inferred ddG values for separate tracking (already fetched above)
+        ddg_inferred_list.append(ddg_inf_val if not is_inferred_nan else 0.0)
+    # DEBUG: Log batch statistics if there are issues
+    if n_has_ddg_true > 0 and (n_ddg_nan > 0 or n_ddg_zero > B // 2):
+        print(f"[COLLATE DEBUG] Batch has_ddg stats: {n_has_ddg_true}/{B} have has_ddg=True, "
+              f"{n_ddg_nan} have NaN ddg (BUG!), {n_ddg_zero} have ddg≈0")
+    has_wt = torch.tensor(has_wt_list, dtype=torch.bool)
+    has_valid_wt = torch.tensor(has_valid_wt_list, dtype=torch.bool)  # CRITICAL: Only True if WT is real (not zeros)
+    is_wt = torch.tensor(is_wt_list, dtype=torch.bool)  # Sample IS a WT sample
+    has_dg = torch.tensor(has_dg_list, dtype=torch.bool)
+    has_ddg = torch.tensor(has_ddg_list, dtype=torch.bool)
+    has_inferred_ddg = torch.tensor(has_inferred_ddg_list, dtype=torch.bool)
+    has_both_dg_ddg = torch.tensor(has_both_list, dtype=torch.bool)
+    ddg_labels = torch.tensor(ddg_list, dtype=torch.float32)
+    ddg_inferred_labels = torch.tensor(ddg_inferred_list, dtype=torch.float32)
+    # DEBUG: Log WT validity stats for first few batches
+    n_valid_wt = has_valid_wt.sum().item()
+    n_has_wt = has_wt.sum().item()
+    if n_has_wt > 0 and n_valid_wt < n_has_wt:
+        print(f"[COLLATE DEBUG] WT validity: {n_valid_wt}/{n_has_wt} have valid WT embeddings "
+              f"({n_has_wt - n_valid_wt} samples have zero-fallback and will be EXCLUDED from ddG training)")
+    # Collect data_source for per-source metrics
+    data_source_list = [meta_list[i].get("data_source", "unknown") for i in range(B)]
+    # Collect source_type_ids for model conditioning
+    source_type_id_list = []
+    for i in range(B):
+        data_src = meta_list[i].get("data_source", "unknown")
+        source_type = SOURCE_TYPE_MAP.get(data_src, DEFAULT_SOURCE_TYPE)
+        source_type_id = SOURCE_TYPE_TO_ID[source_type]
+        source_type_id_list.append(source_type_id)
+    source_type_ids = torch.tensor(source_type_id_list, dtype=torch.long)
+    out = {
+        "mutant": (c1_padded, c1_mask, c2_padded, c2_mask, y_mut),
+        "wildtype": (w1_padded, w1_mask, w2_padded, w2_mask, y_wt),
+        "has_wt": has_wt,
+        "has_valid_wt": has_valid_wt,  # CRITICAL: True only if WT embeddings are real (not zeros)
+        "is_wt": is_wt,  # Sample IS a WT sample (for routing to dG head)
+        "has_dg": has_dg,  # Whether samples have absolute dG values
+        "has_ddg": has_ddg,  # Whether samples have valid explicit ddG values
+        "has_inferred_ddg": has_inferred_ddg,  # Whether samples have inferred ddG
+        "has_both_dg_ddg": has_both_dg_ddg,  # For symmetric consistency loss
+        "ddg_labels": ddg_labels,  # Direct ddG labels for BindingGym-style data
+        "ddg_inferred_labels": ddg_inferred_labels,  # Inferred ddG = dG_mut - dG_wt
+        "data_source": data_source_list,  # For per-source validation metrics
+        "source_type_ids": source_type_ids,  # For model conditioning (0=protein_complex, 1=mutation, 2=antibody_cdr)
+        "metadata": meta_list
+    }
+    return out
+#########################################
+#  SiameseDataset (Simplified)          #
+#########################################
+class AdvancedSiameseDataset(Dataset):
+    """
+    Dataset that handles mutation positions with a simple indicator channel.
+    Reads columns:
+      #Pdb, block1_sequence, block1_mut_positions, block1_mutations,
+      block2_sequence, block2_mut_positions, block2_mutations, del_g, ...
+    """
+    def __init__(self, df: pd.DataFrame, featurizer: ESM3Featurizer, embedding_dir: str,
+                 normalize_embeddings=True, augment=False, max_len=1022,
+                 wt_reference_df: pd.DataFrame = None):
+        super().__init__()
+        # Store WT reference DF (e.g. training set) for looking up missing WTs
+        # This enables Implicit ddG (dG_mut - dG_wt) even if WTs are not in the current split
+        self.wt_reference_df = wt_reference_df if wt_reference_df is not None else None
+        initial_len = len(df)
+        # CRITICAL FIX: Do NOT drop rows based on length because it shifts indices!
+        # External splits (indices) rely on the original row numbers.
+        # Instead, we TRUNCATE sequences that are too long to maintain alignment.
+        # Identify long sequences
+        long_mask = (df["block1_sequence"].astype(str).str.len() > max_len) | \
+                    (df["block2_sequence"].astype(str).str.len() > max_len)
+        n_long = long_mask.sum()
+        if n_long > 0:
+            print(f"  [Dataset] Truncating {n_long} samples with length > {max_len} to maintain index alignment (CRITICAL FIX).")
+            # Truncate sequences in place
+            # Use .copy() to avoid SettingWithCopyWarning if df is a slice
+            df = df.copy()
+            df.loc[long_mask, "block1_sequence"] = df.loc[long_mask, "block1_sequence"].astype(str).str.slice(0, max_len)
+            df.loc[long_mask, "block2_sequence"] = df.loc[long_mask, "block2_sequence"].astype(str).str.slice(0, max_len)
+        # No rows dropped, so indices remain aligned with split files
+        self.df = df.reset_index(drop=True)
+        #region agent log
+        try:
+            cols = set(self.df.columns.tolist())
+            need = {"block1_mut_positions", "block2_mut_positions", "Mutation(s)_PDB"}
+            missing = sorted(list(need - cols))
+            payload = {
+                "sessionId": "debug-session",
+                "runId": "pre-fix",
+                "hypothesisId": "G",
+                "location": "modules.py:AdvancedSiameseDataset:__init__",
+                "message": "Dataset columns presence check for mutation positions",
+                "data": {
+                    "n_rows": int(len(self.df)),
+                    "has_block1_mut_positions": "block1_mut_positions" in cols,
+                    "has_block2_mut_positions": "block2_mut_positions" in cols,
+                    "has_mutation_pdb": "Mutation(s)_PDB" in cols,
+                    "missing": missing,
+                },
+                "timestamp": int(time.time() * 1000),
+            }
+            with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
+                f.write(json.dumps(payload, default=str) + "\n")
+            print(f"[AGENTLOG MUTPOSCOLS] missing={missing}")
+        except Exception:
+            pass
+        #endregion
+        #region agent log
+        # Disambiguate whether "0 positions" is happening for MUT embeddings or WT embeddings
+        try:
+            if not hasattr(self, "_agent_embed_call_counter"):
+                self._agent_embed_call_counter = 0
+            if self._agent_embed_call_counter < 10:
+                self._agent_embed_call_counter += 1
+                print(
+                    f"[AGENTLOG EMBCALL] idx={idx} role=mut "
+                    f"b1_mutpos_n={len(b1_mutpos)} b2_mutpos_n={len(b2_mutpos)} "
+                    f"seq1_len={len(item.get('seq1',''))} seq2_len={len(item.get('seq2',''))}"
+                )
+        except Exception:
+            pass
+        #endregion
+        # Recover antibody WTs (ANTIBODY_MUTATION) before augmentation or indexing
+        self.df = self._recover_antibody_wts(self.df)
+        # ---------- OPTIONAL AUGMENT: reverse mutation (mut ↔ WT) ----------
+        # Only augment MUTANT samples (not WT) - WT samples don't benefit from reversal
+        # and doubling them confuses the pdb_to_wt lookup
+        if augment:
+            # Identify mutant rows (non-empty Mutation(s)_PDB)
+            mut_mask = self.df["Mutation(s)_PDB"].notna() & (self.df["Mutation(s)_PDB"].str.strip() != "")
+            mutant_df = self.df[mut_mask].copy()
+            if len(mutant_df) > 0:
+                # Create reversed copies of mutant samples only
+                rev_df = mutant_df.copy()
+                # For the reverse augmentation we invert the sign of ddg
+                if "ddg" in rev_df.columns:
+                    rev_df["ddg"] = -rev_df["ddg"]
+                rev_df["is_reverse"] = True  # flag for reversed samples
+                # Original samples stay as-is
+                self.df["is_reverse"] = False
+                self.df = pd.concat([self.df, rev_df], ignore_index=True)
+                print(f"  [Dataset] Augmented: added {len(rev_df)} reversed mutant samples (antisymmetry training)")
+            else:
+                self.df["is_reverse"] = False
+        else:
+            self.df["is_reverse"] = False
+        # -------------------------------------------------------------------
+        # ---------- PAIR ID (mutant – WT) ----------------------------------
+        # Use PDB + cleaned‑mutation string so mutant and its WT share an ID
+        self.df["pair_id"] = (
+            self.df["#Pdb"].astype(str) + "_" +
+            self.df["Mutation(s)_cleaned"].fillna("")  # WT rows have empty mutation
+        )
+        # -------------------------------------------------------------------
+        self.featurizer = featurizer
+        self.embedding_dir = Path(embedding_dir)
+        self.embedding_dir.mkdir(exist_ok=True, parents=True)
+        self.normalize = normalize_embeddings
+        self.samples = []
+        self._embedding_cache = {}  # LRU-style cache for frequently accessed embeddings
+        self._cache_max_size = 20000  # Cache up to 20k embeddings (~20-40GB RAM)
+        self._cache_hits = 0
+        self._cache_misses = 0
+        # map each PDB to a wildtype row index if it exists
+        print(f"  [Dataset] Building WT index for {len(self.df)} rows...")
+        self.pdb_to_wt = {}
+        for i, row in self.df.iterrows():
+            pdb = row["#Pdb"]
+            mut_str = row.get("Mutation(s)_PDB","")
+            is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
+            if is_wt and pdb not in self.pdb_to_wt:
+                self.pdb_to_wt[pdb] = i
+        # Build external WT map if reference DF is provided
+        self.external_pdb_to_wt = {}
+        if self.wt_reference_df is not None:
+            print(f"  [Dataset] Building external WT index from {len(self.wt_reference_df)} reference rows...")
+            for i, row in self.wt_reference_df.iterrows():
+                # Only index actual WTs
+                mut_str = row.get("Mutation(s)_PDB","")
+                is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
+                if 'is_wt' in row: # Prioritize pre-computed flag
+                    is_wt = is_wt or row['is_wt']
+                pdb = row["#Pdb"]
+                if is_wt and pdb not in self.external_pdb_to_wt:
+                    self.external_pdb_to_wt[pdb] = i
+            print(f"  [Dataset] Indexed {len(self.external_pdb_to_wt)} external WTs.")
+        # Build external WT map if reference DF is provided
+        self.external_pdb_to_wt = {}
+        if self.wt_reference_df is not None:
+            print(f"  [Dataset] Building external WT index from {len(self.wt_reference_df)} reference rows...")
+            for i, row in self.wt_reference_df.iterrows():
+                # Only index actual WTs
+                mut_str = row.get("Mutation(s)_PDB","")
+                is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
+                # Also check 'is_wt' column if present
+                if 'is_wt' in row:
+                    is_wt = is_wt or row['is_wt']
+                pdb = row["#Pdb"]
+                if is_wt and pdb not in self.external_pdb_to_wt:
+                    self.external_pdb_to_wt[pdb] = i
+            print(f"  [Dataset] Indexed {len(self.external_pdb_to_wt)} external WTs.")
+        # LAZY LOADING: Only store metadata, NOT embeddings
+        # Embeddings will be loaded on-demand in __getitem__
+        print(f"  [Dataset] Building sample metadata for {len(self.df)} rows (lazy loading)...")
+        from tqdm import tqdm
+        for i, row in tqdm(self.df.iterrows(), total=len(self.df), desc="  Indexing"):
+            # RESET computed mutations for this row to prevent stale data from previous iterations
+            if hasattr(self, '_last_computed_mutpos'):
+                del self._last_computed_mutpos
+            pdb = row["#Pdb"]
+            seq1 = row["block1_sequence"]
+            seq2 = row["block2_sequence"]
+            # Data source for per-source validation metrics
+            data_source = row.get("data_source", "unknown")
+            # Handle missing dG values (e.g., BindingGym has only ddG)
+            raw_delg = row["del_g"]
+            delg = float(raw_delg) if pd.notna(raw_delg) and raw_delg != '' else float('nan')
+            # Get ddG if available (for ddG-only datasets like BindingGym)
+            raw_ddg = row.get("ddg", None)
+            ddg = float(raw_ddg) if pd.notna(raw_ddg) and raw_ddg != '' else float('nan')
+            # Parse mutations (just store the string, parse later)
+            b1_mutpos_str = row.get("block1_mut_positions","[]")
+            b2_mutpos_str = row.get("block2_mut_positions","[]")
+            # DEBUG: Print first few rows to debug disappearing mutations
+            if i < 5:
+                print(f"DEBUG ROW {i}: b1='{b1_mutpos_str}' ({type(b1_mutpos_str)}), b2='{b2_mutpos_str}' ({type(b2_mutpos_str)})")
+                #region agent log
+                try:
+                    payload = {
+                        "sessionId": "debug-session",
+                        "runId": "pre-fix",
+                        "hypothesisId": "G",
+                        "location": "modules.py:AdvancedSiameseDataset:__init__:row0_4",
+                        "message": "Raw mutpos strings from df row (first few)",
+                        "data": {
+                            "i": int(i),
+                            "b1_mutpos_str": str(b1_mutpos_str),
+                            "b2_mutpos_str": str(b2_mutpos_str),
+                            "mutation_pdb": str(row.get("Mutation(s)_PDB", "")),
+                        },
+                        "timestamp": int(time.time() * 1000),
+                    }
+                    with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
+                        f.write(json.dumps(payload, default=str) + "\n")
+                    print(f"[AGENTLOG MUTPOSRAW] i={i} b1={b1_mutpos_str} b2={b2_mutpos_str} mut={row.get('Mutation(s)_PDB','')}")
+                except Exception:
+                    pass
+                #endregion
+            # Get chain info for block assignment during WT inference
+            b1_chains = str(row.get("block1_chains", "")).upper()
+            b2_chains = str(row.get("block2_chains", "")).upper()
+            mut_str = row.get("Mutation(s)_PDB","")
+            is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
+            wt_idx = self.pdb_to_wt.get(pdb, None)
+            # Get WT info if available (Internal > External)
+            row_wt = None
+            wt_source = None
+            if not hasattr(self, '_wt_source_stats'):
+                self._wt_source_stats = {'internal': 0, 'external': 0}
+            if wt_idx is not None:
+                row_wt = self.df.iloc[wt_idx]
+                wt_source = 'internal'
+                self._wt_source_stats['internal'] += 1
+            elif pdb in self.external_pdb_to_wt:
+                ext_idx = self.external_pdb_to_wt[pdb]
+                row_wt = self.wt_reference_df.iloc[ext_idx]
+                wt_source = 'external'
+                self._wt_source_stats['external'] += 1
+            if row_wt is not None:
+                seq1_wt = row_wt["block1_sequence"]
+                seq2_wt = row_wt["block2_sequence"]
+                raw_delg_wt = row_wt["del_g"]
+                delg_wt = float(raw_delg_wt) if pd.notna(raw_delg_wt) and raw_delg_wt != '' else float('nan')
+                b1_wtpos_str = row_wt.get("block1_mut_positions","[]")
+                b2_wtpos_str = row_wt.get("block2_mut_positions","[]")
+                # BUGFIX: If we have WT but NO mutation positions in CSV, we MUST calculate them!
+                # This fixes the "0% mutation positions" issue when the CSV column is empty/missing
+                if not is_wt and (b1_mutpos_str in ["[]", "", "nan", "None"] and b2_mutpos_str in ["[]", "", "nan", "None"]):
+                     # Run inference to locate mutations (side-effect: sets _last_computed_mutpos)
+                     # We ignore the inferred WT sequence since we have the real one
+                     # We pass "[]" to force scanning PDB positions
+                     self._infer_wt_sequences(
+                         seq1, seq2, mut_str, "[]", "[]",
+                         b1_chains, b2_chains
+                     )
+                     # Update mutpos_str if we found mutations
+                     if hasattr(self, '_last_computed_mutpos'):
+                        comp_b1, comp_b2 = self._last_computed_mutpos
+                        if b1_mutpos_str in ["[]", "", "nan", "None"] and comp_b1:
+                            b1_mutpos_str = str(comp_b1)
+                        if b2_mutpos_str in ["[]", "", "nan", "None"] and comp_b2:
+                            b2_mutpos_str = str(comp_b2)
+            else:
+                # No WT row found - try to INFER WT sequence by reversing mutations
+                # This is crucial for BindingGym data which stores mutant sequences only
+                seq1_wt, seq2_wt = self._infer_wt_sequences(
+                    seq1, seq2, mut_str, b1_mutpos_str, b2_mutpos_str,
+                    b1_chains, b2_chains  # Chain info for block assignment
+                )
+                delg_wt = float('nan')  # No WT dG available for inferred sequences
+                b1_wtpos_str, b2_wtpos_str = "[]", "[]"  # WT has no mutation positions
+                # FIX Bug #3: Use computed mutation positions from inference if original empty
+                if hasattr(self, '_last_computed_mutpos'):
+                    comp_b1, comp_b2 = self._last_computed_mutpos
+                    if b1_mutpos_str in ["[]", "", "nan", "None"] and comp_b1:
+                        b1_mutpos_str = str(comp_b1)
+                    if b2_mutpos_str in ["[]", "", "nan", "None"] and comp_b2:
+                        b2_mutpos_str = str(comp_b2)
+            # Check if this sample has BOTH dG and ddG (for symmetric consistency)
+            has_dg = not (delg != delg)  # False if NaN
+            has_ddg = not (ddg != ddg)   # False if NaN
+            has_both = has_dg and has_ddg
+            # NEW: Compute inferred ddG for samples with dG_mut and dG_wt but no explicit ddG
+            # ddG_inferred = dG_mut - dG_wt (can be used as additional training signal)
+            has_dg_wt = not (delg_wt != delg_wt)  # False if NaN
+            has_inferred_ddg = has_dg and has_dg_wt and (not has_ddg)  # Only if no explicit ddG
+            if has_inferred_ddg:
+                ddg_inferred = delg - delg_wt  # Computed from dG values
+            else:
+                ddg_inferred = float('nan')
+            # Track WT availability: real (from row), inferred, or none
+            has_real_wt = (wt_idx is not None)
+            has_inferred_wt = (wt_idx is None and seq1_wt is not None and seq2_wt is not None)
+            has_any_wt = has_real_wt or has_inferred_wt
+            # Store ONLY metadata - no embeddings loaded yet!
+            is_reverse = row.get("is_reverse", False)  # Track reversed samples
+            # CRITICAL: Swap sequences and dG for reversed samples (antisymmetry augmentation)
+            if is_reverse:
+                # Swap sequences: New Mutant = Old WT, New WT = Old Mutant
+                if seq1_wt is not None and seq2_wt is not None:
+                    seq1, seq1_wt = seq1_wt, seq1
+                    seq2, seq2_wt = seq2_wt, seq2
+                    # Swap dG values
+                    delg, delg_wt = delg_wt, delg
+                    # Negate inferred ddG (dG_new_mut - dG_new_wt = dG_old_wt - dG_old_mut = -(dG_old_mut - dG_old_wt))
+                    if not math.isnan(ddg_inferred):
+                        ddg_inferred = -ddg_inferred
+                    # Note: Explicit 'ddg' is already negated in __init__ augmentation logic
+                    # Note: We do NOT swap mutation positions because the indices of difference
+                    # are the same for A->B vs B->A. We want the 'input' (new mutant) to have
+                    # the indicator flags at the difference sites.
+            self.samples.append({
+                "pdb": pdb,
+                "is_wt": is_wt,
+                "is_reverse": is_reverse,  # True if this is a reversed (augmented) sample
+                "seq1": seq1, "seq2": seq2, "delg": delg,
+                "seq1_wt": seq1_wt, "seq2_wt": seq2_wt, "delg_wt": delg_wt,
+                "ddg": ddg,
+                "ddg_inferred": ddg_inferred,  # NEW: Computed from dG_mut - dG_wt
+                "has_dg": has_dg,
+                "has_ddg": has_ddg,
+                "has_inferred_ddg": has_inferred_ddg,  # NEW: True if ddg_inferred is valid
+                "has_both_dg_ddg": has_both,
+                "has_real_wt": has_real_wt,
+                "has_inferred_wt": has_inferred_wt,
+                "has_any_wt": has_any_wt,
+                "b1_mutpos_str": b1_mutpos_str,
+                "b2_mutpos_str": b2_mutpos_str,
+                "b1_wtpos_str": b1_wtpos_str,
+                "b2_wtpos_str": b2_wtpos_str,
+                "data_source": data_source
+            })
+        # Log WT inference statistics
+        n_real_wt = sum(1 for s in self.samples if s["has_real_wt"])
+        n_inferred_wt = sum(1 for s in self.samples if s["has_inferred_wt"])
+        n_no_wt = len(self.samples) - n_real_wt - n_inferred_wt
+        # Detailed stats for Real WTs (Internal vs External)
+        if hasattr(self, '_wt_source_stats'):
+            n_internal = self._wt_source_stats.get('internal', 0)
+            n_external = self._wt_source_stats.get('external', 0)
+            source_msg = f" (Internal: {n_internal}, External: {n_external})"
+        else:
+            source_msg = ""
+        print(f"  [Dataset] Ready! {len(self.samples)} samples indexed (embeddings loaded on-demand)")
+        print(f"  [Dataset] WT stats: {n_real_wt} real WT{source_msg}, {n_inferred_wt} inferred WT, {n_no_wt} no WT")
+        # Log detailed failure breakdown (for debugging)
+        if hasattr(self, '_wt_inference_failures') and hasattr(self, '_wt_inference_fail_count'):
+            print(f"  [Dataset] ⚠️ WT inference failed for {self._wt_inference_fail_count} samples:")
+            fail_dict = self._wt_inference_failures
+            # Count by category (note: these are capped sample counts, not totals)
+            n_no_pdb = len(fail_dict.get('no_pdb', []))
+            n_del_ins = len(fail_dict.get('del_ins_only', []))
+            n_parse = len(fail_dict.get('parse_fail', []))
+            if n_no_pdb > 0:
+                print(f"      - ANTIBODY samples (no PDB structure): {self._wt_inference_fail_count} samples")
+                print(f"        (These are antibody design samples without original PDB - only dG usable)")
+            elif n_del_ins > 0 or n_parse > 0:
+                print(f"      - DEL/INS/stop-codon (can't reverse): counted")
+                print(f"      - Parsing failed (unknown format): counted")
+            # Show samples for non-ANTIBODY failures
+            if fail_dict.get('parse_fail') and n_no_pdb == 0:
+                print(f"      Sample parse failures:")
+                for mut in fail_dict['parse_fail'][:5]:
+                    print(f"        '{mut}'")
+    def _parse_mutpos(self, pos_str) -> List[int]:
+        """
+        pos_str might be '[]' or '[170, 172]' etc.
+        We'll do a simple parse.
+        """
+        # Handle NaN, None, or non-string values
+        if pos_str is None or (isinstance(pos_str, float) and str(pos_str) == 'nan'):
+            return []
+        if not isinstance(pos_str, str):
+            pos_str = str(pos_str)
+        pos_str = pos_str.strip()
+        if pos_str.startswith("[") and pos_str.endswith("]"):
+            inside = pos_str[1:-1].strip()
+            if not inside:
+                return []
+            # split by comma
+            arr = inside.split(",")
+            out = []
+            for x in arr:
+                x_ = x.strip()
+                if x_:
+                    out.append(int(x_))
+            return out
+        return []
+    def _recover_antibody_wts(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Recover WT information for antibody samples (ANTIBODY_MUTATION)
+        by finding the closest-to-consensus sequence in each antigen group.
+        Strategy:
+        1. Identify samples with 'ANTIBODY_MUTATION'
+        2. Group by antigen (block2_sequence)
+        3. Assign unique Pseudo-PDB ID to each group (e.g. ANTIBODY_GRP_xxx)
+        4. For same-length groups: find sequence closest to consensus as WT
+        5. For variable-length groups: fallback to best binder (lowest del_g)
+        6. Mark selected sequence as WT (clear mutation string)
+        """
+        from collections import Counter
+        # Identify antibody mutation rows
+        mask = df['Mutation(s)_PDB'].astype(str).str.contains('ANTIBODY_MUTATION', na=False)
+        if not mask.any():
+            return df
+        print(f"  [Dataset] Attempting to recover WT for {mask.sum()} antibody samples...")
+        recovered_count = 0
+        n_groups = 0
+        n_consensus = 0
+        n_median = 0
+        n_fallback = 0
+        # We need a copy to avoid SettingWithCopy warnings if df is a slice
+        df = df.copy()
+        # Add a temporary column for grouping (hash of antigen sequence)
+        df['temp_antigen_hash'] = df['block2_sequence'].apply(lambda x: hashlib.md5(str(x).encode()).hexdigest())
+        # Get hashes for antibody rows
+        ab_hashes = df.loc[mask, 'temp_antigen_hash'].unique()
+        for h in ab_hashes:
+            # Get all antibody rows for this antigen
+            grp_mask = mask & (df['temp_antigen_hash'] == h)
+            grp_indices = df.index[grp_mask]
+            if len(grp_indices) == 0:
+                continue
+            n_groups += 1
+            # 1. Create unique Pseudo-PDB ID
+            pseudo_pdb = f"ANTIBODY_GRP_{h[:8]}"
+            df.loc[grp_indices, '#Pdb'] = pseudo_pdb
+            # 2. Select WT: closest-to-consensus (same-length) or best-binder (variable-length)
+            seqs = df.loc[grp_indices, 'block1_sequence'].tolist()
+            seq_lens = set(len(s) for s in seqs)
+            wt_idx = None
+            if len(seq_lens) == 1:
+                # SAME LENGTH: Use closest-to-consensus
+                seq_len = list(seq_lens)[0]
+                # Build consensus sequence
+                consensus = []
+                for pos in range(seq_len):
+                    residues = [s[pos] for s in seqs]
+                    counts = Counter(residues)
+                    most_common = counts.most_common(1)[0][0]
+                    consensus.append(most_common)
+                consensus_seq = ''.join(consensus)
+                # Find sequence with minimum Hamming distance to consensus
+                min_dist = float('inf')
+                for idx in grp_indices:
+                    seq = df.at[idx, 'block1_sequence']
+                    dist = sum(c1 != c2 for c1, c2 in zip(seq, consensus_seq))
+                    if dist < min_dist:
+                        min_dist = dist
+                        wt_idx = idx
+                n_consensus += 1
+            else:
+                # VARIABLE LENGTH: Fallback to median binder (more representative than best)
+                if 'del_g' in df.columns:
+                    delg_vals = pd.to_numeric(df.loc[grp_indices, 'del_g'], errors='coerce').dropna()
+                    if len(delg_vals) > 0:
+                        # Find index of value closest to median
+                        median_val = delg_vals.median()
+                        median_idx = (delg_vals - median_val).abs().idxmin()
+                        wt_idx = median_idx
+                        n_median += 1
+            # FINAL FALLBACK: Pick first sample if no other method works (e.g., all NaN dG)
+            if wt_idx is None and len(grp_indices) > 0:
+                wt_idx = grp_indices[0]
+                n_fallback += 1
+            # 3. Mark selected sequence as WT
+            if wt_idx is not None:
+                df.at[wt_idx, 'Mutation(s)_PDB'] = ""
+                recovered_count += len(grp_indices)
+        # Cleanup
+        df.drop(columns=['temp_antigen_hash'], inplace=True, errors='ignore')
+        print(f"  [Dataset] Recovered {recovered_count} antibody samples ({n_groups} groups):")
+        print(f"      - {n_consensus} groups via closest-to-consensus")
+        print(f"      - {n_median} groups via median-binder (variable-length)")
+        if n_fallback > 0:
+            print(f"      - {n_fallback} groups via first-sample fallback (no dG data)")
+        return df
+    def _infer_wt_sequences(self, mut_seq1: str, mut_seq2: str, mutation_str: str,
+                            b1_mutpos_str: str, b2_mutpos_str: str,
+                            b1_chains: str = "", b2_chains: str = "") -> Tuple[Optional[str], Optional[str]]:
+        """
+        Infer wildtype sequences by reversing mutations in the mutant sequences.
+        IMPROVED: Instead of relying on PDB positions (which don't match 0-indexed
+        sequence positions), this version searches for the mutant residue and
+        reverses it. Also computes actual mutation positions as byproduct.
+        Mutations are in formats like:
+        - BindingGym: "H:P53L" or "H:P53L,H:Y57C" (chain:WTresPOSmutres)
+        - SKEMPI: "HP53L" or "CA182A" (chainWTresPOSmutres)
+        Args:
+            mut_seq1: Mutant sequence for block1
+            mut_seq2: Mutant sequence for block2
+            mutation_str: Raw mutation string from data
+            b1_mutpos_str: Mutation positions for block1 (e.g., "[52, 56]")
+            b2_mutpos_str: Mutation positions for block2
+            b1_chains: Chain letters in block1 (e.g., "AB")
+            b2_chains: Chain letters in block2 (e.g., "HL")
+        Returns:
+            Tuple of (wt_seq1, wt_seq2) or (None, None) if inference fails
+        """
+        import re
+        if pd.isna(mutation_str) or str(mutation_str).strip() == '':
+            # No mutations = this IS the wildtype
+            return mut_seq1, mut_seq2
+        # FALLBACK: Handle ANTIBODY_MUTATION samples that couldn't be recovered
+        mutation_str_upper = str(mutation_str).strip().upper()
+        if 'ANTIBODY_MUTATION' in mutation_str_upper or mutation_str_upper == 'ANTIBODY_MUTATION':
+            if not hasattr(self, '_wt_inference_failures'):
+                self._wt_inference_failures = {'parse_fail': [], 'del_ins_only': [], 'no_pdb': [], 'other': []}
+                self._wt_inference_fail_count = 0
+            self._wt_inference_fail_count += 1
+            if len(self._wt_inference_failures['no_pdb']) < 5:
+                self._wt_inference_failures['no_pdb'].append(mutation_str[:80])
+            return None, None
+        try:
+            # Parse mutation string to extract (chain, position, original_AA, mutant_AA)
+            mutations = []
+            mutation_str = str(mutation_str).strip()
+            # Split by common delimiters
+            parts = re.split(r'[,;]', mutation_str)
+            for part in parts:
+                part = part.strip().strip('"\'')
+                if not part:
+                    continue
+                # Skip deletion/insertion markers - can't reverse these
+                if 'DEL' in part.upper() or 'INS' in part.upper() or '*' in part:
+                    continue
+                # BindingGym format: "H:P53L" or "L:K103R"
+                if ':' in part:
+                    chain_mut = part.split(':')
+                    if len(chain_mut) >= 2:
+                        chain = chain_mut[0].strip().upper()
+                        for mut_part in chain_mut[1:]:
+                            mut_part = mut_part.strip()
+                            if not mut_part:
+                                continue
+                            match = re.match(r'([A-Z])(\d+)([A-Z])', mut_part)
+                            if match:
+                                wt_aa = match.group(1)
+                                pos = int(match.group(2))  # PDB-numbered (1-indexed)
+                                mut_aa = match.group(3)
+                                mutations.append((chain, pos, wt_aa, mut_aa))
+                else:
+                    # SKEMPI format: "CA182A" = C(WTresidue) + A(chain) + 182(pos) + A(mutant)
+                    # Format: WTresidue + ChainID + Position[insertcode] + MutResidue
+                    # Example: CA182A means Cysteine at chain A position 182 mutated to Alanine
+                    match = re.match(r'([A-Z])([A-Z])(-?\d+[a-z]?)([A-Z])', part)
+                    if match:
+                        wt_aa = match.group(1)    # First char is WT residue
+                        chain = match.group(2).upper()  # Second char is chain ID
+                        pos_str = match.group(3)
+                        pos = int(re.match(r'-?\d+', pos_str).group())
+                        mut_aa = match.group(4)   # Last char is mutant residue
+                        mutations.append((chain, pos, wt_aa, mut_aa))
+                    else:
+                        # Simple format without chain: "F139A" (used by PEPBI)
+                        # Format: WTresidue + Position + MutResidue
+                        match = re.match(r'([A-Z])(\d+)([A-Z])', part)
+                        if match:
+                            wt_aa = match.group(1)
+                            pos = int(match.group(2))
+                            mut_aa = match.group(3)
+                            # No chain info - will try both blocks
+                            mutations.append(('?', pos, wt_aa, mut_aa))
+            if not mutations:
+                if not hasattr(self, '_wt_inference_failures'):
+                    self._wt_inference_failures = {'parse_fail': [], 'del_ins_only': [], 'other': []}
+                    self._wt_inference_fail_count = 0
+                self._wt_inference_fail_count += 1
+                if 'DEL' in mutation_str.upper() or 'INS' in mutation_str.upper() or '*' in mutation_str:
+                    category = 'del_ins_only'
+                else:
+                    category = 'parse_fail'
+                if len(self._wt_inference_failures.get(category, [])) < 10:
+                    self._wt_inference_failures.setdefault(category, []).append(mutation_str[:80])
+                return None, None
+            # Convert sequences to lists for mutation
+            wt_seq1_list = list(mut_seq1) if mut_seq1 else []
+            wt_seq2_list = list(mut_seq2) if mut_seq2 else []
+            # Build chain sets for block assignment
+            b1_chain_set = set(b1_chains.upper()) if b1_chains else set()
+            b2_chain_set = set(b2_chains.upper()) if b2_chains else set()
+            # Parse PRECOMPUTED mutation positions (these are correct 0-indexed seq positions)
+            # PDB residue numbers often don't match sequence indices due to numbering offsets
+            precomputed_b1_positions = self._parse_mutpos(b1_mutpos_str)
+            precomputed_b2_positions = self._parse_mutpos(b2_mutpos_str)
+            # Track reversal success
+            if not hasattr(self, '_wt_inference_stats'):
+                self._wt_inference_stats = {'reversed': 0, 'not_found': 0, 'total': 0}
+            # Also track actual mutation positions found
+            found_positions_b1 = []
+            found_positions_b2 = []
+            # STRATEGY 1: Use precomputed positions if available (MOST RELIABLE)
+            # These were computed during preprocessing with correct PDB-to-sequence mapping
+            if precomputed_b1_positions or precomputed_b2_positions:
+                pos_idx = 0
+                for chain, pdb_pos, wt_aa, mut_aa in mutations:
+                    self._wt_inference_stats['total'] += 1
+                    reversed_this = False
+                    # Determine which block based on chain
+                    if chain in b2_chain_set:
+                        # Use precomputed block2 positions
+                        if pos_idx < len(precomputed_b2_positions):
+                            seq_idx = precomputed_b2_positions[pos_idx]
+                            if 0 <= seq_idx < len(wt_seq2_list) and wt_seq2_list[seq_idx] == mut_aa:
+                                wt_seq2_list[seq_idx] = wt_aa
+                                reversed_this = True
+                                found_positions_b2.append(seq_idx)
+                    elif chain in b1_chain_set:
+                        # Use precomputed block1 positions
+                        if pos_idx < len(precomputed_b1_positions):
+                            seq_idx = precomputed_b1_positions[pos_idx]
+                            if 0 <= seq_idx < len(wt_seq1_list) and wt_seq1_list[seq_idx] == mut_aa:
+                                wt_seq1_list[seq_idx] = wt_aa
+                                reversed_this = True
+                                found_positions_b1.append(seq_idx)
+                    else:
+                        # Chain unknown - try both precomputed positions
+                        if pos_idx < len(precomputed_b1_positions):
+                            seq_idx = precomputed_b1_positions[pos_idx]
+                            if 0 <= seq_idx < len(wt_seq1_list) and wt_seq1_list[seq_idx] == mut_aa:
+                                wt_seq1_list[seq_idx] = wt_aa
+                                reversed_this = True
+                                found_positions_b1.append(seq_idx)
+                        if not reversed_this and pos_idx < len(precomputed_b2_positions):
+                            seq_idx = precomputed_b2_positions[pos_idx]
+                            if 0 <= seq_idx < len(wt_seq2_list) and wt_seq2_list[seq_idx] == mut_aa:
+                                wt_seq2_list[seq_idx] = wt_aa
+                                reversed_this = True
+                                found_positions_b2.append(seq_idx)
+                    if reversed_this:
+                        self._wt_inference_stats['reversed'] += 1
+                    else:
+                        self._wt_inference_stats['not_found'] += 1
+                    pos_idx += 1
+                self._last_computed_mutpos = (found_positions_b1, found_positions_b2)
+                return ''.join(wt_seq1_list), ''.join(wt_seq2_list)
+            # STRATEGY 2: Fall back to PDB position-based search (less reliable)
+            for chain, pdb_pos, wt_aa, mut_aa in mutations:
+                self._wt_inference_stats['total'] += 1
+                reversed_this = False
+                found_idx = None
+                # Determine which block(s) to search based on chain
+                chain_known = chain in b1_chain_set or chain in b2_chain_set
+                if chain in b1_chain_set:
+                    blocks_to_try = [(wt_seq1_list, True, found_positions_b1)]
+                elif chain in b2_chain_set:
+                    blocks_to_try = [(wt_seq2_list, False, found_positions_b2)]
+                else:
+                    # Chain info unavailable - try BOTH blocks
+                    blocks_to_try = [
+                        (wt_seq1_list, True, found_positions_b1),
+                        (wt_seq2_list, False, found_positions_b2)
+                    ]
+                for target_seq, is_block1, pos_list in blocks_to_try:
+                    if reversed_this:
+                        break  # Already found in previous block
+                    guess_idx = pdb_pos - 1  # Convert to 0-indexed
+                    # Strategy 1: Try exact position if in bounds
+                    if 0 <= guess_idx < len(target_seq) and target_seq[guess_idx] == mut_aa:
+                        found_idx = guess_idx
+                    else:
+                        # Strategy 2: Search ±50 window around expected position
+                        search_start = max(0, pdb_pos - 50)
+                        search_end = min(len(target_seq), pdb_pos + 50)
+                        for idx in range(search_start, search_end):
+                            if target_seq[idx] == mut_aa:
+                                found_idx = idx
+                                break
+                    # Strategy 3: If position was out of bounds AND chain unknown,
+                    # search the ENTIRE sequence as last resort
+                    if found_idx is None and not chain_known:
+                        if guess_idx >= len(target_seq) or guess_idx < 0:
+                            # Position was out of bounds - search entire sequence
+                            for idx in range(len(target_seq)):
+                                if target_seq[idx] == mut_aa:
+                                    found_idx = idx
+                                    break
+                    if found_idx is not None:
+                        target_seq[found_idx] = wt_aa  # Reverse the mutation!
+                        reversed_this = True
+                        pos_list.append(found_idx)
+                if reversed_this:
+                    self._wt_inference_stats['reversed'] += 1
+                else:
+                    self._wt_inference_stats['not_found'] += 1
+            # Store computed mutation positions for later use (helps with Bug #3)
+            # These are the ACTUAL 0-indexed positions in the sequence
+            self._last_computed_mutpos = (found_positions_b1, found_positions_b2)
+            return ''.join(wt_seq1_list), ''.join(wt_seq2_list)
+        except Exception as e:
+            # On any error, return None to indicate inference failed
+            return None, None
+    def _get_embedding(self, seq: str, mut_positions: List[int]) -> torch.Tensor:
+        """
+        Basic embedding with mutation position indicator channel.
+        Args:
+            seq: The protein sequence
+            mut_positions: List of positions that are mutated (0-indexed)
+        """
+        # Get base ESM embedding (already ensures min length of 2)
+        base_emb = self._get_or_create_embedding(seq)  # => [L, 1152]
+        base_emb = base_emb.cpu()
+        # Get sequence length and embedding dimension
+        L, D = base_emb.shape
+        #region agent log
+        try:
+            if not hasattr(self, "_agent_log_counter"):
+                self._agent_log_counter = 0
+            if self._agent_log_counter < 5:
+                self._agent_log_counter += 1
+                last1_stats = None
+                last2_stats = None
+                if D >= 1153:
+                    v1 = base_emb[:, -1]
+                    last1_stats = {
+                        "min": float(v1.min().item()),
+                        "max": float(v1.max().item()),
+                        "mean": float(v1.float().mean().item()),
+                        "std": float(v1.float().std().item()),
+                    }
+                if D >= 1154:
+                    v2 = base_emb[:, -2]
+                    last2_stats = {
+                        "min": float(v2.min().item()),
+                        "max": float(v2.max().item()),
+                        "mean": float(v2.float().mean().item()),
+                        "std": float(v2.float().std().item()),
+                    }
+                payload = {
+                    "sessionId": "debug-session",
+                    "runId": "pre-fix",
+                    "hypothesisId": "F",
+                    "location": "modules.py:AdvancedSiameseDataset:_get_embedding",
+                    "message": "Base embedding shape + tail-channel stats before appending mutation indicator",
+                    "data": {
+                        "L": int(L),
+                        "D": int(D),
+                        "mut_positions_n": int(len(mut_positions) if mut_positions is not None else -1),
+                        "mut_positions_first5": (mut_positions[:5] if mut_positions else []),
+                        "base_last1": last1_stats,
+                        "base_last2": last2_stats,
+                    },
+                    "timestamp": int(time.time() * 1000),
+                }
+                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
+                    f.write(json.dumps(payload, default=str) + "\n")
+                # Also emit a concise line to stdout/logs (useful on cluster runs)
+                print(f"[AGENTLOG EMB] D={D} mut_n={len(mut_positions) if mut_positions else 0} last1={last1_stats} last2={last2_stats}")
+        except Exception:
+            pass
+        #endregion
+        # Create mutation indicator channel (just one channel)
+        # FIX FOR DOUBLE-INDICATOR BUG: Check if base_emb already has indicator (D=1153)
+        # If D=1153, the cached embedding already has an old indicator channel - OVERWRITE it
+        # If D=1152, this is a fresh ESM embedding - APPEND indicator channel
+        D = base_emb.shape[-1]
+        L = base_emb.shape[0]
+        if D == 1153:
+            # Already has indicator channel (from cache) - overwrite it with correct mutation positions
+            new_emb = base_emb.clone()
+            new_emb[:, -1] = 0.0  # Reset old indicator
+            for pos in mut_positions:
+                if isinstance(pos, int) and 0 <= pos < L:
+                    new_emb[pos, -1] = 1.0
+            print(f"[AGENTLOG INDICATOR-FIX] D=1153 OVERWRITING last channel with {len(mut_positions)} positions")
+        else:
+            # Fresh ESM embedding (D=1152) - append indicator channel
+            chan = torch.zeros((L, 1), dtype=base_emb.dtype, device=base_emb.device)
+            for pos in mut_positions:
+                if isinstance(pos, int) and 0 <= pos < L:
+                    chan[pos, 0] = 1.0
+            new_emb = torch.cat([base_emb, chan], dim=-1)
+            print(f"[AGENTLOG INDICATOR-FIX] D={D} APPENDING indicator channel with {len(mut_positions)} positions")
+        return new_emb
+    def _get_or_create_embedding(self, seq: str) -> torch.Tensor:
+        # Check LRU cache first (limited size to control memory)
+        if seq in self._embedding_cache:
+            self._cache_hits += 1
+            return self._embedding_cache[seq].clone()
+        seq_hash = hashlib.md5(seq.encode()).hexdigest()
+        pt_file = self.embedding_dir / f"{seq_hash}.pt"
+        npy_file = self.embedding_dir / f"{seq_hash}.npy"
+        emb = None
+        load_source = None  # Track where embedding came from
+        # Try .npy first (pre-computed), then .pt
+        if npy_file.is_file():
+            try:
+                import numpy as np
+                emb = torch.from_numpy(np.load(npy_file))
+                load_source = "npy"
+            except Exception:
+                pass
+        if emb is None and pt_file.is_file():
+            try:
+                emb = torch.load(pt_file, map_location="cpu")
+                load_source = "pt"
+            except Exception:
+                pt_file.unlink(missing_ok=True)  # Delete corrupted file
+        if emb is None:
+            # On-the-fly embedding generation for missing sequences (e.g., inferred WT)
+            # This is slower but ensures accurate embeddings
+            try:
+                emb = self.featurizer.transform(seq)  # [L, 1152]
+                # Save for future use
+                torch.save(emb, pt_file)
+                load_source = "generated"
+                # Track on-the-fly generation stats
+                if not hasattr(self, '_on_the_fly_count'):
+                    self._on_the_fly_count = 0
+                self._on_the_fly_count += 1
+                # Log first few on-the-fly generations
+                if self._on_the_fly_count <= 5:
+                    print(f"[EMBEDDING] Generated on-the-fly #{self._on_the_fly_count}: len={len(seq)}, saved to {pt_file.name}")
+                elif self._on_the_fly_count == 6:
+                    print(f"[EMBEDDING] Generated 5+ embeddings on-the-fly (suppressing further logs)")
+            except Exception as e:
+                raise RuntimeError(
+                    f"Embedding not found and on-the-fly generation failed for sequence (len={len(seq)}): {e}"
+                )
+        #region agent log
+        try:
+            if not hasattr(self, "_agent_embload_counter"):
+                self._agent_embload_counter = 0
+            if self._agent_embload_counter < 8:
+                self._agent_embload_counter += 1
+                shape = tuple(int(x) for x in emb.shape)
+                D = int(shape[1]) if len(shape) == 2 else None
+                payload = {
+                    "sessionId": "debug-session",
+                    "runId": "pre-fix",
+                    "hypothesisId": "A",
+                    "location": "modules.py:AdvancedSiameseDataset:_get_or_create_embedding",
+                    "message": "Loaded embedding tensor (source + shape) before any indicator is appended",
+                    "data": {
+                        "load_source": load_source,
+                        "seq_len": int(len(seq)),
+                        "shape": shape,
+                        "D": D,
+                        "looks_like_has_indicator": bool(D is not None and D >= 1153),
+                        "file_pt_exists": bool(pt_file.is_file()),
+                        "file_npy_exists": bool(npy_file.is_file()),
+                    },
+                    "timestamp": int(time.time() * 1000),
+                }
+                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
+                    f.write(json.dumps(payload, default=str) + "\n")
+                print(f"[AGENTLOG EMBLOAD] src={load_source} shape={shape} D={D}")
+        except Exception:
+            pass
+        #endregion
+        # SAFETY: Ensure embedding has valid shape (at least 5 residues for interpolation)
+        if emb.shape[0] < 5:
+            # Pad to minimum length of 5 by repeating
+            repeats = (5 // emb.shape[0]) + 1
+            emb = emb.repeat(repeats, 1)[:5]  # Ensure exactly 5 rows
+        # Track cache miss
+        self._cache_misses += 1
+        # Add to LRU cache (evict oldest if full)
+        if len(self._embedding_cache) >= self._cache_max_size:
+            # Remove oldest entry (first key in dict)
+            oldest_key = next(iter(self._embedding_cache))
+            del self._embedding_cache[oldest_key]
+        self._embedding_cache[seq] = emb
+        return emb.clone()  # Return clone to avoid mutation issues
+    def get_cache_stats(self):
+        """Return cache statistics."""
+        total = self._cache_hits + self._cache_misses
+        hit_rate = (self._cache_hits / total * 100) if total > 0 else 0
+        on_the_fly = getattr(self, '_on_the_fly_count', 0)
+        wt_missing = getattr(self, '_wt_missing_count', 0)
+        return {
+            "hits": self._cache_hits,
+            "misses": self._cache_misses,
+            "total": total,
+            "hit_rate": hit_rate,
+            "cache_size": len(self._embedding_cache),
+            "cache_max": self._cache_max_size,
+            "on_the_fly_generated": on_the_fly,
+            "wt_embedding_failed": wt_missing
+        }
+    def print_cache_stats(self):
+        """Print cache statistics."""
+        stats = self.get_cache_stats()
+        print(f"  [Cache] Hits: {stats['hits']:,} | Misses: {stats['misses']:,} | "
+              f"Hit Rate: {stats['hit_rate']:.1f}% | Size: {stats['cache_size']:,}/{stats['cache_max']:,}")
+        if stats['on_the_fly_generated'] > 0:
+            print(f"  [Cache] On-the-fly generated: {stats['on_the_fly_generated']:,} embeddings")
+        if stats['wt_embedding_failed'] > 0:
+            print(f"  [Cache] ⚠️ WT embedding failures: {stats['wt_embedding_failed']:,} (excluded from ddG training)")
+    def __len__(self):
+        return len(self.samples)
+    def __getitem__(self, idx):
+        item = self.samples[idx]
+        # DEBUG: Track sequence difference statistics
+        if not hasattr(self, '_seq_diff_stats'):
+            self._seq_diff_stats = {'same': 0, 'different': 0, 'no_wt': 0}
+        if not hasattr(self, '_mutpos_stats'):
+            self._mutpos_stats = {'has_mutpos': 0, 'no_mutpos': 0}
+        # LAZY LOADING: Load embeddings on-demand
+        b1_mutpos = self._parse_mutpos(item["b1_mutpos_str"])
+        b2_mutpos = self._parse_mutpos(item["b2_mutpos_str"])
+        #region agent log
+        try:
+            if not hasattr(self, "_agent_mutpos_getitem_counter"):
+                self._agent_mutpos_getitem_counter = 0
+            if self._agent_mutpos_getitem_counter < 20:
+                self._agent_mutpos_getitem_counter += 1
+                payload = {
+                    "sessionId": "debug-session",
+                    "runId": "pre-fix",
+                    "hypothesisId": "G",
+                    "location": "modules.py:AdvancedSiameseDataset:__getitem__",
+                    "message": "Parsed mut_positions passed to _get_embedding",
+                    "data": {
+                        "idx": int(idx),
+                        "pdb": str(item.get("pdb")),
+                        "is_wt": bool(item.get("is_wt")),
+                        "b1_mutpos_str": str(item.get("b1_mutpos_str")),
+                        "b2_mutpos_str": str(item.get("b2_mutpos_str")),
+                        "b1_mutpos_n": int(len(b1_mutpos)),
+                        "b2_mutpos_n": int(len(b2_mutpos)),
+                        "b1_mutpos_first5": b1_mutpos[:5],
+                        "b2_mutpos_first5": b2_mutpos[:5],
+                    },
+                    "timestamp": int(time.time() * 1000),
+                }
+                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
+                    f.write(json.dumps(payload, default=str) + "\n")
+                print(f"[AGENTLOG MUTPOSGET] idx={idx} b1n={len(b1_mutpos)} b2n={len(b2_mutpos)} b1str={item.get('b1_mutpos_str')} b2str={item.get('b2_mutpos_str')}")
+        except Exception:
+            pass
+        #endregion
+        # Track mutation position statistics
+        if len(b1_mutpos) > 0 or len(b2_mutpos) > 0:
+            self._mutpos_stats['has_mutpos'] += 1
+        else:
+            self._mutpos_stats['no_mutpos'] += 1
+        # Log mutation position stats periodically
+        total = sum(self._mutpos_stats.values())
+        if total in [100, 1000, 10000]:
+            has_mp = self._mutpos_stats['has_mutpos']
+            no_mp = self._mutpos_stats['no_mutpos']
+            print(f"  [MUTPOS] After {total} samples: {has_mp} have mutation positions ({100*has_mp/total:.1f}%), "
+                  f"{no_mp} have NO mutation positions ({100*no_mp/total:.1f}%)")
+        c1_emb = self._get_embedding(item["seq1"], b1_mutpos)
+        c2_emb = self._get_embedding(item["seq2"], b2_mutpos)
+        if self.normalize:
+            c1_emb[:, :-1] = torch.nn.functional.normalize(c1_emb[:, :-1], p=2, dim=-1)
+            c2_emb[:, :-1] = torch.nn.functional.normalize(c2_emb[:, :-1], p=2, dim=-1)
+        # Load WT embeddings if available
+        if item["seq1_wt"] is not None:
+            # DEBUG: Track sequence differences
+            seq1_same = (item["seq1"] == item["seq1_wt"])
+            seq2_same = (item["seq2"] == item["seq2_wt"])
+            if seq1_same and seq2_same:
+                self._seq_diff_stats['same'] += 1
+            else:
+                self._seq_diff_stats['different'] += 1
+            # Periodic logging
+            total_samples = sum(self._seq_diff_stats.values())
+            if total_samples in [100, 1000, 10000, 50000]:
+                same = self._seq_diff_stats['same']
+                diff = self._seq_diff_stats['different']
+                no_wt = self._seq_diff_stats['no_wt']
+                print(f"  [SEQ DIFF] After {total_samples} samples: {same} same seq ({100*same/total_samples:.1f}%), "
+                      f"{diff} different ({100*diff/total_samples:.1f}%), {no_wt} no WT")
+            b1_wtpos = self._parse_mutpos(item["b1_wtpos_str"])
+            b2_wtpos = self._parse_mutpos(item["b2_wtpos_str"])
+            #region agent log
+            try:
+                if not hasattr(self, "_agent_embed_call_counter_wt"):
+                    self._agent_embed_call_counter_wt = 0
+                if self._agent_embed_call_counter_wt < 10:
+                    self._agent_embed_call_counter_wt += 1
+                    print(
+                        f"[AGENTLOG EMBCALL] idx={idx} role=wt "
+                        f"b1_wtpos_n={len(b1_wtpos)} b2_wtpos_n={len(b2_wtpos)} "
+                        f"seq1_wt_len={len(item.get('seq1_wt','') or '')} seq2_wt_len={len(item.get('seq2_wt','') or '')}"
+                    )
+            except Exception:
+                pass
+            #endregion
+            try:
+                cw1 = self._get_embedding(item["seq1_wt"], b1_wtpos)
+                cw2 = self._get_embedding(item["seq2_wt"], b2_wtpos)
+            except RuntimeError as e:
+                # WT embedding unavailable - mark as no WT for this sample
+                # DO NOT use mutant embedding as proxy - this corrupts the mutation signal!
+                # Instead, set cw1, cw2 to None and let training handle missing WT
+                cw1, cw2 = None, None
+                if not hasattr(self, '_wt_missing_count'):
+                    self._wt_missing_count = 0
+                self._wt_missing_count += 1
+                if self._wt_missing_count <= 3:  # Only log first 3 to avoid spam
+                    print(f"  [WARN] WT embedding missing #{self._wt_missing_count}, sample will be WT-less: {e}")
+            if cw1 is not None and self.normalize:
+                cw1[:, :-1] = torch.nn.functional.normalize(cw1[:, :-1], p=2, dim=-1)
+                cw2[:, :-1] = torch.nn.functional.normalize(cw2[:, :-1], p=2, dim=-1)
+        else:
+            cw1, cw2 = None, None
+            self._seq_diff_stats['no_wt'] += 1
+        data_tuple = (c1_emb, c2_emb, item["delg"],
+                      cw1, cw2, item["delg_wt"])
+        meta = {
+            "pdb": item["pdb"],
+            "is_wt": item["is_wt"],
+            "has_real_wt": item["has_real_wt"],
+            "has_dg": item["has_dg"],
+            "has_ddg": item["has_ddg"],  # Whether sample has valid explicit ddG value
+            "has_inferred_ddg": item["has_inferred_ddg"],  # Whether sample has inferred ddG (dG_mut - dG_wt)
+            "has_both_dg_ddg": item["has_both_dg_ddg"],
+            "ddg": item["ddg"],
+            "ddg_inferred": item["ddg_inferred"],  # Inferred ddG value (needed for Fix #1)
+            "has_any_wt": item["has_any_wt"],  # Include inferred WT status (CRITICAL!)
+            "b1_mutpos": b1_mutpos,
+            "b2_mutpos": b2_mutpos,
+            "data_source": item["data_source"]
+        }
+        return (data_tuple, meta)
+#########################################
+# AffinityDataModule
+#########################################
+from sklearn.model_selection import GroupKFold
+class AffinityDataModule(pl.LightningDataModule):
+    """
+    Data module for protein binding affinity prediction.
+    Supports multiple splitting strategies:
+    1. split_indices_dir: Load pre-computed cluster-based splits (RECOMMENDED)
+    2. use_cluster_split: Create new cluster-based splits on the fly
+    3. split column: Use existing 'split' column in CSV (legacy)
+    4. num_folds > 1: GroupKFold on PDB IDs
+    """
+    def __init__(
+        self,
+        data_csv: str,
+        protein_featurizer: ESM3Featurizer,
+        embedding_dir: str = "precomputed_esm",
+        batch_size: int = 32,
+        num_workers: int = 4,
+        shuffle: bool = True,
+        num_folds: int = 1,
+        fold_index: int = 0,
+        # New cluster-based splitting options
+        split_indices_dir: str = None,  # Path to pre-computed split indices
+        benchmark_indices_dir: str = None,  # Path to balanced benchmark subset indices (optional override)
+        use_cluster_split: bool = False,  # Create cluster-based splits on the fly
+        train_ratio: float = 0.70,
+        val_ratio: float = 0.15,
+        test_ratio: float = 0.15,
+        random_state: int = 42
+    ):
+        super().__init__()
+        self.data_csv = data_csv
+        self.featurizer = protein_featurizer
+        self.embedding_dir = embedding_dir
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.shuffle = shuffle
+        self.num_folds = num_folds
+        self.fold_index = fold_index
+        # Cluster-based splitting options
+        self.split_indices_dir = split_indices_dir
+        self.benchmark_indices_dir = benchmark_indices_dir  # Optional balanced benchmark override
+        self.use_cluster_split = use_cluster_split
+        self.train_ratio = train_ratio
+        self.val_ratio = val_ratio
+        self.test_ratio = test_ratio
+        self.random_state = random_state
+        self.train_dataset = None
+        self.val_dataset = None
+        self.test_dataset = None
+        # Dual-split datasets (separate for dG and ddG heads)
+        self.dg_train_dataset = None  # WT-only training set for Stage A
+        self.ddg_train_dataset = None  # Mutation training set for Stage B
+        self.dg_val_dataset = None
+        self.dg_test_dataset = None
+        self.ddg_val_dataset = None
+        self.ddg_test_dataset = None
+        self.use_dual_split = False
+    def prepare_data(self):
+        if not os.path.exists(self.data_csv):
+            raise FileNotFoundError(f"Data CSV not found => {self.data_csv}")
+    def setup(self, stage=None):
+        data = pd.read_csv(self.data_csv, low_memory=False)
+        # Check if this is a dual-split directory
+        dual_split_file = os.path.join(self.split_indices_dir, 'dg_val_indices.csv') if self.split_indices_dir else None
+        # Strategy 0: Load DUAL splits (separate for dG and ddG heads)
+        if self.split_indices_dir and dual_split_file and os.path.exists(dual_split_file):
+            from data_splitting import load_dual_splits
+            print(f"\n[DataModule] Loading DUAL splits from {self.split_indices_dir}")
+            splits = load_dual_splits(self.split_indices_dir)
+            self.use_dual_split = True
+            # Combined training set (union of dG and ddG train indices)
+            train_idx = splits['combined_train']
+            train_df = data.iloc[train_idx].reset_index(drop=True)
+            # For backward compatibility, use ddG validation as default val set
+            # (since most validation is on mutation data)
+            val_idx = splits['ddg']['val']
+            val_df = data.iloc[val_idx].reset_index(drop=True)
+            test_idx = splits['ddg']['test']
+            test_df = data.iloc[test_idx].reset_index(drop=True)
+            # Create separate datasets for each head
+            # CRITICAL: Create separate dG (WT-only) and ddG (MT-only) TRAINING sets
+            # This fixes Stage A WT starvation where WT is diluted to 2.75% in combined_train
+            dg_train_df = data.iloc[splits['dg']['train']].reset_index(drop=True)
+            ddg_train_df = data.iloc[splits['ddg']['train']].reset_index(drop=True)
+            dg_val_df = data.iloc[splits['dg']['val']].reset_index(drop=True)
+            dg_test_df = data.iloc[splits['dg']['test']].reset_index(drop=True)
+            ddg_val_df = data.iloc[splits['ddg']['val']].reset_index(drop=True)
+            ddg_test_df = data.iloc[splits['ddg']['test']].reset_index(drop=True)
+            print(f"\n[DataModule] Creating dG TRAIN dataset ({len(dg_train_df)} WT rows)...")
+            self.dg_train_dataset = AdvancedSiameseDataset(dg_train_df, self.featurizer, self.embedding_dir, augment=False)  # Baseline: no augment
+            print(f"[DataModule] Creating ddG TRAIN dataset ({len(ddg_train_df)} MT rows)...")
+            self.ddg_train_dataset = AdvancedSiameseDataset(ddg_train_df, self.featurizer, self.embedding_dir, augment=False)  # Baseline: no augment
+            # === BALANCED BENCHMARK OVERRIDE ===
+            # If benchmark_indices_dir is provided, use those for ddG val/test instead
+            if self.benchmark_indices_dir and os.path.exists(self.benchmark_indices_dir):
+                print(f"\n[DataModule] Loading BALANCED BENCHMARK indices from {self.benchmark_indices_dir}")
+                # Load ddG benchmark val indices
+                ddg_val_bench_file = os.path.join(self.benchmark_indices_dir, 'ddg_val_benchmark_indices.csv')
+                if os.path.exists(ddg_val_bench_file):
+                    bench_val_idx = pd.read_csv(ddg_val_bench_file, header=None).iloc[:, 0].values.tolist()
+                    ddg_val_df = data.iloc[bench_val_idx].reset_index(drop=True)
+                    print(f"  ddG val: {len(ddg_val_df)} rows (balanced benchmark)")
+                # Load ddG benchmark test indices
+                ddg_test_bench_file = os.path.join(self.benchmark_indices_dir, 'ddg_test_benchmark_indices.csv')
+                if os.path.exists(ddg_test_bench_file):
+                    bench_test_idx = pd.read_csv(ddg_test_bench_file, header=None).iloc[:, 0].values.tolist()
+                    ddg_test_df = data.iloc[bench_test_idx].reset_index(drop=True)
+                    print(f"  ddG test: {len(ddg_test_df)} rows (balanced benchmark)")
+            print(f"\n[DataModule] Creating dG val dataset ({len(dg_val_df)} rows)...")
+            # NOTE: Do NOT subsample validation - we want accurate metrics on full set
+            self.dg_val_dataset = AdvancedSiameseDataset(
+                dg_val_df, self.featurizer, self.embedding_dir, augment=False,
+                wt_reference_df=data # FIX: Use full data for WT lookup (robust to split boundaries)
+            )
+            print(f"\n[DataModule] Creating dG test dataset ({len(dg_test_df)} rows)...")
+            self.dg_test_dataset = AdvancedSiameseDataset(
+                dg_test_df, self.featurizer, self.embedding_dir, augment=False,
+                wt_reference_df=data # FIX: Use full data for WT lookup
+            )
+            print(f"\n[DataModule] Creating ddG val dataset ({len(ddg_val_df)} rows)...")
+            # NOTE: Do NOT subsample validation - we want accurate metrics on full set
+            # cap_k only applies to training DMS data
+            self.ddg_val_dataset = AdvancedSiameseDataset(
+                ddg_val_df, self.featurizer, self.embedding_dir, augment=False,
+                wt_reference_df=data # FIX: Use full data for WT lookup
+            )
+            print(f"\n[DataModule] Creating ddG test dataset ({len(ddg_test_df)} rows)...")
+            self.ddg_test_dataset = AdvancedSiameseDataset(
+                ddg_test_df, self.featurizer, self.embedding_dir, augment=False,
+                wt_reference_df=data # FIX: Use full data for WT lookup
+            )
+            print(f"\n[DataModule] Dual split datasets created:")
+            print(f"  dG train: {len(self.dg_train_dataset)} samples (WT-only for Stage A)")
+            print(f"  ddG train: {len(self.ddg_train_dataset)} samples (MT-only)")
+            print(f"  dG val:   {len(self.dg_val_dataset)} samples")
+            print(f"  dG test:  {len(self.dg_test_dataset)} samples")
+            print(f"  ddG val:  {len(self.ddg_val_dataset)} samples")
+            print(f"  ddG test: {len(self.ddg_test_dataset)} samples")
+        # Strategy 1: Load pre-computed cluster-based splits (single split)
+        elif self.split_indices_dir and os.path.exists(self.split_indices_dir):
+            from data_splitting import load_split_indices, verify_no_leakage
+            train_idx, val_idx, test_idx = load_split_indices(self.split_indices_dir)
+            train_df = data.iloc[train_idx].reset_index(drop=True)
+            val_df = data.iloc[val_idx].reset_index(drop=True)
+            test_df = data.iloc[test_idx].reset_index(drop=True)
+            # Verify no leakage
+            verify_no_leakage(data, train_idx, val_idx, test_idx)
+        # Strategy 2: Create cluster-based splits on the fly
+        elif self.use_cluster_split:
+            from data_splitting import create_cluster_splits, verify_no_leakage
+            # Create splits directory if needed
+            splits_dir = os.path.join(os.path.dirname(self.data_csv), 'splits')
+            train_idx, val_idx, test_idx = create_cluster_splits(
+                data,
+                train_ratio=self.train_ratio,
+                val_ratio=self.val_ratio,
+                test_ratio=self.test_ratio,
+                random_state=self.random_state,
+                save_dir=splits_dir
+            )
+            train_df = data.iloc[train_idx].reset_index(drop=True)
+            val_df = data.iloc[val_idx].reset_index(drop=True)
+            test_df = data.iloc[test_idx].reset_index(drop=True)
+        # Strategy 3: Legacy - use 'split' column in CSV
+        else:
+            # must have block1_sequence, block1_mut_positions, block2_sequence, ...
+            bench_df = data[data["split"]=="Benchmark test"].copy()
+            trainval_df = data[data["split"]!="Benchmark test"].copy()
+            if self.num_folds > 1:
+                gkf = GroupKFold(n_splits=self.num_folds)
+                groups = trainval_df["#Pdb"].values
+                folds = list(gkf.split(trainval_df, groups=groups))
+                train_idx, val_idx = folds[self.fold_index]
+                train_df = trainval_df.iloc[train_idx].reset_index(drop=True)
+                val_df = trainval_df.iloc[val_idx].reset_index(drop=True)
+            else:
+                train_df = trainval_df[trainval_df["split"]=="train"].reset_index(drop=True)
+                val_df = trainval_df[trainval_df["split"]=="val"].reset_index(drop=True)
+            test_df = bench_df
+        print(f"\n[DataModule] Creating TRAIN dataset ({len(train_df)} rows)...")
+        self.train_dataset = AdvancedSiameseDataset(
+            train_df, self.featurizer, self.embedding_dir, augment=False  # Baseline: no augment (enable later for antisymmetry)
+        )
+        print(f"\n[DataModule] Creating VAL dataset ({len(val_df)} rows)...")
+        # Subsampling disabled for v20 ablation to ensure robust Macro-PCC evaluation
+        # (need full diversity of PDB families for honest reporting)
+        self.val_dataset = AdvancedSiameseDataset(
+            val_df, self.featurizer, self.embedding_dir, augment=False,
+            wt_reference_df=train_df  # Pass training set as source for WTs
+        )
+        print(f"\n[DataModule] Creating TEST dataset ({len(test_df)} rows)...")
+        self.test_dataset = AdvancedSiameseDataset(
+            test_df, self.featurizer, self.embedding_dir, augment=False,
+            wt_reference_df=train_df  # Pass training set as source for WTs (no leakage, WTs are known)
+        )
+        # FIX: Create separate dg_test and ddg_test datasets for proper test metric logging
+        # This is CRITICAL for sweep runs - without this, test metrics are never computed!
+        if self.dg_test_dataset is None and self.ddg_test_dataset is None:
+            # Determine WT/MT based on Mutation(s)_cleaned column
+            def is_wt_row(row):
+                mut_str = str(row.get('Mutation(s)_cleaned', '')).strip()
+                return mut_str == '' or mut_str.lower() == 'nan' or mut_str == 'WT'
+            # Separate test_df into WT (for dG test) and MT (for ddG test)
+            test_is_wt = test_df.apply(is_wt_row, axis=1)
+            dg_test_df = test_df[test_is_wt].reset_index(drop=True)
+            ddg_test_df = test_df[~test_is_wt].reset_index(drop=True)
+            if len(dg_test_df) > 0:
+                print(f"\n[DataModule] Creating dG TEST dataset ({len(dg_test_df)} WT rows)...")
+                self.dg_test_dataset = AdvancedSiameseDataset(
+                    dg_test_df, self.featurizer, self.embedding_dir, augment=False,
+                    wt_reference_df=data  # Use full data for WT lookup
+                )
+            else:
+                print(f"[DataModule] WARNING: No WT rows in test set for dG test dataset!")
+            if len(ddg_test_df) > 0:
+                print(f"\n[DataModule] Creating ddG TEST dataset ({len(ddg_test_df)} MT rows)...")
+                self.ddg_test_dataset = AdvancedSiameseDataset(
+                    ddg_test_df, self.featurizer, self.embedding_dir, augment=False,
+                    wt_reference_df=data  # Use full data for WT lookup
+                )
+            else:
+                print(f"[DataModule] WARNING: No MT rows in test set for ddG test dataset!")
+        # Log dataset sizes
+        print(f"\nDataset sizes:")
+        print(f"  Train: {len(self.train_dataset)} samples")
+        print(f"  Val:   {len(self.val_dataset)} samples")
+        print(f"  Test:  {len(self.test_dataset)} samples")
+        if self.dg_test_dataset:
+            print(f"  dG Test:  {len(self.dg_test_dataset)} samples (WT)")
+        if self.ddg_test_dataset:
+            print(f"  ddG Test: {len(self.ddg_test_dataset)} samples (MT)")
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            self.val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    # Dual-split training dataloaders for separate dG-only (Stage A) and ddG (Stage B) training
+    def dg_train_dataloader(self):
+        """Training dataloader for dG head (WT data only for Stage A pretraining)."""
+        if self.dg_train_dataset is None:
+            return None
+        return DataLoader(
+            self.dg_train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def ddg_train_dataloader(self):
+        """Training dataloader for ddG head (mutation data for Stage B training)."""
+        if self.ddg_train_dataset is None:
+            return None
+        return DataLoader(
+            self.ddg_train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    # Dual-split dataloaders for separate dG and ddG validation
+    def dg_val_dataloader(self):
+        """Validation dataloader for dG head (WT data only)."""
+        if self.dg_val_dataset is None:
+            return None
+        return DataLoader(
+            self.dg_val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def dg_test_dataloader(self):
+        """Test dataloader for dG head (WT data only)."""
+        if self.dg_test_dataset is None:
+            return None
+        return DataLoader(
+            self.dg_test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def ddg_val_dataloader(self):
+        """Validation dataloader for ddG head (mutation data including DMS)."""
+        if self.ddg_val_dataset is None:
+            return None
+        return DataLoader(
+            self.ddg_val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )
+    def ddg_test_dataloader(self):
+        """Test dataloader for ddG head (mutation data including DMS)."""
+        if self.ddg_test_dataset is None:
+            return None
+        return DataLoader(
+            self.ddg_test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=advanced_collate_fn
+        )