File size: 84,891 Bytes

6ef15c4

from typing import Tuple, List, Dict, Optional
from pathlib import Path
import os
import math
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import hashlib
import json
import time
from esm3bedding import ESM3Featurizer
from utils import get_logger

logg = get_logger()

#########################################
#        Source Type Mapping            #
#########################################
SOURCE_TYPE_MAP = {
    # Protein complexes (unique structures)
    'PDBbind': 'protein_complex',
    'PPIKB': 'protein_complex',
    'asd_biomap': 'protein_complex',
    'asd_aae': 'protein_complex',
    'asd_aatp': 'protein_complex',
    'asd_osh': 'protein_complex',
    # True mutations
    'SKEMPI': 'mutation',
    'BindingGym': 'mutation',
    'asd_flab_koenig2017': 'mutation',      # 1-2aa differences
    'asd_flab_warszawski2019': 'mutation',  # 1-2aa differences
    'asd_flab_rosace2023': 'mutation',      # 1-5aa differences
    'PEPBI': 'mutation',
    # Antibody CDR variants
    'asd_abbd': 'antibody_cdr',             # 3-14aa CDR differences
    'abdesign': 'antibody_cdr',
    'asd_flab_hie2022': 'antibody_cdr',     # 2-17aa differences
    'asd_flab_shanehsazzadeh2023': 'antibody_cdr',  # 3-18aa differences
}
SOURCE_TYPE_TO_ID = {'protein_complex': 0, 'mutation': 1, 'antibody_cdr': 2}
DEFAULT_SOURCE_TYPE = 'mutation'  # Default for unknown sources


#########################################
#        Collate function (Siamese)     #
#########################################
def advanced_collate_fn(batch):
    mut_c1_list, mut_c2_list, mut_y_list = [], [], []
    wt_c1_list, wt_c2_list, wt_y_list = [], [], []
    has_valid_wt_list = []  # CRITICAL: Track which samples have REAL WT embeddings (not zeros)
    meta_list = []

    for data, meta in batch:
        (c1, c2, y, cw1, cw2, yw) = data
        # mutant
        mut_c1_list.append(c1)
        mut_c2_list.append(c2)
        mut_y_list.append(torch.tensor([y], dtype=torch.float32))
        # wildtype
        if cw1 is not None and cw2 is not None and yw is not None:
            wt_c1_list.append(cw1)
            wt_c2_list.append(cw2)
            wt_y_list.append(torch.tensor([yw], dtype=torch.float32))
            has_valid_wt_list.append(True)  # Real WT data available
        else:
            # fallback if no known WT - ZEROS corrupt ddG signal!
            wt_c1_list.append(torch.zeros((1, c1.shape[1])))
            wt_c2_list.append(torch.zeros((1, c2.shape[1])))
            wt_y_list.append(torch.tensor([0.0], dtype=torch.float32))
            has_valid_wt_list.append(False)  # INVALID for ddG - would compute mut-0=mut

        meta_list.append(meta)

    # pad mutant
    c1_padded = pad_sequence(mut_c1_list, batch_first=True)
    c2_padded = pad_sequence(mut_c2_list, batch_first=True)

    B = c1_padded.shape[0]
    N1 = c1_padded.shape[1]
    N2 = c2_padded.shape[1]
    c1_mask_list, c2_mask_list = [], []
    for i in range(B):
        l1 = mut_c1_list[i].shape[0]
        l2 = mut_c2_list[i].shape[0]
        m1 = [True]*l1 + [False]*(N1-l1)
        m2 = [True]*l2 + [False]*(N2-l2)
        c1_mask_list.append(torch.tensor(m1, dtype=torch.bool))
        c2_mask_list.append(torch.tensor(m2, dtype=torch.bool))
    c1_mask = torch.stack(c1_mask_list, dim=0)
    c2_mask = torch.stack(c2_mask_list, dim=0)
    y_mut = torch.cat(mut_y_list, dim=0)

    # pad wildtype
    w1_padded = pad_sequence(wt_c1_list, batch_first=True)
    w2_padded = pad_sequence(wt_c2_list, batch_first=True)
    N1w = w1_padded.shape[1]
    N2w = w2_padded.shape[1]
    w1_mask_list, w2_mask_list = [], []
    for i in range(B):
        l1 = wt_c1_list[i].shape[0]
        l2 = wt_c2_list[i].shape[0]
        m1 = [True]*l1 + [False]*(N1w-l1)
        m2 = [True]*l2 + [False]*(N2w-l2)
        w1_mask_list.append(torch.tensor(m1, dtype=torch.bool))
        w2_mask_list.append(torch.tensor(m2, dtype=torch.bool))
    w1_mask = torch.stack(w1_mask_list, dim=0)
    w2_mask = torch.stack(w2_mask_list, dim=0)
    y_wt = torch.cat(wt_y_list, dim=0)

    has_wt_list = []
    is_wt_list = []  # NEW: Track which samples ARE WT (not just have WT reference)
    has_dg_list = []
    has_ddg_list = []  # Track which samples have valid explicit ddG
    has_inferred_ddg_list = []  # NEW: Track which samples have inferred ddG
    has_both_list = []
    ddg_list = []
    ddg_inferred_list = []  # NEW: Inferred ddG values
    
    # DEBUG: Track data consistency
    n_has_ddg_true = 0
    n_ddg_zero = 0
    n_ddg_nan = 0
    
    for i in range(B):
        # from meta - use has_any_wt to include both real and inferred WT sequences
        has_wt_list.append(meta_list[i].get("has_any_wt", meta_list[i].get("has_real_wt", False)))
        is_wt_list.append(meta_list[i].get("is_wt", False))  # NEW: Whether sample IS a WT sample (not mutant)
        has_dg_list.append(meta_list[i].get("has_dg", False))  # Default False to prevent false positives
        # FIX: Include inferred ddG in has_ddg flag so validation samples with dG_mut and dG_wt are used
        has_explicit_ddg = meta_list[i].get("has_ddg", False)
        has_inferred_ddg_flag = meta_list[i].get("has_inferred_ddg", False)
        # has_ddg should be True if we have EITHER explicit OR inferred ddG
        has_ddg_flag = has_explicit_ddg or has_inferred_ddg_flag
        has_ddg_list.append(has_ddg_flag)
        has_inferred_ddg_list.append(has_inferred_ddg_flag)
        has_both_list.append(meta_list[i].get("has_both_dg_ddg", False))  # For symmetric consistency
        
        # FIX: Use explicit ddG if available, otherwise use inferred ddG (dG_mut - dG_wt)
        ddg_val = meta_list[i].get("ddg", float('nan'))
        ddg_inf_val = meta_list[i].get("ddg_inferred", float('nan'))
        is_explicit_nan = ddg_val != ddg_val
        is_inferred_nan = ddg_inf_val != ddg_inf_val
        
        # DEBUG: Check for data consistency issues
        if has_explicit_ddg:
            n_has_ddg_true += 1
            if is_explicit_nan:
                n_ddg_nan += 1
            elif abs(ddg_val) < 1e-8:
                n_ddg_zero += 1
        
        # Priority: explicit ddG > inferred ddG > 0.0 fallback (masked out)
        if not is_explicit_nan:
            ddg_list.append(ddg_val)
        elif not is_inferred_nan:
            ddg_list.append(ddg_inf_val)  # Use inferred ddG when explicit unavailable
        else:
            ddg_list.append(0.0)  # Fallback (will be masked by has_ddg=False)
        # Collect inferred ddG values for separate tracking (already fetched above)
        ddg_inferred_list.append(ddg_inf_val if not is_inferred_nan else 0.0)
    
    # DEBUG: Log batch statistics if there are issues
    if n_has_ddg_true > 0 and (n_ddg_nan > 0 or n_ddg_zero > B // 2):
        print(f"[COLLATE DEBUG] Batch has_ddg stats: {n_has_ddg_true}/{B} have has_ddg=True, "
              f"{n_ddg_nan} have NaN ddg (BUG!), {n_ddg_zero} have ddg≈0")
    
    has_wt = torch.tensor(has_wt_list, dtype=torch.bool)
    has_valid_wt = torch.tensor(has_valid_wt_list, dtype=torch.bool)  # CRITICAL: Only True if WT is real (not zeros)
    is_wt = torch.tensor(is_wt_list, dtype=torch.bool)  # Sample IS a WT sample
    has_dg = torch.tensor(has_dg_list, dtype=torch.bool)
    has_ddg = torch.tensor(has_ddg_list, dtype=torch.bool)
    has_inferred_ddg = torch.tensor(has_inferred_ddg_list, dtype=torch.bool)
    has_both_dg_ddg = torch.tensor(has_both_list, dtype=torch.bool)
    ddg_labels = torch.tensor(ddg_list, dtype=torch.float32)
    ddg_inferred_labels = torch.tensor(ddg_inferred_list, dtype=torch.float32)
    
    # DEBUG: Log WT validity stats for first few batches
    n_valid_wt = has_valid_wt.sum().item()
    n_has_wt = has_wt.sum().item()
    if n_has_wt > 0 and n_valid_wt < n_has_wt:
        print(f"[COLLATE DEBUG] WT validity: {n_valid_wt}/{n_has_wt} have valid WT embeddings "
              f"({n_has_wt - n_valid_wt} samples have zero-fallback and will be EXCLUDED from ddG training)")

    # Collect data_source for per-source metrics
    data_source_list = [meta_list[i].get("data_source", "unknown") for i in range(B)]
    
    # Collect source_type_ids for model conditioning
    source_type_id_list = []
    for i in range(B):
        data_src = meta_list[i].get("data_source", "unknown")
        source_type = SOURCE_TYPE_MAP.get(data_src, DEFAULT_SOURCE_TYPE)
        source_type_id = SOURCE_TYPE_TO_ID[source_type]
        source_type_id_list.append(source_type_id)
    source_type_ids = torch.tensor(source_type_id_list, dtype=torch.long)

    out = {
        "mutant": (c1_padded, c1_mask, c2_padded, c2_mask, y_mut),
        "wildtype": (w1_padded, w1_mask, w2_padded, w2_mask, y_wt),
        "has_wt": has_wt,
        "has_valid_wt": has_valid_wt,  # CRITICAL: True only if WT embeddings are real (not zeros)
        "is_wt": is_wt,  # Sample IS a WT sample (for routing to dG head)
        "has_dg": has_dg,  # Whether samples have absolute dG values
        "has_ddg": has_ddg,  # Whether samples have valid explicit ddG values
        "has_inferred_ddg": has_inferred_ddg,  # Whether samples have inferred ddG
        "has_both_dg_ddg": has_both_dg_ddg,  # For symmetric consistency loss
        "ddg_labels": ddg_labels,  # Direct ddG labels for BindingGym-style data
        "ddg_inferred_labels": ddg_inferred_labels,  # Inferred ddG = dG_mut - dG_wt
        "data_source": data_source_list,  # For per-source validation metrics
        "source_type_ids": source_type_ids,  # For model conditioning (0=protein_complex, 1=mutation, 2=antibody_cdr)
        "metadata": meta_list
    }
    return out

#########################################
#  SiameseDataset (Simplified)          #
#########################################
class AdvancedSiameseDataset(Dataset):
    """
    Dataset that handles mutation positions with a simple indicator channel.
    
    Reads columns:
      #Pdb, block1_sequence, block1_mut_positions, block1_mutations,
      block2_sequence, block2_mut_positions, block2_mutations, del_g, ...
    """
    def __init__(self, df: pd.DataFrame, featurizer: ESM3Featurizer, embedding_dir: str, 
                 normalize_embeddings=True, augment=False, max_len=1022,
                 wt_reference_df: pd.DataFrame = None):
        super().__init__()
        
        # Store WT reference DF (e.g. training set) for looking up missing WTs
        # This enables Implicit ddG (dG_mut - dG_wt) even if WTs are not in the current split
        self.wt_reference_df = wt_reference_df if wt_reference_df is not None else None
        initial_len = len(df)
        
        # CRITICAL FIX: Do NOT drop rows based on length because it shifts indices!
        # External splits (indices) rely on the original row numbers.
        # Instead, we TRUNCATE sequences that are too long to maintain alignment.
        
        # Identify long sequences
        long_mask = (df["block1_sequence"].astype(str).str.len() > max_len) | \
                    (df["block2_sequence"].astype(str).str.len() > max_len)
        n_long = long_mask.sum()
        
        if n_long > 0:
            print(f"  [Dataset] Truncating {n_long} samples with length > {max_len} to maintain index alignment (CRITICAL FIX).")
            # Truncate sequences in place
            # Use .copy() to avoid SettingWithCopyWarning if df is a slice
            df = df.copy() 
            df.loc[long_mask, "block1_sequence"] = df.loc[long_mask, "block1_sequence"].astype(str).str.slice(0, max_len)
            df.loc[long_mask, "block2_sequence"] = df.loc[long_mask, "block2_sequence"].astype(str).str.slice(0, max_len)

        # No rows dropped, so indices remain aligned with split files
        self.df = df.reset_index(drop=True)

        #region agent log
        try:
            cols = set(self.df.columns.tolist())
            need = {"block1_mut_positions", "block2_mut_positions", "Mutation(s)_PDB"}
            missing = sorted(list(need - cols))
            payload = {
                "sessionId": "debug-session",
                "runId": "pre-fix",
                "hypothesisId": "G",
                "location": "modules.py:AdvancedSiameseDataset:__init__",
                "message": "Dataset columns presence check for mutation positions",
                "data": {
                    "n_rows": int(len(self.df)),
                    "has_block1_mut_positions": "block1_mut_positions" in cols,
                    "has_block2_mut_positions": "block2_mut_positions" in cols,
                    "has_mutation_pdb": "Mutation(s)_PDB" in cols,
                    "missing": missing,
                },
                "timestamp": int(time.time() * 1000),
            }
            with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
                f.write(json.dumps(payload, default=str) + "\n")
            print(f"[AGENTLOG MUTPOSCOLS] missing={missing}")
        except Exception:
            pass
        #endregion

        #region agent log
        # Disambiguate whether "0 positions" is happening for MUT embeddings or WT embeddings
        try:
            if not hasattr(self, "_agent_embed_call_counter"):
                self._agent_embed_call_counter = 0
            if self._agent_embed_call_counter < 10:
                self._agent_embed_call_counter += 1
                print(
                    f"[AGENTLOG EMBCALL] idx={idx} role=mut "
                    f"b1_mutpos_n={len(b1_mutpos)} b2_mutpos_n={len(b2_mutpos)} "
                    f"seq1_len={len(item.get('seq1',''))} seq2_len={len(item.get('seq2',''))}"
                )
        except Exception:
            pass
        #endregion
        
        # Recover antibody WTs (ANTIBODY_MUTATION) before augmentation or indexing
        self.df = self._recover_antibody_wts(self.df)
        
        # ---------- OPTIONAL AUGMENT: reverse mutation (mut ↔ WT) ----------
        # Only augment MUTANT samples (not WT) - WT samples don't benefit from reversal
        # and doubling them confuses the pdb_to_wt lookup
        if augment:
            # Identify mutant rows (non-empty Mutation(s)_PDB)
            mut_mask = self.df["Mutation(s)_PDB"].notna() & (self.df["Mutation(s)_PDB"].str.strip() != "")
            mutant_df = self.df[mut_mask].copy()
            
            if len(mutant_df) > 0:
                # Create reversed copies of mutant samples only
                rev_df = mutant_df.copy()
                # For the reverse augmentation we invert the sign of ddg
                if "ddg" in rev_df.columns:
                    rev_df["ddg"] = -rev_df["ddg"]
                rev_df["is_reverse"] = True  # flag for reversed samples
                
                # Original samples stay as-is
                self.df["is_reverse"] = False
                self.df = pd.concat([self.df, rev_df], ignore_index=True)
                print(f"  [Dataset] Augmented: added {len(rev_df)} reversed mutant samples (antisymmetry training)")
            else:
                self.df["is_reverse"] = False
        else:
            self.df["is_reverse"] = False
        # -------------------------------------------------------------------

        # ---------- PAIR ID (mutant – WT) ----------------------------------
        # Use PDB + cleaned‑mutation string so mutant and its WT share an ID
        self.df["pair_id"] = (
            self.df["#Pdb"].astype(str) + "_" +
            self.df["Mutation(s)_cleaned"].fillna("")  # WT rows have empty mutation
        )
        # -------------------------------------------------------------------

       
        self.featurizer = featurizer
        self.embedding_dir = Path(embedding_dir)
        self.embedding_dir.mkdir(exist_ok=True, parents=True)
        self.normalize = normalize_embeddings

        self.samples = []
        self._embedding_cache = {}  # LRU-style cache for frequently accessed embeddings
        self._cache_max_size = 20000  # Cache up to 20k embeddings (~20-40GB RAM)
        self._cache_hits = 0
        self._cache_misses = 0

        # map each PDB to a wildtype row index if it exists
        print(f"  [Dataset] Building WT index for {len(self.df)} rows...")
        self.pdb_to_wt = {}
        for i, row in self.df.iterrows():
            pdb = row["#Pdb"]
            mut_str = row.get("Mutation(s)_PDB","")
            is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
            if is_wt and pdb not in self.pdb_to_wt:
                self.pdb_to_wt[pdb] = i

        # Build external WT map if reference DF is provided
        self.external_pdb_to_wt = {}
        if self.wt_reference_df is not None:
            print(f"  [Dataset] Building external WT index from {len(self.wt_reference_df)} reference rows...")
            for i, row in self.wt_reference_df.iterrows():
                # Only index actual WTs
                mut_str = row.get("Mutation(s)_PDB","")
                is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
                if 'is_wt' in row: # Prioritize pre-computed flag
                    is_wt = is_wt or row['is_wt']
                    
                pdb = row["#Pdb"]
                if is_wt and pdb not in self.external_pdb_to_wt:
                    self.external_pdb_to_wt[pdb] = i
            print(f"  [Dataset] Indexed {len(self.external_pdb_to_wt)} external WTs.")

        # Build external WT map if reference DF is provided
        self.external_pdb_to_wt = {}
        if self.wt_reference_df is not None:
            print(f"  [Dataset] Building external WT index from {len(self.wt_reference_df)} reference rows...")
            for i, row in self.wt_reference_df.iterrows():
                # Only index actual WTs
                mut_str = row.get("Mutation(s)_PDB","")
                is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
                # Also check 'is_wt' column if present
                if 'is_wt' in row:
                    is_wt = is_wt or row['is_wt']
                    
                pdb = row["#Pdb"]
                if is_wt and pdb not in self.external_pdb_to_wt:
                    self.external_pdb_to_wt[pdb] = i
            print(f"  [Dataset] Indexed {len(self.external_pdb_to_wt)} external WTs.")

        # LAZY LOADING: Only store metadata, NOT embeddings
        # Embeddings will be loaded on-demand in __getitem__
        print(f"  [Dataset] Building sample metadata for {len(self.df)} rows (lazy loading)...")
        from tqdm import tqdm
        for i, row in tqdm(self.df.iterrows(), total=len(self.df), desc="  Indexing"):
            # RESET computed mutations for this row to prevent stale data from previous iterations
            if hasattr(self, '_last_computed_mutpos'):
                del self._last_computed_mutpos
                
            pdb = row["#Pdb"]
            seq1 = row["block1_sequence"]
            seq2 = row["block2_sequence"]
            
            # Data source for per-source validation metrics
            data_source = row.get("data_source", "unknown")
            
            # Handle missing dG values (e.g., BindingGym has only ddG)
            raw_delg = row["del_g"]
            delg = float(raw_delg) if pd.notna(raw_delg) and raw_delg != '' else float('nan')
            
            # Get ddG if available (for ddG-only datasets like BindingGym)
            raw_ddg = row.get("ddg", None)
            ddg = float(raw_ddg) if pd.notna(raw_ddg) and raw_ddg != '' else float('nan')

            # Parse mutations (just store the string, parse later)
            b1_mutpos_str = row.get("block1_mut_positions","[]")
            b2_mutpos_str = row.get("block2_mut_positions","[]")

            # DEBUG: Print first few rows to debug disappearing mutations
            if i < 5:
                print(f"DEBUG ROW {i}: b1='{b1_mutpos_str}' ({type(b1_mutpos_str)}), b2='{b2_mutpos_str}' ({type(b2_mutpos_str)})")
                #region agent log
                try:
                    payload = {
                        "sessionId": "debug-session",
                        "runId": "pre-fix",
                        "hypothesisId": "G",
                        "location": "modules.py:AdvancedSiameseDataset:__init__:row0_4",
                        "message": "Raw mutpos strings from df row (first few)",
                        "data": {
                            "i": int(i),
                            "b1_mutpos_str": str(b1_mutpos_str),
                            "b2_mutpos_str": str(b2_mutpos_str),
                            "mutation_pdb": str(row.get("Mutation(s)_PDB", "")),
                        },
                        "timestamp": int(time.time() * 1000),
                    }
                    with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
                        f.write(json.dumps(payload, default=str) + "\n")
                    print(f"[AGENTLOG MUTPOSRAW] i={i} b1={b1_mutpos_str} b2={b2_mutpos_str} mut={row.get('Mutation(s)_PDB','')}")
                except Exception:
                    pass
                #endregion
            
            # Get chain info for block assignment during WT inference
            b1_chains = str(row.get("block1_chains", "")).upper()
            b2_chains = str(row.get("block2_chains", "")).upper()

            mut_str = row.get("Mutation(s)_PDB","")
            is_wt = (pd.isna(mut_str) or mut_str.strip()=="")
            wt_idx = self.pdb_to_wt.get(pdb, None)
            
            # Get WT info if available (Internal > External)
            row_wt = None
            wt_source = None
            
            if not hasattr(self, '_wt_source_stats'):
                self._wt_source_stats = {'internal': 0, 'external': 0}

            if wt_idx is not None:
                row_wt = self.df.iloc[wt_idx]
                wt_source = 'internal'
                self._wt_source_stats['internal'] += 1
            elif pdb in self.external_pdb_to_wt:
                ext_idx = self.external_pdb_to_wt[pdb]
                row_wt = self.wt_reference_df.iloc[ext_idx]
                wt_source = 'external'
                self._wt_source_stats['external'] += 1
            
            if row_wt is not None:
                seq1_wt = row_wt["block1_sequence"]
                seq2_wt = row_wt["block2_sequence"]
                raw_delg_wt = row_wt["del_g"]
                delg_wt = float(raw_delg_wt) if pd.notna(raw_delg_wt) and raw_delg_wt != '' else float('nan')
                b1_wtpos_str = row_wt.get("block1_mut_positions","[]")
                b2_wtpos_str = row_wt.get("block2_mut_positions","[]")
                
                # BUGFIX: If we have WT but NO mutation positions in CSV, we MUST calculate them!
                # This fixes the "0% mutation positions" issue when the CSV column is empty/missing
                if not is_wt and (b1_mutpos_str in ["[]", "", "nan", "None"] and b2_mutpos_str in ["[]", "", "nan", "None"]):
                     # Run inference to locate mutations (side-effect: sets _last_computed_mutpos)
                     # We ignore the inferred WT sequence since we have the real one
                     # We pass "[]" to force scanning PDB positions
                     self._infer_wt_sequences(
                         seq1, seq2, mut_str, "[]", "[]", 
                         b1_chains, b2_chains
                     )
                     
                     # Update mutpos_str if we found mutations
                     if hasattr(self, '_last_computed_mutpos'):
                        comp_b1, comp_b2 = self._last_computed_mutpos
                        if b1_mutpos_str in ["[]", "", "nan", "None"] and comp_b1:
                            b1_mutpos_str = str(comp_b1)
                        if b2_mutpos_str in ["[]", "", "nan", "None"] and comp_b2:
                            b2_mutpos_str = str(comp_b2)

            else:
                # No WT row found - try to INFER WT sequence by reversing mutations
                # This is crucial for BindingGym data which stores mutant sequences only
                seq1_wt, seq2_wt = self._infer_wt_sequences(
                    seq1, seq2, mut_str, b1_mutpos_str, b2_mutpos_str,
                    b1_chains, b2_chains  # Chain info for block assignment
                )
                delg_wt = float('nan')  # No WT dG available for inferred sequences
                b1_wtpos_str, b2_wtpos_str = "[]", "[]"  # WT has no mutation positions
                
                # FIX Bug #3: Use computed mutation positions from inference if original empty
                if hasattr(self, '_last_computed_mutpos'):
                    comp_b1, comp_b2 = self._last_computed_mutpos
                    if b1_mutpos_str in ["[]", "", "nan", "None"] and comp_b1:
                        b1_mutpos_str = str(comp_b1)
                    if b2_mutpos_str in ["[]", "", "nan", "None"] and comp_b2:
                        b2_mutpos_str = str(comp_b2)

            # Check if this sample has BOTH dG and ddG (for symmetric consistency)
            has_dg = not (delg != delg)  # False if NaN
            has_ddg = not (ddg != ddg)   # False if NaN
            has_both = has_dg and has_ddg
            
            # NEW: Compute inferred ddG for samples with dG_mut and dG_wt but no explicit ddG
            # ddG_inferred = dG_mut - dG_wt (can be used as additional training signal)
            has_dg_wt = not (delg_wt != delg_wt)  # False if NaN
            has_inferred_ddg = has_dg and has_dg_wt and (not has_ddg)  # Only if no explicit ddG
            if has_inferred_ddg:
                ddg_inferred = delg - delg_wt  # Computed from dG values
            else:
                ddg_inferred = float('nan')
            
            # Track WT availability: real (from row), inferred, or none
            has_real_wt = (wt_idx is not None)
            has_inferred_wt = (wt_idx is None and seq1_wt is not None and seq2_wt is not None)
            has_any_wt = has_real_wt or has_inferred_wt
            
            # Store ONLY metadata - no embeddings loaded yet!
            is_reverse = row.get("is_reverse", False)  # Track reversed samples
            
            # CRITICAL: Swap sequences and dG for reversed samples (antisymmetry augmentation)
            if is_reverse:
                # Swap sequences: New Mutant = Old WT, New WT = Old Mutant
                if seq1_wt is not None and seq2_wt is not None:
                    seq1, seq1_wt = seq1_wt, seq1
                    seq2, seq2_wt = seq2_wt, seq2
                    # Swap dG values
                    delg, delg_wt = delg_wt, delg
                    # Negate inferred ddG (dG_new_mut - dG_new_wt = dG_old_wt - dG_old_mut = -(dG_old_mut - dG_old_wt))
                    if not math.isnan(ddg_inferred):
                        ddg_inferred = -ddg_inferred
                    # Note: Explicit 'ddg' is already negated in __init__ augmentation logic
                    # Note: We do NOT swap mutation positions because the indices of difference 
                    # are the same for A->B vs B->A. We want the 'input' (new mutant) to have 
                    # the indicator flags at the difference sites.
            
            self.samples.append({
                "pdb": pdb,
                "is_wt": is_wt,
                "is_reverse": is_reverse,  # True if this is a reversed (augmented) sample
                "seq1": seq1, "seq2": seq2, "delg": delg,
                "seq1_wt": seq1_wt, "seq2_wt": seq2_wt, "delg_wt": delg_wt,
                "ddg": ddg,
                "ddg_inferred": ddg_inferred,  # NEW: Computed from dG_mut - dG_wt
                "has_dg": has_dg,
                "has_ddg": has_ddg,
                "has_inferred_ddg": has_inferred_ddg,  # NEW: True if ddg_inferred is valid
                "has_both_dg_ddg": has_both,
                "has_real_wt": has_real_wt,
                "has_inferred_wt": has_inferred_wt,
                "has_any_wt": has_any_wt,
                "b1_mutpos_str": b1_mutpos_str,
                "b2_mutpos_str": b2_mutpos_str,
                "b1_wtpos_str": b1_wtpos_str,
                "b2_wtpos_str": b2_wtpos_str,
                "data_source": data_source
            })
        
        # Log WT inference statistics
        n_real_wt = sum(1 for s in self.samples if s["has_real_wt"])
        n_inferred_wt = sum(1 for s in self.samples if s["has_inferred_wt"])
        n_no_wt = len(self.samples) - n_real_wt - n_inferred_wt
        
        # Detailed stats for Real WTs (Internal vs External)
        if hasattr(self, '_wt_source_stats'):
            n_internal = self._wt_source_stats.get('internal', 0)
            n_external = self._wt_source_stats.get('external', 0)
            source_msg = f" (Internal: {n_internal}, External: {n_external})"
        else:
            source_msg = ""

        print(f"  [Dataset] Ready! {len(self.samples)} samples indexed (embeddings loaded on-demand)")
        print(f"  [Dataset] WT stats: {n_real_wt} real WT{source_msg}, {n_inferred_wt} inferred WT, {n_no_wt} no WT")
        
        # Log detailed failure breakdown (for debugging)
        if hasattr(self, '_wt_inference_failures') and hasattr(self, '_wt_inference_fail_count'):
            print(f"  [Dataset] ⚠️ WT inference failed for {self._wt_inference_fail_count} samples:")
            fail_dict = self._wt_inference_failures
            
            # Count by category (note: these are capped sample counts, not totals)
            n_no_pdb = len(fail_dict.get('no_pdb', []))
            n_del_ins = len(fail_dict.get('del_ins_only', []))
            n_parse = len(fail_dict.get('parse_fail', []))
            
            if n_no_pdb > 0:
                print(f"      - ANTIBODY samples (no PDB structure): {self._wt_inference_fail_count} samples")
                print(f"        (These are antibody design samples without original PDB - only dG usable)")
            elif n_del_ins > 0 or n_parse > 0:
                print(f"      - DEL/INS/stop-codon (can't reverse): counted")
                print(f"      - Parsing failed (unknown format): counted")
            
            # Show samples for non-ANTIBODY failures
            if fail_dict.get('parse_fail') and n_no_pdb == 0:
                print(f"      Sample parse failures:")
                for mut in fail_dict['parse_fail'][:5]:
                    print(f"        '{mut}'")

    def _parse_mutpos(self, pos_str) -> List[int]:
        """
        pos_str might be '[]' or '[170, 172]' etc.
        We'll do a simple parse.
        """
        # Handle NaN, None, or non-string values
        if pos_str is None or (isinstance(pos_str, float) and str(pos_str) == 'nan'):
            return []
        if not isinstance(pos_str, str):
            pos_str = str(pos_str)
        pos_str = pos_str.strip()
        if pos_str.startswith("[") and pos_str.endswith("]"):
            inside = pos_str[1:-1].strip()
            if not inside:
                return []
            # split by comma
            arr = inside.split(",")
            out = []
            for x in arr:
                x_ = x.strip()
                if x_:
                    out.append(int(x_))
            return out
        return []

    def _recover_antibody_wts(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Recover WT information for antibody samples (ANTIBODY_MUTATION)
        by finding the closest-to-consensus sequence in each antigen group.
        
        Strategy:
        1. Identify samples with 'ANTIBODY_MUTATION'
        2. Group by antigen (block2_sequence)
        3. Assign unique Pseudo-PDB ID to each group (e.g. ANTIBODY_GRP_xxx)
        4. For same-length groups: find sequence closest to consensus as WT
        5. For variable-length groups: fallback to best binder (lowest del_g)
        6. Mark selected sequence as WT (clear mutation string)
        """
        from collections import Counter
        
        # Identify antibody mutation rows
        mask = df['Mutation(s)_PDB'].astype(str).str.contains('ANTIBODY_MUTATION', na=False)
        
        if not mask.any():
            return df
            
        print(f"  [Dataset] Attempting to recover WT for {mask.sum()} antibody samples...")
        
        recovered_count = 0
        n_groups = 0
        n_consensus = 0
        n_median = 0
        n_fallback = 0
        
        # We need a copy to avoid SettingWithCopy warnings if df is a slice
        df = df.copy()
        
        # Add a temporary column for grouping (hash of antigen sequence)
        df['temp_antigen_hash'] = df['block2_sequence'].apply(lambda x: hashlib.md5(str(x).encode()).hexdigest())
        
        # Get hashes for antibody rows
        ab_hashes = df.loc[mask, 'temp_antigen_hash'].unique()
        
        for h in ab_hashes:
            # Get all antibody rows for this antigen
            grp_mask = mask & (df['temp_antigen_hash'] == h)
            grp_indices = df.index[grp_mask]
            
            if len(grp_indices) == 0: 
                continue
            
            n_groups += 1
            
            # 1. Create unique Pseudo-PDB ID
            pseudo_pdb = f"ANTIBODY_GRP_{h[:8]}"
            df.loc[grp_indices, '#Pdb'] = pseudo_pdb
            
            # 2. Select WT: closest-to-consensus (same-length) or best-binder (variable-length)
            seqs = df.loc[grp_indices, 'block1_sequence'].tolist()
            seq_lens = set(len(s) for s in seqs)
            
            wt_idx = None
            
            if len(seq_lens) == 1:
                # SAME LENGTH: Use closest-to-consensus
                seq_len = list(seq_lens)[0]
                
                # Build consensus sequence
                consensus = []
                for pos in range(seq_len):
                    residues = [s[pos] for s in seqs]
                    counts = Counter(residues)
                    most_common = counts.most_common(1)[0][0]
                    consensus.append(most_common)
                consensus_seq = ''.join(consensus)
                
                # Find sequence with minimum Hamming distance to consensus
                min_dist = float('inf')
                for idx in grp_indices:
                    seq = df.at[idx, 'block1_sequence']
                    dist = sum(c1 != c2 for c1, c2 in zip(seq, consensus_seq))
                    if dist < min_dist:
                        min_dist = dist
                        wt_idx = idx
                
                n_consensus += 1
            else:
                # VARIABLE LENGTH: Fallback to median binder (more representative than best)
                if 'del_g' in df.columns:
                    delg_vals = pd.to_numeric(df.loc[grp_indices, 'del_g'], errors='coerce').dropna()
                    if len(delg_vals) > 0:
                        # Find index of value closest to median
                        median_val = delg_vals.median()
                        median_idx = (delg_vals - median_val).abs().idxmin()
                        wt_idx = median_idx
                        n_median += 1
            
            # FINAL FALLBACK: Pick first sample if no other method works (e.g., all NaN dG)
            if wt_idx is None and len(grp_indices) > 0:
                wt_idx = grp_indices[0]
                n_fallback += 1
            
            # 3. Mark selected sequence as WT
            if wt_idx is not None:
                df.at[wt_idx, 'Mutation(s)_PDB'] = ""
                recovered_count += len(grp_indices)
        
        # Cleanup
        df.drop(columns=['temp_antigen_hash'], inplace=True, errors='ignore')
        
        print(f"  [Dataset] Recovered {recovered_count} antibody samples ({n_groups} groups):")
        print(f"      - {n_consensus} groups via closest-to-consensus")
        print(f"      - {n_median} groups via median-binder (variable-length)")
        if n_fallback > 0:
            print(f"      - {n_fallback} groups via first-sample fallback (no dG data)")
        return df

    def _infer_wt_sequences(self, mut_seq1: str, mut_seq2: str, mutation_str: str,
                            b1_mutpos_str: str, b2_mutpos_str: str,
                            b1_chains: str = "", b2_chains: str = "") -> Tuple[Optional[str], Optional[str]]:
        """
        Infer wildtype sequences by reversing mutations in the mutant sequences.
        
        IMPROVED: Instead of relying on PDB positions (which don't match 0-indexed 
        sequence positions), this version searches for the mutant residue and 
        reverses it. Also computes actual mutation positions as byproduct.
        
        Mutations are in formats like:
        - BindingGym: "H:P53L" or "H:P53L,H:Y57C" (chain:WTresPOSmutres)
        - SKEMPI: "HP53L" or "CA182A" (chainWTresPOSmutres)
        
        Args:
            mut_seq1: Mutant sequence for block1
            mut_seq2: Mutant sequence for block2
            mutation_str: Raw mutation string from data
            b1_mutpos_str: Mutation positions for block1 (e.g., "[52, 56]")
            b2_mutpos_str: Mutation positions for block2
            b1_chains: Chain letters in block1 (e.g., "AB")
            b2_chains: Chain letters in block2 (e.g., "HL")
            
        Returns:
            Tuple of (wt_seq1, wt_seq2) or (None, None) if inference fails
        """
        import re
        
        if pd.isna(mutation_str) or str(mutation_str).strip() == '':
            # No mutations = this IS the wildtype
            return mut_seq1, mut_seq2
        
        # FALLBACK: Handle ANTIBODY_MUTATION samples that couldn't be recovered
        mutation_str_upper = str(mutation_str).strip().upper()
        if 'ANTIBODY_MUTATION' in mutation_str_upper or mutation_str_upper == 'ANTIBODY_MUTATION':
            if not hasattr(self, '_wt_inference_failures'):
                self._wt_inference_failures = {'parse_fail': [], 'del_ins_only': [], 'no_pdb': [], 'other': []}
                self._wt_inference_fail_count = 0
            self._wt_inference_fail_count += 1
            if len(self._wt_inference_failures['no_pdb']) < 5:
                self._wt_inference_failures['no_pdb'].append(mutation_str[:80])
            return None, None
        
        try:
            # Parse mutation string to extract (chain, position, original_AA, mutant_AA)
            mutations = []
            mutation_str = str(mutation_str).strip()
            
            # Split by common delimiters
            parts = re.split(r'[,;]', mutation_str)
            
            for part in parts:
                part = part.strip().strip('"\'')
                if not part:
                    continue
                
                # Skip deletion/insertion markers - can't reverse these
                if 'DEL' in part.upper() or 'INS' in part.upper() or '*' in part:
                    continue
                    
                # BindingGym format: "H:P53L" or "L:K103R"
                if ':' in part:
                    chain_mut = part.split(':')
                    if len(chain_mut) >= 2:
                        chain = chain_mut[0].strip().upper()
                        for mut_part in chain_mut[1:]:
                            mut_part = mut_part.strip()
                            if not mut_part:
                                continue
                            match = re.match(r'([A-Z])(\d+)([A-Z])', mut_part)
                            if match:
                                wt_aa = match.group(1)
                                pos = int(match.group(2))  # PDB-numbered (1-indexed)
                                mut_aa = match.group(3)
                                mutations.append((chain, pos, wt_aa, mut_aa))
                else:
                    # SKEMPI format: "CA182A" = C(WTresidue) + A(chain) + 182(pos) + A(mutant)
                    # Format: WTresidue + ChainID + Position[insertcode] + MutResidue
                    # Example: CA182A means Cysteine at chain A position 182 mutated to Alanine
                    match = re.match(r'([A-Z])([A-Z])(-?\d+[a-z]?)([A-Z])', part)
                    if match:
                        wt_aa = match.group(1)    # First char is WT residue
                        chain = match.group(2).upper()  # Second char is chain ID
                        pos_str = match.group(3)
                        pos = int(re.match(r'-?\d+', pos_str).group())
                        mut_aa = match.group(4)   # Last char is mutant residue
                        mutations.append((chain, pos, wt_aa, mut_aa))
                    else:
                        # Simple format without chain: "F139A" (used by PEPBI)
                        # Format: WTresidue + Position + MutResidue
                        match = re.match(r'([A-Z])(\d+)([A-Z])', part)
                        if match:
                            wt_aa = match.group(1)
                            pos = int(match.group(2))
                            mut_aa = match.group(3)
                            # No chain info - will try both blocks
                            mutations.append(('?', pos, wt_aa, mut_aa))
            
            if not mutations:
                if not hasattr(self, '_wt_inference_failures'):
                    self._wt_inference_failures = {'parse_fail': [], 'del_ins_only': [], 'other': []}
                    self._wt_inference_fail_count = 0
                self._wt_inference_fail_count += 1
                
                if 'DEL' in mutation_str.upper() or 'INS' in mutation_str.upper() or '*' in mutation_str:
                    category = 'del_ins_only'
                else:
                    category = 'parse_fail'
                
                if len(self._wt_inference_failures.get(category, [])) < 10:
                    self._wt_inference_failures.setdefault(category, []).append(mutation_str[:80])
                
                return None, None
            
            # Convert sequences to lists for mutation
            wt_seq1_list = list(mut_seq1) if mut_seq1 else []
            wt_seq2_list = list(mut_seq2) if mut_seq2 else []
            
            # Build chain sets for block assignment
            b1_chain_set = set(b1_chains.upper()) if b1_chains else set()
            b2_chain_set = set(b2_chains.upper()) if b2_chains else set()
            
            # Parse PRECOMPUTED mutation positions (these are correct 0-indexed seq positions)
            # PDB residue numbers often don't match sequence indices due to numbering offsets
            precomputed_b1_positions = self._parse_mutpos(b1_mutpos_str)
            precomputed_b2_positions = self._parse_mutpos(b2_mutpos_str)
            
            # Track reversal success
            if not hasattr(self, '_wt_inference_stats'):
                self._wt_inference_stats = {'reversed': 0, 'not_found': 0, 'total': 0}
            
            # Also track actual mutation positions found
            found_positions_b1 = []
            found_positions_b2 = []
            
            # STRATEGY 1: Use precomputed positions if available (MOST RELIABLE)
            # These were computed during preprocessing with correct PDB-to-sequence mapping
            if precomputed_b1_positions or precomputed_b2_positions:
                pos_idx = 0
                for chain, pdb_pos, wt_aa, mut_aa in mutations:
                    self._wt_inference_stats['total'] += 1
                    reversed_this = False
                    
                    # Determine which block based on chain
                    if chain in b2_chain_set:
                        # Use precomputed block2 positions
                        if pos_idx < len(precomputed_b2_positions):
                            seq_idx = precomputed_b2_positions[pos_idx]
                            if 0 <= seq_idx < len(wt_seq2_list) and wt_seq2_list[seq_idx] == mut_aa:
                                wt_seq2_list[seq_idx] = wt_aa
                                reversed_this = True
                                found_positions_b2.append(seq_idx)
                    elif chain in b1_chain_set:
                        # Use precomputed block1 positions
                        if pos_idx < len(precomputed_b1_positions):
                            seq_idx = precomputed_b1_positions[pos_idx]
                            if 0 <= seq_idx < len(wt_seq1_list) and wt_seq1_list[seq_idx] == mut_aa:
                                wt_seq1_list[seq_idx] = wt_aa
                                reversed_this = True
                                found_positions_b1.append(seq_idx)
                    else:
                        # Chain unknown - try both precomputed positions
                        if pos_idx < len(precomputed_b1_positions):
                            seq_idx = precomputed_b1_positions[pos_idx]
                            if 0 <= seq_idx < len(wt_seq1_list) and wt_seq1_list[seq_idx] == mut_aa:
                                wt_seq1_list[seq_idx] = wt_aa
                                reversed_this = True
                                found_positions_b1.append(seq_idx)
                        if not reversed_this and pos_idx < len(precomputed_b2_positions):
                            seq_idx = precomputed_b2_positions[pos_idx]
                            if 0 <= seq_idx < len(wt_seq2_list) and wt_seq2_list[seq_idx] == mut_aa:
                                wt_seq2_list[seq_idx] = wt_aa
                                reversed_this = True
                                found_positions_b2.append(seq_idx)
                    
                    if reversed_this:
                        self._wt_inference_stats['reversed'] += 1
                    else:
                        self._wt_inference_stats['not_found'] += 1
                    pos_idx += 1
                
                self._last_computed_mutpos = (found_positions_b1, found_positions_b2)
                return ''.join(wt_seq1_list), ''.join(wt_seq2_list)
            
            # STRATEGY 2: Fall back to PDB position-based search (less reliable)
            for chain, pdb_pos, wt_aa, mut_aa in mutations:
                self._wt_inference_stats['total'] += 1
                reversed_this = False
                found_idx = None
                
                # Determine which block(s) to search based on chain
                chain_known = chain in b1_chain_set or chain in b2_chain_set
                
                if chain in b1_chain_set:
                    blocks_to_try = [(wt_seq1_list, True, found_positions_b1)]
                elif chain in b2_chain_set:
                    blocks_to_try = [(wt_seq2_list, False, found_positions_b2)]
                else:
                    # Chain info unavailable - try BOTH blocks
                    blocks_to_try = [
                        (wt_seq1_list, True, found_positions_b1),
                        (wt_seq2_list, False, found_positions_b2)
                    ]
                
                for target_seq, is_block1, pos_list in blocks_to_try:
                    if reversed_this:
                        break  # Already found in previous block
                    
                    guess_idx = pdb_pos - 1  # Convert to 0-indexed
                    
                    # Strategy 1: Try exact position if in bounds
                    if 0 <= guess_idx < len(target_seq) and target_seq[guess_idx] == mut_aa:
                        found_idx = guess_idx
                    else:
                        # Strategy 2: Search ±50 window around expected position
                        search_start = max(0, pdb_pos - 50)
                        search_end = min(len(target_seq), pdb_pos + 50)
                        for idx in range(search_start, search_end):
                            if target_seq[idx] == mut_aa:
                                found_idx = idx
                                break
                    
                    # Strategy 3: If position was out of bounds AND chain unknown,
                    # search the ENTIRE sequence as last resort
                    if found_idx is None and not chain_known:
                        if guess_idx >= len(target_seq) or guess_idx < 0:
                            # Position was out of bounds - search entire sequence
                            for idx in range(len(target_seq)):
                                if target_seq[idx] == mut_aa:
                                    found_idx = idx
                                    break
                    
                    if found_idx is not None:
                        target_seq[found_idx] = wt_aa  # Reverse the mutation!
                        reversed_this = True
                        pos_list.append(found_idx)
                
                if reversed_this:
                    self._wt_inference_stats['reversed'] += 1
                else:
                    self._wt_inference_stats['not_found'] += 1
            
            # Store computed mutation positions for later use (helps with Bug #3)
            # These are the ACTUAL 0-indexed positions in the sequence
            self._last_computed_mutpos = (found_positions_b1, found_positions_b2)
            
            return ''.join(wt_seq1_list), ''.join(wt_seq2_list)
            
        except Exception as e:
            # On any error, return None to indicate inference failed
            return None, None

    def _get_embedding(self, seq: str, mut_positions: List[int]) -> torch.Tensor:
        """
        Basic embedding with mutation position indicator channel.
        
        Args:
            seq: The protein sequence
            mut_positions: List of positions that are mutated (0-indexed)
        """
        # Get base ESM embedding (already ensures min length of 2)
        base_emb = self._get_or_create_embedding(seq)  # => [L, 1152]
        base_emb = base_emb.cpu()
        
        # Get sequence length and embedding dimension
        L, D = base_emb.shape

        #region agent log
        try:
            if not hasattr(self, "_agent_log_counter"):
                self._agent_log_counter = 0
            if self._agent_log_counter < 5:
                self._agent_log_counter += 1
                last1_stats = None
                last2_stats = None
                if D >= 1153:
                    v1 = base_emb[:, -1]
                    last1_stats = {
                        "min": float(v1.min().item()),
                        "max": float(v1.max().item()),
                        "mean": float(v1.float().mean().item()),
                        "std": float(v1.float().std().item()),
                    }
                if D >= 1154:
                    v2 = base_emb[:, -2]
                    last2_stats = {
                        "min": float(v2.min().item()),
                        "max": float(v2.max().item()),
                        "mean": float(v2.float().mean().item()),
                        "std": float(v2.float().std().item()),
                    }
                payload = {
                    "sessionId": "debug-session",
                    "runId": "pre-fix",
                    "hypothesisId": "F",
                    "location": "modules.py:AdvancedSiameseDataset:_get_embedding",
                    "message": "Base embedding shape + tail-channel stats before appending mutation indicator",
                    "data": {
                        "L": int(L),
                        "D": int(D),
                        "mut_positions_n": int(len(mut_positions) if mut_positions is not None else -1),
                        "mut_positions_first5": (mut_positions[:5] if mut_positions else []),
                        "base_last1": last1_stats,
                        "base_last2": last2_stats,
                    },
                    "timestamp": int(time.time() * 1000),
                }
                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
                    f.write(json.dumps(payload, default=str) + "\n")
                # Also emit a concise line to stdout/logs (useful on cluster runs)
                print(f"[AGENTLOG EMB] D={D} mut_n={len(mut_positions) if mut_positions else 0} last1={last1_stats} last2={last2_stats}")
        except Exception:
            pass
        #endregion
        
        # Create mutation indicator channel (just one channel)
        # FIX FOR DOUBLE-INDICATOR BUG: Check if base_emb already has indicator (D=1153)
        # If D=1153, the cached embedding already has an old indicator channel - OVERWRITE it
        # If D=1152, this is a fresh ESM embedding - APPEND indicator channel
        D = base_emb.shape[-1]
        L = base_emb.shape[0]
        
        if D == 1153:
            # Already has indicator channel (from cache) - overwrite it with correct mutation positions
            new_emb = base_emb.clone()
            new_emb[:, -1] = 0.0  # Reset old indicator
            for pos in mut_positions:
                if isinstance(pos, int) and 0 <= pos < L:
                    new_emb[pos, -1] = 1.0
            print(f"[AGENTLOG INDICATOR-FIX] D=1153 OVERWRITING last channel with {len(mut_positions)} positions")
        else:
            # Fresh ESM embedding (D=1152) - append indicator channel
            chan = torch.zeros((L, 1), dtype=base_emb.dtype, device=base_emb.device)
            for pos in mut_positions:
                if isinstance(pos, int) and 0 <= pos < L:
                    chan[pos, 0] = 1.0
            new_emb = torch.cat([base_emb, chan], dim=-1)
            print(f"[AGENTLOG INDICATOR-FIX] D={D} APPENDING indicator channel with {len(mut_positions)} positions")
        
        return new_emb

    def _get_or_create_embedding(self, seq: str) -> torch.Tensor:
        # Check LRU cache first (limited size to control memory)
        if seq in self._embedding_cache:
            self._cache_hits += 1
            return self._embedding_cache[seq].clone()
        
        seq_hash = hashlib.md5(seq.encode()).hexdigest()
        pt_file = self.embedding_dir / f"{seq_hash}.pt"
        npy_file = self.embedding_dir / f"{seq_hash}.npy"
        
        emb = None
        load_source = None  # Track where embedding came from
        
        # Try .npy first (pre-computed), then .pt
        if npy_file.is_file():
            try:
                import numpy as np
                emb = torch.from_numpy(np.load(npy_file))
                load_source = "npy"
            except Exception:
                pass
        if emb is None and pt_file.is_file():
            try:
                emb = torch.load(pt_file, map_location="cpu")
                load_source = "pt"
            except Exception:
                pt_file.unlink(missing_ok=True)  # Delete corrupted file
        if emb is None:
            # On-the-fly embedding generation for missing sequences (e.g., inferred WT)
            # This is slower but ensures accurate embeddings
            try:
                emb = self.featurizer.transform(seq)  # [L, 1152]
                # Save for future use
                torch.save(emb, pt_file)
                load_source = "generated"
                
                # Track on-the-fly generation stats
                if not hasattr(self, '_on_the_fly_count'):
                    self._on_the_fly_count = 0
                self._on_the_fly_count += 1
                
                # Log first few on-the-fly generations
                if self._on_the_fly_count <= 5:
                    print(f"[EMBEDDING] Generated on-the-fly #{self._on_the_fly_count}: len={len(seq)}, saved to {pt_file.name}")
                elif self._on_the_fly_count == 6:
                    print(f"[EMBEDDING] Generated 5+ embeddings on-the-fly (suppressing further logs)")
                    
            except Exception as e:
                raise RuntimeError(
                    f"Embedding not found and on-the-fly generation failed for sequence (len={len(seq)}): {e}"
                )

        #region agent log
        try:
            if not hasattr(self, "_agent_embload_counter"):
                self._agent_embload_counter = 0
            if self._agent_embload_counter < 8:
                self._agent_embload_counter += 1
                shape = tuple(int(x) for x in emb.shape)
                D = int(shape[1]) if len(shape) == 2 else None
                payload = {
                    "sessionId": "debug-session",
                    "runId": "pre-fix",
                    "hypothesisId": "A",
                    "location": "modules.py:AdvancedSiameseDataset:_get_or_create_embedding",
                    "message": "Loaded embedding tensor (source + shape) before any indicator is appended",
                    "data": {
                        "load_source": load_source,
                        "seq_len": int(len(seq)),
                        "shape": shape,
                        "D": D,
                        "looks_like_has_indicator": bool(D is not None and D >= 1153),
                        "file_pt_exists": bool(pt_file.is_file()),
                        "file_npy_exists": bool(npy_file.is_file()),
                    },
                    "timestamp": int(time.time() * 1000),
                }
                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
                    f.write(json.dumps(payload, default=str) + "\n")
                print(f"[AGENTLOG EMBLOAD] src={load_source} shape={shape} D={D}")
        except Exception:
            pass
        #endregion
        
        # SAFETY: Ensure embedding has valid shape (at least 5 residues for interpolation)
        if emb.shape[0] < 5:
            # Pad to minimum length of 5 by repeating
            repeats = (5 // emb.shape[0]) + 1
            emb = emb.repeat(repeats, 1)[:5]  # Ensure exactly 5 rows
        
        # Track cache miss
        self._cache_misses += 1
        
        # Add to LRU cache (evict oldest if full)
        if len(self._embedding_cache) >= self._cache_max_size:
            # Remove oldest entry (first key in dict)
            oldest_key = next(iter(self._embedding_cache))
            del self._embedding_cache[oldest_key]
        self._embedding_cache[seq] = emb
        
        return emb.clone()  # Return clone to avoid mutation issues
    
    def get_cache_stats(self):
        """Return cache statistics."""
        total = self._cache_hits + self._cache_misses
        hit_rate = (self._cache_hits / total * 100) if total > 0 else 0
        on_the_fly = getattr(self, '_on_the_fly_count', 0)
        wt_missing = getattr(self, '_wt_missing_count', 0)
        return {
            "hits": self._cache_hits,
            "misses": self._cache_misses,
            "total": total,
            "hit_rate": hit_rate,
            "cache_size": len(self._embedding_cache),
            "cache_max": self._cache_max_size,
            "on_the_fly_generated": on_the_fly,
            "wt_embedding_failed": wt_missing
        }
    
    def print_cache_stats(self):
        """Print cache statistics."""
        stats = self.get_cache_stats()
        print(f"  [Cache] Hits: {stats['hits']:,} | Misses: {stats['misses']:,} | "
              f"Hit Rate: {stats['hit_rate']:.1f}% | Size: {stats['cache_size']:,}/{stats['cache_max']:,}")
        if stats['on_the_fly_generated'] > 0:
            print(f"  [Cache] On-the-fly generated: {stats['on_the_fly_generated']:,} embeddings")
        if stats['wt_embedding_failed'] > 0:
            print(f"  [Cache] ⚠️ WT embedding failures: {stats['wt_embedding_failed']:,} (excluded from ddG training)")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        
        # DEBUG: Track sequence difference statistics
        if not hasattr(self, '_seq_diff_stats'):
            self._seq_diff_stats = {'same': 0, 'different': 0, 'no_wt': 0}
        if not hasattr(self, '_mutpos_stats'):
            self._mutpos_stats = {'has_mutpos': 0, 'no_mutpos': 0}
        
        # LAZY LOADING: Load embeddings on-demand
        b1_mutpos = self._parse_mutpos(item["b1_mutpos_str"])
        b2_mutpos = self._parse_mutpos(item["b2_mutpos_str"])

        #region agent log
        try:
            if not hasattr(self, "_agent_mutpos_getitem_counter"):
                self._agent_mutpos_getitem_counter = 0
            if self._agent_mutpos_getitem_counter < 20:
                self._agent_mutpos_getitem_counter += 1
                payload = {
                    "sessionId": "debug-session",
                    "runId": "pre-fix",
                    "hypothesisId": "G",
                    "location": "modules.py:AdvancedSiameseDataset:__getitem__",
                    "message": "Parsed mut_positions passed to _get_embedding",
                    "data": {
                        "idx": int(idx),
                        "pdb": str(item.get("pdb")),
                        "is_wt": bool(item.get("is_wt")),
                        "b1_mutpos_str": str(item.get("b1_mutpos_str")),
                        "b2_mutpos_str": str(item.get("b2_mutpos_str")),
                        "b1_mutpos_n": int(len(b1_mutpos)),
                        "b2_mutpos_n": int(len(b2_mutpos)),
                        "b1_mutpos_first5": b1_mutpos[:5],
                        "b2_mutpos_first5": b2_mutpos[:5],
                    },
                    "timestamp": int(time.time() * 1000),
                }
                with open("/Users/supantha/Documents/code_v2/protein/.cursor/debug.log", "a") as f:
                    f.write(json.dumps(payload, default=str) + "\n")
                print(f"[AGENTLOG MUTPOSGET] idx={idx} b1n={len(b1_mutpos)} b2n={len(b2_mutpos)} b1str={item.get('b1_mutpos_str')} b2str={item.get('b2_mutpos_str')}")
        except Exception:
            pass
        #endregion
        
        # Track mutation position statistics
        if len(b1_mutpos) > 0 or len(b2_mutpos) > 0:
            self._mutpos_stats['has_mutpos'] += 1
        else:
            self._mutpos_stats['no_mutpos'] += 1
        
        # Log mutation position stats periodically
        total = sum(self._mutpos_stats.values())
        if total in [100, 1000, 10000]:
            has_mp = self._mutpos_stats['has_mutpos']
            no_mp = self._mutpos_stats['no_mutpos']
            print(f"  [MUTPOS] After {total} samples: {has_mp} have mutation positions ({100*has_mp/total:.1f}%), "
                  f"{no_mp} have NO mutation positions ({100*no_mp/total:.1f}%)")
        
        c1_emb = self._get_embedding(item["seq1"], b1_mutpos)
        c2_emb = self._get_embedding(item["seq2"], b2_mutpos)
        
        if self.normalize:
            c1_emb[:, :-1] = torch.nn.functional.normalize(c1_emb[:, :-1], p=2, dim=-1)
            c2_emb[:, :-1] = torch.nn.functional.normalize(c2_emb[:, :-1], p=2, dim=-1)
        
        # Load WT embeddings if available
        if item["seq1_wt"] is not None:
            # DEBUG: Track sequence differences
            seq1_same = (item["seq1"] == item["seq1_wt"])
            seq2_same = (item["seq2"] == item["seq2_wt"])
            if seq1_same and seq2_same:
                self._seq_diff_stats['same'] += 1
            else:
                self._seq_diff_stats['different'] += 1
            
            # Periodic logging
            total_samples = sum(self._seq_diff_stats.values())
            if total_samples in [100, 1000, 10000, 50000]:
                same = self._seq_diff_stats['same']
                diff = self._seq_diff_stats['different']
                no_wt = self._seq_diff_stats['no_wt']
                print(f"  [SEQ DIFF] After {total_samples} samples: {same} same seq ({100*same/total_samples:.1f}%), "
                      f"{diff} different ({100*diff/total_samples:.1f}%), {no_wt} no WT")
            
            b1_wtpos = self._parse_mutpos(item["b1_wtpos_str"])
            b2_wtpos = self._parse_mutpos(item["b2_wtpos_str"])

            #region agent log
            try:
                if not hasattr(self, "_agent_embed_call_counter_wt"):
                    self._agent_embed_call_counter_wt = 0
                if self._agent_embed_call_counter_wt < 10:
                    self._agent_embed_call_counter_wt += 1
                    print(
                        f"[AGENTLOG EMBCALL] idx={idx} role=wt "
                        f"b1_wtpos_n={len(b1_wtpos)} b2_wtpos_n={len(b2_wtpos)} "
                        f"seq1_wt_len={len(item.get('seq1_wt','') or '')} seq2_wt_len={len(item.get('seq2_wt','') or '')}"
                    )
            except Exception:
                pass
            #endregion
            
            try:
                cw1 = self._get_embedding(item["seq1_wt"], b1_wtpos)
                cw2 = self._get_embedding(item["seq2_wt"], b2_wtpos)
            except RuntimeError as e:
                # WT embedding unavailable - mark as no WT for this sample
                # DO NOT use mutant embedding as proxy - this corrupts the mutation signal!
                # Instead, set cw1, cw2 to None and let training handle missing WT
                cw1, cw2 = None, None
                if not hasattr(self, '_wt_missing_count'):
                    self._wt_missing_count = 0
                self._wt_missing_count += 1
                if self._wt_missing_count <= 3:  # Only log first 3 to avoid spam
                    print(f"  [WARN] WT embedding missing #{self._wt_missing_count}, sample will be WT-less: {e}")
            
            if cw1 is not None and self.normalize:
                cw1[:, :-1] = torch.nn.functional.normalize(cw1[:, :-1], p=2, dim=-1)
                cw2[:, :-1] = torch.nn.functional.normalize(cw2[:, :-1], p=2, dim=-1)
        else:
            cw1, cw2 = None, None
            self._seq_diff_stats['no_wt'] += 1
        
        data_tuple = (c1_emb, c2_emb, item["delg"],
                      cw1, cw2, item["delg_wt"])
        meta = {
            "pdb": item["pdb"],
            "is_wt": item["is_wt"],
            "has_real_wt": item["has_real_wt"],
            "has_dg": item["has_dg"],
            "has_ddg": item["has_ddg"],  # Whether sample has valid explicit ddG value
            "has_inferred_ddg": item["has_inferred_ddg"],  # Whether sample has inferred ddG (dG_mut - dG_wt)
            "has_both_dg_ddg": item["has_both_dg_ddg"],
            "ddg": item["ddg"],
            "ddg_inferred": item["ddg_inferred"],  # Inferred ddG value (needed for Fix #1)
            "has_any_wt": item["has_any_wt"],  # Include inferred WT status (CRITICAL!)
            "b1_mutpos": b1_mutpos,
            "b2_mutpos": b2_mutpos,
            "data_source": item["data_source"]
        }
        return (data_tuple, meta)

#########################################
# AffinityDataModule
#########################################
from sklearn.model_selection import GroupKFold

class AffinityDataModule(pl.LightningDataModule):
    """
    Data module for protein binding affinity prediction.
    
    Supports multiple splitting strategies:
    1. split_indices_dir: Load pre-computed cluster-based splits (RECOMMENDED)
    2. use_cluster_split: Create new cluster-based splits on the fly
    3. split column: Use existing 'split' column in CSV (legacy)
    4. num_folds > 1: GroupKFold on PDB IDs
    """
    def __init__(
        self,
        data_csv: str,
        protein_featurizer: ESM3Featurizer,
        embedding_dir: str = "precomputed_esm",
        batch_size: int = 32,
        num_workers: int = 4,
        shuffle: bool = True,
        num_folds: int = 1,
        fold_index: int = 0,
        # New cluster-based splitting options
        split_indices_dir: str = None,  # Path to pre-computed split indices
        benchmark_indices_dir: str = None,  # Path to balanced benchmark subset indices (optional override)
        use_cluster_split: bool = False,  # Create cluster-based splits on the fly
        train_ratio: float = 0.70,
        val_ratio: float = 0.15,
        test_ratio: float = 0.15,
        random_state: int = 42
    ):
        super().__init__()
        self.data_csv = data_csv
        self.featurizer = protein_featurizer
        self.embedding_dir = embedding_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.num_folds = num_folds
        self.fold_index = fold_index
        
        # Cluster-based splitting options
        self.split_indices_dir = split_indices_dir
        self.benchmark_indices_dir = benchmark_indices_dir  # Optional balanced benchmark override
        self.use_cluster_split = use_cluster_split
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio
        self.random_state = random_state

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        
        # Dual-split datasets (separate for dG and ddG heads)
        self.dg_train_dataset = None  # WT-only training set for Stage A
        self.ddg_train_dataset = None  # Mutation training set for Stage B
        self.dg_val_dataset = None
        self.dg_test_dataset = None
        self.ddg_val_dataset = None
        self.ddg_test_dataset = None
        self.use_dual_split = False

    def prepare_data(self):
        if not os.path.exists(self.data_csv):
            raise FileNotFoundError(f"Data CSV not found => {self.data_csv}")

    def setup(self, stage=None):
        data = pd.read_csv(self.data_csv, low_memory=False)
        
        # Check if this is a dual-split directory
        dual_split_file = os.path.join(self.split_indices_dir, 'dg_val_indices.csv') if self.split_indices_dir else None
        
        # Strategy 0: Load DUAL splits (separate for dG and ddG heads)
        if self.split_indices_dir and dual_split_file and os.path.exists(dual_split_file):
            from data_splitting import load_dual_splits
            print(f"\n[DataModule] Loading DUAL splits from {self.split_indices_dir}")
            
            splits = load_dual_splits(self.split_indices_dir)
            self.use_dual_split = True
            
            # Combined training set (union of dG and ddG train indices)
            train_idx = splits['combined_train']
            train_df = data.iloc[train_idx].reset_index(drop=True)
            
            # For backward compatibility, use ddG validation as default val set
            # (since most validation is on mutation data)
            val_idx = splits['ddg']['val']
            val_df = data.iloc[val_idx].reset_index(drop=True)
            test_idx = splits['ddg']['test']
            test_df = data.iloc[test_idx].reset_index(drop=True)
            
            # Create separate datasets for each head
            # CRITICAL: Create separate dG (WT-only) and ddG (MT-only) TRAINING sets
            # This fixes Stage A WT starvation where WT is diluted to 2.75% in combined_train
            dg_train_df = data.iloc[splits['dg']['train']].reset_index(drop=True)
            ddg_train_df = data.iloc[splits['ddg']['train']].reset_index(drop=True)
            
            dg_val_df = data.iloc[splits['dg']['val']].reset_index(drop=True)
            dg_test_df = data.iloc[splits['dg']['test']].reset_index(drop=True)
            ddg_val_df = data.iloc[splits['ddg']['val']].reset_index(drop=True)
            ddg_test_df = data.iloc[splits['ddg']['test']].reset_index(drop=True)
            
            print(f"\n[DataModule] Creating dG TRAIN dataset ({len(dg_train_df)} WT rows)...")
            self.dg_train_dataset = AdvancedSiameseDataset(dg_train_df, self.featurizer, self.embedding_dir, augment=False)  # Baseline: no augment
            
            print(f"[DataModule] Creating ddG TRAIN dataset ({len(ddg_train_df)} MT rows)...")
            self.ddg_train_dataset = AdvancedSiameseDataset(ddg_train_df, self.featurizer, self.embedding_dir, augment=False)  # Baseline: no augment
            
            # === BALANCED BENCHMARK OVERRIDE ===
            # If benchmark_indices_dir is provided, use those for ddG val/test instead
            if self.benchmark_indices_dir and os.path.exists(self.benchmark_indices_dir):
                print(f"\n[DataModule] Loading BALANCED BENCHMARK indices from {self.benchmark_indices_dir}")
                
                # Load ddG benchmark val indices
                ddg_val_bench_file = os.path.join(self.benchmark_indices_dir, 'ddg_val_benchmark_indices.csv')
                if os.path.exists(ddg_val_bench_file):
                    bench_val_idx = pd.read_csv(ddg_val_bench_file, header=None).iloc[:, 0].values.tolist()
                    ddg_val_df = data.iloc[bench_val_idx].reset_index(drop=True)
                    print(f"  ddG val: {len(ddg_val_df)} rows (balanced benchmark)")
                
                # Load ddG benchmark test indices
                ddg_test_bench_file = os.path.join(self.benchmark_indices_dir, 'ddg_test_benchmark_indices.csv')
                if os.path.exists(ddg_test_bench_file):
                    bench_test_idx = pd.read_csv(ddg_test_bench_file, header=None).iloc[:, 0].values.tolist()
                    ddg_test_df = data.iloc[bench_test_idx].reset_index(drop=True)
                    print(f"  ddG test: {len(ddg_test_df)} rows (balanced benchmark)")
            
            print(f"\n[DataModule] Creating dG val dataset ({len(dg_val_df)} rows)...")
            # NOTE: Do NOT subsample validation - we want accurate metrics on full set
            self.dg_val_dataset = AdvancedSiameseDataset(
                dg_val_df, self.featurizer, self.embedding_dir, augment=False,
                wt_reference_df=data # FIX: Use full data for WT lookup (robust to split boundaries)
            )
            
            print(f"\n[DataModule] Creating dG test dataset ({len(dg_test_df)} rows)...")
            self.dg_test_dataset = AdvancedSiameseDataset(
                dg_test_df, self.featurizer, self.embedding_dir, augment=False,
                wt_reference_df=data # FIX: Use full data for WT lookup
            )
            
            print(f"\n[DataModule] Creating ddG val dataset ({len(ddg_val_df)} rows)...")
            # NOTE: Do NOT subsample validation - we want accurate metrics on full set
            # cap_k only applies to training DMS data
            self.ddg_val_dataset = AdvancedSiameseDataset(
                ddg_val_df, self.featurizer, self.embedding_dir, augment=False,
                wt_reference_df=data # FIX: Use full data for WT lookup
            )
            
            print(f"\n[DataModule] Creating ddG test dataset ({len(ddg_test_df)} rows)...")
            self.ddg_test_dataset = AdvancedSiameseDataset(
                ddg_test_df, self.featurizer, self.embedding_dir, augment=False,
                wt_reference_df=data # FIX: Use full data for WT lookup
            )
            
            print(f"\n[DataModule] Dual split datasets created:")
            print(f"  dG train: {len(self.dg_train_dataset)} samples (WT-only for Stage A)")
            print(f"  ddG train: {len(self.ddg_train_dataset)} samples (MT-only)")
            print(f"  dG val:   {len(self.dg_val_dataset)} samples")
            print(f"  dG test:  {len(self.dg_test_dataset)} samples")
            print(f"  ddG val:  {len(self.ddg_val_dataset)} samples")
            print(f"  ddG test: {len(self.ddg_test_dataset)} samples")
            
        # Strategy 1: Load pre-computed cluster-based splits (single split)
        elif self.split_indices_dir and os.path.exists(self.split_indices_dir):
            from data_splitting import load_split_indices, verify_no_leakage
            train_idx, val_idx, test_idx = load_split_indices(self.split_indices_dir)
            
            train_df = data.iloc[train_idx].reset_index(drop=True)
            val_df = data.iloc[val_idx].reset_index(drop=True)
            test_df = data.iloc[test_idx].reset_index(drop=True)
            
            # Verify no leakage
            verify_no_leakage(data, train_idx, val_idx, test_idx)
            
        # Strategy 2: Create cluster-based splits on the fly
        elif self.use_cluster_split:
            from data_splitting import create_cluster_splits, verify_no_leakage
            
            # Create splits directory if needed
            splits_dir = os.path.join(os.path.dirname(self.data_csv), 'splits')
            
            train_idx, val_idx, test_idx = create_cluster_splits(
                data,
                train_ratio=self.train_ratio,
                val_ratio=self.val_ratio,
                test_ratio=self.test_ratio,
                random_state=self.random_state,
                save_dir=splits_dir
            )
            
            train_df = data.iloc[train_idx].reset_index(drop=True)
            val_df = data.iloc[val_idx].reset_index(drop=True)
            test_df = data.iloc[test_idx].reset_index(drop=True)
            
        # Strategy 3: Legacy - use 'split' column in CSV
        else:
            # must have block1_sequence, block1_mut_positions, block2_sequence, ...
            bench_df = data[data["split"]=="Benchmark test"].copy()
            trainval_df = data[data["split"]!="Benchmark test"].copy()

            if self.num_folds > 1:
                gkf = GroupKFold(n_splits=self.num_folds)
                groups = trainval_df["#Pdb"].values
                folds = list(gkf.split(trainval_df, groups=groups))
                train_idx, val_idx = folds[self.fold_index]
                train_df = trainval_df.iloc[train_idx].reset_index(drop=True)
                val_df = trainval_df.iloc[val_idx].reset_index(drop=True)
            else:
                train_df = trainval_df[trainval_df["split"]=="train"].reset_index(drop=True)
                val_df = trainval_df[trainval_df["split"]=="val"].reset_index(drop=True)

            test_df = bench_df

        print(f"\n[DataModule] Creating TRAIN dataset ({len(train_df)} rows)...")
        self.train_dataset = AdvancedSiameseDataset(
            train_df, self.featurizer, self.embedding_dir, augment=False  # Baseline: no augment (enable later for antisymmetry)
        )
        print(f"\n[DataModule] Creating VAL dataset ({len(val_df)} rows)...")
        # Subsampling disabled for v20 ablation to ensure robust Macro-PCC evaluation
        # (need full diversity of PDB families for honest reporting)
        self.val_dataset = AdvancedSiameseDataset(
            val_df, self.featurizer, self.embedding_dir, augment=False,
            wt_reference_df=train_df  # Pass training set as source for WTs
        )
        print(f"\n[DataModule] Creating TEST dataset ({len(test_df)} rows)...")
        self.test_dataset = AdvancedSiameseDataset(
            test_df, self.featurizer, self.embedding_dir, augment=False,
            wt_reference_df=train_df  # Pass training set as source for WTs (no leakage, WTs are known)
        )
        
        # FIX: Create separate dg_test and ddg_test datasets for proper test metric logging
        # This is CRITICAL for sweep runs - without this, test metrics are never computed!
        if self.dg_test_dataset is None and self.ddg_test_dataset is None:
            # Determine WT/MT based on Mutation(s)_cleaned column
            def is_wt_row(row):
                mut_str = str(row.get('Mutation(s)_cleaned', '')).strip()
                return mut_str == '' or mut_str.lower() == 'nan' or mut_str == 'WT'
            
            # Separate test_df into WT (for dG test) and MT (for ddG test)
            test_is_wt = test_df.apply(is_wt_row, axis=1)
            dg_test_df = test_df[test_is_wt].reset_index(drop=True)
            ddg_test_df = test_df[~test_is_wt].reset_index(drop=True)
            
            if len(dg_test_df) > 0:
                print(f"\n[DataModule] Creating dG TEST dataset ({len(dg_test_df)} WT rows)...")
                self.dg_test_dataset = AdvancedSiameseDataset(
                    dg_test_df, self.featurizer, self.embedding_dir, augment=False,
                    wt_reference_df=data  # Use full data for WT lookup
                )
            else:
                print(f"[DataModule] WARNING: No WT rows in test set for dG test dataset!")
                
            if len(ddg_test_df) > 0:
                print(f"\n[DataModule] Creating ddG TEST dataset ({len(ddg_test_df)} MT rows)...")
                self.ddg_test_dataset = AdvancedSiameseDataset(
                    ddg_test_df, self.featurizer, self.embedding_dir, augment=False,
                    wt_reference_df=data  # Use full data for WT lookup
                )
            else:
                print(f"[DataModule] WARNING: No MT rows in test set for ddG test dataset!")
        
        # Log dataset sizes
        print(f"\nDataset sizes:")
        print(f"  Train: {len(self.train_dataset)} samples")
        print(f"  Val:   {len(self.val_dataset)} samples")
        print(f"  Test:  {len(self.test_dataset)} samples")
        if self.dg_test_dataset:
            print(f"  dG Test:  {len(self.dg_test_dataset)} samples (WT)")
        if self.ddg_test_dataset:
            print(f"  ddG Test: {len(self.ddg_test_dataset)} samples (MT)")


    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    # Dual-split training dataloaders for separate dG-only (Stage A) and ddG (Stage B) training
    def dg_train_dataloader(self):
        """Training dataloader for dG head (WT data only for Stage A pretraining)."""
        if self.dg_train_dataset is None:
            return None
        return DataLoader(
            self.dg_train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    def ddg_train_dataloader(self):
        """Training dataloader for ddG head (mutation data for Stage B training)."""
        if self.ddg_train_dataset is None:
            return None
        return DataLoader(
            self.ddg_train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    # Dual-split dataloaders for separate dG and ddG validation
    def dg_val_dataloader(self):
        """Validation dataloader for dG head (WT data only)."""
        if self.dg_val_dataset is None:
            return None
        return DataLoader(
            self.dg_val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    def dg_test_dataloader(self):
        """Test dataloader for dG head (WT data only)."""
        if self.dg_test_dataset is None:
            return None
        return DataLoader(
            self.dg_test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    def ddg_val_dataloader(self):
        """Validation dataloader for ddG head (mutation data including DMS)."""
        if self.ddg_val_dataset is None:
            return None
        return DataLoader(
            self.ddg_val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )
    
    def ddg_test_dataloader(self):
        """Test dataloader for ddG head (mutation data including DMS)."""
        if self.ddg_test_dataset is None:
            return None
        return DataLoader(
            self.ddg_test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=advanced_collate_fn
        )