Spaces:

kaurm43
/

PolyFusionAgent

Running

File size: 66,915 Bytes

import os
import random
import time
from pathlib import Path
import math
import json
import shutil
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sys
import csv
import copy
from typing import List, Dict, Optional, Tuple, Any

# Raise the CSV field size limit so very large serialized cells can be parsed.
# csv.field_size_limit() stores the value in a C long; on platforms where a
# C long is 32-bit (e.g. Windows) sys.maxsize does not fit and the call raises
# OverflowError, so back off until the platform accepts the value.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 10

# =============================================================================
# Imports: Shared encoders/helpers from PolyFusion
# =============================================================================
from PolyFusion.GINE import GineEncoder, match_edge_attr_to_index, safe_get
from PolyFusion.SchNet import NodeSchNetWrapper
from PolyFusion.Transformer import PooledFingerprintEncoder as FingerprintEncoder
from PolyFusion.DeBERTav2 import PSMILESDebertaEncoder, build_psmiles_tokenizer

# =============================================================================
# Configuration
# =============================================================================
# NOTE: every "/path/to/..." value below is a placeholder string and must be
# replaced with a real location before the script can run.
BASE_DIR = "/path/to/Polymer_Foundational_Model"
POLYINFO_PATH = "/path/to/polyinfo_with_modalities.csv"

# Pretrained encoder directories (one per modality, plus the fused multimodal run)
PRETRAINED_MULTIMODAL_DIR = "/path/to/multimodal_output/best"
BEST_GINE_DIR = "/path/to/gin_output/best"
BEST_SCHNET_DIR = "/path/to/schnet_output/best"
BEST_FP_DIR = "/path/to/fingerprint_mlm_output/best"
BEST_PSMILES_DIR = "/path/to/polybert_output/best"

# Output log file (per-run json lines + per-property aggregated summary)
OUTPUT_RESULTS = "/path/to/multimodal_downstream_results.txt"

# Directory to save best-performing checkpoint bundle per property (best CV run)
BEST_WEIGHTS_DIR = "/path/to/multimodal_downstream_bestweights"

# -----------------------------------------------------------------------------
# Model sizes / dims
# NOTE(review): presumably these must match the values used when the encoders
# were pretrained, otherwise checkpoint shapes will not line up — confirm.
# -----------------------------------------------------------------------------
MAX_ATOMIC_Z = 85
# One past the largest atomic number; presumably reserved as the mask-atom id — confirm.
MASK_ATOM_ID = MAX_ATOMIC_Z + 1

# GINE
# NODE_EMB_DIM is the input width of the GINE projection head in MultimodalContrastiveModel.
NODE_EMB_DIM = 300
EDGE_EMB_DIM = 300
NUM_GNN_LAYERS = 5

# SchNet
SCHNET_NUM_GAUSSIANS = 50
SCHNET_NUM_INTERACTIONS = 6
SCHNET_CUTOFF = 10.0
SCHNET_MAX_NEIGHBORS = 64
# SCHNET_HIDDEN is the input width of the SchNet projection head in MultimodalContrastiveModel.
SCHNET_HIDDEN = 600

# Fingerprints
# FP_LENGTH is the fingerprint length passed to the fingerprint parser.
FP_LENGTH = 2048
# NOTE(review): presumably tokens are {0, 1, mask} given a vocabulary of 3 — confirm.
MASK_TOKEN_ID_FP = 2
VOCAB_SIZE_FP = 3

# Contrastive embedding dim (default shared embedding size of MultimodalContrastiveModel)
CL_EMB_DIM = 600

# PSMILES/DeBERTa
# DEBERTA_HIDDEN is the fallback input width of the PSMILES projection head when
# the encoder does not expose its own hidden size.
DEBERTA_HIDDEN = 600
# Maximum token length used when building the tokenizer and tokenizing PSMILES.
PSMILES_MAX_LEN = 128

# -----------------------------------------------------------------------------
# Fusion + regression head hyperparameters
# -----------------------------------------------------------------------------
POLYF_EMB_DIM = 600
POLYF_ATTN_HEADS = 8
POLYF_DROPOUT = 0.1
POLYF_FF_MULT = 4  # FFN hidden = 4*d

# -----------------------------------------------------------------------------
# Fine-tuning parameters (single-task per property)
# -----------------------------------------------------------------------------
# NOTE(review): MAX_LEN has the same value as PSMILES_MAX_LEN; keep them in sync
# if either is changed.
MAX_LEN = 128
BATCH_SIZE = 32
NUM_EPOCHS = 100
PATIENCE = 10
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.0

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Properties to evaluate (matched against dataset column names by find_property_columns)
REQUESTED_PROPERTIES = [
    "density",
    "glass transition",
    "melting",
    "thermal decomposition"
]

# True K-fold evaluation to match "fivefold per property"
NUM_RUNS = 5
TEST_SIZE = 0.10
VAL_SIZE_WITHIN_TRAINVAL = 0.10  # fraction of trainval reserved for val split

# Duplicate aggregation (noise reduction) key preference order:
# the first name in this list that exists as a DataFrame column is used.
AGG_KEYS_PREFERENCE = ["polymer_id", "PolymerID", "poly_id", "psmiles", "smiles", "canonical_smiles"]

# =============================================================================
# Utilities
# =============================================================================
def set_seed(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        # Covers every visible GPU, not just the current one.
        torch.cuda.manual_seed_all(seed)

    # Trade throughput for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def _json_safe_key(key):
    """Return a hashable, JSON-compatible dictionary key for *key*.

    Keys whose converted form is already a JSON primitive (str, int, float,
    bool, None) keep that form; anything else (tuples, sets, arrays, arbitrary
    objects) is stringified, because a converted list/dict cannot be used as a
    dictionary key at all.
    """
    converted = make_json_serializable(key)
    if converted is None or isinstance(converted, (str, int, float, bool)):
        return converted
    return str(key)


def make_json_serializable(obj):
    """Convert common numpy/torch/pandas objects into JSON-safe Python types.

    Conversion rules:
      - dict: values are converted recursively; keys are converted with
        ``_json_safe_key`` so that non-primitive keys (e.g. tuples) become
        strings instead of raising ``TypeError: unhashable type``.
      - list / tuple / set: converted element-wise into a list.
      - numpy array / numpy scalar: ``tolist()`` / ``item()``.
      - torch tensor: detached, moved to CPU and converted with ``tolist()``;
        ``None`` if that conversion fails.
      - pandas Timestamp / Timedelta: ``str(obj)``.
      - anything else is returned unchanged (the caller decides how to handle
        objects that are still not serializable).
    """
    if isinstance(obj, dict):
        return {_json_safe_key(k): make_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [make_json_serializable(x) for x in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        try:
            return obj.item()
        except Exception:
            return float(obj)
    if isinstance(obj, torch.Tensor):
        try:
            return obj.detach().cpu().tolist()
        except Exception:
            return None
    if isinstance(obj, (pd.Timestamp, pd.Timedelta)):
        return str(obj)
    # Primitives and unknown objects alike are passed through untouched.
    return obj


def summarize_state_dict_load(full_state: dict, model_state: dict, filtered_state: dict):
    """
    Print a short report describing how a checkpoint maps onto a model.

    Args:
        full_state: every entry found in the checkpoint.
        model_state: the model's own state dict.
        filtered_state: the subset of checkpoint entries that will be loaded.

    The report lists the three sizes above, how many checkpoint keys are absent
    from the model, how many differ in shape, and up to ten examples of each.
    """
    not_in_model = []
    mismatched = []
    for name, ckpt_value in full_state.items():
        if name not in model_state:
            not_in_model.append(name)
            continue
        # Entries without a ``shape`` attribute cannot be shape-compared.
        if not hasattr(ckpt_value, "shape"):
            continue
        if tuple(ckpt_value.shape) != tuple(model_state[name].shape):
            mismatched.append(name)

    print("\n[CKPT LOAD SUMMARY]")
    print(f"  ckpt keys:   {len(full_state)}")
    print(f"  model keys:  {len(model_state)}")
    print(f"  loaded keys: {len(filtered_state)}")
    print(f"  skipped (not in model):     {len(not_in_model)}")
    print(f"  skipped (shape mismatch):   {len(mismatched)}")

    if not_in_model:
        print("  examples skipped (not in model):", not_in_model[:10])
    if mismatched:
        print("  examples skipped (shape mismatch):")
        for name in mismatched[:10]:
            ckpt_shape = tuple(full_state[name].shape)
            model_shape = tuple(model_state[name].shape)
            print(f"    {name}: ckpt={ckpt_shape} model={model_shape}")
    print("")

def find_property_columns(columns):
    """
    Map each requested property name to a dataset column.

    Matching strategy, per entry of ``REQUESTED_PROPERTIES``:
      1. Whole-token match: the column name is lower-cased, underscores are
         treated as spaces, and the requested phrase must appear on token
         boundaries (so "glass transition" matches
         "glass_transition_temperature", but "density" does not match
         "densityx"). The first such column wins.
      2. Substring fallback: any column containing the requested text, either
         verbatim or after the same underscore/whitespace normalisation. When
         this is ambiguous (zero or several candidates) the first candidate, or
         ``None``, is chosen and the decision is printed.

    For "density", columns mentioning "cohesive" are never considered, so that
    cohesive-energy-density columns are not picked up by mistake.

    Args:
        columns: iterable of column names.

    Returns:
        dict mapping every requested property to the chosen original column
        name, or to ``None`` when nothing matched.
    """
    def _normalize(text):
        # Lower-case, underscores -> spaces, collapse runs of whitespace.
        return " ".join(str(text).lower().replace("_", " ").split())

    lowered = {str(c).lower(): c for c in columns}
    found = {}

    for req in REQUESTED_PROPERTIES:
        req_low = req.lower().strip()
        req_norm = _normalize(req_low)
        skip_cohesive = req_low == "density"

        # Pass 1: whole-token (phrase) match, safer than a raw substring match.
        exact = None
        for c_low, c_orig in lowered.items():
            if skip_cohesive and "cohesive" in c_low:
                continue
            # Padding both sides with a space restricts the match to token
            # boundaries while still allowing multi-word requests.
            phrase_hit = bool(req_norm) and f" {req_norm} " in f" {_normalize(c_low)} "
            if phrase_hit or c_low == req_low:
                exact = c_orig
                break

        if exact is not None:
            found[req] = exact
            continue

        # Pass 2: substring match as fallback (raw and normalised forms).
        candidates = [
            c_orig
            for c_low, c_orig in lowered.items()
            if req_low in c_low or (req_norm and req_norm in _normalize(c_low))
        ]
        if skip_cohesive:
            candidates = [c for c in candidates if "cohesive" not in str(c).lower()]

        if len(candidates) == 1:
            found[req] = candidates[0]
        else:
            chosen = candidates[0] if candidates else None
            found[req] = chosen
            print(f"[COLMAP] Requested '{req}' -> chosen column: {chosen}")
            if candidates:
                print(f"[COLMAP] Candidates for '{req}': {candidates}")
            else:
                print(f"[COLMAP][WARN] No candidates found for '{req}' using substring search.")
    return found


def choose_aggregation_key(df: pd.DataFrame) -> Optional[str]:
    """Return the first name in ``AGG_KEYS_PREFERENCE`` that is a column of *df*.

    Returns ``None`` when none of the preferred identifier columns exist.
    """
    available = (name for name in AGG_KEYS_PREFERENCE if name in df.columns)
    return next(available, None)


def aggregate_polyinfo_duplicates(df: pd.DataFrame, modality_cols: List[str], property_cols: List[str]) -> pd.DataFrame:
    """
    Optional noise reduction: group duplicate polymer entries and average properties.

    - The grouping key is picked by ``choose_aggregation_key``.
    - Rows whose key is missing (NaN/None) or blank are dropped before grouping.
      Casting a missing key to ``str`` would otherwise turn it into the literal
      "nan" and merge unrelated rows into one averaged group.
    - Modalities are taken as "first" (they should be consistent per polymer key).
    - Properties are averaged (mean).

    Args:
        df: input table.
        modality_cols: columns kept via "first" within each group.
        property_cols: columns averaged within each group.

    Returns:
        The grouped frame (key column + aggregated columns), or *df* unchanged
        when no key column exists, no row has a usable key, or there is nothing
        to aggregate.
    """
    key = choose_aggregation_key(df)
    if key is None:
        print("[AGG] No aggregation key found; skipping duplicate aggregation.")
        return df

    # Filter missing keys first: astype(str) would hide them as "nan"/"None".
    df2 = df[df[key].notna()].copy()
    df2[key] = df2[key].astype(str)
    df2 = df2[df2[key].str.strip() != ""].copy()
    if len(df2) == 0:
        print("[AGG] Aggregation key exists but is empty; skipping duplicate aggregation.")
        return df

    agg_dict = {}
    for mc in modality_cols:
        # The grouping column is already kept by as_index=False; aggregating it
        # as well is redundant and can collide when pandas re-inserts the key.
        if mc in df2.columns and mc != key:
            agg_dict[mc] = "first"
    for pc in property_cols:
        if pc in df2.columns and pc != key:
            agg_dict[pc] = "mean"

    if not agg_dict:
        print("[AGG] No modality/property columns to aggregate; skipping duplicate aggregation.")
        return df

    grouped = df2.groupby(key, as_index=False).agg(agg_dict)
    print(f"[AGG] Grouped by '{key}': {len(df)} rows -> {len(grouped)} unique keys")
    return grouped


def _sanitize_name(s: str) -> str:
    """Create a filesystem-safe name for property directories."""
    s2 = str(s).strip().lower()
    keep = []
    for ch in s2:
        if ch.isalnum():
            keep.append(ch)
        elif ch in (" ", "-", "_", "."):
            keep.append("_")
        else:
            keep.append("_")
    out = "".join(keep)
    while "__" in out:
        out = out.replace("__", "_")
    out = out.strip("_")
    return out or "property"


# =============================================================================
# Multimodal backbone: encode + project + modality-aware fusion
# =============================================================================
class MultimodalContrastiveModel(nn.Module):
    """
    Multimodal encoder wrapper:

      1) Runs each available modality encoder:
           - GINE (graph)
           - SchNet (3D geometry)
           - Transformer FP encoder (Morgan bit sequence)
           - DeBERTa-based PSMILES encoder (sequence)

      2) Projects each modality embedding to a shared dim (emb_dim).

      3) Normalizes each modality embedding (L2), drops out, then fuses via
         a masked mean across modalities that are present for each sample.

      4) Normalizes the final fused embedding (L2).

    Expected downstream usage:
        z = model(batch_mods, modality_mask=modality_mask)  # (B, emb_dim)
    """

    def __init__(
        self,
        gine_encoder: Optional[nn.Module] = None,
        schnet_encoder: Optional[nn.Module] = None,
        fp_encoder: Optional[nn.Module] = None,
        psmiles_encoder: Optional[nn.Module] = None,
        *,
        emb_dim: int = CL_EMB_DIM,
        modalities: Optional[List[str]] = None,
        dropout: float = 0.1,
        psmiles_tokenizer: Optional[Any] = None,
    ):
        """
        Args:
            gine_encoder / schnet_encoder / fp_encoder / psmiles_encoder:
                optional modality encoders; a modality whose encoder is None
                gets no projection head and is skipped in ``forward``.
            emb_dim: width of the shared embedding space.
            modalities: explicit list of enabled modality names; unknown names
                are dropped. When None, every modality with an encoder is enabled.
            dropout: dropout probability applied to each modality embedding.
            psmiles_tokenizer: tokenizer used by ``encode_psmiles`` and
                ``encode_multimodal``.
        """
        super().__init__()

        self.gine = gine_encoder
        self.schnet = schnet_encoder
        self.fp = fp_encoder
        self.psmiles = psmiles_encoder
        self.psm_tok = psmiles_tokenizer

        self.emb_dim = int(emb_dim)
        self.out_dim = self.emb_dim
        self.dropout = nn.Dropout(float(dropout))

        # Determine which modalities are enabled
        if modalities is None:
            mods = []
            if self.gine is not None:
                mods.append("gine")
            if self.schnet is not None:
                mods.append("schnet")
            if self.fp is not None:
                mods.append("fp")
            if self.psmiles is not None:
                mods.append("psmiles")
            self.modalities = mods
        else:
            self.modalities = [m for m in modalities if m in ("gine", "schnet", "fp", "psmiles")]

        # Projection heads into shared embedding space
        self.proj_gine = nn.Linear(NODE_EMB_DIM, self.emb_dim) if self.gine is not None else None
        self.proj_schnet = nn.Linear(SCHNET_HIDDEN, self.emb_dim) if self.schnet is not None else None
        # NOTE(review): 256 is hard-coded; presumably the FP encoder's pooled
        # output width — confirm against the fingerprint encoder.
        self.proj_fp = nn.Linear(256, self.emb_dim) if self.fp is not None else None

        # Infer PSMILES hidden size if possible; fallback to DEBERTA_HIDDEN
        psm_in = None
        if self.psmiles is not None:
            if hasattr(self.psmiles, "out_dim"):
                try:
                    psm_in = int(self.psmiles.out_dim)
                except Exception:
                    psm_in = None
            if psm_in is None and hasattr(self.psmiles, "model") and hasattr(self.psmiles.model, "config"):
                try:
                    psm_in = int(self.psmiles.model.config.hidden_size)
                except Exception:
                    psm_in = None
            if psm_in is None:
                psm_in = int(DEBERTA_HIDDEN)

        self.proj_psmiles = nn.Linear(psm_in, self.emb_dim) if (self.psmiles is not None) else None

    def freeze_cl_encoders(self):
        """Freeze all modality encoders (optional for evaluation-only usage)."""
        for enc in (self.gine, self.schnet, self.fp, self.psmiles):
            if enc is None:
                continue
            enc.eval()
            for p in enc.parameters():
                p.requires_grad = False

    def _module_device(self) -> torch.device:
        """Device of the first parameter, or CPU when the module has no parameters."""
        first = next(self.parameters(), None)
        return first.device if first is not None else torch.device("cpu")

    def _masked_mean_combine(self, zs: List[torch.Tensor], masks: List[torch.Tensor]) -> torch.Tensor:
        """
        Compute sample-wise mean over available modalities.

        zs:    list of modality embeddings, each (B,D)
        masks: list of modality presence masks, each (B,) bool
        returns: (B,D); a single zero row (1,D) when ``zs`` is empty
        """
        if not zs:
            return torch.zeros((1, self.emb_dim), device=self._module_device())

        device = zs[0].device
        B = zs[0].size(0)

        sum_z = torch.zeros((B, self.emb_dim), device=device)
        count = torch.zeros((B, 1), device=device)

        for z, m in zip(zs, masks):
            m = m.to(device).view(B, 1).float()
            sum_z = sum_z + z * m
            count = count + m

        # Samples with no modality at all keep a zero vector instead of dividing by zero.
        count = count.clamp(min=1.0)
        return sum_z / count

    def forward(self, batch_mods: dict, modality_mask: Optional[dict] = None) -> torch.Tensor:
        """
        Encode, project and fuse every enabled modality found in ``batch_mods``.

        batch_mods keys: 'gine', 'schnet', 'fp', 'psmiles'
        modality_mask: dict {modality_name: (B,) bool} describing presence.
                       A modality without an entry is treated as present for
                       every sample.

        Returns the L2-normalized fused embedding of shape (B, emb_dim). If the
        batch size cannot be inferred a single zero row (1, emb_dim) is
        returned; if no modality produced an embedding, zeros of (B, emb_dim).
        """
        device = self._module_device()

        zs = []
        ms = []

        # Infer batch size B
        B = None
        if modality_mask is not None:
            for _, v in modality_mask.items():
                if isinstance(v, torch.Tensor) and v.numel() > 0:
                    B = int(v.size(0))
                    break

        if B is None:
            if "fp" in batch_mods and batch_mods["fp"] is not None and isinstance(batch_mods["fp"].get("input_ids", None), torch.Tensor):
                B = int(batch_mods["fp"]["input_ids"].size(0))
            elif "psmiles" in batch_mods and batch_mods["psmiles"] is not None and isinstance(batch_mods["psmiles"].get("input_ids", None), torch.Tensor):
                B = int(batch_mods["psmiles"]["input_ids"].size(0))

        if B is None:
            return torch.zeros((1, self.emb_dim), device=device)

        def _get_mask(name: str) -> torch.Tensor:
            if modality_mask is not None and name in modality_mask and isinstance(modality_mask[name], torch.Tensor):
                return modality_mask[name].to(device).bool()
            return torch.ones((B,), device=device, dtype=torch.bool)

        def _to_dev(t):
            """Move a tensor to the model device; anything else becomes None."""
            return t.to(device) if isinstance(t, torch.Tensor) else None

        def _collect(z: torch.Tensor, name: str) -> None:
            """Normalize, apply dropout and register one modality embedding."""
            z = F.normalize(z, dim=-1)
            z = self.dropout(z)
            zs.append(z)
            ms.append(_get_mask(name))

        # -------------------------
        # GINE (graph modality)
        # -------------------------
        if "gine" in self.modalities and self.gine is not None and batch_mods.get("gine", None) is not None:
            g = batch_mods["gine"]
            if isinstance(g.get("z", None), torch.Tensor) and g["z"].numel() > 0:
                edge_index = _to_dev(g.get("edge_index", None))
                if edge_index is None:
                    edge_index = torch.empty((2, 0), dtype=torch.long, device=device)
                edge_attr = _to_dev(g.get("edge_attr", None))
                if edge_attr is None:
                    edge_attr = torch.zeros((0, 3), dtype=torch.float, device=device)
                emb_g = self.gine(
                    g["z"].to(device),
                    _to_dev(g.get("chirality", None)),
                    _to_dev(g.get("formal_charge", None)),
                    edge_index,
                    edge_attr,
                    _to_dev(g.get("batch", None)),
                )
                _collect(self.proj_gine(emb_g) if self.proj_gine is not None else emb_g, "gine")

        # -------------------------
        # SchNet (3D geometry)
        # -------------------------
        if "schnet" in self.modalities and self.schnet is not None and batch_mods.get("schnet", None) is not None:
            s = batch_mods["schnet"]
            if isinstance(s.get("z", None), torch.Tensor) and s["z"].numel() > 0:
                pos = _to_dev(s.get("pos", None))
                if pos is None:
                    pos = torch.zeros((0, 3), device=device)
                emb_s = self.schnet(
                    s["z"].to(device),
                    pos,
                    _to_dev(s.get("batch", None)),
                )
                _collect(self.proj_schnet(emb_s) if self.proj_schnet is not None else emb_s, "schnet")

        # -------------------------
        # Fingerprint modality
        # -------------------------
        if "fp" in self.modalities and self.fp is not None and batch_mods.get("fp", None) is not None:
            f = batch_mods["fp"]
            if isinstance(f.get("input_ids", None), torch.Tensor) and f["input_ids"].numel() > 0:
                emb_f = self.fp(
                    f["input_ids"].to(device),
                    _to_dev(f.get("attention_mask", None)),
                )
                _collect(self.proj_fp(emb_f) if self.proj_fp is not None else emb_f, "fp")

        # -------------------------
        # PSMILES text modality
        # -------------------------
        if "psmiles" in self.modalities and self.psmiles is not None and batch_mods.get("psmiles", None) is not None:
            p = batch_mods["psmiles"]
            if isinstance(p.get("input_ids", None), torch.Tensor) and p["input_ids"].numel() > 0:
                emb_p = self.psmiles(
                    p["input_ids"].to(device),
                    _to_dev(p.get("attention_mask", None)),
                )
                _collect(self.proj_psmiles(emb_p) if self.proj_psmiles is not None else emb_p, "psmiles")

        # Fuse and normalize
        if not zs:
            return torch.zeros((B, self.emb_dim), device=device)

        z = self._masked_mean_combine(zs, ms)
        z = F.normalize(z, dim=-1)
        return z

    @torch.no_grad()
    def encode_psmiles(
        self,
        psmiles_list: List[str],
        max_len: int = PSMILES_MAX_LEN,
        batch_size: int = 64,
        device: str = DEVICE
    ) -> np.ndarray:
        """
        PSMILES embeddings.

        Tokenizes ``psmiles_list`` in chunks of ``batch_size``, runs the PSMILES
        encoder and projection, and returns the L2-normalized embeddings as a
        float array of shape (len(psmiles_list), emb_dim).

        The model is switched to eval mode and moved to ``device`` (same
        behaviour as ``encode_multimodal``) so that the inputs, which are placed
        on ``device``, always live on the same device as the weights.

        Raises:
            RuntimeError: if the tokenizer, encoder or projection is missing.
        """
        self.eval()
        if self.psm_tok is None or self.psmiles is None or self.proj_psmiles is None:
            raise RuntimeError("PSMILES tokenizer/encoder/projection not available.")

        dev = torch.device(device)
        self.to(dev)

        outs = []
        for i in range(0, len(psmiles_list), batch_size):
            chunk = [str(x) for x in psmiles_list[i:i + batch_size]]
            enc = self.psm_tok(chunk, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
            input_ids = enc["input_ids"].to(dev)
            attn = enc["attention_mask"].to(dev).bool()
            emb_p = self.psmiles(input_ids, attn)
            z = F.normalize(self.proj_psmiles(emb_p), dim=-1)
            outs.append(z.detach().cpu().numpy())
        return np.concatenate(outs, axis=0) if outs else np.zeros((0, self.emb_dim), dtype=np.float32)

    @torch.no_grad()
    def encode_multimodal(
        self,
        records: List[dict],
        batch_size: int = 32,
        device: str = DEVICE
    ) -> np.ndarray:
        """
        Convenience: multimodal embedding for records carrying:
          - graph, geometry, fingerprints, psmiles

        Samples whose graph or geometry could not be parsed are excluded from
        the corresponding modality through a per-sample presence mask passed to
        ``forward``. Fingerprint and PSMILES inputs are always built for every
        sample and are treated as present.

        Returns an array of shape (len(records), emb_dim); (0, emb_dim) for an
        empty input.
        """
        self.eval()
        dev = torch.device(device)
        self.to(dev)

        outs = []
        for i in range(0, len(records), batch_size):
            chunk = records[i:i + batch_size]
            n_chunk = len(chunk)

            # PSMILES batch
            psmiles_texts = [str(r.get("psmiles", "")) for r in chunk]
            p_enc = None
            if self.psm_tok is not None:
                p_enc = self.psm_tok(psmiles_texts, truncation=True, padding="max_length", max_length=PSMILES_MAX_LEN, return_tensors="pt")

            # FP batch (always stack; missing handled by attention_mask downstream)
            fp_ids, fp_attn = [], []
            for r in chunk:
                f = _parse_fingerprints(r.get("fingerprints", None), fp_len=FP_LENGTH)
                fp_ids.append(f["input_ids"])
                fp_attn.append(f["attention_mask"])
            fp_ids = torch.stack(fp_ids, dim=0)
            fp_attn = torch.stack(fp_attn, dim=0)

            # GINE + SchNet packed batching
            gine_all = {"z": [], "chirality": [], "formal_charge": [], "edge_index": [], "edge_attr": [], "batch": []}
            gine_present = torch.zeros((n_chunk,), dtype=torch.bool)
            node_offset = 0
            for bi, r in enumerate(chunk):
                g = _parse_graph_for_gine(r.get("graph", None))
                if g is None or g["z"].numel() == 0:
                    continue
                gine_present[bi] = True
                n = g["z"].size(0)
                gine_all["z"].append(g["z"])
                gine_all["chirality"].append(g["chirality"])
                gine_all["formal_charge"].append(g["formal_charge"])
                gine_all["batch"].append(torch.full((n,), bi, dtype=torch.long))

                ei = g["edge_index"]
                ea = g["edge_attr"]
                if ei is not None and ei.numel() > 0:
                    # Shift node ids so edges index into the packed node tensor.
                    gine_all["edge_index"].append(ei + node_offset)
                    gine_all["edge_attr"].append(ea)
                node_offset += n

            gine_batch = None
            if len(gine_all["z"]) > 0:
                z_b = torch.cat(gine_all["z"], dim=0)
                ch_b = torch.cat(gine_all["chirality"], dim=0)
                fc_b = torch.cat(gine_all["formal_charge"], dim=0)
                b_b = torch.cat(gine_all["batch"], dim=0)
                if len(gine_all["edge_index"]) > 0:
                    ei_b = torch.cat(gine_all["edge_index"], dim=1)
                    ea_b = torch.cat(gine_all["edge_attr"], dim=0)
                else:
                    ei_b = torch.empty((2, 0), dtype=torch.long)
                    ea_b = torch.zeros((0, 3), dtype=torch.float)
                gine_batch = {"z": z_b, "chirality": ch_b, "formal_charge": fc_b, "edge_index": ei_b, "edge_attr": ea_b, "batch": b_b}

            sch_all_z, sch_all_pos, sch_all_batch = [], [], []
            schnet_present = torch.zeros((n_chunk,), dtype=torch.bool)
            for bi, r in enumerate(chunk):
                s = _parse_geometry_for_schnet(r.get("geometry", None))
                if s is None or s["z"].numel() == 0:
                    continue
                schnet_present[bi] = True
                n = s["z"].size(0)
                sch_all_z.append(s["z"])
                sch_all_pos.append(s["pos"])
                sch_all_batch.append(torch.full((n,), bi, dtype=torch.long))
            schnet_batch = None
            if len(sch_all_z) > 0:
                schnet_batch = {
                    "z": torch.cat(sch_all_z, dim=0),
                    "pos": torch.cat(sch_all_pos, dim=0),
                    "batch": torch.cat(sch_all_batch, dim=0),
                }

            batch_mods = {
                "gine": gine_batch,
                "schnet": schnet_batch,
                "fp": {"input_ids": fp_ids, "attention_mask": fp_attn},
                "psmiles": {"input_ids": p_enc["input_ids"], "attention_mask": p_enc["attention_mask"]} if p_enc is not None else None
            }

            # Only the modalities that can be absent per sample get an explicit
            # mask; forward() treats the others as present for every sample.
            modality_mask = {"gine": gine_present, "schnet": schnet_present}

            # NOTE: This script uses forward() as the encoder entry point.
            z = self.forward(batch_mods, modality_mask=modality_mask)
            outs.append(z.detach().cpu().numpy())

        return np.concatenate(outs, axis=0) if outs else np.zeros((0, self.emb_dim), dtype=np.float32)


# =============================================================================
# Tokenizer setup
# =============================================================================
# Placeholder path to the SentencePiece model file; replace with a real path.
SPM_MODEL = "/path/to/spm.model"
# Built once at import time and shared module-wide, so importing this module
# calls build_psmiles_tokenizer with SPM_MODEL immediately.
tokenizer = build_psmiles_tokenizer(spm_path=SPM_MODEL, max_len=PSMILES_MAX_LEN)

# =============================================================================
# Dataset: single-task property prediction (with modality parsing)
# =============================================================================
class PolymerPropertyDataset(Dataset):
    """
    Dataset that prepares one sample with up to four modalities:
      - graph (for GINE)
      - geometry (for SchNet)
      - fingerprints (for FP transformer)
      - psmiles text (for DeBERTa encoder)

    Each entry of `data_list` is a dict with optional keys 'graph',
    'geometry', 'fingerprints', 'psmiles' (JSON strings or already-decoded
    objects) and 'target_scaled'.

    A modality whose value is absent, None, NaN (the placeholder pandas uses
    for empty cells) or a blank string is treated as missing. Missing or
    unparseable modalities are replaced by empty / all-zero placeholders whose
    attention mask is all False, so the collate function can derive a
    per-sample availability mask from the tensors alone.

    Target is a single scalar per sample (already scaled externally).
    """
    def __init__(self, data_list, tokenizer, max_length=128):
        self.data_list = data_list
        self.tokenizer = tokenizer
        # NOTE: stored for interface compatibility only. Tokenization and the
        # zero placeholder both use the module-level PSMILES_MAX_LEN so every
        # sample has the same sequence length when stacked in the collate fn.
        self.max_length = max_length

    def __len__(self):
        return len(self.data_list)

    # -------------------------------------------------------------------------
    # Shared helpers
    # -------------------------------------------------------------------------
    @staticmethod
    def _is_missing(value) -> bool:
        """Return True when `value` carries no modality payload."""
        if value is None:
            return True
        # NaN is truthy, so a plain truthiness test would accept it.
        if isinstance(value, (float, np.floating)) and np.isnan(value):
            return True
        if isinstance(value, str):
            return value.strip() == ""
        return not value

    # -------------------------------------------------------------------------
    # Graph -> GINE tensors (robust parsing of stored JSON fields)
    # -------------------------------------------------------------------------
    @staticmethod
    def _parse_edge_indices(edge_indices_raw) -> Tuple[List[int], List[int]]:
        """
        Decode edge indices given either as a list of [src, dst] pairs or as
        [[srcs], [dsts]]. Returns two empty lists when decoding fails.
        """
        srcs, dsts = [], []
        if isinstance(edge_indices_raw, list) and len(edge_indices_raw) > 0:
            first = edge_indices_raw[0]
            # A first element that is a 2-item list of ints is read as a pair.
            is_pair_list = (
                isinstance(first, list)
                and len(first) == 2
                and isinstance(first[0], int)
            )
            try:
                if is_pair_list:
                    srcs = [int(p[0]) for p in edge_indices_raw]
                    dsts = [int(p[1]) for p in edge_indices_raw]
                else:
                    srcs = [int(x) for x in edge_indices_raw[0]]
                    dsts = [int(x) for x in edge_indices_raw[1]]
            except Exception:
                srcs, dsts = [], []
        return srcs, dsts

    @staticmethod
    def _parse_edge_features(edge_features_raw, num_edges: int):
        """
        Map stored edge features to (bond_type, stereo, is_conjugated) rows.
        Falls back to `num_edges` all-zero rows when no feature list is stored.
        The row count is not forced to match the edge count here; the collate
        function reconciles it via match_edge_attr_to_index.
        """
        if edge_features_raw and isinstance(edge_features_raw, list):
            bond_types = []
            stereos = []
            is_conjs = []
            for ef in edge_features_raw:
                bt = safe_get(ef, "bond_type", 0)
                st = safe_get(ef, "stereo", 0)
                ic = safe_get(ef, "is_conjugated", False)
                bond_types.append(float(bt))
                stereos.append(float(st))
                is_conjs.append(float(1.0 if ic else 0.0))
            return list(zip(bond_types, stereos, is_conjs))
        return [[0.0, 0.0, 0.0] for _ in range(num_edges)]

    @classmethod
    def _parse_graph(cls, raw) -> Optional[Dict[str, torch.Tensor]]:
        """
        Build GINE input tensors from the stored graph field.
        Returns None when the field is missing, has no node features, has no
        usable edges, or cannot be parsed.
        """
        if cls._is_missing(raw):
            return None
        try:
            graph_field = json.loads(raw) if isinstance(raw, str) else raw

            node_features = safe_get(graph_field, "node_features", None)
            if not node_features:
                return None

            atomic_nums = []
            chirality_vals = []
            formal_charges = []
            for nf in node_features:
                an = safe_get(nf, "atomic_num", None)
                if an is None:
                    an = safe_get(nf, "atomic_number", 0)
                ch = safe_get(nf, "chirality", 0)
                fc = safe_get(nf, "formal_charge", 0)
                try:
                    atomic_nums.append(int(an))
                except Exception:
                    atomic_nums.append(0)
                chirality_vals.append(float(ch))
                formal_charges.append(float(fc))

            edge_indices_raw = safe_get(graph_field, "edge_indices", None)
            edge_features_raw = safe_get(graph_field, "edge_features", None)
            edge_index = None
            edge_attr = None

            if edge_indices_raw is None:
                # Fallback: adjacency matrix if edge_indices missing
                adj_mat = safe_get(graph_field, "adjacency_matrix", None)
                if adj_mat:
                    srcs = []
                    dsts = []
                    for i_r, row_adj in enumerate(adj_mat):
                        for j, val2 in enumerate(row_adj):
                            if val2:
                                srcs.append(i_r)
                                dsts.append(j)
                    if len(srcs) > 0:
                        edge_index = [srcs, dsts]
                        edge_attr = [[0.0, 0.0, 0.0] for _ in range(len(srcs))]
            else:
                srcs, dsts = cls._parse_edge_indices(edge_indices_raw)
                if len(srcs) > 0:
                    edge_index = [srcs, dsts]
                edge_attr = cls._parse_edge_features(edge_features_raw, len(srcs))

            if edge_index is None:
                return None

            return {
                'z': torch.tensor(atomic_nums, dtype=torch.long),
                'chirality': torch.tensor(chirality_vals, dtype=torch.float),
                'formal_charge': torch.tensor(formal_charges, dtype=torch.float),
                'edge_index': torch.tensor(edge_index, dtype=torch.long),
                'edge_attr': torch.tensor(edge_attr, dtype=torch.float)
            }
        except Exception:
            # Best-effort parsing: a malformed graph is treated as missing.
            return None

    # -------------------------------------------------------------------------
    # Geometry -> SchNet tensors (best conformer)
    # -------------------------------------------------------------------------
    @classmethod
    def _parse_geometry(cls, raw) -> Optional[Dict[str, torch.Tensor]]:
        """
        Build SchNet input tensors from the 'best_conformer' entry.
        Returns None when the field is missing, malformed, or when the atom
        and coordinate counts disagree.
        """
        if cls._is_missing(raw):
            return None
        try:
            geom = json.loads(raw) if isinstance(raw, str) else raw
            conf = geom.get("best_conformer") if isinstance(geom, dict) else None
            if not conf:
                return None
            atomic = conf.get("atomic_numbers", [])
            coords = conf.get("coordinates", [])
            if len(atomic) == len(coords) and len(atomic) > 0:
                return {
                    'z': torch.tensor(atomic, dtype=torch.long),
                    'pos': torch.tensor(coords, dtype=torch.float)
                }
            return None
        except Exception:
            return None

    # -------------------------------------------------------------------------
    # Fingerprints -> FP transformer inputs (bit sequence)
    # -------------------------------------------------------------------------
    @staticmethod
    def _decode_fingerprint_payload(fpval):
        """
        Decode the stored fingerprint value. Tries strict JSON, then JSON with
        single quotes swapped for double quotes, and finally a comma-separated
        bit string (padded/truncated to FP_LENGTH).
        """
        try:
            return json.loads(fpval) if isinstance(fpval, str) else fpval
        except Exception:
            try:
                return json.loads(str(fpval).replace("'", '"'))
            except Exception:
                parts = [p.strip().strip('"').strip("'") for p in str(fpval).split(",")]
                bits = [1 if p in ("1", "True", "true") else 0 for p in parts[:FP_LENGTH]]
                if len(bits) < FP_LENGTH:
                    bits += [0] * (FP_LENGTH - len(bits))
                return bits

    @staticmethod
    def _normalize_bits(bits) -> List[int]:
        """Coerce an iterable of str/int bits to exactly FP_LENGTH values in {0, 1}."""
        normalized = []
        for b in bits:
            if isinstance(b, str):
                b_clean = b.strip().strip('"').strip("'")
                normalized.append(1 if b_clean in ("1", "True", "true") else 0)
            elif isinstance(b, (int, np.integer)):
                normalized.append(1 if int(b) != 0 else 0)
            else:
                normalized.append(0)
            if len(normalized) >= FP_LENGTH:
                break
        if len(normalized) < FP_LENGTH:
            normalized.extend([0] * (FP_LENGTH - len(normalized)))
        return normalized[:FP_LENGTH]

    @classmethod
    def _parse_fingerprints(cls, raw) -> Optional[Dict[str, torch.Tensor]]:
        """
        Build FP-transformer inputs (bit ids + all-True attention mask).
        Returns None when the field is missing or cannot be converted.
        """
        if cls._is_missing(raw):
            return None
        try:
            fp_json = cls._decode_fingerprint_payload(raw)

            if isinstance(fp_json, dict):
                bits = safe_get(fp_json, "morgan_r3_bits", None)
                if bits is None:
                    bits = [0] * FP_LENGTH
                else:
                    bits = cls._normalize_bits(bits)
            elif isinstance(fp_json, list):
                bits = fp_json[:FP_LENGTH]
                if len(bits) < FP_LENGTH:
                    bits += [0] * (FP_LENGTH - len(bits))
            else:
                bits = [0] * FP_LENGTH

            return {
                'input_ids': torch.tensor(bits, dtype=torch.long),
                'attention_mask': torch.ones(FP_LENGTH, dtype=torch.bool)
            }
        except Exception:
            return None

    # -------------------------------------------------------------------------
    # PSMILES -> DeBERTa tokenizer inputs
    # -------------------------------------------------------------------------
    def _tokenize_psmiles(self, raw) -> Optional[Dict[str, torch.Tensor]]:
        """
        Tokenize the PSMILES string, padded/truncated to PSMILES_MAX_LEN.
        Returns None when no tokenizer is set, the value is missing, or
        tokenization fails.
        """
        if self.tokenizer is None or self._is_missing(raw):
            return None
        try:
            enc = self.tokenizer(
                str(raw),
                truncation=True,
                padding="max_length",
                max_length=PSMILES_MAX_LEN
            )
            return {
                'input_ids': torch.tensor(enc["input_ids"], dtype=torch.long),
                'attention_mask': torch.tensor(enc["attention_mask"], dtype=torch.bool)
            }
        except Exception:
            return None

    def __getitem__(self, idx):
        """
        Return a dict with keys 'gine', 'schnet', 'fp', 'psmiles' (each a dict
        of tensors) and 'target' (float32 scalar tensor).
        """
        data = self.data_list[idx]

        gine_data = self._parse_graph(data.get('graph'))
        schnet_data = self._parse_geometry(data.get('geometry'))
        fp_data = self._parse_fingerprints(data.get('fingerprints'))
        psmiles_data = self._tokenize_psmiles(data.get('psmiles'))

        # ---------------------------------------------------------------------
        # Fill defaults for missing modalities
        # ---------------------------------------------------------------------
        if gine_data is None:
            gine_data = {
                'z': torch.tensor([], dtype=torch.long),
                'chirality': torch.tensor([], dtype=torch.float),
                'formal_charge': torch.tensor([], dtype=torch.float),
                'edge_index': torch.tensor([[], []], dtype=torch.long),
                'edge_attr': torch.zeros((0, 3), dtype=torch.float)
            }

        if schnet_data is None:
            schnet_data = {
                'z': torch.tensor([], dtype=torch.long),
                'pos': torch.tensor([], dtype=torch.float)
            }

        if fp_data is None:
            fp_data = {
                'input_ids': torch.zeros(FP_LENGTH, dtype=torch.long),
                'attention_mask': torch.zeros(FP_LENGTH, dtype=torch.bool)
            }

        if psmiles_data is None:
            psmiles_data = {
                'input_ids': torch.zeros(PSMILES_MAX_LEN, dtype=torch.long),
                'attention_mask': torch.zeros(PSMILES_MAX_LEN, dtype=torch.bool)
            }

        # Single-task regression target (already scaled)
        target_scaled = float(data.get("target_scaled", 0.0))

        return {
            'gine': gine_data,
            'schnet': schnet_data,
            'fp': fp_data,
            'psmiles': psmiles_data,
            'target': torch.tensor(target_scaled, dtype=torch.float32),
        }


# =============================================================================
# Collate: pack variable-sized graph/3D into batch tensors + modality masks
# =============================================================================
def multimodal_collate_fn(batch):
    """
    Merge a list of dataset samples into one minibatch dict.

    - GINE: node tensors are concatenated; edge indices are shifted by the
      running node offset; a `batch` vector maps each node to its sample.
    - SchNet: atoms/coords of samples that have geometry are concatenated,
      with a `batch` vector mapping each atom to its sample index.
    - FP/PSMILES: per-sample sequences are stacked to (B, L).
    - modality_mask: per-sample boolean flags indicating availability
      (nodes > 0, atoms > 0, or any True in the attention mask).
    """
    num_samples = len(batch)

    # -------------------------
    # GINE packing
    # -------------------------
    node_z, node_chir, node_charge, node_owner = [], [], [], []
    edge_index_parts, edge_attr_parts = [], []
    has_graph = []
    running_offset = 0

    for sample_idx, sample in enumerate(batch):
        graph = sample["gine"]
        num_nodes = graph["z"].size(0)
        has_graph.append(bool(num_nodes > 0))

        node_z.append(graph["z"])
        node_chir.append(graph["chirality"])
        node_charge.append(graph["formal_charge"])
        node_owner.append(torch.full((num_nodes,), sample_idx, dtype=torch.long))

        local_edges = graph["edge_index"]
        if local_edges is not None and local_edges.numel() > 0:
            # Shift local node ids into the concatenated node numbering.
            edge_index_parts.append(local_edges + running_offset)
            edge_attr_parts.append(
                match_edge_attr_to_index(local_edges, graph["edge_attr"], target_dim=3)
            )
        running_offset += num_nodes

    if node_z:
        z_batch = torch.cat(node_z, dim=0)
        ch_batch = torch.cat(node_chir, dim=0)
        fc_batch = torch.cat(node_charge, dim=0)
        batch_batch = torch.cat(node_owner, dim=0)
    else:
        z_batch = torch.tensor([], dtype=torch.long)
        ch_batch = torch.tensor([], dtype=torch.float)
        fc_batch = torch.tensor([], dtype=torch.float)
        batch_batch = torch.tensor([], dtype=torch.long)

    if edge_index_parts:
        edge_index_batched = torch.cat(edge_index_parts, dim=1)
        edge_attr_batched = torch.cat(edge_attr_parts, dim=0)
    else:
        edge_index_batched = torch.empty((2, 0), dtype=torch.long)
        edge_attr_batched = torch.zeros((0, 3), dtype=torch.float)

    # -------------------------
    # SchNet packing
    # -------------------------
    atom_z, atom_pos, atom_owner = [], [], []
    has_geometry = [False] * num_samples
    for sample_idx, sample in enumerate(batch):
        geom = sample["schnet"]
        geom_z, geom_pos = geom["z"], geom["pos"]
        if geom_z.numel() > 0:
            has_geometry[sample_idx] = True
            atom_z.append(geom_z)
            atom_pos.append(geom_pos)
            atom_owner.append(torch.full((geom_z.size(0),), sample_idx, dtype=torch.long))

    if atom_z:
        s_z_batch = torch.cat(atom_z, dim=0)
        s_pos_batch = torch.cat(atom_pos, dim=0)
        s_batch_batch = torch.cat(atom_owner, dim=0)
    else:
        s_z_batch = torch.tensor([], dtype=torch.long)
        s_pos_batch = torch.tensor([], dtype=torch.float)
        s_batch_batch = torch.tensor([], dtype=torch.long)

    # -------------------------
    # FP / PSMILES stacking
    # -------------------------
    fp_ids = torch.stack([sample["fp"]["input_ids"] for sample in batch], dim=0)
    fp_attn = torch.stack([sample["fp"]["attention_mask"] for sample in batch], dim=0)
    p_ids = torch.stack([sample["psmiles"]["input_ids"] for sample in batch], dim=0)
    p_attn = torch.stack([sample["psmiles"]["attention_mask"] for sample in batch], dim=0)

    # A sequence modality counts as present when any mask position is set.
    has_fp = (fp_attn.sum(dim=1) > 0).tolist()
    has_psmiles = (p_attn.sum(dim=1) > 0).tolist()

    # Target, shape (B,)
    target = torch.stack([sample["target"] for sample in batch], dim=0)

    return {
        "gine": {
            "z": z_batch,
            "chirality": ch_batch,
            "formal_charge": fc_batch,
            "edge_index": edge_index_batched,
            "edge_attr": edge_attr_batched,
            "batch": batch_batch
        },
        "schnet": {
            "z": s_z_batch,
            "pos": s_pos_batch,
            "batch": s_batch_batch
        },
        "fp": {
            "input_ids": fp_ids,
            "attention_mask": fp_attn
        },
        "psmiles": {
            "input_ids": p_ids,
            "attention_mask": p_attn
        },
        "target": target,
        # Presence mask for fusion (per-sample modality availability)
        "modality_mask": {
            "gine": torch.tensor(has_graph, dtype=torch.bool),
            "schnet": torch.tensor(has_geometry, dtype=torch.bool),
            "fp": torch.tensor(has_fp, dtype=torch.bool),
            "psmiles": torch.tensor(has_psmiles, dtype=torch.bool),
        }
    }


# =============================================================================
# Single-task regressor head 
# =============================================================================
class PolyFPropertyRegressor(nn.Module):
    """
    Regression model: the multimodal encoder followed by a two-layer MLP head
    (Linear -> ReLU -> Dropout -> Linear) that maps the fused embedding to one
    scalar (scaled target) per sample.
    """
    def __init__(self, polyf_model: MultimodalContrastiveModel, emb_dim: int = POLYF_EMB_DIM, dropout: float = 0.1):
        super().__init__()
        hidden_dim = emb_dim // 2
        self.polyf = polyf_model
        # Layers stay positional inside nn.Sequential so the parameter names
        # (head.0.*, head.3.*) match previously saved checkpoints.
        head_layers = [
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, batch_mods, modality_mask=None):
        """Encode the batch and return predictions with the last dim squeezed."""
        fused = self.polyf(batch_mods, modality_mask=modality_mask)  # (B,d)
        return self.head(fused).squeeze(-1)  # (B,)


# =============================================================================
# Training / evaluation helpers
# =============================================================================
def compute_metrics(y_true, y_pred):
    """
    Return MSE, RMSE, MAE and R2 for the given targets/predictions as a dict
    of plain Python floats. Metrics are in whatever units the inputs use; the
    caller is expected to pass values in original (unscaled) units.
    """
    mse = float(mean_squared_error(y_true, y_pred))
    return {
        "mse": mse,
        "rmse": float(math.sqrt(mse)),
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred)),
    }


def train_one_epoch(model, dataloader, optimizer, device):
    """
    Run one pass over `dataloader`, updating `model` with an MSE objective.

    Each batch is a dict holding a "target" tensor, an optional
    "modality_mask" dict and one dict of tensors per modality; tensors are
    moved to `device` in place. Returns the sample-weighted mean loss
    (0.0 when the dataloader yields nothing).
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0

    for batch in dataloader:
        # Transfer every tensor of the nested batch dict to the target device.
        for key in batch:
            if key == "target":
                batch[key] = batch[key].to(device)
                continue
            group = batch[key]  # modality dict or the modality_mask dict
            for name in group:
                if isinstance(group[name], torch.Tensor):
                    group[name] = group[name].to(device)

        targets = batch["target"]  # (B,)
        mask = batch.get("modality_mask", None)
        inputs = {
            key: value
            for key, value in batch.items()
            if key not in ("target", "modality_mask")
        }

        predictions = model(inputs, modality_mask=mask)
        loss = F.mse_loss(predictions, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is
        # per-sample even when the last batch is smaller.
        n_in_batch = int(targets.size(0))
        loss_sum += float(loss.item()) * n_in_batch
        sample_count += n_in_batch

    return loss_sum / max(1, sample_count)


@torch.no_grad()
def evaluate(model, dataloader, device):
    """
    Score `model` on `dataloader` without gradient tracking.

    Returns a tuple (avg_loss, preds, trues): the sample-weighted mean MSE as
    a float plus numpy arrays of predicted and true (scaled) values. Returns
    (None, None, None) when the dataloader yields no samples.
    """
    model.eval()
    pred_chunks, true_chunks = [], []
    loss_sum, sample_count = 0.0, 0

    for batch in dataloader:
        # Move nested batch dict to device (in place).
        for section in batch:
            if section == "target":
                batch[section] = batch[section].to(device)
            else:
                payload = batch[section]  # modality dict or modality_mask dict
                for field in payload:
                    if isinstance(payload[field], torch.Tensor):
                        payload[field] = payload[field].to(device)

        targets = batch["target"]
        mask = batch.get("modality_mask", None)
        inputs = {
            section: payload
            for section, payload in batch.items()
            if section not in ("target", "modality_mask")
        }

        predictions = model(inputs, modality_mask=mask)
        batch_loss = F.mse_loss(predictions, targets)

        n_in_batch = int(targets.size(0))
        loss_sum += float(batch_loss.item()) * n_in_batch
        sample_count += n_in_batch

        pred_chunks.append(predictions.detach().cpu().numpy())
        true_chunks.append(targets.detach().cpu().numpy())

    if sample_count == 0:
        return None, None, None

    all_preds = np.concatenate(pred_chunks, axis=0)
    all_trues = np.concatenate(true_chunks, axis=0)
    return float(loss_sum / max(1, sample_count)), all_preds, all_trues


# =============================================================================
# Pretrained loading helpers
# =============================================================================
def load_pretrained_multimodal(pretrained_path: str) -> MultimodalContrastiveModel:
    """
    Construct modality encoders and load any available pretrained weights:
      - modality-specific checkpoints (BEST_*_DIR)
      - full multimodal checkpoint from `pretrained_path/pytorch_model.bin`

    Loading is best-effort throughout: a missing or unreadable checkpoint only
    prints a message and leaves the affected module with whatever weights it
    already has. The full multimodal checkpoint is applied last, so it
    overwrites per-modality weights for every key it shares (with matching
    shape) with the assembled model.

    Args:
        pretrained_path: directory expected to contain `pytorch_model.bin`
            for the full multimodal model.

    Returns a ready-to-fine-tune MultimodalContrastiveModel.
    """
    # NOTE(review): torch.load unpickles the checkpoint file; only point the
    # BEST_*_DIR / pretrained_path settings at checkpoints from trusted sources.

    # -------------------------
    # GINE encoder
    # -------------------------
    gine_encoder = GineEncoder(
        node_emb_dim=NODE_EMB_DIM,
        edge_emb_dim=EDGE_EMB_DIM,
        num_layers=NUM_GNN_LAYERS,
        max_atomic_z=MAX_ATOMIC_Z
    )
    gine_ckpt = os.path.join(BEST_GINE_DIR, "pytorch_model.bin")
    if os.path.exists(gine_ckpt):
        try:
            # strict=False: keys that do not line up are skipped instead of raising.
            gine_encoder.load_state_dict(torch.load(gine_ckpt, map_location="cpu"), strict=False)
            print(f"[LOAD] GINE weights: {gine_ckpt}")
        except Exception as e:
            print(f"[LOAD][WARN] Could not load GINE weights: {e}")

    # -------------------------
    # SchNet encoder
    # -------------------------
    schnet_encoder = NodeSchNetWrapper(
        hidden_channels=SCHNET_HIDDEN,
        num_interactions=SCHNET_NUM_INTERACTIONS,
        num_gaussians=SCHNET_NUM_GAUSSIANS,
        cutoff=SCHNET_CUTOFF,
        max_num_neighbors=SCHNET_MAX_NEIGHBORS
    )
    sch_ckpt = os.path.join(BEST_SCHNET_DIR, "pytorch_model.bin")
    if os.path.exists(sch_ckpt):
        try:
            schnet_encoder.load_state_dict(torch.load(sch_ckpt, map_location="cpu"), strict=False)
            print(f"[LOAD] SchNet weights: {sch_ckpt}")
        except Exception as e:
            print(f"[LOAD][WARN] Could not load SchNet weights: {e}")

    # -------------------------
    # Fingerprint encoder
    # -------------------------
    # Architecture hyperparameters are hard-coded here; presumably they must
    # match the values used when the FP checkpoint was trained -- TODO confirm.
    fp_encoder = FingerprintEncoder(
        vocab_size=VOCAB_SIZE_FP,
        hidden_dim=256,
        seq_len=FP_LENGTH,
        num_layers=4,
        nhead=8,
        dim_feedforward=1024,
        dropout=0.1
    )
    fp_ckpt = os.path.join(BEST_FP_DIR, "pytorch_model.bin")
    if os.path.exists(fp_ckpt):
        try:
            fp_encoder.load_state_dict(torch.load(fp_ckpt, map_location="cpu"), strict=False)
            print(f"[LOAD] FP encoder weights: {fp_ckpt}")
        except Exception as e:
            print(f"[LOAD][WARN] Could not load fingerprint weights: {e}")

    # -------------------------
    # PSMILES encoder
    # -------------------------
    psmiles_encoder = None
    if os.path.isdir(BEST_PSMILES_DIR):
        try:
            psmiles_encoder = PSMILESDebertaEncoder(model_dir_or_name=BEST_PSMILES_DIR)
            print(f"[LOAD] PSMILES encoder: {BEST_PSMILES_DIR}")
        except Exception as e:
            print(f"[LOAD][WARN] Could not load PSMILES encoder from dir: {e}")

    # Fallback: initialize with vocab fallback
    # (vocab size comes from the module-level tokenizer, defaulting to 300).
    if psmiles_encoder is None:
        try:
            psmiles_encoder = PSMILESDebertaEncoder(
                model_dir_or_name=None,
                vocab_fallback=int(getattr(tokenizer, "vocab_size", 300))
            )
            print("[LOAD] PSMILES encoder: initialized fallback (no pretrained dir).")
        except Exception as e:
            # NOTE(review): if this also fails, psmiles_encoder stays None and
            # is passed to MultimodalContrastiveModel as-is -- confirm the
            # wrapper tolerates a None encoder.
            print(f"[LOAD][WARN] Could not initialize PSMILES encoder: {e}")

    # Build multimodal wrapper
    multimodal_model = MultimodalContrastiveModel(
        gine_encoder,
        schnet_encoder,
        fp_encoder,
        psmiles_encoder,
        emb_dim=POLYF_EMB_DIM,
        modalities=["gine", "schnet", "fp", "psmiles"]
    )

    # -------------------------
    # Optional: load full multimodal checkpoint
    # -------------------------
    ckpt_path = os.path.join(pretrained_path, "pytorch_model.bin")
    if os.path.isfile(ckpt_path):
        try:
            state = torch.load(ckpt_path, map_location="cpu")
            model_state = multimodal_model.state_dict()
            # Keep only checkpoint entries whose key exists in the model AND
            # whose tensor shape matches, so a size mismatch never raises.
            filtered_state = {}
            for k, v in state.items():
                if k not in model_state:
                    continue
                if model_state[k].shape != v.shape:
                    continue
                filtered_state[k] = v

            summarize_state_dict_load(state, model_state, filtered_state)
            missing, unexpected = multimodal_model.load_state_dict(filtered_state, strict=False)
            print(f"[LOAD] Multimodal checkpoint: {ckpt_path}")
            print(f"[LOAD] load_state_dict -> missing={len(missing)} unexpected={len(unexpected)}")
            # Only the first 50 keys are printed to keep the log readable.
            if missing:
                print("[LOAD] Missing keys (sample):", missing[:50])
            if unexpected:
                print("[LOAD] Unexpected keys (sample):", unexpected[:50])

        except Exception as e:
            print(f"[LOAD][WARN] Failed to load multimodal pretrained weights: {e}")
    else:
        print(f"[LOAD] No multimodal checkpoint found at: {ckpt_path}")

    return multimodal_model


# =============================================================================
# Downstream: sample construction + CV training loop
# =============================================================================
def _modality_cell_present(value) -> bool:
    """Return True when a dataframe cell holds a usable modality payload."""
    # Missing scalars (None, NaN, pd.NA) are truthy or ambiguous under a plain
    # truthiness test, so they are rejected explicitly first.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip() != ""
    return bool(value)


def build_samples_for_property(df: pd.DataFrame, prop_col: str) -> List[dict]:
    """
    Construct training samples for a single property:
      - Keep rows that have at least one modality present. Empty, blank and
        NaN/None cells all count as "not present".
      - Keep rows with a finite property value in `prop_col`; NaN, +/-inf and
        values that cannot be converted to float are dropped.
      - Store raw target (will be scaled per fold).

    Returns a list of dicts with keys 'graph', 'geometry', 'fingerprints',
    'psmiles' (missing cells normalized to '') and 'target_raw' (float).
    """
    modality_cols = ('graph', 'geometry', 'fingerprints', 'psmiles')
    samples = []
    for _, row in df.iterrows():
        cells = {col: row.get(col, '') for col in modality_cols}
        present = {col: _modality_cell_present(cell) for col, cell in cells.items()}

        # Require at least one modality present
        if not any(present.values()):
            continue

        try:
            y = float(row.get(prop_col, np.nan))
        except (TypeError, ValueError, OverflowError):
            continue
        # float("nan") / float("inf") parse successfully, so check explicitly.
        if not math.isfinite(y):
            continue

        sample = {col: (cells[col] if present[col] else '') for col in modality_cols}
        sample['target_raw'] = y
        samples.append(sample)
    return samples


def _scale_targets_inplace(samples: List[dict], scaler: Any) -> None:
    """Attach 'target_scaled' to every sample using a single vectorized scaler.transform call."""
    if not samples:
        # scaler.transform rejects empty input; nothing to annotate anyway
        return
    raw = np.array([s["target_raw"] for s in samples], dtype=float).reshape(-1, 1)
    scaled = scaler.transform(raw).ravel()
    for s, value in zip(samples, scaled):
        s["target_scaled"] = float(value)


def _summarize_fold_metrics(run_metrics: List[dict]) -> Optional[dict]:
    """Return per-metric mean and population std (ddof=0) across folds, or None when no fold succeeded."""
    if not run_metrics:
        return None
    summary = {}
    for key in ("r2", "mae", "rmse", "mse"):
        values = [m[key] for m in run_metrics]
        summary[key] = {"mean": float(np.mean(values)), "std": float(np.std(values, ddof=0))}
    return summary


def _save_best_fold_bundle(pname: str, payload: dict) -> None:
    """Write the best fold's checkpoint (.pt) and its JSON metadata under BEST_WEIGHTS_DIR/<sanitized pname>."""
    prop_dir = os.path.join(BEST_WEIGHTS_DIR, _sanitize_name(pname))
    # makedirs is recursive, so this also creates BEST_WEIGHTS_DIR when needed
    os.makedirs(prop_dir, exist_ok=True)

    # Shallow copy: the checkpoint carries every payload field, weights included
    torch.save(dict(payload), os.path.join(prop_dir, "best_run_checkpoint.pt"))

    # Metadata file mirrors the payload minus the (non-JSON) tensors
    meta = {k: v for k, v in payload.items() if k != "model_state_dict"}
    with open(os.path.join(prop_dir, "best_run_metadata.json"), "w") as fh:
        fh.write(json.dumps(make_json_serializable(meta), indent=2))

    print(f"[BEST] Saved best fold for '{pname}' -> {prop_dir}")
    print(f"[BEST]   best_run={payload['best_run']}  best_test_r2={payload['test_metrics'].get('r2', None)}")


def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_raw: pd.DataFrame,
                           pretrained_path: str, output_file: str) -> Dict[str, Any]:
    """
    Downstream evaluation:

      For each property:
        - Build samples from PolyInfo
        - NUM_RUNS-fold CV:
            - Split into trainval/test (by KFold)
            - Split trainval into train/val
            - Fit StandardScaler on train targets
            - Fine-tune encoder+head end-to-end with early stopping by val loss
            - Evaluate on held-out test fold in original units
        - Save per-fold results and per-property mean±std
        - Save best fold checkpoint bundle (by test R2) for later reuse

    Args:
        property_list: Display names of the properties to evaluate.
        property_cols: Dataframe column for each entry of `property_list`
            (paired positionally via zip).
        df_raw: PolyInfo table with modality and property columns.
        pretrained_path: Directory handed to `load_pretrained_multimodal`
            (created if it does not exist).
        output_file: Text file that receives one JSON line per fold and one
            "AGG_PROPERTY: ..." line per property (opened in append mode).

    Returns:
        {"per_property": {pname: {"property_col", "n_samples", "runs", "agg"}},
         "mode": "POLYF_MATCHED_SINGLE_TASK"}. Properties with fewer than 50
        samples are skipped and do not appear in "per_property".

    NOTE: the "best" fold is chosen by its test-fold R2, so the metric stored
    with the saved checkpoint is an optimistic (selected-on-test) figure; the
    cross-fold aggregate is the unbiased summary.
    """
    os.makedirs(pretrained_path, exist_ok=True)

    # Optional duplicate aggregation (noise reduction)
    modality_cols = ["graph", "geometry", "fingerprints", "psmiles"]
    df_proc = aggregate_polyinfo_duplicates(df_raw, modality_cols=modality_cols, property_cols=property_cols)

    all_results = {"per_property": {}, "mode": "POLYF_MATCHED_SINGLE_TASK"}

    for pname, pcol in zip(property_list, property_cols):
        samples = build_samples_for_property(df_proc, pcol)

        print(f"[DATA] {pname}: n_samples={len(samples)}")
        if len(samples) < 200:
            print(f"[DATA][WARN] '{pname}' has <200 samples; results may be noisy.")
        if len(samples) < 50:
            print(f"[DATA][WARN] Skipping '{pname}' (insufficient samples).")
            continue

        run_metrics = []
        run_records = []

        # Track best-performing fold for this property (by test R2)
        best_overall_r2 = -1e18
        best_overall_payload = None

        idxs = np.arange(len(samples))
        cv = KFold(n_splits=NUM_RUNS, shuffle=True, random_state=42)

        for run_idx, (trainval_idx, test_idx) in enumerate(cv.split(idxs)):
            seed = 42 + run_idx
            set_seed(seed)

            print(f"\n--- [CV] {pname}: fold {run_idx+1}/{NUM_RUNS} | seed={seed} ---")

            # Deep-copy once per fold so per-fold annotations ('target_scaled')
            # never leak back into the shared `samples` list.
            trainval = [copy.deepcopy(samples[i]) for i in trainval_idx]
            test = [copy.deepcopy(samples[i]) for i in test_idx]

            # Split trainval into train/val
            tr_idx, va_idx = train_test_split(
                np.arange(len(trainval)),
                test_size=VAL_SIZE_WITHIN_TRAINVAL,
                random_state=seed,
                shuffle=True
            )
            # `trainval` already holds fold-private copies and train/val are
            # disjoint, so a second deepcopy would only waste time and memory.
            train = [trainval[i] for i in tr_idx]
            val = [trainval[i] for i in va_idx]

            # Standardize target using training fold only (prevents leakage)
            sc = StandardScaler()
            sc.fit(np.array([s["target_raw"] for s in train]).reshape(-1, 1))

            _scale_targets_inplace(train, sc)
            _scale_targets_inplace(val, sc)
            _scale_targets_inplace(test, sc)

            ds_train = PolymerPropertyDataset(train, tokenizer, max_length=MAX_LEN)
            ds_val = PolymerPropertyDataset(val, tokenizer, max_length=MAX_LEN)
            ds_test = PolymerPropertyDataset(test, tokenizer, max_length=MAX_LEN)

            dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, collate_fn=multimodal_collate_fn)
            dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True, collate_fn=multimodal_collate_fn)
            dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True, collate_fn=multimodal_collate_fn)

            print(f"[SPLIT] train={len(ds_train)} val={len(ds_val)} test={len(ds_test)}")

            # Fresh base model per fold to avoid any cross-fold leakage
            polyf_base = load_pretrained_multimodal(pretrained_path)
            model = PolyFPropertyRegressor(polyf_base, emb_dim=POLYF_EMB_DIM, dropout=POLYF_DROPOUT).to(DEVICE)

            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)

            best_val = float("inf")
            best_state = None
            no_improve = 0

            # Train with early stopping on validation loss
            for epoch in range(1, NUM_EPOCHS + 1):
                tr_loss = train_one_epoch(model, dl_train, optimizer, DEVICE)
                va_loss, _, _ = evaluate(model, dl_val, DEVICE)
                # A missing validation loss is treated as "no improvement"
                va_loss = va_loss if va_loss is not None else float("inf")

                scheduler.step()

                print(f"[{pname}] fold {run_idx+1}/{NUM_RUNS} epoch {epoch:03d} | train={tr_loss:.6f} | val={va_loss:.6f}")

                if va_loss < best_val - 1e-8:
                    best_val = va_loss
                    no_improve = 0
                    # Snapshot on CPU so the best weights survive further training steps
                    best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                else:
                    no_improve += 1
                    if no_improve >= PATIENCE:
                        print(f"[{pname}] fold {run_idx+1}: early stopping (patience={PATIENCE}) at epoch {epoch}.")
                        break

            if best_state is None:
                print(f"[{pname}][WARN] No best checkpoint captured for fold {run_idx+1}; skipping fold.")
                continue

            # Restore best state and evaluate on test fold
            model.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()}, strict=True)
            _, pred_scaled, true_scaled = evaluate(model, dl_test, DEVICE)
            if pred_scaled is None:
                print(f"[{pname}][WARN] Test evaluation returned empty predictions for fold {run_idx+1}.")
                continue

            # Convert from scaled space back to original units
            pred = sc.inverse_transform(pred_scaled.reshape(-1, 1)).ravel()
            true = sc.inverse_transform(true_scaled.reshape(-1, 1)).ravel()

            m = compute_metrics(true, pred)
            run_metrics.append(m)

            print(f"[{pname}] fold {run_idx+1} TEST | r2={m['r2']:.4f}  mae={m['mae']:.4f}  rmse={m['rmse']:.4f}")

            record = {
                "property": pname,
                "property_col": pcol,
                "run": run_idx + 1,
                "seed": seed,
                "n_train": len(ds_train),
                "n_val": len(ds_val),
                "n_test": len(ds_test),
                "best_val_loss": float(best_val),
                "test_metrics": m
            }
            run_records.append(record)

            # Append immediately so completed folds survive a later crash
            with open(output_file, "a") as fh:
                fh.write(json.dumps(make_json_serializable(record)) + "\n")

            # Update best fold bundle (by test R2)
            fold_r2 = float(m.get("r2", -1e18))
            if fold_r2 > float(best_overall_r2):
                best_overall_r2 = fold_r2
                best_overall_payload = {
                    "property": pname,
                    "property_col": pcol,
                    "best_run": int(run_idx + 1),
                    "seed": int(seed),
                    "n_train": int(len(ds_train)),
                    "n_val": int(len(ds_val)),
                    "n_test": int(len(ds_test)),
                    "best_val_loss": float(best_val),
                    "test_metrics": make_json_serializable(m),
                    "scaler_mean": make_json_serializable(getattr(sc, "mean_", None)),
                    "scaler_scale": make_json_serializable(getattr(sc, "scale_", None)),
                    "scaler_var": make_json_serializable(getattr(sc, "var_", None)),
                    "scaler_n_samples_seen": make_json_serializable(getattr(sc, "n_samples_seen_", None)),
                    "model_state_dict": best_state,  # CPU tensors
                }

        # Save best fold weights + metadata per property
        if best_overall_payload is not None and "model_state_dict" in best_overall_payload:
            _save_best_fold_bundle(pname, best_overall_payload)

        # Aggregate metrics across folds
        agg = _summarize_fold_metrics(run_metrics)
        if agg is not None:
            print(f"[AGG] {pname} | r2={agg['r2']['mean']:.4f}±{agg['r2']['std']:.4f}  mae={agg['mae']['mean']:.4f}±{agg['mae']['std']:.4f}")
        else:
            print(f"[AGG][WARN] No successful folds for '{pname}' (no aggregate computed).")

        all_results["per_property"][pname] = {
            "property_col": pcol,
            "n_samples": len(samples),
            "runs": run_records,
            "agg": agg
        }

        with open(output_file, "a") as fh:
            fh.write("AGG_PROPERTY: " + json.dumps(make_json_serializable({pname: agg})) + "\n")

    return all_results


# =============================================================================
# Main
# =============================================================================
def main():
    """
    Script driver.

    Steps, in order:
      1. Back up any existing results file to '<OUTPUT_RESULTS>.bak' and truncate it.
      2. Load the PolyInfo CSV (raises FileNotFoundError when it is absent).
      3. Resolve each requested property to a dataframe column, warning on misses.
      4. Run the downstream CV evaluation.
      5. Append the per-property aggregate summary to the results file.
    """
    # --- 1. Fresh results file (previous content preserved as .bak) ---
    results_already_present = os.path.exists(OUTPUT_RESULTS)
    if results_already_present:
        backup = f"{OUTPUT_RESULTS}.bak"
        shutil.copy(OUTPUT_RESULTS, backup)
        print(f"[INIT] Existing results backed up: {backup}")
    with open(OUTPUT_RESULTS, "w"):
        pass  # opening in "w" mode is enough to create/truncate the file
    print(f"[INIT] Writing results to: {OUTPUT_RESULTS}")

    # --- 2. PolyInfo table ---
    polyinfo_available = os.path.isfile(POLYINFO_PATH)
    if not polyinfo_available:
        raise FileNotFoundError(f"PolyInfo file not found at {POLYINFO_PATH}")
    polyinfo_raw = pd.read_csv(POLYINFO_PATH, engine="python")
    n_rows, n_cols = len(polyinfo_raw), len(polyinfo_raw.columns)
    print(f"[DATA] Loaded PolyInfo: n_rows={n_rows} n_cols={n_cols}")

    # --- 3. Requested property -> dataframe column ---
    prop_map = dict(find_property_columns(polyinfo_raw.columns).items())
    print(f"[COLMAP] Property-to-column map: {prop_map}")

    resolved = []
    for requested in REQUESTED_PROPERTIES:
        column = prop_map.get(requested)
        if column is not None:
            resolved.append((requested, column))
        else:
            print(f"[COLMAP][WARN] Could not find a column for '{requested}'; skipping.")
    property_list = [name for name, _ in resolved]
    property_cols = [column for _, column in resolved]

    # --- 4. Cross-validated downstream evaluation ---
    overall = run_polyf_downstream(property_list, property_cols, polyinfo_raw, PRETRAINED_MULTIMODAL_DIR, OUTPUT_RESULTS)

    # --- 5. Final summary (aggregate per property; None when no fold succeeded) ---
    final_agg = {}
    if overall and "per_property" in overall:
        final_agg = {pname: info.get("agg", None) for pname, info in overall["per_property"].items()}

    with open(OUTPUT_RESULTS, "a") as fh:
        fh.write("\nFINAL_SUMMARY\n")
        fh.write(json.dumps(make_json_serializable(final_agg), indent=2))
        fh.write("\n")

    print(f"\n Results appended to: {OUTPUT_RESULTS}")
    print(f" Best checkpoints saved under: {BEST_WEIGHTS_DIR}")

# Script entry point: run the pipeline only when this file is executed
# directly, so importing the module does not trigger main().
if __name__ == "__main__":
    main()