""" PolyFusion - CL.py Multimodal contrastive pretraining script (DeBERTaV2 + GINE + SchNet + Transformer). """ import os import sys import csv import json import time import math import random import shutil from pathlib import Path from typing import List, Optional, Tuple, Dict # Increase csv field size limit safely try: csv.field_size_limit(sys.maxsize) except OverflowError: csv.field_size_limit(2**31 - 1) import numpy as np import pandas as pd import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader # Shared model utilities from PolyFusion.GINE import GineEncoder, GineBlock, MaskedGINE, match_edge_attr_to_index, safe_get from PolyFusion.SchNet import NodeSchNetWrapper from PolyFusion.Transformer import PooledFingerprintEncoder as FingerprintEncoder from PolyFusion.DeBERTav2 import PSMILESDebertaEncoder, build_psmiles_tokenizer # HF Trainer & Transformers from transformers import TrainingArguments, Trainer from transformers.trainer_callback import TrainerCallback from sklearn.model_selection import train_test_split from sklearn.metrics import f1_score # ============================================================================= # Configuration (paths are placeholders; update for your environment) # ============================================================================= P_MASK = 0.15 MAX_ATOMIC_Z = 85 MASK_ATOM_ID = MAX_ATOMIC_Z + 1 # GINE params NODE_EMB_DIM = 300 EDGE_EMB_DIM = 300 NUM_GNN_LAYERS = 5 # SchNet params SCHNET_NUM_GAUSSIANS = 50 SCHNET_NUM_INTERACTIONS = 6 SCHNET_CUTOFF = 10.0 SCHNET_MAX_NEIGHBORS = 64 SCHNET_HIDDEN = 600 # Fingerprint Transformer params FP_LENGTH = 2048 MASK_TOKEN_ID_FP = 2 VOCAB_SIZE_FP = 3 # DeBERTaV2 params DEBERTA_HIDDEN = 600 PSMILES_MAX_LEN = 128 # Contrastive params TEMPERATURE = 0.07 REC_LOSS_WEIGHT = 1.0 # Reconstruction loss weight # Data / preprocessing CSV_PATH = "/path/to/polymer_structures_unified_processed.csv" TARGET_ROWS = 2000000 CHUNKSIZE = 50000 
PREPROC_DIR = "/path/to/preprocessed_samples" # Tokenizer assets SPM_MODEL = "/path/to/spm.model" # Outputs / checkpoints OUTPUT_DIR = "/path/to/multimodal_output" BEST_GINE_DIR = "/path/to/gin_output/best" BEST_SCHNET_DIR = "/path/to/schnet_output/best" BEST_FP_DIR = "/path/to/fingerprint_mlm_output/best" BEST_PSMILES_DIR = "/path/to/polybert_output/best" # ============================================================================= # Reproducibility + device # ============================================================================= def get_device() -> torch.device: """Select CUDA if available (respects CUDA_VISIBLE_DEVICES), else CPU.""" return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") def set_seed(seed: int = 42) -> None: """Set Python/Numpy/Torch seeds for deterministic-ish behavior.""" random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) # ============================================================================= # Preprocessing (streaming to disk to avoid large memory spikes) # ============================================================================= def ensure_dir(path: str) -> None: """Create a directory if it doesn't exist.""" os.makedirs(path, exist_ok=True) def prepare_or_load_data_streaming( csv_path: str, preproc_dir: str, target_rows: int = TARGET_ROWS, chunksize: int = CHUNKSIZE ) -> List[str]: """ Prepare per-sample serialized files (torch .pt) for lazy loading. - If `preproc_dir` already contains `sample_*.pt`, reuse them. - Else stream the CSV in chunks and write `sample_{idx:08d}.pt` files. """ ensure_dir(preproc_dir) existing = sorted([p for p in Path(preproc_dir).glob("sample_*.pt")]) if len(existing) > 0: print(f"Found {len(existing)} preprocessed sample files in {preproc_dir}; reusing those (no reparse).") return [str(p) for p in existing] print("No existing per-sample preprocessed folder found. 
Parsing CSV chunked and writing per-sample files (streaming).") rows_written = 0 sample_idx = 0 for chunk in pd.read_csv(csv_path, engine="python", chunksize=chunksize): has_graph = "graph" in chunk.columns has_geometry = "geometry" in chunk.columns has_fp = "fingerprints" in chunk.columns has_psmiles = "psmiles" in chunk.columns for i_row in range(len(chunk)): if rows_written >= target_rows: break row = chunk.iloc[i_row] # Per-row modality payloads (None if missing) gine_sample = None schnet_sample = None fp_sample = None psmiles_raw = None # -------- Graph / GINE modality -------- if has_graph: val = row.get("graph", "") try: graph_field = ( json.loads(val) if isinstance(val, str) and val.strip() != "" else (val if not isinstance(val, str) else None) ) except Exception: graph_field = None if graph_field: node_features = safe_get(graph_field, "node_features", None) if node_features: atomic_nums = [] chirality_vals = [] formal_charges = [] for nf in node_features: an = safe_get(nf, "atomic_num", None) if an is None: an = safe_get(nf, "atomic_number", 0) ch = safe_get(nf, "chirality", 0) fc = safe_get(nf, "formal_charge", 0) try: atomic_nums.append(int(an)) except Exception: atomic_nums.append(0) chirality_vals.append(float(ch)) formal_charges.append(float(fc)) edge_indices_raw = safe_get(graph_field, "edge_indices", None) edge_features_raw = safe_get(graph_field, "edge_features", None) edge_index = None edge_attr = None # Handle missing edge_indices via adjacency_matrix if edge_indices_raw is None: adj_mat = safe_get(graph_field, "adjacency_matrix", None) if adj_mat: srcs, dsts = [], [] for i_r, row_adj in enumerate(adj_mat): for j, val2 in enumerate(row_adj): if val2: srcs.append(i_r) dsts.append(j) if len(srcs) > 0: edge_index = [srcs, dsts] E = len(srcs) edge_attr = [[0.0, 0.0, 0.0] for _ in range(E)] else: # edge_indices_raw can be: # - list of pairs [[u,v], ...] 
# - two lists [[srcs], [dsts]] srcs, dsts = [], [] if isinstance(edge_indices_raw, list) and len(edge_indices_raw) > 0 and isinstance(edge_indices_raw[0], list): first = edge_indices_raw[0] if len(first) == 2 and isinstance(first[0], int): # list of pairs try: srcs = [int(p[0]) for p in edge_indices_raw] dsts = [int(p[1]) for p in edge_indices_raw] except Exception: srcs, dsts = [], [] else: # two lists try: srcs = [int(x) for x in edge_indices_raw[0]] dsts = [int(x) for x in edge_indices_raw[1]] except Exception: srcs, dsts = [], [] if len(srcs) == 0 and isinstance(edge_indices_raw, list) and all( isinstance(p, (list, tuple)) and len(p) == 2 for p in edge_indices_raw ): srcs = [int(p[0]) for p in edge_indices_raw] dsts = [int(p[1]) for p in edge_indices_raw] if len(srcs) > 0: edge_index = [srcs, dsts] if edge_features_raw and isinstance(edge_features_raw, list): bond_types, stereos, is_conjs = [], [], [] for ef in edge_features_raw: bt = safe_get(ef, "bond_type", 0) st = safe_get(ef, "stereo", 0) ic = safe_get(ef, "is_conjugated", False) bond_types.append(float(bt)) stereos.append(float(st)) is_conjs.append(float(1.0 if ic else 0.0)) edge_attr = list(zip(bond_types, stereos, is_conjs)) else: E = len(srcs) edge_attr = [[0.0, 0.0, 0.0] for _ in range(E)] if edge_index is not None: gine_sample = { "node_atomic": atomic_nums, "node_chirality": chirality_vals, "node_charge": formal_charges, "edge_index": edge_index, "edge_attr": edge_attr, } # -------- Geometry / SchNet modality -------- if has_geometry and schnet_sample is None: val = row.get("geometry", "") try: geom = ( json.loads(val) if isinstance(val, str) and val.strip() != "" else (val if not isinstance(val, str) else None) ) conf = geom.get("best_conformer") if isinstance(geom, dict) else None if conf: atomic = conf.get("atomic_numbers", []) coords = conf.get("coordinates", []) if len(atomic) == len(coords) and len(atomic) > 0: schnet_sample = {"atomic": atomic, "coords": coords} except Exception: 
schnet_sample = None # -------- Fingerprint modality -------- if has_fp: fpval = row.get("fingerprints", "") if fpval is None or (isinstance(fpval, str) and fpval.strip() == ""): fp_sample = [0] * FP_LENGTH else: fp_json = None try: fp_json = json.loads(fpval) if isinstance(fpval, str) else fpval except Exception: try: fp_json = json.loads(str(fpval).replace("'", '"')) except Exception: parts = [p.strip().strip('"').strip("'") for p in str(fpval).split(",")] bits = [1 if p in ("1", "True", "true") else 0 for p in parts[:FP_LENGTH]] if len(bits) < FP_LENGTH: bits += [0] * (FP_LENGTH - len(bits)) fp_sample = bits if fp_sample is None: bits = ( safe_get(fp_json, "morgan_r3_bits", None) if isinstance(fp_json, dict) else (fp_json if isinstance(fp_json, list) else None) ) if bits is None: fp_sample = [0] * FP_LENGTH else: normalized = [] for b in bits: if isinstance(b, str): b_clean = b.strip().strip('"').strip("'") normalized.append(1 if b_clean in ("1", "True", "true") else 0) elif isinstance(b, (int, np.integer)): normalized.append(1 if int(b) != 0 else 0) else: normalized.append(0) if len(normalized) >= FP_LENGTH: break if len(normalized) < FP_LENGTH: normalized.extend([0] * (FP_LENGTH - len(normalized))) fp_sample = normalized[:FP_LENGTH] # -------- PSMILES modality -------- if has_psmiles: s = row.get("psmiles", "") psmiles_raw = "" if s is None else str(s) # Require at least 2 modalities modalities_present = sum( [1 if x is not None else 0 for x in [gine_sample, schnet_sample, fp_sample, psmiles_raw]] ) if modalities_present >= 2: sample = { "gine": gine_sample, "schnet": schnet_sample, "fp": fp_sample, "psmiles_raw": psmiles_raw, } sample_path = os.path.join(preproc_dir, f"sample_{sample_idx:08d}.pt") try: torch.save(sample, sample_path) except Exception as save_e: print("Warning: failed to torch.save sample:", save_e) # fallback JSON for debugging try: with open(sample_path + ".json", "w") as fjson: json.dump(sample, fjson) except Exception: pass sample_idx += 1 
rows_written += 1 if rows_written >= target_rows: break print(f"Wrote {sample_idx} sample files to {preproc_dir}.") return [str(p) for p in sorted(Path(preproc_dir).glob("sample_*.pt"))] # ============================================================================= # Dataset + collate # ============================================================================= class LazyMultimodalDataset(Dataset): """ Lazily loads per-sample files from disk and converts them into tensors. Each sample file is expected to contain: - gine: dict or None - schnet: dict or None - fp: list[int] or tensor - psmiles_raw: str """ def __init__(self, sample_file_list: List[str], tokenizer, fp_length: int = FP_LENGTH, psmiles_max_len: int = PSMILES_MAX_LEN): self.files = sample_file_list self.tokenizer = tokenizer self.fp_length = fp_length self.psmiles_max_len = psmiles_max_len def __len__(self) -> int: return len(self.files) def __getitem__(self, idx: int) -> Dict[str, Dict[str, torch.Tensor]]: sample_path = self.files[idx] # prefer torch.load if .pt, else try json if sample_path.endswith(".pt"): sample = torch.load(sample_path, map_location="cpu") else: with open(sample_path, "r") as f: sample = json.load(f) # ---- GINE tensors ---- gine_raw = sample.get("gine", None) if gine_raw: node_atomic = torch.tensor(gine_raw.get("node_atomic", []), dtype=torch.long) node_chirality = torch.tensor(gine_raw.get("node_chirality", []), dtype=torch.float) node_charge = torch.tensor(gine_raw.get("node_charge", []), dtype=torch.float) if gine_raw.get("edge_index", None) is not None: edge_index = torch.tensor(gine_raw["edge_index"], dtype=torch.long) else: edge_index = torch.tensor([[], []], dtype=torch.long) ea_raw = gine_raw.get("edge_attr", None) if ea_raw: edge_attr = torch.tensor(ea_raw, dtype=torch.float) else: edge_attr = torch.zeros((edge_index.size(1), 3), dtype=torch.float) gine_item = { "z": node_atomic, "chirality": node_chirality, "formal_charge": node_charge, "edge_index": edge_index, 
"edge_attr": edge_attr, } else: gine_item = { "z": torch.tensor([], dtype=torch.long), "chirality": torch.tensor([], dtype=torch.float), "formal_charge": torch.tensor([], dtype=torch.float), "edge_index": torch.tensor([[], []], dtype=torch.long), "edge_attr": torch.zeros((0, 3), dtype=torch.float), } # ---- SchNet tensors ---- schnet_raw = sample.get("schnet", None) if schnet_raw: s_z = torch.tensor(schnet_raw.get("atomic", []), dtype=torch.long) s_pos = torch.tensor(schnet_raw.get("coords", []), dtype=torch.float) schnet_item = {"z": s_z, "pos": s_pos} else: schnet_item = {"z": torch.tensor([], dtype=torch.long), "pos": torch.tensor([], dtype=torch.float)} # ---- Fingerprint tensors ---- fp_raw = sample.get("fp", None) if fp_raw is None: fp_vec = torch.zeros((self.fp_length,), dtype=torch.long) else: if isinstance(fp_raw, (list, tuple)): arr = list(fp_raw)[:self.fp_length] if len(arr) < self.fp_length: arr = arr + [0] * (self.fp_length - len(arr)) fp_vec = torch.tensor(arr, dtype=torch.long) elif isinstance(fp_raw, torch.Tensor): fp_vec = fp_raw.clone().to(torch.long) else: fp_vec = torch.zeros((self.fp_length,), dtype=torch.long) # ---- PSMILES tensors ---- psm_raw = sample.get("psmiles_raw", "") or "" enc = self.tokenizer(psm_raw, truncation=True, padding="max_length", max_length=self.psmiles_max_len) p_input_ids = torch.tensor(enc["input_ids"], dtype=torch.long) p_attn = torch.tensor(enc["attention_mask"], dtype=torch.bool) return { "gine": { "z": gine_item["z"], "chirality": gine_item["chirality"], "formal_charge": gine_item["formal_charge"], "edge_index": gine_item["edge_index"], "edge_attr": gine_item["edge_attr"], "num_nodes": int(gine_item["z"].size(0)) if gine_item["z"].numel() > 0 else 0, }, "schnet": {"z": schnet_item["z"], "pos": schnet_item["pos"]}, "fp": {"input_ids": fp_vec}, "psmiles": {"input_ids": p_input_ids, "attention_mask": p_attn}, } def multimodal_collate(batch_list: List[Dict[str, Dict[str, torch.Tensor]]]) -> Dict[str, Dict[str, 
torch.Tensor]]: """ Collate a list of LazyMultimodalDataset samples into a single multimodal batch. Output keys: - gine: {z, chirality, formal_charge, edge_index, edge_attr, batch} - schnet: {z, pos, batch} - fp: {input_ids, attention_mask} - psmiles: {input_ids, attention_mask} """ # ---- GINE batching ---- all_z, all_ch, all_fc = [], [], [] all_edge_index, all_edge_attr = [], [] batch_mapping = [] node_offset = 0 for i, item in enumerate(batch_list): g = item["gine"] z = g["z"] n = z.size(0) all_z.append(z) all_ch.append(g["chirality"]) all_fc.append(g["formal_charge"]) batch_mapping.append(torch.full((n,), i, dtype=torch.long)) if g["edge_index"] is not None and g["edge_index"].numel() > 0: ei_offset = g["edge_index"] + node_offset all_edge_index.append(ei_offset) ea = match_edge_attr_to_index(g["edge_index"], g["edge_attr"], target_dim=3) all_edge_attr.append(ea) node_offset += n if len(all_z) == 0: z_batch = torch.tensor([], dtype=torch.long) ch_batch = torch.tensor([], dtype=torch.float) fc_batch = torch.tensor([], dtype=torch.float) batch_batch = torch.tensor([], dtype=torch.long) edge_index_batched = torch.empty((2, 0), dtype=torch.long) edge_attr_batched = torch.zeros((0, 3), dtype=torch.float) else: z_batch = torch.cat(all_z, dim=0) ch_batch = torch.cat(all_ch, dim=0) fc_batch = torch.cat(all_fc, dim=0) batch_batch = torch.cat(batch_mapping, dim=0) if len(all_edge_index) > 0: edge_index_batched = torch.cat(all_edge_index, dim=1) edge_attr_batched = torch.cat(all_edge_attr, dim=0) else: edge_index_batched = torch.empty((2, 0), dtype=torch.long) edge_attr_batched = torch.zeros((0, 3), dtype=torch.float) # ---- SchNet batching ---- all_sz, all_pos, schnet_batch = [], [], [] for i, item in enumerate(batch_list): s = item["schnet"] s_z = s["z"] s_pos = s["pos"] if s_z.numel() == 0: continue all_sz.append(s_z) all_pos.append(s_pos) schnet_batch.append(torch.full((s_z.size(0),), i, dtype=torch.long)) if len(all_sz) == 0: s_z_batch = torch.tensor([], 
dtype=torch.long) s_pos_batch = torch.tensor([], dtype=torch.float) s_batch_batch = torch.tensor([], dtype=torch.long) else: s_z_batch = torch.cat(all_sz, dim=0) s_pos_batch = torch.cat(all_pos, dim=0) s_batch_batch = torch.cat(schnet_batch, dim=0) # ---- FP batching ---- fp_ids = torch.stack( [ item["fp"]["input_ids"] if isinstance(item["fp"]["input_ids"], torch.Tensor) else torch.tensor(item["fp"]["input_ids"], dtype=torch.long) for item in batch_list ], dim=0 ) fp_attn = torch.ones_like(fp_ids, dtype=torch.bool) # ---- PSMILES batching ---- p_ids = torch.stack([item["psmiles"]["input_ids"] for item in batch_list], dim=0) p_attn = torch.stack([item["psmiles"]["attention_mask"] for item in batch_list], dim=0) return { "gine": { "z": z_batch, "chirality": ch_batch, "formal_charge": fc_batch, "edge_index": edge_index_batched, "edge_attr": edge_attr_batched, "batch": batch_batch, }, "schnet": {"z": s_z_batch, "pos": s_pos_batch, "batch": s_batch_batch}, "fp": {"input_ids": fp_ids, "attention_mask": fp_attn}, "psmiles": {"input_ids": p_ids, "attention_mask": p_attn}, } def build_dataloaders( sample_files: List[str], tokenizer, train_batch_size: int, eval_batch_size: int, seed: int = 42 ) -> Tuple[DataLoader, DataLoader, torch.utils.data.Subset, torch.utils.data.Subset]: """ Create train/val subsets and corresponding DataLoaders. 
""" dataset = LazyMultimodalDataset(sample_files, tokenizer, fp_length=FP_LENGTH, psmiles_max_len=PSMILES_MAX_LEN) train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=seed) train_subset = torch.utils.data.Subset(dataset, train_idx) val_subset = torch.utils.data.Subset(dataset, val_idx) train_loader = DataLoader( train_subset, batch_size=train_batch_size, shuffle=True, collate_fn=multimodal_collate, num_workers=0, drop_last=False, ) val_loader = DataLoader( val_subset, batch_size=eval_batch_size, shuffle=False, collate_fn=multimodal_collate, num_workers=0, drop_last=False, ) return train_loader, val_loader, train_subset, val_subset # ============================================================================= # Multimodal contrastive model # ============================================================================= class MultimodalContrastiveModel(nn.Module): """ Wraps unimodal encoders and computes: - InfoNCE between masked modality embedding vs mean anchor of other modalities - Optional reconstruction losses for masked tokens/atoms when labels are present """ def __init__( self, gine_encoder: Optional[GineEncoder], schnet_encoder: Optional[NodeSchNetWrapper], fp_encoder: Optional[FingerprintEncoder], psmiles_encoder: Optional[PSMILESDebertaEncoder], emb_dim: int = 600, ): super().__init__() self.gine = gine_encoder self.schnet = schnet_encoder self.fp = fp_encoder self.psmiles = psmiles_encoder self.proj_gine = nn.Linear(getattr(self.gine, "pool_proj").out_features if self.gine is not None else emb_dim, emb_dim) if self.gine is not None else None self.proj_schnet = nn.Linear(getattr(self.schnet, "pool_proj").out_features if self.schnet is not None else emb_dim, emb_dim) if self.schnet is not None else None self.proj_fp = nn.Linear(getattr(self.fp, "pool_proj").out_features if self.fp is not None else emb_dim, emb_dim) if self.fp is not None else None self.proj_psmiles = nn.Linear(getattr(self.psmiles, 
"pool_proj").out_features if self.psmiles is not None else emb_dim, emb_dim) if self.psmiles is not None else None self.temperature = TEMPERATURE self.ce_loss = nn.CrossEntropyLoss(ignore_index=-100, reduction="mean") def encode(self, batch_mods: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """Compute normalized projected embeddings for available modalities.""" embs = {} if "gine" in batch_mods and self.gine is not None: g = batch_mods["gine"] emb_g = self.gine(g["z"], g["chirality"], g["formal_charge"], g["edge_index"], g["edge_attr"], g.get("batch", None)) embs["gine"] = F.normalize(self.proj_gine(emb_g), dim=-1) if "schnet" in batch_mods and self.schnet is not None: s = batch_mods["schnet"] emb_s = self.schnet(s["z"], s["pos"], s.get("batch", None)) embs["schnet"] = F.normalize(self.proj_schnet(emb_s), dim=-1) if "fp" in batch_mods and self.fp is not None: f = batch_mods["fp"] emb_f = self.fp(f["input_ids"], f.get("attention_mask", None)) embs["fp"] = F.normalize(self.proj_fp(emb_f), dim=-1) if "psmiles" in batch_mods and self.psmiles is not None: p = batch_mods["psmiles"] emb_p = self.psmiles(p["input_ids"], p.get("attention_mask", None)) embs["psmiles"] = F.normalize(self.proj_psmiles(emb_p), dim=-1) return embs def forward(self, batch_mods: Dict[str, torch.Tensor], mask_target: str): """ Compute total loss = InfoNCE + REC_LOSS_WEIGHT * reconstruction_loss """ device = next(self.parameters()).device embs = self.encode(batch_mods) info = {} if mask_target not in embs: return torch.tensor(0.0, device=device), {"batch_size": 0} target = embs[mask_target] other_keys = [k for k in embs.keys() if k != mask_target] if len(other_keys) == 0: return torch.tensor(0.0, device=device), {"batch_size": target.size(0)} anchor = torch.stack([embs[k] for k in other_keys], dim=0).mean(dim=0) logits = torch.matmul(anchor, target.T) / self.temperature B = logits.size(0) labels = torch.arange(B, device=logits.device) info_nce_loss = F.cross_entropy(logits, labels) 
info["info_nce_loss"] = float(info_nce_loss.detach().cpu().item()) # Optional reconstruction terms rec_losses = [] rec_details = {} # GINE node reconstruction (atomic ids) if labels present try: if "gine" in batch_mods and self.gine is not None: gm = batch_mods["gine"] labels_nodes = gm.get("labels", None) if labels_nodes is not None: node_logits = self.gine.node_logits(gm["z"], gm["chirality"], gm["formal_charge"], gm["edge_index"], gm["edge_attr"]) if labels_nodes.dim() == 1 and node_logits.size(0) == labels_nodes.size(0): loss_gine = self.ce_loss(node_logits, labels_nodes.to(node_logits.device)) rec_losses.append(loss_gine) rec_details["gine_rec_loss"] = float(loss_gine.detach().cpu().item()) except Exception as e: print("Warning: GINE reconstruction loss computation failed:", e) # SchNet node reconstruction if labels present try: if "schnet" in batch_mods and self.schnet is not None: sm = batch_mods["schnet"] labels_nodes = sm.get("labels", None) if labels_nodes is not None: node_logits = self.schnet.node_logits(sm["z"], sm["pos"], sm.get("batch", None)) if labels_nodes.dim() == 1 and node_logits.size(0) == labels_nodes.size(0): loss_schnet = self.ce_loss(node_logits, labels_nodes.to(node_logits.device)) rec_losses.append(loss_schnet) rec_details["schnet_rec_loss"] = float(loss_schnet.detach().cpu().item()) except Exception as e: print("Warning: SchNet reconstruction loss computation failed:", e) # FP token reconstruction if labels present try: if "fp" in batch_mods and self.fp is not None: fm = batch_mods["fp"] labels_fp = fm.get("labels", None) if labels_fp is not None: token_logits = self.fp.token_logits(fm["input_ids"], fm.get("attention_mask", None)) Bf, Lf, V = token_logits.shape logits2 = token_logits.view(-1, V) labels2 = labels_fp.view(-1).to(logits2.device) loss_fp = self.ce_loss(logits2, labels2) rec_losses.append(loss_fp) rec_details["fp_rec_loss"] = float(loss_fp.detach().cpu().item()) except Exception as e: print("Warning: FP reconstruction loss 
computation failed:", e) # PSMILES MLM loss if labels present try: if "psmiles" in batch_mods and self.psmiles is not None: pm = batch_mods["psmiles"] labels_ps = pm.get("labels", None) if labels_ps is not None: loss_ps = self.psmiles.token_logits(pm["input_ids"], pm.get("attention_mask", None), labels=labels_ps) if isinstance(loss_ps, torch.Tensor): rec_losses.append(loss_ps) rec_details["psmiles_mlm_loss"] = float(loss_ps.detach().cpu().item()) except Exception as e: print("Warning: PSMILES MLM loss computation failed:", e) if len(rec_losses) > 0: rec_loss_total = sum(rec_losses) / len(rec_losses) info["reconstruction_loss"] = float(rec_loss_total.detach().cpu().item()) total_loss = info_nce_loss + REC_LOSS_WEIGHT * rec_loss_total info["total_loss"] = float(total_loss.detach().cpu().item()) info.update(rec_details) else: total_loss = info_nce_loss info["reconstruction_loss"] = 0.0 info["total_loss"] = float(total_loss.detach().cpu().item()) return total_loss, info # ============================================================================= # Masking utilities # ============================================================================= def mask_batch_for_modality(batch: dict, modality: str, tokenizer, p_mask: float = P_MASK) -> dict: """ Apply BERT-style masking to the selected modality and attach `labels`. 
""" b = {} # ---------------- GINE ---------------- if "gine" in batch: z = batch["gine"]["z"].clone() chir = batch["gine"]["chirality"].clone() fc = batch["gine"]["formal_charge"].clone() edge_index = batch["gine"]["edge_index"] edge_attr = batch["gine"]["edge_attr"] batch_map = batch["gine"].get("batch", None) n_nodes = z.size(0) dev = z.device is_selected = torch.rand(n_nodes, device=dev) < p_mask if is_selected.numel() > 0 and is_selected.all(): is_selected[torch.randint(0, n_nodes, (1,), device=dev)] = False labels_z = torch.full_like(z, fill_value=-100) if is_selected.any(): sel_idx = torch.nonzero(is_selected).squeeze(-1) if sel_idx.dim() == 0: sel_idx = sel_idx.unsqueeze(0) labels_z[is_selected] = z[is_selected] rand_atomic = torch.randint(1, MAX_ATOMIC_Z + 1, (sel_idx.size(0),), dtype=torch.long, device=dev) probs = torch.rand(sel_idx.size(0), device=dev) mask_choice = probs < 0.8 rand_choice = (probs >= 0.8) & (probs < 0.9) if mask_choice.any(): z[sel_idx[mask_choice]] = MASK_ATOM_ID if rand_choice.any(): z[sel_idx[rand_choice]] = rand_atomic[rand_choice] b["gine"] = { "z": z, "chirality": chir, "formal_charge": fc, "edge_index": edge_index, "edge_attr": edge_attr, "batch": batch_map, "labels": labels_z, } # ---------------- SchNet ---------------- if "schnet" in batch: z = batch["schnet"]["z"].clone() pos = batch["schnet"]["pos"].clone() batch_map = batch["schnet"].get("batch", None) n_nodes = z.size(0) dev = z.device is_selected = torch.rand(n_nodes, device=dev) < p_mask if is_selected.numel() > 0 and is_selected.all(): is_selected[torch.randint(0, n_nodes, (1,), device=dev)] = False labels_z = torch.full((n_nodes,), -100, dtype=torch.long, device=dev) if is_selected.any(): sel_idx = torch.nonzero(is_selected).squeeze(-1) if sel_idx.dim() == 0: sel_idx = sel_idx.unsqueeze(0) labels_z[is_selected] = z[is_selected] probs_c = torch.rand(sel_idx.size(0), device=dev) noisy_choice = probs_c < 0.8 randpos_choice = (probs_c >= 0.8) & (probs_c < 0.9) if 
noisy_choice.any(): idx = sel_idx[noisy_choice] noise = torch.randn((idx.size(0), 3), device=pos.device) * 0.5 pos[idx] = pos[idx] + noise if randpos_choice.any(): idx = sel_idx[randpos_choice] mins = pos.min(dim=0).values maxs = pos.max(dim=0).values randpos = (torch.rand((idx.size(0), 3), device=pos.device) * (maxs - mins)) + mins pos[idx] = randpos b["schnet"] = {"z": z, "pos": pos, "batch": batch_map, "labels": labels_z} # ---------------- FP ---------------- if "fp" in batch: input_ids = batch["fp"]["input_ids"].clone() attn = batch["fp"].get("attention_mask", torch.ones_like(input_ids, dtype=torch.bool)) B, L = input_ids.shape dev = input_ids.device labels_z = torch.full_like(input_ids, -100) for i in range(B): sel = torch.rand(L, device=dev) < p_mask if sel.numel() > 0 and sel.all(): sel[torch.randint(0, L, (1,), device=dev)] = False sel_idx = torch.nonzero(sel).squeeze(-1) if sel_idx.numel() > 0: if sel_idx.dim() == 0: sel_idx = sel_idx.unsqueeze(0) labels_z[i, sel_idx] = input_ids[i, sel_idx] probs = torch.rand(sel_idx.size(0), device=dev) mask_choice = probs < 0.8 rand_choice = (probs >= 0.8) & (probs < 0.9) if mask_choice.any(): input_ids[i, sel_idx[mask_choice]] = MASK_TOKEN_ID_FP if rand_choice.any(): rand_bits = torch.randint(0, 2, (rand_choice.sum().item(),), dtype=torch.long, device=dev) input_ids[i, sel_idx[rand_choice]] = rand_bits b["fp"] = {"input_ids": input_ids, "attention_mask": attn, "labels": labels_z} # ---------------- PSMILES ---------------- if "psmiles" in batch: input_ids = batch["psmiles"]["input_ids"].clone() attn = batch["psmiles"]["attention_mask"].clone() B, L = input_ids.shape dev = input_ids.device labels_z = torch.full_like(input_ids, -100) # If tokenizer is unavailable, keep labels=-100 (no MLM loss) if tokenizer is None: b["psmiles"] = {"input_ids": input_ids, "attention_mask": attn, "labels": labels_z} else: mask_token_id = tokenizer.mask_token_id if getattr(tokenizer, "mask_token_id", None) is not None else 
getattr(tokenizer, "vocab", {}).get("", 1) for i in range(B): sel = torch.rand(L, device=dev) < p_mask if sel.numel() > 0 and sel.all(): sel[torch.randint(0, L, (1,), device=dev)] = False sel_idx = torch.nonzero(sel).squeeze(-1) if sel_idx.numel() > 0: if sel_idx.dim() == 0: sel_idx = sel_idx.unsqueeze(0) labels_z[i, sel_idx] = input_ids[i, sel_idx] probs = torch.rand(sel_idx.size(0), device=dev) mask_choice = probs < 0.8 rand_choice = (probs >= 0.8) & (probs < 0.9) if mask_choice.any(): input_ids[i, sel_idx[mask_choice]] = mask_token_id if rand_choice.any(): rand_ids = torch.randint(0, getattr(tokenizer, "vocab_size", 300), (rand_choice.sum().item(),), dtype=torch.long, device=dev) input_ids[i, sel_idx[rand_choice]] = rand_ids b["psmiles"] = {"input_ids": input_ids, "attention_mask": attn, "labels": labels_z} return b def mm_batch_to_model_input(masked_batch: dict) -> dict: """ Normalize the masked batch dict into the exact structure expected by MultimodalContrastiveModel. """ mm = {} if "gine" in masked_batch: gm = masked_batch["gine"] mm["gine"] = { "z": gm["z"], "chirality": gm["chirality"], "formal_charge": gm["formal_charge"], "edge_index": gm["edge_index"], "edge_attr": gm["edge_attr"], "batch": gm.get("batch", None), "labels": gm.get("labels", None), } if "schnet" in masked_batch: sm = masked_batch["schnet"] mm["schnet"] = {"z": sm["z"], "pos": sm["pos"], "batch": sm.get("batch", None), "labels": sm.get("labels", None)} if "fp" in masked_batch: fm = masked_batch["fp"] mm["fp"] = {"input_ids": fm["input_ids"], "attention_mask": fm.get("attention_mask", None), "labels": fm.get("labels", None)} if "psmiles" in masked_batch: pm = masked_batch["psmiles"] mm["psmiles"] = {"input_ids": pm["input_ids"], "attention_mask": pm.get("attention_mask", None), "labels": pm.get("labels", None)} return mm # ============================================================================= # Evaluation # ============================================================================= 
def evaluate_multimodal(model: MultimodalContrastiveModel, val_loader: DataLoader, device: torch.device, tokenizer, mask_target: str = "fp") -> Dict[str, float]:
    """
    Contrastive-only evaluation loop.

    For every validation batch the `mask_target` modality is masked, all
    modality embeddings are computed, and an InfoNCE matching problem is
    scored: logits = mean(other-modality embeddings) @ target.T / temperature,
    with the diagonal as ground truth.

    Returns example-weighted averages:
    eval_loss, eval_accuracy (top-1), eval_f1_weighted.
    """
    model.eval()
    loss_total = 0.0
    example_total = 0
    acc_total = 0.0
    f1_total = 0.0

    with torch.no_grad():
        for raw_batch in val_loader:
            masked = mask_batch_for_modality(raw_batch, mask_target, tokenizer=tokenizer, p_mask=P_MASK)

            # Move every tensor leaf onto the evaluation device in place.
            for modality_dict in masked.values():
                for name, value in modality_dict.items():
                    if isinstance(value, torch.Tensor):
                        modality_dict[name] = value.to(device)

            embeddings = model.encode(mm_batch_to_model_input(masked))
            if mask_target not in embeddings:
                continue
            anchors = [v for k, v in embeddings.items() if k != mask_target]
            if not anchors:
                continue

            target = embeddings[mask_target]
            anchor = torch.stack(anchors, dim=0).mean(dim=0)
            logits = torch.matmul(anchor, target.T) / model.temperature
            n = logits.size(0)
            truth = torch.arange(n, device=logits.device)

            loss_total += F.cross_entropy(logits, truth).item() * n
            example_total += n

            predictions = logits.argmax(dim=1)
            batch_acc = (predictions == truth).float().mean().item()
            acc_total += batch_acc * n

            # Weighted F1 over instance IDs; fall back to accuracy when the
            # batch is degenerate (a single class) or scoring fails.
            try:
                truth_np = truth.cpu().numpy()
                pred_np = predictions.cpu().numpy()
                if len(np.unique(truth_np)) < 2:
                    batch_f1 = float(batch_acc)
                else:
                    batch_f1 = f1_score(truth_np, pred_np, average="weighted")
            except Exception:
                batch_f1 = float(batch_acc)
            f1_total += batch_f1 * n

    if example_total == 0:
        return {"eval_loss": float("nan"), "eval_accuracy": 0.0, "eval_f1_weighted": 0.0}
    return {
        "eval_loss": loss_total / example_total,
        "eval_accuracy": acc_total / example_total,
        "eval_f1_weighted": f1_total / example_total,
    }


# =============================================================================
# HF wrapper + collator + trainer
#
# =============================================================================


class HFMultimodalModule(nn.Module):
    """
    HuggingFace Trainer-facing wrapper:
    - Receives a full multimodal batch
    - Randomly masks one modality (provided by collator) inside forward
    - Returns a dict compatible with Trainer (loss, logits, labels)
    """

    def __init__(self, mm_model: MultimodalContrastiveModel, tokenizer):
        super().__init__()
        # Underlying contrastive model; registered submodule so parameters
        # are visible to the Trainer/optimizer.
        self.mm = mm_model
        # Tokenizer used by mask_batch_for_modality for PSMILES masking.
        self._tokenizer = tokenizer

    def forward(self, **kwargs):
        """
        Run one contrastive step.

        Accepts either a pre-packed {"batch": ..., "mask_target": ...} dict
        (as produced by ContrastiveDataCollator) or the modality sub-dicts
        spread directly into kwargs. Returns a dict with at least "loss" and
        "eval_loss"; "logits"/"labels" are attached best-effort for HF metric
        computation, and "mm_info" carries auxiliary info from the model.
        """
        if "batch" in kwargs:
            batch = kwargs["batch"]
            mask_target = kwargs.get("mask_target", "fp")
        else:
            # The Trainer may have unpacked the collated dict into kwargs.
            modality_keys = ["gine", "schnet", "fp", "psmiles"]
            found = {k: v for k, v in kwargs.items() if k in modality_keys}
            if len(found) > 0:
                batch = {k: found[k] for k in found}
                mask_target = kwargs.get("mask_target", "fp")
            else:
                raise ValueError(
                    "HFMultimodalModule.forward could not find 'batch' nor modality keys in inputs. "
                    f"Inputs keys: {list(kwargs.keys())}"
                )
        masked_batch = mask_batch_for_modality(batch, mask_target, tokenizer=self._tokenizer, p_mask=P_MASK)
        device = next(self.parameters()).device
        # Move every tensor in the (nested) masked batch onto the model device.
        for k in masked_batch:
            for subk in list(masked_batch[k].keys()):
                val = masked_batch[k][subk]
                if isinstance(val, torch.Tensor):
                    masked_batch[k][subk] = val.to(device)
        mm_in = mm_batch_to_model_input(masked_batch)
        loss, info = self.mm(mm_in, mask_target)
        logits = None
        labels = None
        # Best-effort: recompute InfoNCE logits (anchor @ target.T / T)
        # without gradients so HF can log classification-style metrics.
        # Failures here must never break training, hence the broad except.
        try:
            with torch.no_grad():
                embs = self.mm.encode(mm_in)
                if mask_target in embs:
                    target = embs[mask_target]
                    other_keys = [k for k in embs.keys() if k != mask_target]
                    if len(other_keys) > 0:
                        anchor = torch.stack([embs[k] for k in other_keys], dim=0).mean(dim=0)
                        logits = torch.matmul(anchor, target.T) / self.mm.temperature
                        B = logits.size(0)
                        labels = torch.arange(B, device=logits.device)
        except Exception as e:
            print("Warning: failed to compute logits/labels inside HFMultimodalModule.forward:", e)
            logits = None
            labels = None
        # Detached copy so Trainer's eval accumulation never holds the graph.
        eval_loss = loss.detach() if isinstance(loss, torch.Tensor) else torch.tensor(float(loss), device=device)
        out = {"loss": loss, "eval_loss": eval_loss}
        if logits is not None:
            out["logits"] = logits
        if labels is not None:
            out["labels"] = labels
        out["mm_info"] = info
        return out


class ContrastiveDataCollator:
    """
    Collator used by Trainer:
    - If given raw samples (list of dicts), it calls multimodal_collate
    - Then selects a random modality to mask (mask_target)
    """

    def __init__(self, mask_prob: float = P_MASK, modalities: Optional[List[str]] = None):
        # NOTE(review): mask_prob is stored but masking itself uses the module
        # constant P_MASK inside HFMultimodalModule.forward — confirm intended.
        self.mask_prob = mask_prob
        self.modalities = modalities if modalities is not None else ["gine", "schnet", "fp", "psmiles"]

    def __call__(self, features):
        """Return {"batch": collated_dict, "mask_target": modality_name}."""
        # Case 1: already a collated multimodal dict.
        if isinstance(features, dict):
            collated = features
            mask_target = random.choice([m for m in self.modalities if m in collated])
            return {"batch": collated, "mask_target": mask_target}
        if isinstance(features, (list, tuple)) and len(features) > 0:
            first = features[0]
            # Case 2: list of raw per-sample dicts -> collate them here.
            if isinstance(first, dict) and "gine" in first:
                collated = multimodal_collate(list(features))
                mask_target = random.choice([m for m in self.modalities if m in collated])
                return {"batch": collated, "mask_target": mask_target}
            # Case 3: list whose first element is already a packed batch.
            if isinstance(first, dict) and "batch" in first:
                collated = first["batch"]
                mask_target = first.get("mask_target", random.choice([m for m in self.modalities if m in collated]))
                return {"batch": collated, "mask_target": mask_target}
        print("ContrastiveDataCollator received unexpected 'features' shape/type.")
        raise ValueError("ContrastiveDataCollator could not collate input. Expected list[dict] with 'gine' key or already-collated dict.")


class VerboseTrainingCallback(TrainerCallback):
    """
    Console-first training callback with early stopping on eval_loss.
    """

    def __init__(self, patience: int = 10):
        self.start_time = time.time()        # wall-clock start of the whole run
        self.epoch_start_time = time.time()  # reset at each epoch begin
        self._last_train_loss = None         # last non-zero train loss from log_history
        self.best_val_loss = float("inf")
        self.best_epoch = 0
        self.epochs_no_improve = 0
        self.patience = patience             # early-stopping patience, in epochs
        self.trainer_ref = None              # wired up externally after Trainer construction

    def save_best_model(self, output_dir_suffix: str = "best"):
        """Persist the current model under OUTPUT_DIR/<suffix> via the trainer."""
        if self.trainer_ref is None:
            return
        try:
            ckpt_dir = os.path.join(OUTPUT_DIR, output_dir_suffix)
            os.makedirs(ckpt_dir, exist_ok=True)
            # Uses the trainer's private _save (CLTrainer overrides it below).
            self.trainer_ref._save(ckpt_dir)
            print(f"Saved best model checkpoint to {ckpt_dir}")
        except Exception as e:
            try:
                self.trainer_ref.save_model(os.path.join(OUTPUT_DIR, output_dir_suffix))
                print(f"Saved best model (fallback) to {os.path.join(OUTPUT_DIR, output_dir_suffix)}")
            except Exception as e2:
                print("Warning: failed to save best model:", e, e2)

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        print("=" * 80)
        print(" STARTING MULTIMODAL CONTRASTIVE LEARNING TRAINING")
        print("=" * 80)
        model = kwargs.get("model")
        if model is not None:
            total_params = sum(p.numel() for p in model.parameters())
            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            non_trainable_params = total_params - trainable_params
            print(" MODEL PARAMETERS:")
            print(f" Total Parameters: {total_params:,}")
            print(f" Trainable Parameters: {trainable_params:,}")
            print(f" Non-trainable Parameters: {non_trainable_params:,}")
        print(f" Training Progress: 0/{args.num_train_epochs} epochs")
        print("=" * 80)

    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time.time()
        current_epoch = state.epoch if state is not None else 0.0
        print(f" Epoch {current_epoch + 1:.1f}/{args.num_train_epochs} Starting...")

    def on_epoch_end(self, args, state, control, **kwargs):
        # Remember the most recent non-zero train loss for on_evaluate reporting.
        train_loss = None
        for log in reversed(state.log_history):
            if isinstance(log, dict) and "loss" in log and float(log.get("loss", 0)) != 0.0:
                train_loss = log["loss"]
                break
        self._last_train_loss = train_loss

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            current_step = state.global_step
            current_epoch = state.epoch
            # NOTE(review): relies on the module-level global `train_loader`
            # being set by main() before training starts — confirm.
            try:
                steps_per_epoch = max(1, len(train_loader) // args.gradient_accumulation_steps)
            except Exception:
                steps_per_epoch = 1
            # Print roughly ten progress lines per epoch.
            if current_step % max(1, steps_per_epoch // 10) == 0:
                progress = current_epoch + (current_step % steps_per_epoch) / steps_per_epoch
                print(f" Step {current_step:4d} | Epoch {progress:.1f} | Train Loss: {logs['loss']:.6f}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        current_epoch = state.epoch if state is not None else 0.0
        epoch_time = time.time() - self.epoch_start_time
        hf_metrics = metrics if metrics is not None else kwargs.get("metrics", None)
        hf_eval_loss = None
        hf_train_loss = self._last_train_loss
        if hf_metrics is not None:
            hf_eval_loss = hf_metrics.get("eval_loss", hf_metrics.get("loss", None))
            if hf_train_loss is None:
                hf_train_loss = hf_metrics.get("train_loss", hf_train_loss)
        # Also run the contrastive evaluator (reads globals set in main()).
        cl_metrics = {}
        try:
            model = kwargs.get("model", None)
            if model is not None:
                cl_model = model.mm if hasattr(model, "mm") else model
                cl_metrics = evaluate_multimodal(cl_model, val_loader, device, tokenizer, mask_target="fp")
            else:
                cl_metrics = evaluate_multimodal(multimodal_model, val_loader, device, tokenizer, mask_target="fp")
        except Exception as e:
            print("Warning: evaluate_multimodal inside callback failed:", e)
        if hf_eval_loss is None:
            hf_eval_loss = cl_metrics.get("eval_loss", None)
        val_acc = cl_metrics.get("eval_accuracy", "N/A")
        val_f1 = cl_metrics.get("eval_f1_weighted", "N/A")
        print(f" EPOCH {current_epoch + 1:.1f} RESULTS:")
        # The try/except pairs below guard the ":.6f" format against
        # non-numeric values (e.g. None or strings in the metrics dict).
        if hf_train_loss is not None:
            try:
                print(f" Train Loss (HF reported): {hf_train_loss:.6f}")
            except Exception:
                print(f" Train Loss (HF reported): {hf_train_loss}")
        else:
            print(" Train Loss (HF reported): N/A")
        if hf_eval_loss is not None:
            try:
                print(f" Eval Loss (HF reported): {hf_eval_loss:.6f}")
            except Exception:
                print(f" Eval Loss (HF reported): {hf_eval_loss}")
        else:
            print(" Eval Loss (HF reported): N/A")
        if isinstance(val_acc, float):
            print(f" Eval Acc (CL evaluator): {val_acc:.6f}")
        else:
            print(f" Eval Acc (CL evaluator): {val_acc}")
        if isinstance(val_f1, float):
            print(f" Eval F1 Weighted (CL evaluator): {val_f1:.6f}")
        else:
            print(f" Eval F1 Weighted (CL evaluator): {val_f1}")
        # Early stopping on eval loss with a tiny improvement margin (1e-6).
        current_val = hf_eval_loss if hf_eval_loss is not None else float("inf")
        if current_val < self.best_val_loss - 1e-6:
            self.best_val_loss = current_val
            self.best_epoch = current_epoch
            self.epochs_no_improve = 0
            try:
                self.save_best_model("best")
            except Exception as e:
                print("Warning: saving best model failed:", e)
        else:
            self.epochs_no_improve += 1
            if self.epochs_no_improve >= self.patience:
                print(f"Early stopping: no improvement in val_loss for {self.patience} epochs.")
                control.should_training_stop = True
        print(f" Epoch Training Time: {epoch_time:.2f}s")
        print(f" Best Val Loss so far: {self.best_val_loss}")
        print(f" Epochs since improvement: {self.epochs_no_improve}/{self.patience}")
        print("-" * 50)

    def on_train_end(self, args, state, control, **kwargs):
        total_time = time.time() - self.start_time
        print("=" * 80)
        print(" TRAINING COMPLETED")
        print("=" * 80)
        print(f" Total Training Time: {total_time:.2f}s")
        if state is not None:
            try:
                print(f" Total Epochs Completed: {state.epoch + 1:.1f}")
            except Exception:
                pass
        print("=" * 80)


class CLTrainer(Trainer):
    """
    Custom Trainer:
    - evaluate(): merges HF eval with contrastive evaluator
    - _save(): saves a state_dict under pytorch_model.bin
    - _load_best_model(): loads best pytorch_model.bin
    """

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run HF evaluation, then merge in the contrastive evaluator's metrics."""
        try:
            metrics = super().evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) or {}
        except Exception as e:
            # HF evaluation failed entirely -> fall back to the CL evaluator.
            print("Warning: super().evaluate() raised an exception. Falling back to CL-only evaluator.")
            import traceback
            traceback.print_exc()
            try:
                cl_model = self.model.mm if hasattr(self.model, "mm") else self.model
                cl_metrics = evaluate_multimodal(cl_model, val_loader, device, tokenizer, mask_target="fp")
                metrics = {k: float(v) if isinstance(v, (float, int, np.floating, np.integer)) else v for k, v in cl_metrics.items()}
                metrics["epoch"] = float(self.state.epoch) if getattr(self.state, "epoch", None) is not None else metrics.get("epoch", 0.0)
            except Exception as e2:
                print("Fallback evaluate_multimodal failed as well:", e2)
                traceback.print_exc()
                metrics = {"eval_loss": float("nan"), "epoch": float(self.state.epoch) if getattr(self.state, "epoch", None) is not None else 0.0}
            return metrics
        # HF succeeded: overlay the CL evaluator's metrics on top.
        try:
            cl_model = self.model.mm if hasattr(self.model, "mm") else self.model
            cl_metrics = evaluate_multimodal(cl_model, val_loader, device, tokenizer, mask_target="fp")
        except Exception as e:
            print("Warning: evaluate_multimodal failed inside CLTrainer.evaluate():", e)
            cl_metrics = {}
        for k, v in cl_metrics.items():
            try:
                metrics[k] = float(v)
            except Exception:
                metrics[k] = v
        if "eval_loss" not in metrics and "eval_loss" in cl_metrics:
            try:
                metrics["eval_loss"] = float(cl_metrics["eval_loss"])
            except Exception:
                metrics["eval_loss"] = cl_metrics["eval_loss"]
        if "epoch" not in metrics:
            metrics["epoch"] = float(self.state.epoch) if getattr(self.state, "epoch", None) is not None else metrics.get("epoch", 0.0)
        return metrics

    def _save(self, output_dir: str):
        """Save trainer state, a raw state_dict (pytorch_model.bin), optimizer and scheduler."""
        os.makedirs(output_dir, exist_ok=True)
        try:
            self.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))
        except Exception:
            pass
        try:
            # Save the inner multimodal model, not the HF wrapper module.
            model_to_save = self.model.mm if hasattr(self.model, "mm") else self.model
            torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
        except Exception as e:
            try:
                if hasattr(self.model, "save_pretrained"):
                    self.model.save_pretrained(output_dir)
                else:
                    raise e
            except Exception as e2:
                print("Warning: failed to save model state_dict:", e2)
        # Optimizer/scheduler are best-effort; they may not exist yet.
        try:
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        except Exception:
            pass
        try:
            torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        except Exception:
            pass

    def _load_best_model(self):
        """Load the best checkpoint's pytorch_model.bin into the inner model (non-strict)."""
        best_ckpt = self.state.best_model_checkpoint
        if not best_ckpt:
            return
        candidate = os.path.join(best_ckpt, "pytorch_model.bin")
        if not os.path.exists(candidate):
            candidate = os.path.join(best_ckpt, "model.bin")
        if not os.path.exists(candidate):
            candidate = None
        if candidate is None:
            print(f"CLTrainer._load_best_model(): no compatible pytorch_model.bin found in {best_ckpt}; skipping load.")
            return
        try:
            state_dict = torch.load(candidate, map_location=self.args.device)
            model_to_load = self.model.mm if hasattr(self.model, "mm") else self.model
            model_to_load.load_state_dict(state_dict, strict=False)
            print(f"CLTrainer: loaded best model state_dict from {candidate}")
        except Exception as e:
            print("CLTrainer._load_best_model: failed to load state_dict using torch.load:", e)
            return


# =============================================================================
# Model construction + weight loading
# =============================================================================


def load_state_dict_if_present(model: nn.Module, ckpt_dir: str, filename: str = "pytorch_model.bin") -> None:
    """Load model weights if the checkpoint file exists."""
    path = os.path.join(ckpt_dir, filename)
    if os.path.exists(path):
        try:
            # strict=False tolerates missing/unexpected keys across versions.
            model.load_state_dict(torch.load(path, map_location="cpu"), strict=False)
            print(f"Loaded weights from {path}")
        except Exception as e:
            print(f"Could not load weights from {path}: {e}")


def build_models(device: torch.device) -> Tuple[MultimodalContrastiveModel, PSMILESDebertaEncoder]:
    """Instantiate unimodal encoders, optionally load best checkpoints, and assemble the multimodal model."""
    # GINE
    gine_encoder = GineEncoder(node_emb_dim=NODE_EMB_DIM, edge_emb_dim=EDGE_EMB_DIM, num_layers=NUM_GNN_LAYERS, max_atomic_z=MAX_ATOMIC_Z)
    load_state_dict_if_present(gine_encoder, BEST_GINE_DIR)
    gine_encoder.to(device)
    # SchNet
    schnet_encoder = NodeSchNetWrapper(
        hidden_channels=SCHNET_HIDDEN,
        num_interactions=SCHNET_NUM_INTERACTIONS,
        num_gaussians=SCHNET_NUM_GAUSSIANS,
        cutoff=SCHNET_CUTOFF,
        max_num_neighbors=SCHNET_MAX_NEIGHBORS,
    )
    load_state_dict_if_present(schnet_encoder, BEST_SCHNET_DIR)
    schnet_encoder.to(device)
    # Fingerprint encoder
    fp_encoder = FingerprintEncoder(
        vocab_size=VOCAB_SIZE_FP,
        hidden_dim=256,
        seq_len=FP_LENGTH,
        num_layers=4,
        nhead=8,
        dim_feedforward=1024,
        dropout=0.1,
    )
    load_state_dict_if_present(fp_encoder, BEST_FP_DIR)
    fp_encoder.to(device)
    # PSMILES / DeBERTa: prefer a locally saved checkpoint, else fresh init.
    psmiles_encoder = None
    if os.path.isdir(BEST_PSMILES_DIR):
        try:
            psmiles_encoder = PSMILESDebertaEncoder(model_dir_or_name=BEST_PSMILES_DIR)
            print("Loaded Deberta (PSMILES) from", BEST_PSMILES_DIR)
        except Exception as e:
            print("Failed to load Deberta from saved directory:", e)
    if psmiles_encoder is None:
        psmiles_encoder = PSMILESDebertaEncoder(model_dir_or_name=None)
    psmiles_encoder.to(device)
    multimodal_model = MultimodalContrastiveModel(gine_encoder, schnet_encoder, fp_encoder, psmiles_encoder, emb_dim=600)
    multimodal_model.to(device)
    return multimodal_model, psmiles_encoder


# =============================================================================
# Main execution
# =============================================================================


def main():
    """End-to-end entry point: data prep, model build, HF training, final eval."""
    # ---- setup ----
    ensure_dir(OUTPUT_DIR)
    ensure_dir(PREPROC_DIR)
    device_local = get_device()
    print("Device:", device_local)
    set_seed(42)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=25,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        eval_strategy="epoch",
        logging_steps=100,
        learning_rate=1e-4,
        weight_decay=0.01,
        eval_accumulation_steps=1000,
        fp16=torch.cuda.is_available(),
        save_strategy="epoch",
        save_steps=500,
        disable_tqdm=False,
        logging_first_step=True,
        report_to=[],
        dataloader_num_workers=0,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )
    # ---- data ----
    sample_files = prepare_or_load_data_streaming(
        csv_path=CSV_PATH,
        preproc_dir=PREPROC_DIR,
        target_rows=TARGET_ROWS,
        chunksize=CHUNKSIZE,
    )
    tokenizer_local = build_psmiles_tokenizer(spm_path=SPM_MODEL, max_len=PSMILES_MAX_LEN)
    # Expose shared state for the callback / evaluator, which read globals.
    global train_loader, val_loader, multimodal_model, device, tokenizer
    tokenizer = tokenizer_local
    device = device_local
    train_loader, val_loader, train_subset, val_subset = build_dataloaders(
        sample_files=sample_files,
        tokenizer=tokenizer_local,
        train_batch_size=training_args.per_device_train_batch_size,
        eval_batch_size=training_args.per_device_eval_batch_size,
        seed=42,
    )
    # ---- models ----
    multimodal_model, _psmiles_encoder = build_models(device_local)
    hf_model = HFMultimodalModule(multimodal_model, tokenizer_local).to(device_local)
    data_collator = ContrastiveDataCollator(mask_prob=P_MASK)
    callback = VerboseTrainingCallback(patience=10)
    trainer = CLTrainer(
        model=hf_model,
        args=training_args,
        train_dataset=train_subset,
        eval_dataset=val_subset,
        data_collator=data_collator,
        callbacks=[callback],
    )
    callback.trainer_ref = trainer
    # Force HF Trainer to use our prebuilt PyTorch DataLoaders
    trainer.get_train_dataloader = lambda dataset=None: train_loader
    trainer.get_eval_dataloader = lambda eval_dataset=None: val_loader
    # Optimizer
    # NOTE(review): this optimizer is created but never handed to the Trainer,
    # so HF builds its own internally — confirm whether this is intentional.
    _optimizer = torch.optim.AdamW(multimodal_model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
    total_params = sum(p.numel() for p in multimodal_model.parameters())
    trainable_params = sum(p.numel() for p in multimodal_model.parameters() if p.requires_grad)
    non_trainable_params = total_params - trainable_params
    print("\n MODEL PARAMETERS:")
    print(f" Total Parameters: {total_params:,}")
    print(f" Trainable Parameters: {trainable_params:,}")
    print(f" Non-trainable Parameters: {non_trainable_params:,}")
    # Clear GPU cache
    if torch.cuda.is_available():
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass
    # ---- Train ----
    training_start_time = time.time()
    trainer.train()
    training_end_time = time.time()
    # ---- Save best ----
    best_dir = os.path.join(OUTPUT_DIR, "best")
    os.makedirs(best_dir, exist_ok=True)
    try:
        best_ckpt = trainer.state.best_model_checkpoint
        if best_ckpt:
            multimodal_model.load_state_dict(torch.load(os.path.join(best_ckpt, "pytorch_model.bin"), map_location=device_local), strict=False)
            print(f"Loaded best checkpoint from {best_ckpt} into multimodal_model for final evaluation.")
        torch.save(multimodal_model.state_dict(), os.path.join(best_dir, "pytorch_model.bin"))
        print(f" Saved best multimodal model to {os.path.join(best_dir, 'pytorch_model.bin')}")
    except Exception as e:
        print("Warning: failed to load/save best model from Trainer:", e)
    # ---- Final Evaluation ----
    final_metrics = {}
    try:
        if trainer.state.best_model_checkpoint:
            trainer._load_best_model()
            final_metrics = trainer.evaluate(eval_dataset=val_subset)
        else:
            final_metrics = evaluate_multimodal(multimodal_model, val_loader, device_local, tokenizer_local, mask_target="fp")
    except Exception as e:
        print("Warning: final evaluation via trainer.evaluate failed, falling back to direct evaluate_multimodal:", e)
        final_metrics = evaluate_multimodal(multimodal_model, val_loader, device_local, tokenizer_local, mask_target="fp")
    print("\n" + "=" * 80)
    print(" FINAL TRAINING RESULTS")
    print("=" * 80)
    print(f"Total Training Time: {training_end_time - training_start_time:.2f}s")
    best_ckpt = trainer.state.best_model_checkpoint if hasattr(trainer.state, "best_model_checkpoint") else None
    print(f"Best Checkpoint: {best_ckpt if best_ckpt else '(none saved)'}")
    hf_eval_loss = final_metrics.get("eval_loss", float("nan"))
    hf_eval_acc = final_metrics.get("eval_accuracy", 0.0)
    hf_eval_f1 = final_metrics.get("eval_f1_weighted", 0.0)
    print(f"Val Loss (HF reported / trainer.evaluate): {hf_eval_loss:.4f}")
    print(f"Val Acc (CL evaluator): {hf_eval_acc:.4f}")
    print(f"Val F1 Weighted (CL evaluator): {hf_eval_f1:.4f}")
    # Parameter counts were computed before training; they reflect the model
    # as built, not any post-training changes.
    print(f"Total Trainable Params: {trainable_params:,}")
    print(f"Total Non-trainable Params: {non_trainable_params:,}")
    print("=" * 80)


if __name__ == "__main__":
    main()