"""
src/diffusion.py
V-1 (Base):    TransformerDenoiser
V-2 (NOVEL#1): Entropy-adaptive mass gate + gate_confidence export
V-3 (NOVEL#4): Spectral noise augmentation during training
"""
import math
import os
import glob
import sys
import random
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

sys.path.insert(0, os.path.dirname(__file__))
from baseline import Encoder
from preprocessing import (
    load_labeled_spectra, preprocess_spectrum, encode_peptide,
    VOCAB, CHAR_TO_IDX,
)

# ── Constants ──────────────────────────────────────────────────────────────────
VOCAB_SIZE  = 24       # PAD=0, SOS=1, EOS=2, AAs=3..22, MASK=23
MASK_TOK    = 23       # absorbing state — never predicted, only used in q_sample
SEQ_LEN     = 32       # 30 residues + SOS + EOS
T_STEPS     = 200
BETA_START  = 1e-3
BETA_END    = 2e-2
PROTON_MASS = 1.007276
WATER_MASS  = 18.010565
N_PEAKS     = 200      # top-K peaks kept per spectrum
MAX_MZ      = 2000.0   # m/z normalisation ceiling
BY_SIGMA    = 0.1      # Da — Gaussian half-width for b/y ion proximity bias

# Entropy-adaptive gate hyperparameters (NOVEL #1)
GATE_BASE_TOL = 0.02   # Da — tight tolerance when model is confident
GATE_ALPHA    = 2.0    # scales how much entropy relaxes the gate

# Spectral noise augmentation hyperparameters (NOVEL #4)
AUG_PROB      = 0.4    # probability of augmenting a training spectrum
AUG_NOISE_STD = 0.05   # fraction of per-spectrum std to add as noise

# Mass consistency loss weight (Option A fix)
# Uses relative L1 (|pred-true|/true), so values are in [0,1] range.
# 0.1 weight means mass term contributes ~10% as much as CE when error is ~10%.
MASS_LOSS_WEIGHT = 0.1

# VOCAB = "ACDEFGHIKLMNPQRSTVWY"  →  token[i+3] = VOCAB[i]
_MONO_MASSES = [
    71.03711,   # A  token 3
    103.00919,  # C  token 4
    115.02694,  # D  token 5
    129.04259,  # E  token 6
    147.06841,  # F  token 7
    57.02146,   # G  token 8
    137.05891,  # H  token 9
    113.08406,  # I  token 10
    128.09496,  # K  token 11
    113.08406,  # L  token 12
    131.04049,  # M  token 13
    114.04293,  # N  token 14
    97.05276,   # P  token 15
    128.05858,  # Q  token 16
    156.10111,  # R  token 17
    87.03203,   # S  token 18
    101.04768,  # T  token 19
    99.06841,   # V  token 20
    186.07931,  # W  token 21
    163.06333,  # Y  token 22
]
RESIDUE_MASS = torch.zeros(VOCAB_SIZE)
for _i, _m in enumerate(_MONO_MASSES):
    RESIDUE_MASS[_i + 3] = _m

IDX_TO_CHAR = {i + 3: c for i, c in enumerate(VOCAB)}


def extract_top_peaks(mz_arr, int_arr, n_peaks: int = N_PEAKS,
                      max_mz: float = MAX_MZ) -> np.ndarray:
    """Return (n_peaks, 2) array of top-K peaks sorted by m/z, normalised.
    Column 0: m/z / max_mz ∈ [0, 1].  Column 1: intensity / max_intensity ∈ [0, 1].
    Rows beyond the actual peak count are zero-padded."""
    mz  = np.asarray(mz_arr,  dtype=np.float32)
    ity = np.asarray(int_arr, dtype=np.float32)
    valid = (mz > 50) & (mz < max_mz)
    mz, ity = mz[valid], ity[valid]
    if len(ity) == 0:
        return np.zeros((n_peaks, 2), dtype=np.float32)
    ity = ity / (ity.max() + 1e-9)
    if len(ity) > n_peaks:
        top = np.argpartition(ity, -n_peaks)[-n_peaks:]
    else:
        top = np.arange(len(ity))
    mz_s, ity_s = mz[top], ity[top]
    order = np.argsort(mz_s)
    mz_s, ity_s = mz_s[order], ity_s[order]
    out = np.zeros((n_peaks, 2), dtype=np.float32)
    n = len(mz_s)
    out[:n, 0] = mz_s / max_mz
    out[:n, 1] = ity_s
    return out


# ── Diffusion Schedule (cosine, D3PM-style) ────────────────────────────────────
# Cosine schedule absorbs tokens more uniformly than linear — avoids degenerate
# distributions at very small or very large t.
_t_cos   = torch.arange(T_STEPS + 1, dtype=torch.float) / T_STEPS
_f       = torch.cos((_t_cos + 0.008) / 1.008 * math.pi / 2) ** 2
_alpha_bars = (_f / _f[0]).clamp(min=1e-5)[1:]           # (T,) ᾱ_t ∈ (0,1]
_betas      = (1 - _alpha_bars[1:] / _alpha_bars[:-1]).clamp(0, 0.999)
_betas      = torch.cat([torch.tensor([1 - _alpha_bars[0]]), _betas])

# Accelerated inference anchors: 20 evenly-spaced steps (avoids error accumulation)
_INFER_STEPS = list(range(T_STEPS - 1, -1, -(T_STEPS // 20)))
if _INFER_STEPS[-1] != 0:
    _INFER_STEPS.append(0)


def set_seed(seed: int):
    """Set all RNG seeds for reproducibility across training runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def q_sample(x0: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    """Forward process: keep x0 token with prob ᾱ_t, else absorb to MASK_TOK.
    Absorbing diffusion aligns with mask-predict decoding: re-corrupting uncertain
    positions at inference with MASK_TOK now matches the training distribution."""
    abar = _alpha_bars[t.cpu()].to(x0.device).unsqueeze(1)
    keep = torch.bernoulli(abar.expand_as(x0.float())).bool()
    mask = torch.full_like(x0, MASK_TOK)
    return torch.where(keep, x0, mask)


# ── Peak-Level Augmentation ────────────────────────────────────────────────────
def augment_peaks(peaks: torch.Tensor, p: float = AUG_PROB,
                  noise_frac: float = AUG_NOISE_STD) -> torch.Tensor:
    """
    Peak-level augmentation for the PeakEncoder (replaces old augment_spectrum).
    With probability p per spectrum:
      - Add Gaussian noise to intensities (noise_frac fraction of max intensity).
      - Randomly zero out 10% of peaks (simulates missing ions).
    peaks: (B, K, 2) — column 0 = m/z/MAX_MZ, column 1 = intensity.
    Augmentation is skipped at inference (torch.is_grad_enabled() == False).
    """
    if not torch.is_grad_enabled():
        return peaks
    B, K, _ = peaks.shape
    aug_mask = torch.rand(B, device=peaks.device) < p  # (B,) spectra to augment
    if not aug_mask.any():
        return peaks

    out = peaks.clone()
    # Intensity noise
    int_noise = torch.randn(B, K, device=peaks.device) * noise_frac
    out[:, :, 1] = (out[:, :, 1] + int_noise).clamp(0.0, 1.0)
    # m/z jitter: ±0.01 normalised ≈ ±20 Da — simulates instrument calibration variance
    mz_jitter = torch.randn(B, K, device=peaks.device) * 0.005
    present = out[:, :, 0] > 0                                    # don't jitter padded peaks
    out[:, :, 0] = (out[:, :, 0] + mz_jitter * present).clamp(0.0, 1.0)
    # Random peak dropout (10%)
    drop = torch.rand(B, K, device=peaks.device) < 0.10
    out[:, :, 0] = out[:, :, 0].masked_fill(drop, 0.0)
    out[:, :, 1] = out[:, :, 1].masked_fill(drop, 0.0)

    aug_mask_2d = aug_mask.unsqueeze(1).unsqueeze(2)  # (B, 1, 1)
    return torch.where(aug_mask_2d, out, peaks)


# ── NOVEL #1: Entropy-Adaptive Mass Gate ──────────────────────────────────────
def entropy_adaptive_gate(logits: torch.Tensor,
                           precursor_masses: torch.Tensor,
                           base_tol: float = GATE_BASE_TOL,
                           alpha: float    = GATE_ALPHA) -> tuple:
    """
    Per-position tolerance = base_tol * (1 + alpha * H_t(i) / log(23)).
    High entropy (uncertain) → relax gate.  Low entropy (confident) → keep tight.

    logits:           (B, L, V)   — modified in-place clone
    precursor_masses: (B,)        — neutral peptide mass in Da
    Returns: (gated_logits, gate_confidence)
        gate_confidence: (B, L) — 1 = tight gate held, 0 = gate relaxed
    """
    B, L, V = logits.shape
    mass_lut = RESIDUE_MASS.to(logits.device)   # (V,)
    out      = logits.clone()

    # Skip gate entirely for spectra with unknown precursor mass
    valid = precursor_masses > 1.0              # (B,)

    # Per-position entropy → per-position tolerance
    with torch.no_grad():
        probs   = F.softmax(logits, dim=-1)                          # (B, L, V)
        entropy = -(probs * (probs + 1e-9).log()).sum(-1)            # (B, L)
        tol     = base_tol * (1 + alpha * entropy / math.log(V))    # (B, L)

        gate_conf = torch.ones(B, L, device=logits.device)          # 1 = tight

        # Best-guess token at each position for the "other positions" mass estimate
        best   = logits.argmax(-1)          # (B, L)
        best_m = mass_lut[best]             # (B, L)
        is_aa  = (best >= 3).float()        # (B, L)

        for pos in range(L):
            # Mass from all OTHER real-AA positions
            other = (best_m * is_aa).sum(1) \
                    - best_m[:, pos] * is_aa[:, pos] \
                    + WATER_MASS                                     # (B,)

            # Candidate total mass for each token at this position
            cand     = other.unsqueeze(1) + mass_lut.unsqueeze(0)   # (B, V)
            pos_tol  = tol[:, pos].unsqueeze(1)                     # (B, 1)
            feasible = (cand - precursor_masses.unsqueeze(1)).abs() < pos_tol

            feasible[:, :3] = True          # PAD/SOS/EOS always pass
            feasible[~valid] = True         # skip gate for unknown-mass spectra

            needs_relax = ~feasible.any(dim=-1)                     # (B,)
            gate_conf[:, pos] = (~needs_relax).float()

            # If even relaxed gate (0.1 Da) zeros everything, leave logits unchanged
            if needs_relax.any():
                relax_feasible = (cand - precursor_masses.unsqueeze(1)).abs() < 0.1
                relax_feasible[:, :3] = True
                relax_feasible[~valid] = True
                final_mask = torch.where(
                    needs_relax.unsqueeze(1), relax_feasible, feasible
                )
            else:
                final_mask = feasible

            out[:, pos].masked_fill_(~final_mask, float('-inf'))

    return out, gate_conf


# ── Model Components ───────────────────────────────────────────────────────────
class SinusoidalEmbedding(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        half = dim // 2
        freq = torch.exp(-math.log(10000) * torch.arange(half) / half)
        self.register_buffer('freq', freq)
        self.proj = nn.Linear(dim, dim)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        t = t.float().unsqueeze(1)
        x = t * self.freq.unsqueeze(0)
        x = torch.cat([x.sin(), x.cos()], dim=-1)
        return self.proj(x)


class PeakEncoder(nn.Module):
    """
    Transformer encoder over the top-K (m/z, intensity) peak pairs.
    Replaces the binned-spectrum CNN encoder with direct peak-level attention,
    which is how InstaNovo and all top de-novo models encode spectra.
    Prepends a precursor-mass token so the model always knows the target mass.
    """
    def __init__(self, d_model: int = 512, n_peaks: int = N_PEAKS,
                 nhead: int = 8, n_layers: int = 6):
        super().__init__()
        self.mz_emb   = SinusoidalEmbedding(d_model)   # embed m/z as float
        self.int_proj  = nn.Linear(1, d_model)
        self.mass_emb  = SinusoidalEmbedding(d_model)  # embed precursor mass
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4,
            batch_first=True, norm_first=True, dropout=0.1)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

    def forward(self, peaks: torch.Tensor,
                masses: torch.Tensor) -> tuple:
        """
        peaks:  (B, K, 2) — (mz/MAX_MZ, intensity)
        masses: (B,)      — neutral precursor mass in Da
        Returns ((B, K+1, d_model), (B, K+1) bool pad mask)
        The pad mask is True for positions that should be ignored (padded peaks).
        """
        B, K, _ = peaks.shape
        mz_scaled = peaks[:, :, 0].reshape(-1) * MAX_MZ        # (B*K,)
        mz_tok    = self.mz_emb(mz_scaled).reshape(B, K, -1)   # (B, K, d)
        int_tok   = self.int_proj(peaks[:, :, 1:])              # (B, K, d)
        peak_tok  = mz_tok + int_tok                            # (B, K, d)
        mass_tok  = self.mass_emb(masses).unsqueeze(1)          # (B, 1, d)
        x = torch.cat([mass_tok, peak_tok], dim=1)              # (B, K+1, d)
        # Mask zero-padded peaks (m/z == 0 means no peak)
        pad = torch.zeros(B, K + 1, dtype=torch.bool, device=peaks.device)
        pad[:, 1:] = (peaks[:, :, 0] == 0)
        out = self.encoder(x, src_key_padding_mask=pad)         # (B, K+1, d)
        return out, pad


def compute_by_pair_bias(xt: torch.Tensor, peaks: torch.Tensor,
                          precursor_masses: torch.Tensor,
                          sigma: float = BY_SIGMA) -> torch.Tensor:
    """
    AlphaFold3-inspired B/Y ion pair bias.
    For each (sequence position i, peak j) pair, compute the Gaussian proximity
    of the theoretical b/y ion at position i to peak j's m/z, weighted by
    peak intensity.  Committed (non-MASK) positions contribute signal;
    masked positions contribute zero — so the model learns to use committed
    positions to guide prediction of uncertain ones.

    Returns (B, L, K+1) — first column is the mass-token (always 0 bias).
    """
    B, L = xt.shape
    K    = peaks.shape[1]
    device = xt.device

    mass_lut = RESIDUE_MASS.to(device)          # (V,)
    # Token masses; MASK and special tokens → 0
    is_aa    = ((xt >= 3) & (xt < MASK_TOK)).float()           # (B, L)
    tok_mass = mass_lut[xt.clamp(0, VOCAB_SIZE - 1)] * is_aa   # (B, L)

    # Cumulative residue mass from position 0 → b-ion series
    cum_mass = torch.cumsum(tok_mass, dim=1)                    # (B, L)
    b_ions   = cum_mass + PROTON_MASS                           # (B, L)

    # y-ion series: precursor_mass − cum_mass (approximate)
    pm     = precursor_masses.unsqueeze(1)                      # (B, 1)
    valid  = (pm > 1.0).float()                                 # skip unknown mass
    y_ions = (pm - cum_mass + WATER_MASS + PROTON_MASS) * valid # (B, L)

    # Observed peak m/z, unnormalised
    mz_obs  = peaks[:, :, 0] * MAX_MZ  # (B, K)
    int_obs = peaks[:, :, 1]            # (B, K)

    # Pairwise Gaussian proximity: (B, L, K)
    b_diff  = b_ions.unsqueeze(2) - mz_obs.unsqueeze(1)        # (B, L, K)
    y_diff  = y_ions.unsqueeze(2) - mz_obs.unsqueeze(1)        # (B, L, K)
    prox    = (torch.exp(-b_diff**2 / (2 * sigma**2)) +
               torch.exp(-y_diff**2 / (2 * sigma**2)))          # (B, L, K)
    prox    = prox * int_obs.unsqueeze(1)                       # weight by intensity
    prox    = prox * is_aa.unsqueeze(2)                         # zero out masked pos

    # Prepend zero column for the mass token: (B, L, K+1)
    return F.pad(prox, (1, 0))


class BYBiasedDecoderLayer(nn.Module):
    """
    Pre-norm Transformer decoder layer with B/Y ion pair bias injected
    into the cross-attention logits (AlphaFold3-style pair bias).
    A learnable scalar `bias_scale` (initialised to 0) lets the model
    start as a standard decoder and gradually lean on the ion signal.
    """
    def __init__(self, d_model: int, nhead: int, dim_ff: int):
        super().__init__()
        self.self_attn  = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.ff         = nn.Sequential(
            nn.Linear(d_model, dim_ff), nn.GELU(), nn.Linear(dim_ff, d_model))
        self.norm1      = nn.LayerNorm(d_model)
        self.norm2      = nn.LayerNorm(d_model)
        self.norm3      = nn.LayerNorm(d_model)
        self.nhead      = nhead
        # Starts at 0 so model is identical to unbiased decoder at init
        self.bias_scale = nn.Parameter(torch.zeros(1))

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor,
                by_bias: torch.Tensor | None = None,
                memory_key_padding_mask: torch.Tensor | None = None) -> torch.Tensor:
        B, L, _ = tgt.shape
        # Self-attention (pre-norm)
        h = self.norm1(tgt)
        h, _ = self.self_attn(h, h, h)
        tgt = tgt + h
        # Cross-attention with optional B/Y pair bias and padding mask
        h = self.norm2(tgt)
        if by_bias is not None:
            # by_bias: (B, L, K+1) → (B*nhead, L, K+1) float additive mask
            bias = (by_bias.unsqueeze(1)
                          .expand(-1, self.nhead, -1, -1)
                          .reshape(B * self.nhead, L, -1))
            bias = self.bias_scale * bias
            # Fuse padding mask into attn_mask as -inf (avoids bool/float mismatch)
            if memory_key_padding_mask is not None:
                pad_f = memory_key_padding_mask.float().masked_fill(
                    memory_key_padding_mask, float('-inf'))          # (B, K+1)
                pad_f = (pad_f.unsqueeze(1).unsqueeze(1)
                              .expand(-1, self.nhead, L, -1)
                              .reshape(B * self.nhead, L, -1))       # (B*h, L, K+1)
                bias = bias + pad_f
            h, _ = self.cross_attn(h, memory, memory, attn_mask=bias)
        else:
            h, _ = self.cross_attn(h, memory, memory,
                                   key_padding_mask=memory_key_padding_mask)
        tgt = tgt + h
        # Feed-forward (pre-norm)
        h = self.norm3(tgt)
        tgt = tgt + self.ff(h)
        return tgt


class TransformerDenoiser(nn.Module):
    """
    Bidirectional denoiser with peak-level cross-attention.
    memory: (B, K+1, d) sequence of peak embeddings from PeakEncoder.
    B/Y ion pair bias (AF3-inspired) injected into every cross-attention layer.
    Self-conditioning: optionally accepts x0-hat from the previous denoising step
    as an additional per-position embedding (MDLM-style, Sahoo 2024).
    """
    def __init__(self, vocab_size: int = VOCAB_SIZE, d_model: int = 512,
                 nhead: int = 8, dim_ff: int = 2048,
                 num_layers: int = 6, seq_len: int = SEQ_LEN):
        super().__init__()
        self.token_emb    = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_emb      = nn.Embedding(seq_len, d_model)
        self.time_emb     = SinusoidalEmbedding(d_model)
        # Self-conditioning embedding — init to zero so it has no effect at epoch 0
        self.self_cond_emb = nn.Embedding(vocab_size, d_model)
        nn.init.zeros_(self.self_cond_emb.weight)
        self.layers = nn.ModuleList([
            BYBiasedDecoderLayer(d_model, nhead, dim_ff)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.out  = nn.Linear(d_model, vocab_size)

    def forward(self, xt: torch.Tensor, t: torch.Tensor,
                memory: torch.Tensor,
                peaks: torch.Tensor | None = None,
                precursor_masses: torch.Tensor | None = None,
                memory_key_padding_mask: torch.Tensor | None = None,
                self_cond: torch.Tensor | None = None) -> torch.Tensor:
        B, L = xt.shape
        pos  = torch.arange(L, device=xt.device).unsqueeze(0).expand(B, -1)
        x    = self.token_emb(xt) + self.pos_emb(pos)
        x    = x + self.time_emb(t).unsqueeze(1)
        # Self-conditioning: add x0-hat embedding from previous step
        if self_cond is not None:
            x = x + self.self_cond_emb(self_cond)
        by_bias = (compute_by_pair_bias(xt, peaks, precursor_masses)
                   if peaks is not None and precursor_masses is not None
                   else None)
        for layer in self.layers:
            x = layer(x, memory, by_bias=by_bias,
                      memory_key_padding_mask=memory_key_padding_mask)
        return self.out(self.norm(x))   # (B, L, V)


# ── Dataset ────────────────────────────────────────────────────────────────────
class DiffusionDataset(Dataset):
    """Stores (peaks, sequence, precursor_mass) triples for peak-level training."""
    def __init__(self, peaks, y, precursor_masses):
        self.peaks  = torch.tensor(peaks,  dtype=torch.float32)  # (N, K, 2)
        self.y      = torch.tensor(y,      dtype=torch.long)
        self.masses = torch.tensor(precursor_masses, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.peaks[idx], self.y[idx], self.masses[idx]


def build_diffusion_dataset(mzml_path: str, xlsx_path: str,
                            max_spectra: int = 5000,
                            n_peaks: int = N_PEAKS,
                            return_raw: bool = False):
    """Returns peaks (N, K, 2), y (N, 32), neutral_masses (N,).
    If return_raw=True also returns raw_peaks: list of (mz_arr, int_arr) for SGIR."""
    spectra = load_labeled_spectra(mzml_path, xlsx_path, max_spectra)
    peaks_list, y, masses, raw_peaks = [], [], [], []
    for s in spectra:
        peaks_list.append(extract_top_peaks(s['mz'], s['intensity'], n_peaks))
        y.append(encode_peptide(s['peptide']))
        prec_mz = s.get('precursor_mz') or 0.0
        charge  = s.get('charge') or 0
        neutral = float(charge) * (float(prec_mz) - PROTON_MASS) if charge else 0.0
        masses.append(neutral)
        if return_raw:
            raw_peaks.append((np.asarray(s['mz'], dtype=np.float32),
                              np.asarray(s['intensity'], dtype=np.float32)))
    out = (np.array(peaks_list, dtype=np.float32),
           np.array(y),
           np.array(masses, dtype=np.float32))
    return out + (raw_peaks,) if return_raw else out


# ── Training ───────────────────────────────────────────────────────────────────
def train_diffusion(mzml_paths, xlsx_paths, checkpoint_dir='checkpoints',
                    epochs=50, batch_size=32, lr=1e-3, device=None, seed=42):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    set_seed(seed)
    print(f"Device: {device}  |  Seed: {seed}")
    os.makedirs(checkpoint_dir, exist_ok=True)

    all_peaks, ys, ms = [], [], []
    for mzml, xlsx in zip(mzml_paths, xlsx_paths):
        pk, y, m = build_diffusion_dataset(mzml, xlsx)
        all_peaks.append(pk); ys.append(y); ms.append(m)
    peaks  = np.concatenate(all_peaks)
    y      = np.concatenate(ys)
    masses = np.concatenate(ms)
    print(f"Total spectra: {len(peaks)}")

    N   = len(peaks)
    rng = np.random.default_rng(42)
    idx = rng.permutation(N)
    n_tr = int(0.70 * N); n_va = int(0.15 * N)
    tr, va = idx[:n_tr], idx[n_tr:n_tr + n_va]
    te = idx[n_tr + n_va:]

    train_dl = DataLoader(DiffusionDataset(peaks[tr], y[tr], masses[tr]),
                          batch_size=batch_size, shuffle=True, drop_last=True)
    val_dl   = DataLoader(DiffusionDataset(peaks[va], y[va], masses[va]),
                          batch_size=batch_size, drop_last=True)

    encoder  = PeakEncoder().to(device)
    denoiser = TransformerDenoiser().to(device)
    params   = list(encoder.parameters()) + list(denoiser.parameters())
    opt        = optim.AdamW(params, lr=lr, weight_decay=1e-2)
    warmup_ep  = max(5, epochs // 20)   # 5% warmup
    def _lr_lambda(ep):
        if ep < warmup_ep:
            return (ep + 1) / warmup_ep
        t = (ep - warmup_ep) / max(epochs - warmup_ep, 1)
        return 0.5 * (1.0 + math.cos(math.pi * t)) * (1 - 1e-5 / lr) + 1e-5 / lr
    scheduler = optim.lr_scheduler.LambdaLR(opt, _lr_lambda)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=0)

    best_val = float('inf')
    for epoch in range(1, epochs + 1):
        encoder.train(); denoiser.train()
        tr_loss = 0.0
        for pks, seq, mass in train_dl:
            pks, seq, mass = pks.to(device), seq.to(device), mass.to(device)
            B = seq.shape[0]

            t      = torch.randint(0, T_STEPS, (B,), device=device)
            xt     = q_sample(seq, t)
            pks_aug = augment_peaks(pks)
            memory, pad_mask = encoder(pks_aug, mass)
            # Self-conditioning (MDLM): 50% of steps, run a no-grad forward pass
            # first to get x0-hat, then pass it as self_cond for the actual step.
            self_cond = None
            if torch.rand(1).item() < 0.5:
                with torch.no_grad():
                    sc_logits = denoiser(xt, t, memory, peaks=pks_aug,
                                         precursor_masses=mass,
                                         memory_key_padding_mask=pad_mask)
                    self_cond = sc_logits.argmax(-1).detach()
            logits = denoiser(xt, t, memory, peaks=pks_aug, precursor_masses=mass,
                              memory_key_padding_mask=pad_mask, self_cond=self_cond)
            ce_loss = criterion(logits.reshape(-1, VOCAB_SIZE), seq.reshape(-1))

            # Mass consistency loss (relative L1 on expected sequence mass)
            valid_m = mass > 1.0
            if valid_m.any():
                mass_lut   = RESIDUE_MASS.to(device)
                probs      = F.softmax(logits, dim=-1)
                exp_mass   = (probs * mass_lut).sum(-1)
                is_aa_soft = 1.0 - probs[:, :, :3].sum(-1)
                pred_mass  = (exp_mass * is_aa_soft).sum(-1) + WATER_MASS
                mass_loss  = ((pred_mass[valid_m] - mass[valid_m]).abs()
                              / mass[valid_m]).mean()
                loss = ce_loss + MASS_LOSS_WEIGHT * mass_loss
            else:
                loss = ce_loss

            opt.zero_grad(); loss.backward()
            nn.utils.clip_grad_norm_(params, 1.0)
            opt.step()
            tr_loss += ce_loss.item() * B

        encoder.eval(); denoiser.eval()
        va_loss = 0.0
        with torch.no_grad():
            for pks, seq, mass in val_dl:
                pks, seq, mass = pks.to(device), seq.to(device), mass.to(device)
                t      = torch.randint(0, T_STEPS, (seq.shape[0],), device=device)
                xt     = q_sample(seq, t)
                memory, pad_mask = encoder(pks, mass)
                logits = denoiser(xt, t, memory, peaks=pks, precursor_masses=mass,
                                  memory_key_padding_mask=pad_mask)
                va_loss += criterion(logits.reshape(-1, VOCAB_SIZE),
                                     seq.reshape(-1)).item() * pks.shape[0]

        tr_avg = tr_loss / len(train_dl.dataset)
        va_avg = va_loss / len(val_dl.dataset)
        print(f"Epoch {epoch:3d} | train {tr_avg:.4f} | val {va_avg:.4f}")

        if epoch % 10 == 0:
            path = os.path.join(checkpoint_dir, f'diffusion_ckpt_{epoch}.pt')
            torch.save({'epoch': epoch, 'encoder': encoder.state_dict(),
                        'denoiser': denoiser.state_dict()}, path)
            print(f"  Saved {path}")

        scheduler.step()

        if va_avg < best_val:
            best_val = va_avg
            torch.save({'epoch': epoch, 'encoder': encoder.state_dict(),
                        'denoiser': denoiser.state_dict()},
                       os.path.join(checkpoint_dir, 'diffusion_best.pt'))

    # Always save the last epoch — AA recall peaks here, not at best val CE loss
    torch.save({'epoch': epochs, 'encoder': encoder.state_dict(),
                'denoiser': denoiser.state_dict()},
               os.path.join(checkpoint_dir, 'diffusion_final.pt'))
    print(f"  Saved diffusion_final.pt (epoch {epochs})")

    return encoder, denoiser, (peaks[te], y[te], masses[te])


# ── Inference ──────────────────────────────────────────────────────────────────
def _seq_mass(seq: str) -> float:
    """Monoisotopic neutral mass of a peptide string (residues + H2O)."""
    return sum(_MONO_MASSES[VOCAB.index(c)] for c in seq if c in VOCAB) + WATER_MASS


def mass_correct_sequence(seq: str, precursor_mass: float,
                           tol: float = 0.05) -> str:
    """
    Option B: post-hoc mass correction.
    If the predicted sequence mass is outside tol Da of precursor_mass,
    try all single-position amino acid swaps and keep the one that minimises
    the mass error. Returns seq unchanged when precursor_mass is unknown (≤1 Da)
    or already within tolerance.
    """
    if precursor_mass <= 1.0 or not seq:
        return seq
    current_delta = abs(_seq_mass(seq) - precursor_mass)
    if current_delta <= tol:
        return seq

    best_seq, best_delta = seq, current_delta
    for pos in range(len(seq)):
        for aa in VOCAB:
            if aa == seq[pos]:
                continue
            candidate = seq[:pos] + aa + seq[pos + 1:]
            delta = abs(_seq_mass(candidate) - precursor_mass)
            if delta < best_delta:
                best_delta, best_seq = delta, candidate

    # Only apply if the correction actually achieves tolerance — a swap that
    # merely reduces the error without reaching tolerance swaps a correct AA
    # for a wrong one more often than not.
    return best_seq if best_delta <= tol else seq


# ── NOVEL #8: Mass-Constrained Beam Search ────────────────────────────────────
_MIN_RESIDUE_MASS = min(_MONO_MASSES)   # G = 57.02 Da
_MAX_RESIDUE_MASS = max(_MONO_MASSES)   # W = 186.08 Da

def mass_constrained_beam_search(logits_t0: torch.Tensor,
                                  precursor_masses: torch.Tensor,
                                  beam_width: int = 20,
                                  tol: float = 0.1) -> list:
    """
    NOVEL #8: replace independent argmax with left-to-right beam search.
    At each position, expand beams using top-k token log-probs and prune
    branches whose remaining mass budget is infeasible given precursor_mass.

    logits_t0:        (B, L, V)
    precursor_masses: (B,)
    Returns: list[B] of str — best sequence per spectrum.
    """
    B, L, V = logits_t0.shape
    log_probs = F.log_softmax(logits_t0, dim=-1)  # (B, L, V)
    mass_lut  = RESIDUE_MASS.to(logits_t0.device) # (V,)

    results = []
    for b in range(B):
        pm = float(precursor_masses[b].cpu())
        lp = log_probs[b]          # (L, V)

        # beam: list of (score, [token_ids], accumulated_aa_mass)
        beams = [(0.0, [], 0.0)]

        for pos in range(L):
            top_scores, top_toks = lp[pos].topk(beam_width)
            new_beams = []
            for score, toks, acc_mass in beams:
                for s, tok in zip(top_scores.tolist(), top_toks.tolist()):
                    new_score = score + s
                    tok_mass  = float(mass_lut[tok].cpu())
                    new_mass  = acc_mass + tok_mass if tok >= 3 else acc_mass

                    # Remaining positions: [pos+1 .. L-1]
                    remaining = L - pos - 1
                    # Feasibility: can any filling of remaining positions hit pm?
                    lo = new_mass + WATER_MASS + remaining * _MIN_RESIDUE_MASS
                    hi = new_mass + WATER_MASS + remaining * _MAX_RESIDUE_MASS

                    # If tok is EOS, we're done — check mass now
                    if tok == 2:
                        delta = abs(new_mass + WATER_MASS - pm)
                        if pm <= 1.0 or delta <= tol * 3:
                            new_beams.append((new_score, toks + [tok], new_mass))
                        continue

                    # Prune if no remaining filling can reach pm (unless mass unknown)
                    if pm > 1.0 and (lo > pm + tol or hi < pm - tol):
                        continue

                    new_beams.append((new_score, toks + [tok], new_mass))

            if not new_beams:
                # Fallback: keep existing beams without pruning
                new_beams = [(sc + float(lp[pos].max().cpu()),
                              tk + [int(lp[pos].argmax().cpu())], am)
                             for sc, tk, am in beams]

            # Keep top beam_width by score
            new_beams.sort(key=lambda x: x[0], reverse=True)
            beams = new_beams[:beam_width]

        best_toks = beams[0][1]
        results.append(best_toks)
    return results


# ── NOVEL #11: Mask-Predict Iterative Decoding ────────────────────────────────
@torch.no_grad()
def mask_predict_decode(logits_t0: torch.Tensor,
                        context: torch.Tensor,
                        denoiser: nn.Module,
                        n_iter: int = 8,
                        peaks: torch.Tensor | None = None,
                        precursor_masses: torch.Tensor | None = None,
                        memory_key_padding_mask: torch.Tensor | None = None,
                        self_cond: torch.Tensor | None = None) -> list:
    """
    NOVEL #11: Mask-Predict iterative decoding for the bidirectional denoiser.
    Correct counterpart to left-to-right beam search: keeps highest-confidence
    positions fixed and re-corrupts the uncertain ones, letting the bidirectional
    model refine them conditioned on the already-committed positions.
    Grounded in Ghazvininejad et al. 2019 (Mask-Predict) and LLaDA 2025.
    No training changes needed — works on existing checkpoints.
    """
    B, L, V = logits_t0.shape
    device  = logits_t0.device

    probs = F.softmax(logits_t0, dim=-1)
    conf  = probs.max(dim=-1).values     # (B, L)
    x0    = logits_t0.argmax(dim=-1)     # (B, L)

    t_zero = torch.zeros(B, dtype=torch.long, device=device)
    # Linear schedule: commit progressively more positions each round
    n_uncertain_schedule = [
        int(L * (1.0 - (i + 1) / n_iter)) for i in range(n_iter)
    ]  # e.g. n_iter=4, L=32 → [24, 16, 8, 0]

    for step in range(n_iter):
        n_uncertain = n_uncertain_schedule[step]
        if n_uncertain == 0:
            break

        # Mask least-confident positions
        rank = conf.argsort(dim=-1, descending=False)   # lowest conf first
        uncertain_mask = torch.zeros(B, L, dtype=torch.bool, device=device)
        uncertain_mask.scatter_(1, rank[:, :n_uncertain], True)

        # Corrupt uncertain positions with MASK_TOK — matches absorbing q_sample
        mask_fill = torch.full((B, L), MASK_TOK, dtype=torch.long, device=device)
        xt_new    = torch.where(uncertain_mask, mask_fill, x0)

        # Re-denoise at t=0 — bidirectional model conditions on fixed positions
        logits_new = denoiser(xt_new, t_zero, context,
                              peaks=peaks, precursor_masses=precursor_masses,
                              memory_key_padding_mask=memory_key_padding_mask,
                              self_cond=self_cond)
        logits_new[..., MASK_TOK] = float('-inf')   # never output MASK
        probs_new  = F.softmax(logits_new, dim=-1)

        # Update only uncertain positions; carry x0 forward as self_cond
        x0        = torch.where(uncertain_mask, logits_new.argmax(dim=-1), x0)
        conf      = torch.where(uncertain_mask, probs_new.max(dim=-1).values, conf)
        self_cond = x0

    return [decode_tokens(x0[b].cpu().tolist()) for b in range(B)]


# ── NOVEL #9: Spectrally-Grounded Iterative Refinement ────────────────────────
def _compute_by_ions(seq: str) -> tuple:
    """Compute theoretical b and y ion m/z values (singly charged, +1 proton)."""
    masses = [_MONO_MASSES[VOCAB.index(c)] for c in seq if c in VOCAB]
    if not masses:
        return np.array([]), np.array([])
    b_ions = np.cumsum(masses) + PROTON_MASS
    y_ions = np.cumsum(masses[::-1]) + WATER_MASS + PROTON_MASS
    return b_ions, y_ions


def _peak_support(theoretical_mz: np.ndarray, obs_mz: np.ndarray,
                  obs_int: np.ndarray, tol: float = 0.02) -> float:
    """Intensity-weighted fraction of theoretical ions matched in observed spectrum."""
    if len(theoretical_mz) == 0 or len(obs_mz) == 0:
        return 0.0
    total = 0.0
    for tmz in theoretical_mz:
        delta = np.abs(obs_mz - tmz)
        idx = delta.argmin()
        if delta[idx] <= tol:
            total += float(obs_int[idx])
    denom = obs_int.sum()
    return total / denom if denom > 0 else 0.0


def spectrally_grounded_refine(seq: str, obs_mz: np.ndarray, obs_int: np.ndarray,
                                precursor_mass: float, t0_log_probs: torch.Tensor,
                                max_iter: int = 5, ion_tol: float = 0.02) -> str:
    """
    NOVEL #9: Spectrally-Grounded Iterative Refinement (SGIR).
    For each iteration:
      1. Compute b/y ion support per position.
      2. Find weakest-supported position.
      3. Try all mass-feasible AA substitutions there; pick the one maximising
         support + t=0 log-prob. Stop when no improvement.

    t0_log_probs: (L, V) log-softmax from denoiser at t=0 for this spectrum.
    """
    if not seq or precursor_mass <= 1.0 or len(obs_mz) == 0:
        return seq

    # Normalise observed intensities
    obs_int = obs_int / (obs_int.max() + 1e-9)

    for _ in range(max_iter):
        b_ions, y_ions = _compute_by_ions(seq)
        if len(b_ions) == 0:
            break

        # Per-position support: average of b[i] and y[len-1-i] support
        n = len(seq)
        support = np.zeros(n)
        for i in range(n):
            bi = b_ions[i:i+1] if i < len(b_ions) else np.array([])
            yi = y_ions[n-1-i:n-i] if (n-1-i) < len(y_ions) else np.array([])
            ions = np.concatenate([bi, yi]) if len(bi) or len(yi) else np.array([])
            support[i] = _peak_support(ions, obs_mz, obs_int, ion_tol)

        worst_pos = int(np.argmin(support))
        best_seq, best_score = seq, support[worst_pos]

        for aa in VOCAB:
            if aa == seq[worst_pos]:
                continue
            candidate = seq[:worst_pos] + aa + seq[worst_pos + 1:]
            # Mass feasibility
            if abs(_seq_mass(candidate) - precursor_mass) > 0.1:
                continue
            # New support at worst_pos
            b_c, y_c = _compute_by_ions(candidate)
            if len(b_c) == 0:
                continue
            bi = b_c[worst_pos:worst_pos+1] if worst_pos < len(b_c) else np.array([])
            yi = y_c[n-1-worst_pos:n-worst_pos] if (n-1-worst_pos) < len(y_c) else np.array([])
            ions = np.concatenate([bi, yi]) if len(bi) or len(yi) else np.array([])
            new_support = _peak_support(ions, obs_mz, obs_int, ion_tol)
            # Tiebreak with t=0 log-prob
            tok = CHAR_TO_IDX.get(aa, 0)
            lp_bonus = float(t0_log_probs[worst_pos, tok].cpu()) if tok < t0_log_probs.shape[1] else 0.0
            score = new_support + 0.05 * lp_bonus
            if score > best_score:
                best_score, best_seq = score, candidate

        if best_seq == seq:
            break
        seq = best_seq

    return seq


# ── NOVEL #10: ESM-2 + Spectral Posterior Reranking ──────────────────────────
_esm2_model  = None
_esm2_tokenizer = None

def _get_esm2():
    global _esm2_model, _esm2_tokenizer
    if _esm2_model is None:
        try:
            from transformers import EsmModel, EsmTokenizer
            _esm2_tokenizer = EsmTokenizer.from_pretrained('facebook/esm2_t6_8M_UR50D')
            _esm2_model     = EsmModel.from_pretrained('facebook/esm2_t6_8M_UR50D')
            _esm2_model.eval()
        except Exception:
            _esm2_model = None
    return _esm2_model, _esm2_tokenizer


def esm2_pseudo_perplexity(seqs: list) -> list:
    """Batch ESM-2 pseudo-perplexity using one-fell-swoop masking."""
    model, tok = _get_esm2()
    if model is None or not seqs:
        return [0.0] * len(seqs)
    ppls = []
    for seq in seqs:
        if not seq:
            ppls.append(999.0)
            continue
        try:
            import torch
            inputs = tok(seq, return_tensors='pt')
            ids    = inputs['input_ids'][0]  # (L+2,)
            L      = len(ids) - 2
            log_prob_sum = 0.0
            with torch.no_grad():
                for i in range(1, L + 1):
                    masked = ids.clone()
                    masked[i] = tok.mask_token_id
                    out = model(input_ids=masked.unsqueeze(0),
                                attention_mask=inputs['attention_mask'])
                    logits = out.last_hidden_state[0, i]
                    lp = float(torch.log_softmax(logits, dim=-1)[ids[i]])
                    log_prob_sum += lp
            ppls.append(float(np.exp(-log_prob_sum / L)))
        except Exception:
            ppls.append(999.0)
    return ppls


def rank_candidates(candidates: list, spectral_lps: list, obs_mz: np.ndarray,
                    obs_int: np.ndarray, precursor_mass: float,
                    lam: float = 0.05, use_esm: bool = True) -> str:
    """
    NOVEL #10: score each candidate by:
      score(c) = spectral_logprob(c) - lam * esm2_ppl(c) + spectral_support(c)
    Returns the best candidate string.
    """
    if not candidates:
        return ''
    if len(candidates) == 1:
        return candidates[0]

    obs_int_norm = obs_int / (obs_int.max() + 1e-9) if len(obs_int) > 0 else obs_int
    ppls = esm2_pseudo_perplexity(candidates) if use_esm else [0.0]*len(candidates)

    best_seq, best_score = candidates[0], float('-inf')
    for seq, sp_lp, ppl in zip(candidates, spectral_lps, ppls):
        b_ions, y_ions = _compute_by_ions(seq)
        all_ions = np.concatenate([b_ions, y_ions]) if len(b_ions) > 0 else np.array([])
        supp = _peak_support(all_ions, obs_mz, obs_int_norm) if len(all_ions) > 0 else 0.0
        score = sp_lp - lam * ppl + supp
        if score > best_score:
            best_score, best_seq = score, seq
    return best_seq


def decode_tokens(tokens) -> str:
    result = []
    for tok in tokens:
        if int(tok) == 2:
            break
        if int(tok) >= 3:
            result.append(IDX_TO_CHAR.get(int(tok), '?'))
    return ''.join(result)


@torch.no_grad()
def generate_sequences(encoder, denoiser, spectra, precursor_masses,
                        n_candidates: int = 5, T_sample: float = 1.0,
                        t_infer: int = 100, device=None, use_gate: bool = False,
                        use_beam: bool = False, use_cfid: bool = False):
    """
    Iterative reverse diffusion over _INFER_STEPS (20 accelerated steps T→0).

    use_beam=True: final step uses mass-constrained beam search (NOVEL #8).
    T_sample < 1.0: final step samples with temperature instead of argmax
                    (used for multi-candidate generation for reranking).

    Returns:
        sequences:    list[N] of list[n_candidates] strings
        spectral_lps: list[N] of list[n_candidates] floats
        gate_confs:   list[N] of list[n_candidates] floats (1.0 = gate not applied)
        t0_logits:    list[N] of (L, V) tensor at t=0 (last candidate, for SGIR)
    """
    if device is None:
        device = next(encoder.parameters()).device
    encoder.eval(); denoiser.eval()

    spec_t  = torch.tensor(spectra,          dtype=torch.float32, device=device)
    mass_t  = torch.tensor(precursor_masses, dtype=torch.float32, device=device)
    context, peak_pad_mask = encoder(spec_t, mass_t)   # (N, K+1, d), (N, K+1)
    N       = len(spectra)

    sequences    = [[] for _ in range(N)]
    spectral_lps = [[] for _ in range(N)]
    gate_confs   = [[] for _ in range(N)]
    t0_logits    = [None] * N

    n_steps = len(_INFER_STEPS)

    for cand_idx in range(n_candidates):
        # Start from fully masked sequence (absorbing diffusion: t=T ≡ all-MASK)
        xt        = torch.full((N, SEQ_LEN), MASK_TOK, dtype=torch.long, device=device)
        self_cond = None   # self-conditioning: x0-hat from previous step

        for step_idx, t_cur in enumerate(_INFER_STEPS):
            t_vec  = torch.full((N,), t_cur, dtype=torch.long, device=device)
            logits = denoiser(xt, t_vec, context,
                              peaks=spec_t, precursor_masses=mass_t,
                              memory_key_padding_mask=peak_pad_mask,
                              self_cond=self_cond)                     # (N, L, V)
            logits[..., MASK_TOK] = float('-inf')   # never predict MASK as output

            is_final = (step_idx == n_steps - 1)

            if not is_final:
                x0_hat    = logits.argmax(-1)        # (N, L) — EOS/PAD allowed
                self_cond = x0_hat                   # carry forward for next step
                t_next     = _INFER_STEPS[step_idx + 1]
                t_next_vec = torch.full((N,), t_next, dtype=torch.long, device=device)
                xt = q_sample(x0_hat, t_next_vec)
            else:
                log_fin = F.log_softmax(logits, dim=-1)  # (N, L, V)

                # Store t=0 logits from last candidate for SGIR
                for i in range(N):
                    t0_logits[i] = log_fin[i].cpu()

                if use_beam:
                    # NOVEL #8: mass-constrained beam search
                    beam_toks_list = mass_constrained_beam_search(
                        logits, mass_t, beam_width=20)
                    for bi, toks in enumerate(beam_toks_list):
                        toks_pad = (toks + [0] * SEQ_LEN)[:SEQ_LEN]
                        tok_t    = torch.tensor(toks_pad, dtype=torch.long, device=device)
                        lp       = log_fin[bi].gather(-1, tok_t.unsqueeze(-1)).squeeze(-1)
                        aa_mask  = (tok_t >= 3).float()
                        sp_lp_i  = (lp * aa_mask).sum() / aa_mask.sum().clamp(min=1)
                        sequences[bi].append(decode_tokens(toks_pad))
                        spectral_lps[bi].append(float(sp_lp_i.cpu()))
                        gate_confs[bi].append(1.0)
                elif T_sample != 1.0 and n_candidates > 1:
                    # Temperature sampling: diverse candidates for reranking
                    scaled = logits / max(T_sample, 1e-6)
                    probs  = F.softmax(scaled, dim=-1)
                    x0_samp = torch.multinomial(
                        probs.reshape(-1, VOCAB_SIZE), 1).reshape(N, SEQ_LEN)
                    sp_lp = log_fin.gather(-1, x0_samp.unsqueeze(-1)).squeeze(-1)
                    aa_mask = (x0_samp >= 3).float()
                    sp_lp   = (sp_lp * aa_mask).sum(-1) / aa_mask.sum(-1).clamp(min=1)
                    for i, seq in enumerate(x0_samp.cpu().numpy()):
                        sequences[i].append(decode_tokens(seq))
                        spectral_lps[i].append(float(sp_lp[i].cpu()))
                        gate_confs[i].append(1.0)
                elif use_cfid:
                    # NOVEL #11: Mask-Predict iterative decoding (n_iter=8 for better convergence)
                    cfid_seqs = mask_predict_decode(logits, context, denoiser, n_iter=8,
                                                    peaks=spec_t, precursor_masses=mass_t,
                                                    memory_key_padding_mask=peak_pad_mask,
                                                    self_cond=x0_hat)
                    for bi, seq in enumerate(cfid_seqs):
                        tok_ids = [(CHAR_TO_IDX.get(c, 0) if c in CHAR_TO_IDX else 0)
                                   for c in seq]
                        tok_t   = torch.tensor((tok_ids + [0] * SEQ_LEN)[:SEQ_LEN],
                                               dtype=torch.long, device=device)
                        lp      = log_fin[bi].gather(-1, tok_t.unsqueeze(-1)).squeeze(-1)
                        aa_mask = (tok_t >= 3).float()
                        sp_lp_i = (lp * aa_mask).sum() / aa_mask.sum().clamp(min=1)
                        sequences[bi].append(seq)
                        spectral_lps[bi].append(float(sp_lp_i.cpu()))
                        gate_confs[bi].append(1.0)
                else:
                    # Default: argmax
                    x0_final = logits.argmax(-1)                         # (N, L)
                    sp_lp    = log_fin.gather(-1, x0_final.unsqueeze(-1)).squeeze(-1)
                    aa_mask  = (x0_final >= 3).float()
                    sp_lp    = (sp_lp * aa_mask).sum(-1) / aa_mask.sum(-1).clamp(min=1)
                    for i, seq in enumerate(x0_final.cpu().numpy()):
                        sequences[i].append(decode_tokens(seq))
                        spectral_lps[i].append(float(sp_lp[i].cpu()))
                        gate_confs[i].append(1.0)

    return sequences, spectral_lps, gate_confs, t0_logits


# ── Save Predictions CSV (deliverable) ────────────────────────────────────────
def save_predictions(sequences, spectral_lps, gate_confs,
                     out_path='results/diffusion_predictions.csv'):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    rows = []
    for i, (seqs, lps, gcs) in enumerate(zip(sequences, spectral_lps, gate_confs)):
        for seq, lp, gc in zip(seqs, lps, gcs):
            rows.append({'spectrum_id': i, 'sequence': seq,
                         'spectral_logprob': lp, 'gate_confidence': gc})
    df = pd.DataFrame(rows)
    df.to_csv(out_path, index=False)
    print(f"Saved {len(df)} predictions → {out_path}")
    return df


# ── Evaluation ─────────────────────────────────────────────────────────────────
def aa_recall(pred: str, true: str) -> float:
    """Positional recall: fraction of positions where pred[i] == true[i]."""
    matches = sum(a == b for a, b in zip(pred, true))
    return matches / max(len(true), 1)


def evaluate_aa_recall(encoder, denoiser, X_test, y_test, masses_test,
                        batch_size=32, results_dir='results', device=None,
                        use_gate=False, use_beam=False, use_cfid=False,
                        use_rerank=False, n_rerank=30, T_sample=0.8,
                        use_sgir=False, raw_peaks=None, use_esm=False,
                        use_mass_correct=False):
    """
    Evaluate de-novo sequencing on a held-out test set.

    Flags (can be combined):
      use_beam         – NOVEL #8: mass-constrained beam search at final step
      use_cfid         – NOVEL #11: Mask-Predict iterative decoding (bidirectional)
      use_rerank       – NOVEL #10: generate n_rerank candidates, ESM-2+spectral rerank
      use_sgir         – NOVEL #9: spectrally-grounded iterative refinement post-decode
                         requires raw_peaks: list[(mz_arr, int_arr)] aligned with X_test
      use_mass_correct – Option B: post-hoc single-swap mass correction
    """
    if device is None:
        device = next(encoder.parameters()).device

    n_cands = n_rerank if use_rerank else 1
    t_samp  = T_sample if use_rerank else 1.0

    all_seqs, all_lps, all_gcs = [], [], []
    recalls, pep_correct = [], []

    for i in range(0, len(X_test), batch_size):
        bs   = X_test[i:i+batch_size]
        bm   = masses_test[i:i+batch_size]
        byt  = y_test[i:i+batch_size]
        brp  = raw_peaks[i:i+batch_size] if raw_peaks is not None else None

        seqs, lps, gcs, t0_lgs = generate_sequences(
            encoder, denoiser, bs, bm,
            n_candidates=n_cands, T_sample=t_samp,
            device=device, use_gate=use_gate, use_beam=use_beam,
            use_cfid=use_cfid)

        for j, (pred_list, lp_list, gc_list, true_tok) in enumerate(
                zip(seqs, lps, gcs, byt)):
            pm    = float(bm[j])
            obs_mz  = brp[j][0] if brp is not None else np.array([])
            obs_int = brp[j][1] if brp is not None else np.array([])

            # NOVEL #10: rerank candidates
            if use_rerank and len(pred_list) > 1:
                best = rank_candidates(pred_list, lp_list, obs_mz, obs_int, pm,
                                       use_esm=use_esm)
                best_lp = lp_list[pred_list.index(best)] if best in pred_list else lp_list[0]
            else:
                best    = pred_list[0]
                best_lp = lp_list[0]

            # NOVEL #9: SGIR refinement
            if use_sgir and brp is not None and t0_lgs[j] is not None:
                best = spectrally_grounded_refine(
                    best, obs_mz, obs_int, pm, t0_lgs[j])

            # Option B: post-hoc single-swap mass correction
            if use_mass_correct:
                best = mass_correct_sequence(best, pm)

            all_seqs.append([best]); all_lps.append([best_lp])
            all_gcs.append([gc_list[0]])
            true_seq = decode_tokens(true_tok)
            recalls.append(aa_recall(best, true_seq))
            pep_correct.append(best == true_seq)

    aa_rec  = float(np.mean(recalls)) * 100
    pep_acc = float(np.mean(pep_correct)) * 100
    print(f"AA Recall  : {aa_rec:.2f}%")
    print(f"Peptide Acc: {pep_acc:.2f}%")

    os.makedirs(results_dir, exist_ok=True)
    save_predictions(all_seqs, all_lps, all_gcs,
                     os.path.join(results_dir, 'diffusion_predictions.csv'))
    return aa_rec, pep_acc


# ── Checkpoint I/O ─────────────────────────────────────────────────────────────
def load_checkpoint(path: str, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ckpt = torch.load(path, map_location=device, weights_only=False)
    encoder  = PeakEncoder().to(device)
    denoiser = TransformerDenoiser().to(device)
    encoder.load_state_dict(ckpt['encoder'])
    denoiser.load_state_dict(ckpt['denoiser'])
    print(f"Loaded checkpoint from epoch {ckpt.get('epoch', '?')}")
    return encoder, denoiser


# ── Main ───────────────────────────────────────────────────────────────────────
if __name__ == '__main__':
    BASE = os.path.join(os.path.dirname(__file__), '..', 'Data', 'E coli EV proteomics')
    mzml_paths = sorted(glob.glob(os.path.join(BASE, '*.mzML')))
    xlsx_paths = sorted(glob.glob(os.path.join(BASE, 'Database search output*.xlsx')))
    if not mzml_paths:
        raise FileNotFoundError(f"No mzML files in {BASE}")
    encoder, denoiser, (X_te, y_te, m_te) = train_diffusion(
        mzml_paths, xlsx_paths, checkpoint_dir='checkpoints', epochs=50)
    evaluate_aa_recall(encoder, denoiser, X_te, y_te, m_te)