import os
import torch
import torch.nn as nn
import pytorch_lightning as pl
from omegaconf import DictConfig
import torch.nn.functional as F
from model.transformer import AnyOrderMaskInsertionFlow
from model.interpolant import AnyOrderMaskInsertionInterpolant, ModelPrediction
from .bregman import jump_kernel_elbo, mse
from .schedule import get_schedule_from_config
from lightning_modules.any_order import AnyOrderInsertionFlowModule
from model.model_wrapper import RemaskingAnyOrder
from sampling import _sample_tokens

import re
from typing import Dict, Any
from dataclasses import dataclass

def strip_orig_mod_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a new state_dict whose keys have every '_orig_mod' module segment removed.

    '_orig_mod' is the segment that a torch.compile wrapper adds to parameter
    names, e.g.
      'model._orig_mod.vocab_embed.embedding'
    becomes
      'model.vocab_embed.embedding'

    Segments are matched exactly after splitting the key on '.', so a leading
    prefix ('_orig_mod.layer.weight' -> 'layer.weight') and consecutive
    segments ('a._orig_mod._orig_mod.b' -> 'a.b') are handled as well. The
    final segment (the parameter/buffer name itself) is never dropped, and a
    segment that merely contains the text (e.g. 'my_orig_mod') is left alone.

    The input mapping is not modified; values are reused, not copied. If two
    keys collapse to the same cleaned key, the later one wins.
    """
    new_state_dict: Dict[str, Any] = {}
    for key, value in state_dict.items():
        # Everything before the last dot is the module path; the last piece is
        # the parameter/buffer name and must survive untouched.
        *module_path, leaf = key.split(".")
        kept = [segment for segment in module_path if segment != "_orig_mod"]
        new_state_dict[".".join(kept + [leaf])] = value
    return new_state_dict


@torch.no_grad()
def _binary_auc(scores: torch.Tensor, labels: torch.Tensor) -> float:
    """Rank-based AUROC (Mann-Whitney U statistic) with tie-averaged ranks.

    AUC = P(score[pos] > score[neg]) + 0.5 * P(score[pos] == score[neg]);
    0.5 means no discrimination. Tied scores share the average of the ranks
    they span, so a constant score vector yields exactly 0.5 instead of a value
    that depends on the sort order. Without ties the result equals the plain
    rank-sum formula.

    Args:
        scores: Any-shaped tensor of real-valued scores (flattened internally).
        labels: Tensor with the same number of elements; entries equal to 1
            are treated as positives.

    Returns:
        The AUROC as a Python float, or NaN when only one class is present
        (AUC undefined), which includes empty input.
    """
    scores = scores.float().reshape(-1)
    labels = labels.float().reshape(-1)
    n_pos = labels.sum()
    n_neg = labels.numel() - n_pos
    if n_pos == 0 or n_neg == 0:
        return float("nan")

    # Group equal scores: `inverse` maps each element to its sorted unique
    # value, `counts` is the size of each tie group.
    _, inverse, counts = torch.unique(
        scores, sorted=True, return_inverse=True, return_counts=True
    )
    group_size = counts.to(scores.dtype)
    last_rank = counts.cumsum(dim=0).to(scores.dtype)
    # A group occupying ranks (last - size + 1) .. last has mean rank
    # last - (size - 1) / 2.
    mean_rank = last_rank - (group_size - 1) / 2
    ranks = mean_rank[inverse]

    auc = (ranks[labels == 1].sum() - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
    return auc.item()


class AnyOrderInsertionFlowModuleFT(AnyOrderInsertionFlowModule):
    """
    Wrapper around AnyOrderInsertionFlowModule that adds adaptive schedule model
    for fine-tuning. Can load a pretrained AnyOrderInsertionFlowModule checkpoint
    and add the schedule model on top.
    """
    def __init__(self, config, args, pretrained_checkpoint, insertion_planner=False):
        """
        Build the base module, optionally load pretrained weights, then attach the planner.

        Args:
            config: Passed unchanged to the parent constructor. ``self.config.model.hidden_size``
                is read below to size the planner (``self.config`` is presumably set by the
                parent constructor — confirm there).
            args: Stored as ``self.args`` and excluded from the saved hyperparameters.
            pretrained_checkpoint: Path handed to ``load_pretrained_model``; ``None`` skips
                loading. Excluded from the saved hyperparameters.
            insertion_planner: Stored as ``self.insertion_planner`` and forwarded to
                ``RemaskingAnyOrder``.
        """
        # Initialize parent class first
        super().__init__(config)
        
        self.args = args
        self.insertion_planner = insertion_planner
        
        # Save hyperparameters for this class (overrides parent's save);
        # the checkpoint path and args object are deliberately left out.
        self.save_hyperparameters(ignore=['pretrained_checkpoint', 'args'])
        
        # Load pretrained model weights BEFORE initializing planner to avoid circular reference
        # (the planner is constructed with this module as its backbone).
        if pretrained_checkpoint is not None:
            self.load_pretrained_model(pretrained_checkpoint)
        
        # Create the planner AFTER loading pretrained weights, so the load above
        # never sees any 'planner.' parameters on this module.
        self.planner = RemaskingAnyOrder(
            backbone=self,
            d_model=self.config.model.hidden_size,
            insertion_planner=insertion_planner)
        
    def load_pretrained_model(self, checkpoint_path: str):
        """
        Load pretrained AnyOrderInsertionFlowModule weights.
        Only loads the base model and interpolant, not the schedule model.
        """
        print(f"Loading pretrained model from {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
        
        # Extract state dict - handle different checkpoint formats
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint
        
        # Strip _orig_mod keys if present
        state_dict = strip_orig_mod_keys(state_dict)
        
        # Filter out planner keys (if any exist from a previous FT checkpoint)
        base_state_dict = {k: v for k, v in state_dict.items() 
                          if not k.startswith('planner.')}
        
        # Load the base model weights
        # Use strict=False to ignore missing schedule_model keys
        incompatible_keys = self.load_state_dict(base_state_dict, strict=False)
        
        # Filter out expected missing planner keys for cleaner output
        unexpected_missing = [k for k in incompatible_keys.missing_keys 
                            if not k.startswith('planner.')]
        planner_missing = [k for k in incompatible_keys.missing_keys 
                          if k.startswith('planner.')]
        
        if unexpected_missing:
            print(f"Warning: Unexpected missing keys from pretrained checkpoint: {unexpected_missing}")
        if planner_missing:
            print(f"Note: Planner will be trained from scratch ({len(planner_missing)} parameters)")
        if incompatible_keys.unexpected_keys:
            print(f"Warning: Unexpected keys in pretrained checkpoint: {incompatible_keys.unexpected_keys}")
        
        # Freeze base model if specified
        if self.config.training.get('freeze_base_model', False):
            print("Freezing base model parameters")
            for name, param in self.named_parameters():
                if not name.startswith('planner.'):
                    param.requires_grad = False

    def forward(self, x, t, return_features=False):
        # Use parent class forward method
        return super().forward(x, t, return_features=return_features)

    def training_loss(self, x1, t):
        # Use parent class training_loss for base model loss
        # Planner is trained separately via loss_planner_flexible with reward gradients
        unmask_loss, insertion_loss, total_loss = super().training_loss(x1, t)
        return unmask_loss, insertion_loss, total_loss
    
    
    def training_step(self, batch, batch_idx):
        # Extract input data
        if isinstance(batch, dict):
            batch = batch["input_ids"]

        x1 = batch
        t = self.sample_time(x1.shape[0], x1.device)

        # Calculate the base model loss (planner trained separately, not here)
        unmask_loss, len_loss, loss = self.training_loss(x1, t)
        
        # Log component losses
        self.log("train/unmask_loss", unmask_loss, prog_bar=True)
        self.log("train/len_loss", len_loss, prog_bar=True)
        self.log("train/total_loss", loss, prog_bar=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        if isinstance(batch, dict):
            batch = batch["input_ids"]

        x1 = batch
        t = self.sample_time(x1.shape[0], x1.device)
        unmask_loss, len_loss, loss = self.training_loss(x1, t)

        self.log("val/unmask_loss", unmask_loss, prog_bar=True, sync_dist=True)
        self.log("val/len_loss", len_loss, prog_bar=True, sync_dist=True)
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)

        return loss
    
    @classmethod
    def load_from_checkpoint(cls, checkpoint_path, map_location=None, strict=True, **kwargs):
        """
        Custom checkpoint loading that handles finetuned checkpoints wrapped by PeptideFinetuner.
        Extracts config from original pretrained checkpoint and loads finetuned weights.
        """
        print(f"Loading finetuned checkpoint from {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location=map_location or 'cpu', weights_only=False)
        
        # Check if this is a wrapped checkpoint (from PeptideFinetuner)
        hparams = checkpoint.get('hyper_parameters', {})
        state_dict = checkpoint.get('state_dict', {})
        
        # Check for policy_model prefix in state_dict (indicates PeptideFinetuner wrapper)
        has_policy_prefix = any(k.startswith('policy_model.') for k in state_dict.keys())
        
        if has_policy_prefix:
            # Detect model type (molecule vs peptide) based on vocab size in checkpoint
            # Molecule models have vocab size ~1882, peptide models have ~587
            vocab_size = None
            for k, v in state_dict.items():
                if 'vocab_embed.embedding' in k:
                    vocab_size = v.shape[0]
                    break
            
            is_molecule_model = vocab_size is not None and vocab_size > 1000
            model_type = "MolFinetuner" if is_molecule_model else "PeptideFinetuner"
            print(f"Detected wrapped finetuned checkpoint ({model_type}, vocab_size={vocab_size})")
            
            # Extract args from hyperparameters
            if 'args' not in hparams:
                raise ValueError(f"Cannot find 'args' in hyperparameters. This checkpoint may not be from {model_type}.")
            
            args = hparams['args']
            print(f"Found args in hyperparameters, type: {type(args)}")
            
            # Get original checkpoint path from args
            # Handle both Namespace (hasattr) and dict (get) access patterns
            original_ckpt_path = None
            if hasattr(args, 'checkpoint_path'):
                original_ckpt_path = args.checkpoint_path
            elif isinstance(args, dict) and 'checkpoint_path' in args:
                original_ckpt_path = args['checkpoint_path']
            
            # If checkpoint_path is not set or is None, use default pretrained checkpoint
            # Select appropriate default based on detected model type
            if original_ckpt_path is None:
                _repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
                if is_molecule_model:
                    original_ckpt_path = os.path.join(_repo_root, 'pretrained', 'anylength_mol.ckpt')
                    print(f"Warning: checkpoint_path not found in args, using default molecule pretrained checkpoint")
                else:
                    original_ckpt_path = os.path.join(_repo_root, 'pretrained', 'anylength_pep.ckpt')
                    print(f"Warning: checkpoint_path not found in args, using default peptide pretrained checkpoint")
            
            # Try to load config directly from checkpoint first (new checkpoints)
            # Fall back to loading from original checkpoint (old checkpoints)
            if 'config' in checkpoint:
                print("Found config directly in checkpoint")
                config = checkpoint['config']
            else:
                print(f"Config not in checkpoint, loading from original checkpoint: {original_ckpt_path}")
                
                # Load config from original pretrained checkpoint
                orig_ckpt = torch.load(original_ckpt_path, map_location='cpu', weights_only=False)
                if 'config' not in orig_ckpt:
                    raise ValueError(f"Original checkpoint {original_ckpt_path} does not contain config")
                
                config = orig_ckpt['config']
            
            # Ensure adaptive schedule is enabled
            # Need to disable struct mode to add new keys to OmegaConf config
            from omegaconf import OmegaConf
            if hasattr(config, 'training'):
                OmegaConf.set_struct(config, False)
                config.training.use_adaptive_schedule = True
                OmegaConf.set_struct(config, True)
            
            # Create args object if needed
            if not hasattr(args, '__dict__'):
                # Convert dict to object with attributes
                class Args:
                    pass
                args_obj = Args()
                for k, v in args.items():
                    setattr(args_obj, k, v)
                args = args_obj
            
            # Initialize model with config and args
            model = cls(
                config=config,
                args=args,
                pretrained_checkpoint=None,  # Don't reload pretrained, weights already in checkpoint
                insertion_planner=getattr(args, 'insertion_planner', False)
            )
            
            # Extract policy_model weights from state_dict
            policy_state = {}
            for k, v in state_dict.items():
                if k.startswith('policy_model.'):
                    # Strip 'policy_model.' prefix
                    new_key = k[len('policy_model.'):]
                    policy_state[new_key] = v
            
            # Load the finetuned weights
            incompatible = model.load_state_dict(policy_state, strict=False)
            if incompatible.missing_keys or incompatible.unexpected_keys:
                print(f"Warning: Incompatible keys when loading finetuned weights:")
                if incompatible.missing_keys:
                    print(f"  Missing: {incompatible.missing_keys[:5]}...")
                if incompatible.unexpected_keys:
                    print(f"  Unexpected: {incompatible.unexpected_keys[:5]}...")
            
            # Initialize or load EMA params
            if model.use_ema:
                if "ema_params" in checkpoint:
                    # Load EMA params from checkpoint
                    model.ema_params = checkpoint["ema_params"]
                    print("Loaded EMA params from checkpoint")
                else:
                    # Initialize empty EMA params (will be populated if needed)
                    model.ema_params = {
                        name: param.clone().detach()
                        for name, param in model.named_parameters()
                    }
                    print("Initialized EMA params from current model state")
            else:
                model.ema_params = {}
            
            # Load planner state if it exists
            if "planner_state" in checkpoint and hasattr(model, 'planner'):
                model.planner.load_state_dict(checkpoint["planner_state"], strict=False)
                print("Loaded planner state from checkpoint")
            
            return model
        else:
            # Not a wrapped checkpoint, use default Lightning loading
            # But we still need to provide required __init__ arguments
            raise NotImplementedError(
                "Direct finetuned checkpoints (not wrapped by PeptideFinetuner) are not yet supported. "
                "Please provide config and args as kwargs."
            )
    
    def on_save_checkpoint(self, checkpoint):
        """Save config and EMA params, including planner state."""
        # Call parent to save config and base model EMA
        super().on_save_checkpoint(checkpoint)
        
        # Explicitly save planner state
        if hasattr(self, 'planner'):
            checkpoint["planner_state"] = self.planner.state_dict()
    
    def on_load_checkpoint(self, checkpoint):
        """Load config and reinitialize interpolant, including planner."""
        # For finetuned checkpoints loaded via custom load_from_checkpoint,
        # config may not be in checkpoint (it's loaded from original checkpoint)
        if "config" in checkpoint:
            # Call parent to restore config and interpolant
            super().on_load_checkpoint(checkpoint)
        else:
            # Config already set during __init__ via load_from_checkpoint
            # Just restore EMA params if they exist
            if self.use_ema and "ema_params" in checkpoint:
                self.ema_params = checkpoint["ema_params"]
        
        # Restore planner state if it exists in checkpoint
        if hasattr(self, 'planner') and "planner_state" in checkpoint:
            self.planner.load_state_dict(checkpoint["planner_state"])
            print("Loaded planner from checkpoint")
            
    def loss_wdce_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
        r"""
        Weighted denoising cross entropy loss
        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
        
        log_rnd: [B] — pre-computed importance weights (already softmax-normalized over the full buffer)
        x: [B, L] (no mask)
        num_replicates: R, number of replicates of each row in x
        weight_func: w(lambda) for each sample, 1/lambda by default
        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
        """
        
        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
        
        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
        if centering:
            batch_weights = batch_weights - centering_strength * batch_weights.mean()
        
        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
        
        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
        
        t = lamda
        
        # compute unmasking and insertion loss
        interpolant_sample = self.interpolant.sample_interpolant(t, batch)
        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)

        prediction: ModelPrediction = self(interpolant_sample.xt, t)

        scale_factor = self.config.interpolant.max_length

        match self.unmask_loss_fn:
            case "elbo":
                mask_indices = interpolant_sample.mask_indices
                unmask_loss_all = torch.zeros_like(unmask_weight)  # [B*R, L]
                unmask_loss_all[mask_indices] = unmask_weight[mask_indices] * F.cross_entropy(
                    prediction.token_logits[mask_indices],
                    interpolant_sample.unmasked[mask_indices],
                    reduction="none",
                )
                unmask_loss = unmask_loss_all.sum(dim=1) / scale_factor  # [B*R]
            case _:
                raise ValueError(f"Invalid unmask loss type: {self.unmask_loss_fn}")

        match self.insert_loss_fn:
            case "expectation":
                gaps, gaps_mask = interpolant_sample.gaps_and_mask
                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * jump_kernel_elbo(
                    gaps[gaps_mask], prediction.expected_gaps[gaps_mask]
                )
                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]

            case "distribution":
                gaps, gaps_mask = interpolant_sample.gaps_and_mask
                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * F.cross_entropy(
                    prediction.length_posterior[gaps_mask], gaps[gaps_mask]
                )
                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]

        total_loss = unmask_loss + insertion_loss  # [B*R]
        # end compute unmasking and insertion loss
        
        weighted_loss = total_loss * batch_weights  # [B*R]
        return weighted_loss.mean()
    
    def one_step_sampler(self, xt, t, pred_rate=None):
        """
        Sample one step of unmasking using model predictions.
        
        Args:
            xt: Current state [B, L]
            t: Time [B]
            pred_rate: Optional pre-computed ModelPrediction. If None, will compute from model.
        
        Returns:
            new_xt: Next state [B, L]
            update_ids: Boolean mask of updated positions [B, L]
        """
        mask = self.interpolant.mask_token
        pad = self.interpolant.pad_token
        batch_size, L = xt.shape
        device = xt.device
        steps = self.args.total_num_steps
        dt = 1.0 / steps
        max_length = self.interpolant.max_length
        # Use actual tensor dimension L instead of max_length to handle replicated batches
        batch_idx_L = (
            torch.arange(batch_size, device=device)
            .view(batch_size, 1)
            .expand(batch_size, L)
        )
        pos_idx_L = (
            torch.arange(L, device=device)
            .view(1, L)
            .expand(batch_size, L)
        )
        
        # ——— predict and convert rates ———
        if pred_rate is None:
            pred_rate = self(xt, t)
        pred_rate = self.interpolant.to_actual_rate(xt, pred_rate, t)
        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
        len_rate = pred_rate.length_rate  # (B, L+1)

        # ——— unmask step (Euler) ———
        mask_pos = (xt == self.interpolant.mask_token).nonzero(as_tuple=True)
        unmask_rate[xt != mask] = 0
        unmask_rate[mask_pos + (mask,)] = 0
        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
        
        # add "stay" probability
        _xt = xt.clone()
        _xt[xt == pad] = mask
        trans_prob.scatter_add_(
            2,
            _xt.unsqueeze(-1),
            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
        )

        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
        
        # Renormalize probabilities to ensure they sum to 1
        prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
        # Avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
        mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
        if mask_has_zero_prob.any():
            # Create uniform distribution over valid tokens (excluding mask and pad)
            num_zero_prob = mask_has_zero_prob.sum().item()
            uniform_prob = torch.zeros((num_zero_prob, trans_prob.shape[-1]), device=device, dtype=trans_prob.dtype)
            uniform_prob[:, :mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
            trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
        else:
            # Normalize to sum to 1
            trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum

        new_xt = _sample_tokens(trans_prob)
        new_xt[xt == pad] = pad
        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
       
        # update indices--boolean tensor of shape (B, max_length)
        # A position is updated if:
        # 1. The token changed (xt != new_xt)
        # 2. It's not a pad position
        # 3. It WAS a mask token that got unmasked (so we check xt == mask, not xt != mask)
        
        # Debug before fix
        old_update_ids = (xt != new_xt) & (xt != pad) & (xt != mask)
        
        # Correct logic: updated positions are where mask tokens were changed
        update_ids = (xt != new_xt) & (xt != pad)
        
        if self.insertion_planner is False:
            return new_xt, update_ids
        
        # ——— Poisson insertion (tau-leaping) — can insert multiple masks per gap ———
        ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
        # Use ext.shape[1] to get the actual max_length dimension from the data
        actual_max_length = ext.shape[1] - 1  # ext is (B, L+1), so L = ext.shape[1] - 1
        gaps = torch.arange(ext.shape[1], device=device).view(1, -1)
        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
        total_ext = ext.sum(dim=1)
        valid = xt_len + total_ext <= actual_max_length
        ext = ext * valid.view(batch_size, 1).long()

        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
        new_len = xt_len + total_ext  # (B,)

        xt_tmp = torch.full_like(xt, pad)
        # Create position indices that match xt_tmp's shape
        pos_idx_for_fill = torch.arange(xt_tmp.shape[1], device=device).view(1, -1).expand(batch_size, -1)
        mask_fill = pos_idx_for_fill < new_len.view(batch_size, 1)
        xt_tmp[mask_fill] = mask

        new_pos_orig = pos_idx_L + ext_ex[:, :actual_max_length]  # (B, L)
        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
        flat_b = batch_idx_L[orig_mask]
        flat_p = new_pos_orig[orig_mask]
        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
        
        new_ins_xt = xt_tmp
        
        # Newly inserted masks: positions that are mask now but weren't before.
        newly_inserted_masks = (new_ins_xt == mask) & (xt != mask) & (xt != pad)
        
        update_ins_ids = newly_inserted_masks
        
        return new_xt, update_ids, new_ins_xt, update_ins_ids
    
    def loss_planner_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
        r"""
        Weighted denoising cross entropy loss
        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
        
        log_rnd: [B] — pre-computed importance weights (already softmax-normalized over the full buffer)
        x: [B, L] (no mask)
        num_replicates: R, number of replicates of each row in x
        weight_func: w(lambda) for each sample, 1/lambda by default
        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
        """
        
        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
        batch_size = batch.shape[0]
        
        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
        if centering:
            batch_weights = batch_weights - centering_strength * batch_weights.mean()
        
        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
        
        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
        
        t = lamda
        scale_factor = self.config.interpolant.max_length
        
        # compute unmasking and insertion loss
        interpolant_sample = self.interpolant.sample_interpolant(t, batch)
        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)

        prediction: ModelPrediction = self(interpolant_sample.xt, t)
        
        with torch.no_grad(): # no need to compute gradient in this step
            sampler_out = self.one_step_sampler(interpolant_sample.xt, t, prediction)
            # one_step_sampler returns (xs, update_ids) or (xs, update_ids, new_ins_xt, update_ins_ids)
            xs, update_ids = sampler_out[0], sampler_out[1]

        # The remasking head scores the freshly-decoded tokens to decide which to
        # remask, so it reads the POST-unmask state xs (matching inference, which
        # calls the planner on the decoded new_xt).
        planner = self.planner(xs, t)
        remasking_conf = planner["remasking_conf"]  # [B*R, L, 1]

        # Compute per-sample loss
        # IMPORTANT: interpolant_sample.xt has been reordered via st permutation
        # We need to map back to the original positions to compare with batch
        st = interpolant_sample.st  # [B*R, L] permutation indices
        batch_reordered = torch.gather(batch, 1, st)  # Apply same permutation to ground truth
        
        binary_label = (xs == batch_reordered).float() 
        
        # Only compute loss on positions that were updated
        per_token_loss = F.binary_cross_entropy_with_logits(
            remasking_conf.squeeze(-1),  # [B*R, L]
            binary_label,  # [B*R, L]
            reduction="none"  # [B*R, L]
        )
        
        per_token_loss = per_token_loss * update_ids.float()  # [B*R, L]
        
        # Mask out non-updated positions and average per sample
        per_sample_loss = per_token_loss.sum(dim=1) / (update_ids.sum(dim=1).float() + 1e-8)  # [B*R]
        
        # Weight by importance sampling weights
        weighted_loss = per_sample_loss * batch_weights  # [B*R]

        # ——— AUC / label-balance diagnostics (see loss_insert_planner_flexible) ———
        with torch.no_grad():
            metrics = {}
            sel_u = update_ids.bool()
            if sel_u.any():
                u_scores = remasking_conf.squeeze(-1)[sel_u]
                u_labels = binary_label[sel_u]
                metrics["unmask_auc"] = _binary_auc(u_scores, u_labels)
                metrics["unmask_label_mean"] = u_labels.mean().item()
                metrics["unmask_conf_mean"] = torch.sigmoid(u_scores).mean().item()
                metrics["unmask_n"] = float(sel_u.sum().item())
            self._last_planner_metrics = metrics

        return weighted_loss.mean()
    
    def loss_insert_planner_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
        r"""
        Weighted denoising cross entropy loss
        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
        
        log_rnd: [B] — pre-computed importance weights
        x: [B, L] (no mask)
        num_replicates: R, number of replicates of each row in x
        weight_func: w(lambda) for each sample, 1/lambda by default
        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
        """
        
        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
        batch_size = batch.shape[0]
        
        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
        if centering:
            batch_weights = batch_weights - centering_strength * batch_weights.mean()
        
        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
        
        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
        
        t = lamda
        scale_factor = self.config.interpolant.max_length
        
        # compute unmasking and insertion loss
        # deleted mask: binary tensor [B*R, L] where true tokens in batch were deleted
        # gap_assignment: [B*R, max_gaps, L] maps x1 positions to gap indices
        interpolant_sample, deleted_mask, gap_assignment = self.interpolant.sample_interpolant_plan(t, batch)
        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)

        prediction: ModelPrediction = self(interpolant_sample.xt, t)
        
        with torch.no_grad(): # no need to compute gradient in this step
            xs_unmask, update_unmask_ids, xs_insert, update_ins_ids = self.one_step_sampler(interpolant_sample.xt, t, prediction)

        # The remasking head scores the freshly-decoded tokens to decide which to
        # remask, so it must see the POST-unmask state xs_unmask (matching
        # inference in inference_quality.py, which calls the planner on the
        # decoded new_xt). Grad stays on here since this head is what we train.
        planner = self.planner(xs_unmask, t)
        remasking_conf = planner["remasking_conf"]  # [B*R, L, 1]

        # The insertion-quality head scores the freshly-inserted mask tokens, so
        # it must see the POST-insertion state xs_insert (aligned with
        # update_ins_ids / insertion_quality below, and matching inference in
        # remasking_scheduleaware.apply_schedule_aware_insertion). Grad stays on
        # here since this head is what we are training.
        if self.planner.insertion_planner:
            insertion_conf = self.planner(xs_insert, t)["insertion_conf"]  # [B*R, L, 1]
        else:
            insertion_conf = None
        
        # Compute per-sample loss
        # IMPORTANT: interpolant_sample.xt has been reordered via st permutation
        # We need to map back to the original positions to compare with batch
        # Use the st (permutation) to get the ground truth in the reordered space
        st = interpolant_sample.st  # [B*R, L] permutation indices
        batch_reordered = torch.gather(batch, 1, st)  # Apply same permutation to ground truth
        
        # Now compare in the reordered space
        binary_label = (xs_unmask == batch_reordered).float() 
        
        # Per-token BCE at every position; non-updated positions are zeroed out below
        per_token_loss = F.binary_cross_entropy_with_logits(
            remasking_conf.squeeze(-1),  # [B*R, L]
            binary_label,  # [B*R, L]
            reduction="none"  # [B*R, L]
        )
        
        per_token_loss = per_token_loss * update_unmask_ids.float()  # [B*R, L]
        
        # Mask out non-updated positions and average per sample
        unmask_per_sample_loss = per_token_loss.sum(dim=1) / (update_unmask_ids.sum(dim=1).float() + 1e-8)  # [B*R]
        
        # compute insertion planner loss
        # For positions where masks were inserted, we evaluate the quality of insertion
        # by computing the probability that the ground truth token would be predicted at that position
        
        # IMPORTANT: We need to recompute predictions using xs_insert since that's where the masks were inserted
        # The original prediction was computed from xt (before insertion)
        with torch.no_grad():
            prediction_after_insert: ModelPrediction = self(xs_insert, t)
        
        # Get the token prediction probabilities at inserted mask positions
        # prediction_after_insert.token_logits: [B*R, L, V] - logits for all positions in xs_insert
        token_probs = F.softmax(prediction_after_insert.token_logits, dim=-1)  # [B*R, L, V]
        
        # For each gap where masks were inserted, compute the sum of probabilities
        # of the ground truth tokens that were deleted in that specific gap
        # gap_assignment: [B*R, max_gaps, L] - maps x1 positions to gap indices
        # batch: [B*R, L] - ground truth tokens in original space (before permutation)
        
        vocab_size = token_probs.shape[-1]
        L = token_probs.shape[1]
        max_gaps = gap_assignment.shape[1]
        
        # For each gap, create a vocabulary mask of tokens that belong to that gap
        # gap_vocab_mask[b, gap_idx, token_id] = 1 if token_id was deleted in gap gap_idx
        gap_vocab_mask = torch.zeros(batch_size, max_gaps, vocab_size, device=batch.device, dtype=torch.float)
        
        # Vectorized: gather tokens from batch for all gaps at once
        # tokens_expanded[b, gap_idx, pos] = batch[b, pos] for all positions
        tokens_expanded = batch.unsqueeze(1).expand(batch_size, max_gaps, L)  # [B*R, max_gaps, L]
        
        # valid_mask[b, gap_idx, pos] = 1 if position pos belongs to gap gap_idx and is not pad
        valid_mask = (gap_assignment > 0) & (tokens_expanded != self.interpolant.pad_token)  # [B*R, max_gaps, L]
        
        # Scatter tokens into vocabulary dimension: mark which tokens appear in each gap
        gap_vocab_mask.scatter_add_(
            2,  # scatter along vocabulary dimension
            tokens_expanded.clamp(0, vocab_size - 1),  # token indices [B*R, max_gaps, L]
            valid_mask.float()  # values to add [B*R, max_gaps, L]
        )
        
        # Binarize: a token either appears in the gap or not
        gap_vocab_mask = (gap_vocab_mask > 0).float()  # [B*R, max_gaps, V]
        
        # For each insertion position in xs_insert, determine which gap it corresponds to
        # Position p in xs_insert corresponds to gap p (insertions occur between existing tokens)
        # Vectorized: compute for all positions at once
        # token_probs: [B*R, L, V]
        # gap_vocab_mask[:, :L, :]: [B*R, L, V] - vocab mask for gaps 0 to L-1
        insertion_quality_full = (token_probs * gap_vocab_mask[:, :L, :]).sum(dim=-1)  # [B*R, L]
        
        # Only consider quality at positions where masks were actually inserted
        insertion_quality = insertion_quality_full * update_ins_ids.float()  # [B*R, L]
        
        # Compute insertion planner loss only if insertion_planner is enabled
        if insertion_conf is not None:
            # The planner predicts insertion confidence with insertion_conf
            # We want to train it to predict high confidence when insertion_quality is high
            # Use Bernoulli cross-entropy: treat insertion_quality as the "success probability"
            
            # Binary cross-entropy with insertion_quality as continuous labels in [0,1]
            ins_per_token_loss = F.binary_cross_entropy_with_logits(
                insertion_conf.squeeze(-1),  # [B*R, L] - planner's insertion confidence logits
                insertion_quality,  # [B*R, L] - ground truth token probability as quality metric
                reduction="none"
            )
            
            # Only compute loss where masks were actually inserted
            ins_per_token_loss = ins_per_token_loss * update_ins_ids.float()
            
            # Average per sample
            ins_per_sample_loss = ins_per_token_loss.sum(dim=1) / (update_ins_ids.sum(dim=1).float() + 1e-8)
        else:
            # No insertion planner - set loss to zero
            ins_per_sample_loss = torch.zeros_like(unmask_per_sample_loss)
        
        # Add to total loss
        per_sample_loss = unmask_per_sample_loss + ins_per_sample_loss
        
        # Weight by importance sampling weights
        weighted_loss = per_sample_loss * batch_weights  # [B*R]

        # ——— AUC / label-balance diagnostics (the loss alone hides degenerate
        # targets; near-0 BCE can mean "all labels one class", not "learned") ———
        with torch.no_grad():
            metrics = {}
            sel_u = update_unmask_ids.bool()
            if sel_u.any():
                u_scores = remasking_conf.squeeze(-1)[sel_u]
                u_labels = binary_label[sel_u]
                metrics["unmask_auc"] = _binary_auc(u_scores, u_labels)
                metrics["unmask_label_mean"] = u_labels.mean().item()
                metrics["unmask_conf_mean"] = torch.sigmoid(u_scores).mean().item()
                metrics["unmask_n"] = float(sel_u.sum().item())
            if insertion_conf is not None:
                sel_i = update_ins_ids.bool()
                if sel_i.any():
                    i_scores = insertion_conf.squeeze(-1)[sel_i]
                    i_targets = insertion_quality[sel_i]
                    i_labels = (i_targets > 0.5).float()
                    metrics["insert_auc"] = _binary_auc(i_scores, i_labels)
                    metrics["insert_target_mean"] = i_targets.mean().item()
                    metrics["insert_conf_mean"] = torch.sigmoid(i_scores).mean().item()
                    metrics["insert_n"] = float(sel_i.sum().item())
            self._last_planner_metrics = metrics

        return unmask_per_sample_loss.mean(), ins_per_sample_loss.mean(), weighted_loss.mean()