# models/diffloss.py

import math
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from models.diffusion import create_diffusion


# ---------------- utils ----------------
def modulate(x, shift, scale):
    return x * (1 + scale) + shift


class TimestepEmbedder(nn.Module):
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32) / half).to(t.device)
        args = t[:, None].float() * freqs[None]
        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
        return emb

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))


class SinPos1D(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
    def forward(self, L, device, dtype):
        pe = torch.zeros(L, self.dim, device=device, dtype=torch.float32)
        pos = torch.arange(0, L, device=device, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) * (-math.log(10000.0)/self.dim))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        return pe.to(dtype)


# --------------- DiT block (causal) ---------------
class TemporalDiTBlock(nn.Module):
    """
    Transformer block with AdaLN (DiT-style), **causal** self-attention over time.
    """
    def __init__(self, dim, n_heads, mlp_ratio=4.0, dropout=0.0):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = nn.MultiheadAttention(dim, n_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden = int(dim * mlp_ratio)
        self.ffn = nn.Sequential(
            nn.Linear(dim, 2 * hidden, bias=True),
            nn.SiLU(),
            nn.Linear(2 * hidden, dim, bias=True),
        )
        # AdaLN params: shift/scale/gate for attn and ffn
        self.adaLN = nn.Sequential(nn.SiLU(), nn.Linear(dim, 6 * dim, bias=True))
        nn.init.constant_(self.adaLN[-1].weight, 0)
        nn.init.constant_(self.adaLN[-1].bias, 0)

    def forward(self, x, y, causal_mask):
        """
        x: [B, L, D], y: [B, D], causal_mask: [L, L] bool, True = mask (disallow)
        """
        s1, sc1, g1, s2, sc2, g2 = self.adaLN(y).chunk(6, dim=-1)  # [B, D] each

        # attn (causal)
        h = modulate(self.norm1(x), s1.unsqueeze(1), sc1.unsqueeze(1))
        # torch's attn expects attn_mask shape [L, L] or [B*nH, L, L]; True means -inf
        h, _ = self.attn(h, h, h, attn_mask=causal_mask, need_weights=False)
        x = x + g1.unsqueeze(1) * h

        # ffn
        h2 = modulate(self.norm2(x), s2.unsqueeze(1), sc2.unsqueeze(1))
        h2 = self.ffn(h2)
        x = x + g2.unsqueeze(1) * h2
        return x


class FinalLayer(nn.Module):
    def __init__(self, dim, out_channels):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(dim, out_channels, bias=True)
        self.adaLN = nn.Sequential(nn.SiLU(), nn.Linear(dim, 2 * dim, bias=True))
        nn.init.constant_(self.adaLN[-1].weight, 0)
        nn.init.constant_(self.adaLN[-1].bias, 0)
        nn.init.constant_(self.linear.weight, 0)
        nn.init.constant_(self.linear.bias, 0)

    def forward(self, x, c):
        shift, scale = self.adaLN(c).chunk(2, dim=-1)
        x = modulate(self.norm(x), shift.unsqueeze(1), scale.unsqueeze(1))
        return self.linear(x)


# --------------- Temporal DiT (sequence-aware, causal) ---------------
class TemporalDiTAdaLN(nn.Module):
    """
    DiT-like denoiser that:
      - operates on [B, L, C]
      - uses **causal** attention (each position sees only <= t)
      - accepts (B, L) via set_sequence_layout for flatten↔sequence reshaping
      - returns all positions but we usually **read only the last token** for streaming
    """
    def __init__(self, in_channels, model_channels, out_channels, z_channels, depth, n_heads=8,
                 mlp_ratio=4.0, grad_checkpointing=False):
        super().__init__()
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.z_channels = z_channels
        self.depth = depth
        self.n_heads = n_heads
        self.grad_checkpointing = grad_checkpointing

        self.time_embed = TimestepEmbedder(model_channels)
        self.cond_embed = nn.Linear(z_channels, model_channels)
        self.input_proj = nn.Linear(in_channels, model_channels)
        self.pos = SinPos1D(model_channels)

        self.blocks = nn.ModuleList([
            TemporalDiTBlock(model_channels, n_heads=n_heads, mlp_ratio=mlp_ratio)
            for _ in range(depth)
        ])
        self.final = FinalLayer(model_channels, out_channels)

        self._seq_B = None
        self._seq_L = None

        self._init_weights()

    def _init_weights(self):
        def _xav(m):
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None: nn.init.constant_(m.bias, 0)
        self.apply(_xav)
        nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
        nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)

    def set_sequence_layout(self, B, L):
        self._seq_B = int(B)
        self._seq_L = int(L)

    def _flatten_to_seq(self, x_flat, c_flat):
        if self._seq_B is None or self._seq_L is None:
            B, L = x_flat.shape[0], 1
        else:
            B, L = self._seq_B, self._seq_L
            assert B * L == x_flat.shape[0], f"set_sequence_layout({B},{L}) mismatch"
        x = x_flat.view(B, L, -1)
        c = c_flat.view(B, L, -1)
        return x, c

    @staticmethod
    def _causal_mask(L, device):
        # True where masked (disallowed)
        m = torch.ones(L, L, device=device, dtype=torch.bool).triu(1)
        # MultiheadAttention expects float mask with -inf where we mask.
        # But newer PyTorch also supports bool with True=mask. We'll pass bool here.
        return m

    def forward(self, x_flat, t, c_flat, cfg_scale: float = 1.0):
        x, c = self._flatten_to_seq(x_flat, c_flat)      # [B, L, C], [B, L, Cz]
        B, L, _ = x.shape

        x = self.input_proj(x)
        pos = self.pos(L, x.device, x.dtype)
        x = x + pos.unsqueeze(0)

        # pool cond to a single AdaLN vector per batch (like DiT)
        t_emb = self.time_embed(t).view(B, L, -1).mean(dim=1)   # [B, D]
        c_emb = self.cond_embed(c).mean(dim=1)                  # [B, D]
        y = t_emb + c_emb

        causal_mask = self._causal_mask(L, x.device)

        if self.grad_checkpointing and not torch.jit.is_scripting():
            for blk in self.blocks:
                x = checkpoint(blk, x, y, causal_mask)
        else:
            for blk in self.blocks:
                x = blk(x, y, causal_mask)

        out = self.final(x, y)                         # [B, L, out_channels]
        return out.view(B * L, -1)

    def forward_with_cfg(self, x, t, c, cfg_scale):
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, c, cfg_scale=cfg_scale)
        eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([guided, guided], dim=0)
        return torch.cat([eps, rest], dim=1)


# --------------- Wrapper (same training API) + streaming helpers ---------------
class DiffLoss(nn.Module):
    """
    Diffusion loss with **causal, streamable** temporal DiT denoiser.
    Training API unchanged; plus:
      - set_sequence_layout(B, L)
      - sample_next_token(z_seq, temperature=1.0, cfg=1.0) -> [B, C] (last token)
    """
    def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps,
                 grad_checkpointing=False, learn_sigma=False, n_heads=8, mlp_ratio=4.0):
        super().__init__()
        self.in_channels = target_channels
        self.learn_sigma = learn_sigma

        self.net = TemporalDiTAdaLN(
            in_channels=target_channels,
            model_channels=width,
            out_channels=target_channels * 2 if learn_sigma else target_channels,
            z_channels=z_channels,
            depth=depth,
            n_heads=n_heads,
            mlp_ratio=mlp_ratio,
            grad_checkpointing=grad_checkpointing
        )

        self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="cosine")
        self.gen_diffusion = create_diffusion(timestep_respacing=num_sampling_steps, noise_schedule="cosine")

        # cached (B,L) for flatten↔sequence
        self._B = None
        self._L = None

    # --- layout for flatten<->sequence ---
    def set_sequence_layout(self, B, L):
        self._B, self._L = int(B), int(L)
        self.net.set_sequence_layout(B, L)

    # --- training ---
    def forward(self, target, z, mask=None):
        t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
        loss_dict = self.train_diffusion.training_losses(self.net, target, t, dict(c=z))
        loss, pred_xstart = loss_dict["loss"], loss_dict["pred_xstart"]
        if mask is not None:
            loss = (loss * mask).sum() / mask.sum()
        return loss.mean(), pred_xstart

    # --- full sequence sampling (kept for compatibility) ---
    def sample(self, z, temperature=1.0, cfg=1.0):
        if cfg != 1.0:
            noise = torch.randn(z.shape[0] // 2, self.in_channels, device=z.device)
            noise = torch.cat([noise, noise], dim=0)
            sample_fn = self.net.forward_with_cfg
            kwargs = dict(c=z, cfg_scale=cfg)
        else:
            noise = torch.randn(z.shape[0], self.in_channels, device=z.device)
            sample_fn = self.net.forward
            kwargs = dict(c=z)

        return self.gen_diffusion.p_sample_loop(
            sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=kwargs,
            progress=False, temperature=temperature
        )

    # --- STREAMING: sample only the **last token** of current window ---
    @torch.no_grad()
    def sample_next_token(self, z_seq, temperature=1.0, cfg=1.0):
        """
        z_seq: [B, L, Cz] AR conditions for the current streaming window (history + 1 step).
               Call set_sequence_layout(B, L) first.
        Returns: next_token: [B, C] (the last position’s denoised sample).
        Mechanism: denoise **entire window** with causal attention and read the last index only.
        """
        assert self._B is not None and self._L is not None, "Call set_sequence_layout(B, L) first."
        B, L, Cz = z_seq.shape
        assert B == self._B and L == self._L, "z_seq shape must match set_sequence_layout."

        z_flat = z_seq.reshape(B * L, Cz)

        if cfg != 1.0:
            noise = torch.randn((B * L) // 2, self.in_channels, device=z_seq.device)
            noise = torch.cat([noise, noise], dim=0)
            sample_fn = self.net.forward_with_cfg
            kwargs = dict(c=z_flat, cfg_scale=cfg)
        else:
            noise = torch.randn(B * L, self.in_channels, device=z_seq.device)
            sample_fn = self.net.forward
            kwargs = dict(c=z_flat)

        x = self.gen_diffusion.p_sample_loop(
            sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=kwargs,
            progress=False, temperature=temperature
        )  # [B*L, C]

        x_seq = x.view(B, L, self.in_channels)
        return x_seq[:, -1, :]   # last token only