import torch
import torch.nn as nn
import math
import numpy as np
import torch.nn.functional as F
# from timm.models.vision_transformer import Attention, Mlp -> handson_tims

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # attn = (q @ k.transpose(-2, -1)) * self.scale
        # attn = attn.softmax(dim=-1)
        # attn = self.attn_drop(attn)

        # x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        # x = self.proj(x)
        # x = self.proj_drop(x)

        ## Replace: use Flash-Attention
        x = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0) 
        
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class Patch1D(nn.Module):
    """
    [B, L, D] -> [B, L/P, D*P]
    """
    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def forward(self, x): 
        B, L, D = x.shape
        # Pad sequence if not divisible by patch_size
        # [B,31,4]->patch_size = 2 -> [B,16,8],pad is [x_31, padding_0,,,]
        if L % self.patch_size != 0:
            pad = self.patch_size - (L % self.patch_size)
            x = F.pad(x, (0, 0, 0, pad))
            
        B, L_new, D = x.shape
        # View as patches
        return x.view(B, L_new // self.patch_size, D * self.patch_size)

class Unpatch1D(nn.Module):
    """
    [B, L/P, D*P] -> [B, L, D]
    """
    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def forward(self, x):
        B, L_new, DP = x.shape
        return x.view(B, L_new * self.patch_size, DP // self.patch_size)

### 这里DiT的pos_embed没有使用到三角函数；另外，没有forward_with_cfg的函数实现 -> 暂时没有label_embedding
## from: https://github.com/willisma/SiT/blob/main/models.py
class TimestepEmbedder(nn.Module):
    """Sinusoidal Time Embeddings"""
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size,bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size,bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        ## 兼容更多的 t 格式
        if t.ndim > 1:
            t = t.view(-1)

        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb


def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

## DiTBlock, adaptive layer norm conditioning 
class DiTBlock(nn.Module):
    """Transformer Block with Adaptive Layer Norm (adaLN)"""
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        # c shape: [B, hidden_size]
        # adaLN_out shape 应该是 [B, 6 * hidden_size]
        adaLN_out = self.adaLN_modulation(c)
        
        # --- Debug 探针 (如果再次报错，请查看这里打印的形状) ---
        if adaLN_out.shape[1] != 6 * self.hidden_size:
            print(f"⚠️ DiTBlock Shape Error!")
            print(f"Input c shape: {c.shape}")
            print(f"adaLN output shape: {adaLN_out.shape}")
            print(f"Expected dim1: {6 * self.hidden_size}")
            raise ValueError("adaLN output dimension mismatch!")
        # ----------------------------------------------------

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = adaLN_out.chunk(6, dim=1)
        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x

class PatchedFlowDiT(nn.Module):
    """
    Main DiT Architecture for Flow Matching
    Input: z_t (Noisy Latent) + t (Time) + condition (Original Latent)
    Output: velocity vector
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        ## add patch and unpatch block here
        self.patcher = Patch1D(cfg.patch_size)
        self.unpatcher = Unpatch1D(cfg.patch_size)

        # 计算 Patch 后的输入维度
        # Input to DiT = Patch(z_t) + Patch(Condition)
        # 维度 = (Latent * Patch) * 2
        input_feat_dim = cfg.latent_dim * cfg.patch_size
        
        # Projection to DiT Hidden Size
        self.input_proj = nn.Linear(input_feat_dim * 2, cfg.dit_hidden)

        # Time & Pos Embeddings
        self.time_embed = TimestepEmbedder(cfg.dit_hidden)
        patched_len = (cfg.max_seq_len + cfg.patch_size - 1) // cfg.patch_size
        self.pos_embed = nn.Parameter(torch.zeros(1, patched_len, cfg.dit_hidden))
        
        self.blocks = nn.ModuleList([
            DiTBlock(cfg.dit_hidden, cfg.dit_heads) for _ in range(cfg.dit_layers)
        ])
        
        # Output Projection (Predict Velocity)
        self.final_layer = nn.Linear(cfg.dit_hidden, input_feat_dim)
        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize pos_embed
        nn.init.normal_(self.pos_embed, std=0.02)

        # Zero-out adaLN modulation layers
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out final layer -> modify: to predict data, so initialize is xavier or normal
        # nn.init.constant_(self.final_layer.weight, 0)
        # nn.init.constant_(self.final_layer.bias, 0)
        nn.init.xavier_uniform_(self.final_layer.weight)
        nn.init.constant_(self.final_layer.bias, 0)

    def forward(self, z_t, t, condition):
        # x: [Batch, Seq, Dim]
        # t: [Batch]
        # condition: [Batch, Seq, Dim] (Optional, e.g., Source Sentence)
        """
        z_t: [B, L, D]
        condition: [B, L, D]
        """
        # 1. Patching
        z_p = self.patcher(z_t)
        c_p = self.patcher(condition)
        
        # 2. Concat & Project(Jit Style: Explicit Conditioning)
        x = torch.cat([z_p, c_p], dim=-1)
        x = self.input_proj(x)
        
        # 3. Add Embeddings
        t_emb = self.time_embed(t)
        # Handle length mismatch due to padding
        L_curr = x.shape[1]
        x = x + self.pos_embed[:, :L_curr, :]
        
        # 4. Transformer
        for block in self.blocks:
            x = block(x, t_emb)
            
        # 5. Output & Unpatch
        v_p = self.final_layer(x)
        v = self.unpatcher(v_p)
        
        # Crop to original length
        return v[:, :z_t.shape[1], :]

    def forward_with_cfg(self, x, t, condition, cfg_scale):
        """
        支持 Classifier-Free Guidance 的前向传播
        """
        # 1. condition
        cond_out = self.forward(x, t, condition)
        
        # 2.uncondition
        uncond_out = self.forward(x, t, condition=None)
        
        # 3. classifier-free guidance
        # eps = eps_uncond + s * (eps_cond - eps_uncond)
        return uncond_out + cfg_scale * (cond_out - uncond_out)