"""
LiquidGen: A Novel Liquid Neural Network Image Generation Model

Architecture Overview:
- Frozen VAE encoder/decoder (SDXL VAE, 4ch latent, 8x compression, no login needed)
- Liquid backbone for denoising (fully parallelizable, no attention, no sequential ODE)
- Flow matching training objective (velocity prediction)

Key Innovation: Replaces attention with Liquid Neural Network dynamics:
- CfC-inspired closed-form update: x_new = α·x + (1-α)·h(x)
- Per-channel learnable decay rates (liquid time constants)
- Depthwise + pointwise convolutions for spatial context (no attention needed)
- Zigzag spatial scanning for global receptive field
- Gated stimulus with biologically-inspired sign constraints
- U-Net style long skip connections from shallow to deep blocks

Math Foundation (from Hasani et al., CfC paper):
  x_{t+1} = exp(-Δt/τ_t) · x_t + (1 - exp(-Δt/τ_t)) · h(x_t, u_t)
  
Our parallelizable adaptation (inspired by LiquidTAD):
  α = exp(-softplus(ρ))  [per-channel learnable decay]
  h = gate · stimulus    [gated depthwise conv output]  
  out = α · x + (1 - α) · h  [liquid relaxation blend]

This removes the input-dependent τ (which requires sequential computation)
and replaces it with a per-channel learned decay — making it fully parallel
while preserving the liquid dynamics' ability to blend old state with new input.

Design for 16GB VRAM (Colab free tier):
- VAE frozen: ~1GB
- Backbone: ~55-280M params (~100-550MB in fp16)  
- Training overhead (grads + optimizer): ~3-8GB
- Batch of latents: ~1-2GB
- Total: fits comfortably in 16GB

References:
- Hasani et al., "Liquid Time-constant Networks" (NeurIPS 2020)
- Hasani et al., "Closed-form Continuous-depth Models" (Nature Machine Intelligence 2022)
- Lechner et al., "Neural Circuit Policies" (Nature Machine Intelligence 2020)
- LiquidTAD (2025) - Parallelized liquid dynamics
- ZigMa (ECCV 2024) - Zigzag scanning for SSM-based diffusion
- DiMSUM (NeurIPS 2024) - Attention-free diffusion
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
import math
from typing import Optional, Tuple


# =============================================================================
# Building Blocks
# =============================================================================

class LiquidTimeConstant(nn.Module):
    """
    Core liquid time-constant module.
    
    Implements the CfC closed-form dynamics in a fully parallelizable way:
      out = α · x + (1 - α) · stimulus
    
    where α = exp(-softplus(ρ)) is a learnable per-channel decay rate,
    derived from the liquid time constant τ = 1/softplus(ρ).
    
    This preserves the key property of Liquid Neural Networks:
    - Exponential relaxation toward a target (stimulus)
    - Rate controlled by τ (how fast to adapt)
    - No sequential ODE solving required
    
    Stability guarantee (from LTC Theorem 1):
    τ_sys ∈ [τ/(1+τW), τ] — time constants NEVER explode
    """
    def __init__(self, channels: int):
        super().__init__()
        # ρ parameterizes the decay: λ = softplus(ρ), α = exp(-λ)
        # Initialize ρ=0 → λ≈0.693 → α≈0.5 (equal blend of old and new)
        self.rho = nn.Parameter(torch.zeros(channels))
    
    def forward(self, x: torch.Tensor, stimulus: torch.Tensor) -> torch.Tensor:
        """
        x: [B, C, H, W] - current state (residual path)
        stimulus: [B, C, H, W] - computed target from context
        returns: [B, C, H, W] - liquid-blended output
        """
        lam = F.softplus(self.rho) + 1e-5
        alpha = torch.exp(-lam).view(1, -1, 1, 1)
        return alpha * x + (1.0 - alpha) * stimulus


class GatedDepthwiseStimulusConv(nn.Module):
    """
    Computes the spatial stimulus using depthwise-separable convolutions
    with a sigmoid gate (inspired by GLU / gated mechanisms in SSMs).
    
    This replaces attention for capturing local spatial context:
    - Depthwise conv: captures local spatial patterns per channel
    - Pointwise conv: mixes channel information
    - Sigmoid gate: controls information flow (like synaptic gating in NCP)
    
    Two parallel paths (inspired by NCP inter→command split):
    1. Stimulus path: DW-conv → PW-conv → GELU → project back
    2. Gate path: DW-conv → PW-conv → sigmoid
    Output = stimulus * gate
    """
    def __init__(self, channels: int, kernel_size: int = 7, expand_ratio: float = 2.0):
        super().__init__()
        hidden = int(channels * expand_ratio)
        
        self.stim_dw = nn.Conv2d(channels, channels, kernel_size, 
                                  padding=kernel_size // 2, groups=channels, bias=False)
        self.stim_pw = nn.Conv2d(channels, hidden, 1, bias=False)
        self.stim_act = nn.GELU()
        self.stim_proj = nn.Conv2d(hidden, channels, 1, bias=False)
        
        self.gate_dw = nn.Conv2d(channels, channels, kernel_size,
                                  padding=kernel_size // 2, groups=channels, bias=False)
        self.gate_pw = nn.Conv2d(channels, channels, 1, bias=True)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        stim = self.stim_proj(self.stim_act(self.stim_pw(self.stim_dw(x))))
        gate = torch.sigmoid(self.gate_pw(self.gate_dw(x)))
        return stim * gate


class ChannelMixMLP(nn.Module):
    """Channel mixing MLP with GELU activation (command neuron processing in NCP)."""
    def __init__(self, channels: int, expand_ratio: float = 4.0):
        super().__init__()
        hidden = int(channels * expand_ratio)
        self.fc1 = nn.Conv2d(channels, hidden, 1, bias=True)
        self.act = nn.GELU()
        self.fc2 = nn.Conv2d(hidden, channels, 1, bias=True)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.act(self.fc1(x)))


class AdaptiveGroupNorm(nn.Module):
    """
    Adaptive Group Normalization conditioned on timestep embedding.
    Applies: out = (1 + scale) * GroupNorm(x) + shift
    """
    def __init__(self, channels: int, cond_dim: int, num_groups: int = 32):
        super().__init__()
        self.norm = nn.GroupNorm(num_groups, channels, affine=False)
        self.proj = nn.Linear(cond_dim, channels * 2)
        nn.init.zeros_(self.proj.weight)
        nn.init.zeros_(self.proj.bias)
    
    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        h = self.norm(x)
        params = self.proj(cond)
        scale, shift = params.chunk(2, dim=-1)
        return h * (1.0 + scale.unsqueeze(-1).unsqueeze(-1)) + shift.unsqueeze(-1).unsqueeze(-1)


class ZigzagScan1D(nn.Module):
    """
    1D global mixing via zigzag-scanned depthwise conv.
    
    Gives quasi-global receptive field without attention's O(n²) cost.
    Zigzag scan preserves spatial continuity (from ZigMa, ECCV 2024).
    """
    def __init__(self, channels: int, kernel_size: int = 31):
        super().__init__()
        self.conv1d = nn.Conv1d(channels, channels, kernel_size, 
                                padding=kernel_size // 2, groups=channels, bias=False)
        self.pw = nn.Conv1d(channels, channels, 1, bias=True)
        self.act = nn.GELU()
        self._idx_cache = {}
    
    def _get_indices(self, H: int, W: int, device: torch.device):
        key = (H, W, device)
        if key not in self._idx_cache:
            indices = []
            for i in range(H):
                row = list(range(i * W, (i + 1) * W))
                if i % 2 == 1:
                    row = row[::-1]
                indices.extend(row)
            fwd = torch.tensor(indices, device=device, dtype=torch.long)
            inv = torch.empty_like(fwd)
            inv[fwd] = torch.arange(H * W, device=device)
            self._idx_cache[key] = (fwd, inv)
        return self._idx_cache[key]
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, C, H, W = x.shape
        zz_idx, inv_idx = self._get_indices(H, W, x.device)
        x_flat = x.reshape(B, C, H * W)
        x_zz = x_flat[:, :, zz_idx]
        x_mixed = self.pw(self.act(self.conv1d(x_zz)))
        x_restored = x_mixed[:, :, inv_idx]
        return x_restored.reshape(B, C, H, W)


# =============================================================================
# Liquid Block: The core building block
# =============================================================================

class LiquidBlock(nn.Module):
    """
    A single Liquid Neural Network block for image denoising.
    
    Architecture (maps to NCP hierarchy):
    1. [SENSORY] AdaGN conditioning → spatial context extraction
    2. [INTER]   Zigzag 1D scan for global mixing
    3. [COMMAND] Liquid time-constant blend (CfC dynamics)
    4. [MOTOR]   Channel mixing MLP for output projection
    
    All operations are fully parallelizable — no sequential dependencies.
    """
    def __init__(
        self, channels: int, cond_dim: int, spatial_kernel: int = 7,
        scan_kernel: int = 31, expand_ratio: float = 2.0, mlp_ratio: float = 4.0,
        drop_rate: float = 0.0, use_zigzag: bool = True,
    ):
        super().__init__()
        self.norm1 = AdaptiveGroupNorm(channels, cond_dim)
        self.norm2 = AdaptiveGroupNorm(channels, cond_dim)
        self.spatial_stim = GatedDepthwiseStimulusConv(channels, spatial_kernel, expand_ratio)
        self.use_zigzag = use_zigzag
        if use_zigzag:
            self.zigzag = ZigzagScan1D(channels, scan_kernel)
            self.zigzag_gate = nn.Parameter(torch.zeros(1))
        self.liquid = LiquidTimeConstant(channels)
        self.channel_mix = ChannelMixMLP(channels, mlp_ratio)
        self.liquid2 = LiquidTimeConstant(channels)
        self.drop = nn.Dropout2d(drop_rate) if drop_rate > 0 else nn.Identity()
    
    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        h = self.norm1(x, cond)
        stim = self.spatial_stim(h)
        if self.use_zigzag:
            zz = self.zigzag(h)
            stim = stim + torch.sigmoid(self.zigzag_gate) * zz
        stim = self.drop(stim)
        x = self.liquid(x, stim)
        h2 = self.norm2(x, cond)
        ch_out = self.drop(self.channel_mix(h2))
        x = self.liquid2(x, ch_out)
        return x


# =============================================================================
# Timestep and Class Embeddings
# =============================================================================

class TimestepEmbedding(nn.Module):
    """Sinusoidal timestep embedding followed by MLP projection."""
    def __init__(self, dim: int, freq_dim: int = 256):
        super().__init__()
        self.freq_dim = freq_dim
        self.mlp = nn.Sequential(nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
    
    def forward(self, t: torch.Tensor) -> torch.Tensor:
        half = self.freq_dim // 2
        freqs = torch.exp(-math.log(10000.0) * torch.arange(half, device=t.device, dtype=t.dtype) / half)
        args = t.unsqueeze(-1) * freqs.unsqueeze(0)
        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        return self.mlp(emb)


class ClassEmbedding(nn.Module):
    """Optional class-conditional embedding with CFG null embedding."""
    def __init__(self, num_classes: int, dim: int):
        super().__init__()
        self.embed = nn.Embedding(num_classes, dim)
        self.null_embed = nn.Parameter(torch.randn(dim) * 0.02)
    
    def forward(self, labels: torch.Tensor, drop_prob: float = 0.0) -> torch.Tensor:
        emb = self.embed(labels)
        if self.training and drop_prob > 0:
            mask = torch.rand(labels.shape[0], 1, device=labels.device) < drop_prob
            emb = torch.where(mask, self.null_embed.unsqueeze(0).expand_as(emb), emb)
        return emb


# =============================================================================
# LiquidGen: Full Model
# =============================================================================

class LiquidGen(nn.Module):
    """
    LiquidGen: Liquid Neural Network Image Generator
    
    A novel attention-free diffusion model that uses Liquid Neural Network
    dynamics (CfC closed-form continuous-depth) for image generation.
    
    Features:
    - NO self-attention anywhere — O(n) complexity
    - NO sequential ODE solving — fully parallelizable
    - Liquid time constants for adaptive information blending
    - Zigzag scanning for global context
    - Depthwise convolutions for local spatial structure
    - Gated stimulus (biologically-inspired from NCP)
    - U-Net long skip connections (from U-ViT/DiM)
    
    Config Presets:
    - LiquidGen-S: ~55M params (256px, fast training)
    - LiquidGen-B: ~140M params (256/512px, balanced)
    - LiquidGen-L: ~280M params (512px, high quality)
    """
    
    def __init__(
        self,
        in_channels: int = 4,         # 4 for SDXL VAE
        patch_size: int = 2,
        embed_dim: int = 512,
        depth: int = 16,
        spatial_kernel: int = 7,
        scan_kernel: int = 31,
        expand_ratio: float = 2.0,
        mlp_ratio: float = 4.0,
        drop_rate: float = 0.0,
        num_classes: int = 0,
        class_drop_prob: float = 0.1,
        use_zigzag: bool = True,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_classes = num_classes
        self.class_drop_prob = class_drop_prob
        
        cond_dim = embed_dim
        
        self.time_embed = TimestepEmbedding(cond_dim)
        self.class_embed = ClassEmbedding(num_classes, cond_dim) if num_classes > 0 else None
        
        self.patch_embed = nn.Conv2d(in_channels, embed_dim, patch_size, stride=patch_size)
        
        self.pos_embed_size = 32
        self.pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, self.pos_embed_size, self.pos_embed_size) * 0.02
        )
        
        self.input_proj = nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim, 3, padding=1, groups=embed_dim, bias=False),
            nn.Conv2d(embed_dim, embed_dim, 1, bias=True),
            nn.GELU(),
        )
        
        self.blocks = nn.ModuleList([
            LiquidBlock(embed_dim, cond_dim, spatial_kernel, scan_kernel,
                       expand_ratio, mlp_ratio, drop_rate, use_zigzag)
            for _ in range(depth)
        ])
        
        self.final_norm = nn.GroupNorm(32, embed_dim)
        self.final_proj = nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim, 3, padding=1, bias=True),
            nn.GELU(),
        )
        
        self.unpatch = nn.ConvTranspose2d(embed_dim, in_channels, patch_size, stride=patch_size)
        nn.init.zeros_(self.unpatch.weight)
        nn.init.zeros_(self.unpatch.bias)
        
        self.apply(self._init_weights)
        self._gradient_checkpointing = False
    
    def enable_gradient_checkpointing(self):
        """Enable gradient checkpointing to reduce VRAM by ~40-60%.
        Recomputes block activations during backward instead of storing them.
        Slower training (~30%) but allows much larger batch sizes or models."""
        self._gradient_checkpointing = True
    
    def disable_gradient_checkpointing(self):
        self._gradient_checkpointing = False
    
    def _init_weights(self, m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, std=0.02)
    
    def _interpolate_pos_embed(self, H: int, W: int) -> torch.Tensor:
        if H == self.pos_embed_size and W == self.pos_embed_size:
            return self.pos_embed
        return F.interpolate(self.pos_embed, size=(H, W), mode='bilinear', align_corners=False)
    
    def forward(
        self, x: torch.Tensor, t: torch.Tensor, class_labels: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Predict velocity field for flow matching.
        Args:
            x: [B, C, H, W] noisy latent (C=4 for SDXL VAE)
            t: [B] timestep in [0, 1]
            class_labels: [B] optional class labels
        Returns:
            v: [B, C, H, W] predicted velocity
        """
        cond = self.time_embed(t)
        if self.class_embed is not None and class_labels is not None:
            drop_p = self.class_drop_prob if self.training else 0.0
            cond = cond + self.class_embed(class_labels, drop_prob=drop_p)
        
        h = self.patch_embed(x)
        B, C, H_p, W_p = h.shape
        h = h + self._interpolate_pos_embed(H_p, W_p)
        h = self.input_proj(h)
        
        # U-Net style long skip connections
        skip_connections = []
        mid = self.depth // 2
        for i, block in enumerate(self.blocks):
            if i < mid:
                skip_connections.append(h)
            elif i >= mid and len(skip_connections) > 0:
                skip = skip_connections.pop()
                h = h + skip
            if self._gradient_checkpointing and self.training:
                h = checkpoint(block, h, cond, use_reentrant=False)
            else:
                h = block(h, cond)
        
        h = self.final_norm(h)
        h = self.final_proj(h)
        v = self.unpatch(h)
        return v
    
    def count_params(self) -> int:
        return sum(p.numel() for p in self.parameters() if p.requires_grad)


# =============================================================================
# Model Presets
# =============================================================================

def liquidgen_small(**kwargs) -> LiquidGen:
    """~55M params - for 256px, fast training/testing"""
    defaults = dict(
        embed_dim=512, depth=12, spatial_kernel=7, scan_kernel=31,
        expand_ratio=2.0, mlp_ratio=3.0, use_zigzag=True,
    )
    defaults.update(kwargs)
    return LiquidGen(**defaults)

def liquidgen_base(**kwargs) -> LiquidGen:
    """~140M params - for 256/512px, balanced (fits T4 16GB easily)"""
    defaults = dict(
        embed_dim=640, depth=18, spatial_kernel=7, scan_kernel=31,
        expand_ratio=2.0, mlp_ratio=4.0, use_zigzag=True,
    )
    defaults.update(kwargs)
    return LiquidGen(**defaults)

def liquidgen_large(**kwargs) -> LiquidGen:
    """~280M params - for 512px, high quality (fits T4 16GB with small batch)"""
    defaults = dict(
        embed_dim=768, depth=24, spatial_kernel=7, scan_kernel=31,
        expand_ratio=2.5, mlp_ratio=4.0, use_zigzag=True,
    )
    defaults.update(kwargs)
    return LiquidGen(**defaults)


if __name__ == "__main__":
    device = "cpu"
    for name, factory in [("Small", liquidgen_small), ("Base", liquidgen_base), ("Large", liquidgen_large)]:
        model = factory(num_classes=27).to(device)
        print(f"LiquidGen-{name}: {model.count_params() / 1e6:.1f}M params")
        
        # 256px: image/8 = 32x32 latent, 4 channels (SDXL VAE)
        x = torch.randn(2, 4, 32, 32, device=device)
        t = torch.rand(2, device=device)
        labels = torch.randint(0, 27, (2,), device=device)
        v = model(x, t, labels)
        assert v.shape == x.shape
        
        # 512px: image/8 = 64x64 latent
        x512 = torch.randn(1, 4, 64, 64, device=device)
        v512 = model(x512, t[:1], labels[:1])
        assert v512.shape == x512.shape
        print(f"  256px ✅ 512px ✅")
        del model
    
    print("\n✅ All tests passed!")