File size: 34,810 Bytes

4f47596

"""
LatentRecurrentFlow (LRF) - Core Architecture Modules

Architecture Overview:
=====================
The LRF architecture consists of 4 main components:

1. CompactEncoder/Decoder (VAE): f=16 spatial compression with tiny decoder
2. TextConditioner: Lightweight text encoding (TinyCLIP or small LM)
3. RecursiveLatentCore: The novel HRM-inspired denoising backbone
4. FlowScheduler: Rectified flow for training and sampling

The RecursiveLatentCore is the key innovation:
- It contains N_blocks GLD (Gated Linear Diffusion) blocks
- These blocks are applied recursively T_outer * T_inner times
- The same parameters are reused across recursions (weight sharing)
- Training uses IFT (Implicit Function Theorem) for O(1) memory backprop
- This gives effective depth of T_outer * T_inner * N_blocks layers
  from only N_blocks parameter sets

Memory budget at inference (1024x1024, INT8):
- Text encoder: ~150MB (TinyCLIP-ViT-B/16)
- VAE encoder: ~100MB (f32 encoder, only needed for editing)
- VAE decoder: ~6MB (SnapGen-style tiny decoder)
- LRF core: ~200-400MB (depending on config)
- Activations: ~500MB peak
- Total: ~1-1.5GB model + ~500MB activations = 1.5-2GB
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from typing import Optional, Tuple, Dict, Any


# ============================================================================
# Utility Modules
# ============================================================================

class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain.

    The statistics are computed in float32; the normalised activations are
    cast back to the input dtype before the per-channel ``weight`` is applied.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        x_fp32 = x.float()
        # 1 / sqrt(mean(x^2) + eps), reduced over the channel axis only.
        inv_rms = (x_fp32.pow(2).mean(-1, keepdim=True) + self.eps).rsqrt()
        normalised = (x_fp32 * inv_rms).type_as(x)
        return normalised * self.weight


class SwiGLU(nn.Module):
    """Gated feed-forward layer: ``w2(silu(w1(x)) * w3(x))`` followed by dropout.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the gated hidden layer. When omitted (or 0) it
            defaults to ``int(dim * 8 / 3)``. The value is then rounded UP to
            a multiple of 8.
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, dim: int, hidden_dim: Optional[int] = None, dropout: float = 0.0):
        super().__init__()
        width = hidden_dim if hidden_dim else int(dim * 8 / 3)
        # Ceil to the next multiple of 8.
        width = -(-width // 8) * 8
        self.w1 = nn.Linear(dim, width, bias=False)
        self.w2 = nn.Linear(width, dim, bias=False)
        self.w3 = nn.Linear(dim, width, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        projected = self.w2(gate * value)
        return self.dropout(projected)


class DepthwiseSeparableConv2d(nn.Module):
    """Depthwise ``k x k`` convolution followed by a pointwise ``1 x 1`` convolution.

    Both convolutions are bias-free. The depthwise stage uses ``kernel_size // 2``
    padding, so odd kernel sizes keep the spatial resolution.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3):
        super().__init__()
        # One filter per input channel (groups == in_channels).
        self.dw = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            padding=kernel_size // 2,
            groups=in_channels,
            bias=False,
        )
        # Channel mixing / projection to the output width.
        self.pw = nn.Conv2d(in_channels, out_channels, 1, bias=False)

    def forward(self, x):
        spatial = self.dw(x)
        return self.pw(spatial)


# ============================================================================
# 2D Positional Encoding
# ============================================================================

class RotaryPositionEncoding2D(nn.Module):
    """Builds sin/cos tables for a 2D rotary position encoding.

    ``dim // 4`` frequencies are shared by the row and column axes. For an
    ``h x w`` grid, ``forward`` returns two ``[h * w, dim // 2]`` tensors whose
    first half of channels encodes the row index and second half the column
    index. ``max_res`` is accepted but not used by this implementation.
    """

    def __init__(self, dim: int, max_res: int = 64):
        super().__init__()
        self.dim = dim
        n_freqs = dim // 4  # per-axis frequency count
        exponent_step = -(math.log(10000.0) / n_freqs)
        self.register_buffer('freqs', torch.exp(torch.arange(n_freqs) * exponent_step))

    def forward(self, h: int, w: int, device=None):
        """Return ``(sin, cos)`` tables, each of shape ``[h * w, dim // 2]``."""
        target = device if device else self.freqs.device
        base = self.freqs.to(target)

        rows = torch.arange(h, device=target).float()
        cols = torch.arange(w, device=target).float()

        # Angles per axis, broadcast over the other axis: [H, W, D/4] each.
        row_angles = torch.outer(rows, base)[:, None, :].expand(-1, w, -1)
        col_angles = torch.outer(cols, base)[None, :, :].expand(h, -1, -1)

        # [H, W, D/2] -> [H*W, D/2], row-major token order.
        angles = torch.cat([row_angles, col_angles], dim=-1).reshape(h * w, -1)
        return angles.sin(), angles.cos()


def apply_rope_2d(x, sin_enc, cos_enc):
    """Rotate the two halves of the last axis of ``x`` by the given sin/cos tables.

    The last axis is split in the middle into ``(a, b)`` and the result is
    ``cat([a * cos - b * sin, b * cos + a * sin])``. Leading singleton axes
    are prepended to ``sin_enc``/``cos_enc`` until ``sin_enc`` has as many
    dimensions as ``x``.
    """
    split = x.shape[-1] // 2
    first = x[..., :split]
    second = x[..., split:]

    # Prepend broadcast axes (no-op when the tables already have enough dims).
    lead = (None,) * (first.dim() - sin_enc.dim())
    sin_enc = sin_enc[lead]
    cos_enc = cos_enc[lead]

    rotated_first = first * cos_enc - second * sin_enc
    rotated_second = second * cos_enc + first * sin_enc
    return torch.cat([rotated_first, rotated_second], dim=-1)


# ============================================================================
# Gated Linear Diffusion (GLD) Block - The Core Spatial Mixer
# ============================================================================

class GatedLinearAttention(nn.Module):
    """
    Gated linear attention for 2D spatial mixing.

    Pipeline implemented by ``forward``:
        1. Q, K, V = split(qkv(x)), reshaped to heads.
        2. Token differential: Q_i <- Q_i - lam * Q_{i-1} (same for K), with
           lam = sigmoid(diff_lambda) and a zero vector before the first token.
        3. Feature map phi(t) = 1 + elu(t) on Q and K (non-negative).
        4. Forward and backward chunked scans (see ``_scan``), summed.
        5. RMSNorm over the concatenated heads.
        6. Output = sigmoid(gate(x)) * normed * sigmoid(local_conv(local_gate(x))),
           where ``local_conv`` is a 3x3 depthwise conv on the (h, w) grid.
        7. ``out_proj`` followed by dropout.

    NOTE: the scan is chunk-granular, not token-granular. The running state
    is decayed and updated once per chunk of up to 64 tokens, and every query
    in a chunk reads the state that already includes its whole chunk. The
    cost is linear in the number of tokens N (O(N * d^2) per head).
    """
    def __init__(self, dim: int, num_heads: int = 8, head_dim: int = 32, dropout: float = 0.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        inner_dim = num_heads * head_dim

        self.qkv = nn.Linear(dim, 3 * inner_dim, bias=False)
        self.out_proj = nn.Linear(inner_dim, dim, bias=False)

        # Learnable per-head decay logit; sigmoid(0) = 0.5 at initialisation.
        self.log_decay = nn.Parameter(torch.zeros(num_heads))

        # Multiplicative output gate.
        self.gate = nn.Linear(dim, inner_dim, bias=False)

        # 2D locality injection: linear projection + 3x3 depthwise conv.
        self.local_conv = nn.Conv2d(inner_dim, inner_dim, 3, padding=1, groups=inner_dim, bias=False)
        self.local_gate = nn.Linear(dim, inner_dim, bias=False)

        # Token-differential strength (passed through a sigmoid in forward).
        self.diff_lambda = nn.Parameter(torch.tensor(0.1))

        self.dropout = nn.Dropout(dropout)
        self.norm = RMSNorm(inner_dim)

    def _feature_map(self, x):
        """Non-negative feature map: 1 + elu(x)."""
        return 1.0 + F.elu(x)

    def _scan(self, Q, K, V, reverse=False):
        """Chunked linear scan over the token axis.

        Args:
            Q, K, V: tensors of shape [B, H, N, D].
            reverse: scan from the last token to the first when True; the
                output is flipped back to the original token order.

        Returns:
            Tensor of shape [B, H, N, D].

        For each chunk of up to 64 tokens the state is updated as
        ``S = decay * S + K_chunk^T V_chunk`` and the chunk output is
        ``Q_chunk S``; the decay is therefore applied once per chunk.
        """
        B, H, N, D = Q.shape

        decay = torch.sigmoid(self.log_decay).view(1, H, 1, 1)  # [1, H, 1, 1]

        if reverse:
            Q = Q.flip(2)
            K = K.flip(2)
            V = V.flip(2)

        chunk_size = min(64, N)
        outputs = []
        S = torch.zeros(B, H, D, D, device=Q.device, dtype=Q.dtype)

        for start in range(0, N, chunk_size):
            stop = start + chunk_size
            q_chunk = Q[:, :, start:stop]  # [B, H, C, D]
            k_chunk = K[:, :, start:stop]
            v_chunk = V[:, :, start:stop]

            # Update state: S = decay * S + K^T V (summed over the chunk).
            kv = torch.einsum('bhcd,bhce->bhde', k_chunk, v_chunk)
            S = decay * S + kv

            # Read state: O = Q S.
            outputs.append(torch.einsum('bhcd,bhde->bhce', q_chunk, S))

        output = torch.cat(outputs, dim=2)

        if reverse:
            output = output.flip(2)

        return output

    def forward(self, x, h: int, w: int):
        """
        Args:
            x: [B, N, D] where N = h * w
            h, w: spatial dimensions of the token grid
        Returns:
            [B, N, D]
        """
        # Project to Q, K, V and split into heads.
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        q = rearrange(q, 'b n (h d) -> b h n d', h=self.num_heads)
        k = rearrange(k, 'b n (h d) -> b h n d', h=self.num_heads)
        v = rearrange(v, 'b n (h d) -> b h n d', h=self.num_heads)

        # Token differential: subtract lam * previous token (zero before token 0).
        lam = torch.sigmoid(self.diff_lambda)
        q = q - lam * F.pad(q[:, :, :-1], (0, 0, 1, 0))
        k = k - lam * F.pad(k[:, :, :-1], (0, 0, 1, 0))

        # Non-negative feature map.
        q = self._feature_map(q)
        k = self._feature_map(k)

        # Bidirectional scan.
        output = self._scan(q, k, v, reverse=False) + self._scan(q, k, v, reverse=True)

        # Merge heads and normalise.
        output = rearrange(output, 'b h n d -> b n (h d)')
        output = self.norm(output)

        # 2D locality gate: project, run the depthwise conv on the (h, w)
        # grid, and flatten back to tokens.
        local_feat = rearrange(self.local_gate(x), 'b (h w) d -> b d h w', h=h, w=w)
        local_feat = self.local_conv(local_feat)
        local_feat = rearrange(local_feat, 'b d h w -> b (h w) d')

        # Gated output.
        g = torch.sigmoid(self.gate(x))
        output = g * output * torch.sigmoid(local_feat)

        return self.dropout(self.out_proj(output))


class GLDBlock(nn.Module):
    """
    Gated Linear Diffusion block.

    Three residual sub-layers, applied in this order:
    1. GatedLinearAttention (spatial mixing) on an adaLN-modulated RMSNorm of x.
    2. Optional single-head dot-product cross-attention to text tokens, scaled
       by ``tanh(cross_gate)`` (zero at initialisation).
    3. SwiGLU FFN (channel mixing) on an adaLN-modulated RMSNorm of x.

    The shift/scale/gate triplets for sub-layers 1 and 3 are produced from the
    conditioning vector by ``adaLN_modulation``.
    """
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        head_dim: int = 32,
        ffn_mult: float = 2.67,
        dropout: float = 0.0,
        cond_dim: int = 256,
    ):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.norm2 = RMSNorm(dim)

        self.attn = GatedLinearAttention(dim, num_heads, head_dim, dropout)
        self.ffn = SwiGLU(dim, int(dim * ffn_mult), dropout)

        # cond -> (shift, scale, gate) for the attention and FFN sub-layers.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(cond_dim, 6 * dim, bias=False),
        )

        # Text cross-attention; keys/values are projected from cond_dim.
        self.cross_norm = RMSNorm(dim)
        self.cross_q = nn.Linear(dim, dim, bias=False)
        self.cross_kv = nn.Linear(cond_dim, 2 * dim, bias=False)
        self.cross_out = nn.Linear(dim, dim, bias=False)
        self.cross_gate = nn.Parameter(torch.zeros(1))  # tanh(0) = 0: branch starts closed

    def _text_cross_attention(self, tokens: torch.Tensor, text_ctx: torch.Tensor) -> torch.Tensor:
        """Single-head softmax attention from image tokens to text tokens."""
        queries = self.cross_q(self.cross_norm(tokens))
        keys, values = self.cross_kv(text_ctx).chunk(2, dim=-1)

        inv_sqrt_d = queries.shape[-1] ** -0.5
        logits = torch.bmm(queries, keys.transpose(-2, -1)) * inv_sqrt_d
        probs = F.softmax(logits, dim=-1)
        return self.cross_out(torch.bmm(probs, values))

    def forward(
        self,
        x: torch.Tensor,        # [B, N, D]
        cond: torch.Tensor,      # [B, cond_dim] - timestep + global condition
        text_ctx: Optional[torch.Tensor] = None,  # [B, T, cond_dim] - text tokens
        h: int = 32,
        w: int = 32,
    ) -> torch.Tensor:
        # Unpacking doubles as a rank-3 check on the input.
        batch, tokens, channels = x.shape

        # [B, 1, 6*D] -> six [B, 1, D] modulation tensors broadcast over tokens.
        modulation = self.adaLN_modulation(cond).unsqueeze(1)
        shift_attn, scale_attn, gate_attn, shift_ffn, scale_ffn, gate_ffn = modulation.chunk(6, dim=-1)

        # Sub-layer 1: modulated pre-norm + gated linear attention.
        attn_in = self.norm1(x) * (1 + scale_attn) + shift_attn
        x = x + gate_attn * self.attn(attn_in, h, w)

        # Sub-layer 2: text cross-attention, skipped without text tokens.
        if text_ctx is not None:
            x = x + torch.tanh(self.cross_gate) * self._text_cross_attention(x, text_ctx)

        # Sub-layer 3: modulated pre-norm + FFN.
        ffn_in = self.norm2(x) * (1 + scale_ffn) + shift_ffn
        return x + gate_ffn * self.ffn(ffn_in)


# ============================================================================
# Recursive Latent Refinement (RLR) Core - THE KEY INNOVATION
# ============================================================================

class RecursiveLatentCore(nn.Module):
    """
    The Recursive Latent Refinement (RLR) core.

    A small stack of ``num_blocks`` GLD blocks is applied recursively instead
    of stacking many unique layers:

        z_abstract = mean_tokens(z)
        for j in range(T_outer):
            z_abstract += tanh(abstract_gate) * abstract_update([norm(z), mean_tokens(z)])
            for i in range(T_inner):
                cond = cond_base + recursion_embed[j * T_inner + i]
                z = z + 0.5 * (blocks(z + z_abstract, cond) - z)     # damped update

    Effective depth is T_outer * T_inner * num_blocks block applications from
    only num_blocks parameter sets.

    Truncated-gradient ("IFT") training:
    When ``use_ift_training`` is set and the module is in training mode, the
    same single refinement pass is executed as at inference, but all outer
    steps except the last run under ``torch.no_grad()``. Backprop memory is
    then bounded by one outer step (T_inner * num_blocks block applications)
    regardless of T_outer, and the forward values match inference.

    NOTE(review): with T_outer > 1 the latent entering the last outer step is
    detached, so ``input_proj`` receives no gradient on the IFT path; the
    gradient reaches the blocks, ``abstract_*``, ``recursion_embed``,
    ``time_embed`` and ``out_proj``. Confirm this is acceptable for the
    training setup (e.g. DDP unused-parameter handling).
    """

    def __init__(
        self,
        dim: int = 384,
        cond_dim: int = 256,
        num_blocks: int = 4,
        num_heads: int = 6,
        head_dim: int = 64,
        T_inner: int = 4,
        T_outer: int = 2,
        ffn_mult: float = 2.67,
        dropout: float = 0.0,
        use_ift_training: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.cond_dim = cond_dim
        self.num_blocks = num_blocks
        self.T_inner = T_inner
        self.T_outer = T_outer
        self.use_ift_training = use_ift_training

        # The shared GLD blocks (applied recursively)
        self.blocks = nn.ModuleList([
            GLDBlock(
                dim=dim,
                num_heads=num_heads,
                head_dim=head_dim,
                ffn_mult=ffn_mult,
                dropout=dropout,
                cond_dim=cond_dim,
            )
            for _ in range(num_blocks)
        ])

        # Abstract ("slow") state updater, run once per outer step.
        self.abstract_norm = RMSNorm(dim)
        self.abstract_update = nn.Sequential(
            nn.Linear(dim * 2, dim, bias=False),
            nn.SiLU(),
            nn.Linear(dim, dim, bias=False),
        )
        self.abstract_gate = nn.Parameter(torch.zeros(1))  # Zero-init

        # Input projection
        self.input_proj = nn.Linear(dim, dim, bias=False)

        # Timestep embedding (consumes the 256-d sinusoidal features)
        self.time_embed = nn.Sequential(
            nn.Linear(256, cond_dim),
            nn.SiLU(),
            nn.Linear(cond_dim, cond_dim),
        )

        # Output projection (predicts velocity v for rectified flow)
        self.out_norm = RMSNorm(dim)
        self.out_proj = nn.Sequential(
            nn.Linear(dim, dim, bias=False),
            nn.SiLU(),
            nn.Linear(dim, dim, bias=False),
        )

        # Recursion depth embedding (tells the model which recursion step it's on)
        self.recursion_embed = nn.Embedding(T_outer * T_inner + 1, cond_dim)

        # 2D positional encoding tables. NOTE: constructed (and its buffer is
        # part of the state dict) but not applied anywhere in this class.
        self.rope = RotaryPositionEncoding2D(head_dim)

    def _sinusoidal_embedding(self, t: torch.Tensor, dim: int = 256) -> torch.Tensor:
        """Sinusoidal timestep embedding: ``[..., dim]`` as ``cat(sin, cos)``."""
        half_dim = dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=t.device) * -emb)
        emb = t.unsqueeze(-1) * emb.unsqueeze(0)
        return torch.cat([emb.sin(), emb.cos()], dim=-1)

    def _apply_blocks(self, z, cond, text_ctx, h, w):
        """Apply all GLD blocks once, in order."""
        for block in self.blocks:
            z = block(z, cond, text_ctx, h, w)
        return z

    def _outer_step(self, z, z_abstract, cond_base, text_ctx, h, w, outer_idx):
        """Run one outer step: abstract-state update + T_inner damped block passes.

        Returns the updated ``(z, z_abstract)`` pair.
        """
        # Abstract state update (slow module), gated by tanh(abstract_gate).
        z_pooled = z.mean(dim=1, keepdim=True).expand_as(z)
        abstract_input = torch.cat([self.abstract_norm(z), z_pooled], dim=-1)
        z_abstract = z_abstract + torch.tanh(self.abstract_gate) * self.abstract_update(abstract_input)

        for inner_idx in range(self.T_inner):
            # Global recursion index selects the depth embedding.
            step_idx = outer_idx * self.T_inner + inner_idx
            rec_emb = self.recursion_embed(
                torch.tensor([step_idx], device=z.device)
            ).expand(z.shape[0], -1)
            cond = cond_base + rec_emb

            # Blocks see detail + abstract state; the update is damped by 0.5.
            z_input = z + z_abstract
            z = z + (self._apply_blocks(z_input, cond, text_ctx, h, w) - z) * 0.5

        return z, z_abstract

    def _recursive_refinement(self, z, cond_base, text_ctx, h, w,
                              grad_outer_steps: Optional[int] = None):
        """
        Full recursive refinement loop (T_outer * T_inner block passes).

        Args:
            z: [B, N, D] projected latent tokens.
            cond_base: [B, cond_dim] conditioning vector.
            text_ctx: optional text tokens forwarded to every block.
            h, w: spatial size of the token grid.
            grad_outer_steps: number of trailing outer steps to run with
                autograd tracking. ``None`` (default) tracks every step;
                earlier steps run under ``torch.no_grad()``. Values <= 0
                disable tracking for all steps.

        Returns:
            The refined latent z, [B, N, D].
        """
        z_abstract = z.mean(dim=1, keepdim=True).expand_as(z)  # Initial abstract state

        if grad_outer_steps is None:
            first_tracked = 0
        else:
            first_tracked = max(self.T_outer - grad_outer_steps, 0)

        for j in range(self.T_outer):
            if j < first_tracked:
                # Warm-up steps: forward values only, no graph is stored.
                with torch.no_grad():
                    z, z_abstract = self._outer_step(z, z_abstract, cond_base, text_ctx, h, w, j)
            else:
                z, z_abstract = self._outer_step(z, z_abstract, cond_base, text_ctx, h, w, j)

        return z

    def forward(
        self,
        z_t: torch.Tensor,       # [B, C, H, W] - noisy latent
        t: torch.Tensor,          # [B] - timestep (0 to 1)
        text_emb: Optional[torch.Tensor] = None,   # [B, T, cond_dim] - text tokens
        text_global: Optional[torch.Tensor] = None, # [B, cond_dim] - global text embedding
        image_cond: Optional[torch.Tensor] = None,   # [B, C, H, W] - for editing tasks
    ) -> torch.Tensor:
        """
        Forward pass predicting velocity v_theta(z_t, t, c).

        For rectified flow: z_t = (1-t) * z_0 + t * epsilon
        Target: v = epsilon - z_0

        Returns:
            [B, C, H, W] velocity prediction (C must equal ``dim``).
        """
        B, C, H, W = z_t.shape

        # Flatten spatial dims
        z = rearrange(z_t, 'b c h w -> b (h w) c')

        # Editing: the condition image is ADDED element-wise to the latent
        # tokens before projection (it is not concatenated).
        if image_cond is not None:
            img_cond_flat = rearrange(image_cond, 'b c h w -> b (h w) c')
            z = z + img_cond_flat

        # Project
        z = self.input_proj(z)

        # Build conditioning. The sinusoidal features are cast to the
        # time-embedding weight dtype so half/bf16/double models accept them.
        t_emb = self._sinusoidal_embedding(t).to(self.time_embed[0].weight.dtype)
        t_emb = self.time_embed(t_emb)  # [B, cond_dim]

        if text_global is not None:
            cond = t_emb + text_global
        else:
            cond = t_emb

        # Apply recursive refinement. Training with IFT runs the same single
        # pass as inference but only tracks gradients for the last outer step.
        if self.training and self.use_ift_training:
            z = self._recursive_refinement(z, cond, text_emb, H, W, grad_outer_steps=1)
        else:
            z = self._recursive_refinement(z, cond, text_emb, H, W)

        # Output projection
        z = self.out_norm(z)
        v = self.out_proj(z)

        # Reshape back to spatial
        v = rearrange(v, 'b (h w) c -> b c h w', h=H, w=W)

        return v


# ============================================================================
# Compact VAE (Tiny Decoder inspired by SnapGen)
# ============================================================================

class TinyResBlock(nn.Module):
    """Pre-activation residual block built from depthwise-separable convs.

    Two ``GroupNorm -> SiLU -> DepthwiseSeparableConv2d`` stages plus a skip
    path. The skip is an identity when the channel count is unchanged and a
    bias-free 1x1 conv otherwise. ``out_channels`` defaults to ``in_channels``.
    """

    def __init__(self, in_channels: int, out_channels: int = None):
        super().__init__()
        if not out_channels:
            out_channels = in_channels

        # GroupNorm uses at most 8 groups.
        self.norm1 = nn.GroupNorm(min(8, in_channels), in_channels)
        self.conv1 = DepthwiseSeparableConv2d(in_channels, out_channels, 3)
        self.norm2 = nn.GroupNorm(min(8, out_channels), out_channels)
        self.conv2 = DepthwiseSeparableConv2d(out_channels, out_channels, 3)

        if in_channels != out_channels:
            self.skip = nn.Conv2d(in_channels, out_channels, 1, bias=False)
        else:
            self.skip = nn.Identity()

    def forward(self, x):
        shortcut = self.skip(x)
        out = self.conv1(F.silu(self.norm1(x)))
        out = self.conv2(F.silu(self.norm2(out)))
        return shortcut + out


class CompactEncoder(nn.Module):
    """
    Compact image encoder: image -> (mean, logvar) of the latent distribution.

    A 3x3 stem is followed by four stages. Each stage runs its residual
    blocks and then halves the resolution with a stride-2 4x4 conv, giving an
    overall 16x spatial reduction (e.g. 256 -> 128 -> 64 -> 32 -> 16).
    Stage widths are ``base, 2*base, 4*base, 4*base``.
    """
    def __init__(
        self,
        in_channels: int = 3,
        latent_channels: int = 32,
        base_channels: int = 64,
        num_res_blocks: int = 2,
    ):
        super().__init__()
        stage_widths = [base_channels, base_channels * 2, base_channels * 4, base_channels * 4]
        # Each stage receives the previous stage's width (the stem's for the first).
        stage_inputs = [stage_widths[0]] + stage_widths[:-1]

        self.stem = nn.Conv2d(in_channels, stage_widths[0], 3, padding=1, bias=False)

        self.downs = nn.ModuleList()
        for width_in, width_out in zip(stage_inputs, stage_widths):
            # The first block handles the channel change; the rest keep the width.
            res_blocks = nn.ModuleList([TinyResBlock(width_in, width_out)])
            res_blocks.extend(
                TinyResBlock(width_out, width_out) for _ in range(num_res_blocks - 1)
            )
            stage = nn.ModuleDict({
                'blocks': res_blocks,
                'down': nn.Conv2d(width_out, width_out, 4, stride=2, padding=1, bias=False),
            })
            self.downs.append(stage)

        final_width = stage_widths[-1]
        # Projects to 2 * latent_channels: mean and log-variance.
        self.to_latent = nn.Sequential(
            nn.GroupNorm(8, final_width),
            nn.SiLU(),
            nn.Conv2d(final_width, latent_channels * 2, 1, bias=False),
        )

    def forward(self, x):
        """Return ``(mean, logvar)``; logvar is clamped to [-30, 20]."""
        feats = self.stem(x)
        for stage in self.downs:
            for res_block in stage['blocks']:
                feats = res_block(feats)
            feats = stage['down'](feats)

        mean, logvar = self.to_latent(feats).chunk(2, dim=1)
        return mean, torch.clamp(logvar, -30.0, 20.0)


class TinyDecoder(nn.Module):
    """
    Convolution-only decoder: latent -> image in [-1, 1].

    A 1x1 conv lifts the latent to ``2 * base_channels``; four stages then
    each run ``num_res_blocks`` residual blocks and double the resolution
    (nearest-neighbour upsample + depthwise-separable conv that also changes
    the width). Stage output widths are ``2*base, 2*base, base, base // 2``.
    The head is GroupNorm -> SiLU -> 3x3 conv -> tanh.
    """
    def __init__(
        self,
        latent_channels: int = 32,
        out_channels: int = 3,
        base_channels: int = 128,
        num_res_blocks: int = 2,
    ):
        super().__init__()
        stage_widths = [base_channels * 2, base_channels * 2, base_channels, base_channels // 2]

        self.from_latent = nn.Conv2d(latent_channels, stage_widths[0], 1, bias=False)

        self.ups = nn.ModuleList()
        width = stage_widths[0]
        for next_width in stage_widths:
            # Residual blocks run at the incoming width ...
            res_blocks = nn.ModuleList(
                [TinyResBlock(width, width) for _ in range(num_res_blocks)]
            )
            # ... and the upsampling conv performs the channel transition.
            upsample = nn.Sequential(
                nn.Upsample(scale_factor=2, mode='nearest'),
                DepthwiseSeparableConv2d(width, next_width, 3),
            )
            self.ups.append(nn.ModuleDict({'blocks': res_blocks, 'up': upsample}))
            width = next_width

        self.to_image = nn.Sequential(
            nn.GroupNorm(min(8, width), width),
            nn.SiLU(),
            nn.Conv2d(width, out_channels, 3, padding=1),
            nn.Tanh(),  # Output in [-1, 1]
        )

    def forward(self, z):
        feats = self.from_latent(z)
        for stage in self.ups:
            for res_block in stage['blocks']:
                feats = res_block(feats)
            feats = stage['up'](feats)
        return self.to_image(feats)


class CompactVAE(nn.Module):
    """
    VAE pairing ``CompactEncoder`` with ``TinyDecoder``.

    ``encode`` samples with the reparameterisation trick in training mode and
    returns the posterior mean as the latent in eval mode.
    """
    def __init__(
        self,
        in_channels: int = 3,
        latent_channels: int = 32,
        encoder_base_ch: int = 64,
        decoder_base_ch: int = 128,
    ):
        super().__init__()
        self.encoder = CompactEncoder(in_channels, latent_channels, encoder_base_ch)
        self.decoder = TinyDecoder(latent_channels, in_channels, decoder_base_ch)
        self.latent_channels = latent_channels

    def encode(self, x):
        """Return ``(z, mean, logvar)`` for the image batch ``x``."""
        mean, logvar = self.encoder(x)
        if not self.training:
            # Deterministic latent outside of training.
            return mean, mean, logvar

        # z = mean + eps * std, eps ~ N(0, I)
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std, mean, logvar

    def decode(self, z):
        """Decode a latent back to image space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mean, logvar)``."""
        z, mean, logvar = self.encode(x)
        return self.decode(z), mean, logvar


# ============================================================================
# Text Conditioner (Lightweight)
# ============================================================================

class SimpleTextEncoder(nn.Module):
    """
    Lightweight text encoder for the standalone prototype.
    In production, this would be replaced by TinyCLIP or a small LM.

    Learned token + position embeddings feed a small pre-norm Transformer
    encoder; the output is RMS-normalised. A global embedding is produced by
    (masked) mean pooling followed by a 2-layer MLP.

    Sequences longer than ``max_length`` are not supported: positions index
    ``pos_embed`` directly.
    """
    def __init__(
        self,
        vocab_size: int = 32000,
        max_length: int = 77,
        dim: int = 256,
        num_layers: int = 4,
        num_heads: int = 4,
    ):
        super().__init__()
        self.dim = dim
        self.token_embed = nn.Embedding(vocab_size, dim)
        self.pos_embed = nn.Embedding(max_length, dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim, nhead=num_heads, dim_feedforward=dim*4,
            dropout=0.1, activation='gelu', batch_first=True, norm_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = RMSNorm(dim)

        # Global pooling projection
        self.global_proj = nn.Sequential(
            nn.Linear(dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )

    def forward(self, token_ids, attention_mask=None):
        """Encode a batch of token ids.

        Args:
            token_ids: [B, T] integer token ids.
            attention_mask: optional [B, T] mask, non-zero for real tokens and
                zero for padding.

        Returns:
            ``(tokens, global_emb)`` with shapes [B, T, D] and [B, D].
        """
        B, T = token_ids.shape
        pos_ids = torch.arange(T, device=token_ids.device).unsqueeze(0).expand(B, -1)

        x = self.token_embed(token_ids) + self.pos_embed(pos_ids)

        if attention_mask is not None:
            # Convert to transformer mask (True = ignore)
            src_key_padding_mask = ~attention_mask.bool()
        else:
            src_key_padding_mask = None

        x = self.transformer(x, src_key_padding_mask=src_key_padding_mask)
        x = self.norm(x)

        # Global embedding (mean pool over non-padded tokens)
        if attention_mask is not None:
            # Accumulate in float32, then cast back so the pooled vector keeps
            # the token dtype (a float32 mask would otherwise promote
            # half/bf16 activations and mismatch the global_proj weights).
            mask = attention_mask.unsqueeze(-1).float()
            global_emb = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            global_emb = global_emb.to(x.dtype)
        else:
            global_emb = x.mean(dim=1)

        global_emb = self.global_proj(global_emb)

        return x, global_emb  # [B, T, D], [B, D]


# ============================================================================
# Full LRF Model
# ============================================================================

class LatentRecurrentFlow(nn.Module):
    """
    LatentRecurrentFlow (LRF) - top-level container.

    Owns three sub-models plus one scalar:
    - ``vae``: CompactVAE for image <-> latent conversion
    - ``text_encoder``: SimpleTextEncoder for text conditioning
    - ``core``: RecursiveLatentCore, the velocity predictor
    - ``latent_scale``: learnable scalar applied to encoded latents and
      divided out again before decoding

    ``forward`` is intentionally not implemented; training code is expected
    to call the ``encode_*`` / ``decode_latent`` / ``predict_velocity``
    helpers directly.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__()

        # A missing (or empty) config falls back to the defaults.
        cfg = config or self.default_config()
        self.config = cfg

        latent_channels = cfg['latent_channels']
        cond_dim = cfg['cond_dim']

        # VAE
        self.vae = CompactVAE(
            in_channels=3,
            latent_channels=latent_channels,
            encoder_base_ch=cfg.get('encoder_base_ch', 64),
            decoder_base_ch=cfg.get('decoder_base_ch', 128),
        )

        # Text encoder (its width is the conditioning width)
        self.text_encoder = SimpleTextEncoder(
            vocab_size=cfg.get('vocab_size', 32000),
            max_length=cfg.get('max_text_length', 77),
            dim=cond_dim,
            num_layers=cfg.get('text_layers', 4),
            num_heads=cfg.get('text_heads', 4),
        )

        # Denoising core; its token width equals the latent channel count.
        self.core = RecursiveLatentCore(
            dim=latent_channels,
            cond_dim=cond_dim,
            num_blocks=cfg['num_blocks'],
            num_heads=cfg.get('num_heads', 6),
            head_dim=cfg.get('head_dim', 64),
            T_inner=cfg.get('T_inner', 4),
            T_outer=cfg.get('T_outer', 2),
            ffn_mult=cfg.get('ffn_mult', 2.67),
            dropout=cfg.get('dropout', 0.0),
            use_ift_training=cfg.get('use_ift', True),
        )

        # Learnable latent scaling, initialised to 1.
        self.latent_scale = nn.Parameter(torch.tensor(1.0))

    @staticmethod
    def default_config():
        """Default config targeting ~50M params, trainable on 16GB."""
        return dict(
            latent_channels=32,
            cond_dim=256,
            num_blocks=4,
            num_heads=4,
            head_dim=64,
            T_inner=4,
            T_outer=2,
            ffn_mult=2.67,
            dropout=0.0,
            use_ift=True,
            encoder_base_ch=64,
            decoder_base_ch=128,
            vocab_size=32000,
            max_text_length=77,
            text_layers=4,
            text_heads=4,
        )

    @staticmethod
    def tiny_config():
        """Tiny config for quick testing."""
        return dict(
            latent_channels=16,
            cond_dim=128,
            num_blocks=2,
            num_heads=2,
            head_dim=32,
            T_inner=2,
            T_outer=1,
            ffn_mult=2.0,
            dropout=0.0,
            use_ift=False,
            encoder_base_ch=32,
            decoder_base_ch=64,
            vocab_size=32000,
            max_text_length=77,
            text_layers=2,
            text_heads=2,
        )

    def encode_image(self, x):
        """Encode an image; returns ``(z * latent_scale, mean, logvar)``."""
        latent, mean, logvar = self.vae.encode(x)
        return latent * self.latent_scale, mean, logvar

    def decode_latent(self, z):
        """Undo the latent scaling and decode to image space."""
        unscaled = z / self.latent_scale
        return self.vae.decode(unscaled)

    def encode_text(self, token_ids, attention_mask=None):
        """Run the text encoder; returns its ``(tokens, global_emb)`` pair."""
        return self.text_encoder(token_ids, attention_mask)

    def predict_velocity(self, z_t, t, text_emb=None, text_global=None, image_cond=None):
        """Delegate to the denoising core to predict the flow velocity."""
        return self.core(z_t, t, text_emb, text_global, image_cond)

    def get_param_groups(self):
        """Return parameter lists keyed by sub-module, for staged training."""
        groups = {
            'vae_encoder': self.vae.encoder,
            'vae_decoder': self.vae.decoder,
            'text_encoder': self.text_encoder,
            'core': self.core,
        }
        param_groups = {name: list(module.parameters()) for name, module in groups.items()}
        param_groups['latent_scale'] = [self.latent_scale]
        return param_groups

    def count_parameters(self):
        """Return per-module parameter counts plus a ``'total'`` entry."""
        modules = {
            'vae_encoder': self.vae.encoder,
            'vae_decoder': self.vae.decoder,
            'text_encoder': self.text_encoder,
            'core': self.core,
        }
        counts = {
            name: sum(p.numel() for p in module.parameters())
            for name, module in modules.items()
        }
        counts['latent_scale'] = 1
        # Computed before 'total' is inserted, so it sums the entries above.
        counts['total'] = sum(counts.values())
        return counts

    def forward(self, x=None, token_ids=None, attention_mask=None, **kwargs):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Use the training pipeline functions instead of calling forward() directly. "
            "See LRFTrainer for VAE training, denoiser training, and distillation."
        )