krystv
/

LiquidFlow

+"""
+LiquidFlow: A Novel Liquid-SSM Flow Matching Image Generator
+Architecture combines:
+1. Liquid Time-Constant (LTC) dynamics as the velocity field (Hasani et al. 2020)
+2. Selective State Space scanning (Mamba-style) in pure PyTorch for parallel training
+3. Zigzag scanning patterns for 2D spatial awareness (ZigMa, 2024)
+4. Physics-informed regularization (smoothness + continuity constraints)
+5. Closed-form Continuous-depth (CfC) approximation for fast forward pass
+6. Rectified Flow / Flow Matching training objective (Lipman et al. 2022)
+Designed for:
+- Training on Google Colab free tier (T4 16GB) or Kaggle (P100 16GB)
+- Mobile deployment (< 15M parameters for 128x128, < 25M for 512x512)
+- No custom CUDA kernels required - pure PyTorch
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+# ============================================================
+# 1. LIQUID TIME-CONSTANT CELL (CfC - Closed-Form Continuous)
+# ============================================================
class LiquidCfCCell(nn.Module):
    """
    Closed-form Continuous-depth (CfC) liquid cell applied per position.

    Rather than integrating the LTC ODE
        dx/dt = -[1/τ + f(x,I,t)] * x + f(x,I,t)
    the cell performs one gated blend:
        h_new = σ(-f_τ) ⊙ h + (1 - σ(-f_τ)) ⊙ f_x
    where f_τ (time-constant modulation) and f_x (candidate state) are
    two-layer Tanh MLPs over the concatenation [backbone(x), h].

    Every sequence position is blended against the same incoming state
    ``h``; nothing is carried from one position to the next, which is
    what lets all positions be evaluated in a single batched call. The
    sigmoid gate keeps the result a convex combination of ``h`` and f_x.
    """

    @staticmethod
    def _gate_mlp(in_features, hidden_dim):
        # Linear -> Tanh -> Linear; indices 0 and 2 hold the weights.
        return nn.Sequential(
            nn.Linear(in_features, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
        )

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Both MLPs consume [projected input, state] -> 2 * hidden_dim features.
        joint_dim = hidden_dim + hidden_dim
        self.tau_net = self._gate_mlp(joint_dim, hidden_dim)
        self.state_net = self._gate_mlp(joint_dim, hidden_dim)
        # Maps raw input features into the hidden width before gating.
        self.backbone = nn.Linear(input_dim, hidden_dim)

    def forward(self, x, h=None):
        """
        x: (B, L, input_dim) - input features
        h: (B, hidden_dim)   - incoming state; zeros when omitted
        Returns: (B, L, hidden_dim) - blended state for every position
        """
        batch, seq_len, _ = x.shape
        projected = self.backbone(x)  # (B, L, hidden_dim)
        if h is None:
            h = x.new_zeros(batch, self.hidden_dim)
        # Broadcast the single state vector across the sequence axis.
        state = h.unsqueeze(1).expand(-1, seq_len, -1)
        features = torch.cat((projected, state), dim=-1)
        # keep -> 1 retains the old state, keep -> 0 adopts the candidate.
        keep = torch.sigmoid(-self.tau_net(features))
        candidate = self.state_net(features)
        return keep * state + (1.0 - keep) * candidate
+# ============================================================
+# 2. SELECTIVE STATE SPACE BLOCK (Pure PyTorch Mamba-style)
+# ============================================================
class SelectiveSSM(nn.Module):
    """
    Selective state space layer (Mamba-style) written in plain PyTorch.

    B, C and the step size Δ are computed from the input at every position
    ("selective"); the diagonal state matrix A is a learned parameter that
    does not depend on the input. The discretized recurrence is
        h_i = Ā_i * h_{i-1} + B̄_i * x_i
        y_i = C_i · h_i
    with Ā = exp(Δ * A) and B̄ ≈ Δ * B.

    The layer follows the usual gated layout: project to 2 * d_inner,
    split into a signal path and a gate, run a causal depthwise conv +
    SiLU on the signal, scan, add the D skip, multiply by SiLU(gate), and
    project back to d_model.
    """

    def __init__(self, d_model, d_state=16, d_conv=4, expand=2):
        super().__init__()
        self.d_model = d_model
        self.d_state = d_state
        self.d_inner = int(d_model * expand)
        inner = self.d_inner

        # Expansion into signal and gate halves.
        self.in_proj = nn.Linear(d_model, inner * 2, bias=False)
        # Depthwise conv; padding d_conv - 1 plus truncation in forward()
        # makes each output depend only on current and earlier positions.
        self.conv1d = nn.Conv1d(
            inner,
            inner,
            kernel_size=d_conv,
            padding=d_conv - 1,
            groups=inner,
            bias=True,
        )

        # A is stored as log(1..d_state) per channel and negated in
        # forward(), so exp(Δ * A) stays below 1 for positive Δ.
        state_ids = torch.arange(1, d_state + 1, dtype=torch.float32)
        self.A_log = nn.Parameter(torch.log(state_ids).repeat(inner, 1))
        # Per-channel skip weight.
        self.D = nn.Parameter(torch.ones(inner))

        # One projection yields B (d_state), C (d_state) and a scalar Δ.
        self.x_proj = nn.Linear(inner, d_state * 2 + 1, bias=False)
        # Lifts the scalar Δ to one step size per channel.
        self.dt_proj = nn.Linear(1, inner, bias=True)
        self.out_proj = nn.Linear(inner, d_model, bias=False)

        # Bias is the inverse softplus of a log-uniform sample in
        # [0.001, 0.1], so softplus(bias) starts inside that range.
        log_lo = math.log(0.001)
        log_hi = math.log(0.1)
        with torch.no_grad():
            dt_target = torch.exp(torch.rand(inner) * (log_hi - log_lo) + log_lo)
            bias_init = dt_target + torch.log(-torch.expm1(-dt_target))
            self.dt_proj.bias.copy_(bias_init)

    def forward(self, x):
        """
        x: (B, L, d_model)
        Returns: (B, L, d_model)
        """
        _, seq_len, _ = x.shape

        signal, gate = self.in_proj(x).chunk(2, dim=-1)  # each (B, L, d_inner)

        # Conv1d wants channels first; drop the right-hand overhang that
        # the symmetric padding produced.
        signal = self.conv1d(signal.transpose(1, 2))[:, :, :seq_len].transpose(1, 2)
        signal = F.silu(signal)

        B_sel, C_sel, dt_raw = torch.split(
            self.x_proj(signal), [self.d_state, self.d_state, 1], dim=-1
        )
        dt = F.softplus(self.dt_proj(dt_raw))  # (B, L, d_inner), strictly positive
        A = -torch.exp(self.A_log)  # (d_inner, d_state), strictly negative

        y = self._selective_scan(signal, dt, A, B_sel, C_sel)
        y = y + signal * self.D.unsqueeze(0).unsqueeze(0)  # D skip path
        y = y * F.silu(gate)
        return self.out_proj(y)

    def _selective_scan(self, x, dt, A, B, C):
        """
        Run the recurrence position by position with a Python loop.

        x:  (B, L, d_inner)   dt: (B, L, d_inner)   A: (d_inner, d_state)
        B:  (B, L, d_state)   C:  (B, L, d_state)
        Returns: (B, L, d_inner)

        Uses only standard tensor ops, so it runs wherever PyTorch does;
        the cost is L sequential steps.
        """
        batch, _, d_inner = x.shape
        n_state = A.shape[1]

        # Discretized transition and input terms, each (B, L, d_inner, d_state).
        decay = torch.exp(torch.einsum('bld,dn->bldn', dt, A))
        drive = torch.einsum('bld,bln->bldn', dt, B) * x.unsqueeze(-1)

        state = x.new_zeros(batch, d_inner, n_state)
        outputs = []
        for decay_i, drive_i, c_i in zip(decay.unbind(1), drive.unbind(1), C.unbind(1)):
            state = decay_i * state + drive_i  # (B, d_inner, d_state)
            outputs.append(torch.einsum('bdn,bn->bd', state, c_i))
        return torch.stack(outputs, dim=1)
+# ============================================================
+# 3. ZIGZAG SCAN PATTERNS
+# ============================================================
def create_scan_patterns(H, W):
    """
    Build four orderings of the H*W grid positions plus their inverses.

    Positions are numbered row by row (index = row * W + col). The
    returned orderings are, in this order:
        0. row-major          (0, 1, 2, ...)
        1. reversed row-major (last position first)
        2. column-major       (walk down each column in turn)
        3. zigzag             (odd rows are traversed right-to-left)

    Returns ``(patterns, inverse_patterns)``: two lists of 1-D index
    tensors of length H*W. ``tokens[:, patterns[k]]`` reorders a sequence
    and indexing the result with ``inverse_patterns[k]`` restores it.
    """
    total = H * W
    flat = torch.arange(total)
    grid = flat.view(H, W)

    # Flip every second row so consecutive scan steps stay adjacent.
    snake_rows = [row.flip(0) if r % 2 == 1 else row for r, row in enumerate(grid)]

    patterns = [
        flat.clone(),
        flat.flip(0),
        grid.t().contiguous().view(-1),
        torch.cat(snake_rows),
    ]
    # Each pattern is a permutation, so its argsort is its inverse.
    inverse_patterns = [torch.argsort(order) for order in patterns]
    return patterns, inverse_patterns
+# ============================================================
+# 4. LIQUID-SSM BLOCK (Core Building Block)
+# ============================================================
class LiquidSSMBlock(nn.Module):
    """
    Residual block with two parallel branches and a feed-forward tail.

    Branch 1 runs a SelectiveSSM over the tokens in a caller-chosen scan
    order and maps the result back to the original order. Branch 2 runs a
    LiquidCfCCell over the tokens in their original order. The branches
    are blended with a learned scalar weight, added to the residual
    stream, and followed by a pre-norm MLP (4x expansion, GELU).
    """

    def __init__(self, d_model, d_state=16, d_conv=4, expand=2, dropout=0.0):
        super().__init__()
        ff_hidden = d_model * 4
        self.norm1 = nn.LayerNorm(d_model)
        self.ssm = SelectiveSSM(d_model, d_state, d_conv, expand)
        self.norm2 = nn.LayerNorm(d_model)
        self.liquid = LiquidCfCCell(d_model, d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_hidden, d_model),
            nn.Dropout(dropout),
        )
        # Raw blend logit; forward() squashes it with a sigmoid, so the
        # initial SSM weight is sigmoid(0.5), not 0.5.
        self.mix_alpha = nn.Parameter(torch.tensor(0.5))

    def forward(self, x, scan_idx=None, unscan_idx=None):
        """
        x:          (B, L, d_model) tokens
        scan_idx:   optional 1-D index tensor giving the SSM scan order
        unscan_idx: optional 1-D index tensor undoing ``scan_idx``
        Returns: (B, L, d_model)
        """
        # SSM branch: permute, scan, permute back.
        ordered = x if scan_idx is None else x[:, scan_idx]
        ssm_branch = self.ssm(self.norm1(ordered))
        if unscan_idx is not None:
            ssm_branch = ssm_branch[:, unscan_idx]

        # Liquid branch always sees the original token order.
        liquid_branch = self.liquid(self.norm2(x))

        weight = torch.sigmoid(self.mix_alpha)
        blended = weight * ssm_branch + (1.0 - weight) * liquid_branch

        x = x + blended
        return x + self.ff(self.norm3(x))
+# ============================================================
+# 5. TIMESTEP & CONDITION EMBEDDINGS
+# ============================================================
class SinusoidalPosEmb(nn.Module):
    """
    Sinusoidal embedding of a scalar (e.g. the flow time t).

    The first ``dim // 2`` features are sines and the next ``dim // 2``
    are cosines of ``t * f_k``, where the frequencies f_k fall
    geometrically from 1 to 1/10000. When ``dim`` is odd, one trailing
    zero feature is appended so the output width always equals ``dim``.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        """
        t: (B,) tensor of scalar values
        Returns: (B, dim)
        """
        half_dim = self.dim // 2
        # With a single frequency (dim 2 or 3) the spacing is irrelevant;
        # clamping the denominator avoids a ZeroDivisionError there.
        log_step = math.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, device=t.device) * -log_step)
        angles = t.unsqueeze(-1) * freqs.unsqueeze(0)
        emb = torch.cat([angles.sin(), angles.cos()], dim=-1)
        if self.dim % 2 == 1:
            # sin/cos halves only cover 2 * (dim // 2) features.
            emb = F.pad(emb, (0, 1))
        return emb
class AdaptiveLayerNorm(nn.Module):
    """
    DiT-style adaptive LayerNorm.

    The input is normalized without learned affine parameters; a
    per-sample scale and shift are instead predicted from a conditioning
    vector by SiLU -> Linear and applied as ``norm(x) * (1 + scale) + shift``.
    """

    def __init__(self, d_model, cond_dim):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, elementwise_affine=False)
        # One linear layer emits scale and shift side by side.
        self.proj = nn.Sequential(nn.SiLU(), nn.Linear(cond_dim, d_model * 2))

    def forward(self, x, cond):
        """
        x:    (B, L, d_model) tokens
        cond: (B, cond_dim) conditioning vector
        Returns: (B, L, d_model)
        """
        # Split first, then add a length-1 sequence axis for broadcasting.
        scale, shift = (
            part.unsqueeze(1) for part in self.proj(cond).chunk(2, dim=-1)
        )
        return self.norm(x) * (1 + scale) + shift
+# ============================================================
+# 6. LIQUIDFLOW VELOCITY NETWORK (Full Architecture)
+# ============================================================
class LiquidFlowNet(nn.Module):
    """
    LiquidFlow: the complete velocity field network for flow matching.

    Training: ||v_θ(x_t, t) - (x_1 - x_0)||²  (rectified flow)
    Sampling: x_{t+dt} = x_t + v_θ(x_t, t) * dt   (Euler method)

    Pipeline: patchify -> linear patch embedding + learned positional
    embedding -> depthwise 3x3 conv -> ``depth`` stages of
    (AdaptiveLayerNorm, LiquidSSMBlock) with a scan order that cycles per
    stage -> depthwise 3x3 conv -> LayerNorm -> linear projection to
    pixels -> unpatchify.

    Long skips: the (post-AdaLN) input of each stage in the first half is
    stored; each stage in the second half concatenates the mirrored stored
    tensor onto its output and projects back to ``d_model``.

    Args:
        img_size: side length of the (square) input image.
        patch_size: side length of each square patch; must divide img_size.
        in_channels: number of image channels.
        d_model: token width.
        depth: number of stages.
        d_state, d_conv, expand: forwarded to every SelectiveSSM.
        dropout: dropout rate of the block feed-forward layers.
        num_classes: when > 0, a class embedding is added to the time
            embedding whenever ``class_label`` is supplied.

    Raises:
        ValueError: if ``img_size`` is not a multiple of ``patch_size``.
    """

    def __init__(
        self,
        img_size=128,
        patch_size=4,
        in_channels=3,
        d_model=256,
        depth=8,
        d_state=16,
        d_conv=4,
        expand=2,
        dropout=0.0,
        num_classes=0,
    ):
        super().__init__()
        # A non-divisible size would be floored silently and the network
        # would return an image smaller than its input.
        if img_size % patch_size != 0:
            raise ValueError(
                f"img_size ({img_size}) must be divisible by patch_size ({patch_size})"
            )
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.d_model = d_model
        self.depth = depth
        self.num_classes = num_classes
        self.num_patches_h = img_size // patch_size
        self.num_patches_w = img_size // patch_size
        self.num_patches = self.num_patches_h * self.num_patches_w
        self.patch_dim = in_channels * patch_size * patch_size

        self.patch_embed = nn.Sequential(
            nn.Linear(self.patch_dim, d_model),
            nn.LayerNorm(d_model),
        )
        self.pos_embed = nn.Parameter(
            torch.randn(1, self.num_patches, d_model) * 0.02
        )
        self.time_embed = nn.Sequential(
            SinusoidalPosEmb(d_model),
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
        )
        if num_classes > 0:
            self.class_embed = nn.Embedding(num_classes, d_model)
        else:
            self.class_embed = None

        # The conditioning vector is the (class-augmented) time embedding.
        cond_dim = d_model
        self.blocks = nn.ModuleList([
            LiquidSSMBlock(d_model, d_state, d_conv, expand, dropout)
            for _ in range(depth)
        ])
        self.adaln_blocks = nn.ModuleList([
            AdaptiveLayerNorm(d_model, cond_dim)
            for _ in range(depth)
        ])
        # One fusion layer per stored skip (first half of the stack).
        self.skip_projs = nn.ModuleList([
            nn.Linear(d_model * 2, d_model)
            for _ in range(depth // 2)
        ])
        self.final_norm = nn.LayerNorm(d_model)
        self.final_proj = nn.Linear(d_model, self.patch_dim)

        # Scan orders are buffers so they follow .to(device) and are saved
        # with the state dict.
        patterns, inv_patterns = create_scan_patterns(
            self.num_patches_h, self.num_patches_w
        )
        for i, (p, ip) in enumerate(zip(patterns, inv_patterns)):
            self.register_buffer(f'scan_{i}', p)
            self.register_buffer(f'unscan_{i}', ip)
        self.num_scan_patterns = len(patterns)

        # Depthwise convs mix neighbouring patches on the 2-D grid.
        self.pre_conv = nn.Conv2d(d_model, d_model, 3, padding=1, groups=d_model)
        self.post_conv = nn.Conv2d(d_model, d_model, 3, padding=1, groups=d_model)
        self._init_weights()

    def _init_weights(self):
        """
        Xavier-initialize all Linear/Conv weights and zero their biases.

        The bias of every ``dt_proj`` layer is left untouched: SelectiveSSM
        sets it to the inverse softplus of a step size in [0.001, 0.1],
        and zeroing it would start every Δ at softplus(0) ≈ 0.69 instead.
        The final projection is zeroed so the initial velocity is zero.
        """
        for name, m in self.named_modules():
            if not isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv1d)):
                continue
            nn.init.xavier_uniform_(m.weight)
            if m.bias is None:
                continue
            if name.rsplit('.', 1)[-1] == 'dt_proj':
                continue  # preserve the SSM step-size initialization
            nn.init.zeros_(m.bias)
        nn.init.zeros_(self.final_proj.weight)
        nn.init.zeros_(self.final_proj.bias)

    def patchify(self, x):
        """(B, C, H, W) image -> (B, num_patches, C * p * p) patch tokens."""
        B, C, H, W = x.shape
        p = self.patch_size
        # (B, C, nh, nw, p, p): non-overlapping p x p windows.
        x = x.unfold(2, p, p).unfold(3, p, p)
        x = x.contiguous().view(B, C, self.num_patches_h, self.num_patches_w, p * p)
        # Move channels next to the pixels so each token is (C, p*p) flattened.
        x = x.permute(0, 2, 3, 1, 4)
        x = x.contiguous().view(B, self.num_patches, self.patch_dim)
        return x

    def unpatchify(self, x):
        """(B, num_patches, C * p * p) patch tokens -> (B, C, H, W) image."""
        B = x.shape[0]
        p = self.patch_size
        C = self.in_channels
        H = self.num_patches_h
        W = self.num_patches_w
        x = x.view(B, H, W, C, p, p)
        # (B, C, H, p, W, p): interleave patch rows/cols with pixel rows/cols.
        x = x.permute(0, 3, 1, 4, 2, 5)
        x = x.contiguous().view(B, C, H * p, W * p)
        return x

    def _spatial_mix(self, conv, tokens):
        """Apply a 2-D conv to tokens laid out on the patch grid."""
        B = tokens.shape[0]
        grid = tokens.view(B, self.num_patches_h, self.num_patches_w, self.d_model)
        grid = conv(grid.permute(0, 3, 1, 2))
        return grid.permute(0, 2, 3, 1).contiguous().view(B, self.num_patches, self.d_model)

    def forward(self, x, t, class_label=None):
        """
        x:           (B, in_channels, img_size, img_size) noisy image x_t
        t:           (B,) flow time for each sample
        class_label: optional (B,) integer class ids; ignored when the
                     model was built with num_classes == 0
        Returns: (B, in_channels, img_size, img_size) predicted velocity
        """
        tokens = self.patchify(x)
        tokens = self.patch_embed(tokens)
        tokens = tokens + self.pos_embed
        tokens = self._spatial_mix(self.pre_conv, tokens)

        t_emb = self.time_embed(t)
        if self.class_embed is not None and class_label is not None:
            t_emb = t_emb + self.class_embed(class_label)

        half = self.depth // 2
        skips = []
        for i, (block, adaln) in enumerate(zip(self.blocks, self.adaln_blocks)):
            tokens = adaln(tokens, t_emb)
            # Cycle through the scan orders stage by stage.
            scan_pattern_idx = i % self.num_scan_patterns
            scan_idx = getattr(self, f'scan_{scan_pattern_idx}')
            unscan_idx = getattr(self, f'unscan_{scan_pattern_idx}')
            if i < half:
                skips.append(tokens)
            tokens = block(tokens, scan_idx, unscan_idx)
            if i >= half:
                # Mirror index; for odd depth the middle stage maps past
                # the end of ``skips`` and therefore gets no skip.
                skip_idx = self.depth - 1 - i
                if skip_idx < len(skips):
                    skip_proj = self.skip_projs[skip_idx]
                    tokens = skip_proj(torch.cat([tokens, skips[skip_idx]], dim=-1))

        tokens = self._spatial_mix(self.post_conv, tokens)
        tokens = self.final_norm(tokens)
        velocity = self.final_proj(tokens)
        return self.unpatchify(velocity)

    def count_params(self):
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+# ============================================================
+# 7. MODEL CONFIGURATIONS
+# ============================================================
def liquidflow_tiny(img_size=128, num_classes=0):
    """
    Smallest preset, intended for quick experiments at 128x128.

    Fixed settings: patch 4, d_model 192, depth 6, d_state 8.
    The original author's size estimate is ~5M parameters.
    """
    preset = dict(
        patch_size=4,
        in_channels=3,
        d_model=192,
        depth=6,
        d_state=8,
        d_conv=4,
        expand=2,
    )
    return LiquidFlowNet(img_size=img_size, num_classes=num_classes, **preset)
def liquidflow_small(img_size=128, num_classes=0):
    """
    Main preset for 128x128 images.

    Fixed settings: patch 4, d_model 256, depth 8, d_state 16.
    The original author's size estimate is ~12M parameters.
    """
    preset = dict(
        patch_size=4,
        in_channels=3,
        d_model=256,
        depth=8,
        d_state=16,
        d_conv=4,
        expand=2,
    )
    return LiquidFlowNet(img_size=img_size, num_classes=num_classes, **preset)
def liquidflow_base(img_size=256, num_classes=0):
    """
    Preset for 256x256 images.

    Fixed settings: patch 8, d_model 384, depth 10, d_state 16.
    The original author's size estimate is ~25M parameters.
    """
    preset = dict(
        patch_size=8,
        in_channels=3,
        d_model=384,
        depth=10,
        d_state=16,
        d_conv=4,
        expand=2,
    )
    return LiquidFlowNet(img_size=img_size, num_classes=num_classes, **preset)
def liquidflow_512(img_size=512, num_classes=0):
    """
    Preset for 512x512 images.

    Same widths as the 256x256 preset (d_model 384, depth 10, d_state 16)
    but with patch 16. The original author's size estimate is ~25M
    parameters.
    """
    preset = dict(
        patch_size=16,
        in_channels=3,
        d_model=384,
        depth=10,
        d_state=16,
        d_conv=4,
        expand=2,
    )
    return LiquidFlowNet(img_size=img_size, num_classes=num_classes, **preset)
if __name__ == "__main__":
    # Smoke test: build every preset on CPU, report its size, and check
    # that one forward pass returns a tensor shaped like its input.
    device = torch.device("cpu")
    presets = [
        ("tiny-128", liquidflow_tiny, 128),
        ("small-128", liquidflow_small, 128),
        ("base-256", liquidflow_base, 256),
        ("512", liquidflow_512, 512),
    ]
    for name, factory, size in presets:
        model = factory(size).to(device)
        model.eval()
        params = model.count_params()
        print(f"\n{name}: {params/1e6:.2f}M params")
        B = 2
        img_size = model.img_size
        x = torch.randn(B, 3, img_size, img_size, device=device)
        t = torch.rand(B, device=device)
        # Only the output shape is checked, so skip building the autograd
        # graph for the long sequential scan.
        with torch.no_grad():
            v = model(x, t)
        print(f"  Input: {x.shape} → Output: {v.shape}")
        assert v.shape == x.shape
        print(f"  ✓ Forward pass OK")