krystv
/

LiquidFlow-Gen

Model card Files Files and versions

xet

Community

krystv committed 19 days ago

Commit

f4749f1

verified ·

1 Parent(s): 029ca89

Upload liquid_flow/cfc_cell.py

Browse files

Files changed (1) hide show

liquid_flow/cfc_cell.py +116 -111

liquid_flow/cfc_cell.py CHANGED Viewed

@@ -1,28 +1,21 @@
 """
 CfC Cell — Closed-form Continuous-time neural network cell.
 From: "Closed-form Continuous-time Neural Networks" (Hasani et al., 2022)
-The CfC model provides an approximate closed-form solution to Liquid Time-Constant (LTC)
-network dynamics without needing ODE solvers.
-Architecture:
-    x(t) = σ(-f(x,I;θ_f) · t) ⊙ g(x,I;θ_g) + (1 - σ(-f(x,I;θ_f) · t)) ⊙ h(x,I;θ_h)
-Where:
-    - f, g, h are neural network heads sharing a backbone
-    - σ is the sigmoid (replacing exponential decay for gradient stability)
-    - t is a time parameter
-    - The sigmoidal terms act as time-continuous gates between g and h
-Key properties:
-    - No ODE solving → 100x+ faster than Neural ODEs
-    - Time-continuous gating mechanism → adaptive computation
-    - Closed-form → stable gradients, easy to train
-    - Naturally causal → good for sequential processing
-For 2D image inputs: we treat the spatial sequence as "time" steps for the CfC,
-allowing the liquid dynamics to model spatial dependencies with adaptive gates.
 """
 import torch
@@ -32,138 +25,150 @@ import torch.nn.functional as F
 class CfCCell(nn.Module):
     """
-    Single CfC cell with backbone + 3 heads (f, g, h).
     Args:
-        dim: Hidden dimension
-        backbone_dropout: Dropout in backbone layers
-        time_scale: Range [a, b] for time parameter sampling
-        use_conv: Add conv1d for local context
     """
-    def __init__(self, dim, backbone_dropout=0.0, time_scale=(0.0, 1.0), use_conv=True):
         super().__init__()
         self.dim = dim
         self.time_scale = time_scale
-        # Shared backbone
-        backbone_dim = dim * 3
         self.backbone = nn.Sequential(
-            nn.Linear(dim + dim, backbone_dim),
-            nn.LayerNorm(backbone_dim),
-            nn.SiLU(),
-            nn.Dropout(backbone_dropout),
-            nn.Linear(backbone_dim, dim * 4),
             nn.LayerNorm(dim * 4),
         )
-        # Optional 1D conv
-        self.conv = nn.Conv1d(dim, dim, kernel_size=3, padding=1, groups=dim) if use_conv else None
-        # Heads
-        self.f_head = nn.Sequential(nn.Linear(dim, dim), nn.LayerNorm(dim), nn.Tanh())
-        self.g_head = nn.Sequential(nn.Linear(dim, dim), nn.LayerNorm(dim), nn.GELU())
-        self.h_head = nn.Sequential(nn.Linear(dim, dim), nn.LayerNorm(dim), nn.GELU())
-        self.out_proj = nn.Linear(dim, dim)
         self._init_weights()
     def _init_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Linear):
-                nn.init.normal_(m.weight, std=0.02)
                 if m.bias is not None:
                     nn.init.zeros_(m.bias)
-    def forward(self, x, h_prev=None, t=None):
         """
         Args:
-            x: [B, dim] or [B, L, dim]
-            h_prev: Previous hidden state [B, dim]
-            t: Time parameter
-        Returns: h: [B, dim] or [B, L, dim]
         """
-        is_seq = x.dim() == 3
-        B, device = x.shape[0], x.device
-        if is_seq:
-            return self._forward_seq(x, h_prev, t)
-        if h_prev is None:
-            h_prev = torch.zeros(B, self.dim, device=device)
-        if t is None:
-            t = torch.rand(B, 1, device=device) * (self.time_scale[1] - self.time_scale[0]) + self.time_scale[0]
-        elif t.dim() == 1:
-            t = t.unsqueeze(1)
-        return self._step(x, h_prev, t)
-    def _forward_seq(self, x, h_prev=None, t=None):
         B, L, D = x.shape
         device = x.device
         if t is None:
-            t = torch.rand(B, 1, 1, device=device) * (self.time_scale[1] - self.time_scale[0]) + self.time_scale[0]
-        outputs = []
-        h = torch.zeros(B, D, device=device) if h_prev is None else h_prev
-        for step in range(L):
-            h = self._step(x[:, step, :], h, t.squeeze(-1) if t.dim() == 3 else t)
-            outputs.append(h)
-        return torch.stack(outputs, dim=1)
-    def _step(self, x, h_prev, t):
-        """Core CfC step."""
-        combined = torch.cat([x, h_prev], dim=-1)
-        backbone_out = self.backbone(combined)
-        f_base, g_base, h_base, skip = backbone_out.chunk(4, dim=-1)
-        if self.conv is not None:
-            f_base = f_base + self.conv(f_base.unsqueeze(1).transpose(1,2)).transpose(1,2).squeeze(1)
-            g_base = g_base + self.conv(g_base.unsqueeze(1).transpose(1,2)).transpose(1,2).squeeze(1)
-            h_base = h_base + self.conv(h_base.unsqueeze(1).transpose(1,2)).transpose(1,2).squeeze(1)
-        f_out = self.f_head(f_base)
-        g_out = self.g_head(g_base)
-        h_out = self.h_head(h_base)
-        gate = torch.sigmoid(-f_out * t)
-        h = gate * g_out + (1 - gate) * h_out + skip
-        return self.out_proj(h)
 class CfCBlock(nn.Module):
-    """CfC block for 2D image processing with residual connection."""
-    def __init__(self, dim, dropout=0.0, time_scale=(0.0, 1.0), expansion_factor=2):
         super().__init__()
-        self.dim = dim
         self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.cfc = CfCCell(dim=dim, backbone_dropout=dropout, time_scale=time_scale, use_conv=True)
         ff_dim = dim * expansion_factor
         self.ff = nn.Sequential(
-            nn.Linear(dim, ff_dim), nn.GELU(), nn.Dropout(dropout),
-            nn.Linear(ff_dim, dim), nn.Dropout(dropout),
         )
-        self.pos_embed = nn.Parameter(torch.randn(1, 4096, dim) * 0.02)
-    def forward(self, x, return_2d=True):
         is_2d = x.dim() == 4
         if is_2d:
             B, C, H, W = x.shape
-            L = H * W
-            x = x.flatten(2).transpose(1, 2)
-        else:
-            B, L, C = x.shape
-        x_with_pos = x + self.pos_embed[:, :L, :]
-        residual = x
-        h = self.cfc(self.norm1(x_with_pos))
-        x_out = h + self.ff(self.norm2(h + residual))
-        if is_2d and return_2d:
-            x_out = x_out.transpose(1, 2).reshape(B, C, H, W)
-        return x_out

 """
 CfC Cell — Closed-form Continuous-time neural network cell.
+FULLY PARALLEL implementation — no sequential loops.
 From: "Closed-form Continuous-time Neural Networks" (Hasani et al., 2022)
+Core CfC equation (Eq. 10 from paper):
+    x(t) = σ(-f(x,I;θ_f)·t) ⊙ g(x,I;θ_g) + (1 - σ(-f(x,I;θ_f)·t)) ⊙ h(x,I;θ_h)
+Key insight for parallelization:
+    The CfC equation is a CLOSED-FORM expression. As used in this module it maps
+    (input, time) → output without carrying a hidden state from one position to the
+    next, so there is NO recurrent dependency between timesteps. This means for image
+    processing we can compute ALL spatial positions in a single parallel pass.
+    We use it as an adaptive gating mechanism:
+    - f network produces position-dependent time constants
+    - g/h networks produce two candidate feature maps
+    - The sigmoid gate blends them adaptively per-position
 """
 import torch
 class CfCCell(nn.Module):
     """
+    Parallel CfC cell — processes ALL positions simultaneously.
+    The key realization: CfC's closed-form solution is NOT recurrent.
+    It's a function of (input, time) → output. So we apply it to all
+    spatial positions in parallel.
+    For a sequence [B, L, D]:
+        - f, g, h networks are applied to ALL L positions in parallel
+        - The time parameter t modulates the gate per-position
+        - Output is computed in a single vectorized operation
     Args:
+        dim: Feature dimension
+        dropout: Dropout rate
+        time_scale: Range for time parameter
     """
+    def __init__(self, dim, dropout=0.0, time_scale=(0.1, 1.0)):
         super().__init__()
         self.dim = dim
         self.time_scale = time_scale
+        # Shared backbone (processes all positions in parallel)
         self.backbone = nn.Sequential(
+            nn.Linear(dim, dim * 4),
             nn.LayerNorm(dim * 4),
+            nn.SiLU(),
+            nn.Dropout(dropout),
         )
+        # f head: time-constant (bounded by tanh for stability)
+        self.f_head = nn.Sequential(
+            nn.Linear(dim * 4, dim),
+            nn.Tanh(),
+        )
+        # g head: candidate weighted by the gate σ(-f·(t + time_bias)); dominant as the gate → 1 (-f·t large and positive)
+        self.g_head = nn.Sequential(
+            nn.Linear(dim * 4, dim),
+        )
+        # h head: candidate weighted by (1 - gate); dominant as the gate → 0 (-f·t large and negative)
+        self.h_head = nn.Sequential(
+            nn.Linear(dim * 4, dim),
+        )
+        # Learnable time-bias per channel (makes time adaptive per feature)
+        self.time_bias = nn.Parameter(torch.zeros(dim))
         self._init_weights()
     def _init_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight, gain=0.02)
                 if m.bias is not None:
                     nn.init.zeros_(m.bias)
+    def forward(self, x, t=None):
         """
+        Fully parallel CfC forward pass.
         Args:
+            x: [B, L, D] — all positions processed simultaneously
+            t: Optional time parameter [B, 1, 1] or scalar.
+               If None, sampled randomly during training, fixed during eval.
+        Returns:
+            out: [B, L, D]
         """
         B, L, D = x.shape
         device = x.device
+        # Time parameter
         if t is None:
+            if self.training:
+                # Random time per sample (one value per batch element) during training (data augmentation)
+                t = torch.rand(B, 1, 1, device=device) * (
+                    self.time_scale[1] - self.time_scale[0]
+                ) + self.time_scale[0]
+            else:
+                # Fixed midpoint during inference
+                t = torch.full((B, 1, 1), 0.5 * (self.time_scale[0] + self.time_scale[1]), device=device)
+        # Shared backbone (parallel over all B*L positions)
+        features = self.backbone(x)  # [B, L, dim*4]
+        # Three heads (all parallel)
+        f_out = self.f_head(features)  # [B, L, D] — bounded by tanh
+        g_out = self.g_head(features)  # [B, L, D]
+        h_out = self.h_head(features)  # [B, L, D]
+        # CfC gating: σ(-f * (t + time_bias))
+        # time_bias makes gating adaptive per-channel
+        effective_t = t + self.time_bias.view(1, 1, -1)  # [B, 1, D] broadcast
+        gate = torch.sigmoid(-f_out * effective_t)  # [B, L, D]
+        # CfC output: gate * g + (1-gate) * h
+        out = gate * g_out + (1 - gate) * h_out  # [B, L, D]
+        return out
 class CfCBlock(nn.Module):
+    """
+    CfC block for 2D image processing.
+    Fully parallel — no sequential loops.
+    Architecture:
+        Input [B, C, H, W] → flatten → CfC (parallel) → reshape → Output
+        With: pre-norm, residual connection, feed-forward
+    """
+    def __init__(self, dim, dropout=0.0, expansion_factor=2):
         super().__init__()
         self.norm1 = nn.LayerNorm(dim)
+        self.cfc = CfCCell(dim=dim, dropout=dropout)
+        self.norm2 = nn.LayerNorm(dim)
         ff_dim = dim * expansion_factor
         self.ff = nn.Sequential(
+            nn.Linear(dim, ff_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(ff_dim, dim),
+            nn.Dropout(dropout),
         )
+    def forward(self, x):
+        """
+        Args:
+            x: [B, C, H, W] or [B, L, C]
+        Returns:
+            Same shape as input
+        """
         is_2d = x.dim() == 4
         if is_2d:
             B, C, H, W = x.shape
+            x = x.flatten(2).transpose(1, 2)  # [B, HW, C]
+        # Pre-norm + CfC + residual
+        x = x + self.cfc(self.norm1(x))
+        # Pre-norm + FF + residual
+        x = x + self.ff(self.norm2(x))
+        if is_2d:
+            x = x.transpose(1, 2).reshape(B, C, H, W)
+        return x