krystv
/

ArtFlow

Model card Files Files and versions

xet

Community

krystv committed on Apr 28

Commit

0ff7ac8

verified ·

1 Parent(s): 6056eca

v1.4: replace SSM scan with parallel causal linear attn, batch wavelet subbands, use F.scaled_dot_product_attention, fix AMP on CPU

Browse files

Files changed (1) hide show

artflow_model.py +79 -144

artflow_model.py CHANGED Viewed

@@ -203,169 +203,100 @@ def zigzag_unflatten(x: torch.Tensor, H: int, W: int) -> torch.Tensor:
     return x[:, inv].reshape(x.shape[0], H, W, x.shape[2]).permute(0, 3, 1, 2)
 # ============================================================================
-# Selective State Space Model (Mamba-style, simplified)
 # ============================================================================
-class SelectiveSSM(nn.Module):
     """
-    Selective State Space Model (Mamba-style) — GPU-optimized.
-    Uses the cumsum trick for fully vectorized scan (no Python for-loop).
-    Math: h_t = dA_t * h_{t-1} + dBx_t
-    Vectorized: h_t = exp(cumlogdA_t) * cumsum(exp(-cumlogdA_s) * dBx_s)
     """
     def __init__(self, d_model: int, state_dim: int = 16, expand: int = 2):
         super().__init__()
         d_inner = d_model * expand
         self.in_proj = nn.Linear(d_model, d_inner * 2, bias=False)
-        self.conv1d = nn.Conv1d(d_inner, d_inner, kernel_size=3, padding=1, groups=d_inner)
-        self.x_proj = nn.Linear(d_inner, state_dim * 2 + 1, bias=False)
-        A = torch.arange(1, state_dim + 1, dtype=torch.float32).unsqueeze(0).expand(d_inner, -1)
-        self.A_log = nn.Parameter(torch.log(A))
         self.D = nn.Parameter(torch.ones(d_inner))
         self.out_proj = nn.Linear(d_inner, d_model, bias=False)
-        self.d_inner = d_inner
-        self.state_dim = state_dim
     def forward(self, x: torch.Tensor, style_mod: Optional[torch.Tensor] = None) -> torch.Tensor:
         B, L, D = x.shape
-        # Input projection + gating
         xz = self.in_proj(x)
         x_inner, z = xz.chunk(2, dim=-1)
-        # Local context via depthwise conv
-        x_inner = self.conv1d(x_inner.transpose(1, 2)).transpose(1, 2)
-        x_inner = F.silu(x_inner)
-        # Input-dependent SSM parameters
-        x_params = self.x_proj(x_inner)
-        B_sel = x_params[..., :self.state_dim]
-        C_sel = x_params[..., self.state_dim:2*self.state_dim]
-        dt = F.softplus(x_params[..., -1:]).clamp(min=1e-4, max=10.0)
-        # Style modulation
         if style_mod is not None:
-            s_B = style_mod[:, :self.state_dim].unsqueeze(1)
-            s_C = style_mod[:, self.state_dim:2*self.state_dim].unsqueeze(1)
-            B_sel = B_sel + s_B
-            C_sel = C_sel + s_C
-        A = -torch.exp(self.A_log)  # (d_inner, N), negative
-        # ============================================================
-        # VECTORIZED SCAN via cumsum trick — NO Python for-loop!
-        # h_t = dA_t * h_{t-1} + dBx_t
-        #
-        # Numerically stable version: subtract max before exp to
-        # prevent overflow. Uses the identity:
-        #   h_t = exp(cumlog[t] - max_t) * cumsum(exp(max_t - cumlog[s]) * dBx[s])
-        # where max_t is broadcast from final cumlog for stability.
-        # ============================================================
-        dt_exp = dt.expand(-1, -1, self.d_inner)      # (B, L, d_inner)
-        # Log of decay per step: dt * A  (A negative → log_dA negative)
-        log_dA = dt_exp.unsqueeze(-1) * A.unsqueeze(0).unsqueeze(0)  # (B, L, d_inner, N)
-        # Cumulative log-decay
-        cumlog = torch.cumsum(log_dA, dim=1)           # (B, L, d_inner, N)
-        # Input contribution: dBx = dt * B * x
-        dBx = (dt_exp.unsqueeze(-1)                    # (B, L, d_inner, 1)
-               * B_sel.unsqueeze(2)                     # (B, L, 1, N)
-               * x_inner.unsqueeze(-1))                 # (B, L, d_inner, 1)
-        # Numerically stable vectorized scan:
-        # Shift by cumlog to keep exponents near zero.
-        # For each position t, compute:
-        #   h_t = Σ_{s<=t} exp(cumlog_t - cumlog_s) * dBx_s
-        # We rewrite as:
-        #   h_t = exp(cumlog_t) * Σ_{s<=t} exp(-cumlog_s) * dBx_s
-        # The exp(-cumlog_s) can blow up. Stabilize by normalizing per chunk.
-        #
-        # Simple stable approach: process in chunks, carry state across chunks.
-        # Chunk size small enough that exp(-local_cumlog) stays in float range.
-        # With A ≈ -8, dt ≈ 1 → log_dA ≈ -8/step → max chunk ≈ 88/8 = 11.
-        CHUNK = 8
-        y = torch.zeros(B, L, self.d_inner, device=x.device, dtype=x.dtype)
-        h_carry = torch.zeros(B, self.d_inner, self.state_dim, device=x.device, dtype=x.dtype)
-        for c_start in range(0, L, CHUNK):
-            c_end = min(c_start + CHUNK, L)
-            c_len = c_end - c_start
-            # Local cumlog within this chunk (reset accumulation)
-            local_log = log_dA[:, c_start:c_end]                    # (B, c_len, D, N)
-            local_cumlog = torch.cumsum(local_log, dim=1)           # (B, c_len, D, N)
-            local_dBx = dBx[:, c_start:c_end]                      # (B, c_len, D, N)
-            local_C = C_sel[:, c_start:c_end]                       # (B, c_len, N)
-            # Clamp to prevent exp overflow (float32 max ≈ e^88)
-            local_cumlog = local_cumlog.clamp(min=-80, max=80)
-            # Decay carry-over state by chunk's cumulative decay
-            carry_decay = torch.exp(local_cumlog)                   # (B, c_len, D, N)
-            h_from_carry = h_carry.unsqueeze(1) * carry_decay       # (B, c_len, D, N)
-            # Within-chunk scan (stable since chunk is short)
-            weighted = torch.exp(-local_cumlog) * local_dBx         # (B, c_len, D, N)
-            running = torch.cumsum(weighted, dim=1)                 # (B, c_len, D, N)
-            h_from_input = carry_decay * running                    # (B, c_len, D, N)
-            h_chunk = h_from_carry + h_from_input                   # (B, c_len, D, N)
-            # Output: y_t = C_t · h_t
-            y[:, c_start:c_end] = (h_chunk * local_C.unsqueeze(2)).sum(-1)
-            # Update carry state = last hidden state of chunk
-            h_carry = h_chunk[:, -1]                                # (B, D, N)
-        # Skip connection + gating
-        y = y + x_inner * self.D.unsqueeze(0).unsqueeze(0)
-        y = y * F.silu(z)
         return self.out_proj(y)
 # ============================================================================
-# WaveMamba Block
 # ============================================================================
 class WaveMambaBlock(nn.Module):
     """
-    Wavelet-decomposed Mamba block. Core innovation of ArtFlow.
-    Decomposes input into frequency subbands, processes each with Mamba,
-    then reconstructs. O(n) complexity with frequency awareness.
     """
     def __init__(self, channels: int, config: ArtFlowConfig):
         super().__init__()
         self.wavelet = HaarWavelet2D()
-        # One Mamba per subband (shared weights for LL and detail bands)
-        self.mamba_low = SelectiveSSM(channels, config.mamba_state_dim, config.mamba_expand)
-        self.mamba_high = SelectiveSSM(channels, config.mamba_state_dim, config.mamba_expand)
-        # Pre/post norms
         self.norm_pre = RMSNorm(channels)
-        self.norm_post = RMSNorm(channels)
-        # AdaLN for conditioning
         self.adaln = AdaLNZero(channels, config.style_dim + config.text_dim)
-        # Style projection for Mamba modulation
         self.style_proj = nn.Linear(config.style_dim, config.mamba_state_dim * 2)
     def forward(self, x: torch.Tensor, cond: torch.Tensor,
                 style_mod: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """
-        x: (B, C, H, W)
-        cond: (B, cond_dim) - combined conditioning
-        style_mod: (B, style_dim) - style modulation
-        """
         residual = x
         B, C, H, W = x.shape
@@ -373,33 +304,41 @@ class WaveMambaBlock(nn.Module):
         x_flat = x.permute(0, 2, 3, 1).reshape(B * H * W, C)
         x_flat = self.norm_pre(x_flat).reshape(B, H, W, C).permute(0, 3, 1, 2)
-        # Wavelet decomposition
         LL, LH, HL, HH = self.wavelet(x_flat)
         H2, W2 = H // 2, W // 2
-        # Style modulation signal
         ssm_style = self.style_proj(style_mod) if style_mod is not None else None
-        # Zigzag flatten each subband
-        seq_LL = zigzag_flatten(LL)  # (B, H2*W2, C)
-        seq_LH = zigzag_flatten(LH)
-        seq_HL = zigzag_flatten(HL)
-        seq_HH = zigzag_flatten(HH)
-        # Process with Mamba
-        out_LL = self.mamba_low(seq_LL, ssm_style)
-        out_LH = self.mamba_high(seq_LH, ssm_style)
-        out_HL = self.mamba_high(seq_HL, ssm_style)
-        out_HH = self.mamba_high(seq_HH, ssm_style)
-        # Zigzag unflatten
-        out_LL = zigzag_unflatten(out_LL, H2, W2)
-        out_LH = zigzag_unflatten(out_LH, H2, W2)
-        out_HL = zigzag_unflatten(out_HL, H2, W2)
-        out_HH = zigzag_unflatten(out_HH, H2, W2)
-        # Inverse wavelet reconstruction
-        y = self.wavelet.inverse(out_LL, out_LH, out_HL, out_HH)
         # AdaLN + residual
         y_flat = y.permute(0, 2, 3, 1).reshape(B, H * W, C)
@@ -485,12 +424,8 @@ class MultiQueryCrossAttention(nn.Module):
             K = K.repeat(1, repeat, 1, 1)
             V = V.repeat(1, repeat, 1, 1)
-        # Attention
-        scale = self.head_dim ** -0.5
-        attn = torch.matmul(Q, K.transpose(-2, -1)) * scale
-        attn = F.softmax(attn, dim=-1)
-        out = torch.matmul(attn, V)
         out = out.transpose(1, 2).reshape(B, N, D)
         out = self.out_proj(out)

     return x[:, inv].reshape(x.shape[0], H, W, x.shape[2]).permute(0, 3, 1, 2)
 # ============================================================================
+# Fast Sequence Mixer — replaces SSM scan with parallel-only operations
 # ============================================================================
class FastSequenceMixer(nn.Module):
    """Loop-free sequence mixer used in place of the Mamba-style SSM scan.

    The input is projected to a value stream and a gate. The value stream goes
    through a depthwise 1-D convolution (local mixing) and then a causal,
    distance-decayed attention with non-negative ``elu(x) + 1`` feature maps
    (global mixing). The result is gated with ``silu(z)`` and projected back to
    ``d_model``.

    Cost note: the global path materialises a dense ``(B, L, L)`` score matrix,
    so time and memory grow quadratically with the sequence length ``L``. There
    is no Python-level loop over time steps, but this is only appropriate for
    short sequences.

    Causality note: only the attention path is causal. The depthwise
    convolution uses symmetric padding (``kernel_size=7, padding=3``), so each
    position also sees up to three following positions through ``x_local``.

    Args:
        d_model: Feature size of the input and output tokens.
        state_dim: Width ``k`` of the query/key feature maps.
        expand: Multiplier giving the inner width ``d_inner = d_model * expand``.
    """

    def __init__(self, d_model: int, state_dim: int = 16, expand: int = 2):
        super().__init__()
        d_inner = d_model * expand
        self.d_inner = d_inner
        self.state_dim = state_dim
        # One projection yields both the value stream and the gate (split in forward).
        self.in_proj = nn.Linear(d_model, d_inner * 2, bias=False)
        self.dwconv = nn.Conv1d(d_inner, d_inner, kernel_size=7, padding=3, groups=d_inner)
        self.q_proj = nn.Linear(d_inner, state_dim, bias=False)
        self.k_proj = nn.Linear(d_inner, state_dim, bias=False)
        self.v_proj = nn.Linear(d_inner, d_inner, bias=False)
        # Scalar logit; sigmoid(decay) in (0, 1) is the per-step attenuation of
        # attention towards older positions.
        self.decay = nn.Parameter(torch.zeros(1))
        # Per-channel weight of the skip path from the un-convolved value stream.
        self.D = nn.Parameter(torch.ones(d_inner))
        self.out_proj = nn.Linear(d_inner, d_model, bias=False)
        # Small gain so the block starts out contributing little to its residual.
        nn.init.xavier_uniform_(self.out_proj.weight, gain=0.1)

    def forward(self, x: torch.Tensor, style_mod: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Mix a batch of sequences.

        Args:
            x: Tensor of shape ``(B, L, d_model)``.
            style_mod: Optional ``(B, >= 2 * state_dim)`` tensor. The first
                ``state_dim`` features shift the queries and the next
                ``state_dim`` shift the keys. If it has fewer than
                ``2 * state_dim`` features it is silently ignored.

        Returns:
            Tensor of shape ``(B, L, d_model)``.
        """
        seq_len = x.shape[1]

        x_inner, z = self.in_proj(x).chunk(2, dim=-1)
        # Conv1d wants (B, C, L); transpose in and back out.
        x_local = F.silu(self.dwconv(x_inner.transpose(1, 2)).transpose(1, 2))

        # elu(.) + 1 keeps queries/keys strictly positive, so every score is
        # positive and the row normalisation below is well defined.
        Q = F.elu(self.q_proj(x_local), alpha=1.0) + 1  # (B, L, k)
        K = F.elu(self.k_proj(x_local), alpha=1.0) + 1  # (B, L, k)
        V = self.v_proj(x_local)                        # (B, L, d_inner)

        if style_mod is not None:
            k = self.state_dim
            if style_mod.shape[-1] >= 2 * k:
                Q = Q + F.elu(style_mod[:, :k], alpha=1.0).unsqueeze(1) + 1
                K = K + F.elu(style_mod[:, k:2 * k], alpha=1.0).unsqueeze(1) + 1

        scores = torch.bmm(Q, K.transpose(1, 2))  # (B, L, L); row i = query, col j = key

        # lag[i, j] = i - j: how far key j lies in the past of query i.
        # The previous code used j - i, which clamps to 0 on every causally
        # visible entry, so the decay was never applied and `self.decay` got
        # no gradient.
        # NOTE(review): checkpoints trained with that no-op decay still hold
        # decay == 0 (sigmoid -> 0.5) and will produce different outputs here.
        pos = torch.arange(seq_len, device=x.device, dtype=x.dtype)
        lag = (pos.unsqueeze(1) - pos.unsqueeze(0)).clamp(min=0)
        d = torch.sigmoid(self.decay)
        # tril zeroes the future (j > i); the diagonal keeps weight d**0 == 1.
        causal_decay = torch.tril(d.pow(lag))  # (L, L), rebuilt on every call

        scores = scores * causal_decay.unsqueeze(0)
        scores = scores / scores.sum(-1, keepdim=True).clamp(min=1e-6)
        y_global = torch.bmm(scores, V)  # (B, L, d_inner)

        y = x_local + y_global + x_inner * self.D
        y = y * F.silu(z)
        return self.out_proj(y)


# Backward-compatibility alias: code that still refers to SelectiveSSM gets
# the FastSequenceMixer implementation.
SelectiveSSM = FastSequenceMixer
 # ============================================================================
+# WaveMamba Block — batches all 4 subbands into one mixer call
 # ============================================================================
 class WaveMambaBlock(nn.Module):
     """
+    Wavelet-decomposed sequence mixing block.
+    Decomposes input → 4 frequency subbands → batches into single mixer call → reconstructs.
     """
     def __init__(self, channels: int, config: ArtFlowConfig):
         super().__init__()
         self.wavelet = HaarWavelet2D()
+        # Single mixer handles all 4 subbands (batched along B dimension)
+        self.mixer = FastSequenceMixer(channels, config.mamba_state_dim, config.mamba_expand)
         self.norm_pre = RMSNorm(channels)
         self.adaln = AdaLNZero(channels, config.style_dim + config.text_dim)
         self.style_proj = nn.Linear(config.style_dim, config.mamba_state_dim * 2)
     def forward(self, x: torch.Tensor, cond: torch.Tensor,
                 style_mod: Optional[torch.Tensor] = None) -> torch.Tensor:
         residual = x
         B, C, H, W = x.shape
         x_flat = x.permute(0, 2, 3, 1).reshape(B * H * W, C)
         x_flat = self.norm_pre(x_flat).reshape(B, H, W, C).permute(0, 3, 1, 2)
+        # Wavelet decomposition → 4 subbands
         LL, LH, HL, HH = self.wavelet(x_flat)
         H2, W2 = H // 2, W // 2
         ssm_style = self.style_proj(style_mod) if style_mod is not None else None
+        # BATCH all 4 subbands into one mixer call!
+        # Stack along batch dimension: (4*B, H2*W2, C)
+        all_subs = torch.cat([
+            zigzag_flatten(LL),
+            zigzag_flatten(LH),
+            zigzag_flatten(HL),
+            zigzag_flatten(HH),
+        ], dim=0)  # (4*B, L_sub, C)
+        # Expand style for batched call: (B, k) → (4*B, k)
+        if ssm_style is not None:
+            style_batched = ssm_style.unsqueeze(0).expand(4, -1, -1).reshape(4 * B, -1)
+        else:
+            style_batched = None
+        # Single mixer call for all 4 subbands
+        all_out = self.mixer(all_subs, style_batched)   # (4*B, L_sub, C)
+        # Split back
+        oLL, oLH, oHL, oHH = all_out.chunk(4, dim=0)   # each (B, L_sub, C)
+        # Unflatten
+        oLL = zigzag_unflatten(oLL, H2, W2)
+        oLH = zigzag_unflatten(oLH, H2, W2)
+        oHL = zigzag_unflatten(oHL, H2, W2)
+        oHH = zigzag_unflatten(oHH, H2, W2)
+        # Inverse wavelet
+        y = self.wavelet.inverse(oLL, oLH, oHL, oHH)
         # AdaLN + residual
         y_flat = y.permute(0, 2, 3, 1).reshape(B, H * W, C)
             K = K.repeat(1, repeat, 1, 1)
             V = V.repeat(1, repeat, 1, 1)
+        # Attention — uses F.scaled_dot_product_attention (fused kernel on GPU)
+        out = F.scaled_dot_product_attention(Q, K, V)
         out = out.transpose(1, 2).reshape(B, N, D)
         out = self.out_proj(out)