OzTianlu committed
Commit 1924b81 · verified · 1 Parent(s): 64510f9

Upload 2 files

Files changed (1):
  1. MonoidForCausalLM.py +30 -19
MonoidForCausalLM.py CHANGED
@@ -483,28 +483,39 @@ class MonoidAttention(nn.Module):
         return self.o_proj(o), final_state
 
         # ══════════════════════════════════════════════════════════
-        # Training path: memory-efficient sequential scan + inline readout
         # ══════════════════════════════════════════════════════════
-        # Loop token-by-token with running state S=[B,H,d,d].
-        # Peak memory: O(B·H·d²) instead of O(B·H·T·d²).
-        # Autograd records each step for correct gradient computation.
         #
-        S = self.h0.expand(B, -1, -1, -1).clone()  # [B,H,d,d]
-        o_parts = []
-        for t in range(T):
-            kv_t = torch.einsum('bhd, bhe -> bhde', k[:, :, t], v[:, :, t])
-            decay = torch.exp(log_alpha[:, :, t])  # [B,H,1]
-            while decay.dim() < S.dim():
-                decay = decay.unsqueeze(-1)
-            S = S * decay + kv_t
-            o_parts.append(torch.einsum('bhd, bhde -> bhe', q[:, :, t], S))
-
-        o = torch.stack(o_parts, dim=2)  # [B,H,T,d]
         o = o.transpose(1, 2).contiguous().view(B, T, -1)
         return self.o_proj(o), None
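The removed path is a plain left-to-right linear recurrence: S_t = α_t·S_{t-1} + k_t⊗v_t with readout o_t = q_t·S_t. A minimal standalone sketch of that recurrence, collapsed to a single (batch, head) slice in NumPy with illustrative names (this is not the repo's API):

```python
import numpy as np

def sequential_scan(q, k, v, log_alpha, h0):
    """Reference recurrence for one (batch, head) slice.

    q, k, v: [T, d]; log_alpha: [T]; h0: [d, d].
    Step: S_t = exp(log_alpha_t) * S_{t-1} + outer(k_t, v_t); o_t = q_t @ S_t.
    Only the running [d, d] state is held, so peak memory stays O(d^2)
    regardless of sequence length T.
    """
    S = h0.copy()
    outs = []
    for t in range(q.shape[0]):
        S = np.exp(log_alpha[t]) * S + np.outer(k[t], v[t])
        outs.append(q[t] @ S)
    return np.stack(outs)  # [T, d]
```

With log_alpha = 0 (no decay) and h0 = 0 this reduces to a running sum of outer products, i.e. unnormalized linear attention; the trade-off the commit removes is that each of the T iterations launches several small kernels from Python.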
 
         return self.o_proj(o), final_state
 
         # ══════════════════════════════════════════════════════════
+        # Training path: parallel scan + vectorized readout
         # ══════════════════════════════════════════════════════════
+        # Materialize the full kv tensor [B,H,T,d,d] and scan in one pass.
+        # Memory: O(B·H·T·d²) — trades memory for speed.
+        # Eliminates the T×30 Python-loop kernel launches for the outer
+        # product and readout; the scan itself is parallel when a CUDA
+        # kernel is available.
         #
+
+        # Vectorized outer product: kv_t = k_t ⊗ v_t for all t at once
+        kv = torch.einsum('bhtd, bhte -> bhtde', k, v)  # [B,H,T,d,d]
+
+        # Parallel prefix scan: S_t = α_t·S_{t-1} + kv_t (starting from S = 0)
+        # Keep log_alpha as [B,H,T,1] — the CUDA kernel backward expects this shape.
+        states = parallel_scan(log_alpha, kv)  # [B,H,T,d,d]
+
+        # Add the h0 contribution: S_t += (∏_{i=0}^{t} α_i) · h0
+        cum_log_alpha = torch.cumsum(log_alpha, dim=2)  # [B,H,T,1]
+        h0_decay = torch.exp(cum_log_alpha).unsqueeze(-1)  # [B,H,T,1,1]
+        states = states + h0_decay * self.h0.unsqueeze(2)  # broadcast h0 [1,H,1,d,d]
+
+        # Vectorized readout: o_t = q_t · S_t for all t at once
+        o = torch.einsum('bhtd, bhtde -> bhte', q, states)  # [B,H,T,d]
         o = o.transpose(1, 2).contiguous().view(B, T, -1)
         return self.o_proj(o), None
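The added path's correctness rests on one identity: running the scan from S = 0 and then adding (∏_{i=0}^{t} α_i)·h0 gives the same states as running the recurrence from S_0 = h0, because unrolling S_t = α_t·S_{t-1} + kv_t puts exactly that cumulative decay in front of the initial state. A hedged NumPy check of the algebra, collapsed to one (batch, head) slice; the repo's `parallel_scan` is stood in for by a plain loop, since it is the zero-init-plus-correction rewrite being verified, not the scan's parallelism:

```python
import numpy as np

def vectorized_path(q, k, v, log_alpha, h0):
    """Mirror of the new training path for one (batch, head) slice.

    q, k, v: [T, d]; log_alpha: [T]; h0: [d, d].
    """
    T, d = q.shape
    kv = np.einsum('td,te->tde', k, v)               # [T, d, d] outer products
    states = np.empty_like(kv)
    S = np.zeros((d, d))
    for t in range(T):                               # stand-in for parallel_scan
        S = np.exp(log_alpha[t]) * S + kv[t]
        states[t] = S
    h0_decay = np.exp(np.cumsum(log_alpha))          # prod_{i<=t} alpha_i, [T]
    states = states + h0_decay[:, None, None] * h0   # add decayed h0 term
    return np.einsum('td,tde->te', q, states)        # [T, d] readout

def sequential_path(q, k, v, log_alpha, h0):
    """Token-by-token recurrence started directly from S_0 = h0 (old path)."""
    S = h0.copy()
    outs = []
    for t in range(len(q)):
        S = np.exp(log_alpha[t]) * S + np.outer(k[t], v[t])
        outs.append(q[t] @ S)
    return np.stack(outs)
```

Feeding both functions the same random q, k, v, decays, and h0 yields equal outputs to floating-point tolerance, which is what licenses splitting the scan from the h0 term in the committed code.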