OzTianlu commited on
Commit
64510f9
·
verified ·
1 Parent(s): 471fab3

Upload 2 files

Browse files
Files changed (1) hide show
  1. MonoidForCausalLM.py +21 -34
MonoidForCausalLM.py CHANGED
@@ -483,43 +483,30 @@ class MonoidAttention(nn.Module):
483
  return self.o_proj(o), final_state
484
 
485
  # ═════════════════════════════════════════════════════════
486
- # Training path (parallel scan): O(T) via prefix sum
487
-        # 训练路径 (并行扫描): 通过前缀和 O(T)
488
  # ═════════════════════════════════════════════════════════
489
- # For a full sequence of length T, compute ALL prefix states
490
- # S_1, S_2, ..., S_T simultaneously using parallel prefix scan.
491
-        # Complexity: O(T) work, O(log T) depth — GPU-friendly.
492
  #
493
-        # 对长度为 T 的完整序列, 使用并行前缀扫描同时计算所有前缀状态
494
-        # S_1, S_2, ..., S_T。
495
-        # 复杂度: O(T) 工作量, O(log T) 深度 — GPU 友好。
496
-
497
-        # Batch outer product: kv_{t} = k_t ⊗ v_t for all t
498
-        # 批量外积: kv_{t} = k_t ⊗ v_t, 对所有 t
499
- kv = torch.einsum('bhtd, bhte -> bhtde', k, v) # [B,H,T,d,d]
500
- states = parallel_scan(log_alpha, kv)
501
- del kv # free [B,H,T,d,d] early
502
- final_state = None
503
-
504
-        # ── Incorporate h0: make training consistent with inference ──
505
-        # ── 融入 h0: 使训练与推理一致 ──
506
- # parallel_scan starts from S_0 = 0, but inference starts from S_0 = h0.
507
-        # Fix: S_t(with h0) = h0 · Π_{i=1}^{t} α_i + S_t(from scan)
508
-        # The cumulative decay Π_{i=1}^{t} α_i = exp(Σ_{i=1}^{t} log_α_i).
509
-        # parallel_scan 从 S_0 = 0 开始, 但推理从 S_0 = h0 开始。
510
-        # 修正: S_t(含h0) = h0 · Π_{i=1}^{t} α_i + S_t(扫描结果)
511
-        # 累积衰减 Π_{i=1}^{t} α_i = exp(Σ_{i=1}^{t} log_α_i)。
512
- cum_log_decay = torch.cumsum(log_alpha.squeeze(-1), dim=2) # [B,H,T]
513
- cum_decay = torch.exp(cum_log_decay).unsqueeze(-1).unsqueeze(-1) # [B,H,T,1,1]
514
- states = states + self.h0.unsqueeze(2) * cum_decay # [B,H,T,d,d]
515
- del cum_decay
516
-
517
-        # Readout: o_t = q_t · S_t for all t simultaneously
518
-        # 读出: o_t = q_t · S_t, 对所有 t 同时计算
519
- o = torch.einsum('bhtd, bhtde -> bhte', q, states)
520
- del states # free [B,H,T,d,d]
521
  o = o.transpose(1, 2).contiguous().view(B, T, -1)
522
- return self.o_proj(o), final_state
523
 
524
 
525
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
483
  return self.o_proj(o), final_state
484
 
485
  # ═════════════════════════════════════════════════════════
486
+ # Training path: memory-efficient sequential scan + inline readout
487
+        # 训练路径: 内存高效的串行扫描 + 内联读出
488
  # ═════════════════════════════════════════════════════════
489
+ # Loop token-by-token with running state S=[B,H,d,d].
490
+        # Peak memory: O(B·H·d²) instead of O(B·H·T·d²).
491
+ # Autograd records each step for correct gradient computation.
492
  #
493
+        # 逐 token 循环, 使用运行状态 S=[B,H,d,d]。
494
+        # 峰值内存: O(B·H·d²) 而非 O(B·H·T·d²)。
495
+        # Autograd 记录每步操作以正确计算梯度。
496
+
497
+ S = self.h0.expand(B, -1, -1, -1).clone() # [B,H,d,d]
498
+ o_parts = []
499
+ for t in range(T):
500
+ kv_t = torch.einsum('bhd, bhe -> bhde', k[:, :, t], v[:, :, t])
501
+ decay = torch.exp(log_alpha[:, :, t]) # [B,H,1]
502
+ while decay.dim() < S.dim():
503
+ decay = decay.unsqueeze(-1)
504
+ S = S * decay + kv_t
505
+ o_parts.append(torch.einsum('bhd, bhde -> bhe', q[:, :, t], S))
506
+
507
+ o = torch.stack(o_parts, dim=2) # [B,H,T,d]
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  o = o.transpose(1, 2).contiguous().view(B, T, -1)
509
+ return self.o_proj(o), None
510
 
511
 
512
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━