Upload 2 files
Browse files- MonoidForCausalLM.py +30 -19
MonoidForCausalLM.py
CHANGED
|
@@ -483,28 +483,39 @@ class MonoidAttention(nn.Module):
|
|
| 483 |
return self.o_proj(o), final_state
|
| 484 |
|
| 485 |
# ──────────────────────────────────────────────────────────
|
| 486 |
-
# Training path:
|
| 487 |
-
# 训练路径:
|
| 488 |
# ──────────────────────────────────────────────────────────
|
| 489 |
-
#
|
| 490 |
-
#
|
| 491 |
-
#
|
|
|
|
| 492 |
#
|
| 493 |
-
#
|
| 494 |
-
#
|
| 495 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
|
| 497 |
-
S = self.h0.expand(B, -1, -1, -1).clone() # [B,H,d,d]
|
| 498 |
-
o_parts = []
|
| 499 |
-
for t in range(T):
|
| 500 |
-
kv_t = torch.einsum('bhd, bhe -> bhde', k[:, :, t], v[:, :, t])
|
| 501 |
-
decay = torch.exp(log_alpha[:, :, t]) # [B,H,1]
|
| 502 |
-
while decay.dim() < S.dim():
|
| 503 |
-
decay = decay.unsqueeze(-1)
|
| 504 |
-
S = S * decay + kv_t
|
| 505 |
-
o_parts.append(torch.einsum('bhd, bhde -> bhe', q[:, :, t], S))
|
| 506 |
-
|
| 507 |
-
o = torch.stack(o_parts, dim=2) # [B,H,T,d]
|
| 508 |
o = o.transpose(1, 2).contiguous().view(B, T, -1)
|
| 509 |
return self.o_proj(o), None
|
| 510 |
|
|
|
|
| 483 |
return self.o_proj(o), final_state
|
| 484 |
|
| 485 |
# ──────────────────────────────────────────────────────────
|
| 486 |
+
# Training path: parallel scan + vectorized readout
|
| 487 |
+
# 训练路径: 并行扫描 + 向量化读出
|
| 488 |
# ──────────────────────────────────────────────────────────
|
| 489 |
+
# Materialize full kv tensor [B,H,T,d,d] and scan in one pass.
|
| 490 |
+
# Memory: O(B·H·T·d²) — trades memory for speed.
|
| 491 |
+
# Eliminates T×30 Python-loop kernel launches for outer product
|
| 492 |
+
# and readout; scan itself is parallel when CUDA kernel available.
|
| 493 |
#
|
| 494 |
+
# 物化完整 kv 张量 [B,H,T,d,d] 并一次性扫描。
|
| 495 |
+
# 内存: O(B·H·T·d²) — 以内存换速度。
|
| 496 |
+
# 消除外积和读出的 T×30 次 Python 循环 kernel launch;
|
| 497 |
+
# 当 CUDA kernel 可用时扫描本身也是并行的。
|
| 498 |
+
|
| 499 |
+
# Vectorized outer product: kv_t = k_t ⊗ v_t for all t at once
|
| 500 |
+
# 向量化外积: 一次性计算所有 t 的 k_t ⊗ v_t
|
| 501 |
+
kv = torch.einsum('bhtd, bhte -> bhtde', k, v) # [B,H,T,d,d]
|
| 502 |
+
|
| 503 |
+
# Parallel prefix scan: S_t = α_t·S_{t-1} + kv_t (from S=0)
|
| 504 |
+
# 并行前缀扫描: S_t = α_t·S_{t-1} + kv_t (从 S=0 开始)
|
| 505 |
+
# Keep log_alpha as [B,H,T,1] — CUDA kernel backward expects this shape.
|
| 506 |
+
# 保持 log_alpha 为 [B,H,T,1] — CUDA kernel 反向传播需要此形状。
|
| 507 |
+
states = parallel_scan(log_alpha, kv) # [B,H,T,d,d]
|
| 508 |
+
|
| 509 |
+
# Add h0 contribution: S_t += (∏_{i=0}^{t} α_i) · h0
|
| 510 |
+
# 加回 h0 贡献: S_t += (∏_{i=0}^{t} α_i) · h0
|
| 511 |
+
cum_log_alpha = torch.cumsum(log_alpha, dim=2) # [B,H,T,1]
|
| 512 |
+
h0_decay = torch.exp(cum_log_alpha).unsqueeze(-1) # [B,H,T,1,1]
|
| 513 |
+
states = states + h0_decay * self.h0.unsqueeze(2) # broadcast h0 [1,H,1,d,d]
|
| 514 |
+
|
| 515 |
+
# Vectorized readout: o_t = q_t · S_t for all t at once
|
| 516 |
+
# 向量化读出: 一次性计算所有 t 的 q_t · S_t
|
| 517 |
+
o = torch.einsum('bhtd, bhtde -> bhte', q, states) # [B,H,T,d]
|
| 518 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
o = o.transpose(1, 2).contiguous().view(B, T, -1)
|
| 520 |
return self.o_proj(o), None
|
| 521 |
|