Replace F.scaled_dot_product_attention with explicit implementation
Expand the single library call into 5 visible steps (scale, mask,
softmax, dropout, value-multiply) so learners can inspect each stage
of Scaled Dot-Product Attention directly in the source code.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- llm_lab/model/attention.py +31 -7
llm_lab/model/attention.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""Grouped Query Attention (GQA)."""
|
| 2 |
|
|
|
|
| 3 |
from typing import Optional
|
| 4 |
|
| 5 |
import torch
|
|
@@ -100,13 +101,36 @@ class GroupedQueryAttention(nn.Module):
|
|
| 100 |
# ──────────────────────────────────────────────
|
| 101 |
# Step 4: Scaled Dot-Product Attention
|
| 102 |
# ──────────────────────────────────────────────
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# → (batch_size, num_heads, seq_len, head_dim)
|
| 111 |
|
| 112 |
# ──────────────────────────────────────────────
|
|
|
|
| 1 |
"""Grouped Query Attention (GQA)."""
|
| 2 |
|
| 3 |
+
import math
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import torch
|
|
|
|
| 101 |
# ──────────────────────────────────────────────
|
| 102 |
# Step 4: Scaled Dot-Product Attention
|
| 103 |
# ──────────────────────────────────────────────
|
| 104 |
+
# Step 4-1: Compute scaled attention scores
|
| 105 |
+
# Q @ K^T → (batch_size, num_heads, seq_len, seq_len)
|
| 106 |
+
# Dividing by √d_k prevents dot products from growing too large,
|
| 107 |
+
# which would push softmax into regions with vanishing gradients.
|
| 108 |
+
scale = math.sqrt(self.head_dim)
|
| 109 |
+
attn_scores = torch.matmul(q, k.transpose(-2, -1)) / scale
|
| 110 |
+
|
| 111 |
+
# Step 4-2: Apply mask
|
| 112 |
+
# Causal mask fills future positions with -inf so they become 0 after softmax,
|
| 113 |
+
# ensuring the model can only attend to past and current tokens (autoregressive).
|
| 114 |
+
if mask is not None:
|
| 115 |
+
attn_scores = attn_scores + mask
|
| 116 |
+
else:
|
| 117 |
+
causal_mask = torch.triu(
|
| 118 |
+
torch.full((seq_len, seq_len), float("-inf"), device=q.device, dtype=q.dtype),
|
| 119 |
+
diagonal=1,
|
| 120 |
+
)
|
| 121 |
+
attn_scores = attn_scores + causal_mask
|
| 122 |
+
|
| 123 |
+
# Step 4-3: Softmax β attention weights (probability distribution over keys)
|
| 124 |
+
attn_weights = F.softmax(attn_scores, dim=-1)
|
| 125 |
+
|
| 126 |
+
# Step 4-4: Dropout (only during training)
|
| 127 |
+
# Randomly zeroing some attention weights acts as regularization,
|
| 128 |
+
# preventing the model from relying too heavily on specific token relationships.
|
| 129 |
+
if self.training and self.config.dropout > 0.0:
|
| 130 |
+
attn_weights = F.dropout(attn_weights, p=self.config.dropout)
|
| 131 |
+
|
| 132 |
+
# Step 4-5: Weighted sum of values
|
| 133 |
+
attn_out = torch.matmul(attn_weights, v)
|
| 134 |
# → (batch_size, num_heads, seq_len, head_dim)
|
| 135 |
|
| 136 |
# ──────────────────────────────────────────────
|