refactor(model): replace single-letter vars with descriptive names for readability
Rename B/S → batch_size/seq_len and h → hidden_states across attention,
transformer_block, and llm_model modules. No functional changes.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- llm_lab/model/attention.py +19 -19
- llm_lab/model/llm_model.py +7 -7
- llm_lab/model/transformer_block.py +2 -2
llm_lab/model/attention.py
CHANGED

@@ -69,21 +69,21 @@ class GroupedQueryAttention(nn.Module):
        Returns:
            (batch_size, seq_len, hidden_dim)
        """
-       B, S, _ = x.shape
+       batch_size, seq_len, _ = x.shape

        # ──────────────────────────────────────────────
        # Step 1: Q, K, V projections
        # ──────────────────────────────────────────────
-       q = self.q_proj(x)  # (B, S, num_heads × head_dim)
-       k = self.k_proj(x)  # (B, S, num_kv_heads × head_dim)
-       v = self.v_proj(x)  # (B, S, num_kv_heads × head_dim)
+       q = self.q_proj(x)  # (batch_size, seq_len, num_heads × head_dim)
+       k = self.k_proj(x)  # (batch_size, seq_len, num_kv_heads × head_dim)
+       v = self.v_proj(x)  # (batch_size, seq_len, num_kv_heads × head_dim)

        # Reshape into multi-head form
-       q = q.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
-       # → (B, num_heads, S, head_dim)
-       k = k.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
-       # → (B, num_kv_heads, S, head_dim)
-       v = v.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
+       q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+       # → (batch_size, num_heads, seq_len, head_dim)
+       k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
+       # → (batch_size, num_kv_heads, seq_len, head_dim)
+       v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)

        # ──────────────────────────────────────────────
        # Step 2: Apply RoPE (to Q and K only! Not to V)

@@ -97,7 +97,7 @@ class GroupedQueryAttention(nn.Module):
        # ──────────────────────────────────────────────
        # num_kv_heads=4 → num_heads=16: repeat each KV 4 times
        if self.num_kv_groups > 1:
-           k = self._repeat_kv(k)  # (B, num_heads, S, head_dim)
+           k = self._repeat_kv(k)  # (batch_size, num_heads, seq_len, head_dim)
            v = self._repeat_kv(v)

        # ──────────────────────────────────────────────

@@ -110,25 +110,25 @@ class GroupedQueryAttention(nn.Module):
            dropout_p=self.config.dropout if self.training else 0.0,
            is_causal=(mask is None),  # apply automatic causal masking when no mask is provided
        )
-       # → (B, num_heads, S, head_dim)
+       # → (batch_size, num_heads, seq_len, head_dim)

        # ──────────────────────────────────────────────
        # Step 5: Merge heads + output projection
        # ──────────────────────────────────────────────
-       attn_out = attn_out.transpose(1, 2).contiguous().view(B, S, -1)
-       # → (B, S, num_heads × head_dim)
+       attn_out = attn_out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
+       # → (batch_size, seq_len, num_heads × head_dim)

-       return self.o_proj(attn_out)  # → (B, S, hidden_dim)
+       return self.o_proj(attn_out)  # → (batch_size, seq_len, hidden_dim)

    def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
        """Repeat KV heads to match the number of Q heads.

-       (B, num_kv_heads, S, head_dim) → (B, num_heads, S, head_dim)
+       (batch_size, num_kv_heads, seq_len, head_dim) → (batch_size, num_heads, seq_len, head_dim)

        Example: num_kv_heads=4, num_kv_groups=4
            [kv0, kv1, kv2, kv3] → [kv0,kv0,kv0,kv0, kv1,kv1,kv1,kv1, ...]
        """
-       B, num_kv_heads, S, head_dim = x.shape
-       x = x[:, :, None, :, :]  # (B, num_kv_heads, 1, S, head_dim)
-       x = x.expand(B, num_kv_heads, self.num_kv_groups, S, head_dim)
-       return x.reshape(B, self.num_heads, S, head_dim)
+       batch_size, num_kv_heads, seq_len, head_dim = x.shape
+       x = x[:, :, None, :, :]  # (batch_size, num_kv_heads, 1, seq_len, head_dim)
+       x = x.expand(batch_size, num_kv_heads, self.num_kv_groups, seq_len, head_dim)
+       return x.reshape(batch_size, self.num_heads, seq_len, head_dim)
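The KV-repeat step in _repeat_kv is easy to sanity-check in isolation. The following is a minimal, standalone sketch, not part of this commit: the toy shapes and num_kv_groups=2 are made up. It tags each KV head with its own index and shows the expand → reshape step producing the [kv0, kv0, ..., kv1, kv1, ...] ordering described in the docstring.

import torch

batch_size, num_kv_heads, seq_len, head_dim = 1, 2, 3, 4
num_kv_groups = 2
num_heads = num_kv_heads * num_kv_groups  # 4

# Tag each KV head with its index so the ordering is visible after the repeat.
kv = torch.arange(num_kv_heads, dtype=torch.float32).view(1, num_kv_heads, 1, 1)
kv = kv.expand(batch_size, num_kv_heads, seq_len, head_dim)

# Same expand/reshape pattern as _repeat_kv above.
repeated = kv[:, :, None, :, :].expand(
    batch_size, num_kv_heads, num_kv_groups, seq_len, head_dim
).reshape(batch_size, num_heads, seq_len, head_dim)

print(repeated[0, :, 0, 0])  # tensor([0., 0., 1., 1.]) -> kv0, kv0, kv1, kv1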
llm_lab/model/llm_model.py
CHANGED

@@ -97,11 +97,11 @@ class LLMModel(nn.Module):
            logits: (batch_size, seq_len, vocab_size)
            loss: scalar (when targets are provided) or None
        """
-       B, S = input_ids.shape
+       batch_size, seq_len = input_ids.shape

        # ── Step 1: Token Embedding ──
        # Convert each token ID into a vector of dimension hidden_dim
-       h = self.token_embedding(input_ids)  # (B, S, hidden_dim)
+       hidden_states = self.token_embedding(input_ids)  # (batch_size, seq_len, hidden_dim)

        # ── Step 2: Transformer Blocks ──
        # Activation Checkpointing: saves memory during training

@@ -109,18 +109,18 @@
        for layer in self.layers:
            if self.training and torch.is_grad_enabled():
                # Apply Activation Checkpointing
-               h = torch.utils.checkpoint.checkpoint(
-                   layer, h, None, position_offset,
+               hidden_states = torch.utils.checkpoint.checkpoint(
+                   layer, hidden_states, None, position_offset,
                    use_reentrant=False,  # recommended for PyTorch >= 2.0
                )
            else:
-               h = layer(h, mask=None, position_offset=position_offset)
+               hidden_states = layer(hidden_states, mask=None, position_offset=position_offset)

        # ── Step 3: Final normalization ──
-       h = self.final_norm(h)
+       hidden_states = self.final_norm(hidden_states)

        # ── Step 4: Compute output logits ──
-       logits = self.lm_head(h)  # (B, S, vocab_size)
+       logits = self.lm_head(hidden_states)  # (batch_size, seq_len, vocab_size)

        # ── Step 5: Compute loss (during training) ──
        loss = None
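For reference, the checkpointing branch above follows the standard torch.utils.checkpoint pattern. Below is a minimal, standalone sketch, not from this repo: the nn.Linear layers and shapes are hypothetical stand-ins for the real transformer blocks. It shows the same wiring, where each checkpointed layer's activations are recomputed during backward instead of being stored in the forward pass.

import torch
import torch.nn as nn
import torch.utils.checkpoint

layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])  # hypothetical stand-in blocks
hidden_states = torch.randn(2, 8, 16, requires_grad=True)      # (batch_size, seq_len, hidden_dim)

for layer in layers:
    if layers.training and torch.is_grad_enabled():
        # Activations inside `layer` are not stored; they are recomputed in backward.
        hidden_states = torch.utils.checkpoint.checkpoint(
            layer, hidden_states,
            use_reentrant=False,  # non-reentrant variant, recommended on PyTorch >= 2.0
        )
    else:
        hidden_states = layer(hidden_states)

hidden_states.sum().backward()  # re-runs each checkpointed forward to compute gradients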
llm_lab/model/transformer_block.py
CHANGED

@@ -56,10 +56,10 @@ class TransformerBlock(nn.Module):
        """
        # ── Attention sublayer with residual ──
        # h = x + Attention(RMSNorm(x))
-       h = x + self.attention(self.attn_norm(x), mask, position_offset)
+       hidden_states = x + self.attention(self.attn_norm(x), mask, position_offset)

        # ── FFN sublayer with residual ──
        # out = h + FFN(RMSNorm(h))
-       out = h + self.feed_forward(self.ffn_norm(h))
+       out = hidden_states + self.feed_forward(self.ffn_norm(hidden_states))

        return out
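The two residual lines above are the usual pre-norm layout: each sublayer sees a normalized input, while the residual adds back the un-normalized stream. Below is a minimal, standalone sketch of that shape, not from this repo: LayerNorm and Identity are stand-ins for the real RMSNorm, attention, and FFN modules.

import torch
import torch.nn as nn

class PreNormBlock(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.attn_norm = nn.LayerNorm(dim)  # stand-in for RMSNorm
        self.attention = nn.Identity()      # stand-in for GroupedQueryAttention
        self.ffn_norm = nn.LayerNorm(dim)
        self.feed_forward = nn.Identity()   # stand-in for the FFN

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # hidden_states = x + Attention(RMSNorm(x))
        hidden_states = x + self.attention(self.attn_norm(x))
        # out = hidden_states + FFN(RMSNorm(hidden_states))
        return hidden_states + self.feed_forward(self.ffn_norm(hidden_states))

out = PreNormBlock()(torch.randn(2, 8, 16))  # (batch_size, seq_len, hidden_dim)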