Replace F.silu with explicit SiLU implementation in SwiGLUFeedForward
Replaces F.silu(x) with x * sigmoid(x) to make the Swish activation
formula directly visible in code, consistent with the explicit attention
implementation approach used in this project.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
llm_lab/model/feedforward.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
| 5 |
-
import torch.nn.functional as F
|
| 6 |
|
| 7 |
from llm_lab.config import ModelConfig
|
| 8 |
|
|
@@ -41,7 +40,8 @@ class SwiGLUFeedForward(nn.Module):
|
|
| 41 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 42 |
#
|
| 43 |
# 1) gate: decides which information to pass through (Swish activation)
|
| 44 |
-
|
|
|
|
| 45 |
# 2) up: projects information to a higher dimension
|
| 46 |
up = self.up_proj(x)
|
| 47 |
# 3) element-wise multiplication (gating) → project back to original dimension
|
|
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
|
|
|
| 5 |
|
| 6 |
from llm_lab.config import ModelConfig
|
| 7 |
|
|
|
|
| 40 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 41 |
#
|
| 42 |
# 1) gate: decides which information to pass through (Swish activation)
|
| 43 |
+
gate_val = self.gate_proj(x)
|
| 44 |
+
gate = gate_val * torch.sigmoid(gate_val) # SiLU(x) = x * sigmoid(x)
|
| 45 |
# 2) up: projects information to a higher dimension
|
| 46 |
up = self.up_proj(x)
|
| 47 |
# 3) element-wise multiplication (gating) → project back to original dimension
|