Replace F.silu with explicit SiLU implementation in SwiGLUFeedForward
Replaces F.silu(x) with x * sigmoid(x) to make the Swish activation
formula directly visible in code, consistent with the explicit attention
implementation approach used in this project.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
llm_lab/model/feedforward.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
| 5 |
-
import torch.nn.functional as F
|
| 6 |
|
| 7 |
from llm_lab.config import ModelConfig
|
| 8 |
|
|
@@ -41,7 +40,8 @@ class SwiGLUFeedForward(nn.Module):
|
|
| 41 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 42 |
#
|
| 43 |
# 1) gate: decides which information to pass through (Swish activation)
|
| 44 |
-
|
|
|
|
| 45 |
# 2) up: projects information to a higher dimension
|
| 46 |
up = self.up_proj(x)
|
| 47 |
# 3) element-wise multiplication (gating) → project back to original dimension
|
|
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
|
|
|
| 5 |
|
| 6 |
from llm_lab.config import ModelConfig
|
| 7 |
|
|
|
|
| 40 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 41 |
#
|
| 42 |
# 1) gate: decides which information to pass through (Swish activation)
|
| 43 |
+
gate_val = self.gate_proj(x)
|
| 44 |
+
gate = gate_val * torch.sigmoid(gate_val) # SiLU(x) = x * sigmoid(x)
|
| 45 |
# 2) up: projects information to a higher dimension
|
| 46 |
up = self.up_proj(x)
|
| 47 |
# 3) element-wise multiplication (gating) → project back to original dimension
|