Vjeong Claude Sonnet 4.6 committed on
Commit
baf4768
·
1 Parent(s): e072b51

Replace F.silu with explicit SiLU implementation in SwiGLUFeedForward

Browse files

Replaces F.silu(x) with x * sigmoid(x) to make the Swish activation
formula directly visible in code, consistent with the explicit attention
implementation approach used in this project.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. llm_lab/model/feedforward.py +2 -2
llm_lab/model/feedforward.py CHANGED
@@ -2,7 +2,6 @@
2
 
3
  import torch
4
  import torch.nn as nn
5
- import torch.nn.functional as F
6
 
7
  from llm_lab.config import ModelConfig
8
 
@@ -41,7 +40,8 @@ class SwiGLUFeedForward(nn.Module):
41
  # SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
42
  #
43
  # 1) gate: decides which information to pass through (Swish activation)
44
- gate = F.silu(self.gate_proj(x)) # silu = Swish = x * sigmoid(x)
 
45
  # 2) up: projects information to a higher dimension
46
  up = self.up_proj(x)
47
  # 3) element-wise multiplication (gating) → project back to original dimension
 
2
 
3
  import torch
4
  import torch.nn as nn
 
5
 
6
  from llm_lab.config import ModelConfig
7
 
 
40
  # SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
41
  #
42
  # 1) gate: decides which information to pass through (Swish activation)
43
+ gate_val = self.gate_proj(x)
44
+ gate = gate_val * torch.sigmoid(gate_val) # SiLU(x) = x * sigmoid(x)
45
  # 2) up: projects information to a higher dimension
46
  up = self.up_proj(x)
47
  # 3) element-wise multiplication (gating) → project back to original dimension