"""
Feed-Forward Network for SLM.
Uses GELU activation (not SwiGLU) for better INT8 quantization.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from .config import SLMConfig
class FeedForward(nn.Module):
    """Position-wise feed-forward block with GELU activation.

    Two linear maps with a tanh-approximated GELU between them:

        Linear(hidden -> intermediate) -> GELU -> Linear(intermediate -> hidden)

    Tensor flow for the default config:
    - in:     [batch, seq, hidden_size=768]
    - middle: [batch, seq, intermediate_size=3072]
    - out:    [batch, seq, hidden_size=768]

    GELU was chosen over SwiGLU deliberately:
    - only 2 matmuls instead of 3
    - quantizes more cleanly to INT8
    - runs on QNN without operator decomposition
    - SwiGLU's gains mostly show up past ~1B parameters
    """

    def __init__(self, config: SLMConfig):
        """Build the two projections from the model configuration.

        Args:
            config: Model configuration providing ``hidden_size``,
                ``intermediate_size`` and ``dropout``.
        """
        super().__init__()
        self.dropout = config.dropout
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        # Expand to the intermediate width, then contract back; both
        # projections are bias-free.
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the FFN to ``x``.

        Args:
            x: Input tensor [batch, seq, hidden_size]

        Returns:
            Output tensor [batch, seq, hidden_size]
        """
        # Expand + activate in one expression, then project back down.
        activated = F.gelu(self.up_proj(x), approximate="tanh")
        out = self.down_proj(activated)
        # Dropout only fires in training mode and when the rate is non-zero.
        if self.dropout > 0 and self.training:
            out = F.dropout(out, p=self.dropout)
        return out