|
|
""" |
|
|
Feed-Forward Network for SLM. |
|
|
Uses GELU activation (not SwiGLU) for better INT8 quantization. |
|
|
""" |
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
|
|
|
from .config import SLMConfig |
|
|
|
|
|
|
|
|
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear.

    Shapes (with the default config of hidden_size=768, intermediate_size=3072):
    - input:        [batch, seq, 768]
    - intermediate: [batch, seq, 3072]
    - output:       [batch, seq, 768]

    GELU is used instead of SwiGLU deliberately:
    - only 2 matmuls instead of 3
    - quantizes better to INT8
    - supported natively by QNN (no op decomposition)
    - SwiGLU's gains mostly show up above ~1B parameters
    """

    def __init__(self, config: SLMConfig):
        """Build the two projection layers from the model configuration.

        Args:
            config: Model configuration (provides hidden_size,
                intermediate_size, and dropout probability).
        """
        super().__init__()

        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Bias-free expansion to the intermediate width...
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        # ...and bias-free contraction back to the model width.
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)

        # Dropout probability is stored as a plain float; applied functionally
        # in forward() only while training.
        self.dropout = config.dropout

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the FFN to a batch of hidden states.

        Args:
            x: Input tensor of shape [batch, seq, hidden_size]

        Returns:
            Tensor of shape [batch, seq, hidden_size]
        """
        # Expand and activate in one step (tanh-approximate GELU, which is the
        # variant with stable INT8 behavior on the target hardware).
        activated = F.gelu(self.up_proj(x), approximate="tanh")

        # Contract back down to the residual-stream width.
        out = self.down_proj(activated)

        # Functional dropout defaults to training=True, so only call it when
        # the module is actually in training mode and dropout is enabled.
        if self.dropout > 0 and self.training:
            out = F.dropout(out, p=self.dropout)

        return out
|
|
|