|
|
""" |
|
|
Model configuration for SLM v1. |
|
|
Defines all hyperparameters based on architecture specification. |
|
|
""" |
|
|
|
|
|
from dataclasses import asdict, dataclass
from typing import Optional

import yaml
|
|
|
|
|
|
|
|
@dataclass
class SLMConfig:
    """Configuration class for the SLM model.

    Architecture: 120M parameter decoder-only transformer
    - 8 layers, 1024 hidden size, 16 attention heads
    - RMSNorm (pre-norm), GELU FFN, RoPE positions
    - Explicit KV cache for efficient inference
    """

    # Core transformer dimensions.
    vocab_size: int = 16384
    hidden_size: int = 1024
    num_layers: int = 8
    num_heads: int = 16
    head_dim: int = 64  # must equal hidden_size // num_heads (checked below)
    intermediate_size: int = 4096

    # Rotary position embedding (RoPE) settings.
    max_position_embeddings: int = 1024
    rope_theta: float = 10000.0

    # RMSNorm epsilon for numerical stability.
    rms_norm_eps: float = 1e-6

    # Share the input embedding matrix with the output projection
    # (saves vocab_size * hidden_size parameters).
    tie_word_embeddings: bool = True

    # Dropout probabilities (disabled by default).
    dropout: float = 0.0
    attention_dropout: float = 0.0

    # Default torch dtype name used when materializing weights.
    torch_dtype: str = "float16"

    def __post_init__(self) -> None:
        """Validate configuration after initialization.

        Raises:
            ValueError: If hidden_size is not divisible by num_heads, or
                head_dim is inconsistent with hidden_size // num_heads.
        """
        # Raise explicitly rather than via ``assert`` so validation still
        # runs under ``python -O`` (asserts are stripped there).
        if self.hidden_size % self.num_heads != 0:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by "
                f"num_heads ({self.num_heads})"
            )
        if self.head_dim != self.hidden_size // self.num_heads:
            raise ValueError(
                f"head_dim ({self.head_dim}) must equal hidden_size // "
                f"num_heads ({self.hidden_size // self.num_heads})"
            )

    @classmethod
    def from_yaml(cls, path: str) -> "SLMConfig":
        """Load configuration from the ``model`` section of a YAML file.

        Args:
            path: Path to a YAML file; its top-level ``model`` mapping (if
                any) supplies keyword arguments for the constructor.

        Returns:
            A validated SLMConfig instance (defaults fill missing keys).
        """
        with open(path, "r") as f:
            # safe_load returns None for an empty file; fall back to {} so
            # an empty config yields all defaults instead of crashing.
            config_dict = yaml.safe_load(f) or {}

        model_config = config_dict.get("model", {})
        return cls(**model_config)

    def to_dict(self) -> dict:
        """Convert configuration to a plain dictionary of all fields."""
        # asdict() emits every dataclass field in declaration order, so this
        # stays in sync automatically when fields are added or renamed.
        return asdict(self)

    @property
    def num_parameters(self) -> int:
        """Estimate total number of parameters."""
        # Token embedding table.
        embedding_params = self.vocab_size * self.hidden_size

        # Per layer: Q/K/V/O projections, two FFN matrices (up + down),
        # and two RMSNorm weight vectors (pre-attention, pre-FFN).
        attention_params = 4 * self.hidden_size * self.hidden_size
        ffn_params = 2 * self.hidden_size * self.intermediate_size
        norm_params = 2 * self.hidden_size

        layer_params = attention_params + ffn_params + norm_params
        total_layer_params = self.num_layers * layer_params

        # LM head is free when tied to the input embeddings.
        output_params = 0 if self.tie_word_embeddings else self.vocab_size * self.hidden_size

        # Final RMSNorm before the LM head.
        final_norm_params = self.hidden_size

        return embedding_params + total_layer_params + output_params + final_norm_params

    def __repr__(self) -> str:
        params_m = self.num_parameters / 1e6
        return (
            f"SLMConfig(\n"
            f"  vocab_size={self.vocab_size},\n"
            f"  hidden_size={self.hidden_size},\n"
            f"  num_layers={self.num_layers},\n"
            f"  num_heads={self.num_heads},\n"
            f"  max_position_embeddings={self.max_position_embeddings},\n"
            f"  estimated_params={params_m:.1f}M\n"
            f")"
        )
|
|
|