| | """ |
| | Model configuration for SLM v1. |
| | Defines all hyperparameters based on architecture specification. |
| | """ |
| |
|
| | from dataclasses import dataclass |
| | from typing import Optional |
| | import yaml |
| |
|
| |
|
@dataclass
class SLMConfig:
    """Configuration class for the SLM model.

    Architecture: 120M parameter decoder-only transformer
    - 8 layers, 1024 hidden size, 16 attention heads
    - RMSNorm (pre-norm), GELU FFN, RoPE positions
    - Explicit KV cache for efficient inference
    """

    # Core transformer dimensions
    vocab_size: int = 16384
    hidden_size: int = 1024
    num_layers: int = 8
    num_heads: int = 16
    head_dim: int = 64  # must equal hidden_size // num_heads (checked below)
    intermediate_size: int = 4096

    # Rotary position embeddings
    max_position_embeddings: int = 1024
    rope_theta: float = 10000.0

    # RMSNorm numerical-stability epsilon
    rms_norm_eps: float = 1e-6

    # Reuse the input embedding matrix as the LM head (saves vocab*hidden params)
    tie_word_embeddings: bool = True

    # Regularization (disabled by default)
    dropout: float = 0.0
    attention_dropout: float = 0.0

    # Parameter dtype as a string (HF-config style)
    torch_dtype: str = "float16"

    def __post_init__(self) -> None:
        """Validate configuration after initialization.

        Raises:
            ValueError: if ``hidden_size`` is not divisible by ``num_heads``,
                or ``head_dim`` disagrees with ``hidden_size // num_heads``.
        """
        # Raise explicitly instead of using `assert`: assertions are
        # stripped when Python runs with -O, which would silently skip
        # this validation.
        if self.hidden_size % self.num_heads != 0:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by "
                f"num_heads ({self.num_heads})"
            )
        expected_head_dim = self.hidden_size // self.num_heads
        if self.head_dim != expected_head_dim:
            raise ValueError(
                f"head_dim ({self.head_dim}) must equal "
                f"hidden_size // num_heads ({expected_head_dim})"
            )

    @classmethod
    def from_yaml(cls, path: str) -> "SLMConfig":
        """Load configuration from YAML file.

        The file is expected to contain a top-level ``model`` mapping whose
        keys match this dataclass's field names; a missing ``model`` section
        yields all defaults. Unknown keys raise ``TypeError`` via ``cls(**...)``.
        """
        with open(path, "r") as f:
            config_dict = yaml.safe_load(f)

        model_config = config_dict.get("model", {})
        return cls(**model_config)

    def to_dict(self) -> dict:
        """Convert configuration to a plain dictionary (all fields)."""
        return {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "num_layers": self.num_layers,
            "num_heads": self.num_heads,
            "head_dim": self.head_dim,
            "intermediate_size": self.intermediate_size,
            "max_position_embeddings": self.max_position_embeddings,
            "rope_theta": self.rope_theta,
            "rms_norm_eps": self.rms_norm_eps,
            "tie_word_embeddings": self.tie_word_embeddings,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
            "torch_dtype": self.torch_dtype,
        }

    @property
    def num_parameters(self) -> int:
        """Estimate total number of parameters.

        Counts weight matrices only; per-layer breakdown assumes fused
        Q/K/V/O projections of size hidden*hidden each, a 2-matrix GELU FFN,
        and two RMSNorms (pre-attention, pre-FFN) per layer.
        """
        # Token embedding table.
        embedding_params = self.vocab_size * self.hidden_size

        # Per-layer: Q, K, V, O projections (4 * h * h), up/down FFN
        # projections (2 * h * intermediate), and two RMSNorm gains.
        attention_params = 4 * self.hidden_size * self.hidden_size
        ffn_params = 2 * self.hidden_size * self.intermediate_size
        norm_params = 2 * self.hidden_size

        layer_params = attention_params + ffn_params + norm_params
        total_layer_params = self.num_layers * layer_params

        # Untied LM head adds a second vocab*hidden matrix; tied head is free.
        output_params = 0 if self.tie_word_embeddings else self.vocab_size * self.hidden_size

        # Final RMSNorm before the LM head.
        final_norm_params = self.hidden_size

        return embedding_params + total_layer_params + output_params + final_norm_params

    def __repr__(self) -> str:
        params_m = self.num_parameters / 1e6
        return (
            f"SLMConfig(\n"
            f"  vocab_size={self.vocab_size},\n"
            f"  hidden_size={self.hidden_size},\n"
            f"  num_layers={self.num_layers},\n"
            f"  num_heads={self.num_heads},\n"
            f"  max_position_embeddings={self.max_position_embeddings},\n"
            f"  estimated_params={params_m:.1f}M\n"
            f")"
        )
| |
|