# model/config.py
# PyCraft-1 model configuration
# All architectural hyperparameters live here.
# Other files import this — never hardcode numbers elsewhere.

from dataclasses import dataclass, field
from typing import Optional
import math


@dataclass
class PyCraftConfig:
    """Architectural hyperparameters for a PyCraft transformer model.

    Construction validates the head layout: ``d_model``, ``n_heads`` and
    ``n_kv_heads`` must be positive, ``n_heads`` must be a multiple of
    ``n_kv_heads`` (grouped-query attention) and ``d_model`` must be a
    multiple of ``n_heads``.

    Raises:
        ValueError: if any of the constraints above is violated. Explicit
            exceptions are used instead of ``assert`` so the checks still
            run under ``python -O``.
    """

    # ------------------------------------------------------------------ #
    # Vocabulary & sequence
    # ------------------------------------------------------------------ #
    vocab_size: int = 32000   # BPE tokenizer vocab (trained in Phase 2)
    max_seq_len: int = 2048   # context window

    # ------------------------------------------------------------------ #
    # Model dimensions
    # ------------------------------------------------------------------ #
    d_model: int = 512        # embedding / hidden dimension
    n_layers: int = 8         # number of transformer blocks
    n_heads: int = 8          # number of query heads
    # number of key/value heads (GQA 4:1 ratio)
    n_kv_heads: int = 2
    # SwiGLU FFN intermediate dim.
    # Standard formula: (4 * d_model * 2/3), rounded UP to a multiple of 64.
    # 512 * 4 * 2/3 = 1365.3 → rounded up to 1408 (22 * 64) for clean tensor ops
    d_ff: int = 1408

    # ------------------------------------------------------------------ #
    # Attention settings
    # ------------------------------------------------------------------ #
    use_qk_norm: bool = True      # QK-Norm (OLMo 2 / Qwen 3 technique)
    rope_theta: float = 10000.0   # RoPE base frequency
    attn_dropout: float = 0.0     # keep 0.0 during pretraining

    # ------------------------------------------------------------------ #
    # Training knobs
    # ------------------------------------------------------------------ #
    dropout: float = 0.0          # residual dropout (0 for pretraining)
    # tie input embedding ↔ output projection
    # False by default: embedding and LM head keep separate weights
    weight_tying: bool = False

    # ------------------------------------------------------------------ #
    # FIM (Fill-in-the-Middle) special token IDs
    # These will be set after tokenizer training; until then they are None.
    # ------------------------------------------------------------------ #
    fim_prefix_id: Optional[int] = None
    fim_suffix_id: Optional[int] = None
    fim_middle_id: Optional[int] = None
    fim_pad_id: Optional[int] = None

    # ------------------------------------------------------------------ #
    # Derived / computed properties
    # ------------------------------------------------------------------ #
    @property
    def head_dim(self) -> int:
        """Dimension of each attention head (``d_model // n_heads``).

        Re-checked on every access because the dataclass is mutable and
        fields may have been changed after construction.

        Raises:
            ValueError: if ``d_model`` is not divisible by ``n_heads``.
        """
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
            )
        return self.d_model // self.n_heads

    @property
    def n_heads_per_kv(self) -> int:
        """How many Q heads share each KV head (``n_heads // n_kv_heads``).

        Raises:
            ValueError: if ``n_heads`` is not divisible by ``n_kv_heads``.
        """
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
            )
        return self.n_heads // self.n_kv_heads

    @property
    def param_count_approx(self) -> int:
        """Rough parameter count for sanity checking.

        Counts the token embedding, the attention projections (Wq, Wk, Wv,
        Wo), the three SwiGLU matrices, two norm weight vectors per block,
        and the output projection. The output projection is only counted
        when ``weight_tying`` is False; with tied weights it shares the
        embedding matrix and must not be counted twice. Nothing else is
        included, hence "approx".
        """
        kv_width = self.n_kv_heads * self.head_dim
        embed = self.vocab_size * self.d_model
        attn_per_layer = (
            self.d_model * self.d_model      # Wq
            + 2 * self.d_model * kv_width    # Wk, Wv
            + self.d_model * self.d_model    # Wo
        )
        ffn_per_layer = 3 * self.d_model * self.d_ff   # SwiGLU: gate, up, down
        norms_per_layer = 2 * self.d_model             # RMSNorm per block x2
        per_layer = attn_per_layer + ffn_per_layer + norms_per_layer
        # Output projection: zero extra parameters when tied to the embedding.
        lm_head = 0 if self.weight_tying else self.vocab_size * self.d_model
        return embed + self.n_layers * per_layer + lm_head

    def __post_init__(self):
        """Validate the head layout right after dataclass initialisation.

        Raises:
            ValueError: if a size is non-positive or a divisibility
                constraint is violated.
        """
        # Positivity first: a zero divisor would otherwise surface as an
        # unhelpful ZeroDivisionError, and negative sizes would pass the
        # modulo checks below.
        for name in ("d_model", "n_heads", "n_kv_heads"):
            value = getattr(self, name)
            if value <= 0:
                raise ValueError(f"{name} must be a positive integer, got {value}.")
        # Validate GQA constraint
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads must be divisible by n_kv_heads. "
                f"Got {self.n_heads} and {self.n_kv_heads}."
            )
        # Validate head dimension
        if self.d_model % self.n_heads != 0:
            raise ValueError("d_model must be divisible by n_heads.")


# ------------------------------------------------------------------ #
# Convenience presets
# ------------------------------------------------------------------ #
def get_config_120m() -> PyCraftConfig:
    """
    PyCraft-1 preset (nominally "120M parameters").
    Designed to train on a single RTX 3050 4GB laptop GPU.

    NOTE(review): with these values ``param_count_approx`` evaluates to
    55,320,576 (~55.3M), not 120M — confirm the intended size.
    NOTE(review): ``dropout=0.1`` differs from the class default of 0.0,
    which the field comment recommends for pretraining — confirm intent.
    """
    return PyCraftConfig(
        vocab_size=32000,
        max_seq_len=1024,
        d_model=512,
        n_layers=8,
        n_heads=8,
        n_kv_heads=2,
        d_ff=1408,
        use_qk_norm=True,
        rope_theta=10000.0,
        dropout=0.1,
    )


def get_config_tiny() -> PyCraftConfig:
    """
    PyCraft-tiny preset for rapid iteration and smoke-testing the
    training loop. Use this first before committing to a full training run.

    NOTE(review): originally described as "~15M parameters";
    ``param_count_approx`` evaluates to 19,335,168 (~19.3M) for these values.
    """
    return PyCraftConfig(
        vocab_size=32000,
        max_seq_len=512,
        d_model=256,
        n_layers=4,
        n_heads=4,
        n_kv_heads=2,
        d_ff=704,   # 256 * 4 * 2/3 = 682.7 → rounded up to 704 (11 * 64)
        use_qk_norm=True,
        rope_theta=10000.0,
    )


# ------------------------------------------------------------------ #
# Quick self-test
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    cfg = get_config_120m()
    params_m = cfg.param_count_approx / 1e6
    print("PyCraft-1 config loaded.")
    print(f" d_model : {cfg.d_model}")
    print(f" n_layers : {cfg.n_layers}")
    print(f" n_heads : {cfg.n_heads} (Q)")
    print(f" n_kv_heads : {cfg.n_kv_heads} (KV, GQA {cfg.n_heads_per_kv}:1)")
    print(f" head_dim : {cfg.head_dim}")
    print(f" d_ff : {cfg.d_ff} (SwiGLU)")
    print(f" QK-Norm : {cfg.use_qk_norm}")
    print(f" Approx params: {params_m:.1f}M")

    cfg_tiny = get_config_tiny()
    print("\nPyCraft-tiny config loaded.")
    print(f" Approx params: {cfg_tiny.param_count_approx / 1e6:.1f}M")