pycraft-1 / model /config.py
imshadow0's picture
Upload model/config.py with huggingface_hub
82036d2 verified
Raw
History Blame Contribute Delete
6.39 kB
# model/config.py
# PyCraft-1 model configuration
# All architectural hyperparameters live here.
# Other files import this — never hardcode numbers elsewhere.
from dataclasses import dataclass, field
from typing import Optional
import math
class _ConfigValidationError(ValueError, AssertionError):
    """Raised when a PyCraftConfig holds inconsistent hyperparameters.

    Derives from ValueError (the conventional type for bad values) and from
    AssertionError so that callers written against the earlier assert-based
    checks keep working.
    """


@dataclass
class PyCraftConfig:
    """Architectural hyperparameters for a PyCraft model.

    Head-count consistency is checked on construction (see __post_init__).
    Unlike ``assert`` statements, these checks also run under ``python -O``.

    Raises:
        ValueError: (also an AssertionError) if n_heads or n_kv_heads is not
            positive, if n_heads is not divisible by n_kv_heads, or if
            d_model is not divisible by n_heads.
    """

    # ------------------------------------------------------------------ #
    # Vocabulary & sequence
    # ------------------------------------------------------------------ #
    vocab_size: int = 32000  # BPE tokenizer vocab (trained in Phase 2)
    max_seq_len: int = 2048  # context window

    # ------------------------------------------------------------------ #
    # Model dimensions
    # ------------------------------------------------------------------ #
    d_model: int = 512  # embedding / hidden dimension
    n_layers: int = 8  # number of transformer blocks
    n_heads: int = 8  # number of query heads
    # number of key/value heads (GQA 4:1 ratio with the default n_heads)
    n_kv_heads: int = 2
    # SwiGLU FFN intermediate dim.
    # Standard formula: 4 * d_model * 2/3, rounded UP to a multiple of 64.
    # 512 * 4 * 2/3 = 1365.3 → 1408 (= 22 * 64) for clean tensor ops.
    d_ff: int = 1408

    # ------------------------------------------------------------------ #
    # Attention settings
    # ------------------------------------------------------------------ #
    use_qk_norm: bool = True  # QK-Norm (OLMo 2 / Qwen 3 technique)
    rope_theta: float = 10000.0  # RoPE base frequency
    attn_dropout: float = 0.0  # keep 0.0 during pretraining

    # ------------------------------------------------------------------ #
    # Training knobs
    # ------------------------------------------------------------------ #
    dropout: float = 0.0  # residual dropout (0 for pretraining)
    # Tie input embedding ↔ output projection. Untied by default; the
    # original rationale was "enough params at 120M".
    # NOTE(review): param_count_approx gives ~55.3M for these defaults.
    weight_tying: bool = False

    # ------------------------------------------------------------------ #
    # FIM (Fill-in-the-Middle) special token IDs
    # Unset (None) until the tokenizer has been trained; fill them in from
    # the trained tokenizer's vocabulary.
    # ------------------------------------------------------------------ #
    fim_prefix_id: Optional[int] = None
    fim_suffix_id: Optional[int] = None
    fim_middle_id: Optional[int] = None
    fim_pad_id: Optional[int] = None

    # ------------------------------------------------------------------ #
    # Derived / computed properties
    # ------------------------------------------------------------------ #
    @property
    def head_dim(self) -> int:
        """Dimension of each attention head (d_model // n_heads).

        Re-validated on every access because the dataclass is mutable.
        """
        # Test positivity first so n_heads == 0 cannot hit a modulo-by-zero.
        if self.n_heads <= 0 or self.d_model % self.n_heads != 0:
            raise _ConfigValidationError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
            )
        return self.d_model // self.n_heads

    @property
    def n_heads_per_kv(self) -> int:
        """How many Q heads share each KV head (n_heads // n_kv_heads)."""
        if self.n_kv_heads <= 0 or self.n_heads % self.n_kv_heads != 0:
            raise _ConfigValidationError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
            )
        return self.n_heads // self.n_kv_heads

    @property
    def param_count_approx(self) -> int:
        """Rough parameter count for sanity checking.

        Counts the token embedding, per-layer attention projections
        (Wq, Wk, Wv, Wo), the three SwiGLU matrices, two norm vectors per
        block, and the output projection. The output projection is counted
        only when weight_tying is False; a tied head reuses the embedding
        matrix and adds no parameters.
        """
        kv_width = self.n_kv_heads * self.head_dim
        attn_per_layer = (
            self.d_model * self.d_model  # Wq
            + 2 * self.d_model * kv_width  # Wk, Wv
            + self.d_model * self.d_model  # Wo
        )
        ffn_per_layer = 3 * self.d_model * self.d_ff  # SwiGLU: gate, up, down
        norms_per_layer = 2 * self.d_model  # RMSNorm per block x2
        embed = self.vocab_size * self.d_model
        # A tied LM head shares the embedding weights: nothing extra to count.
        lm_head = 0 if self.weight_tying else self.vocab_size * self.d_model
        per_layer = attn_per_layer + ffn_per_layer + norms_per_layer
        return embed + self.n_layers * per_layer + lm_head

    def __post_init__(self):
        """Validate head-count constraints once the fields are populated."""
        # Reject non-positive head counts up front: zero would otherwise
        # surface as a bare ZeroDivisionError, and negatives would slip
        # through the divisibility checks below.
        if self.n_heads <= 0 or self.n_kv_heads <= 0:
            raise _ConfigValidationError(
                "n_heads and n_kv_heads must be positive integers. "
                f"Got {self.n_heads} and {self.n_kv_heads}."
            )
        # Validate GQA constraint
        if self.n_heads % self.n_kv_heads != 0:
            raise _ConfigValidationError(
                "n_heads must be divisible by n_kv_heads. "
                f"Got {self.n_heads} and {self.n_kv_heads}."
            )
        # Validate head dimension
        if self.d_model % self.n_heads != 0:
            raise _ConfigValidationError(
                "d_model must be divisible by n_heads."
            )
# ------------------------------------------------------------------ #
# Convenience presets
# ------------------------------------------------------------------ #
def get_config_120m() -> PyCraftConfig:
    """Build the main PyCraft-1 preset (labelled "120M parameters").

    Intended to be trainable on a single RTX 3050 4GB laptop GPU. The
    context window is 1024 tokens and residual dropout is 0.1; both differ
    from the PyCraftConfig field defaults (2048 and 0.0).

    NOTE(review): with these values PyCraftConfig.param_count_approx
    evaluates to roughly 55.3M, not 120M — reconcile the name/label with
    the actual hyperparameters.
    """
    # Every architectural knob is spelled out so the preset does not drift
    # if the dataclass defaults are changed later.
    settings = {
        "vocab_size": 32000,
        "max_seq_len": 1024,
        "d_model": 512,
        "n_layers": 8,
        "n_heads": 8,
        "n_kv_heads": 2,
        "d_ff": 1408,
        "use_qk_norm": True,
        "rope_theta": 10000.0,
        "dropout": 0.1,
    }
    return PyCraftConfig(**settings)
def get_config_tiny() -> PyCraftConfig:
    """Build the PyCraft-tiny preset (labelled "~15M parameters").

    A small model for rapid iteration and for smoke-testing the training
    loop; run it before committing to a full training run.

    NOTE(review): with these values PyCraftConfig.param_count_approx
    evaluates to roughly 19.3M (most of it the two 32000 x 256 vocabulary
    matrices) — confirm the "~15M" label.
    """
    # d_ff follows the same rule as the main preset:
    # 256 * 4 * 2/3 = 682.7, rounded up to 704 (= 11 * 64).
    settings = {
        "vocab_size": 32000,
        "max_seq_len": 512,
        "d_model": 256,
        "n_layers": 4,
        "n_heads": 4,
        "n_kv_heads": 2,
        "d_ff": 704,
        "use_qk_norm": True,
        "rope_theta": 10000.0,
    }
    return PyCraftConfig(**settings)
# ------------------------------------------------------------------ #
# Quick self-test
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Smoke test: build both presets and print a short summary of each.
    cfg = get_config_120m()
    params_m = cfg.param_count_approx / 1e6
    summary = [
        "PyCraft-1 config loaded.",
        f" d_model : {cfg.d_model}",
        f" n_layers : {cfg.n_layers}",
        f" n_heads : {cfg.n_heads} (Q)",
        f" n_kv_heads : {cfg.n_kv_heads} (KV, GQA {cfg.n_heads_per_kv}:1)",
        f" head_dim : {cfg.head_dim}",
        f" d_ff : {cfg.d_ff} (SwiGLU)",
        f" QK-Norm : {cfg.use_qk_norm}",
        f" Approx params: {params_m:.1f}M",
    ]
    for line in summary:
        print(line)

    # The tiny preset only reports its size.
    cfg_tiny = get_config_tiny()
    tiny_params_m = cfg_tiny.param_count_approx / 1e6
    print("\nPyCraft-tiny config loaded.")
    print(f" Approx params: {tiny_params_m:.1f}M")