|
|
|
|
|
|
|
|
|
|
| from dataclasses import dataclass, field
|
| from typing import Optional
|
| import math
|
|
|
|
|
@dataclass
class PyCraftConfig:
    """Hyperparameters for a PyCraft language model.

    Field order and defaults are part of the constructor signature. The
    derived quantities (``head_dim``, ``n_heads_per_kv``,
    ``param_count_approx``) are recomputed from the current field values on
    every access, so they stay consistent if a field is reassigned later.

    Raises:
        AssertionError: on construction, if ``n_heads`` or ``n_kv_heads`` is
            not positive, if ``n_heads`` is not a multiple of ``n_kv_heads``,
            or if ``d_model`` is not a multiple of ``n_heads``. The checks
            use explicit ``raise`` statements so they still run under
            ``python -O`` (plain ``assert`` statements are stripped there).
    """

    # --- Vocabulary / sequence ---
    # Number of token ids; sizes the embedding and LM head in
    # param_count_approx.
    vocab_size: int = 32000
    # Presumably the maximum context length; not used in this class.
    max_seq_len: int = 2048

    # --- Model shape ---
    # Model width (per-token hidden size).
    d_model: int = 512
    # Number of layers; multiplies the per-layer attention/FFN/norm counts.
    n_layers: int = 8
    # Number of query heads.
    n_heads: int = 8
    # Number of key/value heads; each one is shared by n_heads_per_kv
    # query heads.
    n_kv_heads: int = 2
    # Feed-forward hidden width; counted as three d_model x d_ff matrices.
    d_ff: int = 1408

    # --- Attention options (consumed outside this class) ---
    use_qk_norm: bool = True
    # Presumably the RoPE base frequency -- TODO confirm against the model.
    rope_theta: float = 10000.0
    attn_dropout: float = 0.0

    # --- Regularisation / output head ---
    dropout: float = 0.0
    # When True, param_count_approx assumes the LM head reuses the embedding
    # matrix and counts it only once.
    weight_tying: bool = False

    # --- Special token ids ---
    # Presumably fill-in-the-middle markers; None means "not configured".
    fim_prefix_id: Optional[int] = None
    fim_suffix_id: Optional[int] = None
    fim_middle_id: Optional[int] = None
    fim_pad_id: Optional[int] = None

    @property
    def head_dim(self) -> int:
        """Dimension of each attention head (``d_model // n_heads``).

        Raises:
            AssertionError: if ``d_model`` is not divisible by ``n_heads``.
        """
        if self.d_model % self.n_heads != 0:
            raise AssertionError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
            )
        return self.d_model // self.n_heads

    @property
    def n_heads_per_kv(self) -> int:
        """How many Q heads share each KV head (``n_heads // n_kv_heads``).

        Raises:
            AssertionError: if ``n_heads`` is not divisible by ``n_kv_heads``.
        """
        if self.n_heads % self.n_kv_heads != 0:
            raise AssertionError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
            )
        return self.n_heads // self.n_kv_heads

    @property
    def param_count_approx(self) -> int:
        """Rough parameter count for sanity checking.

        Sums the embedding table, the per-layer attention and feed-forward
        matrices, two ``d_model``-sized norm vectors per layer, and the LM
        head. With ``weight_tying=True`` the LM head is not counted again,
        since it is assumed to share the embedding matrix.
        """
        embed = self.vocab_size * self.d_model

        kv_width = self.n_kv_heads * self.head_dim
        attn_per_layer = (
            # two full d_model x d_model matrices (query and output side)
            2 * self.d_model * self.d_model
            # key and value projections onto the narrower KV width
            + 2 * self.d_model * kv_width
        )
        ffn_per_layer = 3 * self.d_model * self.d_ff
        norms_per_layer = 2 * self.d_model

        per_layer = attn_per_layer + ffn_per_layer + norms_per_layer

        # A tied head adds no parameters beyond the embedding table.
        lm_head = 0 if self.weight_tying else self.vocab_size * self.d_model

        return embed + self.n_layers * per_layer + lm_head

    def __post_init__(self):
        """Validate the head configuration right after construction.

        Raises:
            AssertionError: see the class docstring for the exact conditions.
        """
        # Guard the divisors first so a zero head count gives a clear
        # message instead of a ZeroDivisionError from the modulo below.
        if self.n_heads <= 0 or self.n_kv_heads <= 0:
            raise AssertionError(
                f"n_heads and n_kv_heads must be positive. "
                f"Got {self.n_heads} and {self.n_kv_heads}."
            )

        if self.n_heads % self.n_kv_heads != 0:
            raise AssertionError(
                f"n_heads must be divisible by n_kv_heads. "
                f"Got {self.n_heads} and {self.n_kv_heads}."
            )

        if self.d_model % self.n_heads != 0:
            raise AssertionError("d_model must be divisible by n_heads.")
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_config_120m() -> PyCraftConfig:
    """
    Build the PyCraft-1 preset (labelled "120M parameters").

    Intended to be trainable on a single RTX 3050 4GB laptop GPU.

    NOTE(review): with these settings ``param_count_approx`` evaluates to
    roughly 55.3M, not 120M -- confirm which figure is intended.

    Returns:
        A new ``PyCraftConfig``; fields not listed here keep their defaults.
    """
    # Grouped only for readability; all groups feed one constructor call.
    data_shape = {"vocab_size": 32000, "max_seq_len": 1024}
    model_shape = {
        "d_model": 512,
        "n_layers": 8,
        "n_heads": 8,
        "n_kv_heads": 2,
        "d_ff": 1408,
    }
    extras = {"use_qk_norm": True, "rope_theta": 10000.0, "dropout": 0.1}

    return PyCraftConfig(**data_shape, **model_shape, **extras)
|
|
|
|
|
def get_config_tiny() -> PyCraftConfig:
    """
    Build the PyCraft-tiny preset (labelled "~ 15M parameters").

    A small configuration for rapid iteration and for smoke-testing the
    training loop; run this before committing to a full training run.

    NOTE(review): with these settings ``param_count_approx`` evaluates to
    roughly 19.3M (most of it embedding and LM head) -- confirm the label.

    Returns:
        A new ``PyCraftConfig``; fields not listed here keep their defaults.
    """
    # Grouped only for readability; all groups feed one constructor call.
    data_shape = {"vocab_size": 32000, "max_seq_len": 512}
    model_shape = {
        "d_model": 256,
        "n_layers": 4,
        "n_heads": 4,
        "n_kv_heads": 2,
        "d_ff": 704,
    }
    extras = {"use_qk_norm": True, "rope_theta": 10000.0}

    return PyCraftConfig(**data_shape, **model_shape, **extras)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build both presets and print a short summary so
    # the configuration can be eyeballed without starting a training run.
    cfg = get_config_120m()
    params_m = cfg.param_count_approx / 1e6

    # Collect the report first, then emit it in one write; the output is the
    # same as printing each line separately.
    report = [
        "PyCraft-1 config loaded.",
        f" d_model : {cfg.d_model}",
        f" n_layers : {cfg.n_layers}",
        f" n_heads : {cfg.n_heads} (Q)",
        f" n_kv_heads : {cfg.n_kv_heads} (KV, GQA {cfg.n_heads_per_kv}:1)",
        f" head_dim : {cfg.head_dim}",
        f" d_ff : {cfg.d_ff} (SwiGLU)",
        f" QK-Norm : {cfg.use_qk_norm}",
        f" Approx params: {params_m:.1f}M",
    ]
    print("\n".join(report))

    cfg_tiny = get_config_tiny()
    # The leading newline separates the two reports with a blank line.
    print("\nPyCraft-tiny config loaded.")
    print(f" Approx params: {cfg_tiny.param_count_approx / 1e6:.1f}M")
|
|
|