""" config.py — Hyperparameters for the Liquid Chess Model (LCM). All model dimensions, training objectives, and architectural choices are defined here. Nothing is hardcoded in model.py. """ from dataclasses import dataclass @dataclass class ChessModelConfig: # ── Vocabulary ──────────────────────────────────────────────────────────── # 1977 tokens: 9 special tokens + 1968 UCI moves. vocab_size: int = 1977 # ── Sequence ────────────────────────────────────────────────────────────── # 1 POV token + 253 moves + 1 terminal token = 255. max_seq_len: int = 255 # ── Model dimensions ────────────────────────────────────────────────────── d_model: int = 512 # ── Depth ───────────────────────────────────────────────────────────────── # 16 total layers: 6 GQA attention + 10 LIV convolution. # GQA layers are distributed evenly via Bresenham algorithm. n_layers: int = 16 n_gqa_layers: int = 6 # ── Attention (GQA) ─────────────────────────────────────────────────────── n_heads: int = 8 # query heads n_kv_heads: int = 2 # key-value heads (4 query heads share each KV head) # ── Feed-Forward Network ────────────────────────────────────────────────── ffn_expansion: float = 2.67 # SwiGLU expansion ratio ffn_hidden_size: int = 1376 # round(512 * 2.67 / 64) * 64 # ── LIV Convolution ─────────────────────────────────────────────────────── # kernel_size=4: current token + 3 previous tokens. conv_kernel_size: int = 4 # ── LRM (Learnable Rate Multipliers) ────────────────────────────────────── # Per-layer learned scalar applied to block output before residual add. # Initialized to 1.0 (no effect at start of training). # Ref: Velikanov et al., 2026. use_lrm: bool = True # ── Training objectives ─────────────────────────────────────────────────── # NTP: next token prediction (move generation). # TOP: token order prediction (auxiliary training signal). # Ref: Zuhri et al., 2026. ntp_weight: float = 0.30 top_weight: float = 0.70 top_window: int = 255 # ── Regularization ──────────────────────────────────────────────────────── dropout: float = 0.1 # ── Special token IDs ───────────────────────────────────────────────────── pad_id: int = 0 w_id: int = 1 # white to move b_id: int = 2 # black to move # ── Derived properties ──────────────────────────────────────────────────── @property def head_dim(self) -> int: """Attention head dimension.""" return self.d_model // self.n_heads @property def n_liv_layers(self) -> int: """Number of LIV convolution layers.""" return self.n_layers - self.n_gqa_layers def __post_init__(self): assert self.d_model % self.n_heads == 0, \ f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})" assert self.n_heads % self.n_kv_heads == 0, \ f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})" assert self.n_gqa_layers <= self.n_layers, \ f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})" assert abs(self.ntp_weight + self.top_weight - 1.0) < 1e-6, \ f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}" if __name__ == "__main__": cfg = ChessModelConfig() print(f"d_model : {cfg.d_model}") print(f"n_layers : {cfg.n_layers} ({cfg.n_gqa_layers} GQA + {cfg.n_liv_layers} LIV)") print(f"n_heads : {cfg.n_heads} query, {cfg.n_kv_heads} KV") print(f"head_dim : {cfg.head_dim}") print(f"ffn_hidden_size : {cfg.ffn_hidden_size}") print(f"use_lrm : {cfg.use_lrm}") print(f"loss weights : {cfg.ntp_weight} NTP + {cfg.top_weight} TOP")