"""
config.py — Hyperparameters for the Liquid Chess Model (LCM).

All model dimensions, training objectives, and architectural choices
are defined here. Nothing is hardcoded in model.py.
"""
from dataclasses import dataclass
@dataclass
class ChessModelConfig:
    """Hyperparameters for the Liquid Chess Model (LCM).

    All fields have defaults, so ``ChessModelConfig()`` yields the standard
    configuration; individual fields can be overridden by keyword argument.
    Validation runs in ``__post_init__`` (which requires the ``@dataclass``
    decorator — without it, the checks would silently never execute).
    """

    # ── Vocabulary ──────────────────────────────────────────────────────────
    # 1977 tokens: 9 special tokens + 1968 UCI moves.
    vocab_size: int = 1977

    # ── Sequence ────────────────────────────────────────────────────────────
    # 1 POV token + 253 moves + 1 terminal token = 255.
    max_seq_len: int = 255

    # ── Model dimensions ────────────────────────────────────────────────────
    d_model: int = 512

    # ── Depth ───────────────────────────────────────────────────────────────
    # 16 total layers: 6 GQA attention + 10 LIV convolution.
    # GQA layers are distributed evenly via Bresenham algorithm.
    n_layers: int = 16
    n_gqa_layers: int = 6

    # ── Attention (GQA) ─────────────────────────────────────────────────────
    n_heads: int = 8       # query heads
    n_kv_heads: int = 2    # key-value heads (4 query heads share each KV head)

    # ── Feed-Forward Network ────────────────────────────────────────────────
    ffn_expansion: float = 2.67   # SwiGLU expansion ratio
    ffn_hidden_size: int = 1376   # round(512 * 2.67 / 64) * 64

    # ── LIV Convolution ─────────────────────────────────────────────────────
    # kernel_size=4: current token + 3 previous tokens.
    conv_kernel_size: int = 4

    # ── LRM (Learnable Rate Multipliers) ────────────────────────────────────
    # Per-layer learned scalar applied to block output before residual add.
    # Initialized to 1.0 (no effect at start of training).
    # Ref: Velikanov et al., 2026.
    use_lrm: bool = True

    # ── Training objectives ─────────────────────────────────────────────────
    # NTP: next token prediction (move generation).
    # TOP: token order prediction (auxiliary training signal).
    # Ref: Zuhri et al., 2026.
    ntp_weight: float = 0.30
    top_weight: float = 0.70
    top_window: int = 255

    # ── Regularization ──────────────────────────────────────────────────────
    dropout: float = 0.1

    # ── Special token IDs ───────────────────────────────────────────────────
    pad_id: int = 0
    w_id: int = 1   # <W> white to move
    b_id: int = 2   # <B> black to move

    # ── Derived properties ──────────────────────────────────────────────────
    # These are @property (not methods) because call sites access them as
    # attributes, e.g. ``cfg.head_dim``.

    @property
    def head_dim(self) -> int:
        """Attention head dimension: d_model split evenly across query heads."""
        return self.d_model // self.n_heads

    @property
    def n_liv_layers(self) -> int:
        """Number of LIV convolution layers (total layers minus GQA layers)."""
        return self.n_layers - self.n_gqa_layers

    def __post_init__(self) -> None:
        """Validate divisibility constraints and loss-weight normalization."""
        assert self.d_model % self.n_heads == 0, \
            f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
        assert self.n_heads % self.n_kv_heads == 0, \
            f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
        assert self.n_gqa_layers <= self.n_layers, \
            f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})"
        assert abs(self.ntp_weight + self.top_weight - 1.0) < 1e-6, \
            f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}"
if __name__ == "__main__":
    # Quick human-readable summary of the default configuration.
    cfg = ChessModelConfig()
    # Derived quantities are computed inline here; the original accessed
    # head_dim / n_liv_layers without calling them, which printed bound-method
    # reprs instead of numbers when they were plain methods.
    n_liv = cfg.n_layers - cfg.n_gqa_layers
    head_dim = cfg.d_model // cfg.n_heads
    print(f"d_model         : {cfg.d_model}")
    print(f"n_layers        : {cfg.n_layers} ({cfg.n_gqa_layers} GQA + {n_liv} LIV)")
    print(f"n_heads         : {cfg.n_heads} query, {cfg.n_kv_heads} KV")
    print(f"head_dim        : {head_dim}")
    print(f"ffn_hidden_size : {cfg.ffn_hidden_size}")
    print(f"use_lrm         : {cfg.use_lrm}")
    print(f"loss weights    : {cfg.ntp_weight} NTP + {cfg.top_weight} TOP")