"""
config.py — Hyperparameters for the Liquid Chess Model (LCM).
All model dimensions, training objectives, and architectural choices
are defined here. Nothing is hardcoded in model.py.
"""
from dataclasses import dataclass
@dataclass
class ChessModelConfig:
    """Hyperparameters for the Liquid Chess Model (LCM).

    All model dimensions, training objectives, and architectural choices
    live here; model code should read them rather than hardcode values.
    Cross-field constraints are validated in ``__post_init__``, which
    raises ``ValueError`` on an inconsistent configuration.
    """

    # ── Vocabulary ────────────────────────────────────────────────────────────
    # 1977 tokens: 9 special tokens + 1968 UCI moves.
    vocab_size: int = 1977

    # ── Sequence ──────────────────────────────────────────────────────────────
    # 1 POV token + 253 moves + 1 terminal token = 255.
    max_seq_len: int = 255

    # ── Model dimensions ──────────────────────────────────────────────────────
    d_model: int = 512

    # ── Depth ─────────────────────────────────────────────────────────────────
    # 16 total layers: 6 GQA attention + 10 LIV convolution.
    # GQA layers are distributed evenly via Bresenham algorithm.
    n_layers: int = 16
    n_gqa_layers: int = 6

    # ── Attention (GQA) ───────────────────────────────────────────────────────
    n_heads: int = 8      # query heads
    n_kv_heads: int = 2   # key-value heads (4 query heads share each KV head)

    # ── Feed-Forward Network ──────────────────────────────────────────────────
    ffn_expansion: float = 2.67   # SwiGLU expansion ratio
    # NOTE: 1376 == round(512 * 2.67 / 32) * 32 — the nearest multiple of 32
    # to 512 * 2.67 ≈ 1367 (the previously documented /64 rounding gives 1344).
    ffn_hidden_size: int = 1376

    # ── LIV Convolution ───────────────────────────────────────────────────────
    # kernel_size=4: current token + 3 previous tokens.
    conv_kernel_size: int = 4

    # ── LRM (Learnable Rate Multipliers) ──────────────────────────────────────
    # Per-layer learned scalar applied to block output before residual add.
    # Initialized to 1.0 (no effect at start of training).
    # Ref: Velikanov et al., 2026.
    use_lrm: bool = True

    # ── Training objectives ───────────────────────────────────────────────────
    # NTP: next token prediction (move generation).
    # TOP: token order prediction (auxiliary training signal).
    # Ref: Zuhri et al., 2026.
    ntp_weight: float = 0.30
    top_weight: float = 0.70
    top_window: int = 255

    # ── Regularization ────────────────────────────────────────────────────────
    dropout: float = 0.1

    # ── Special token IDs ─────────────────────────────────────────────────────
    pad_id: int = 0
    w_id: int = 1   # <W> white to move
    b_id: int = 2   # <B> black to move

    # ── Derived properties ────────────────────────────────────────────────────
    @property
    def head_dim(self) -> int:
        """Attention head dimension (d_model split evenly across query heads)."""
        return self.d_model // self.n_heads

    @property
    def n_liv_layers(self) -> int:
        """Number of LIV convolution layers (total layers minus GQA layers)."""
        return self.n_layers - self.n_gqa_layers

    def __post_init__(self) -> None:
        """Validate cross-field constraints.

        Raises:
            ValueError: if any dimension/divisibility/weight constraint fails.

        Uses explicit raises rather than ``assert`` so validation still
        runs when Python is started with ``-O`` (asserts are stripped).
        """
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})")
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})")
        if self.n_gqa_layers > self.n_layers:
            raise ValueError(
                f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})")
        if abs(self.ntp_weight + self.top_weight - 1.0) >= 1e-6:
            raise ValueError(
                f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}")
if __name__ == "__main__":
    # Smoke test: build a default config and print a one-line summary per field.
    config = ChessModelConfig()
    rows = (
        ("d_model", str(config.d_model)),
        ("n_layers", f"{config.n_layers} ({config.n_gqa_layers} GQA + {config.n_liv_layers} LIV)"),
        ("n_heads", f"{config.n_heads} query, {config.n_kv_heads} KV"),
        ("head_dim", str(config.head_dim)),
        ("ffn_hidden_size", str(config.ffn_hidden_size)),
        ("use_lrm", str(config.use_lrm)),
        ("loss weights", f"{config.ntp_weight} NTP + {config.top_weight} TOP"),
    )
    for label, value in rows:
        print(f"{label} : {value}")