# lcm-chess / config.py — init upload by MostLime (commit b2c1dad, verified)
"""
config.py β€” Hyperparameters for the Liquid Chess Model (LCM).
All model dimensions, training objectives, and architectural choices
are defined here. Nothing is hardcoded in model.py.
"""
from dataclasses import dataclass
@dataclass
class ChessModelConfig:
    """Hyperparameters for the Liquid Chess Model (LCM).

    Every model dimension, training objective, and architectural choice
    lives here; model.py reads this config and hardcodes nothing.
    """

    # ── Vocabulary ────────────────────────────────────────────────────────────
    # 1977 tokens: 9 special tokens + 1968 UCI moves.
    vocab_size: int = 1977

    # ── Sequence ──────────────────────────────────────────────────────────────
    # 1 POV token + 253 moves + 1 terminal token = 255.
    max_seq_len: int = 255

    # ── Model dimensions ──────────────────────────────────────────────────────
    d_model: int = 512

    # ── Depth ─────────────────────────────────────────────────────────────────
    # 16 total layers: 6 GQA attention + 10 LIV convolution.
    # GQA layers are distributed evenly via Bresenham algorithm.
    n_layers: int = 16
    n_gqa_layers: int = 6

    # ── Attention (GQA) ───────────────────────────────────────────────────────
    n_heads: int = 8     # query heads
    n_kv_heads: int = 2  # key-value heads (4 query heads share each KV head)

    # ── Feed-Forward Network ──────────────────────────────────────────────────
    ffn_expansion: float = 2.67  # SwiGLU expansion ratio
    # 512 * 2.67 ≈ 1367, rounded to the nearest multiple of 32:
    # round(512 * 2.67 / 32) * 32 = 1376.
    # (The previous comment claimed round(512 * 2.67 / 64) * 64, but that
    # yields 1344, not 1376 — the value matches 32-alignment, not 64.)
    ffn_hidden_size: int = 1376

    # ── LIV Convolution ───────────────────────────────────────────────────────
    # kernel_size=4: current token + 3 previous tokens.
    conv_kernel_size: int = 4

    # ── LRM (Learnable Rate Multipliers) ──────────────────────────────────────
    # Per-layer learned scalar applied to block output before residual add.
    # Initialized to 1.0 (no effect at start of training).
    # Ref: Velikanov et al., 2026.
    use_lrm: bool = True

    # ── Training objectives ───────────────────────────────────────────────────
    # NTP: next token prediction (move generation).
    # TOP: token order prediction (auxiliary training signal).
    # Ref: Zuhri et al., 2026.
    ntp_weight: float = 0.30
    top_weight: float = 0.70
    top_window: int = 255

    # ── Regularization ────────────────────────────────────────────────────────
    dropout: float = 0.1

    # ── Special token IDs ─────────────────────────────────────────────────────
    pad_id: int = 0
    w_id: int = 1  # <W> white to move
    b_id: int = 2  # <B> black to move

    # ── Derived properties ────────────────────────────────────────────────────
    @property
    def head_dim(self) -> int:
        """Attention head dimension (d_model split across query heads)."""
        return self.d_model // self.n_heads

    @property
    def n_liv_layers(self) -> int:
        """Number of LIV convolution layers (total minus GQA layers)."""
        return self.n_layers - self.n_gqa_layers

    def __post_init__(self) -> None:
        """Validate cross-field constraints.

        Uses explicit raises instead of ``assert`` so the checks survive
        ``python -O`` (asserts are stripped under optimization).

        Raises:
            ValueError: if any dimensional or loss-weight constraint fails.
        """
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
            )
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
            )
        if self.n_gqa_layers > self.n_layers:
            raise ValueError(
                f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})"
            )
        if abs(self.ntp_weight + self.top_weight - 1.0) >= 1e-6:
            raise ValueError(
                f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}"
            )
if __name__ == "__main__":
    # Smoke check: instantiate the default config and print a summary.
    cfg = ChessModelConfig()
    summary = [
        f"d_model : {cfg.d_model}",
        f"n_layers : {cfg.n_layers} ({cfg.n_gqa_layers} GQA + {cfg.n_liv_layers} LIV)",
        f"n_heads : {cfg.n_heads} query, {cfg.n_kv_heads} KV",
        f"head_dim : {cfg.head_dim}",
        f"ffn_hidden_size : {cfg.ffn_hidden_size}",
        f"use_lrm : {cfg.use_lrm}",
        f"loss weights : {cfg.ntp_weight} NTP + {cfg.top_weight} TOP",
    ]
    print("\n".join(summary))