"""
config.py — Hyperparameters for the Liquid Chess Model (LCM).
All model dimensions, training objectives, and architectural choices
are defined here. Nothing is hardcoded in model.py.
"""
from dataclasses import dataclass
@dataclass
class ChessModelConfig:
    """Hyperparameters for the Liquid Chess Model (LCM).

    All model dimensions, training objectives, and architectural choices
    live here; model code should read them rather than hardcode values.
    Cross-field constraints are validated in ``__post_init__``, which
    raises ``ValueError`` on an inconsistent configuration.
    """

    # ── Vocabulary ────────────────────────────────────────────────────────────
    # 1977 tokens: 9 special tokens + 1968 UCI moves.
    vocab_size: int = 1977

    # ── Sequence ──────────────────────────────────────────────────────────────
    # 1 POV token + 253 moves + 1 terminal token = 255.
    max_seq_len: int = 255

    # ── Model dimensions ──────────────────────────────────────────────────────
    d_model: int = 512

    # ── Depth ─────────────────────────────────────────────────────────────────
    # 16 total layers: 6 GQA attention + 10 LIV convolution.
    # GQA layers are distributed evenly via Bresenham algorithm.
    n_layers: int = 16
    n_gqa_layers: int = 6

    # ── Attention (GQA) ───────────────────────────────────────────────────────
    n_heads: int = 8      # query heads
    n_kv_heads: int = 2   # key-value heads (4 query heads share each KV head)

    # ── Feed-Forward Network ──────────────────────────────────────────────────
    ffn_expansion: float = 2.67   # SwiGLU expansion ratio
    # NOTE: 1376 == round(512 * 2.67 / 32) * 32 — the nearest multiple of 32
    # to 512 * 2.67 ≈ 1367 (the previously documented /64 rounding gives 1344).
    ffn_hidden_size: int = 1376

    # ── LIV Convolution ───────────────────────────────────────────────────────
    # kernel_size=4: current token + 3 previous tokens.
    conv_kernel_size: int = 4

    # ── LRM (Learnable Rate Multipliers) ──────────────────────────────────────
    # Per-layer learned scalar applied to block output before residual add.
    # Initialized to 1.0 (no effect at start of training).
    # Ref: Velikanov et al., 2026.
    use_lrm: bool = True

    # ── Training objectives ───────────────────────────────────────────────────
    # NTP: next token prediction (move generation).
    # TOP: token order prediction (auxiliary training signal).
    # Ref: Zuhri et al., 2026.
    ntp_weight: float = 0.30
    top_weight: float = 0.70
    top_window: int = 255

    # ── Regularization ────────────────────────────────────────────────────────
    dropout: float = 0.1

    # ── Special token IDs ─────────────────────────────────────────────────────
    pad_id: int = 0
    w_id: int = 1   # <W> white to move
    b_id: int = 2   # <B> black to move

    # ── Derived properties ────────────────────────────────────────────────────
    @property
    def head_dim(self) -> int:
        """Attention head dimension (d_model split evenly across query heads)."""
        return self.d_model // self.n_heads

    @property
    def n_liv_layers(self) -> int:
        """Number of LIV convolution layers (total layers minus GQA layers)."""
        return self.n_layers - self.n_gqa_layers

    def __post_init__(self) -> None:
        """Validate cross-field constraints.

        Raises:
            ValueError: if any dimension/divisibility/weight constraint fails.

        Uses explicit raises rather than ``assert`` so validation still
        runs when Python is started with ``-O`` (asserts are stripped).
        """
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})")
        if self.n_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})")
        if self.n_gqa_layers > self.n_layers:
            raise ValueError(
                f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})")
        if abs(self.ntp_weight + self.top_weight - 1.0) >= 1e-6:
            raise ValueError(
                f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}")
if __name__ == "__main__":
    # Smoke test: build a default config and print a one-line summary per field.
    config = ChessModelConfig()
    rows = (
        ("d_model", str(config.d_model)),
        ("n_layers", f"{config.n_layers} ({config.n_gqa_layers} GQA + {config.n_liv_layers} LIV)"),
        ("n_heads", f"{config.n_heads} query, {config.n_kv_heads} KV"),
        ("head_dim", str(config.head_dim)),
        ("ffn_hidden_size", str(config.ffn_hidden_size)),
        ("use_lrm", str(config.use_lrm)),
        ("loss weights", f"{config.ntp_weight} NTP + {config.top_weight} TOP"),
    )
    for label, value in rows:
        print(f"{label} : {value}")