"""
config.py — Hyperparameters for the Liquid Chess Model (LCM).

All model dimensions, training objectives, and architectural choices
are defined here. Nothing is hardcoded in model.py.
"""
from dataclasses import dataclass
@dataclass
class ChessModelConfig:
    """Hyperparameters for the Liquid Chess Model (LCM).

    All fields have defaults, so ``ChessModelConfig()`` yields the standard
    configuration; individual fields can be overridden by keyword argument.
    Validation runs in ``__post_init__`` (which requires the ``@dataclass``
    decorator — without it, the checks would silently never execute).
    """

    # ── Vocabulary ──────────────────────────────────────────────────────────
    # 1977 tokens: 9 special tokens + 1968 UCI moves.
    vocab_size: int = 1977

    # ── Sequence ────────────────────────────────────────────────────────────
    # 1 POV token + 253 moves + 1 terminal token = 255.
    max_seq_len: int = 255

    # ── Model dimensions ────────────────────────────────────────────────────
    d_model: int = 512

    # ── Depth ───────────────────────────────────────────────────────────────
    # 16 total layers: 6 GQA attention + 10 LIV convolution.
    # GQA layers are distributed evenly via Bresenham algorithm.
    n_layers: int = 16
    n_gqa_layers: int = 6

    # ── Attention (GQA) ─────────────────────────────────────────────────────
    n_heads: int = 8       # query heads
    n_kv_heads: int = 2    # key-value heads (4 query heads share each KV head)

    # ── Feed-Forward Network ────────────────────────────────────────────────
    ffn_expansion: float = 2.67   # SwiGLU expansion ratio
    ffn_hidden_size: int = 1376   # round(512 * 2.67 / 64) * 64

    # ── LIV Convolution ─────────────────────────────────────────────────────
    # kernel_size=4: current token + 3 previous tokens.
    conv_kernel_size: int = 4

    # ── LRM (Learnable Rate Multipliers) ────────────────────────────────────
    # Per-layer learned scalar applied to block output before residual add.
    # Initialized to 1.0 (no effect at start of training).
    # Ref: Velikanov et al., 2026.
    use_lrm: bool = True

    # ── Training objectives ─────────────────────────────────────────────────
    # NTP: next token prediction (move generation).
    # TOP: token order prediction (auxiliary training signal).
    # Ref: Zuhri et al., 2026.
    ntp_weight: float = 0.30
    top_weight: float = 0.70
    top_window: int = 255

    # ── Regularization ──────────────────────────────────────────────────────
    dropout: float = 0.1

    # ── Special token IDs ───────────────────────────────────────────────────
    pad_id: int = 0
    w_id: int = 1   # <W> white to move
    b_id: int = 2   # <B> black to move

    # ── Derived properties ──────────────────────────────────────────────────
    # These are @property (not methods) because call sites access them as
    # attributes, e.g. ``cfg.head_dim``.

    @property
    def head_dim(self) -> int:
        """Attention head dimension: d_model split evenly across query heads."""
        return self.d_model // self.n_heads

    @property
    def n_liv_layers(self) -> int:
        """Number of LIV convolution layers (total layers minus GQA layers)."""
        return self.n_layers - self.n_gqa_layers

    def __post_init__(self) -> None:
        """Validate divisibility constraints and loss-weight normalization."""
        assert self.d_model % self.n_heads == 0, \
            f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
        assert self.n_heads % self.n_kv_heads == 0, \
            f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"
        assert self.n_gqa_layers <= self.n_layers, \
            f"n_gqa_layers ({self.n_gqa_layers}) can't exceed n_layers ({self.n_layers})"
        assert abs(self.ntp_weight + self.top_weight - 1.0) < 1e-6, \
            f"Loss weights must sum to 1.0, got {self.ntp_weight + self.top_weight}"
if __name__ == "__main__":
    # Quick human-readable summary of the default configuration.
    cfg = ChessModelConfig()
    # Derived quantities are computed inline here; the original accessed
    # head_dim / n_liv_layers without calling them, which printed bound-method
    # reprs instead of numbers when they were plain methods.
    n_liv = cfg.n_layers - cfg.n_gqa_layers
    head_dim = cfg.d_model // cfg.n_heads
    print(f"d_model         : {cfg.d_model}")
    print(f"n_layers        : {cfg.n_layers} ({cfg.n_gqa_layers} GQA + {n_liv} LIV)")
    print(f"n_heads         : {cfg.n_heads} query, {cfg.n_kv_heads} KV")
    print(f"head_dim        : {head_dim}")
    print(f"ffn_hidden_size : {cfg.ffn_hidden_size}")
    print(f"use_lrm         : {cfg.use_lrm}")
    print(f"loss weights    : {cfg.ntp_weight} NTP + {cfg.top_weight} TOP")