chess_done2 / model.py

Upload model.py

7e23d8b verified about 1 month ago

29.8 kB

	"""
	Chess Transformer Model for the Chess Challenge.

	This module provides a modular GPT-style transformer architecture
	designed to fit within the 1M parameter constraint.

	Key components:
	- ChessConfig: Configuration class for model hyperparameters
	- ChessForCausalLM: The main model class for next-move prediction

	Modular options:
	- Attention: MHA (standard), GQA (grouped query), MQA (multi-query)
	- Position encoding: learned, rope (rotary), alibi
	- FFN activation: gelu, swiglu
	"""

	from __future__ import annotations

	import math
	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union, Literal

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from transformers import PretrainedConfig, PreTrainedModel
	try:
	from transformers.generation.utils import GenerationMixin
	except ImportError: # Fallback for older transformers
	from transformers import GenerationMixin
	from transformers.modeling_outputs import CausalLMOutputWithPast

	# Type aliases for configuration options.
	# Each alias lists the string values that ChessConfig accepts for the
	# matching constructor argument (ChessConfig asserts membership at init).
	# Attention variant: multi-head, grouped-query, or multi-query.
	AttentionType = Literal["mha", "gqa", "mqa"]
	# Position encoding scheme: learned embeddings, rotary, or ALiBi.
	PositionEncoding = Literal["learned", "rope", "alibi"]
	# Feed-forward variant: GELU MLP or SwiGLU gated MLP.
	FFNType = Literal["gelu", "swiglu"]


	class ChessConfig(PretrainedConfig):
	    """
	    Hyperparameter container for the Chess Transformer.

	    Approximate parameter count with the default arguments (learned
	    positions, MHA, GELU feed-forward, tied output head):
	    - Token embeddings: 1200 x 128 = 153,600
	    - Position embeddings: 256 x 128 = 32,768 (absent with rope/alibi)
	    - One transformer block: ~165,000
	      (attention 65,536 + feed-forward 98,816 + layer norms 512)
	    - Six blocks: ~989,000
	    - LM head: no extra parameters when tied to the token embeddings
	    - Total: ~1.18M

	    Args:
	        vocab_size: Number of distinct tokens (moves).
	        n_embd: Model width (d_model).
	        n_layer: Number of transformer blocks.
	        n_head: Number of query heads.
	        n_kv_heads: Number of key/value heads. When None it is derived from
	            ``attention_type``: 1 for "mqa", ``max(1, n_head // 2)`` for
	            "gqa", and ``n_head`` for "mha".
	        n_ctx: Context window length.
	        n_inner: Feed-forward inner width; ``3 * n_embd`` when None.
	        dropout: Dropout probability.
	        layer_norm_epsilon: Epsilon passed to the layer norms.
	        tie_weights: Share the embedding matrix with the output head. The
	            value is mirrored into ``tie_word_embeddings``.
	        attention_type: One of "mha", "gqa", "mqa".
	        pos_encoding: One of "learned", "rope", "alibi".
	        ffn_type: One of "gelu", "swiglu".
	        rope_theta: Base frequency for RoPE.
	        legal_loss_weight: Weight of the auxiliary legal-move loss.
	        pad_token_id, bos_token_id, eos_token_id: Special token ids,
	            forwarded to the base configuration class.
	        **kwargs: Forwarded unchanged to the base configuration class.

	    Raises:
	        AssertionError: If the head counts do not divide evenly or one of the
	            string options is not recognised.
	    """

	    model_type = "chess_transformer"

	    def __init__(
	        self,
	        vocab_size: int = 1200,
	        n_embd: int = 128,
	        n_layer: int = 6,
	        n_head: int = 4,
	        n_kv_heads: Optional[int] = None,
	        n_ctx: int = 256,
	        n_inner: Optional[int] = None,
	        dropout: float = 0.1,
	        layer_norm_epsilon: float = 1e-5,
	        tie_weights: bool = True,
	        # New modular options
	        attention_type: AttentionType = "mha",
	        pos_encoding: PositionEncoding = "learned",
	        ffn_type: FFNType = "gelu",
	        rope_theta: float = 10000.0,
	        legal_loss_weight: float = 0.0,
	        # Token IDs
	        pad_token_id: int = 0,
	        bos_token_id: int = 1,
	        eos_token_id: int = 2,
	        **kwargs,
	    ):
	        super().__init__(
	            pad_token_id=pad_token_id,
	            bos_token_id=bos_token_id,
	            eos_token_id=eos_token_id,
	            **kwargs,
	        )

	        # Core dimensions.
	        self.vocab_size = vocab_size
	        self.n_embd = n_embd
	        self.n_layer = n_layer
	        self.n_head = n_head
	        self.n_ctx = n_ctx
	        self.n_inner = 3 * n_embd if n_inner is None else n_inner
	        self.dropout = dropout
	        self.layer_norm_epsilon = layer_norm_epsilon
	        self.tie_weights = tie_weights
	        # Mirror the flag under the name the HF base class looks at.
	        self.tie_word_embeddings = bool(tie_weights)

	        # Architecture switches.
	        self.attention_type = attention_type
	        self.pos_encoding = pos_encoding
	        self.ffn_type = ffn_type
	        self.rope_theta = rope_theta
	        self.legal_loss_weight = legal_loss_weight

	        # An explicit n_kv_heads wins; otherwise derive it from the variant.
	        if n_kv_heads is not None:
	            self.n_kv_heads = n_kv_heads
	        elif attention_type == "mqa":
	            self.n_kv_heads = 1
	        elif attention_type == "gqa":
	            self.n_kv_heads = max(1, n_head // 2)
	        else:
	            self.n_kv_heads = n_head

	        # Sanity checks on the resolved configuration.
	        assert n_embd % n_head == 0, f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
	        assert n_head % self.n_kv_heads == 0, f"n_head ({n_head}) must be divisible by n_kv_heads ({self.n_kv_heads})"
	        assert attention_type in ("mha", "gqa", "mqa"), f"Invalid attention_type: {attention_type}"
	        assert pos_encoding in ("learned", "rope", "alibi"), f"Invalid pos_encoding: {pos_encoding}"
	        assert ffn_type in ("gelu", "swiglu"), f"Invalid ffn_type: {ffn_type}"


	# ==============================================================================
	# Position Encoding Modules
	# ==============================================================================


	class RotaryEmbedding(nn.Module):
	    """
	    Rotary Position Embedding (RoPE) sin/cos table.

	    Precomputes the cosine and sine of the rotation angle for every
	    (position, frequency) pair and hands out the leading ``seq_len`` rows on
	    demand. The table grows automatically when a longer sequence is requested.

	    Reference: https://arxiv.org/abs/2104.09864

	    Args:
	        dim: Per-head dimension that will be rotated.
	        max_seq_len: Number of positions to precompute up front.
	        theta: Base of the geometric frequency progression.
	    """

	    def __init__(self, dim: int, max_seq_len: int = 256, theta: float = 10000.0):
	        super().__init__()
	        self.dim = dim
	        self.max_seq_len = max_seq_len
	        self.theta = theta

	        # One inverse frequency per pair of channels.
	        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
	        self.register_buffer("inv_freq", inv_freq, persistent=False)

	        # Precompute sin/cos for all positions up to max_seq_len.
	        self._build_cache(max_seq_len)

	    def _build_cache(self, seq_len: int) -> None:
	        """Build (or rebuild) the sin/cos tables for ``seq_len`` positions."""
	        # Create the positions on the same device as inv_freq: when the cache
	        # is rebuilt after the module was moved (e.g. to a GPU), a CPU-side
	        # arange would make torch.outer fail with a device mismatch.
	        positions = torch.arange(
	            seq_len, dtype=torch.float32, device=self.inv_freq.device
	        )
	        freqs = torch.outer(positions, self.inv_freq.float())
	        # Duplicate the angles so both halves of the head dimension share
	        # them, matching the half-split layout used by rotate_half.
	        emb = torch.cat([freqs, freqs], dim=-1)
	        self.register_buffer("cos_cached", emb.cos(), persistent=False)
	        self.register_buffer("sin_cached", emb.sin(), persistent=False)

	    def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Return the cos and sin tables for the first ``seq_len`` positions.

	        Args:
	            x: Tensor whose dtype and device the returned tables adopt.
	            seq_len: Number of positions required.

	        Returns:
	            ``(cos, sin)``, each with ``seq_len`` rows.
	        """
	        if seq_len > self.max_seq_len:
	            self._build_cache(seq_len)
	            self.max_seq_len = seq_len
	        return (
	            self.cos_cached[:seq_len].to(device=x.device, dtype=x.dtype),
	            self.sin_cached[:seq_len].to(device=x.device, dtype=x.dtype),
	        )


	def rotate_half(x: torch.Tensor) -> torch.Tensor:
	    """Swap the two halves of the last dimension, negating the second half."""
	    width = x.shape[-1]
	    half = width // 2
	    # For an odd width the second piece is the longer one.
	    front, back = torch.split(x, [half, width - half], dim=-1)
	    return torch.cat([-back, front], dim=-1)


	def apply_rotary_pos_emb(
	    q: torch.Tensor,
	    k: torch.Tensor,
	    cos: torch.Tensor,
	    sin: torch.Tensor,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	    """
	    Rotate queries and keys by the supplied position-dependent angles.

	    Args:
	        q: Query tensor of shape (batch, n_heads, seq_len, head_dim)
	        k: Key tensor of shape (batch, n_kv_heads, seq_len, head_dim)
	        cos: Cosine of rotation angles
	        sin: Sine of rotation angles

	    Returns:
	        The rotated ``(q, k)`` pair.
	    """
	    # Prepend two singleton axes so the tables broadcast over batch and heads.
	    cos_b = cos[None, None]
	    sin_b = sin[None, None]

	    def _rotate(t: torch.Tensor) -> torch.Tensor:
	        return (t * cos_b) + (rotate_half(t) * sin_b)

	    return _rotate(q), _rotate(k)


	def build_alibi_slopes(n_heads: int) -> torch.Tensor:
	    """
	    Build the per-head ALiBi slopes.

	    ALiBi adds a linear bias to attention scores based on position distance.
	    The slopes form a geometric sequence; for a power-of-two head count ``n``
	    the first slope and the ratio are both ``2 ** (-8 / n)``.

	    Reference: https://arxiv.org/abs/2108.12409

	    Args:
	        n_heads: Number of attention heads (must be >= 1).

	    Returns:
	        Float32 tensor of shape ``(n_heads,)``.

	    Raises:
	        ValueError: If ``n_heads`` is smaller than 1.
	    """
	    if n_heads < 1:
	        raise ValueError(f"n_heads must be >= 1, got {n_heads}")

	    def get_slopes_power_of_2(n: int) -> list:
	        # 2 ** (-(2 ** -(log2(n) - 3))) == 2 ** (-8 / n)
	        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
	        ratio = start
	        return [start * (ratio ** i) for i in range(n)]

	    if math.log2(n_heads).is_integer():
	        slopes = get_slopes_power_of_2(n_heads)
	    else:
	        # For a non-power-of-two head count, take the slopes of the largest
	        # power of two below it and fill the remainder with every other slope
	        # from the next power of two.
	        closest_power_of_2 = 2 ** math.floor(math.log2(n_heads))
	        slopes = get_slopes_power_of_2(closest_power_of_2)
	        extra_slopes = get_slopes_power_of_2(2 * closest_power_of_2)
	        slopes = slopes + extra_slopes[0::2][: n_heads - closest_power_of_2]

	    return torch.tensor(slopes, dtype=torch.float32)


	def build_alibi_bias(seq_len: int, slopes: torch.Tensor) -> torch.Tensor:
	    """
	    Build the ALiBi attention bias matrix.

	    Args:
	        seq_len: Sequence length
	        slopes: ALiBi slopes tensor of shape (n_heads,)

	    Returns:
	        Bias tensor of shape (1, n_heads, seq_len, seq_len), on the same
	        device as ``slopes``. Entry ``[0, h, i, j]`` equals
	        ``slopes[h] * (j - i)``, so it is zero on the diagonal and negative
	        for keys that precede the query.
	    """
	    # Build the positions on the slopes' device; a CPU-side arange would make
	    # the multiplication below fail when the slopes live on an accelerator.
	    positions = torch.arange(seq_len, device=slopes.device)
	    # distance[i, j] = j - i (negative for keys in the past)
	    distance = positions.unsqueeze(0) - positions.unsqueeze(1)  # (seq_len, seq_len)

	    # (n_heads, 1, 1) * (1, seq_len, seq_len) -> (n_heads, seq_len, seq_len)
	    alibi = slopes.unsqueeze(1).unsqueeze(1) * distance.unsqueeze(0)

	    return alibi.unsqueeze(0)  # (1, n_heads, seq_len, seq_len)


	# ==============================================================================
	# Attention Modules
	# ==============================================================================


	class Attention(nn.Module):
	    """
	    Unified causal self-attention supporting MHA, GQA, and MQA.

	    Position encoding handled here:
	    - learned: nothing (position embeddings are added by the caller)
	    - rope: rotary embeddings applied to Q and K
	    - alibi: linear distance bias added to the attention scores

	    Head layouts:
	    - MHA: n_kv_heads == n_head
	    - GQA: n_kv_heads < n_head, n_head % n_kv_heads == 0
	    - MQA: n_kv_heads == 1

	    Masked positions are filled with the most negative finite value of the
	    score dtype rather than ``-inf``. Rows that still have at least one
	    visible key produce exactly the same softmax, while rows with no visible
	    key (e.g. left-padded queries) stay finite instead of turning into NaN.
	    """

	    def __init__(self, config: ChessConfig):
	        super().__init__()

	        self.n_head = config.n_head
	        self.n_kv_heads = config.n_kv_heads
	        self.n_embd = config.n_embd
	        self.head_dim = config.n_embd // config.n_head
	        self.n_rep = config.n_head // config.n_kv_heads  # Repetition factor for GQA/MQA
	        self.pos_encoding = config.pos_encoding

	        # Q: n_head * head_dim = n_embd
	        # K, V: n_kv_heads * head_dim (smaller for GQA/MQA)
	        self.q_proj = nn.Linear(config.n_embd, config.n_head * self.head_dim, bias=False)
	        self.k_proj = nn.Linear(config.n_embd, config.n_kv_heads * self.head_dim, bias=False)
	        self.v_proj = nn.Linear(config.n_embd, config.n_kv_heads * self.head_dim, bias=False)
	        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

	        self.dropout = nn.Dropout(config.dropout)

	        # Position encoding components
	        if config.pos_encoding == "rope":
	            self.rotary_emb = RotaryEmbedding(
	                dim=self.head_dim,
	                max_seq_len=config.n_ctx,
	                theta=config.rope_theta,
	            )
	        elif config.pos_encoding == "alibi":
	            # Precompute ALiBi slopes
	            slopes = build_alibi_slopes(config.n_head)
	            self.register_buffer("alibi_slopes", slopes, persistent=False)

	        # Lower-triangular causal mask for sequences up to n_ctx tokens.
	        self.register_buffer(
	            "causal_mask",
	            torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(
	                1, 1, config.n_ctx, config.n_ctx
	            ),
	            persistent=False,
	        )

	    def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
	        """
	        Repeat KV heads to match the number of query heads.

	        Input shape: (batch, n_kv_heads, seq_len, head_dim)
	        Output shape: (batch, n_head, seq_len, head_dim)
	        """
	        if self.n_rep == 1:
	            return x
	        batch, n_kv_heads, seq_len, head_dim = x.shape
	        x = x.unsqueeze(2).expand(batch, n_kv_heads, self.n_rep, seq_len, head_dim)
	        return x.reshape(batch, n_kv_heads * self.n_rep, seq_len, head_dim)

	    def _causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
	        """Return a (1, 1, seq_len, seq_len) lower-triangular mask."""
	        if seq_len <= self.causal_mask.size(-1):
	            return self.causal_mask[:, :, :seq_len, :seq_len]
	        # The precomputed buffer only covers n_ctx positions; build a larger
	        # mask on the fly so longer inputs do not fail on a shape mismatch.
	        return torch.tril(torch.ones(seq_len, seq_len, device=device)).view(
	            1, 1, seq_len, seq_len
	        )

	    def forward(
	        self,
	        x: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """
	        Apply causal self-attention.

	        Args:
	            x: Hidden states of shape (batch, seq_len, n_embd).
	            attention_mask: Optional (batch, seq_len) mask; positions equal
	                to 0 are hidden from every query.

	        Returns:
	            Tensor of shape (batch, seq_len, n_embd).
	        """
	        batch_size, seq_len, _ = x.size()

	        # Project and split into heads: (batch, heads, seq_len, head_dim)
	        q = self.q_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
	        k = self.k_proj(x).view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
	        v = self.v_proj(x).view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)

	        # Apply rotary embeddings if using RoPE
	        if self.pos_encoding == "rope":
	            cos, sin = self.rotary_emb(q, seq_len)
	            q, k = apply_rotary_pos_emb(q, k, cos, sin)

	        # Repeat K and V for GQA/MQA
	        k = self._repeat_kv(k)
	        v = self._repeat_kv(v)

	        # Scaled dot-product attention scores
	        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

	        # Apply ALiBi bias if using ALiBi
	        if self.pos_encoding == "alibi":
	            alibi_bias = build_alibi_bias(seq_len, self.alibi_slopes.to(x.device))
	            attn_weights = attn_weights + alibi_bias.to(attn_weights.dtype)

	        # Finite fill value: exp() still underflows to exactly 0 for rows with
	        # a visible key, but fully masked rows no longer yield NaN.
	        mask_value = torch.finfo(attn_weights.dtype).min

	        # Apply causal mask
	        causal_mask = self._causal_mask(seq_len, x.device)
	        attn_weights = attn_weights.masked_fill(causal_mask == 0, mask_value)

	        # Apply attention mask (for padding)
	        if attention_mask is not None:
	            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
	            attn_weights = attn_weights.masked_fill(attention_mask == 0, mask_value)

	        attn_weights = F.softmax(attn_weights, dim=-1)
	        attn_weights = self.dropout(attn_weights)

	        # Weighted sum of values, then merge heads back to (batch, seq, n_embd)
	        attn_output = torch.matmul(attn_weights, v)
	        attn_output = attn_output.transpose(1, 2).contiguous().view(
	            batch_size, seq_len, self.n_embd
	        )

	        # Output projection
	        return self.c_proj(attn_output)


	# Alias for backward compatibility
	MultiHeadAttention = Attention


	# ==============================================================================
	# Feed-Forward Modules
	# ==============================================================================


	class FeedForward(nn.Module):
	    """
	    Position-wise feed-forward network with a selectable activation.

	    Variants:
	    - gelu: two biased linear layers with a GELU in between
	    - swiglu: three bias-free linear layers, ``W3(silu(W1 x) * W2 x)``

	    For SwiGLU the hidden width is ``int(2 * n_inner / 3)`` rounded up to a
	    multiple of 8, so that three matrices cost roughly what the two GELU
	    matrices of width ``n_inner`` would.
	    """

	    def __init__(self, config: ChessConfig):
	        super().__init__()

	        self.ffn_type = config.ffn_type

	        if config.ffn_type != "swiglu":
	            # Plain GELU MLP.
	            self.c_fc = nn.Linear(config.n_embd, config.n_inner)
	            self.c_proj = nn.Linear(config.n_inner, config.n_embd)
	        else:
	            # Shrink the hidden width to offset the third projection, then
	            # round up to the next multiple of 8.
	            two_thirds = int(2 * config.n_inner / 3)
	            hidden_dim = 8 * ((two_thirds + 7) // 8)

	            self.w1 = nn.Linear(config.n_embd, hidden_dim, bias=False)  # Gate
	            self.w2 = nn.Linear(config.n_embd, hidden_dim, bias=False)  # Up
	            self.w3 = nn.Linear(hidden_dim, config.n_embd, bias=False)  # Down

	        self.dropout = nn.Dropout(config.dropout)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the configured feed-forward variant followed by dropout."""
	        if self.ffn_type == "swiglu":
	            # SwiGLU: Swish(W1 x) * (W2 x), projected back down by W3.
	            gated = F.silu(self.w1(x)) * self.w2(x)
	            return self.dropout(self.w3(gated))
	        # GELU MLP: expand, activate, project back.
	        expanded = F.gelu(self.c_fc(x))
	        return self.dropout(self.c_proj(expanded))


	# ==============================================================================
	# Transformer Block
	# ==============================================================================


	class TransformerBlock(nn.Module):
	    """
	    One pre-norm transformer layer: attention then feed-forward.

	    Each sub-layer receives a layer-normalised copy of the residual stream
	    and its output is added back onto that stream.
	    """

	    def __init__(self, config: ChessConfig):
	        super().__init__()

	        width = config.n_embd
	        eps = config.layer_norm_epsilon

	        self.ln_1 = nn.LayerNorm(width, eps=eps)
	        self.attn = Attention(config)
	        self.ln_2 = nn.LayerNorm(width, eps=eps)
	        self.mlp = FeedForward(config)

	    def forward(
	        self,
	        x: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """Run the attention and feed-forward sub-layers with residual adds."""
	        # Attention sub-layer on the normalised input.
	        attended = self.attn(self.ln_1(x), attention_mask=attention_mask)
	        x = x + attended
	        # Feed-forward sub-layer on the normalised, updated stream.
	        return x + self.mlp(self.ln_2(x))


	# ==============================================================================
	# Main Model
	# ==============================================================================


	class ChessForCausalLM(PreTrainedModel, GenerationMixin):
	    """
	    Chess Transformer for Causal Language Modeling (next-move prediction).

	    The model embeds move tokens, optionally adds learned position
	    embeddings, runs a stack of pre-norm transformer blocks, applies a final
	    layer norm and projects to vocabulary logits. The output projection can
	    share its weight matrix with the token embedding.

	    The model keeps no key/value cache: every forward pass, including each
	    generation step, processes the full token sequence.

	    Example:
	        >>> # Baseline configuration
	        >>> config = ChessConfig(vocab_size=1200, n_embd=128, n_layer=6)
	        >>> model = ChessForCausalLM(config)

	        >>> # GQA with RoPE
	        >>> config = ChessConfig(
	        ...     vocab_size=1200, n_embd=128, n_layer=8,
	        ...     attention_type="gqa", n_kv_heads=2,
	        ...     pos_encoding="rope"
	        ... )
	        >>> model = ChessForCausalLM(config)
	    """

	    config_class = ChessConfig
	    base_model_prefix = "transformer"
	    supports_gradient_checkpointing = True
	    # Suppress the missing-key warning for the tied lm_head when loading.
	    # Transformers looks up the underscore-prefixed attribute; the public
	    # spelling is kept as an alias so existing references keep working.
	    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
	    keys_to_ignore_on_load_missing = _keys_to_ignore_on_load_missing

	    def __init__(self, config: ChessConfig):
	        super().__init__(config)

	        self.pos_encoding = config.pos_encoding

	        # Token embeddings (always needed)
	        self.wte = nn.Embedding(config.vocab_size, config.n_embd)

	        # Position embeddings exist only for the learned encoding; RoPE and
	        # ALiBi inject position information inside the attention layers.
	        if config.pos_encoding == "learned":
	            self.wpe = nn.Embedding(config.n_ctx, config.n_embd)
	        else:
	            self.wpe = None

	        self.drop = nn.Dropout(config.dropout)

	        # Transformer blocks
	        self.h = nn.ModuleList([
	            TransformerBlock(config) for _ in range(config.n_layer)
	        ])

	        # Final layer norm
	        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

	        # Output head
	        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

	        # Declare tied weights for proper serialization
	        if config.tie_weights:
	            self._tied_weights_keys = ["lm_head.weight"]

	        # Initialize weights
	        self.post_init()

	        # Tie weights if configured
	        if config.tie_weights:
	            self.tie_weights()

	    def get_input_embeddings(self) -> nn.Module:
	        """Return the token embedding module."""
	        return self.wte

	    def set_input_embeddings(self, new_embeddings: nn.Module):
	        """Replace the token embedding and re-tie the head if tying is on."""
	        self.wte = new_embeddings
	        if getattr(self.config, "tie_weights", False):
	            self.tie_weights()

	    def get_output_embeddings(self) -> nn.Module:
	        """Return the output projection module."""
	        return self.lm_head

	    def set_output_embeddings(self, new_embeddings: nn.Module):
	        """Replace the output projection module."""
	        self.lm_head = new_embeddings

	    def tie_weights(self):
	        """Share the embedding weights with the head when the config asks for it."""
	        # Use HF helper to tie or clone depending on config
	        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
	            self._tie_or_clone_weights(self.lm_head, self.wte)

	    def prepare_inputs_for_generation(
	        self,
	        input_ids: torch.LongTensor,
	        past_key_values: Optional[Tuple] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        **kwargs,
	    ) -> dict:
	        """
	        Build the keyword arguments for one generation step.

	        The model has no KV cache, so the complete ``input_ids`` prefix is
	        always forwarded. Truncating to the last token whenever a cache
	        object is supplied would leave the model without any context and
	        would no longer match the full-length ``attention_mask``.
	        """
	        return {
	            "input_ids": input_ids,
	            "attention_mask": attention_mask,
	            # Passed through for interface compatibility; forward ignores it.
	            "past_key_values": past_key_values,
	        }

	    def _init_weights(self, module: nn.Module):
	        """Initialize weights following GPT-2 style."""
	        if isinstance(module, nn.Linear):
	            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
	            if module.bias is not None:
	                torch.nn.init.zeros_(module.bias)
	        elif isinstance(module, nn.Embedding):
	            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
	        elif isinstance(module, nn.LayerNorm):
	            torch.nn.init.ones_(module.weight)
	            torch.nn.init.zeros_(module.bias)

	    def forward(
	        self,
	        input_ids: torch.LongTensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        return_dict: Optional[bool] = None,
	        legal_token_ids: Optional[List[List[int]]] = None,
	        **kwargs,
	    ) -> Union[Tuple, CausalLMOutputWithPast]:
	        """
	        Forward pass of the model.

	        Args:
	            input_ids: Token IDs of shape (batch_size, seq_len).
	            attention_mask: Attention mask of shape (batch_size, seq_len).
	            position_ids: Position IDs of shape (batch_size, seq_len). Only
	                used with learned position embeddings; defaults to
	                ``0..seq_len-1``.
	            labels: Labels for language modeling loss; entries equal to -100
	                are ignored.
	            return_dict: Whether to return a ModelOutput object. Defaults to
	                ``config.use_return_dict``.
	            legal_token_ids: Optional per-sample lists of token ids. Used only
	                when ``labels`` is given and ``config.legal_loss_weight > 0``
	                to add the auxiliary legal-move loss.
	            **kwargs: Accepted and ignored.

	        Returns:
	            CausalLMOutputWithPast containing loss (if labels provided) and
	            logits, or the tuple ``(loss, logits)`` / ``(logits,)`` when
	            ``return_dict`` is false.
	        """
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        batch_size, seq_len = input_ids.size()
	        device = input_ids.device

	        # Get token embeddings
	        hidden_states = self.wte(input_ids)

	        # Add position embeddings only for learned encoding
	        if self.pos_encoding == "learned":
	            if position_ids is None:
	                position_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
	            hidden_states = hidden_states + self.wpe(position_ids)

	        # Apply dropout
	        hidden_states = self.drop(hidden_states)

	        # Pass through transformer blocks
	        for block in self.h:
	            hidden_states = block(hidden_states, attention_mask=attention_mask)

	        # Final layer norm and projection to vocabulary logits
	        hidden_states = self.ln_f(hidden_states)
	        logits = self.lm_head(hidden_states)

	        # Compute loss if labels are provided
	        loss = None
	        if labels is not None:
	            # Shift so that position t predicts token t + 1.
	            shift_logits = logits[..., :-1, :].contiguous()
	            shift_labels = labels[..., 1:].contiguous()

	            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
	            loss = loss_fct(
	                shift_logits.view(-1, shift_logits.size(-1)),
	                shift_labels.view(-1),
	            )

	            if self.config.legal_loss_weight > 0 and legal_token_ids:
	                aux_loss = self._legal_move_loss(logits, labels, legal_token_ids)
	                if aux_loss is not None:
	                    loss = loss + self.config.legal_loss_weight * aux_loss

	        if not return_dict:
	            output = (logits,)
	            return ((loss,) + output) if loss is not None else output

	        return CausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=None,
	            hidden_states=None,
	            attentions=None,
	        )

	    def _legal_move_loss(
	        self,
	        logits: torch.Tensor,
	        labels: torch.Tensor,
	        legal_token_ids: List[List[int]],
	    ) -> Optional[torch.Tensor]:
	        """
	        Auxiliary loss penalising probability mass on non-listed tokens.

	        For each sample that has a non-empty id list, the last label position
	        that is neither -100 nor a pad/bos/eos id is located, and the logits
	        one step before it are scored with
	        ``logsumexp(all logits) - logsumexp(listed logits)``, i.e. the
	        negative log of the probability assigned to the listed tokens.

	        Returns:
	            The mean over contributing samples, or None when no sample
	            contributed.
	        """
	        # Special ids are the same for every sample; resolve them once.
	        special_ids = [
	            int(special_id)
	            for special_id in (
	                getattr(self.config, "pad_token_id", None),
	                getattr(self.config, "bos_token_id", None),
	                getattr(self.config, "eos_token_id", None),
	            )
	            if special_id is not None
	        ]

	        batch_size = logits.size(0)
	        total_loss = logits.new_tensor(0.0)
	        count = 0

	        for batch_idx in range(batch_size):
	            # Samples beyond the supplied lists, or with an empty list, are skipped.
	            if batch_idx >= len(legal_token_ids):
	                continue
	            legal_ids = legal_token_ids[batch_idx]
	            if not legal_ids:
	                continue

	            label_row = labels[batch_idx]
	            valid_mask = label_row != -100
	            for special_id in special_ids:
	                valid_mask = valid_mask & (label_row != special_id)

	            valid_positions = valid_mask.nonzero(as_tuple=False)
	            if valid_positions.numel() == 0:
	                continue

	            # Logits at position p - 1 are the prediction for the token at p.
	            last_pos = int(valid_positions[-1].item())
	            pred_pos = last_pos - 1
	            if pred_pos < 0:
	                continue

	            logits_slice = logits[batch_idx, pred_pos]
	            legal_logits = logits_slice.index_select(
	                0,
	                torch.tensor(legal_ids, device=logits_slice.device, dtype=torch.long),
	            )

	            loss = torch.logsumexp(logits_slice, dim=-1) - torch.logsumexp(legal_logits, dim=-1)
	            total_loss = total_loss + loss
	            count += 1

	        if count == 0:
	            return None
	        return total_loss / count

	    @torch.no_grad()
	    def generate_move(
	        self,
	        input_ids: torch.LongTensor,
	        temperature: float = 1.0,
	        top_k: Optional[int] = None,
	        top_p: Optional[float] = None,
	    ) -> int:
	        """
	        Generate the next move given a sequence of moves.

	        Switches the model to eval mode as a side effect.

	        Args:
	            input_ids: Token IDs of shape (1, seq_len).
	            temperature: Sampling temperature (1.0 = no change). A value of
	                0 or below selects the highest-scoring token deterministically
	                instead of dividing by it.
	            top_k: If set to a positive value, only sample from the top k
	                tokens (clamped to the vocabulary size).
	            top_p: If set, use nucleus sampling with this threshold.

	        Returns:
	            The token ID of the predicted next move.
	        """
	        self.eval()

	        # Request the dict output explicitly so `.logits` is available even
	        # when the config disables return_dict.
	        outputs = self(input_ids, return_dict=True)
	        logits = outputs.logits[:, -1, :]

	        # Greedy decoding avoids a division by zero for temperature == 0.
	        if temperature <= 0:
	            return int(logits.argmax(dim=-1).item())

	        logits = logits / temperature

	        # Apply top-k filtering
	        if top_k is not None and top_k > 0:
	            k = min(int(top_k), logits.size(-1))
	            kth_value = torch.topk(logits, k)[0][..., -1, None]
	            logits = logits.masked_fill(logits < kth_value, float("-inf"))

	        # Apply top-p (nucleus) filtering
	        if top_p is not None:
	            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
	            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

	            # Drop tokens once the cumulative probability exceeds the
	            # threshold; shifting by one keeps the token that crosses it, and
	            # the best token is always kept.
	            sorted_indices_to_remove = cumulative_probs > top_p
	            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
	            sorted_indices_to_remove[..., 0] = False

	            indices_to_remove = sorted_indices_to_remove.scatter(
	                dim=-1, index=sorted_indices, src=sorted_indices_to_remove
	            )
	            logits = logits.masked_fill(indices_to_remove, float("-inf"))

	        # Sample from the distribution
	        probs = F.softmax(logits, dim=-1)
	        next_token = torch.multinomial(probs, num_samples=1)

	        return next_token.item()


	# Register the model with Auto classes for easy loading.
	# Runs at import time; the string key matches ChessConfig.model_type.
	from transformers import AutoConfig, AutoModelForCausalLM

	# Map the "chess_transformer" model type to its configuration class.
	AutoConfig.register("chess_transformer", ChessConfig)
	# Map the configuration class to the causal-LM model class.
	AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)