"""
Chess Transformer Model for the Chess Challenge.
This module provides a simple GPT-style transformer architecture
designed to fit within the 1M parameter constraint.
Key components:
- ChessConfig: Configuration class for model hyperparameters
- ChessForCausalLM: The main model class for next-move prediction
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
class ChessConfig(PretrainedConfig):
"""
Configuration class for the Chess Transformer model.
"""
model_type = "chess_transformer"
def __init__(
self,
vocab_size: int = 200, # Approx size for component vocab
n_embd: int = 120, # Reduced to be divisible by heads and fit budget
n_layer: int = 6,
n_head: int = 4,
n_ctx: int = 250, # Max moves (not tokens)
n_inner: Optional[int] = None,
dropout: float = 0.1,
layer_norm_epsilon: float = 1e-5,
tie_weights: bool = True,
pad_token_id: int = 0,
bos_token_id: int = 1,
eos_token_id: int = 2,
**kwargs,
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_ctx = n_ctx
self.n_inner = n_inner if n_inner is not None else 3 * n_embd
self.dropout = dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.tie_weights = tie_weights
self.tie_word_embeddings = bool(tie_weights)
class MultiHeadAttention(nn.Module):
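    """
    Standard multi-head causal self-attention over the move sequence,
    using a cached lower-triangular mask to enforce causality.
    """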
def __init__(self, config: ChessConfig):
super().__init__()
assert config.n_embd % config.n_head == 0
self.n_head = config.n_head
self.n_embd = config.n_embd
self.head_dim = config.n_embd // config.n_head
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
self.c_proj = nn.Linear(config.n_embd, config.n_embd)
self.dropout = nn.Dropout(config.dropout)
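        # Cached lower-triangular causal mask (1 = may attend), sized to the
        # maximum number of moves; persistent=False keeps it out of checkpoints.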
self.register_buffer(
"bias",
torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(
1, 1, config.n_ctx, config.n_ctx
),
persistent=False,
)
def forward(self, x, attention_mask=None):
batch_size, seq_len, _ = x.size()
qkv = self.c_attn(x)
q, k, v = qkv.split(self.n_embd, dim=2)
q = q.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
k = k.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
v = v.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
causal_mask = self.bias[:, :, :seq_len, :seq_len]
attn_weights = attn_weights.masked_fill(causal_mask == 0, float("-inf"))
if attention_mask is not None:
# Mask should be broadcastable
attn_weights = attn_weights + attention_mask
attn_weights = F.softmax(attn_weights, dim=-1)
attn_weights = self.dropout(attn_weights)
attn_output = torch.matmul(attn_weights, v)
attn_output = attn_output.transpose(1, 2).contiguous().view(
batch_size, seq_len, self.n_embd
)
return self.c_proj(attn_output)
class FeedForward(nn.Module):
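    """
    Position-wise feed-forward network: linear -> GELU -> linear -> dropout.
    """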
def __init__(self, config: ChessConfig):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, config.n_inner)
self.c_proj = nn.Linear(config.n_inner, config.n_embd)
self.dropout = nn.Dropout(config.dropout)
def forward(self, x):
x = self.c_fc(x)
x = F.gelu(x)
x = self.c_proj(x)
x = self.dropout(x)
return x
class TransformerBlock(nn.Module):
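    """
    Pre-LayerNorm transformer block: self-attention and feed-forward sublayers,
    each wrapped in a residual connection.
    """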
def __init__(self, config: ChessConfig):
super().__init__()
self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.attn = MultiHeadAttention(config)
self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.mlp = FeedForward(config)
def forward(self, x, attention_mask=None):
x = x + self.attn(self.ln_1(x), attention_mask=attention_mask)
x = x + self.mlp(self.ln_2(x))
return x
class ChessForCausalLM(PreTrainedModel):
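    """
    GPT-style decoder that treats each chess move as a group of five tokens
    (color, piece, source square, destination square, suffix). The five
    component embeddings are combined into a single move embedding, and five
    parallel heads predict the components of the next move.
    """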
config_class = ChessConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
def __init__(self, config: ChessConfig):
super().__init__(config)
# Component embeddings (Color, Piece, Src, Dst, Suffix)
self.wte_color = nn.Embedding(config.vocab_size, config.n_embd)
self.wte_piece = nn.Embedding(config.vocab_size, config.n_embd)
self.wte_src = nn.Embedding(config.vocab_size, config.n_embd)
self.wte_dst = nn.Embedding(config.vocab_size, config.n_embd)
self.wte_suf = nn.Embedding(config.vocab_size, config.n_embd)
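        # For simplicity every table is sized to the full vocab_size, even though
        # each component presumably uses only its own slice of the vocabulary.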
self.wpe = nn.Embedding(config.n_ctx, config.n_embd)
self.drop = nn.Dropout(config.dropout)
self.h = nn.ModuleList([
TransformerBlock(config) for _ in range(config.n_layer)
])
self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        # Five heads, one per component of the next move.
        # We model p(NextMove | History); the components are predicted as
        # conditionally independent given the history (a simplification; they
        # could instead be made autoregressive within the move). With the
        # "product encoding" used for the inputs, parallel prediction is natural.
self.head_color = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.head_piece = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.head_src = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.head_dst = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.head_suf = nn.Linear(config.n_embd, config.vocab_size, bias=False)
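        # The five heads are independent Linear projections; they are not
        # weight-tied to the component embedding tables.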
self.post_init()
def _init_weights(self, module):
if isinstance(module, nn.Linear):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
elif isinstance(module, nn.LayerNorm):
torch.nn.init.ones_(module.weight)
torch.nn.init.zeros_(module.bias)
    def get_input_embeddings(self):
        # There are five component embedding tables; return the color table as a
        # representative, since callers expect a single embedding module.
        return self.wte_color
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
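        """
        input_ids is a flat token sequence of length 5 * num_moves, laid out as
        (color, piece, src, dst, suffix) per move; labels and attention_mask,
        when given, share that layout. Logits are returned flattened back to
        (batch, 5 * num_moves, vocab_size) to match the input length.
        """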
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
batch_size, seq_len = input_ids.size()
        # Ensure the sequence length is a multiple of 5 (one move = 5 tokens).
        if seq_len % 5 != 0:
            # Training batches are expected to be move-aligned; if not, drop the
            # trailing partial move rather than padding.
            new_len = (seq_len // 5) * 5
input_ids = input_ids[:, :new_len]
if labels is not None:
labels = labels[:, :new_len]
if attention_mask is not None:
attention_mask = attention_mask[:, :new_len]
seq_len = new_len
num_moves = seq_len // 5
# Reshape to (B, L, 5)
# Components: 0=Color, 1=Piece, 2=Src, 3=Dst, 4=Suf
reshaped_ids = input_ids.view(batch_size, num_moves, 5)
# Product Embedding
emb_c = self.wte_color(reshaped_ids[:, :, 0])
emb_p = self.wte_piece(reshaped_ids[:, :, 1])
emb_s = self.wte_src(reshaped_ids[:, :, 2])
emb_d = self.wte_dst(reshaped_ids[:, :, 3])
emb_f = self.wte_suf(reshaped_ids[:, :, 4])
        # Element-wise product of the five component embeddings yields a single
        # n_embd-sized vector per move (the "product encoding").
token_embeds = emb_c * emb_p * emb_s * emb_d * emb_f
# Position Embeddings
device = input_ids.device
if position_ids is None:
position_ids = torch.arange(num_moves, device=device).unsqueeze(0)
position_embeds = self.wpe(position_ids)
hidden_states = self.drop(token_embeds + position_embeds)
        # Attention mask adaptation: the input mask is token-level (B, 5L), but
        # attention runs over moves, so it must be reduced to (B, L).
        if attention_mask is not None:
            reshaped_mask = attention_mask.view(batch_size, num_moves, 5)
            # A move position is kept only if all five component tokens are real;
            # padding is assumed to be consistent within a move.
            chess_mask = reshaped_mask.all(dim=-1).float()  # (B, L)
            # Additive mask broadcastable over attention scores: (B, 1, 1, L)
            extended_attention_mask = (1.0 - chess_mask) * -10000.0
            extended_attention_mask = extended_attention_mask.unsqueeze(1).unsqueeze(2)
else:
extended_attention_mask = None
# Transformer
for block in self.h:
hidden_states = block(hidden_states, attention_mask=extended_attention_mask)
hidden_states = self.ln_f(hidden_states)
# Output Heads (Predicting Next Move Components)
logits_c = self.head_color(hidden_states)
logits_p = self.head_piece(hidden_states)
logits_s = self.head_src(hidden_states)
logits_d = self.head_dst(hidden_states)
logits_f = self.head_suf(hidden_states)
# Stack logits: (B, L, 5, V)
logits_stacked = torch.stack([logits_c, logits_p, logits_s, logits_d, logits_f], dim=2)
# Compute Loss
loss = None
if labels is not None:
# Reshape labels: (B, L, 5)
labels_reshaped = labels.view(batch_size, num_moves, 5)
            # Shift so that the hidden state at move t predicts the components of move t+1.
shift_logits = logits_stacked[:, :-1, :, :].contiguous()
shift_labels = labels_reshaped[:, 1:, :].contiguous()
# Flatten
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(
shift_logits.view(-1, self.config.vocab_size),
shift_labels.view(-1)
)
        # Flatten the stacked logits from (B, L, 5, V) back to (B, 5L, V) so they
        # line up with the token-level inputs/labels the Trainer passes in.
        flat_logits = logits_stacked.view(batch_size, -1, self.config.vocab_size)
if not return_dict:
output = (flat_logits,)
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=flat_logits,
)
@torch.no_grad()
def generate_move(
self,
input_ids: torch.LongTensor,
temperature: float = 1.0,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
) -> List[int]:
"""
Generate the next move (5 tokens).
"""
self.eval()
# Forward pass
# input_ids (1, 5L)
outputs = self(input_ids)
# Logits: (1, 5L, V)
# We want the last move prediction.
# The logits for the NEXT move are at the very end.
# Specifically, the last block of 5 logits corresponds to predictions from the last hidden state.
# Check dimensions
next_move_logits = outputs.logits[:, -5:, :] # (1, 5, V)
generated = []
for i in range(5):
logits = next_move_logits[:, i, :] / temperature
            # Top-k filtering: keep only the k highest-scoring tokens.
if top_k is not None:
v, _ = torch.topk(logits, top_k)
logits[logits < v[:, [-1]]] = -float('Inf')
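            # Top-p (nucleus) filtering (a minimal, standard sketch): keep the
            # smallest set of tokens whose cumulative probability exceeds top_p.
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift right so the first token past the threshold is also kept.
                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
                sorted_indices_to_remove[:, 0] = False
                indices_to_remove = sorted_indices_to_remove.scatter(
                    1, sorted_indices, sorted_indices_to_remove
                )
                logits[indices_to_remove] = -float("Inf")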
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
generated.append(next_token.item())
return generated
# Register the custom architecture with the Auto classes so it can be loaded
# via AutoConfig / AutoModelForCausalLM.
from transformers import AutoConfig, AutoModelForCausalLM
AutoConfig.register("chess_transformer", ChessConfig)
AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
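# Minimal smoke test (a sketch): builds the default model, reports the parameter
# count, and samples one move from a dummy history. The token IDs below are
# random placeholders, not a real encoded game.
if __name__ == "__main__":
    config = ChessConfig()
    model = ChessForCausalLM(config)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Parameters: {n_params:,}")

    # Dummy history of two moves (2 * 5 component tokens), batch size 1.
    dummy_ids = torch.randint(0, config.vocab_size, (1, 10))
    out = model(dummy_ids, labels=dummy_ids)
    print(f"Logits shape: {tuple(out.logits.shape)}, loss: {out.loss.item():.3f}")

    next_move = model.generate_move(dummy_ids, temperature=1.0, top_k=10)
    print(f"Sampled next-move components: {next_move}")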