Chess Challenge submission by ykolo

7b8a427 verified about 2 months ago

10.9 kB

	"""
	Compositional Chess Tokenizer for the Chess Challenge.

	This tokenizer decomposes chess moves into meaningful components:
	- Color (W/B), Piece (P/N/B/R/Q/K), Squares, Actions, Modifiers

	Reduces vocabulary from 3803 to 86 tokens while enabling better generalization.

	Example:
	WPe2e4 -> [W, P, e2, ->, e4]
	BNg8f6(x) -> [B, N, g8, x, f6]
	"""

	from __future__ import annotations

	import json
	import os
	import re
	from typing import Dict, List, Optional

	from transformers import PreTrainedTokenizer


	class ChessTokenizer(PreTrainedTokenizer):
	"""Compositional tokenizer for chess moves (86 tokens vs 3803 baseline)."""

	model_input_names = ["input_ids", "attention_mask"]
	vocab_files_names = {"vocab_file": "vocab.json"}

	PAD_TOKEN = "[PAD]"
	BOS_TOKEN = "[BOS]"
	EOS_TOKEN = "[EOS]"
	UNK_TOKEN = "[UNK]"
	# Use ASCII-safe tokens to avoid encoding issues and mismatches
	MOVE_ARROW = "->"
	CAPTURE_CROSS = "x"

	def __init__(self, vocab_file: Optional[str] = None, vocab: Optional[Dict[str, int]] = None, **kwargs):
	"""Initialize compositional chess tokenizer."""
	self._pad_token = self.PAD_TOKEN
	self._bos_token = self.BOS_TOKEN
	self._eos_token = self.EOS_TOKEN
	self._unk_token = self.UNK_TOKEN

	kwargs.pop("pad_token", None)
	kwargs.pop("bos_token", None)
	kwargs.pop("eos_token", None)
	kwargs.pop("unk_token", None)

	if vocab is not None:
	self._vocab = vocab
	elif vocab_file is not None and os.path.exists(vocab_file):
	with open(vocab_file, "r", encoding="utf-8") as f:
	self._vocab = json.load(f)
	else:
	print("Building compositional vocabulary (86 tokens)...")
	self._vocab = self._build_compositional_vocab()

	self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

	super().__init__(
	pad_token=self._pad_token,
	bos_token=self._bos_token,
	eos_token=self._eos_token,
	unk_token=self._unk_token,
	**kwargs,
	)

	def _build_compositional_vocab(self) -> Dict[str, int]:
	"""Build 86-token compositional vocabulary."""
	vocab = {}
	idx = 0

	# Special tokens (4 tokens)
	for token in [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]:
	vocab[token] = idx
	idx += 1

	# Colors (2 tokens: W, B)
	for color in ["W", "B"]:
	vocab[color] = idx
	idx += 1

	# Pieces (6 tokens: P, N, B, R, Q, K)
	for piece in ["P", "N", "B", "R", "Q", "K"]:
	vocab[piece] = idx
	idx += 1

	# Squares (64 tokens: a1-h8)
	for f in ["a", "b", "c", "d", "e", "f", "g", "h"]:
	for r in ["1", "2", "3", "4", "5", "6", "7", "8"]:
	vocab[f + r] = idx
	idx += 1

	# Actions (2 tokens: →move, ×capture)
	vocab[self.MOVE_ARROW] = idx
	idx += 1
	vocab[self.CAPTURE_CROSS] = idx
	idx += 1

	# Modifiers (6 tokens: +check, +*checkmate, =Q/R/B/N promotions)
	for mod in ["+", "+*", "=Q", "=R", "=B", "=N"]:
	vocab[mod] = idx
	idx += 1

	# Special moves (2 tokens: O-O, O-O-O)
	for move in ["O-O", "O-O-O"]:
	vocab[move] = idx
	idx += 1

	return vocab

	@property
	def vocab_size(self) -> int:
	"""Return vocabulary size."""
	return len(self._vocab)

	def get_vocab(self) -> Dict[str, int]:
	"""Return vocabulary dictionary."""
	return dict(self._vocab)

	def _decompose_move(self, move: str) -> List[str]:
	"""Decompose chess move into component tokens.

	Args:
	move: Chess move in format WPe2e4 or BNg8f6(x)

	Returns:
	List of component tokens [color, piece, from_sq, action, to_sq, modifiers...]
	"""
	move = move.strip()

	# Handle castling
	if "O-O-O" in move or "o-o-o" in move.lower():
	return [move[0], "O-O-O"] # Color + castling
	elif "O-O" in move or "o-o" in move.lower():
	return [move[0], "O-O"] # Color + castling

	# Basic validation
	if len(move) < 6:
	return [self.UNK_TOKEN]

	# Extract basic components
	tokens = [
	move[0], # Color (W/B)
	move[1], # Piece (P/N/B/R/Q/K)
	move[2:4] # From square (e.g., e2)
	]

	# Determine action (capture vs move)
	is_capture = "(x)" in move or "(x+" in move or "(x+*)" in move
	tokens.append(self.CAPTURE_CROSS if is_capture else self.MOVE_ARROW)

	# To square
	tokens.append(move[4:6])

	# Add modifiers (check, checkmate)
	if "(+)" in move or "(x+)" in move:
	tokens.append("+*")
	elif "(+)" in move or "(x+)" in move:
	tokens.append("+")

	# Handle promotions (e.g., (Q), (+Q), (xQ))
	promotion_match = re.search(r'\((?:x\s)?(?:\+\s)?([QRBN])\)', move)
	if promotion_match:
	tokens.append(f"={promotion_match.group(1)}")

	return tokens

	def _tokenize(self, text: str) -> List[str]:
	"""Tokenize string of moves into component tokens.

	Args:
	text: String of space-separated chess moves

	Returns:
	List of all component tokens
	"""
	all_tokens = []
	for move in text.strip().split():
	all_tokens.extend(self._decompose_move(move))
	return all_tokens

	def _convert_token_to_id(self, token: str) -> int:
	"""Convert token to ID."""
	return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 3))

	def _convert_id_to_token(self, index: int) -> str:
	"""Convert ID to token."""
	return self._ids_to_tokens.get(index, self.UNK_TOKEN)

	def convert_tokens_to_string(self, tokens: List[str]) -> str:
	"""Convert component tokens back to move strings.

	Args:
	tokens: List of component tokens

	Returns:
	String of reconstructed chess moves
	"""
	# Filter out special tokens
	special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
	tokens = [t for t in tokens if t not in special]

	moves = []
	i = 0
	while i < len(tokens):
	# Handle castling
	if i + 1 < len(tokens) and tokens[i + 1] in ["O-O", "O-O-O"]:
	# Castling is just color + castle notation
	i += 2
	continue

	# Regular move: need at least 5 tokens [color, piece, from_sq, action, to_sq]
	if i + 4 < len(tokens):
	color, piece, from_sq, action, to_sq = tokens[i:i+5]
	move = f"{color}{piece}{from_sq}{to_sq}"

	# Collect modifiers that follow
	j = i + 5
	while j < len(tokens) and tokens[j] not in ["W", "B"]:
	mod = tokens[j]
	if mod == "+":
	move += "(+)"
	elif mod == "+*":
	move += "(+*)"
	elif mod.startswith("="):
	move += f"({mod[1]})" # Promotion
	j += 1

	# Add capture marker if action was capture
	if action == self.CAPTURE_CROSS and "(x)" not in move:
	move += "(x)"

	moves.append(move)
	i = j
	else:
	i += 1

	return " ".join(moves)

	def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
	"""Build model inputs with BOS/EOS tokens."""
	bos_id = self._vocab[self.BOS_TOKEN]
	eos_id = self._vocab[self.EOS_TOKEN]

	def wrap(ids: List[int]) -> List[int]:
	out = list(ids)
	# Only add BOS if missing
	if len(out) == 0 or out[0] != bos_id:
	out = [bos_id] + out
	# Only add EOS if missing
	if len(out) == 0 or out[-1] != eos_id:
	out = out + [eos_id]
	return out

	if token_ids_1 is None:
	return wrap(token_ids_0)

	# For pair inputs, keep it simple/consistent: wrap concatenation
	return wrap(token_ids_0 + token_ids_1)

	def get_special_tokens_mask(
	self,
	token_ids_0: List[int],
	token_ids_1: Optional[List[int]] = None,
	already_has_special_tokens: bool = False
	) -> List[int]:
	"""Return mask identifying special tokens."""
	bos_id = self._vocab[self.BOS_TOKEN]
	eos_id = self._vocab[self.EOS_TOKEN]
	pad_id = self._vocab[self.PAD_TOKEN]
	unk_id = self._vocab[self.UNK_TOKEN]

	def mask_for(ids: List[int]) -> List[int]:
	return [1 if tid in {bos_id, eos_id, pad_id, unk_id} else 0 for tid in ids]

	if already_has_special_tokens:
	if token_ids_1 is None:
	return mask_for(token_ids_0)
	return mask_for(token_ids_0 + token_ids_1)

	# No special tokens yet: build mask consistent with build_inputs_with_special_tokens
	if token_ids_1 is None:
	return [1] + ([0] * len(token_ids_0)) + [1]
	return [1] + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]

	def create_token_type_ids_from_sequences(
	self,
	token_ids_0: List[int],
	token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""Create token type IDs (all zeros for single sequence type)."""
	# Build final ids consistently with build_inputs_with_special_tokens
	final_ids = self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
	return [0] * len(final_ids)

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
	"""Save vocabulary to JSON file."""
	if not os.path.isdir(save_directory):
	os.makedirs(save_directory, exist_ok=True)

	vocab_file = os.path.join(
	save_directory,
	(filename_prefix + "-" if filename_prefix else "") + "vocab.json"
	)

	with open(vocab_file, "w", encoding="utf-8") as f:
	json.dump(self._vocab, f, ensure_ascii=False, indent=2)

	return (vocab_file,)