""" Role-marked square tokenizer for Chess Challenge. Each move is represented as: _f _t [promo?] [EOS] Examples: WPe2e4 -> e2_f e4_t [EOS] BPe7e8=Q -> e7_f e8_t q [EOS] """ from __future__ import annotations import json import os import re from typing import Dict, List, Optional, Tuple, Any from transformers import PreTrainedTokenizer _MOVE_RE = re.compile(r"^([WB])([PNBRQK])([a-h][1-8])([a-h][1-8])(.*)$") _PROMO_RE = re.compile(r"=([QRBNqrbn])") SQUARES = [f"{f}{r}" for r in "12345678" for f in "abcdefgh"] PROMOS = ["q", "r", "b", "n"] class ChessTokenizer(PreTrainedTokenizer): model_input_names = ["input_ids", "attention_mask"] vocab_files_names = {"vocab_file": "vocab.json"} PAD_TOKEN = "[PAD]" BOS_TOKEN = "[BOS]" EOS_TOKEN = "[EOS]" UNK_TOKEN = "[UNK]" def __init__( self, vocab_file: Optional[str] = None, vocab: Optional[Dict[str, int]] = None, **kwargs: Any, ): self._pad_token = self.PAD_TOKEN self._bos_token = self.BOS_TOKEN self._eos_token = self.EOS_TOKEN self._unk_token = self.UNK_TOKEN for k in ["pad_token", "bos_token", "eos_token", "unk_token"]: kwargs.pop(k, None) if vocab is not None: self._vocab = vocab elif vocab_file is not None and os.path.isfile(vocab_file): with open(vocab_file, "r", encoding="utf-8") as f: self._vocab = json.load(f) else: self._vocab = self._build_fixed_vocab() self._ids_to_tokens = {v: k for k, v in self._vocab.items()} super().__init__( pad_token=self._pad_token, bos_token=self._bos_token, eos_token=self._eos_token, unk_token=self._unk_token, **kwargs, ) @staticmethod def _build_fixed_vocab() -> Dict[str, int]: tokens: List[str] = [ ChessTokenizer.PAD_TOKEN, ChessTokenizer.BOS_TOKEN, ChessTokenizer.EOS_TOKEN, ChessTokenizer.UNK_TOKEN, ] tokens += [f"{sq}_f" for sq in SQUARES] tokens += [f"{sq}_t" for sq in SQUARES] tokens += PROMOS return {tok: i for i, tok in enumerate(tokens)} @property def vocab_size(self) -> int: return len(self._vocab) def get_vocab(self) -> Dict[str, int]: return dict(self._vocab) @classmethod def build_vocab_from_dataset(cls, *args: Any, **kwargs: Any) -> "ChessTokenizer": return cls() @classmethod def build_vocab_from_iterator(cls, *args: Any, **kwargs: Any) -> "ChessTokenizer": return cls() def _tokenize(self, text: str) -> List[str]: """Tokenize a space-separated list of dataset moves into role-marked tokens.""" text = (text or "").strip() if not text: return [] out: List[str] = [] for move in text.split(): # Allow already-tokenized text (debugging) if move in self._vocab: out.append(move) continue # Try to match Standard Lichess Format (WPe2e4) m = _MOVE_RE.match(move) if not m: # If it's plain UCI like e2e4 or e7e8q if re.fullmatch(r"[a-h][1-8][a-h][1-8][qrbn]?", move): src, dst = move[:2], move[2:4] out.append(f"{src}_f") out.append(f"{dst}_t") if len(move) == 5: out.append(move[4]) out.append(self.EOS_TOKEN) continue # Unknown token out.append(self.UNK_TOKEN) out.append(self.EOS_TOKEN) continue # Extract parts from WPe2e4... _side, _piece, src, dst, suffix = m.groups() out.append(f"{src}_f") out.append(f"{dst}_t") promo = None pm = _PROMO_RE.search(suffix or "") if pm: promo = pm.group(1).lower() if promo in PROMOS: out.append(promo) out.append(self.EOS_TOKEN) return out def _convert_token_to_id(self, token: str) -> int: return self._vocab.get(token, self._vocab[self.UNK_TOKEN]) def _convert_id_to_token(self, index: int) -> str: return self._ids_to_tokens.get(index, self.UNK_TOKEN) def convert_tokens_to_string(self, tokens: List[str]) -> str: return " ".join(tokens) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: os.makedirs(save_directory, exist_ok=True) name = "vocab.json" if not filename_prefix else f"{filename_prefix}-vocab.json" path = os.path.join(save_directory, name) with open(path, "w", encoding="utf-8") as f: json.dump(self._vocab, f, indent=2, ensure_ascii=False) return (path,)