Chiensaucisse67 committed
Commit 0a626fd · verified · 1 Parent(s): 5360744

Chess Challenge submission by Chiensaucisse67

Files changed (8)
  1. README.md +26 -0
  2. config.json +28 -0
  3. model.py +307 -0
  4. model.safetensors +3 -0
  5. special_tokens_map.json +6 -0
  6. tokenizer.py +565 -0
  7. tokenizer_config.json +44 -0
  8. vocab.json +81 -0
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ library_name: transformers
+ tags:
+ - chess
+ - llm-course
+ - chess-challenge
+ license: mit
+ ---
+
+ # chess-trm-powerful
+
+ Chess model submitted to the LLM Course Chess Challenge.
+
+ ## Submission Info
+
+ - **Submitted by**: [Chiensaucisse67](https://huggingface.co/Chiensaucisse67)
+ - **Parameters**: 994,464
+ - **Organization**: LLM-course
+
+ ## Model Details
+
+ - **Architecture**: Chess Transformer (GPT-style)
+ - **Vocab size**: 79
+ - **Embedding dim**: 216
+ - **Layers**: 2
+ - **Heads**: 4
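A minimal loading sketch for reference (the repo id is illustrative; the custom tokenizer class is imported straight from this repo's `tokenizer.py`, since `tokenizer_config.json` does not declare an `auto_map` entry for `AutoTokenizer`):

```python
# Minimal loading sketch (repo id illustrative; run from a checkout of this repo
# so that tokenizer.py and vocab.json are importable/readable).
from transformers import AutoModelForCausalLM
from tokenizer import EnhancedCoordinateTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "LLM-course/chess-trm-powerful", trust_remote_code=True
)
tokenizer = EnhancedCoordinateTokenizer(vocab_file="vocab.json")

# Predict the next coordinate token after 1. e4 e5
ids = tokenizer("WPe2e4 BPe7e5", return_tensors="pt")["input_ids"]
next_id = model(input_ids=ids).logits[0, -1].argmax().item()
print(tokenizer.convert_ids_to_tokens([next_id]))
```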
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "architectures": [
+     "ChessForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "model.ChessConfig",
+     "AutoModelForCausalLM": "model.ChessForCausalLM"
+   },
+   "bos_token_id": 1,
+   "dropout": 0.0,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "h_cycles": 2,
+   "l_cycles": 2,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "chess_transformer",
+   "n_ctx": 256,
+   "n_embd": 216,
+   "n_head": 4,
+   "n_inner": 560,
+   "n_layers": null,
+   "n_layers_per_block": 2,
+   "pad_token_id": 0,
+   "rope_theta": 10000.0,
+   "tie_weights": true,
+   "transformers_version": "4.57.1",
+   "vocab_size": 79
+ }
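With these values the shared block is applied `h_cycles * (l_cycles + 1) = 6` times per forward pass (matching the loops in `model.py` below), i.e. 12 layer applications from only 2 physical layers. A quick sanity check of that arithmetic:

```python
# Effective depth implied by config.json (mirrors the recursion in model.py).
h_cycles, l_cycles, n_layers_per_block = 2, 2, 2

# Each cycle = l_cycles L-steps + 1 H-step, each step one pass through the shared block.
module_calls = h_cycles * (l_cycles + 1)
layer_applications = module_calls * n_layers_per_block
print(module_calls, layer_applications)  # 6 12
```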
model.py ADDED
@@ -0,0 +1,307 @@
+ """
+ TRM (Tiny Recursive Model) adapted for Causal Language Modeling (Chess).
+ Based on the official implementation: TinyRecursiveModels/models/recursive_reasoning/trm.py
+ """
+
+ from __future__ import annotations
+
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PretrainedConfig, PreTrainedModel
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ # -----------------------------------------------------------------------------
+ # Configuration
+ # -----------------------------------------------------------------------------
+
+ class ChessConfig(PretrainedConfig):
+     model_type = "chess_transformer"
+
+     def __init__(
+         self,
+         vocab_size: int = 1200,
+         n_embd: int = 128,
+         n_head: int = 4,
+         n_ctx: int = 256,
+         h_cycles: int = 2,  # Number of high-level reasoning cycles
+         l_cycles: int = 2,  # Number of low-level reasoning cycles per H-cycle
+         n_layers_per_block: int = 1,  # Number of physical layers in the shared block
+         n_inner: Optional[int] = None,
+         n_layer: Optional[int] = None,  # Not used directly; effective depth = h_cycles * (l_cycles + 1) * n_layers_per_block
+         dropout: float = 0.0,  # TRM usually uses no dropout for reasoning
+         layer_norm_epsilon: float = 1e-5,
+         tie_weights: bool = True,
+         rope_theta: float = 10000.0,
+         pad_token_id: int = 0,  # 0 is the [PAD] token id in vocab.json
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+         self.vocab_size = vocab_size
+         self.n_embd = n_embd
+         self.n_head = n_head
+         self.n_ctx = n_ctx
+         self.h_cycles = h_cycles
+         self.l_cycles = l_cycles
+         self.n_layers_per_block = n_layers_per_block
+         self.n_layers = n_layer
+         self.n_inner = n_inner if n_inner is not None else int(n_embd * 8 / 3)  # SwiGLU convention
+         self.dropout = dropout
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.tie_weights = tie_weights
+         self.rope_theta = rope_theta
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x):
+         var = torch.mean(x**2, dim=-1, keepdim=True)
+         x = x * torch.rsqrt(var + self.eps)
+         return self.weight * x
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=2048, base=10000.0, device=None):
+         super().__init__()
+         self.dim = dim
+         self.base = base
+         self.max_position_embeddings = max_position_embeddings
+         self.register_buffer("inv_freq", None, persistent=False)
+         self.register_buffer("cos_cached", None, persistent=False)
+         self.register_buffer("sin_cached", None, persistent=False)
+
+     def _update_cos_sin_tables(self, x, seq_len):
+         if self.cos_cached is None or self.cos_cached.shape[0] < seq_len:
+             self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=x.device).float() / self.dim))
+             t = torch.arange(max(seq_len, self.max_position_embeddings), device=x.device).float()
+             freqs = torch.outer(t, self.inv_freq)
+             emb = torch.cat((freqs, freqs), dim=-1)
+             self.cos_cached = emb.cos()
+             self.sin_cached = emb.sin()
+
+     def forward(self, x, seq_len=None):
+         if seq_len is None:
+             seq_len = x.shape[1]
+         self._update_cos_sin_tables(x, seq_len)
+         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
+
+ def rotate_half(x):
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rotary_pos_emb(q, k, cos, sin):
+     # q, k: [batch, seq, head, dim] (before the head transpose)
+     # cos, sin: [seq, dim] -> broadcast over batch and heads
+     cos = cos.unsqueeze(0).unsqueeze(2)  # [1, seq, 1, dim]
+     sin = sin.unsqueeze(0).unsqueeze(2)
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
+
+ class MultiQueryAttention(nn.Module):
+     """
+     Multi-Query Attention (MQA) with RoPE: n_head query heads share a single K/V head.
+     Causal masking is applied inside scaled_dot_product_attention.
+     """
+     def __init__(self, config: ChessConfig):
+         super().__init__()
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.head_dim = config.n_embd // config.n_head
+
+         self.c_q = nn.Linear(config.n_embd, config.n_embd, bias=False)
+         self.c_k = nn.Linear(config.n_embd, self.head_dim, bias=False)
+         self.c_v = nn.Linear(config.n_embd, self.head_dim, bias=False)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x, cos, sin, attention_mask=None):
+         # attention_mask is accepted for API compatibility but ignored:
+         # attention is always causal over the full sequence.
+         B, T, C = x.size()
+
+         q = self.c_q(x).view(B, T, self.n_head, self.head_dim)
+         k = self.c_k(x).view(B, T, 1, self.head_dim)
+         v = self.c_v(x).view(B, T, 1, self.head_dim)
+
+         q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+         q = q.transpose(1, 2)
+         k = k.transpose(1, 2)
+         v = v.transpose(1, 2)
+
+         k = k.expand(-1, self.n_head, -1, -1)
+         v = v.expand(-1, self.n_head, -1, -1)
+
+         y = F.scaled_dot_product_attention(
+             q, k, v,
+             attn_mask=None,
+             dropout_p=self.dropout.p if self.training else 0.0,
+             is_causal=True,
+         )
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.c_proj(y)
+         return y
+
+ class SwiGLU(nn.Module):
+     def __init__(self, config: ChessConfig):
+         super().__init__()
+         self.w1 = nn.Linear(config.n_embd, config.n_inner, bias=False)
+         self.w2 = nn.Linear(config.n_embd, config.n_inner, bias=False)
+         self.w3 = nn.Linear(config.n_inner, config.n_embd, bias=False)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         x1 = self.w1(x)
+         x2 = self.w2(x)
+         hidden = F.silu(x1) * x2
+         return self.dropout(self.w3(hidden))
+
+ class TRMBlock(nn.Module):
+     def __init__(self, config: ChessConfig):
+         super().__init__()
+         self.self_attn = MultiQueryAttention(config)
+         self.mlp = SwiGLU(config)
+         self.ln_1 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
+         self.ln_2 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+     def forward(self, x, cos, sin):
+         # Post-norm residual blocks, as in the original TRM.
+         attn_out = self.self_attn(x, cos, sin)
+         x = self.ln_1(x + attn_out)
+
+         mlp_out = self.mlp(x)
+         x = self.ln_2(x + mlp_out)
+         return x
+
+ class TRMReasoningModule(nn.Module):
+     """
+     The reusable module containing shared layers.
+     Implements input injection: hidden_states = hidden_states + injection
+     """
+     def __init__(self, config: ChessConfig):
+         super().__init__()
+         self.layers = nn.ModuleList([TRMBlock(config) for _ in range(config.n_layers_per_block)])
+
+     def forward(self, hidden_states, input_injection, cos, sin):
+         hidden_states = hidden_states + input_injection
+
+         for layer in self.layers:
+             hidden_states = layer(hidden_states, cos, sin)
+
+         return hidden_states
+
+ class ChessForCausalLM(PreTrainedModel):
+     config_class = ChessConfig
+
+     def __init__(self, config: ChessConfig):
+         super().__init__(config)
+         self.config = config
+
+         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+         self.rotary = RotaryEmbedding(config.n_embd // config.n_head, max_position_embeddings=config.n_ctx, base=config.rope_theta)
+         self.reasoning_module = TRMReasoningModule(config)
+
+         self.z_H_init = nn.Parameter(torch.randn(1, 1, config.n_embd) * 0.02)
+         self.z_L_init = nn.Parameter(torch.randn(1, 1, config.n_embd) * 0.02)
+
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         if config.tie_weights:
+             self.lm_head.weight = self.wte.weight
+
+         self.post_init()
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         labels: Optional[torch.LongTensor] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         B, T = input_ids.size()
+         x_emb = self.wte(input_ids)
+
+         cos, sin = self.rotary(x_emb, seq_len=T)
+
+         z_H = self.z_H_init.expand(B, T, -1).contiguous()
+         z_L = self.z_L_init.expand(B, T, -1).contiguous()
+
+         # All but the last H-cycle run without gradients (the TRM one-step
+         # gradient scheme): only the final L-loop and H-step are backpropagated.
+         with torch.no_grad():
+             for _h in range(self.config.h_cycles - 1):
+                 # L-loop (updates z_L)
+                 for _l in range(self.config.l_cycles):
+                     z_L = self.reasoning_module(
+                         hidden_states=z_L,
+                         input_injection=(z_H + x_emb),
+                         cos=cos, sin=sin,
+                     )
+                 # H-loop step (updates z_H)
+                 z_H = self.reasoning_module(
+                     hidden_states=z_H,
+                     input_injection=z_L,
+                     cos=cos, sin=sin,
+                 )
+
+         # Final cycle, with gradients.
+         for _l in range(self.config.l_cycles):
+             z_L = self.reasoning_module(
+                 hidden_states=z_L,
+                 input_injection=(z_H + x_emb),
+                 cos=cos, sin=sin,
+             )
+
+         z_H = self.reasoning_module(
+             hidden_states=z_H,
+             input_injection=z_L,
+             cos=cos, sin=sin,
+         )
+
+         logits = self.lm_head(z_H)
+
+         loss = None
+         if labels is not None:
+             # Shift so that tokens < n predict token n.
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+
+             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+             loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=None,
+         )
+
+ from transformers import AutoConfig, AutoModelForCausalLM
+ AutoConfig.register("chess_transformer", ChessConfig)
+ AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
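A shape-level smoke test for the model above (a sketch; assumes it is run from a checkout of this repo, with hyperparameters taken from `config.json`):

```python
# Shape-level smoke test for model.py (run from a checkout of this repo).
import torch
from model import ChessConfig, ChessForCausalLM

config = ChessConfig(vocab_size=79, n_embd=216, n_head=4, n_ctx=256,
                     h_cycles=2, l_cycles=2, n_layers_per_block=2, n_inner=560)
model = ChessForCausalLM(config)

ids = torch.randint(4, config.vocab_size, (2, 16))  # 2 games, 16 coordinate tokens
out = model(input_ids=ids, labels=ids)
print(out.logits.shape)  # torch.Size([2, 16, 79])
print(float(out.loss))   # cross-entropy on next-token targets
```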
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b2a2be23bdc0258e1bc997d35d871cb856d78ea5457ed0594bde2f4830255bf
+ size 3980192
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[BOS]",
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer.py ADDED
@@ -0,0 +1,565 @@
+ """
+ Custom Chess Tokenizer for the Chess Challenge.
+
+ This tokenizer treats each move as a single token using the extended UCI notation
+ from the Lichess dataset (e.g., WPe2e4, BNg8f6).
+
+ The dataset format uses:
+ - W/B prefix for White/Black
+ - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
+ - Source and destination squares (e.g., e2e4)
+ - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from typing import Dict, List, Optional
+
+ from transformers import PreTrainedTokenizer
+
+ class ChessTokenizer(PreTrainedTokenizer):
+     """
+     A custom tokenizer for chess moves using extended UCI notation.
+
+     This tokenizer maps each possible chess move to a unique token ID.
+     The vocabulary is built from the training dataset to ensure all moves
+     encountered during training have a corresponding token.
+
+     Example:
+         >>> tokenizer = ChessTokenizer.build_vocab_from_iterator(["WPe2e4 BPe7e5"])
+         >>> tokenizer.encode("WPe2e4 BPe7e5")
+         [5, 4]  # one ID per move; exact values depend on the built vocab
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     # Special tokens
+     PAD_TOKEN = "[PAD]"
+     BOS_TOKEN = "[BOS]"
+     EOS_TOKEN = "[EOS]"
+     UNK_TOKEN = "[UNK]"
+
+     def __init__(
+         self,
+         vocab_file: Optional[str] = None,
+         vocab: Optional[Dict[str, int]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize the chess tokenizer.
+
+         Args:
+             vocab_file: Path to a JSON file containing the vocabulary mapping.
+             vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
+             **kwargs: Additional arguments passed to PreTrainedTokenizer.
+         """
+         # Initialize special tokens
+         self._pad_token = self.PAD_TOKEN
+         self._bos_token = self.BOS_TOKEN
+         self._eos_token = self.EOS_TOKEN
+         self._unk_token = self.UNK_TOKEN
+
+         # Remove any duplicate special-token entries passed through kwargs
+         # to avoid "multiple values for keyword" errors when loading from disk.
+         kwargs.pop("pad_token", None)
+         kwargs.pop("bos_token", None)
+         kwargs.pop("eos_token", None)
+         kwargs.pop("unk_token", None)
+
+         # Load or create vocabulary
+         if vocab is not None:
+             self._vocab = vocab
+         elif vocab_file is not None and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self._vocab = json.load(f)
+         else:
+             # Create a minimal vocabulary with just special tokens.
+             # The full vocabulary should be built from the dataset.
+             self._vocab = self._create_default_vocab()
+
+         # Create reverse mapping
+         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+
+         # Call parent init AFTER setting up vocab
+         super().__init__(
+             pad_token=self._pad_token,
+             bos_token=self._bos_token,
+             eos_token=self._eos_token,
+             unk_token=self._unk_token,
+             **kwargs,
+         )
+
+     def _create_default_vocab(self) -> Dict[str, int]:
+         """
+         Create a minimal default vocabulary with just special tokens.
+
+         For the full vocabulary, use `build_vocab_from_dataset()`.
+         This minimal vocab is just a placeholder - you should build from data.
+         """
+         special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens)}
+         return vocab
+
+     @classmethod
+     def build_vocab_from_iterator(
+         cls,
+         iterator,
+         min_frequency: int = 1,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from an iterator of game strings.
+
+         Args:
+             iterator: An iterator yielding game strings (space-separated moves).
+             min_frequency: Minimum frequency for a token to be included.
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from collections import Counter
+
+         token_counts = Counter()
+
+         for game in iterator:
+             moves = game.strip().split()
+             token_counts.update(moves)
+
+         # Filter by frequency
+         tokens = [
+             token for token, count in token_counts.items()
+             if count >= min_frequency
+         ]
+
+         # Sort for reproducibility
+         tokens = sorted(tokens)
+
+         # Build vocabulary
+         special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
+
+         return cls(vocab=vocab)
+
+     @classmethod
+     def build_vocab_from_dataset(
+         cls,
+         dataset_name: str = "dlouapre/lichess_2025-01_1M",
+         split: str = "train",
+         column: str = "text",
+         min_frequency: int = 500,
+         max_samples: Optional[int] = 100000,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from a Hugging Face dataset.
+
+         Args:
+             dataset_name: Name of the dataset on Hugging Face Hub.
+             split: Dataset split to use.
+             column: Column containing the game strings.
+             min_frequency: Minimum frequency for a token to be included (default: 500).
+             max_samples: Maximum number of samples to process (default: 100k).
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from datasets import load_dataset
+
+         dataset = load_dataset(dataset_name, split=split)
+
+         if max_samples is not None:
+             dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+         def game_iterator():
+             for example in dataset:
+                 yield example[column]
+
+         return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
+
+     @property
+     def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
+         return len(self._vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """Return the vocabulary as a dictionary."""
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize a string of moves into a list of tokens.
+
+         Args:
+             text: A string of space-separated moves.
+
+         Returns:
+             List of move tokens.
+         """
+         return text.strip().split()
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID."""
+         return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token."""
+         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Convert a list of tokens back to a string."""
+         # Filter out special tokens for cleaner output
+         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+         return " ".join(t for t in tokens if t not in special)
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> tuple:
+         """
+         Save the vocabulary to a JSON file.
+
+         Args:
+             save_directory: Directory to save the vocabulary.
+             filename_prefix: Optional prefix for the filename.
+
+         Returns:
+             Tuple containing the path to the saved vocabulary file.
+         """
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)
+
+
+ def count_vocab_from_dataset(
+     dataset_name: str = "dlouapre/lichess_2025-01_1M",
+     split: str = "train",
+     column: str = "text",
+     max_samples: Optional[int] = 10000,
+ ) -> Dict[str, int]:
+     """
+     Count token frequencies in a dataset (useful for vocabulary analysis).
+
+     Args:
+         dataset_name: Name of the dataset on Hugging Face Hub.
+         split: Dataset split to use.
+         column: Column containing the game strings.
+         max_samples: Maximum number of samples to process.
+
+     Returns:
+         Dictionary mapping tokens to their frequencies.
+     """
+     from collections import Counter
+     from datasets import load_dataset
+
+     dataset = load_dataset(dataset_name, split=split)
+
+     if max_samples is not None:
+         dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+     token_counts = Counter()
+
+     for example in dataset:
+         moves = example[column].strip().split()
+         token_counts.update(moves)
+
+     return dict(token_counts)
+
+
+ class CoordinateTokenizer(ChessTokenizer):
+     def __init__(self, **kwargs):
+         squares = [f"{f}{r}" for f in "abcdefgh" for r in "12345678"]
+         promotions = ["q", "r", "b", "n"]
+         control = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
+         vocab_list = control + squares + promotions
+         self._vocab = {t: i for i, t in enumerate(vocab_list)}
+         self._ids_to_tokens = {i: t for t, i in self._vocab.items()}
+
+         super().__init__(
+             vocab=self._vocab,
+             pad_token="[PAD]",
+             bos_token="[BOS]",
+             eos_token="[EOS]",
+             unk_token="[UNK]",
+             truncation_side="left",
+             **kwargs
+         )
+
+     def _tokenize(self, text: str) -> List[str]:
+         raw_moves = text.strip().split()
+         tokens = []
+         for raw_move in raw_moves:
+             squares = re.findall(r'[a-h][1-8]', raw_move)
+             tokens.extend(squares)
+             if "=" in raw_move:
+                 # SAN-style promotion suffix, e.g. "e8=Q"
+                 idx = raw_move.index("=")
+                 if idx + 1 < len(raw_move):
+                     tokens.append(raw_move[idx + 1].lower())
+             elif raw_move[-1].lower() in ("q", "r", "b", "n"):
+                 # UCI-style promotion suffix, e.g. "e7e8q"
+                 tokens.append(raw_move[-1].lower())
+         return tokens
+
+
+ class CoordinateChessTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizer that decomposes chess moves into coordinate components.
+
+     Example:
+         WPe2e4  -> ['e2', 'e4']
+         WPa7a8q -> ['a7', 'a8', 'q']  # pawn promotion
+
+     Vocabulary size: 72 tokens
+     - 64 squares (a1-h8)
+     - 4 promotions (q, r, b, n)
+     - 4 special tokens
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     PAD_TOKEN = "[PAD]"
+     BOS_TOKEN = "[BOS]"
+     EOS_TOKEN = "[EOS]"
+     UNK_TOKEN = "[UNK]"
+
+     # Regex to extract from-square, to-square, and optional promotion
+     MOVE_PATTERN = re.compile(r'([a-h][1-8])([a-h][1-8])([qrbn])?')
+
+     def __init__(self, vocab_file: Optional[str] = None, **kwargs):
+         # Remove duplicate special token kwargs
+         kwargs.pop("pad_token", None)
+         kwargs.pop("bos_token", None)
+         kwargs.pop("eos_token", None)
+         kwargs.pop("unk_token", None)
+
+         # Build fixed vocabulary
+         if vocab_file is not None and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self._vocab = json.load(f)
+         else:
+             self._vocab = self._create_vocab()
+
+         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+
+         super().__init__(
+             pad_token=self.PAD_TOKEN,
+             bos_token=self.BOS_TOKEN,
+             eos_token=self.EOS_TOKEN,
+             unk_token=self.UNK_TOKEN,
+             **kwargs,
+         )
+
+     def _create_vocab(self) -> Dict[str, int]:
+         """Create fixed vocabulary of 72 tokens."""
+         tokens = [
+             self.PAD_TOKEN,
+             self.BOS_TOKEN,
+             self.EOS_TOKEN,
+             self.UNK_TOKEN,
+         ]
+
+         # Add all 64 squares
+         for file in 'abcdefgh':
+             for rank in '12345678':
+                 tokens.append(f"{file}{rank}")
+
+         # Add promotion pieces
+         tokens.extend(['q', 'r', 'b', 'n'])
+
+         return {token: idx for idx, token in enumerate(tokens)}
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self._vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize move string into coordinate components.
+
+         Args:
+             text: Space-separated moves like "WPe2e4 BNg8f6"
+
+         Returns:
+             List of coordinate tokens: ['e2', 'e4', 'g8', 'f6']
+         """
+         tokens = []
+         raw_moves = text.strip().split()
+
+         for move in raw_moves:
+             match = self.MOVE_PATTERN.search(move)
+             if match:
+                 from_sq, to_sq, promotion = match.groups()
+                 tokens.append(from_sq)
+                 tokens.append(to_sq)
+                 if promotion:
+                     tokens.append(promotion)
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self._vocab.get(token, self._vocab[self.UNK_TOKEN])
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Reconstruct moves from coordinate tokens."""
+         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+         clean = [t for t in tokens if t not in special]
+
+         # Group into moves (2 or 3 tokens per move)
+         moves = []
+         i = 0
+         while i < len(clean):
+             if i + 1 < len(clean):
+                 move = clean[i] + clean[i + 1]
+                 i += 2
+                 # Check for promotion
+                 if i < len(clean) and clean[i] in ['q', 'r', 'b', 'n']:
+                     move += clean[i]
+                     i += 1
+                 moves.append(move)
+             else:
+                 i += 1
+
+         return " ".join(moves)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)
+
+
+ class EnhancedCoordinateTokenizer(CoordinateChessTokenizer):
+     """
+     Extended version that preserves piece information as optional metadata.
+     Vocabulary: 79 tokens (adds W, B, P, N, R, Q, K; 'B' doubles as Black and Bishop).
+
+     Use this if you want to preserve color/piece info with minimal vocab growth.
+     """
+
+     def _create_vocab(self) -> Dict[str, int]:
+         vocab = super()._create_vocab()
+
+         # Add optional color and piece tokens
+         piece_tokens = ['W', 'B', 'P', 'N', 'R', 'Q', 'K']  # 'B' covers both Black and Bishop
+
+         next_id = len(vocab)
+         for token in piece_tokens:
+             if token not in vocab:
+                 vocab[token] = next_id
+                 next_id += 1
+
+         return vocab
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Optionally include piece info: WPe2e4 -> ['W', 'P', 'e2', 'e4']
+         Or strip it for the minimal version: WPe2e4 -> ['e2', 'e4']
+         """
+         tokens = []
+         raw_moves = text.strip().split()
+
+         for move in raw_moves:
+             # Extract color and piece if present
+             if len(move) >= 2 and move[0] in 'WB' and move[1] in 'PNBRQK':
+                 # Uncomment to include piece info (increases sequence length):
+                 # tokens.extend([move[0], move[1]])
+                 pass
+
+             # Extract coordinates
+             match = self.MOVE_PATTERN.search(move)
+             if match:
+                 from_sq, to_sq, promotion = match.groups()
+                 tokens.append(from_sq)
+                 tokens.append(to_sq)
+                 if promotion:
+                     tokens.append(promotion)
+
+         return tokens
+
+
+ class SanitizedChessTokenizer(ChessTokenizer):
+
+     # Strategy:
+     # 1. Strip suffixes: (, ), x, +, *, o, O, E
+     # 2. Strip prefixes: W or B followed by P, N, B, R, Q, K
+     #    (regex ^[WB][PNBRQK] matches the start of the string)
+     #
+     # We can use a single regex to find the "pure move" part instead:
+     # the square-to-square pattern (e.g., e2e4) plus an optional promotion (q, r, b, n).
+     # This is safer than stripping because it ignores all noise around the move.
+     MOVE_PATTERN = re.compile(r'([a-h][1-8][a-h][1-8][qrbn]?)')
+
+     def _sanitize(self, text: str) -> str:
+         # Extract just the move part (e.g., "WPe2e4(x)" -> "e2e4")
+         match = self.MOVE_PATTERN.search(text)
+         if match:
+             return match.group(1)
+         return self.unk_token  # Fallback if no valid move found
+
+     def _tokenize(self, text: str) -> List[str]:
+         # Split on whitespace, then extract the move from each item
+         tokens = []
+         for t in text.strip().split():
+             clean = self._sanitize(t)
+             if clean != self.unk_token:
+                 tokens.append(clean)
+         return tokens
+
+     @classmethod
+     def build_vocab_from_iterator(cls, iterator, min_frequency: int = 1) -> "SanitizedChessTokenizer":
+         from collections import Counter
+
+         token_counts = Counter()
+
+         for game in iterator:
+             moves = game.strip().split()
+             # Extract only the pure UCI part
+             clean_moves = []
+             for m in moves:
+                 match = cls.MOVE_PATTERN.search(m)
+                 if match:
+                     clean_moves.append(match.group(1))
+
+             token_counts.update(clean_moves)
+
+         # Filter by frequency
+         tokens = [
+             token for token, count in token_counts.items()
+             if count >= min_frequency
+         ]
+         tokens = sorted(tokens)
+
+         # Build vocabulary
+         special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
+
+         return cls(vocab=vocab)
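A round-trip through the coordinate tokenizer that this submission ships (a sketch, assuming a checkout of this repo; the printed IDs follow `vocab.json`):

```python
# Round-trip through the shipped tokenizer (run from a checkout of this repo).
from tokenizer import EnhancedCoordinateTokenizer

tok = EnhancedCoordinateTokenizer()
ids = tok.encode("WPe2e4 BNg8f6")
print(ids)                             # [37, 39, 59, 49] per vocab.json
print(tok.convert_ids_to_tokens(ids))  # ['e2', 'e4', 'g8', 'f6']
print(tok.decode(ids))                 # e2e4 g8f6
```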
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "EnhancedCoordinateTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.json ADDED
@@ -0,0 +1,81 @@
+ {
+   "[PAD]": 0,
+   "[BOS]": 1,
+   "[EOS]": 2,
+   "[UNK]": 3,
+   "a1": 4,
+   "a2": 5,
+   "a3": 6,
+   "a4": 7,
+   "a5": 8,
+   "a6": 9,
+   "a7": 10,
+   "a8": 11,
+   "b1": 12,
+   "b2": 13,
+   "b3": 14,
+   "b4": 15,
+   "b5": 16,
+   "b6": 17,
+   "b7": 18,
+   "b8": 19,
+   "c1": 20,
+   "c2": 21,
+   "c3": 22,
+   "c4": 23,
+   "c5": 24,
+   "c6": 25,
+   "c7": 26,
+   "c8": 27,
+   "d1": 28,
+   "d2": 29,
+   "d3": 30,
+   "d4": 31,
+   "d5": 32,
+   "d6": 33,
+   "d7": 34,
+   "d8": 35,
+   "e1": 36,
+   "e2": 37,
+   "e3": 38,
+   "e4": 39,
+   "e5": 40,
+   "e6": 41,
+   "e7": 42,
+   "e8": 43,
+   "f1": 44,
+   "f2": 45,
+   "f3": 46,
+   "f4": 47,
+   "f5": 48,
+   "f6": 49,
+   "f7": 50,
+   "f8": 51,
+   "g1": 52,
+   "g2": 53,
+   "g3": 54,
+   "g4": 55,
+   "g5": 56,
+   "g6": 57,
+   "g7": 58,
+   "g8": 59,
+   "h1": 60,
+   "h2": 61,
+   "h3": 62,
+   "h4": 63,
+   "h5": 64,
+   "h6": 65,
+   "h7": 66,
+   "h8": 67,
+   "q": 68,
+   "r": 69,
+   "b": 70,
+   "n": 71,
+   "W": 72,
+   "B": 73,
+   "P": 74,
+   "N": 75,
+   "R": 76,
+   "Q": 77,
+   "K": 78
+ }