khanghoang0902 committed
Commit 5912985 · verified · 1 Parent(s): 4a397b6

update model

Files changed (7)
  1. __init__.py +0 -0
  2. config.json +11 -6
  3. model.py +362 -0
  4. model.safetensors +2 -2
  5. tokenizer.py +119 -193
  6. tokenizer_config.json +6 -0
  7. vocab.json +80 -145
__init__.py ADDED
File without changes
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "_name_or_path": "./output/final_model",
   "architectures": [
     "ChessForCausalLM"
   ],
@@ -8,13 +9,17 @@
   "eos_token_id": 2,
   "layer_norm_epsilon": 1e-05,
   "model_type": "chess_transformer",
-  "n_ctx": 360,
-  "n_embd": 102,
-  "n_head": 6,
-  "n_inner": 360,
-  "n_layer": 8,
+  "auto_map": {
+    "AutoConfig": "model.ChessConfig",
+    "AutoModelForCausalLM": "model.ChessForCausalLM"
+  },
+  "n_ctx": 512,
+  "n_embd": 128,
+  "n_head": 4,
+  "n_inner": 256,
+  "n_layer": 7,
   "pad_token_id": 0,
   "tie_weights": true,
   "transformers_version": "4.57.5",
-  "vocab_size": 149
+  "vocab_size": 84
 }
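
With the auto_map entries above, the checkpoint should be loadable through the Hugging Face Auto classes. A minimal loading sketch; the repository id below is a placeholder, and trust_remote_code is needed because the architecture is defined in model.py:

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "khanghoang0902/chess-model"  # placeholder: substitute this repository's actual id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

print(config.n_layer, config.n_embd, config.n_ctx)  # 7 128 512 as of this commit
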
model.py ADDED
@@ -0,0 +1,362 @@
"""
Chess Transformer model for the Chess Challenge.

Lightweight GPT-style architecture sized to stay within a ~1M parameter budget.
Key pieces:
- ChessConfig: hyperparameter container
- ChessForCausalLM: autoregressive model for next-move prediction
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast


class ChessConfig(PretrainedConfig):
    """
    Configuration for the small chess transformer.

    Defaults target roughly 1M parameters; adjust values to explore variants.
    """
    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size: int = 1200,
        n_embd: int = 128,
        n_layer: int = 6,
        n_head: int = 4,
        n_ctx: int = 256,
        n_inner: Optional[int] = None,
        dropout: float = 0.1,
        layer_norm_epsilon: float = 1e-5,
        tie_weights: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        self.n_inner = n_inner if n_inner is not None else 3 * n_embd  # Reduced from 4x to 3x
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.tie_weights = tie_weights
        # Inform HF base class about tying behavior
        self.tie_word_embeddings = bool(tie_weights)


class MultiHeadAttention(nn.Module):
    """Standard masked self-attention with combined QKV projection."""
    def __init__(self, config: ChessConfig):
        super().__init__()

        assert config.n_embd % config.n_head == 0, \
            f"n_embd ({config.n_embd}) must be divisible by n_head ({config.n_head})"

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head

        # Combined QKV projection for efficiency
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

        self.dropout = nn.Dropout(config.dropout)

        # Causal mask buffer, precomputed up to n_ctx (not saved in the state dict)
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(
                1, 1, config.n_ctx, config.n_ctx
            ),
            persistent=False,
        )

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, seq_len, _ = x.size()

        # Compute Q, K, V
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Apply causal mask
        causal_mask = self.bias[:, :, :seq_len, :seq_len]
        attn_weights = attn_weights.masked_fill(causal_mask == 0, float("-inf"))

        # Apply attention mask (for padding)
        if attention_mask is not None:
            # attention_mask shape: (batch_size, seq_len) -> (batch_size, 1, 1, seq_len)
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attn_weights = attn_weights.masked_fill(attention_mask == 0, float("-inf"))

        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Apply attention to values
        attn_output = torch.matmul(attn_weights, v)

        # Reshape back
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.n_embd
        )

        # Output projection
        attn_output = self.c_proj(attn_output)

        return attn_output


class FeedForward(nn.Module):
    """Two-layer MLP with GELU and dropout."""
    def __init__(self, config: ChessConfig):
        super().__init__()

        self.c_fc = nn.Linear(config.n_embd, config.n_inner)
        self.c_proj = nn.Linear(config.n_inner, config.n_embd, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = F.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x


class TransformerBlock(nn.Module):
    """Pre-norm attention + MLP block."""
    def __init__(self, config: ChessConfig):
        super().__init__()

        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = MultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = FeedForward(config)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Pre-norm attention
        x = x + self.attn(self.ln_1(x), attention_mask=attention_mask)
        # Pre-norm FFN
        x = x + self.mlp(self.ln_2(x))
        return x


class ChessForCausalLM(PreTrainedModel):
    """
    GPT-style causal LM for chess move prediction.

    Stacks transformer blocks over token and position embeddings; ties output
    head to embeddings when configured.
    """
    config_class = ChessConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    # Suppress missing-key warning for tied lm_head when loading
    keys_to_ignore_on_load_missing = ["lm_head.weight"]

    def __init__(self, config: ChessConfig):
        super().__init__(config)

        # Token and position embeddings
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_ctx, config.n_embd)

        self.drop = nn.Dropout(config.dropout)

        # Transformer blocks
        self.h = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.n_layer)
        ])

        # Final layer norm
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Output head
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Declare tied weights for proper serialization
        if config.tie_weights:
            self._tied_weights_keys = ["lm_head.weight"]

        # Initialize weights
        self.post_init()

        # Tie weights if configured
        if config.tie_weights:
            self.tie_weights()

    def get_input_embeddings(self) -> nn.Module:
        return self.wte

    def set_input_embeddings(self, new_embeddings: nn.Module):
        self.wte = new_embeddings
        if getattr(self.config, "tie_weights", False):
            self.tie_weights()

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module):
        self.lm_head = new_embeddings

    def tie_weights(self):
        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
            self._tie_or_clone_weights(self.lm_head, self.wte)

    def _init_weights(self, module: nn.Module):
        """GPT-2 style init."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.ones_(module.weight)
            torch.nn.init.zeros_(module.bias)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Forward pass with optional label loss."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_len = input_ids.size()
        device = input_ids.device

        # Create position IDs if not provided
        if position_ids is None:
            position_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)

        # Get embeddings
        token_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = self.drop(token_embeds + position_embeds)

        # Pass through transformer blocks
        for block in self.h:
            hidden_states = block(hidden_states, attention_mask=attention_mask)

        # Final layer norm
        hidden_states = self.ln_f(hidden_states)

        # Get logits
        logits = self.lm_head(hidden_states)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            # Shift logits and labels for next-token prediction
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten for cross-entropy
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    @torch.no_grad()
    def generate_move(
        self,
        input_ids: torch.LongTensor,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
    ) -> int:
        """
        Sample a next move token ID from logits, with optional top-k/p filtering.
        Expects input_ids shaped (1, seq_len).
        """
        self.eval()

        # Get logits for the last position
        outputs = self(input_ids)
        logits = outputs.logits[:, -1, :] / temperature

        # Apply top-k filtering
        if top_k is not None:
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = float("-inf")

        # Apply top-p (nucleus) filtering
        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=-1, index=sorted_indices, src=sorted_indices_to_remove
            )
            logits[indices_to_remove] = float("-inf")

        # Sample from the distribution
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        return next_token.item()


# Register the model with Auto classes for easy loading
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("chess_transformer", ChessConfig)
AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
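
As a sanity check on the "~1M parameter budget" noted in the module docstring, a minimal sketch that builds the model with this commit's config.json hyperparameters and counts weights; with tie_weights=True the output head shares the embedding matrix, so the total should land just under one million (the numbers in comments are expectations, not captured output):

import torch
from model import ChessConfig, ChessForCausalLM

# Hyperparameters copied from this commit's config.json.
config = ChessConfig(
    vocab_size=84, n_embd=128, n_layer=7, n_head=4,
    n_ctx=512, n_inner=256, tie_weights=True,
)
model = ChessForCausalLM(config)

n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters")  # expected: roughly 999k with tied embeddings

# generate_move expects a (1, seq_len) tensor and returns one sampled token id.
prompt = torch.tensor([[config.bos_token_id]])
next_id = model.generate_move(prompt, temperature=1.0, top_k=5)
print(next_id)
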
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27e3c4ddd2a3cfa3cf53001092e6c540678540427ddc709e73bc309aa23687a8
-size 3939664
+oid sha256:f9f6c2d3a9303683f3e532a8d924fe30f9a77e70bea8d1577b4a39eb167183e6
+size 4003376
tokenizer.py CHANGED
@@ -1,87 +1,66 @@
-"""
-Custom Chess Tokenizer for the Chess Challenge.
-
-This tokenizer uses a Factorized strategy (Triple Tokenization) to split moves
-into atomic components: [Piece, From_Square, To_Square, Suffix].
-
-Example: "WPe2e4" -> ["WP", "e2_f", "e4_t"]
-
-This drastically reduces vocabulary size (~155 tokens vs 1700), allowing
-for deeper models within the 1M parameter budget.
-"""
-
 from __future__ import annotations
 
 import json
 import os
 import re
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple, Any, Union, Sequence
 
 from transformers import PreTrainedTokenizer
 
 
 class ChessTokenizer(PreTrainedTokenizer):
-    """
-    A custom tokenizer for chess moves using factorized notation.
-
-    This tokenizer maps chess concepts (Pieces, Squares) to unique token IDs.
-    The vocabulary is fixed and does not need to be built from a dataset.
-
-    Example:
-        >>> tokenizer = ChessTokenizer()
-        >>> tokenizer.tokenize("WPe2e4")
-        ['WP', 'e2_f', 'e4_t']
-    """
-
     model_input_names = ["input_ids", "attention_mask"]
     vocab_files_names = {"vocab_file": "vocab.json"}
-
-    # Special tokens
+
     PAD_TOKEN = "[PAD]"
     BOS_TOKEN = "[BOS]"
     EOS_TOKEN = "[EOS]"
     UNK_TOKEN = "[UNK]"
-
+
+    SIDE_W = "SIDE_W"
+    SIDE_B = "SIDE_B"
+    PROMO_PREFIX = "PROMO_"
+
+    CAPTURE = "CAPTURE"
+    CHECK = "CHECK"
+    MATE = "MATE"
+    CASTLE = "CASTLE"
+
+    PIECES = ["P", "N", "B", "R", "Q", "K"]
+
+    MOVE_RE = re.compile(
+        r"^(?P<side>[WB])"
+        r"(?P<piece>[PNBRQK])"
+        r"(?P<from>[a-h][1-8])"
+        r"(?P<to>[a-h][1-8])"
+        r"(?P<rest>.*)$"
+    )
+
     def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
-        """
-        Initialize the chess tokenizer.
-
-        Args:
-            vocab_file: Path to a JSON file containing the vocabulary mapping.
-            vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
-            **kwargs: Additional arguments passed to PreTrainedTokenizer.
-        """
-        # Initialize special tokens
+        kwargs.pop("pad_token", None)
+        kwargs.pop("bos_token", None)
+        kwargs.pop("eos_token", None)
+        kwargs.pop("unk_token", None)
+
         self._pad_token = self.PAD_TOKEN
         self._bos_token = self.BOS_TOKEN
         self._eos_token = self.EOS_TOKEN
         self._unk_token = self.UNK_TOKEN
 
-        # Remove duplicate special-token entries passed through kwargs
-        kwargs.pop("pad_token", None)
-        kwargs.pop("bos_token", None)
-        kwargs.pop("eos_token", None)
-        kwargs.pop("unk_token", None)
-
-        # Load or create vocabulary
         if vocab is not None:
             self._vocab = vocab
         elif vocab_file is not None and os.path.exists(vocab_file):
             with open(vocab_file, "r", encoding="utf-8") as f:
                 self._vocab = json.load(f)
         else:
-            # Create the Factorized Vocabulary (Fixed ~155 tokens)
-            self._vocab = self._create_default_vocab()
-
-        # Create reverse mapping
+            self._vocab = self._build_fixed_vocab()
+
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
-
-        # Call parent init
+
         super().__init__(
             pad_token=self._pad_token,
             bos_token=self._bos_token,
@@ -89,167 +68,114 @@ class ChessTokenizer(PreTrainedTokenizer):
             unk_token=self._unk_token,
             **kwargs,
         )
-
-    def _create_default_vocab(self) -> Dict[str, int]:
-        """
-        Create the fixed factorized vocabulary.
-        Includes Special Tokens, Pieces, From-Squares, To-Squares, and Suffixes.
-        """
-        special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
-        pieces = ["WP", "WN", "WB", "WR", "WQ", "WK", "BP", "BN", "BB", "BR", "BQ", "BK"]
-        suffixes = ["(x)", "(+)", "(+*)", "(o)", "(O)"]
-
-        cols = "abcdefgh"
-        rows = "12345678"
-        squares = [f"{c}{r}" for r in rows for c in cols]  # a1...h8
-
-        vocab = {token: idx for idx, token in enumerate(special_tokens)}
-
-        # Helper to add tokens sequentially
-        def add_tokens(token_list, suffix=""):
-            offset = len(vocab)
-            for i, t in enumerate(token_list):
-                vocab[f"{t}{suffix}"] = offset + i
-
-        add_tokens(pieces)         # WP, WN...
-        add_tokens(squares, "_f")  # a1_f, b1_f... (From Squares)
-        add_tokens(squares, "_t")  # a1_t, b1_t... (To Squares)
-        add_tokens(suffixes)       # (x), (+)...
-
-        return vocab
-
-    @classmethod
-    def build_vocab_from_iterator(cls, iterator, min_frequency: int = 1) -> "ChessTokenizer":
-        return cls()
-
-    @classmethod
-    def build_vocab_from_dataset(cls, **kwargs) -> "ChessTokenizer":
-        return cls()
-
+
+    def _build_fixed_vocab(self) -> Dict[str, int]:
+        special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+        sides = [self.SIDE_W, self.SIDE_B]
+        pieces = [f"PIECE_{p}" for p in self.PIECES]
+        squares = [f"SQ_{file}{rank}" for file in "abcdefgh" for rank in "12345678"]
+        promos = [f"{self.PROMO_PREFIX}{p}" for p in ["Q", "R", "B", "N"]]
+        flags = [self.CAPTURE, self.CHECK, self.MATE, self.CASTLE]
+
+        tokens = special + sides + pieces + squares + promos + flags
+        return {tok: i for i, tok in enumerate(tokens)}
+
     @property
     def vocab_size(self) -> int:
         return len(self._vocab)
-
+
     def get_vocab(self) -> Dict[str, int]:
         return dict(self._vocab)
-
+
     def _tokenize(self, text: str) -> List[str]:
-        """
-        Tokenize a string of moves into factorized tokens.
-        Robustly handles both "WPe2e4" and "e2e4".
-        """
-        raw_chunks = text.strip().split()
-        tokens = []
-
-        for chunk in raw_chunks:
-            if chunk in self._vocab:
-                tokens.append(chunk)
-                continue
-
-            # Regex Parsing: Matches Optional Piece + From + To + Optional Suffix
-            # Matches "WPe2e4" OR "e2e4" (robust)
-            match = re.match(r'([WB]?[PRNBQK]?)?([a-h][1-8])([a-h][1-8])(.*)', chunk)
-
-            if match:
-                p, f, t, s = match.groups()
-                if p: tokens.append(p)               # Piece (if present)
-                tokens.extend([f"{f}_f", f"{t}_t"])  # From + To
-                if s: tokens.append(s)               # Suffix (if present)
-
-            # Castling Special Case
-            elif "(o)" in chunk or "(O)" in chunk:
-                match_castle = re.match(r'([WB]K)?([a-h][1-8])([a-h][1-8])(.*)', chunk)
-                if match_castle:
-                    p, f, t, s = match_castle.groups()
-                    if p: tokens.append(p)
-                    tokens.extend([f"{f}_f", f"{t}_t", s])
-                else:
-                    tokens.append(self.unk_token)
-            else:
-                tokens.append(self.unk_token)
-
+        out: List[str] = []
+        for move in text.strip().split():
+            out.extend(self._tokenize_move(move))
+        return out
+
+    def _tokenize_move(self, move: str) -> List[str]:
+        m = self.MOVE_RE.match(move)
+        if not m: return [self.UNK_TOKEN]
+
+        side = m.group("side")
+        piece = m.group("piece")
+        frm = m.group("from")
+        to = m.group("to")
+        rest = m.group("rest") or ""
+
+        tokens: List[str] = []
+        tokens.append(self.SIDE_W if side == "W" else self.SIDE_B)
+        tokens.append(f"PIECE_{piece}")
+        tokens.append(f"SQ_{frm}")
+        tokens.append(f"SQ_{to}")
+
+        promo = self._parse_promotion(rest)
+        if promo is not None:
+            tokens.append(f"{self.PROMO_PREFIX}{promo}")
+
+        if "(x)" in rest: tokens.append(self.CAPTURE)
+
+        if "++" in rest or "(+*)" in rest or "#" in rest:
+            tokens.append(self.MATE)
+        elif "+" in rest or "(+)" in rest:
+            tokens.append(self.CHECK)
+
+        if "(o)" in rest or "(O)" in rest:
+            tokens.append(self.CASTLE)
+
         return tokens
-
+
+    def _parse_promotion(self, rest: str) -> Optional[str]:
+        m = re.search(r"=([QRBNqrbn])", rest)
+        if m: return m.group(1).upper()
+
+        m2 = re.search(r"([QRBNqrbn])", rest)
+        if m2 and "(" not in rest:
+            if rest.strip() in ["Q", "R", "B", "N", "q", "r", "b", "n"]:
+                return rest.strip().upper()
+        return None
+
     def _convert_token_to_id(self, token: str) -> int:
-        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
-
+        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])
+
     def _convert_id_to_token(self, index: int) -> str:
         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
-
+
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """
-        Converts tokens back to a single string for evaluation.
-
-        Logic:
-        1. Strips '_f' and '_t' suffixes.
-        2. Joins parts without spaces (e.g. 'WP' + 'e2' + 'e4' -> 'WPe2e4').
-        3. Inserts a space ONLY before a new Piece token to separate moves.
-        """
         output = []
         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
 
-        # 1. Clean tokens (remove special tokens and suffixes)
-        clean_tokens = []
         for t in tokens:
             if t in special: continue
-            clean_tokens.append(t.replace("_f", "").replace("_t", ""))
-
-        # 2. Join intelligently
-        final_str = ""
-        for i, token in enumerate(clean_tokens):
-            # Check if this token is a Piece (starts with W or B, length 2)
-            # This marks the start of a new move.
-            is_new_move = (len(token) == 2 and token[0] in "WB" and token[1] in "PRNBQK")
-
-            # Add space if it's a new move (and not the very first token)
-            if i > 0 and is_new_move:
-                final_str += " " + token
-            else:
-                final_str += token
-
-        return final_str.strip()
-
-    def save_vocabulary(
-        self,
-        save_directory: str,
-        filename_prefix: Optional[str] = None,
-    ) -> tuple:
-        if not os.path.isdir(save_directory):
-            os.makedirs(save_directory, exist_ok=True)
-
-        vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
-
-        return (vocab_file,)
-
-
-def count_vocab_from_dataset(
-    dataset_name: str = "dlouapre/lichess_2025-01_1M",
-    split: str = "train",
-    column: str = "text",
-    max_samples: Optional[int] = 10000,
-) -> Dict[str, int]:
-    """
-    Count token frequencies in a dataset.
-    """
-    from collections import Counter
-    from datasets import load_dataset
-
-    tokenizer = ChessTokenizer()
-    dataset = load_dataset(dataset_name, split=split)
-
-    if max_samples is not None:
-        dataset = dataset.select(range(min(max_samples, len(dataset))))
-
-    token_counts = Counter()
-
-    for example in dataset:
-        tokens = tokenizer.tokenize(example[column])
-        token_counts.update(tokens)
-
-    return dict(token_counts)
+
+            if t == self.SIDE_W: output.append("W")
+            elif t == self.SIDE_B: output.append("B")
+            elif t.startswith("PIECE_"): output.append(t.replace("PIECE_", ""))
+            elif t.startswith("SQ_"): output.append(t.replace("SQ_", ""))
+            elif t.startswith(self.PROMO_PREFIX): output.append("=" + t.replace(self.PROMO_PREFIX, ""))
+            elif t == self.CAPTURE: output.append("(x)")
+            elif t == self.CHECK: output.append("(+)")
+            elif t == self.MATE: output.append("(+*)")
+            elif t == self.CASTLE: output.append("(o)")
+            else:
+                pass
+
+        return "".join(output)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+        if not os.path.isdir(save_directory): os.makedirs(save_directory, exist_ok=True)
+        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json")
+        with open(vocab_file, "w", encoding="utf-8") as f: json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+        return (vocab_file,)
+
+    def decode(self, token_ids: Union[int, Sequence[int]], skip_special_tokens: bool = False, **kwargs) -> str:
+        if isinstance(token_ids, int): ids = [token_ids]
+        elif "torch" in str(type(token_ids)): ids = token_ids.detach().cpu().flatten().tolist()
+        else: ids = list(token_ids)
+
+        toks = [self._convert_id_to_token(i) for i in ids]
+        if skip_special_tokens:
+            special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+            toks = [t for t in toks if t not in special]
+
+        return self.convert_tokens_to_string(toks)
tokenizer_config.json CHANGED
@@ -40,5 +40,11 @@
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "tokenizer_class": "ChessTokenizer",
+  "auto_map": {
+    "AutoTokenizer": ["tokenizer.ChessTokenizer", null]
+  },
+  "tokenizer_auto_map": {
+    "AutoTokenizer": ["tokenizer.ChessTokenizer", null]
+  },
   "unk_token": "[UNK]"
 }
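
With the auto_map entry added here, the custom tokenizer should also be loadable through AutoTokenizer; a minimal sketch with the same placeholder repository id as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("khanghoang0902/chess-model", trust_remote_code=True)  # placeholder id
print(tokenizer.vocab_size)  # 84 as of this commit
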
vocab.json CHANGED
@@ -3,149 +3,84 @@
   "[BOS]": 1,
   "[EOS]": 2,
   "[UNK]": 3,
-  "WP": 4,
-  "WN": 5,
-  "WB": 6,
-  "WR": 7,
-  "WQ": 8,
-  "WK": 9,
-  "BP": 10,
-  "BN": 11,
-  "BB": 12,
-  "BR": 13,
-  "BQ": 14,
-  "BK": 15,
-  "a1_f": 16,
-  "b1_f": 17,
-  "c1_f": 18,
-  "d1_f": 19,
-  "e1_f": 20,
-  "f1_f": 21,
-  "g1_f": 22,
-  "h1_f": 23,
-  "a2_f": 24,
-  "b2_f": 25,
-  "c2_f": 26,
-  "d2_f": 27,
-  "e2_f": 28,
-  "f2_f": 29,
-  "g2_f": 30,
-  "h2_f": 31,
-  "a3_f": 32,
-  "b3_f": 33,
-  "c3_f": 34,
-  "d3_f": 35,
-  "e3_f": 36,
-  "f3_f": 37,
-  "g3_f": 38,
-  "h3_f": 39,
-  "a4_f": 40,
-  "b4_f": 41,
-  "c4_f": 42,
-  "d4_f": 43,
-  "e4_f": 44,
-  "f4_f": 45,
-  "g4_f": 46,
-  "h4_f": 47,
-  "a5_f": 48,
-  "b5_f": 49,
-  "c5_f": 50,
-  "d5_f": 51,
-  "e5_f": 52,
-  "f5_f": 53,
-  "g5_f": 54,
-  "h5_f": 55,
-  "a6_f": 56,
-  "b6_f": 57,
-  "c6_f": 58,
-  "d6_f": 59,
-  "e6_f": 60,
-  "f6_f": 61,
-  "g6_f": 62,
-  "h6_f": 63,
-  "a7_f": 64,
-  "b7_f": 65,
-  "c7_f": 66,
-  "d7_f": 67,
-  "e7_f": 68,
-  "f7_f": 69,
-  "g7_f": 70,
-  "h7_f": 71,
-  "a8_f": 72,
-  "b8_f": 73,
-  "c8_f": 74,
-  "d8_f": 75,
-  "e8_f": 76,
-  "f8_f": 77,
-  "g8_f": 78,
-  "h8_f": 79,
-  "a1_t": 80,
-  "b1_t": 81,
-  "c1_t": 82,
-  "d1_t": 83,
-  "e1_t": 84,
-  "f1_t": 85,
-  "g1_t": 86,
-  "h1_t": 87,
-  "a2_t": 88,
-  "b2_t": 89,
-  "c2_t": 90,
-  "d2_t": 91,
-  "e2_t": 92,
-  "f2_t": 93,
-  "g2_t": 94,
-  "h2_t": 95,
-  "a3_t": 96,
-  "b3_t": 97,
-  "c3_t": 98,
-  "d3_t": 99,
-  "e3_t": 100,
-  "f3_t": 101,
-  "g3_t": 102,
-  "h3_t": 103,
-  "a4_t": 104,
-  "b4_t": 105,
-  "c4_t": 106,
-  "d4_t": 107,
-  "e4_t": 108,
-  "f4_t": 109,
-  "g4_t": 110,
-  "h4_t": 111,
-  "a5_t": 112,
-  "b5_t": 113,
-  "c5_t": 114,
-  "d5_t": 115,
-  "e5_t": 116,
-  "f5_t": 117,
-  "g5_t": 118,
-  "h5_t": 119,
-  "a6_t": 120,
-  "b6_t": 121,
-  "c6_t": 122,
-  "d6_t": 123,
-  "e6_t": 124,
-  "f6_t": 125,
-  "g6_t": 126,
-  "h6_t": 127,
-  "a7_t": 128,
-  "b7_t": 129,
-  "c7_t": 130,
-  "d7_t": 131,
-  "e7_t": 132,
-  "f7_t": 133,
-  "g7_t": 134,
-  "h7_t": 135,
-  "a8_t": 136,
-  "b8_t": 137,
-  "c8_t": 138,
-  "d8_t": 139,
-  "e8_t": 140,
-  "f8_t": 141,
-  "g8_t": 142,
-  "h8_t": 143,
-  "(x)": 144,
-  "(+)": 145,
-  "(+*)": 146,
-  "(o)": 147,
-  "(O)": 148
+  "SIDE_W": 4,
+  "SIDE_B": 5,
+  "PIECE_P": 6,
+  "PIECE_N": 7,
+  "PIECE_B": 8,
+  "PIECE_R": 9,
+  "PIECE_Q": 10,
+  "PIECE_K": 11,
+  "SQ_a1": 12,
+  "SQ_a2": 13,
+  "SQ_a3": 14,
+  "SQ_a4": 15,
+  "SQ_a5": 16,
+  "SQ_a6": 17,
+  "SQ_a7": 18,
+  "SQ_a8": 19,
+  "SQ_b1": 20,
+  "SQ_b2": 21,
+  "SQ_b3": 22,
+  "SQ_b4": 23,
+  "SQ_b5": 24,
+  "SQ_b6": 25,
+  "SQ_b7": 26,
+  "SQ_b8": 27,
+  "SQ_c1": 28,
+  "SQ_c2": 29,
+  "SQ_c3": 30,
+  "SQ_c4": 31,
+  "SQ_c5": 32,
+  "SQ_c6": 33,
+  "SQ_c7": 34,
+  "SQ_c8": 35,
+  "SQ_d1": 36,
+  "SQ_d2": 37,
+  "SQ_d3": 38,
+  "SQ_d4": 39,
+  "SQ_d5": 40,
+  "SQ_d6": 41,
+  "SQ_d7": 42,
+  "SQ_d8": 43,
+  "SQ_e1": 44,
+  "SQ_e2": 45,
+  "SQ_e3": 46,
+  "SQ_e4": 47,
+  "SQ_e5": 48,
+  "SQ_e6": 49,
+  "SQ_e7": 50,
+  "SQ_e8": 51,
+  "SQ_f1": 52,
+  "SQ_f2": 53,
+  "SQ_f3": 54,
+  "SQ_f4": 55,
+  "SQ_f5": 56,
+  "SQ_f6": 57,
+  "SQ_f7": 58,
+  "SQ_f8": 59,
+  "SQ_g1": 60,
+  "SQ_g2": 61,
+  "SQ_g3": 62,
+  "SQ_g4": 63,
+  "SQ_g5": 64,
+  "SQ_g6": 65,
+  "SQ_g7": 66,
+  "SQ_g8": 67,
+  "SQ_h1": 68,
+  "SQ_h2": 69,
+  "SQ_h3": 70,
+  "SQ_h4": 71,
+  "SQ_h5": 72,
+  "SQ_h6": 73,
+  "SQ_h7": 74,
+  "SQ_h8": 75,
+  "PROMO_Q": 76,
+  "PROMO_R": 77,
+  "PROMO_B": 78,
+  "PROMO_N": 79,
+  "CAPTURE": 80,
+  "CHECK": 81,
+  "MATE": 82,
+  "CASTLE": 83
 }
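
The new vocabulary size follows from the factorized groups: 4 special tokens + 2 sides + 6 pieces + 64 squares + 4 promotion markers + 4 flags = 84, matching vocab_size in config.json. A one-line check against the tokenizer's fixed vocabulary:

from tokenizer import ChessTokenizer

# special + sides + pieces + squares + promotions + flags
assert ChessTokenizer().vocab_size == 4 + 2 + 6 + 64 + 4 + 4 == 84
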