Kevin Hamon committed on
Commit
876e9df
·
1 Parent(s): cd81776

fix tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.py +218 -69
tokenizer.py CHANGED
@@ -1,14 +1,13 @@
1
  """
2
  Custom Chess Tokenizer for the Chess Challenge.
3
 
4
- This tokenizer treats each move as a single token using the extended UCI notation
5
- from the Lichess dataset (e.g., WPe2e4, BNg8f6).
6
-
7
- The dataset format uses:
8
  - W/B prefix for White/Black
9
  - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
10
- - Source and destination squares (e.g., e2e4)
 
11
  - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
 
12
  """
13
 
14
  from __future__ import annotations
@@ -16,6 +15,8 @@ from __future__ import annotations
16
  import json
17
  import os
18
  from pathlib import Path
 
 
19
  from typing import Dict, List, Optional
20
 
21
  from transformers import PreTrainedTokenizer
@@ -23,16 +24,12 @@ from transformers import PreTrainedTokenizer
23
 
24
  class ChessTokenizer(PreTrainedTokenizer):
25
  """
26
- A custom tokenizer for chess moves using extended UCI notation.
27
-
28
- This tokenizer maps each possible chess move to a unique token ID.
29
- The vocabulary is built from the training dataset to ensure all moves
30
- encountered during training have a corresponding token.
31
 
32
  Example:
33
  >>> tokenizer = ChessTokenizer()
34
  >>> tokenizer.encode("WPe2e4 BPe7e5")
35
- [1, 42, 87, 2] # [BOS, e2e4, e7e5, EOS]
36
  """
37
 
38
  model_input_names = ["input_ids", "attention_mask"]
@@ -43,6 +40,7 @@ class ChessTokenizer(PreTrainedTokenizer):
43
  BOS_TOKEN = "[BOS]"
44
  EOS_TOKEN = "[EOS]"
45
  UNK_TOKEN = "[UNK]"
 
46
 
47
  def __init__(
48
  self,
@@ -63,6 +61,7 @@ class ChessTokenizer(PreTrainedTokenizer):
63
  self._bos_token = self.BOS_TOKEN
64
  self._eos_token = self.EOS_TOKEN
65
  self._unk_token = self.UNK_TOKEN
 
66
 
67
  # Remove any duplicate special-token entries passed through kwargs
68
  # to avoid "multiple values for keyword" errors when loading from disk.
@@ -70,6 +69,7 @@ class ChessTokenizer(PreTrainedTokenizer):
70
  kwargs.pop("bos_token", None)
71
  kwargs.pop("eos_token", None)
72
  kwargs.pop("unk_token", None)
 
73
 
74
  # Load or create vocabulary
75
  if vocab is not None:
@@ -91,6 +91,7 @@ class ChessTokenizer(PreTrainedTokenizer):
91
  bos_token=self._bos_token,
92
  eos_token=self._eos_token,
93
  unk_token=self._unk_token,
 
94
  **kwargs,
95
  )
96
 
@@ -101,48 +102,10 @@ class ChessTokenizer(PreTrainedTokenizer):
101
  For the full vocabulary, use `build_vocab_from_dataset()`.
102
  This minimal vocab is just a placeholder - you should build from data.
103
  """
104
- special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
105
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
106
  return vocab
107
 
108
- @classmethod
109
- def build_vocab_from_iterator(
110
- cls,
111
- iterator,
112
- min_frequency: int = 1,
113
- ) -> "ChessTokenizer":
114
- """
115
- Build a tokenizer vocabulary from an iterator of game strings.
116
-
117
- Args:
118
- iterator: An iterator yielding game strings (space-separated moves).
119
- min_frequency: Minimum frequency for a token to be included.
120
-
121
- Returns:
122
- A ChessTokenizer with the built vocabulary.
123
- """
124
- from collections import Counter
125
-
126
- token_counts = Counter()
127
-
128
- for game in iterator:
129
- moves = game.strip().split()
130
- token_counts.update(moves)
131
-
132
- # Filter by frequency
133
- tokens = [
134
- token for token, count in token_counts.items()
135
- if count >= min_frequency
136
- ]
137
-
138
- # Sort for reproducibility
139
- tokens = sorted(tokens)
140
-
141
- # Build vocabulary
142
- special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
143
- vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
144
-
145
- return cls(vocab=vocab)
146
 
147
  @classmethod
148
  def build_vocab_from_dataset(
@@ -150,8 +113,7 @@ class ChessTokenizer(PreTrainedTokenizer):
150
  dataset_name: str = "dlouapre/lichess_2025-01_1M",
151
  split: str = "train",
152
  column: str = "text",
153
- min_frequency: int = 500,
154
- max_samples: Optional[int] = 100000,
155
  ) -> "ChessTokenizer":
156
  """
157
  Build a tokenizer vocabulary from a Hugging Face dataset.
@@ -160,24 +122,101 @@ class ChessTokenizer(PreTrainedTokenizer):
160
  dataset_name: Name of the dataset on Hugging Face Hub.
161
  split: Dataset split to use.
162
  column: Column containing the game strings.
163
- min_frequency: Minimum frequency for a token to be included (default: 500).
164
- max_samples: Maximum number of samples to process (default: 100k).
165
 
166
  Returns:
167
  A ChessTokenizer with the built vocabulary.
 
 
 
 
168
  """
169
  from datasets import load_dataset
170
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  dataset = load_dataset(dataset_name, split=split)
172
-
173
- if max_samples is not None:
174
- dataset = dataset.select(range(min(max_samples, len(dataset))))
175
-
176
- def game_iterator():
177
- for example in dataset:
178
- yield example[column]
179
-
180
- return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  @property
183
  def vocab_size(self) -> int:
@@ -198,7 +237,34 @@ class ChessTokenizer(PreTrainedTokenizer):
198
  Returns:
199
  List of move tokens.
200
  """
201
- return text.strip().split()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  def _convert_token_to_id(self, token: str) -> int:
204
  """Convert a token to its ID."""
@@ -213,7 +279,16 @@ class ChessTokenizer(PreTrainedTokenizer):
213
  # Filter out special tokens for cleaner output
214
  special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
215
  return " ".join(t for t in tokens if t not in special)
216
-
 
 
 
 
 
 
 
 
 
217
  def save_vocabulary(
218
  self,
219
  save_directory: str,
@@ -242,6 +317,80 @@ class ChessTokenizer(PreTrainedTokenizer):
242
 
243
  return (vocab_file,)
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  def count_vocab_from_dataset(
247
  dataset_name: str = "dlouapre/lichess_2025-01_1M",
@@ -269,10 +418,10 @@ def count_vocab_from_dataset(
269
  if max_samples is not None:
270
  dataset = dataset.select(range(min(max_samples, len(dataset))))
271
 
 
272
  token_counts = Counter()
273
 
274
  for example in dataset:
275
- moves = example[column].strip().split()
276
- token_counts.update(moves)
277
 
278
- return dict(token_counts)
 
1
  """
2
  Custom Chess Tokenizer for the Chess Challenge.
3
 
4
+ We build a vocabulary with:
 
 
 
5
  - W/B prefix for White/Black
6
  - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
7
+ - Source square file and rank: e.g. e 2
8
+ - Destination square file and rank: e.g. e 4
9
  - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
10
+
11
  """
12
 
13
  from __future__ import annotations
 
15
  import json
16
  import os
17
  from pathlib import Path
18
+ import shutil
19
+ import inspect
20
  from typing import Dict, List, Optional
21
 
22
  from transformers import PreTrainedTokenizer
 
24
 
25
  class ChessTokenizer(PreTrainedTokenizer):
26
  """
27
+ A custom tokenizer for chess moves.
 
 
 
 
28
 
29
  Example:
30
  >>> tokenizer = ChessTokenizer()
31
  >>> tokenizer.encode("WPe2e4 BPe7e5")
32
+ # [BOS, W, P, e, 2, e, 4, B, P, e, 7, e, 5, EOS]
33
  """
34
 
35
  model_input_names = ["input_ids", "attention_mask"]
 
40
  BOS_TOKEN = "[BOS]"
41
  EOS_TOKEN = "[EOS]"
42
  UNK_TOKEN = "[UNK]"
43
+ SEP_TOKEN = "[SEP]"
44
 
45
  def __init__(
46
  self,
 
61
  self._bos_token = self.BOS_TOKEN
62
  self._eos_token = self.EOS_TOKEN
63
  self._unk_token = self.UNK_TOKEN
64
+ self._sep_token = self.SEP_TOKEN
65
 
66
  # Remove any duplicate special-token entries passed through kwargs
67
  # to avoid "multiple values for keyword" errors when loading from disk.
 
69
  kwargs.pop("bos_token", None)
70
  kwargs.pop("eos_token", None)
71
  kwargs.pop("unk_token", None)
72
+ kwargs.pop("sep_token", None)
73
 
74
  # Load or create vocabulary
75
  if vocab is not None:
 
91
  bos_token=self._bos_token,
92
  eos_token=self._eos_token,
93
  unk_token=self._unk_token,
94
+ sep_token=self._sep_token,
95
  **kwargs,
96
  )
97
 
 
102
  For the full vocabulary, use `build_vocab_from_dataset()`.
103
  This minimal vocab is just a placeholder - you should build from data.
104
  """
105
+ special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.SEP_TOKEN]
106
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
107
  return vocab
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  @classmethod
111
  def build_vocab_from_dataset(
 
113
  dataset_name: str = "dlouapre/lichess_2025-01_1M",
114
  split: str = "train",
115
  column: str = "text",
116
+ save_path: Optional[str] = None,
 
117
  ) -> "ChessTokenizer":
118
  """
119
  Build a tokenizer vocabulary from a Hugging Face dataset.
 
122
  dataset_name: Name of the dataset on Hugging Face Hub.
123
  split: Dataset split to use.
124
  column: Column containing the game strings.
 
 
125
 
126
  Returns:
127
  A ChessTokenizer with the built vocabulary.
128
+
129
+ Args:
130
+ save_path: Optional path to write the generated vocab JSON. If not
131
+ provided, the vocab will be saved to ``./chess_tokenizer_vocab.json``.
132
  """
133
  from datasets import load_dataset
134
+
135
+ # If a saved vocab exists at `save_path`, load it and return a tokenizer
136
+ if save_path is None:
137
+ cwd = os.getcwd()
138
+ save_path = os.path.join(cwd, "chess_tokenizer_vocab.json")
139
+
140
+ if os.path.exists(save_path):
141
+ try:
142
+ with open(save_path, "r", encoding="utf-8") as f:
143
+ print("Loading existing tokenizer vocab from", save_path)
144
+ vocab = json.load(f)
145
+ return cls(vocab=vocab)
146
+ except Exception:
147
+ # If loading fails, fall through to rebuild the vocab.
148
+ pass
149
+
150
  dataset = load_dataset(dataset_name, split=split)
151
+
152
+ # Iterator over games (respect max_samples if provided)
153
+ samples = dataset[column]
154
+
155
+ tokens = set()
156
+
157
+ for game in samples:
158
+ if not isinstance(game, str):
159
+ continue
160
+ moves = game.strip().split()
161
+ for move in moves:
162
+ # Basic parsing of move token components
163
+ if len(move) < 2:
164
+ continue
165
+ color = move[0]
166
+ piece = move[1]
167
+ from_square = move[2:4] if len(move) >= 4 else ''
168
+ to_square = move[4:6] if len(move) >= 6 else ''
169
+ suffix = move[6:] if len(move) > 6 else ''
170
+
171
+ tokens.add(color)
172
+ tokens.add(piece)
173
+ tokens.add(from_square)
174
+ tokens.add(to_square)
175
+ if suffix:
176
+ tokens.add(suffix)
177
+
178
+ # Sort tokens
179
+ tokens = sorted(tokens)
180
+
181
+ # Ensure special tokens are present at fixed ids
182
+ special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN, cls.SEP_TOKEN]
183
+
184
+ # Build vocab mapping: special tokens first, then tokens
185
+ vocab: Dict[str, int] = {}
186
+ idx = 0
187
+ for st in special_tokens:
188
+ vocab[st] = idx
189
+ idx += 1
190
+
191
+ for t in tokens:
192
+ if t in vocab:
193
+ continue
194
+ vocab[t] = idx
195
+ idx += 1
196
+
197
+ # Create tokenizer instance with this vocab
198
+ tokenizer = cls(vocab=vocab)
199
+
200
+ # Save vocab to disk. Use provided `save_path` or default file name.
201
+ try:
202
+ if save_path is None:
203
+ cwd = os.getcwd()
204
+ save_path = os.path.join(cwd, "chess_tokenizer_vocab.json")
205
+
206
+ # Write to a temporary file first and atomically replace final file.
207
+ tmp_path = save_path + ".tmp"
208
+ with open(tmp_path, "w", encoding="utf-8") as f:
209
+ json.dump(vocab, f, ensure_ascii=False, indent=2)
210
+ os.replace(tmp_path, save_path)
211
+ except Exception:
212
+ # Non-fatal: ignore save errors but don't leave temp files behind.
213
+ try:
214
+ if 'tmp_path' in locals() and os.path.exists(tmp_path):
215
+ os.remove(tmp_path)
216
+ except Exception:
217
+ pass
218
+
219
+ return tokenizer
220
 
221
  @property
222
  def vocab_size(self) -> int:
 
237
  Returns:
238
  List of move tokens.
239
  """
240
+ tokens: List[str] = []
241
+ for move in text.strip().split():
242
+ if len(move) < 2:
243
+ continue
244
+ color, piece, from_square, to_square, suffix = self._decompose_move(move)
245
+ tokens.append(color)
246
+ tokens.append(piece)
247
+ tokens.append(from_square)
248
+ tokens.append(to_square)
249
+ if suffix:
250
+ tokens.append(suffix)
251
+
252
+ tokens.append(self._sep_token)
253
+
254
+ return tokens[:-1] # Remove last SEP token
255
+
256
+ @staticmethod
257
+ def _decompose_move(move: str):
258
+ """Decompose a move string into components: color, piece, from_square, to_square, suffix.
259
+
260
+ Returns a 5-tuple of strings (empty strings for missing parts).
261
+ """
262
+ color = move[0]
263
+ piece = move[1] if len(move) >= 2 else ''
264
+ from_square = move[2:4] if len(move) >= 4 else ''
265
+ to_square = move[4:6] if len(move) >= 6 else ''
266
+ suffix = move[6:] if len(move) > 6 else ''
267
+ return color, piece, from_square, to_square, suffix
268
 
269
  def _convert_token_to_id(self, token: str) -> int:
270
  """Convert a token to its ID."""
 
279
  # Filter out special tokens for cleaner output
280
  special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
281
  return " ".join(t for t in tokens if t not in special)
282
+
283
+ def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str:
284
+ """Decode a list of token IDs back to a string."""
285
+ tokens = [self._convert_id_to_token(int(tid)) for tid in token_ids]
286
+ if skip_special_tokens:
287
+ special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
288
+ # SEP token should be replace by space
289
+ tokens = [t if t != self.SEP_TOKEN else " " for t in tokens if t not in special]
290
+ return "".join(tokens)
291
+
292
  def save_vocabulary(
293
  self,
294
  save_directory: str,
 
317
 
318
  return (vocab_file,)
319
 
320
+ def save_pretrained(
321
+ self,
322
+ save_directory: str,
323
+ filename_prefix: Optional[str] = None,
324
+ save_tokenizer_code: bool = True,
325
+ ) -> None:
326
+ """Save tokenizer files to a directory in a HF-compatible layout.
327
+
328
+ This writes the vocab JSON (via `save_vocabulary`), a small
329
+ `tokenizer_config.json` describing special tokens and the vocab
330
+ filename, and optionally copies the tokenizer module source file
331
+ into the directory so others can import the implementation.
332
+ """
333
+ if not os.path.isdir(save_directory):
334
+ os.makedirs(save_directory, exist_ok=True)
335
+
336
+ # Save the vocabulary file
337
+ vocab_file_tuple = self.save_vocabulary(save_directory, filename_prefix)
338
+ vocab_file = vocab_file_tuple[0]
339
+
340
+ # Write a minimal tokenizer config
341
+ config = {
342
+ "tokenizer_class": self.__class__.__name__,
343
+ "vocab_file": os.path.basename(vocab_file),
344
+ "pad_token": self.PAD_TOKEN,
345
+ "bos_token": self.BOS_TOKEN,
346
+ "eos_token": self.EOS_TOKEN,
347
+ "unk_token": self.UNK_TOKEN,
348
+ }
349
+ config_path = os.path.join(save_directory, "tokenizer_config.json")
350
+ with open(config_path, "w", encoding="utf-8") as f:
351
+ json.dump(config, f, ensure_ascii=False, indent=2)
352
+
353
+ # Optionally copy this module file so the tokenizer class implementation
354
+ # is available alongside the saved vocab/config. This helps when
355
+ # transferring the saved tokenizer to another environment.
356
+ if save_tokenizer_code:
357
+ try:
358
+ src_file = Path(inspect.getsourcefile(self.__class__))
359
+ dst_file = Path(save_directory) / src_file.name
360
+ shutil.copy2(src_file, dst_file)
361
+ except Exception:
362
+ # Non-fatal; we still saved vocab and config
363
+ pass
364
+
365
+ @classmethod
366
+ def from_pretrained(cls, load_directory: str) -> "ChessTokenizer":
367
+ """Load tokenizer from a directory previously written with `save_pretrained`.
368
+
369
+ This primarily reads the vocab file and constructs the tokenizer.
370
+ If a `tokenizer_config.json` exists it will be consulted for the
371
+ vocab filename and special tokens (but we still instantiate using
372
+ the provided class).
373
+ """
374
+ config_path = os.path.join(load_directory, "tokenizer_config.json")
375
+ vocab_file = None
376
+ if os.path.exists(config_path):
377
+ try:
378
+ with open(config_path, "r", encoding="utf-8") as f:
379
+ cfg = json.load(f)
380
+ vocab_file = os.path.join(load_directory, cfg.get("vocab_file", "vocab.json"))
381
+ except Exception:
382
+ pass
383
+
384
+ if vocab_file is None:
385
+ # Fallback: look for a vocab file in the directory
386
+ candidates = [p for p in os.listdir(load_directory) if p.endswith("vocab.json")]
387
+ if candidates:
388
+ vocab_file = os.path.join(load_directory, candidates[0])
389
+
390
+ if vocab_file is None or not os.path.exists(vocab_file):
391
+ raise FileNotFoundError(f"No vocab file found in {load_directory}")
392
+
393
+ return cls(vocab_file=vocab_file)
394
 
395
  def count_vocab_from_dataset(
396
  dataset_name: str = "dlouapre/lichess_2025-01_1M",
 
418
  if max_samples is not None:
419
  dataset = dataset.select(range(min(max_samples, len(dataset))))
420
 
421
+ tokenizer = ChessTokenizer()
422
  token_counts = Counter()
423
 
424
  for example in dataset:
425
+ token_counts.update(tokenizer._tokenize(example[column]))
 
426
 
427
+ return dict(token_counts)