Chess Challenge submission by alexandreduplessis
Files changed: tokenizer.py (+0, -70)

tokenizer.py CHANGED
@@ -1,22 +1,3 @@
-"""
-Custom Chess Tokenizer for the Chess Challenge (structured, decomposed).
-
-This tokenizer parses the dataset's extended UCI tokens (e.g., WPe2e4, BNg8f6(x))
-and decomposes each move into a small set of atomic tokens:
-
-    [MOVE] e2 e4
-    [MOVE] e7 e8 promo_q    (promotion when detected)
-
-Design goals for <1M parameter models:
-- Small, fixed vocabulary (no dataset scan needed)
-- Reduced sparsity (share statistics across moves)
-- Fewer failure modes (drop suffix tokens like (x), (+), etc.)
-- Compatible with HF Trainer / PreTrainedTokenizer
-
-Note: evaluation extracts UCI moves by detecting square patterns in generated text.
-This tokenizer ensures squares appear as tokens ("e2", "e4"), which is evaluator-friendly.
-"""
-
 from __future__ import annotations
 
 import json
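For reference, the decomposition scheme the removed module docstring describes looks like this in use. A minimal sketch; calling the private `_tokenize` directly and the `=Q` promotion-suffix spelling are illustrative assumptions, not taken from the diff:

    from tokenizer import ChessTokenizer  # this file's module

    tok = ChessTokenizer.build_structured_vocab()

    # One extended-UCI dataset move -> a structure token plus two squares.
    tok._tokenize("WPe2e4")      # ['[MOVE]', 'e2', 'e4']

    # Suffixes like '(x)' are dropped; only the squares survive.
    tok._tokenize("BNg8f6(x)")   # ['[MOVE]', 'g8', 'f6']

    # Promotions additionally emit a promo token (suffix form assumed).
    tok._tokenize("WPe7e8=Q")    # ['[MOVE]', 'e7', 'e8', 'promo_q']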
@@ -28,15 +9,6 @@ from transformers import PreTrainedTokenizer
 
 
 class ChessTokenizer(PreTrainedTokenizer):
-    """
-    A custom tokenizer for chess moves.
-
-    Each dataset move like 'WPe2e4(x)' becomes tokens:
-        ['[MOVE]', 'e2', 'e4']  (+ optional 'promo_q/r/b/n')
-
-    This helps small models learn legality by learning square transitions
-    rather than memorizing thousands of full-move tokens.
-    """
 
     model_input_names = ["input_ids", "attention_mask"]
     vocab_files_names = {"vocab_file": "vocab.json"}
@@ -50,11 +22,9 @@ class ChessTokenizer(PreTrainedTokenizer):
     # Structure token
     MOVE_TOKEN = "[MOVE]"
 
-    # Regex to parse dataset moves: W/B + piece + from + to + rest
     _MOVE_RE = re.compile(
         r'^(?P<color>[WB])(?P<piece>[PNBRQK])(?P<from>[a-h][1-8])(?P<to>[a-h][1-8])(?P<rest>.*)$'
     )
-    # Promotion detection (be permissive)
     _PROMO_RE = re.compile(r'=?([QRBNqrbn])')
 
     def __init__(
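To make the parse concrete, a standalone sketch of what `_MOVE_RE` captures (the pattern is copied verbatim from the diff; the example move string is made up):

    import re

    _MOVE_RE = re.compile(
        r'^(?P<color>[WB])(?P<piece>[PNBRQK])(?P<from>[a-h][1-8])(?P<to>[a-h][1-8])(?P<rest>.*)$'
    )

    m = _MOVE_RE.match("WPe2e4(x)")
    m.group("color")                 # 'W'
    m.group("piece")                 # 'P'
    m.group("from"), m.group("to")   # ('e2', 'e4')
    m.group("rest")                  # '(x)' -- the suffix that later gets dropped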
@@ -86,7 +56,6 @@ class ChessTokenizer(PreTrainedTokenizer):
 
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
 
-        # Call parent init AFTER vocab is ready
         super().__init__(
             pad_token=self._pad_token,
             bos_token=self._bos_token,
@@ -96,21 +65,11 @@ class ChessTokenizer(PreTrainedTokenizer):
         )
 
     def _create_default_vocab(self) -> Dict[str, int]:
-        """
-        Minimal default vocab (placeholder). Prefer build_structured_vocab().
-        """
         special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN]
         return {t: i for i, t in enumerate(special)}
 
     @classmethod
     def build_structured_vocab(cls) -> "ChessTokenizer":
-        """
-        Build a fixed, complete vocabulary:
-        - special tokens
-        - [MOVE]
-        - 64 squares: a1..h8
-        - promotion tokens: promo_q/r/b/n
-        """
         special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN, cls.MOVE_TOKEN]
 
         files = "abcdefgh"
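The fixed vocabulary built here is tiny and needs no dataset scan. A sketch of its composition, assuming the loop after `files = "abcdefgh"` enumerates ranks 1-8 and the promo tokens match the removed docstring; the special-token spellings are assumptions:

    special = ["[PAD]", "[BOS]", "[EOS]", "[UNK]", "[MOVE]"]   # spellings assumed
    squares = [f + r for f in "abcdefgh" for r in "12345678"]  # a1 .. h8
    promos  = ["promo_q", "promo_r", "promo_b", "promo_n"]

    tokens = special + squares + promos
    len(tokens)   # 73 = 5 special + 64 squares + 4 promotions
    vocab = {t: i for i, t in enumerate(tokens)}  # same shape as the vocab dict built above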
@@ -123,8 +82,6 @@ class ChessTokenizer(PreTrainedTokenizer):
         vocab = {t: i for i, t in enumerate(tokens)}
         return cls(vocab=vocab)
 
-    # Backwards-compatible API: if someone calls dataset-based vocab build,
-    # we return structured vocab by default (dataset scan is unnecessary here).
     @classmethod
     def build_vocab_from_dataset(
         cls,
@@ -134,7 +91,6 @@ class ChessTokenizer(PreTrainedTokenizer):
         min_frequency: int = 500,
         max_samples: Optional[int] = 100000,
     ) -> "ChessTokenizer":
-        # Keep signature, but use structured vocab for this tokenizer design.
         return cls.build_structured_vocab()
 
     @property
@@ -151,20 +107,10 @@ class ChessTokenizer(PreTrainedTokenizer):
         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """
-        Convert tokens back to a string.
-
-        We keep squares and promo tokens; we drop PAD/BOS/EOS/UNK for cleaner output.
-        Keeping [MOVE] is useful for structure (but you can drop it if you want).
-        """
         drop = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
         return " ".join(t for t in tokens if t not in drop)
 
     def _decompose_one_move(self, move_tok: str) -> List[str]:
-        """
-        Parse dataset move token 'WPe2e4(x)' -> ['[MOVE]', 'e2', 'e4'] (+ promo)
-        If parsing fails, emit [UNK].
-        """
         m = self._MOVE_RE.match(move_tok)
         if not m:
             return [self.UNK_TOKEN]
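A round trip through these two methods, following the removed docstrings (outputs are expected values, not captured ones; the `[UNK]` spelling is an assumption):

    tok = ChessTokenizer.build_structured_vocab()

    tokens = tok._decompose_one_move("WPe2e4(x)")
    # expected: ['[MOVE]', 'e2', 'e4']

    # [MOVE] and squares are kept; PAD/BOS/EOS/UNK are filtered out.
    tok.convert_tokens_to_string(tokens)
    # expected: '[MOVE] e2 e4'

    # Anything _MOVE_RE cannot parse collapses to a single unknown token.
    tok._decompose_one_move("O-O")
    # expected: ['[UNK]']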
@@ -185,31 +131,19 @@ class ChessTokenizer(PreTrainedTokenizer):
         return out
 
     def _tokenize(self, text: str) -> List[str]:
-        """
-        Tokenize text.
-
-        Important: HF may call _tokenize() on already-split "words".
-        So this must handle both:
-        - full strings with spaces
-        - a single token like "WPe2e4(x)"
-        """
         text = text.strip()
         if not text:
             return []
 
         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN}
 
-        # If HF already split: single "word"
         if " " not in text:
             if text in special:
                 return [text]
-            # If it's already a square or promo token, keep it
             if text in self._vocab:
                 return [text]
-            # Otherwise treat as a dataset move token
             return self._decompose_one_move(text)
 
-        # Otherwise split ourselves
         out: List[str] = []
         for part in text.split():
             if part in special:
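The two call modes the removed docstring warns about, sketched (HF's `tokenize()` machinery may hand `_tokenize` either a full string or a single pre-split word):

    tok = ChessTokenizer.build_structured_vocab()

    # Single pre-split "word": decomposed directly.
    tok._tokenize("WPe2e4(x)")
    # expected: ['[MOVE]', 'e2', 'e4']

    # Full space-separated text: split here, each part decomposed in turn.
    tok._tokenize("WPe2e4 BPe7e5")
    # expected: ['[MOVE]', 'e2', 'e4', '[MOVE]', 'e7', 'e5']

    # Tokens already in the vocab (squares, promos) pass through unchanged.
    tok._tokenize("e4")
    # expected: ['e4']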
@@ -245,10 +179,6 @@ def count_vocab_from_dataset(
     column: str = "text",
     max_samples: Optional[int] = 10000,
 ) -> Dict[str, int]:
-    """
-    Left here for convenience if you still want frequency stats,
-    but it's not used by the structured tokenizer.
-    """
     from collections import Counter
     from datasets import load_dataset
 
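For completeness, the frequency pass this helper performs can be sketched as a plain Counter loop. This is an illustrative reconstruction under assumed behavior (whitespace-split moves in a `text` column), not the file's exact body:

    from collections import Counter
    from datasets import load_dataset

    def count_tokens_sketch(dataset_name: str, column: str = "text",
                            max_samples: int = 10000) -> Counter:
        # Stream the train split and tally whitespace-separated move tokens.
        ds = load_dataset(dataset_name, split="train", streaming=True)
        counts: Counter = Counter()
        for i, row in enumerate(ds):
            if i >= max_samples:
                break
            counts.update(row[column].split())
        return counts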