LLM-course
/

chess_swdo_s

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2f6badcac6a0d4167ee6f95d6322d2d821159f95ce87172d365dc48ec691b74
 size 3490096

 version https://git-lfs.github.com/spec/v1
+oid sha256:02aaa4f1c65a9dc94d710f1071176353e932da7f89b161579755aa4c93adcc71
 size 3490096

tokenizer.py CHANGED Viewed

@@ -1,16 +1,4 @@
-"""
-Decomposed Chess Tokenizer v2 for the Chess Challenge.
-This tokenizer decomposes moves into structural components:
-- Color (W/B)
-- Piece (P/N/B/R/Q/K)
-- From square (a1-h8)
-- To square (a1-h8)
-- Modifiers (capture, check, checkmate, promotion, castling)
-This allows the model to learn chess structure and generalize better
-while using a much smaller vocabulary (~90 tokens vs ~1200+).
-"""
 from __future__ import annotations
@@ -25,18 +13,12 @@ from transformers import PreTrainedTokenizer
 class ChessTokenizer(PreTrainedTokenizer):
     """
-    Decomposed chess move tokenizer.
-    Breaks moves into structural components for better learning.
     Example:
         >>> tokenizer = ChessTokenizer()
-        >>> tokens = tokenizer.tokenize("WPe2e4 BPe7e5")
-        >>> print(tokens)
-        ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
-        >>> tokenizer.encode("WNg1f3(+)")
-        [1, 5, 8, 39, 29, 12, 2]  # [BOS, W, N, g1, f3, +, EOS]
     """
     model_input_names = ["input_ids", "attention_mask"]
@@ -47,18 +29,14 @@ class ChessTokenizer(PreTrainedTokenizer):
     BOS_TOKEN = "[BOS]"
     EOS_TOKEN = "[EOS]"
     UNK_TOKEN = "[UNK]"
-    SEP_TOKEN = "[SEP]"  # Optional: separate moves
     # Chess components
-    # Use [W] and [B] for colors to avoid collision with piece 'B' (Bishop)
     COLORS = ["[W]", "[B]"]
     PIECES = ["P", "N", "B", "R", "Q", "K"]
     FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
     RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
-    # Generate all 64 squares
     SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
-    # Modifiers
     MODIFIERS = [
         "x",      # Capture
         "+",      # Check
@@ -74,8 +52,6 @@ class ChessTokenizer(PreTrainedTokenizer):
         "O",      # Queenside castling (dataset format)
     ]
-    # Regex pattern to parse extended UCI moves
-    # Format: [W|B][Piece][from_sq][to_sq][promotion]?[suffixes]?
     MOVE_PATTERN = re.compile(
         r'^([WB])'                    # Color
         r'([PNBRQK])'                 # Piece
@@ -89,24 +65,16 @@ class ChessTokenizer(PreTrainedTokenizer):
         self,
         vocab_file: Optional[str] = None,
         vocab: Optional[Dict[str, int]] = None,
-        add_move_separator: bool = False,
         **kwargs,
     ):
-        """
-        Initialize the decomposed chess tokenizer.
-        Args:
-            vocab_file: Path to vocabulary JSON file.
-            vocab: Pre-built vocabulary dictionary.
-            add_move_separator: Whether to add [SEP] between moves.
-        """
         self._pad_token = self.PAD_TOKEN
         self._bos_token = self.BOS_TOKEN
         self._eos_token = self.EOS_TOKEN
         self._unk_token = self.UNK_TOKEN
-        self.add_move_separator = add_move_separator
-        # Remove duplicates from kwargs
         kwargs.pop("pad_token", None)
         kwargs.pop("bos_token", None)
         kwargs.pop("eos_token", None)
@@ -119,9 +87,10 @@ class ChessTokenizer(PreTrainedTokenizer):
             with open(vocab_file, "r", encoding="utf-8") as f:
                 self._vocab = json.load(f)
         else:
-            self._vocab = self._create_vocab()
-        # Reverse mapping
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
         super().__init__(
@@ -132,14 +101,17 @@ class ChessTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
-    def _create_vocab(self) -> Dict[str, int]:
-        """Create the fixed vocabulary from chess components."""
         tokens = []
         # Special tokens first
         tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
-        if self.add_move_separator:
-            tokens.append(self.SEP_TOKEN)
         # Colors
         tokens.extend(self.COLORS)
@@ -155,6 +127,56 @@ class ChessTokenizer(PreTrainedTokenizer):
         return {token: idx for idx, token in enumerate(tokens)}
     @property
     def vocab_size(self) -> int:
         return len(self._vocab)
@@ -175,12 +197,11 @@ class ChessTokenizer(PreTrainedTokenizer):
         match = self.MOVE_PATTERN.match(move)
         if not match:
-            # Fallback: return as unknown
             return [self.UNK_TOKEN]
         tokens = []
-        # Color - map 'W' -> '[W]' and 'B' -> '[B]' to avoid collision with piece Bishop
         color = match.group(1)
         tokens.append(f"[{color}]")
@@ -195,15 +216,13 @@ class ChessTokenizer(PreTrainedTokenizer):
         # Promotion (optional)
         if match.group(5):
-            tokens.append(match.group(5))  # e.g., "=Q"
         # Parse suffixes (optional)
         if match.group(6):
-            suffix = match.group(6)  # e.g., "(x+)"
-            # Remove parentheses
             suffix_content = suffix[1:-1]
-            # Parse individual modifiers
             if "x" in suffix_content:
                 tokens.append("x")
             if "+*" in suffix_content:
@@ -219,7 +238,7 @@ class ChessTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text: str) -> List[str]:
         """
-        Tokenize a string of moves.
         Args:
             text: Space-separated moves in extended UCI format.
@@ -230,13 +249,9 @@ class ChessTokenizer(PreTrainedTokenizer):
         tokens = []
         moves = text.strip().split()
-        for i, move in enumerate(moves):
             move_tokens = self._parse_move(move)
             tokens.extend(move_tokens)
-            # Add separator between moves (optional)
-            if self.add_move_separator and i < len(moves) - 1:
-                tokens.append(self.SEP_TOKEN)
         return tokens
@@ -252,7 +267,7 @@ class ChessTokenizer(PreTrainedTokenizer):
         Reconstructs moves from component tokens.
         """
-        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.SEP_TOKEN}
         result = []
         current_move = []
@@ -288,7 +303,6 @@ class ChessTokenizer(PreTrainedTokenizer):
             tokens[2] in self.SQUARES and
             tokens[3] in self.SQUARES):
-            # Check if next token would start a new move
             if len(tokens) == 4:
                 return True
@@ -296,7 +310,7 @@ class ChessTokenizer(PreTrainedTokenizer):
             remaining = tokens[4:]
             for t in remaining:
                 if t in self.COLORS:
-                    return True  # Next move starting
                 if t not in self.MODIFIERS and not t.startswith("="):
                     return True
@@ -309,12 +323,11 @@ class ChessTokenizer(PreTrainedTokenizer):
         if not tokens:
             return ""
-        # Basic structure: Color + Piece + From + To
         if len(tokens) >= 4:
             # Convert [W] -> W and [B] -> B for colors
             color = tokens[0]
             if color in self.COLORS:
-                color = color[1]  # Extract 'W' from '[W]' or 'B' from '[B]'
             move = color + "".join(tokens[1:4])
@@ -338,6 +351,16 @@ class ChessTokenizer(PreTrainedTokenizer):
         save_directory: str,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory, exist_ok=True)
@@ -349,50 +372,42 @@ class ChessTokenizer(PreTrainedTokenizer):
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
-        # Also save config with auto_map for HuggingFace to find our custom tokenizer
-        # Format: (slow_tokenizer_class, fast_tokenizer_class) - we don't have a fast version
-        config = {
-            "tokenizer_class": "ChessTokenizer",
-            "auto_map": {
-                "AutoTokenizer": ["tokenizer.ChessTokenizer", None]
-            },
-            "add_move_separator": self.add_move_separator,
-            "vocab_size": self.vocab_size,
-        }
-        config_file = os.path.join(save_directory, "tokenizer_config.json")
-        with open(config_file, "w", encoding="utf-8") as f:
-            json.dump(config, f, indent=2)
         return (vocab_file,)
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        """Load tokenizer from directory or hub."""
-        path = Path(pretrained_model_name_or_path)
-        if path.is_dir():
-            vocab_file = path / "vocab.json"
-            config_file = path / "tokenizer_config.json"
-            add_move_separator = False
-            if config_file.exists():
-                with open(config_file, "r") as f:
-                    config = json.load(f)
-                    add_move_separator = config.get("add_move_separator", False)
-            return cls(
-                vocab_file=str(vocab_file) if vocab_file.exists() else None,
-                add_move_separator=add_move_separator,
-                **kwargs,
-            )
-        # Fallback to HuggingFace hub
-        from huggingface_hub import hf_hub_download
-        vocab_file = hf_hub_download(
-            repo_id=pretrained_model_name_or_path,
-            filename="vocab.json",
-        )
-        return cls(vocab_file=vocab_file, **kwargs)

 from __future__ import annotations
 class ChessTokenizer(PreTrainedTokenizer):
     """
+    A custom tokenizer that decomposes each chess move into color, piece, square, and modifier tokens.
     Example:
         >>> tokenizer = ChessTokenizer()
+        >>> tokenizer.encode("WPe2e4 BPe7e5")
+        [1, 4, 6, 45, 47, 5, 6, 50, 48, 2]  # [BOS, components..., EOS]
     """
     model_input_names = ["input_ids", "attention_mask"]
     BOS_TOKEN = "[BOS]"
     EOS_TOKEN = "[EOS]"
     UNK_TOKEN = "[UNK]"
     # Chess components
     COLORS = ["[W]", "[B]"]
     PIECES = ["P", "N", "B", "R", "Q", "K"]
     FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
     RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
     SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
     MODIFIERS = [
         "x",      # Capture
         "+",      # Check
         "O",      # Queenside castling (dataset format)
     ]
     MOVE_PATTERN = re.compile(
         r'^([WB])'                    # Color
         r'([PNBRQK])'                 # Piece
         self,
         vocab_file: Optional[str] = None,
         vocab: Optional[Dict[str, int]] = None,
         **kwargs,
     ):
         self._pad_token = self.PAD_TOKEN
         self._bos_token = self.BOS_TOKEN
         self._eos_token = self.EOS_TOKEN
         self._unk_token = self.UNK_TOKEN
+        # Remove any duplicate special-token entries from kwargs
         kwargs.pop("pad_token", None)
         kwargs.pop("bos_token", None)
         kwargs.pop("eos_token", None)
             with open(vocab_file, "r", encoding="utf-8") as f:
                 self._vocab = json.load(f)
         else:
+            # Create the fixed decomposed vocabulary
+            self._vocab = self._create_default_vocab()
+        # Create reverse mapping
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
         super().__init__(
             **kwargs,
         )
+    def _create_default_vocab(self) -> Dict[str, int]:
+        """
+        Create the fixed vocabulary from chess components.
+        Unlike the standard tokenizer, this creates a small fixed vocab
+        of ~88 tokens for decomposed move representation.
+        """
         tokens = []
         # Special tokens first
         tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
         # Colors
         tokens.extend(self.COLORS)
         return {token: idx for idx, token in enumerate(tokens)}
+    @classmethod
+    def build_vocab_from_iterator(
+        cls,
+        iterator,
+        min_frequency: int = 1,
+    ) -> "ChessTokenizer":
+        """
+        Return a tokenizer that uses the fixed decomposed vocabulary.
+
+        Note: For decomposed tokenizer, this ignores the iterator and
+        creates the fixed vocabulary. Provided for API compatibility.
+        Neither argument is read, and the iterator is never consumed.
+
+        Args:
+            iterator: An iterator yielding game strings (ignored).
+            min_frequency: Minimum frequency for a token (ignored).
+
+        Returns:
+            A ChessTokenizer with the fixed decomposed vocabulary.
+        """
+        # Decomposed tokenizer uses fixed vocabulary, so no counting pass is
+        # needed: cls() is called without vocab or vocab_file.
+        return cls()
+    @classmethod
+    def build_vocab_from_dataset(
+        cls,
+        dataset_name: str = "dlouapre/lichess_2025-01_1M",
+        split: str = "train",
+        column: str = "moves",
+        min_frequency: int = 1,
+        max_samples: Optional[int] = None,
+    ) -> "ChessTokenizer":
+        """
+        Build a tokenizer vocabulary from a Hugging Face dataset.
+        Note: For decomposed tokenizer, this ignores the dataset and
+        creates the fixed vocabulary. Provided for API compatibility.
+        Args:
+            dataset_name: Name of the dataset on Hugging Face Hub (ignored).
+            split: Dataset split to use (ignored).
+            column: Column containing move strings (ignored).
+            min_frequency: Minimum frequency for inclusion (ignored).
+            max_samples: Maximum samples to process (ignored).
+        Returns:
+            A ChessTokenizer with the fixed decomposed vocabulary.
+        """
+        print(f"Note: Decomposed tokenizer uses fixed vocabulary (~88 tokens)")
+        return cls()
     @property
     def vocab_size(self) -> int:
         return len(self._vocab)
         match = self.MOVE_PATTERN.match(move)
         if not match:
             return [self.UNK_TOKEN]
         tokens = []
+        # Color - map 'W' -> '[W]' and 'B' -> '[B]'
         color = match.group(1)
         tokens.append(f"[{color}]")
         # Promotion (optional)
         if match.group(5):
+            tokens.append(match.group(5))
         # Parse suffixes (optional)
         if match.group(6):
+            suffix = match.group(6)
             suffix_content = suffix[1:-1]
             if "x" in suffix_content:
                 tokens.append("x")
             if "+*" in suffix_content:
     def _tokenize(self, text: str) -> List[str]:
         """
+        Tokenize a string of moves into component tokens.
         Args:
             text: Space-separated moves in extended UCI format.
         tokens = []
         moves = text.strip().split()
+        for move in moves:
             move_tokens = self._parse_move(move)
             tokens.extend(move_tokens)
         return tokens
         Reconstructs moves from component tokens.
         """
+        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
         result = []
         current_move = []
             tokens[2] in self.SQUARES and
             tokens[3] in self.SQUARES):
             if len(tokens) == 4:
                 return True
             remaining = tokens[4:]
             for t in remaining:
                 if t in self.COLORS:
+                    return True
                 if t not in self.MODIFIERS and not t.startswith("="):
                     return True
         if not tokens:
             return ""
         if len(tokens) >= 4:
             # Convert [W] -> W and [B] -> B for colors
             color = tokens[0]
             if color in self.COLORS:
+                color = color[1]
             move = color + "".join(tokens[1:4])
         save_directory: str,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
+        """
+        Save the vocabulary to a file.
+        Args:
+            save_directory: Directory to save the vocabulary.
+            filename_prefix: Optional prefix for the vocabulary file.
+        Returns:
+            Tuple containing the path to the saved vocabulary file.
+        """
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory, exist_ok=True)
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
         return (vocab_file,)
+def count_vocab_from_dataset(
+    dataset_name: str = "dlouapre/lichess_2025-01_1M",
+    split: str = "train",
+    column: str = "moves",
+    max_samples: Optional[int] = None,
+) -> Dict[str, int]:
+    """
+    Count token frequencies in a dataset.
+    Note: For decomposed tokenizer, this counts component frequencies
+    rather than whole-move frequencies.
+    Args:
+        dataset_name: Name of the dataset.
+        split: Dataset split.
+        column: Column with moves.
+        max_samples: Max samples to process.
+    Returns:
+        Dictionary of token frequencies.
+    """
+    from collections import Counter
+    from datasets import load_dataset
+    tokenizer = ChessTokenizer()
+    dataset = load_dataset(dataset_name, split=split)
+    if max_samples:
+        dataset = dataset.select(range(min(max_samples, len(dataset))))
+    counts = Counter()
+    for example in dataset:
+        tokens = tokenizer.tokenize(example[column])
+        counts.update(tokens)
+    return dict(counts)