Chess Challenge submission by gabriel-mariadass
Files changed:
- README.md (+2, -2)
- config.json (+2, -1)
- model.safetensors (+3, -0)
- tokenizer.py (+210, -105)
- vocab.json (+104, -140)
README.md (CHANGED)

@@ -14,13 +14,13 @@ Chess model submitted to the LLM Course Chess Challenge.
  ## Submission Info

  - **Submitted by**: [gabriel-mariadass](https://huggingface.co/gabriel-mariadass)
- - **Parameters**:
+ - **Parameters**: 98,688
  - **Organization**: LLM-course

  ## Model Details

  - **Architecture**: Chess Transformer (GPT-style)
- - **Vocab size**:
+ - **Vocab size**: 108
  - **Embedding dim**: 64
  - **Layers**: 2
  - **Heads**: 2
config.json (CHANGED)

@@ -6,6 +6,7 @@
    "dropout": 0.1,
    "dtype": "float32",
    "eos_token_id": 2,
+   "layer_norm_epsilon": 1e-05,
    "model_type": "chess_transformer",
    "n_ctx": 128,
    "n_embd": 64,
@@ -15,5 +16,5 @@
    "pad_token_id": 0,
    "tie_weights": true,
    "transformers_version": "4.57.3",
-   "vocab_size":
+   "vocab_size": 108
  }
model.safetensors (ADDED)

@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da32936787cc88a6bb980274667ecc7c7255633fbbc49ffcfeaad73828d8228d
+ size 397024
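The file committed here is a Git LFS pointer; the actual 397,024-byte weights resolve through LFS. A minimal sketch of inspecting the weights with the `safetensors` library, assuming a locally resolved copy of the file; 98,688 is the parameter count listed in the README:

```python
from safetensors.torch import load_file

# Load all tensors from the resolved (non-pointer) safetensors file.
state_dict = load_file("model.safetensors")

# Sum elements across tensors; with float32 weights, 98,688 parameters
# is roughly consistent with the 397,024-byte file (98,688 * 4 bytes
# is about 395 kB, plus header overhead).
total = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {total:,} parameters")
```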
tokenizer.py (CHANGED)

@@ -1,16 +1,14 @@
  """
+ Custom Chess Tokenizer for the Chess Challenge.

- - Color + Piece : WP, BN, ...
- - From square : e2_f
- - To square : e4_t
+ This tokenizer treats each move as a single token using the extended UCI notation
+ from the Lichess dataset (e.g., WPe2e4, BNg8f6).

+ The dataset format uses:
+ - W/B prefix for White/Black
+ - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
+ - Source and destination squares (e.g., e2e4)
+ - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
  """

  from __future__ import annotations
@@ -24,41 +22,70 @@ from transformers import PreTrainedTokenizer


  class ChessTokenizer(PreTrainedTokenizer):
+     """
+     A custom tokenizer for chess moves using extended UCI notation.
+
+     This tokenizer maps each possible chess move to a unique token ID.
+     The vocabulary is built from the training dataset to ensure all moves
+     encountered during training have a corresponding token.
+
+     Example:
+         >>> tokenizer = ChessTokenizer()
+         >>> tokenizer.encode("WPe2e4 BPe7e5")
+         [1, 42, 87, 2]  # [BOS, e2e4, e7e5, EOS]
+     """
+
      model_input_names = ["input_ids", "attention_mask"]
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     # Special tokens
      PAD_TOKEN = "[PAD]"
      BOS_TOKEN = "[BOS]"
      EOS_TOKEN = "[EOS]"
      UNK_TOKEN = "[UNK]"
+
      def __init__(
          self,
          vocab_file: Optional[str] = None,
          vocab: Optional[Dict[str, int]] = None,
          **kwargs,
      ):
+         """
+         Initialize the chess tokenizer.
+
+         Args:
+             vocab_file: Path to a JSON file containing the vocabulary mapping.
+             vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
+             **kwargs: Additional arguments passed to PreTrainedTokenizer.
+         """
+         # Initialize special tokens
          self._pad_token = self.PAD_TOKEN
          self._bos_token = self.BOS_TOKEN
          self._eos_token = self.EOS_TOKEN
          self._unk_token = self.UNK_TOKEN

+         # Remove any duplicate special-token entries passed through kwargs
+         # to avoid "multiple values for keyword" errors when loading from disk.
          kwargs.pop("pad_token", None)
          kwargs.pop("bos_token", None)
          kwargs.pop("eos_token", None)
          kwargs.pop("unk_token", None)
+
+         # Load or create vocabulary
          if vocab is not None:
              self._vocab = vocab
          elif vocab_file is not None and os.path.exists(vocab_file):
              with open(vocab_file, "r", encoding="utf-8") as f:
                  self._vocab = json.load(f)
          else:
+             # Create a minimal vocabulary with just special tokens
+             # The full vocabulary should be built from the dataset
+             self._vocab = self._create_default_vocab()
+
+         # Create reverse mapping
          self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+
+         # Call parent init AFTER setting up vocab
          super().__init__(
              pad_token=self._pad_token,
              bos_token=self._bos_token,
@@ -66,108 +93,186 @@ class ChessTokenizer(PreTrainedTokenizer):
              unk_token=self._unk_token,
              **kwargs,
          )
-         idx = len(vocab)
-
-         # Pieces with color
-         for color in ["W", "B"]:
-             for piece in ["P", "N", "B", "R", "Q", "K"]:
-                 vocab[f"{color}{piece}"] = idx
-                 idx += 1
-
-         # Board squares
-         files = "abcdefgh"
-         ranks = "12345678"
-         for f in files:
-             for r in ranks:
-                 vocab[f"{f}{r}_f"] = idx
-                 idx += 1
-                 vocab[f"{f}{r}_t"] = idx
-                 idx += 1
-
+
+     def _create_default_vocab(self) -> Dict[str, int]:
+         """
+         Create a minimal default vocabulary with just special tokens.
+
+         For the full vocabulary, use `build_vocab_from_dataset()`.
+         This minimal vocab is just a placeholder - you should build from data.
+         """
+         special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens)}
          return vocab
+
+     @classmethod
+     def build_vocab_from_iterator(
+         cls,
+         iterator,
+         min_frequency: int = 1,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from an iterator of game strings.
+
+         Args:
+             iterator: An iterator yielding game strings (space-separated moves).
+             min_frequency: Minimum frequency for a token to be included.
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from collections import Counter
+
+         token_counts = Counter()
+
+         for game in iterator:
+             moves = game.strip().split()
+             token_counts.update(moves)
+
+         # Filter by frequency
+         tokens = [
+             token for token, count in token_counts.items()
+             if count >= min_frequency
+         ]
+
+         # Sort for reproducibility
+         tokens = sorted(tokens)
+
+         # Build vocabulary
+         special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
+
+         return cls(vocab=vocab)
+
+     @classmethod
+     def build_vocab_from_dataset(
+         cls,
+         dataset_name: str = "dlouapre/lichess_2025-01_1M",
+         split: str = "train",
+         column: str = "text",
+         min_frequency: int = 500,
+         max_samples: Optional[int] = 100000,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from a Hugging Face dataset.
+
+         Args:
+             dataset_name: Name of the dataset on Hugging Face Hub.
+             split: Dataset split to use.
+             column: Column containing the game strings.
+             min_frequency: Minimum frequency for a token to be included (default: 500).
+             max_samples: Maximum number of samples to process (default: 100k).
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from datasets import load_dataset
+
+         dataset = load_dataset(dataset_name, split=split)
+
+         if max_samples is not None:
+             dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+         def game_iterator():
+             for example in dataset:
+                 yield example[column]
+
+         return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
+
      @property
      def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
          return len(self._vocab)
+
      def get_vocab(self) -> Dict[str, int]:
+         """Return the vocabulary as a dictionary."""
          return dict(self._vocab)
-
-     # --------------------------------------------------
-     # TOKENIZATION LOGIC
-     # --------------------------------------------------
+
      def _tokenize(self, text: str) -> List[str]:
          """
+         Tokenize a string of moves into a list of tokens.
+
+         Args:
+             text: A string of space-separated moves.
+
+         Returns:
+             List of move tokens.
          """
-         for move in moves:
-             if len(move) < 6:
-                 continue
-
-             color = move[0]
-             piece = move[1]
-             from_sq = move[2:4]
-             to_sq = move[4:6]
-
-             tokens.append(f"{color}{piece}")
-             tokens.append(f"{from_sq}_f")
-             tokens.append(f"{to_sq}_t")
-
-         return tokens
+         return text.strip().split()
+
      def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID."""
+         return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
+
      def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token."""
          return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
      def convert_tokens_to_string(self, tokens: List[str]) -> str:
-             cp = tokens[i]
-             f = tokens[i + 1].replace("_f", "")
-             t = tokens[i + 2].replace("_t", "")
-             out.append(cp + f + t)
-         except Exception:
-             pass
-         i += 3
-         return " ".join(out)
+         """Convert a list of tokens back to a string."""
+         # Filter out special tokens for cleaner output
+         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+         return " ".join(t for t in tokens if t not in special)
-
-     # --------------------------------------------------
-     # SAVE / LOAD
-     # --------------------------------------------------
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
-         os.makedirs(save_directory, exist_ok=True)
-         path = os.path.join(
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> tuple:
+         """
+         Save the vocabulary to a JSON file.
+
+         Args:
+             save_directory: Directory to save the vocabulary.
+             filename_prefix: Optional prefix for the filename.
+
+         Returns:
+             Tuple containing the path to the saved vocabulary file.
+         """
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+
+         vocab_file = os.path.join(
              save_directory,
              (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
          )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)
+
+
+ def count_vocab_from_dataset(
+     dataset_name: str = "dlouapre/lichess_2025-01_1M",
+     split: str = "train",
+     column: str = "text",
+     max_samples: Optional[int] = 10000,
+ ) -> Dict[str, int]:
+     """
+     Count token frequencies in a dataset (useful for vocabulary analysis).
+
+     Args:
+         dataset_name: Name of the dataset on Hugging Face Hub.
+         split: Dataset split to use.
+         column: Column containing the game strings.
+         max_samples: Maximum number of samples to process.
+
+     Returns:
+         Dictionary mapping tokens to their frequencies.
+     """
+     from collections import Counter
+     from datasets import load_dataset
+
+     dataset = load_dataset(dataset_name, split=split)
+
+     if max_samples is not None:
+         dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+     token_counts = Counter()
+
+     for example in dataset:
+         moves = example[column].strip().split()
+         token_counts.update(moves)
+
+     return dict(token_counts)
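A short usage sketch of the rewritten tokenizer, assuming it is run from the repository root so that tokenizer.py and vocab.json are importable and readable; the printed token IDs depend on the shipped vocab.json and are only illustrative:

```python
from tokenizer import ChessTokenizer

# Load the 108-entry vocabulary shipped in this commit.
tok = ChessTokenizer(vocab_file="vocab.json")

# Each move in extended UCI notation maps to a single token.
tokens = tok.tokenize("WPe2e4 BPe7e5 WNg1f3")
ids = tok.convert_tokens_to_ids(tokens)
print(tokens, ids)

# Moves that never reached the min_frequency cutoff are absent from
# vocab.json and fall back to the [UNK] id.
print(tok.convert_tokens_to_ids(["WPe2e4", "WQd1h5(+)"]))

# Round-trip back to a game string (special tokens are filtered out).
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(ids)))
```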
vocab.json (CHANGED)

@@ -3,144 +3,108 @@
    "[BOS]": 1,
    "[EOS]": 2,
    "[UNK]": 3,
  ...
-   "f7_f": 108,
-   "f7_t": 109,
-   "f8_f": 110,
-   "f8_t": 111,
-   "g1_f": 112,
-   "g1_t": 113,
-   "g2_f": 114,
-   "g2_t": 115,
-   "g3_f": 116,
-   "g3_t": 117,
-   "g4_f": 118,
-   "g4_t": 119,
-   "g5_f": 120,
-   "g5_t": 121,
-   "g6_f": 122,
-   "g6_t": 123,
-   "g7_f": 124,
-   "g7_t": 125,
-   "g8_f": 126,
-   "g8_t": 127,
-   "h1_f": 128,
-   "h1_t": 129,
-   "h2_f": 130,
-   "h2_t": 131,
-   "h3_f": 132,
-   "h3_t": 133,
-   "h4_f": 134,
-   "h4_t": 135,
-   "h5_f": 136,
-   "h5_t": 137,
-   "h6_f": 138,
-   "h6_t": 139,
-   "h7_f": 140,
-   "h7_t": 141,
-   "h8_f": 142,
-   "h8_t": 143
+   "BBc8b7": 4,
+   "BBc8d7": 5,
+   "BBc8e6": 6,
+   "BBc8f5": 7,
+   "BBc8g4": 8,
+   "BBf8c5": 9,
+   "BBf8d6": 10,
+   "BBf8e7": 11,
+   "BBf8g7": 12,
+   "BKe8c8(O)": 13,
+   "BKe8g8(o)": 14,
+   "BKg8h8": 15,
+   "BNb8c6": 16,
+   "BNb8d7": 17,
+   "BNf6e4": 18,
+   "BNf6e4(x)": 19,
+   "BNg8e7": 20,
+   "BNg8f6": 21,
+   "BPa7a5": 22,
+   "BPa7a6": 23,
+   "BPb5b4": 24,
+   "BPb7b5": 25,
+   "BPb7b6": 26,
+   "BPb7c6(x)": 27,
+   "BPc5d4(x)": 28,
+   "BPc6c5": 29,
+   "BPc6d5(x)": 30,
+   "BPc7c5": 31,
+   "BPc7c6": 32,
+   "BPd5d4": 33,
+   "BPd5e4(x)": 34,
+   "BPd6d5": 35,
+   "BPd6e5(x)": 36,
+   "BPd7d5": 37,
+   "BPd7d6": 38,
+   "BPe5d4(x)": 39,
+   "BPe5e4": 40,
+   "BPe6d5(x)": 41,
+   "BPe6e5": 42,
+   "BPe7e5": 43,
+   "BPe7e6": 44,
+   "BPf7f5": 45,
+   "BPf7f6": 46,
+   "BPg7g5": 47,
+   "BPg7g6": 48,
+   "BPh7h5": 49,
+   "BPh7h6": 50,
+   "BQd8c7": 51,
+   "BQd8e7": 52,
+   "BRa8b8": 53,
+   "BRa8c8": 54,
+   "BRa8d8": 55,
+   "BRf8e8": 56,
+   "WBc1b2": 57,
+   "WBc1d2": 58,
+   "WBc1e3": 59,
+   "WBc1f4": 60,
+   "WBc1g5": 61,
+   "WBf1c4": 62,
+   "WBf1d3": 63,
+   "WBf1e2": 64,
+   "WBf1g2": 65,
+   "WKe1c1(O)": 66,
+   "WKe1g1(o)": 67,
+   "WKg1h1": 68,
+   "WNb1c3": 69,
+   "WNb1d2": 70,
+   "WNc3d5": 71,
+   "WNf3d4(x)": 72,
+   "WNf3e5": 73,
+   "WNf3e5(x)": 74,
+   "WNf3g5": 75,
+   "WNg1f3": 76,
+   "WPa2a3": 77,
+   "WPa2a4": 78,
+   "WPb2b3": 79,
+   "WPb2b4": 80,
+   "WPc2c3": 81,
+   "WPc2c4": 82,
+   "WPc3c4": 83,
+   "WPc3d4(x)": 84,
+   "WPc4d5(x)": 85,
+   "WPd2d3": 86,
+   "WPd2d4": 87,
+   "WPd4d5": 88,
+   "WPd4e5(x)": 89,
+   "WPe2e3": 90,
+   "WPe2e4": 91,
+   "WPe3e4": 92,
+   "WPe4d5(x)": 93,
+   "WPe4e5": 94,
+   "WPf2f3": 95,
+   "WPf2f4": 96,
+   "WPg2g3": 97,
+   "WPg2g4": 98,
+   "WPh2h3": 99,
+   "WPh2h4": 100,
+   "WPh4h5": 101,
+   "WQd1d2": 102,
+   "WQd1e2": 103,
+   "WRa1b1": 104,
+   "WRa1c1": 105,
+   "WRa1d1": 106,
+   "WRf1e1": 107
  }
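If the vocabulary was built with the default min_frequency=500 from build_vocab_from_dataset, it keeps only moves that recur often in the sampled games, which skews it toward opening moves. A small sketch, assuming vocab.json is local, that groups the 104 move tokens by side and piece:

```python
import json
from collections import Counter

with open("vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Skip the four special tokens; the rest are extended-UCI move tokens
# such as "WPe2e4" or "BKe8g8(o)".
moves = [token for token in vocab if not token.startswith("[")]
print(len(moves), "move tokens")

# Group by the first two characters: color (W/B) plus piece letter.
by_piece = Counter(move[:2] for move in moves)
print(by_piece.most_common())
```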