Chess Challenge submission by raphael-mathiot

Browse files

Files changed (5) hide show

README.md +22 -1
config.json +5 -5
model.safetensors +2 -2
tokenizer.py +62 -203
vocab.json +68 -81

README.md CHANGED Viewed

@@ -1,5 +1,26 @@
 ---
 tags:
 - chess
 ---
-# chess-stonkfish

 ---
+library_name: transformers
 tags:
 - chess
+- llm-course
+- chess-challenge
+license: mit
 ---
+# chess-stonkfish
+Chess model submitted to the LLM Course Chess Challenge.
+## Submission Info
+- **Submitted by**: [raphael-mathiot](https://huggingface.co/raphael-mathiot)
+- **Parameters**: 991,320
+- **Organization**: LLM-course
+## Model Details
+- **Architecture**: Chess Transformer (GPT-style)
+- **Vocab size**: 72
+- **Embedding dim**: 128
+- **Layers**: 6
+- **Heads**: 8

config.json CHANGED Viewed

@@ -12,12 +12,12 @@
   "layer_norm_epsilon": 1e-05,
   "model_type": "chess_transformer",
   "n_ctx": 256,
-  "n_embd": 96,
-  "n_head": 6,
-  "n_inner": 288,
-  "n_layer": 10,
   "pad_token_id": 0,
   "tie_weights": true,
   "transformers_version": "4.57.1",
-  "vocab_size": 85
 }

   "layer_norm_epsilon": 1e-05,
   "model_type": "chess_transformer",
   "n_ctx": 256,
+  "n_embd": 128,
+  "n_head": 8,
+  "n_inner": 356,
+  "n_layer": 6,
   "pad_token_id": 0,
   "tie_weights": true,
   "transformers_version": "4.57.1",
+  "vocab_size": 72
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d3d5e7e5bd0d5e40c2268eb3e4d00753e923393a3eedbe22c20fa54d9575c3c
-size 3874600

 version https://git-lfs.github.com/spec/v1
+oid sha256:68f2b547f340aaf26abb5a994f3dc47e741d65c2939db2511ef66481b31ae60a
+size 3971728

tokenizer.py CHANGED Viewed

@@ -24,14 +24,18 @@ from transformers import PreTrainedTokenizer
 class ChessTokenizer(PreTrainedTokenizer):
     """
-    A custom tokenizer for chess moves using extended UCI notation.
-    This tokenizer splits moves into semantic components (Pieces, Squares, Metadata).
-    Example: "WPe2e4" -> ["WP", "e2", "e4"]
     """
     model_input_names = ["input_ids", "attention_mask"]
-    vocab_files_names = {"vocab_file": "vocab.json"}
     # Special tokens
     PAD_TOKEN = "[PAD]"
@@ -45,45 +49,27 @@ class ChessTokenizer(PreTrainedTokenizer):
         vocab: Optional[Dict[str, int]] = None,
         **kwargs,
     ):
-        """
-        Initialize the chess tokenizer.
-        """
-        # Initialize special tokens
         self._pad_token = self.PAD_TOKEN
         self._bos_token = self.BOS_TOKEN
         self._eos_token = self.EOS_TOKEN
         self._unk_token = self.UNK_TOKEN
-        # Clean kwargs
         kwargs.pop("pad_token", None)
         kwargs.pop("bos_token", None)
         kwargs.pop("eos_token", None)
         kwargs.pop("unk_token", None)
-        # Regex for splitting moves into:
-        # 1. Castling: (O), (o)
-        # 2. Metadata: (x), (+*), (+)
-        # 3. Pieces: WP, BR, etc.
-        # 4. Squares: a1, h8, etc.
-        self.token_pattern = re.compile(
-            r'\(O\)|\(o\)|'        # Castling
-            r'\(x\)|\(\+\*\)|\(\+\)|'  # Metadata (Capture, Mate, Check)
-            r'[WB][PRNBQK]|'       # Pieces (Color + Type)
-            r'[a-h][1-8]'          # Squares
-        )
-        # Load or create vocabulary
         if vocab is not None:
             self._vocab = vocab
         elif vocab_file is not None and os.path.exists(vocab_file):
             with open(vocab_file, "r", encoding="utf-8") as f:
                 self._vocab = json.load(f)
         else:
-            # In this version, the default vocab is the FULL vocab
-            # because chess rules are static.
             self._vocab = self._create_default_vocab()
-        # Create reverse mapping
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
         super().__init__(
@@ -95,216 +81,89 @@ class ChessTokenizer(PreTrainedTokenizer):
         )
     def _create_default_vocab(self) -> Dict[str, int]:
-        """
-        Create the full static vocabulary for Chess.
-        Since the 'rules' of the tokens are known (squares a1-h8, pieces),
-        we generate the full map here instead of learning it.
-        """
-        # 1. Special Tokens
         special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
         vocab = {token: idx for idx, token in enumerate(special_tokens)}
         idx = len(vocab)
-        # 2. Pieces (White/Black + Pawn/Rook/Knight/Bishop/Queen/King)
-        colors = ['W', 'B']
-        pieces = ['P', 'R', 'N', 'B', 'Q', 'K']
-        for c in colors:
-            for p in pieces:
-                token = f"{c}{p}"
-                if token not in vocab:
-                    vocab[token] = idx
-                    idx += 1
-        # 3. Squares (a1 to h8)
-        files = 'abcdefgh'
-        ranks = '12345678'
-        for f in files:
-            for r in ranks:
-                token = f"{f}{r}"
-                if token not in vocab:
-                    vocab[token] = idx
-                    idx += 1
-        # 4. Special Move Suffixes
-        # Note: Order is handled by regex, but we just need them in vocab here
-        specials = ['(O)', '(o)', '(x)', '(+)', '(+*)']
-        for s in specials:
-            if s not in vocab:
-                vocab[s] = idx
                 idx += 1
         return vocab
-    @classmethod
-    def build_vocab_from_iterator(
-        cls,
-        iterator: Iterator,
-        min_frequency: int = 1,
-    ) -> "ChessTokenizer":
-        """
-        API Compatibility Method.
-        Since this tokenizer uses a static vocabulary based on Chess rules,
-        scanning the iterator is not necessary. We simply consume the iterator
-        (optional) and return the standard tokenizer.
-        """
-        # We explicitly ignore the iterator data because our vocab
-        # is pre-defined by the rules of the game.
-        return cls()
-    @classmethod
-    def build_vocab_from_dataset(
-        cls,
-        dataset_name: str = "dlouapre/lichess_2025-01_1M",
-        split: str = "train",
-        column: str = "text",
-        min_frequency: int = 500,
-        max_samples: Optional[int] = 100000,
-    ) -> "ChessTokenizer":
-        """
-        API Compatibility Method.
-        Returns a tokenizer with the standard chess vocabulary.
-        Does not download the dataset as the vocabulary is static.
-        """
-        return cls()
-    @property
-    def vocab_size(self) -> int:
-        """Return the size of the vocabulary."""
-        return len(self._vocab)
-    def get_vocab(self) -> Dict[str, int]:
-        """Return the vocabulary as a dictionary."""
-        return dict(self._vocab)
     def _tokenize(self, text: str) -> List[str]:
         """
-        Tokenize a string of moves into semantic components using Regex.
-        Args:
-            text: A string of space-separated moves (e.g., "WPe2e4 BPe7e5")
-        Returns:
-            List of components (e.g., ["WP", "e2", "e4", "BP", "e7", "e5"])
         """
-        # findall will ignore spaces and return only the matching components
         return self.token_pattern.findall(text)
     def _convert_token_to_id(self, token: str) -> int:
         """Convert a token to its ID."""
-        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))
     def _convert_id_to_token(self, index: int) -> str:
         """Convert an ID to its token."""
         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
-    def _is_start_of_move(self, token: str) -> bool:
-            """
-            Helper to determine if a token represents the start of a new move.
-            Moves start with a Piece (e.g., 'WP') or Castling (e.g., '(O)').
-            """
-            # 1. Check for Castling (Short or Long)
-            if token in ['(O)', '(o)']:
-                return True
-            # 2. Check for Pieces (Length 2, starts with W/B, ends with Piece type)
-            # We check specific characters to avoid confusion with squares or suffixes
-            if len(token) == 2 and token[0] in 'WB' and token[1] in 'PRNBQK':
-                return True
-            return False
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """
-        Converts a list of tokens back to a string, respecting Chess notation rules.
-        Logic:
-        - Spaces are inserted BEFORE a token ONLY if that token marks the start of a new move.
-        - Squares (e2, e4) and Suffixes (x, +) are concatenated to the previous token.
-        """
         output = []
-        special_tokens = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
-        for i, token in enumerate(tokens):
-            # 1. Handle Special Tokens (keep them, surround with spaces if needed)
-            if token in special_tokens:
-                if output and output[-1] != " ":
-                    output.append(" ")
-                output.append(token)
-            # 2. Handle Start of New Move (Insert space before)
-            elif self._is_start_of_move(token):
-                # Add a space if we aren't at the very start and the previous char isn't already a space
-                if output and output[-1] != " ":
-                    output.append(" ")
-                output.append(token)
-            # 3. Handle Continuations (Squares 'e2', Suffixes '(x)') -> Concatenate
             else:
                 output.append(token)
-        return "".join(output).strip()
-    def save_vocabulary(
-        self,
-        save_directory: str,
-        filename_prefix: Optional[str] = None,
-    ) -> tuple:
-        """
-        Save the vocabulary to a JSON file.
-        Args:
-            save_directory: Directory to save the vocabulary.
-            filename_prefix: Optional prefix for the filename.
-        Returns:
-            Tuple containing the path to the saved vocabulary file.
-        """
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory, exist_ok=True)
         vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
         )
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
         return (vocab_file,)
-def count_vocab_from_dataset(
-    dataset_name: str = "dlouapre/lichess_2025-01_1M",
-    split: str = "train",
-    column: str = "text",
-    max_samples: Optional[int] = 10000,
-) -> Dict[str, int]:
-    """
-    Count token frequencies in a dataset (useful for vocabulary analysis).
-    Args:
-        dataset_name: Name of the dataset on Hugging Face Hub.
-        split: Dataset split to use.
-        column: Column containing the game strings.
-        max_samples: Maximum number of samples to process.
-    Returns:
-        Dictionary mapping tokens to their frequencies.
-    """
-    from collections import Counter
-    from datasets import load_dataset
-    dataset = load_dataset(dataset_name, split=split)
-    if max_samples is not None:
-        dataset = dataset.select(range(min(max_samples, len(dataset))))
-    token_counts = Counter()
-    for example in dataset:
-        moves = example[column].strip().split()
-        token_counts.update(moves)
-    return dict(token_counts)

 class ChessTokenizer(PreTrainedTokenizer):
     """
+    A chess tokenizer with a fixed 72-token vocabulary.
+    It accepts raw Extended UCI notation (e.g., "WPa7a8(Q)", "BQd8h4(+*)")
+    and keeps only the squares and promotion letters, discarding everything else.
+    Vocabulary:
+    - 4 Special: [PAD], [BOS], [EOS], [UNK]
+    - 64 Squares: a1...h8
+    - 4 Promotions: q, r, b, n
     """
     model_input_names = ["input_ids", "attention_mask"]
     # Special tokens
     PAD_TOKEN = "[PAD]"
         vocab: Optional[Dict[str, int]] = None,
         **kwargs,
     ):
         self._pad_token = self.PAD_TOKEN
         self._bos_token = self.BOS_TOKEN
         self._eos_token = self.EOS_TOKEN
         self._unk_token = self.UNK_TOKEN
         kwargs.pop("pad_token", None)
         kwargs.pop("bos_token", None)
         kwargs.pop("eos_token", None)
         kwargs.pop("unk_token", None)
+        # Regex to find Squares (a1-h8) OR lowercase promotion letters (qrbn)
+        self.token_pattern = re.compile(r'[a-h][1-8]|[qrbn]')
         if vocab is not None:
             self._vocab = vocab
         elif vocab_file is not None and os.path.exists(vocab_file):
             with open(vocab_file, "r", encoding="utf-8") as f:
                 self._vocab = json.load(f)
         else:
             self._vocab = self._create_default_vocab()
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
         super().__init__(
         )
     def _create_default_vocab(self) -> Dict[str, int]:
         special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
         vocab = {token: idx for idx, token in enumerate(special_tokens)}
         idx = len(vocab)
+        # Squares (4-67)
+        for f in 'abcdefgh':
+            for r in '12345678':
+                vocab[f"{f}{r}"] = idx
                 idx += 1
+        # Promotions (68-71)
+        for p in ['q', 'r', 'b', 'n']:
+            vocab[p] = idx
+            idx += 1
         return vocab
     def _tokenize(self, text: str) -> List[str]:
         """
+        Tokenizes text by first rewriting bracketed promotions, e.g. "(Q)" -> "q",
+        and then extracting squares and promotion letters.
         """
+        # 1. NORMALIZE: Handle the bracketed promotions found in the dataset.
+        #    Convert (Q) -> q, (N) -> n, etc.
+        text = (text.replace("(Q)", "q")
+                    .replace("(R)", "r")
+                    .replace("(B)", "b")
+                    .replace("(N)", "n"))
+        # 2. EXTRACT: Use regex to find valid tokens.
+        #    The regex r'[a-h][1-8]|[qrbn]' will:
+        #    - Match 'a7', 'a8', 'q' (from "WPa7a8q")
+        #    - Ignore 'W', 'P', 'B', 'Q', '(', '+', '*' (Garbage)
         return self.token_pattern.findall(text)
     def _convert_token_to_id(self, token: str) -> int:
         """Convert a token to its ID."""
+        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
     def _convert_id_to_token(self, index: int) -> str:
         """Convert an ID to its token."""
         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Reconstructs a standard UCI string (e.g. "e2e4 a7a8q"), dropping special tokens."""
+        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+        clean_tokens = [t for t in tokens if t not in special]
         output = []
+        for token in clean_tokens:
+            # Append promotion to previous move
+            if token in ['q', 'r', 'b', 'n'] and output:
+                output[-1] += token
+            # Append 2nd square to 1st square (e2 + e4 -> e2e4)
+            elif output and len(output[-1]) == 2 and output[-1][0] in 'abcdefgh':
+                output[-1] += token
+            # Start new move
             else:
                 output.append(token)
+        return " ".join(output)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory, exist_ok=True)
         vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
         )
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
         return (vocab_file,)
+    # --- Dummy Compatibility Methods ---
+    @classmethod
+    def build_vocab_from_iterator(cls, iterator, min_frequency=1):
+        return cls() # Vocab is static
+    @classmethod
+    def build_vocab_from_dataset(cls, **kwargs):
+        return cls() # Vocab is static
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab)
+    def get_vocab(self) -> Dict[str, int]:
+        return dict(self._vocab)

vocab.json CHANGED Viewed

@@ -3,85 +3,72 @@
   "[BOS]": 1,
   "[EOS]": 2,
   "[UNK]": 3,
-  "WP": 4,
-  "WR": 5,
-  "WN": 6,
-  "WB": 7,
-  "WQ": 8,
-  "WK": 9,
-  "BP": 10,
-  "BR": 11,
-  "BN": 12,
-  "BB": 13,
-  "BQ": 14,
-  "BK": 15,
-  "a1": 16,
-  "a2": 17,
-  "a3": 18,
-  "a4": 19,
-  "a5": 20,
-  "a6": 21,
-  "a7": 22,
-  "a8": 23,
-  "b1": 24,
-  "b2": 25,
-  "b3": 26,
-  "b4": 27,
-  "b5": 28,
-  "b6": 29,
-  "b7": 30,
-  "b8": 31,
-  "c1": 32,
-  "c2": 33,
-  "c3": 34,
-  "c4": 35,
-  "c5": 36,
-  "c6": 37,
-  "c7": 38,
-  "c8": 39,
-  "d1": 40,
-  "d2": 41,
-  "d3": 42,
-  "d4": 43,
-  "d5": 44,
-  "d6": 45,
-  "d7": 46,
-  "d8": 47,
-  "e1": 48,
-  "e2": 49,
-  "e3": 50,
-  "e4": 51,
-  "e5": 52,
-  "e6": 53,
-  "e7": 54,
-  "e8": 55,
-  "f1": 56,
-  "f2": 57,
-  "f3": 58,
-  "f4": 59,
-  "f5": 60,
-  "f6": 61,
-  "f7": 62,
-  "f8": 63,
-  "g1": 64,
-  "g2": 65,
-  "g3": 66,
-  "g4": 67,
-  "g5": 68,
-  "g6": 69,
-  "g7": 70,
-  "g8": 71,
-  "h1": 72,
-  "h2": 73,
-  "h3": 74,
-  "h4": 75,
-  "h5": 76,
-  "h6": 77,
-  "h7": 78,
-  "h8": 79,
-  "(O)": 80,
-  "(o)": 81,
-  "(x)": 82,
-  "(+)": 83,
-  "(+*)": 84
 }

   "[BOS]": 1,
   "[EOS]": 2,
   "[UNK]": 3,
+  "a1": 4,
+  "a2": 5,
+  "a3": 6,
+  "a4": 7,
+  "a5": 8,
+  "a6": 9,
+  "a7": 10,
+  "a8": 11,
+  "b1": 12,
+  "b2": 13,
+  "b3": 14,
+  "b4": 15,
+  "b5": 16,
+  "b6": 17,
+  "b7": 18,
+  "b8": 19,
+  "c1": 20,
+  "c2": 21,
+  "c3": 22,
+  "c4": 23,
+  "c5": 24,
+  "c6": 25,
+  "c7": 26,
+  "c8": 27,
+  "d1": 28,
+  "d2": 29,
+  "d3": 30,
+  "d4": 31,
+  "d5": 32,
+  "d6": 33,
+  "d7": 34,
+  "d8": 35,
+  "e1": 36,
+  "e2": 37,
+  "e3": 38,
+  "e4": 39,
+  "e5": 40,
+  "e6": 41,
+  "e7": 42,
+  "e8": 43,
+  "f1": 44,
+  "f2": 45,
+  "f3": 46,
+  "f4": 47,
+  "f5": 48,
+  "f6": 49,
+  "f7": 50,
+  "f8": 51,
+  "g1": 52,
+  "g2": 53,
+  "g3": 54,
+  "g4": 55,
+  "g5": 56,
+  "g6": 57,
+  "g7": 58,
+  "g8": 59,
+  "h1": 60,
+  "h2": 61,
+  "h3": 62,
+  "h4": 63,
+  "h5": 64,
+  "h6": 65,
+  "h7": 66,
+  "h8": 67,
+  "q": 68,
+  "r": 69,
+  "b": 70,
+  "n": 71
 }