raphael-mathiot committed on
Commit
dc2885a
·
verified ·
1 Parent(s): a0b3e90

Chess Challenge submission by raphael-mathiot

Browse files
Files changed (3) hide show
  1. config.json +0 -4
  2. tokenizer.py +83 -115
  3. tokenizer_config.json +6 -0
config.json CHANGED
@@ -2,10 +2,6 @@
2
  "architectures": [
3
  "ChessForCausalLM"
4
  ],
5
- "auto_map": {
6
- "AutoConfig": "model.ChessConfig",
7
- "AutoModelForCausalLM": "model.ChessForCausalLM"
8
- },
9
  "bos_token_id": 1,
10
  "dropout": 0.1,
11
  "dtype": "float32",
 
2
  "architectures": [
3
  "ChessForCausalLM"
4
  ],
 
 
 
 
5
  "bos_token_id": 1,
6
  "dropout": 0.1,
7
  "dtype": "float32",
tokenizer.py CHANGED
@@ -17,7 +17,6 @@ import json
17
  import os
18
  from pathlib import Path
19
  from typing import Dict, List, Optional
20
- import re
21
 
22
  from transformers import PreTrainedTokenizer
23
 
@@ -26,8 +25,14 @@ class ChessTokenizer(PreTrainedTokenizer):
26
  """
27
  A custom tokenizer for chess moves using extended UCI notation.
28
 
29
- This tokenizer splits moves into semantic components (Pieces, Squares, Metadata).
30
- Example: "WPe2e4" -> ["WP", "e2", "e4"]
 
 
 
 
 
 
31
  """
32
 
33
  model_input_names = ["input_ids", "attention_mask"]
@@ -47,6 +52,11 @@ class ChessTokenizer(PreTrainedTokenizer):
47
  ):
48
  """
49
  Initialize the chess tokenizer.
 
 
 
 
 
50
  """
51
  # Initialize special tokens
52
  self._pad_token = self.PAD_TOKEN
@@ -54,24 +64,13 @@ class ChessTokenizer(PreTrainedTokenizer):
54
  self._eos_token = self.EOS_TOKEN
55
  self._unk_token = self.UNK_TOKEN
56
 
57
- # Clean kwargs
 
58
  kwargs.pop("pad_token", None)
59
  kwargs.pop("bos_token", None)
60
  kwargs.pop("eos_token", None)
61
  kwargs.pop("unk_token", None)
62
 
63
- # Regex for splitting moves into:
64
- # 1. Castling: (O), (o)
65
- # 2. Metadata: (x), (+*), (+)
66
- # 3. Pieces: WP, BR, etc.
67
- # 4. Squares: a1, h8, etc.
68
- self.token_pattern = re.compile(
69
- r'\(O\)|\(o\)|' # Castling
70
- r'\(x\)|\(\+\*\)|\(\+\)|' # Metadata (Capture, Mate, Check)
71
- r'[WB][PRNBQK]|' # Pieces (Color + Type)
72
- r'[a-h][1-8]' # Squares
73
- )
74
-
75
  # Load or create vocabulary
76
  if vocab is not None:
77
  self._vocab = vocab
@@ -79,13 +78,14 @@ class ChessTokenizer(PreTrainedTokenizer):
79
  with open(vocab_file, "r", encoding="utf-8") as f:
80
  self._vocab = json.load(f)
81
  else:
82
- # In this version, the default vocab is the FULL vocab
83
- # because chess rules are static.
84
  self._vocab = self._create_default_vocab()
85
 
86
  # Create reverse mapping
87
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
88
 
 
89
  super().__init__(
90
  pad_token=self._pad_token,
91
  bos_token=self._bos_token,
@@ -96,61 +96,53 @@ class ChessTokenizer(PreTrainedTokenizer):
96
 
97
  def _create_default_vocab(self) -> Dict[str, int]:
98
  """
99
- Create the full static vocabulary for Chess.
100
- Since the 'rules' of the tokens are known (squares a1-h8, pieces),
101
- we generate the full map here instead of learning it.
 
102
  """
103
- # 1. Special Tokens
104
  special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
105
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
106
- idx = len(vocab)
107
-
108
- # 2. Pieces (White/Black + Pawn/Rook/Knight/Bishop/Queen/King)
109
- colors = ['W', 'B']
110
- pieces = ['P', 'R', 'N', 'B', 'Q', 'K']
111
- for c in colors:
112
- for p in pieces:
113
- token = f"{c}{p}"
114
- if token not in vocab:
115
- vocab[token] = idx
116
- idx += 1
117
-
118
- # 3. Squares (a1 to h8)
119
- files = 'abcdefgh'
120
- ranks = '12345678'
121
- for f in files:
122
- for r in ranks:
123
- token = f"{f}{r}"
124
- if token not in vocab:
125
- vocab[token] = idx
126
- idx += 1
127
-
128
- # 4. Special Move Suffixes
129
- # Note: Order is handled by regex, but we just need them in vocab here
130
- specials = ['(O)', '(o)', '(x)', '(+)', '(+*)']
131
- for s in specials:
132
- if s not in vocab:
133
- vocab[s] = idx
134
- idx += 1
135
-
136
  return vocab
137
 
138
  @classmethod
139
  def build_vocab_from_iterator(
140
  cls,
141
- iterator: Iterator,
142
  min_frequency: int = 1,
143
  ) -> "ChessTokenizer":
144
  """
145
- API Compatibility Method.
 
 
 
 
146
 
147
- Since this tokenizer uses a static vocabulary based on Chess rules,
148
- scanning the iterator is not necessary. We simply consume the iterator
149
- (optional) and return the standard tokenizer.
150
  """
151
- # We explicitly ignore the iterator data because our vocab
152
- # is pre-defined by the rules of the game.
153
- return cls()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  @classmethod
156
  def build_vocab_from_dataset(
@@ -162,12 +154,30 @@ class ChessTokenizer(PreTrainedTokenizer):
162
  max_samples: Optional[int] = 100000,
163
  ) -> "ChessTokenizer":
164
  """
165
- API Compatibility Method.
 
 
 
 
 
 
 
166
 
167
- Returns a tokenizer with the standard chess vocabulary.
168
- Does not download the dataset as the vocabulary is static.
169
  """
170
- return cls()
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  @property
173
  def vocab_size(self) -> int:
@@ -180,71 +190,29 @@ class ChessTokenizer(PreTrainedTokenizer):
180
 
181
  def _tokenize(self, text: str) -> List[str]:
182
  """
183
- Tokenize a string of moves into semantic components using Regex.
184
 
185
  Args:
186
- text: A string of space-separated moves (e.g., "WPe2e4 BPe7e5")
187
 
188
  Returns:
189
- List of components (e.g., ["WP", "e2", "e4", "BP", "e7", "e5"])
190
  """
191
- # findall will ignore spaces and return only the matching components
192
- return self.token_pattern.findall(text)
193
 
194
  def _convert_token_to_id(self, token: str) -> int:
195
  """Convert a token to its ID."""
196
- return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))
197
 
198
  def _convert_id_to_token(self, index: int) -> str:
199
  """Convert an ID to its token."""
200
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
201
 
202
- def _is_start_of_move(self, token: str) -> bool:
203
- """
204
- Helper to determine if a token represents the start of a new move.
205
- Moves start with a Piece (e.g., 'WP') or Castling (e.g., '(O)').
206
- """
207
- # 1. Check for Castling (Short or Long)
208
- if token in ['(O)', '(o)']:
209
- return True
210
-
211
- # 2. Check for Pieces (Length 2, starts with W/B, ends with Piece type)
212
- # We check specific characters to avoid confusion with squares or suffixes
213
- if len(token) == 2 and token[0] in 'WB' and token[1] in 'PRNBQK':
214
- return True
215
-
216
- return False
217
-
218
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
219
- """
220
- Converts a list of tokens back to a string, respecting Chess notation rules.
221
-
222
- Logic:
223
- - Spaces are inserted BEFORE a token ONLY if that token marks the start of a new move.
224
- - Squares (e2, e4) and Suffixes (x, +) are concatenated to the previous token.
225
- """
226
- output = []
227
- special_tokens = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
228
-
229
- for i, token in enumerate(tokens):
230
- # 1. Handle Special Tokens (keep them, surround with spaces if needed)
231
- if token in special_tokens:
232
- if output and output[-1] != " ":
233
- output.append(" ")
234
- output.append(token)
235
-
236
- # 2. Handle Start of New Move (Insert space before)
237
- elif self._is_start_of_move(token):
238
- # Add a space if we aren't at the very start and the previous char isn't already a space
239
- if output and output[-1] != " ":
240
- output.append(" ")
241
- output.append(token)
242
-
243
- # 3. Handle Continuations (Squares 'e2', Suffixes '(x)') -> Concatenate
244
- else:
245
- output.append(token)
246
-
247
- return "".join(output).strip()
248
 
249
  def save_vocabulary(
250
  self,
 
17
  import os
18
  from pathlib import Path
19
  from typing import Dict, List, Optional
 
20
 
21
  from transformers import PreTrainedTokenizer
22
 
 
25
  """
26
  A custom tokenizer for chess moves using extended UCI notation.
27
 
28
+ This tokenizer maps each possible chess move to a unique token ID.
29
+ The vocabulary is built from the training dataset to ensure all moves
30
+ encountered during training have a corresponding token.
31
+
32
+ Example:
33
+ >>> tokenizer = ChessTokenizer()
34
+ >>> tokenizer.encode("WPe2e4 BPe7e5")
35
+ [1, 42, 87, 2] # [BOS, e2e4, e7e5, EOS]
36
  """
37
 
38
  model_input_names = ["input_ids", "attention_mask"]
 
52
  ):
53
  """
54
  Initialize the chess tokenizer.
55
+
56
+ Args:
57
+ vocab_file: Path to a JSON file containing the vocabulary mapping.
58
+ vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
59
+ **kwargs: Additional arguments passed to PreTrainedTokenizer.
60
  """
61
  # Initialize special tokens
62
  self._pad_token = self.PAD_TOKEN
 
64
  self._eos_token = self.EOS_TOKEN
65
  self._unk_token = self.UNK_TOKEN
66
 
67
+ # Remove any duplicate special-token entries passed through kwargs
68
+ # to avoid "multiple values for keyword" errors when loading from disk.
69
  kwargs.pop("pad_token", None)
70
  kwargs.pop("bos_token", None)
71
  kwargs.pop("eos_token", None)
72
  kwargs.pop("unk_token", None)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  # Load or create vocabulary
75
  if vocab is not None:
76
  self._vocab = vocab
 
78
  with open(vocab_file, "r", encoding="utf-8") as f:
79
  self._vocab = json.load(f)
80
  else:
81
+ # Create a minimal vocabulary with just special tokens
82
+ # The full vocabulary should be built from the dataset
83
  self._vocab = self._create_default_vocab()
84
 
85
  # Create reverse mapping
86
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
87
 
88
+ # Call parent init AFTER setting up vocab
89
  super().__init__(
90
  pad_token=self._pad_token,
91
  bos_token=self._bos_token,
 
96
 
97
  def _create_default_vocab(self) -> Dict[str, int]:
98
  """
99
+ Create a minimal default vocabulary with just special tokens.
100
+
101
+ For the full vocabulary, use `build_vocab_from_dataset()`.
102
+ This minimal vocab is just a placeholder - you should build from data.
103
  """
 
104
  special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
105
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  return vocab
107
 
108
  @classmethod
109
  def build_vocab_from_iterator(
110
  cls,
111
+ iterator,
112
  min_frequency: int = 1,
113
  ) -> "ChessTokenizer":
114
  """
115
+ Build a tokenizer vocabulary from an iterator of game strings.
116
+
117
+ Args:
118
+ iterator: An iterator yielding game strings (space-separated moves).
119
+ min_frequency: Minimum frequency for a token to be included.
120
 
121
+ Returns:
122
+ A ChessTokenizer with the built vocabulary.
 
123
  """
124
+ from collections import Counter
125
+
126
+ token_counts = Counter()
127
+
128
+ for game in iterator:
129
+ moves = game.strip().split()
130
+ token_counts.update(moves)
131
+
132
+ # Filter by frequency
133
+ tokens = [
134
+ token for token, count in token_counts.items()
135
+ if count >= min_frequency
136
+ ]
137
+
138
+ # Sort for reproducibility
139
+ tokens = sorted(tokens)
140
+
141
+ # Build vocabulary
142
+ special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
143
+ vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
144
+
145
+ return cls(vocab=vocab)
146
 
147
  @classmethod
148
  def build_vocab_from_dataset(
 
154
  max_samples: Optional[int] = 100000,
155
  ) -> "ChessTokenizer":
156
  """
157
+ Build a tokenizer vocabulary from a Hugging Face dataset.
158
+
159
+ Args:
160
+ dataset_name: Name of the dataset on Hugging Face Hub.
161
+ split: Dataset split to use.
162
+ column: Column containing the game strings.
163
+ min_frequency: Minimum frequency for a token to be included (default: 500).
164
+ max_samples: Maximum number of samples to process (default: 100k).
165
 
166
+ Returns:
167
+ A ChessTokenizer with the built vocabulary.
168
  """
169
+ from datasets import load_dataset
170
+
171
+ dataset = load_dataset(dataset_name, split=split)
172
+
173
+ if max_samples is not None:
174
+ dataset = dataset.select(range(min(max_samples, len(dataset))))
175
+
176
+ def game_iterator():
177
+ for example in dataset:
178
+ yield example[column]
179
+
180
+ return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
181
 
182
  @property
183
  def vocab_size(self) -> int:
 
190
 
191
  def _tokenize(self, text: str) -> List[str]:
192
  """
193
+ Tokenize a string of moves into a list of tokens.
194
 
195
  Args:
196
+ text: A string of space-separated moves.
197
 
198
  Returns:
199
+ List of move tokens.
200
  """
201
+ return text.strip().split()
 
202
 
203
  def _convert_token_to_id(self, token: str) -> int:
204
  """Convert a token to its ID."""
205
+ return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
206
 
207
  def _convert_id_to_token(self, index: int) -> str:
208
  """Convert an ID to its token."""
209
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
212
+ """Convert a list of tokens back to a string."""
213
+ # Filter out special tokens for cleaner output
214
+ special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
215
+ return " ".join(t for t in tokens if t not in special)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  def save_vocabulary(
218
  self,
tokenizer_config.json CHANGED
@@ -33,6 +33,12 @@
33
  "special": true
34
  }
35
  },
 
 
 
 
 
 
36
  "bos_token": "[BOS]",
37
  "clean_up_tokenization_spaces": false,
38
  "eos_token": "[EOS]",
 
33
  "special": true
34
  }
35
  },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.ChessTokenizer",
39
+ null
40
+ ]
41
+ },
42
  "bos_token": "[BOS]",
43
  "clean_up_tokenization_spaces": false,
44
  "eos_token": "[EOS]",