swdo committed on
Commit
467b97a
·
verified ·
1 Parent(s): 35b59a1

Chess Challenge submission by swdo

Browse files
Files changed (2) hide show
  1. model.safetensors +1 -1
  2. tokenizer.py +122 -107
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2f6badcac6a0d4167ee6f95d6322d2d821159f95ce87172d365dc48ec691b74
3
  size 3490096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02aaa4f1c65a9dc94d710f1071176353e932da7f89b161579755aa4c93adcc71
3
  size 3490096
tokenizer.py CHANGED
@@ -1,16 +1,4 @@
1
- """
2
- Decomposed Chess Tokenizer v2 for the Chess Challenge.
3
 
4
- This tokenizer decomposes moves into structural components:
5
- - Color (W/B)
6
- - Piece (P/N/B/R/Q/K)
7
- - From square (a1-h8)
8
- - To square (a1-h8)
9
- - Modifiers (capture, check, checkmate, promotion, castling)
10
-
11
- This allows the model to learn chess structure and generalize better
12
- while using a much smaller vocabulary (~90 tokens vs ~1200+).
13
- """
14
 
15
  from __future__ import annotations
16
 
@@ -25,18 +13,12 @@ from transformers import PreTrainedTokenizer
25
 
26
  class ChessTokenizer(PreTrainedTokenizer):
27
  """
28
- Decomposed chess move tokenizer.
29
-
30
- Breaks moves into structural components for better learning.
31
 
32
  Example:
33
  >>> tokenizer = ChessTokenizer()
34
- >>> tokens = tokenizer.tokenize("WPe2e4 BPe7e5")
35
- >>> print(tokens)
36
- ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
37
-
38
- >>> tokenizer.encode("WNg1f3(+)")
39
- [1, 5, 8, 39, 29, 12, 2] # [BOS, W, N, g1, f3, +, EOS]
40
  """
41
 
42
  model_input_names = ["input_ids", "attention_mask"]
@@ -47,18 +29,14 @@ class ChessTokenizer(PreTrainedTokenizer):
47
  BOS_TOKEN = "[BOS]"
48
  EOS_TOKEN = "[EOS]"
49
  UNK_TOKEN = "[UNK]"
50
- SEP_TOKEN = "[SEP]" # Optional: separate moves
51
 
52
  # Chess components
53
- # Use [W] and [B] for colors to avoid collision with piece 'B' (Bishop)
54
  COLORS = ["[W]", "[B]"]
55
  PIECES = ["P", "N", "B", "R", "Q", "K"]
56
  FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
57
  RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
58
- # Generate all 64 squares
59
  SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
60
 
61
- # Modifiers
62
  MODIFIERS = [
63
  "x", # Capture
64
  "+", # Check
@@ -74,8 +52,6 @@ class ChessTokenizer(PreTrainedTokenizer):
74
  "O", # Queenside castling (dataset format)
75
  ]
76
 
77
- # Regex pattern to parse extended UCI moves
78
- # Format: [W|B][Piece][from_sq][to_sq][promotion]?[suffixes]?
79
  MOVE_PATTERN = re.compile(
80
  r'^([WB])' # Color
81
  r'([PNBRQK])' # Piece
@@ -89,24 +65,16 @@ class ChessTokenizer(PreTrainedTokenizer):
89
  self,
90
  vocab_file: Optional[str] = None,
91
  vocab: Optional[Dict[str, int]] = None,
92
- add_move_separator: bool = False,
93
  **kwargs,
94
  ):
95
- """
96
- Initialize the decomposed chess tokenizer.
97
-
98
- Args:
99
- vocab_file: Path to vocabulary JSON file.
100
- vocab: Pre-built vocabulary dictionary.
101
- add_move_separator: Whether to add [SEP] between moves.
102
- """
103
  self._pad_token = self.PAD_TOKEN
104
  self._bos_token = self.BOS_TOKEN
105
  self._eos_token = self.EOS_TOKEN
106
  self._unk_token = self.UNK_TOKEN
107
- self.add_move_separator = add_move_separator
108
-
109
- # Remove duplicates from kwargs
110
  kwargs.pop("pad_token", None)
111
  kwargs.pop("bos_token", None)
112
  kwargs.pop("eos_token", None)
@@ -119,9 +87,10 @@ class ChessTokenizer(PreTrainedTokenizer):
119
  with open(vocab_file, "r", encoding="utf-8") as f:
120
  self._vocab = json.load(f)
121
  else:
122
- self._vocab = self._create_vocab()
 
123
 
124
- # Reverse mapping
125
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
126
 
127
  super().__init__(
@@ -132,14 +101,17 @@ class ChessTokenizer(PreTrainedTokenizer):
132
  **kwargs,
133
  )
134
 
135
- def _create_vocab(self) -> Dict[str, int]:
136
- """Create the fixed vocabulary from chess components."""
 
 
 
 
 
137
  tokens = []
138
 
139
  # Special tokens first
140
  tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
141
- if self.add_move_separator:
142
- tokens.append(self.SEP_TOKEN)
143
 
144
  # Colors
145
  tokens.extend(self.COLORS)
@@ -155,6 +127,56 @@ class ChessTokenizer(PreTrainedTokenizer):
155
 
156
  return {token: idx for idx, token in enumerate(tokens)}
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  @property
159
  def vocab_size(self) -> int:
160
  return len(self._vocab)
@@ -175,12 +197,11 @@ class ChessTokenizer(PreTrainedTokenizer):
175
  match = self.MOVE_PATTERN.match(move)
176
 
177
  if not match:
178
- # Fallback: return as unknown
179
  return [self.UNK_TOKEN]
180
 
181
  tokens = []
182
 
183
- # Color - map 'W' -> '[W]' and 'B' -> '[B]' to avoid collision with piece Bishop
184
  color = match.group(1)
185
  tokens.append(f"[{color}]")
186
 
@@ -195,15 +216,13 @@ class ChessTokenizer(PreTrainedTokenizer):
195
 
196
  # Promotion (optional)
197
  if match.group(5):
198
- tokens.append(match.group(5)) # e.g., "=Q"
199
 
200
  # Parse suffixes (optional)
201
  if match.group(6):
202
- suffix = match.group(6) # e.g., "(x+)"
203
- # Remove parentheses
204
  suffix_content = suffix[1:-1]
205
 
206
- # Parse individual modifiers
207
  if "x" in suffix_content:
208
  tokens.append("x")
209
  if "+*" in suffix_content:
@@ -219,7 +238,7 @@ class ChessTokenizer(PreTrainedTokenizer):
219
 
220
  def _tokenize(self, text: str) -> List[str]:
221
  """
222
- Tokenize a string of moves.
223
 
224
  Args:
225
  text: Space-separated moves in extended UCI format.
@@ -230,13 +249,9 @@ class ChessTokenizer(PreTrainedTokenizer):
230
  tokens = []
231
  moves = text.strip().split()
232
 
233
- for i, move in enumerate(moves):
234
  move_tokens = self._parse_move(move)
235
  tokens.extend(move_tokens)
236
-
237
- # Add separator between moves (optional)
238
- if self.add_move_separator and i < len(moves) - 1:
239
- tokens.append(self.SEP_TOKEN)
240
 
241
  return tokens
242
 
@@ -252,7 +267,7 @@ class ChessTokenizer(PreTrainedTokenizer):
252
 
253
  Reconstructs moves from component tokens.
254
  """
255
- special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.SEP_TOKEN}
256
 
257
  result = []
258
  current_move = []
@@ -288,7 +303,6 @@ class ChessTokenizer(PreTrainedTokenizer):
288
  tokens[2] in self.SQUARES and
289
  tokens[3] in self.SQUARES):
290
 
291
- # Check if next token would start a new move
292
  if len(tokens) == 4:
293
  return True
294
 
@@ -296,7 +310,7 @@ class ChessTokenizer(PreTrainedTokenizer):
296
  remaining = tokens[4:]
297
  for t in remaining:
298
  if t in self.COLORS:
299
- return True # Next move starting
300
  if t not in self.MODIFIERS and not t.startswith("="):
301
  return True
302
 
@@ -309,12 +323,11 @@ class ChessTokenizer(PreTrainedTokenizer):
309
  if not tokens:
310
  return ""
311
 
312
- # Basic structure: Color + Piece + From + To
313
  if len(tokens) >= 4:
314
  # Convert [W] -> W and [B] -> B for colors
315
  color = tokens[0]
316
  if color in self.COLORS:
317
- color = color[1] # Extract 'W' from '[W]' or 'B' from '[B]'
318
 
319
  move = color + "".join(tokens[1:4])
320
 
@@ -338,6 +351,16 @@ class ChessTokenizer(PreTrainedTokenizer):
338
  save_directory: str,
339
  filename_prefix: Optional[str] = None,
340
  ) -> Tuple[str]:
 
 
 
 
 
 
 
 
 
 
341
  if not os.path.isdir(save_directory):
342
  os.makedirs(save_directory, exist_ok=True)
343
 
@@ -349,50 +372,42 @@ class ChessTokenizer(PreTrainedTokenizer):
349
  with open(vocab_file, "w", encoding="utf-8") as f:
350
  json.dump(self._vocab, f, ensure_ascii=False, indent=2)
351
 
352
- # Also save config with auto_map for HuggingFace to find our custom tokenizer
353
- # Format: (slow_tokenizer_class, fast_tokenizer_class) - we don't have a fast version
354
- config = {
355
- "tokenizer_class": "ChessTokenizer",
356
- "auto_map": {
357
- "AutoTokenizer": ["tokenizer.ChessTokenizer", None]
358
- },
359
- "add_move_separator": self.add_move_separator,
360
- "vocab_size": self.vocab_size,
361
- }
362
- config_file = os.path.join(save_directory, "tokenizer_config.json")
363
- with open(config_file, "w", encoding="utf-8") as f:
364
- json.dump(config, f, indent=2)
365
-
366
  return (vocab_file,)
367
-
368
- @classmethod
369
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
370
- """Load tokenizer from directory or hub."""
371
- path = Path(pretrained_model_name_or_path)
372
-
373
- if path.is_dir():
374
- vocab_file = path / "vocab.json"
375
- config_file = path / "tokenizer_config.json"
376
-
377
- add_move_separator = False
378
- if config_file.exists():
379
- with open(config_file, "r") as f:
380
- config = json.load(f)
381
- add_move_separator = config.get("add_move_separator", False)
382
-
383
- return cls(
384
- vocab_file=str(vocab_file) if vocab_file.exists() else None,
385
- add_move_separator=add_move_separator,
386
- **kwargs,
387
- )
388
-
389
- # Fallback to HuggingFace hub
390
- from huggingface_hub import hf_hub_download
391
-
392
- vocab_file = hf_hub_download(
393
- repo_id=pretrained_model_name_or_path,
394
- filename="vocab.json",
395
- )
396
-
397
- return cls(vocab_file=vocab_file, **kwargs)
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
13
 
14
  class ChessTokenizer(PreTrainedTokenizer):
15
  """
16
+ A custom tokenizer
 
 
17
 
18
  Example:
19
  >>> tokenizer = ChessTokenizer()
20
+ >>> tokenizer.encode("WPe2e4 BPe7e5")
21
+ [1, 4, 6, 45, 47, 5, 6, 50, 48, 2] # [BOS, components..., EOS]
 
 
 
 
22
  """
23
 
24
  model_input_names = ["input_ids", "attention_mask"]
 
29
  BOS_TOKEN = "[BOS]"
30
  EOS_TOKEN = "[EOS]"
31
  UNK_TOKEN = "[UNK]"
 
32
 
33
  # Chess components
 
34
  COLORS = ["[W]", "[B]"]
35
  PIECES = ["P", "N", "B", "R", "Q", "K"]
36
  FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
37
  RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
 
38
  SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
39
 
 
40
  MODIFIERS = [
41
  "x", # Capture
42
  "+", # Check
 
52
  "O", # Queenside castling (dataset format)
53
  ]
54
 
 
 
55
  MOVE_PATTERN = re.compile(
56
  r'^([WB])' # Color
57
  r'([PNBRQK])' # Piece
 
65
  self,
66
  vocab_file: Optional[str] = None,
67
  vocab: Optional[Dict[str, int]] = None,
 
68
  **kwargs,
69
  ):
70
+
71
+
 
 
 
 
 
 
72
  self._pad_token = self.PAD_TOKEN
73
  self._bos_token = self.BOS_TOKEN
74
  self._eos_token = self.EOS_TOKEN
75
  self._unk_token = self.UNK_TOKEN
76
+
77
+ # Remove any duplicate
 
78
  kwargs.pop("pad_token", None)
79
  kwargs.pop("bos_token", None)
80
  kwargs.pop("eos_token", None)
 
87
  with open(vocab_file, "r", encoding="utf-8") as f:
88
  self._vocab = json.load(f)
89
  else:
90
+ # Create the fixed decomposed vocabulary
91
+ self._vocab = self._create_default_vocab()
92
 
93
+ # Create reverse mapping
94
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
95
 
96
  super().__init__(
 
101
  **kwargs,
102
  )
103
 
104
+ def _create_default_vocab(self) -> Dict[str, int]:
105
+ """
106
+ Create the fixed vocabulary from chess components.
107
+
108
+ Unlike the standard tokenizer, this creates a small fixed vocab
109
+ of ~88 tokens for decomposed move representation.
110
+ """
111
  tokens = []
112
 
113
  # Special tokens first
114
  tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
 
 
115
 
116
  # Colors
117
  tokens.extend(self.COLORS)
 
127
 
128
  return {token: idx for idx, token in enumerate(tokens)}
129
 
130
+ @classmethod
131
+ def build_vocab_from_iterator(
132
+ cls,
133
+ iterator,
134
+ min_frequency: int = 1,
135
+ ) -> "ChessTokenizer":
136
+ """
137
+ Build a tokenizer vocabulary from an iterator of game strings.
138
+
139
+ Note: For decomposed tokenizer, this ignores the iterator and
140
+ creates the fixed vocabulary. Provided for API compatibility.
141
+
142
+ Args:
143
+ iterator: An iterator yielding game strings (ignored).
144
+ min_frequency: Minimum frequency for a token (ignored).
145
+
146
+ Returns:
147
+ A ChessTokenizer with the fixed decomposed vocabulary.
148
+ """
149
+ # Decomposed tokenizer uses fixed vocabulary
150
+ return cls()
151
+
152
+ @classmethod
153
+ def build_vocab_from_dataset(
154
+ cls,
155
+ dataset_name: str = "dlouapre/lichess_2025-01_1M",
156
+ split: str = "train",
157
+ column: str = "moves",
158
+ min_frequency: int = 1,
159
+ max_samples: Optional[int] = None,
160
+ ) -> "ChessTokenizer":
161
+ """
162
+ Build a tokenizer vocabulary from a Hugging Face dataset.
163
+
164
+ Note: For decomposed tokenizer, this ignores the dataset and
165
+ creates the fixed vocabulary. Provided for API compatibility.
166
+
167
+ Args:
168
+ dataset_name: Name of the dataset on Hugging Face Hub (ignored).
169
+ split: Dataset split to use (ignored).
170
+ column: Column containing move strings (ignored).
171
+ min_frequency: Minimum frequency for inclusion (ignored).
172
+ max_samples: Maximum samples to process (ignored).
173
+
174
+ Returns:
175
+ A ChessTokenizer with the fixed decomposed vocabulary.
176
+ """
177
+ print(f"Note: Decomposed tokenizer uses fixed vocabulary (~88 tokens)")
178
+ return cls()
179
+
180
  @property
181
  def vocab_size(self) -> int:
182
  return len(self._vocab)
 
197
  match = self.MOVE_PATTERN.match(move)
198
 
199
  if not match:
 
200
  return [self.UNK_TOKEN]
201
 
202
  tokens = []
203
 
204
+ # Color - map 'W' -> '[W]' and 'B' -> '[B]'
205
  color = match.group(1)
206
  tokens.append(f"[{color}]")
207
 
 
216
 
217
  # Promotion (optional)
218
  if match.group(5):
219
+ tokens.append(match.group(5))
220
 
221
  # Parse suffixes (optional)
222
  if match.group(6):
223
+ suffix = match.group(6)
 
224
  suffix_content = suffix[1:-1]
225
 
 
226
  if "x" in suffix_content:
227
  tokens.append("x")
228
  if "+*" in suffix_content:
 
238
 
239
  def _tokenize(self, text: str) -> List[str]:
240
  """
241
+ Tokenize a string of moves into component tokens.
242
 
243
  Args:
244
  text: Space-separated moves in extended UCI format.
 
249
  tokens = []
250
  moves = text.strip().split()
251
 
252
+ for move in moves:
253
  move_tokens = self._parse_move(move)
254
  tokens.extend(move_tokens)
 
 
 
 
255
 
256
  return tokens
257
 
 
267
 
268
  Reconstructs moves from component tokens.
269
  """
270
+ special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
271
 
272
  result = []
273
  current_move = []
 
303
  tokens[2] in self.SQUARES and
304
  tokens[3] in self.SQUARES):
305
 
 
306
  if len(tokens) == 4:
307
  return True
308
 
 
310
  remaining = tokens[4:]
311
  for t in remaining:
312
  if t in self.COLORS:
313
+ return True
314
  if t not in self.MODIFIERS and not t.startswith("="):
315
  return True
316
 
 
323
  if not tokens:
324
  return ""
325
 
 
326
  if len(tokens) >= 4:
327
  # Convert [W] -> W and [B] -> B for colors
328
  color = tokens[0]
329
  if color in self.COLORS:
330
+ color = color[1]
331
 
332
  move = color + "".join(tokens[1:4])
333
 
 
351
  save_directory: str,
352
  filename_prefix: Optional[str] = None,
353
  ) -> Tuple[str]:
354
+ """
355
+ Save the vocabulary to a file.
356
+
357
+ Args:
358
+ save_directory: Directory to save the vocabulary.
359
+ filename_prefix: Optional prefix for the vocabulary file.
360
+
361
+ Returns:
362
+ Tuple containing the path to the saved vocabulary file.
363
+ """
364
  if not os.path.isdir(save_directory):
365
  os.makedirs(save_directory, exist_ok=True)
366
 
 
372
  with open(vocab_file, "w", encoding="utf-8") as f:
373
  json.dump(self._vocab, f, ensure_ascii=False, indent=2)
374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  return (vocab_file,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
+
378
+ def count_vocab_from_dataset(
379
+ dataset_name: str = "dlouapre/lichess_2025-01_1M",
380
+ split: str = "train",
381
+ column: str = "moves",
382
+ max_samples: Optional[int] = None,
383
+ ) -> Dict[str, int]:
384
+ """
385
+ Count token frequencies in a dataset.
386
+
387
+ Note: For decomposed tokenizer, this counts component frequencies
388
+ rather than whole-move frequencies.
389
+
390
+ Args:
391
+ dataset_name: Name of the dataset.
392
+ split: Dataset split.
393
+ column: Column with moves.
394
+ max_samples: Max samples to process.
395
+
396
+ Returns:
397
+ Dictionary of token frequencies.
398
+ """
399
+ from collections import Counter
400
+ from datasets import load_dataset
401
+
402
+ tokenizer = ChessTokenizer()
403
+
404
+ dataset = load_dataset(dataset_name, split=split)
405
+ if max_samples:
406
+ dataset = dataset.select(range(min(max_samples, len(dataset))))
407
+
408
+ counts = Counter()
409
+ for example in dataset:
410
+ tokens = tokenizer.tokenize(example[column])
411
+ counts.update(tokens)
412
+
413
+ return dict(counts)