alexandreduplessis committed on
Commit
6d61d77
·
verified ·
1 Parent(s): 57243ca

Chess Challenge submission by alexandreduplessis

Browse files
Files changed (1) hide show
  1. tokenizer.py +0 -70
tokenizer.py CHANGED
@@ -1,22 +1,3 @@
1
- """
2
- Custom Chess Tokenizer for the Chess Challenge (structured, decomposed).
3
-
4
- This tokenizer parses the dataset's extended UCI tokens (e.g., WPe2e4, BNg8f6(x))
5
- and decomposes each move into a small set of atomic tokens:
6
-
7
- [MOVE] e2 e4
8
- [MOVE] e7 e8 promo_q (promotion when detected)
9
-
10
- Design goals for <1M parameter models:
11
- - Small, fixed vocabulary (no dataset scan needed)
12
- - Reduced sparsity (share statistics across moves)
13
- - Fewer failure modes (drop suffix tokens like (x), (+), etc.)
14
- - Compatible with HF Trainer / PreTrainedTokenizer
15
-
16
- Note: evaluation extracts UCI moves by detecting square patterns in generated text.
17
- This tokenizer ensures squares appear as tokens ("e2", "e4") which is evaluator-friendly.
18
- """
19
-
20
  from __future__ import annotations
21
 
22
  import json
@@ -28,15 +9,6 @@ from transformers import PreTrainedTokenizer
28
 
29
 
30
  class ChessTokenizer(PreTrainedTokenizer):
31
- """
32
- A custom tokenizer for chess moves.
33
-
34
- Each dataset move like 'WPe2e4(x)' becomes tokens:
35
- ['[MOVE]', 'e2', 'e4'] (+ optional 'promo_q/r/b/n')
36
-
37
- This helps small models learn legality by learning square transitions
38
- rather than memorizing thousands of full-move tokens.
39
- """
40
 
41
  model_input_names = ["input_ids", "attention_mask"]
42
  vocab_files_names = {"vocab_file": "vocab.json"}
@@ -50,11 +22,9 @@ class ChessTokenizer(PreTrainedTokenizer):
50
  # Structure token
51
  MOVE_TOKEN = "[MOVE]"
52
 
53
- # Regex to parse dataset moves: W/B + piece + from + to + rest
54
  _MOVE_RE = re.compile(
55
  r'^(?P<color>[WB])(?P<piece>[PNBRQK])(?P<from>[a-h][1-8])(?P<to>[a-h][1-8])(?P<rest>.*)$'
56
  )
57
- # Promotion detection (be permissive)
58
  _PROMO_RE = re.compile(r'=?([QRBNqrbn])')
59
 
60
  def __init__(
@@ -86,7 +56,6 @@ class ChessTokenizer(PreTrainedTokenizer):
86
 
87
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
88
 
89
- # Call parent init AFTER vocab is ready
90
  super().__init__(
91
  pad_token=self._pad_token,
92
  bos_token=self._bos_token,
@@ -96,21 +65,11 @@ class ChessTokenizer(PreTrainedTokenizer):
96
  )
97
 
98
  def _create_default_vocab(self) -> Dict[str, int]:
99
- """
100
- Minimal default vocab (placeholder). Prefer build_structured_vocab().
101
- """
102
  special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN]
103
  return {t: i for i, t in enumerate(special)}
104
 
105
  @classmethod
106
  def build_structured_vocab(cls) -> "ChessTokenizer":
107
- """
108
- Build a fixed, complete vocabulary:
109
- - special tokens
110
- - [MOVE]
111
- - 64 squares: a1..h8
112
- - promotion tokens: promo_q/r/b/n
113
- """
114
  special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN, cls.MOVE_TOKEN]
115
 
116
  files = "abcdefgh"
@@ -123,8 +82,6 @@ class ChessTokenizer(PreTrainedTokenizer):
123
  vocab = {t: i for i, t in enumerate(tokens)}
124
  return cls(vocab=vocab)
125
 
126
- # Backwards-compatible API: if someone calls dataset-based vocab build,
127
- # we return structured vocab by default (dataset scan is unnecessary here).
128
  @classmethod
129
  def build_vocab_from_dataset(
130
  cls,
@@ -134,7 +91,6 @@ class ChessTokenizer(PreTrainedTokenizer):
134
  min_frequency: int = 500,
135
  max_samples: Optional[int] = 100000,
136
  ) -> "ChessTokenizer":
137
- # Keep signature, but use structured vocab for this tokenizer design.
138
  return cls.build_structured_vocab()
139
 
140
  @property
@@ -151,20 +107,10 @@ class ChessTokenizer(PreTrainedTokenizer):
151
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
152
 
153
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
154
- """
155
- Convert tokens back to a string.
156
-
157
- We keep squares and promo tokens; we drop PAD/BOS/EOS/UNK for cleaner output.
158
- Keeping [MOVE] is useful for structure (but you can drop it if you want).
159
- """
160
  drop = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
161
  return " ".join(t for t in tokens if t not in drop)
162
 
163
  def _decompose_one_move(self, move_tok: str) -> List[str]:
164
- """
165
- Parse dataset move token 'WPe2e4(x)' -> ['[MOVE]', 'e2', 'e4'] (+ promo)
166
- If parsing fails, emit [UNK].
167
- """
168
  m = self._MOVE_RE.match(move_tok)
169
  if not m:
170
  return [self.UNK_TOKEN]
@@ -185,31 +131,19 @@ class ChessTokenizer(PreTrainedTokenizer):
185
  return out
186
 
187
  def _tokenize(self, text: str) -> List[str]:
188
- """
189
- Tokenize text.
190
-
191
- Important: HF may call _tokenize() on already-split "words".
192
- So this must handle both:
193
- - full strings with spaces
194
- - a single token like "WPe2e4(x)"
195
- """
196
  text = text.strip()
197
  if not text:
198
  return []
199
 
200
  special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN}
201
 
202
- # If HF already split: single "word"
203
  if " " not in text:
204
  if text in special:
205
  return [text]
206
- # If it's already a square or promo token, keep it
207
  if text in self._vocab:
208
  return [text]
209
- # Otherwise treat as a dataset move token
210
  return self._decompose_one_move(text)
211
 
212
- # Otherwise split ourselves
213
  out: List[str] = []
214
  for part in text.split():
215
  if part in special:
@@ -245,10 +179,6 @@ def count_vocab_from_dataset(
245
  column: str = "text",
246
  max_samples: Optional[int] = 10000,
247
  ) -> Dict[str, int]:
248
- """
249
- Left here for convenience if you still want frequency stats,
250
- but it's not used by the structured tokenizer.
251
- """
252
  from collections import Counter
253
  from datasets import load_dataset
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import json
 
9
 
10
 
11
  class ChessTokenizer(PreTrainedTokenizer):
 
 
 
 
 
 
 
 
 
12
 
13
  model_input_names = ["input_ids", "attention_mask"]
14
  vocab_files_names = {"vocab_file": "vocab.json"}
 
22
  # Structure token
23
  MOVE_TOKEN = "[MOVE]"
24
 
 
25
  _MOVE_RE = re.compile(
26
  r'^(?P<color>[WB])(?P<piece>[PNBRQK])(?P<from>[a-h][1-8])(?P<to>[a-h][1-8])(?P<rest>.*)$'
27
  )
 
28
  _PROMO_RE = re.compile(r'=?([QRBNqrbn])')
29
 
30
  def __init__(
 
56
 
57
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
58
 
 
59
  super().__init__(
60
  pad_token=self._pad_token,
61
  bos_token=self._bos_token,
 
65
  )
66
 
67
  def _create_default_vocab(self) -> Dict[str, int]:
 
 
 
68
  special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN]
69
  return {t: i for i, t in enumerate(special)}
70
 
71
  @classmethod
72
  def build_structured_vocab(cls) -> "ChessTokenizer":
 
 
 
 
 
 
 
73
  special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN, cls.MOVE_TOKEN]
74
 
75
  files = "abcdefgh"
 
82
  vocab = {t: i for i, t in enumerate(tokens)}
83
  return cls(vocab=vocab)
84
 
 
 
85
  @classmethod
86
  def build_vocab_from_dataset(
87
  cls,
 
91
  min_frequency: int = 500,
92
  max_samples: Optional[int] = 100000,
93
  ) -> "ChessTokenizer":
 
94
  return cls.build_structured_vocab()
95
 
96
  @property
 
107
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
108
 
109
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
 
 
 
 
 
 
110
  drop = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
111
  return " ".join(t for t in tokens if t not in drop)
112
 
113
  def _decompose_one_move(self, move_tok: str) -> List[str]:
 
 
 
 
114
  m = self._MOVE_RE.match(move_tok)
115
  if not m:
116
  return [self.UNK_TOKEN]
 
131
  return out
132
 
133
  def _tokenize(self, text: str) -> List[str]:
 
 
 
 
 
 
 
 
134
  text = text.strip()
135
  if not text:
136
  return []
137
 
138
  special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN, self.MOVE_TOKEN}
139
 
 
140
  if " " not in text:
141
  if text in special:
142
  return [text]
 
143
  if text in self._vocab:
144
  return [text]
 
145
  return self._decompose_one_move(text)
146
 
 
147
  out: List[str] = []
148
  for part in text.split():
149
  if part in special:
 
179
  column: str = "text",
180
  max_samples: Optional[int] = 10000,
181
  ) -> Dict[str, int]:
 
 
 
 
182
  from collections import Counter
183
  from datasets import load_dataset
184