raphael-mathiot committed on
Commit
1da69df
·
verified ·
1 Parent(s): d810780

Chess Challenge submission by raphael-mathiot

Browse files
Files changed (5) hide show
  1. README.md +22 -1
  2. config.json +5 -5
  3. model.safetensors +2 -2
  4. tokenizer.py +62 -203
  5. vocab.json +68 -81
README.md CHANGED
@@ -1,5 +1,26 @@
1
  ---
 
2
  tags:
3
  - chess
 
 
 
4
  ---
5
- # chess-stonkfish
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ library_name: transformers
3
  tags:
4
  - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
  ---
9
+
10
+ # chess-stonkfish
11
+
12
+ Chess model submitted to the LLM Course Chess Challenge.
13
+
14
+ ## Submission Info
15
+
16
+ - **Submitted by**: [raphael-mathiot](https://huggingface.co/raphael-mathiot)
17
+ - **Parameters**: 991,320
18
+ - **Organization**: LLM-course
19
+
20
+ ## Model Details
21
+
22
+ - **Architecture**: Chess Transformer (GPT-style)
23
+ - **Vocab size**: 72
24
+ - **Embedding dim**: 128
25
+ - **Layers**: 6
26
+ - **Heads**: 8
config.json CHANGED
@@ -12,12 +12,12 @@
12
  "layer_norm_epsilon": 1e-05,
13
  "model_type": "chess_transformer",
14
  "n_ctx": 256,
15
- "n_embd": 96,
16
- "n_head": 6,
17
- "n_inner": 288,
18
- "n_layer": 10,
19
  "pad_token_id": 0,
20
  "tie_weights": true,
21
  "transformers_version": "4.57.1",
22
- "vocab_size": 85
23
  }
 
12
  "layer_norm_epsilon": 1e-05,
13
  "model_type": "chess_transformer",
14
  "n_ctx": 256,
15
+ "n_embd": 128,
16
+ "n_head": 8,
17
+ "n_inner": 356,
18
+ "n_layer": 6,
19
  "pad_token_id": 0,
20
  "tie_weights": true,
21
  "transformers_version": "4.57.1",
22
+ "vocab_size": 72
23
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d3d5e7e5bd0d5e40c2268eb3e4d00753e923393a3eedbe22c20fa54d9575c3c
3
- size 3874600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68f2b547f340aaf26abb5a994f3dc47e741d65c2939db2511ef66481b31ae60a
3
+ size 3971728
tokenizer.py CHANGED
@@ -24,14 +24,18 @@ from transformers import PreTrainedTokenizer
24
 
25
  class ChessTokenizer(PreTrainedTokenizer):
26
  """
27
- A custom tokenizer for chess moves using extended UCI notation.
28
 
29
- This tokenizer splits moves into semantic components (Pieces, Squares, Metadata).
30
- Example: "WPe2e4" -> ["WP", "e2", "e4"]
 
 
 
 
 
31
  """
32
 
33
  model_input_names = ["input_ids", "attention_mask"]
34
- vocab_files_names = {"vocab_file": "vocab.json"}
35
 
36
  # Special tokens
37
  PAD_TOKEN = "[PAD]"
@@ -45,45 +49,27 @@ class ChessTokenizer(PreTrainedTokenizer):
45
  vocab: Optional[Dict[str, int]] = None,
46
  **kwargs,
47
  ):
48
- """
49
- Initialize the chess tokenizer.
50
- """
51
- # Initialize special tokens
52
  self._pad_token = self.PAD_TOKEN
53
  self._bos_token = self.BOS_TOKEN
54
  self._eos_token = self.EOS_TOKEN
55
  self._unk_token = self.UNK_TOKEN
56
 
57
- # Clean kwargs
58
  kwargs.pop("pad_token", None)
59
  kwargs.pop("bos_token", None)
60
  kwargs.pop("eos_token", None)
61
  kwargs.pop("unk_token", None)
62
 
63
- # Regex for splitting moves into:
64
- # 1. Castling: (O), (o)
65
- # 2. Metadata: (x), (+*), (+)
66
- # 3. Pieces: WP, BR, etc.
67
- # 4. Squares: a1, h8, etc.
68
- self.token_pattern = re.compile(
69
- r'\(O\)|\(o\)|' # Castling
70
- r'\(x\)|\(\+\*\)|\(\+\)|' # Metadata (Capture, Mate, Check)
71
- r'[WB][PRNBQK]|' # Pieces (Color + Type)
72
- r'[a-h][1-8]' # Squares
73
- )
74
 
75
- # Load or create vocabulary
76
  if vocab is not None:
77
  self._vocab = vocab
78
  elif vocab_file is not None and os.path.exists(vocab_file):
79
  with open(vocab_file, "r", encoding="utf-8") as f:
80
  self._vocab = json.load(f)
81
  else:
82
- # In this version, the default vocab is the FULL vocab
83
- # because chess rules are static.
84
  self._vocab = self._create_default_vocab()
85
 
86
- # Create reverse mapping
87
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
88
 
89
  super().__init__(
@@ -95,216 +81,89 @@ class ChessTokenizer(PreTrainedTokenizer):
95
  )
96
 
97
  def _create_default_vocab(self) -> Dict[str, int]:
98
- """
99
- Create the full static vocabulary for Chess.
100
- Since the 'rules' of the tokens are known (squares a1-h8, pieces),
101
- we generate the full map here instead of learning it.
102
- """
103
- # 1. Special Tokens
104
  special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
105
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
106
  idx = len(vocab)
107
-
108
- # 2. Pieces (White/Black + Pawn/Rook/Knight/Bishop/Queen/King)
109
- colors = ['W', 'B']
110
- pieces = ['P', 'R', 'N', 'B', 'Q', 'K']
111
- for c in colors:
112
- for p in pieces:
113
- token = f"{c}{p}"
114
- if token not in vocab:
115
- vocab[token] = idx
116
- idx += 1
117
-
118
- # 3. Squares (a1 to h8)
119
- files = 'abcdefgh'
120
- ranks = '12345678'
121
- for f in files:
122
- for r in ranks:
123
- token = f"{f}{r}"
124
- if token not in vocab:
125
- vocab[token] = idx
126
- idx += 1
127
-
128
- # 4. Special Move Suffixes
129
- # Note: Order is handled by regex, but we just need them in vocab here
130
- specials = ['(O)', '(o)', '(x)', '(+)', '(+*)']
131
- for s in specials:
132
- if s not in vocab:
133
- vocab[s] = idx
134
  idx += 1
135
 
 
 
 
 
136
  return vocab
137
 
138
- @classmethod
139
- def build_vocab_from_iterator(
140
- cls,
141
- iterator: Iterator,
142
- min_frequency: int = 1,
143
- ) -> "ChessTokenizer":
144
- """
145
- API Compatibility Method.
146
-
147
- Since this tokenizer uses a static vocabulary based on Chess rules,
148
- scanning the iterator is not necessary. We simply consume the iterator
149
- (optional) and return the standard tokenizer.
150
- """
151
- # We explicitly ignore the iterator data because our vocab
152
- # is pre-defined by the rules of the game.
153
- return cls()
154
-
155
- @classmethod
156
- def build_vocab_from_dataset(
157
- cls,
158
- dataset_name: str = "dlouapre/lichess_2025-01_1M",
159
- split: str = "train",
160
- column: str = "text",
161
- min_frequency: int = 500,
162
- max_samples: Optional[int] = 100000,
163
- ) -> "ChessTokenizer":
164
- """
165
- API Compatibility Method.
166
-
167
- Returns a tokenizer with the standard chess vocabulary.
168
- Does not download the dataset as the vocabulary is static.
169
- """
170
- return cls()
171
-
172
- @property
173
- def vocab_size(self) -> int:
174
- """Return the size of the vocabulary."""
175
- return len(self._vocab)
176
-
177
- def get_vocab(self) -> Dict[str, int]:
178
- """Return the vocabulary as a dictionary."""
179
- return dict(self._vocab)
180
-
181
  def _tokenize(self, text: str) -> List[str]:
182
  """
183
- Tokenize a string of moves into semantic components using Regex.
184
-
185
- Args:
186
- text: A string of space-separated moves (e.g., "WPe2e4 BPe7e5")
187
-
188
- Returns:
189
- List of components (e.g., ["WP", "e2", "e4", "BP", "e7", "e5"])
190
  """
191
- # findall will ignore spaces and return only the matching components
 
 
 
 
 
 
 
 
 
 
192
  return self.token_pattern.findall(text)
193
 
194
  def _convert_token_to_id(self, token: str) -> int:
195
  """Convert a token to its ID."""
196
- return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))
197
 
198
  def _convert_id_to_token(self, index: int) -> str:
199
  """Convert an ID to its token."""
200
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
201
 
202
- def _is_start_of_move(self, token: str) -> bool:
203
- """
204
- Helper to determine if a token represents the start of a new move.
205
- Moves start with a Piece (e.g., 'WP') or Castling (e.g., '(O)').
206
- """
207
- # 1. Check for Castling (Short or Long)
208
- if token in ['(O)', '(o)']:
209
- return True
210
-
211
- # 2. Check for Pieces (Length 2, starts with W/B, ends with Piece type)
212
- # We check specific characters to avoid confusion with squares or suffixes
213
- if len(token) == 2 and token[0] in 'WB' and token[1] in 'PRNBQK':
214
- return True
215
-
216
- return False
217
-
218
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
219
- """
220
- Converts a list of tokens back to a string, respecting Chess notation rules.
 
221
 
222
- Logic:
223
- - Spaces are inserted BEFORE a token ONLY if that token marks the start of a new move.
224
- - Squares (e2, e4) and Suffixes (x, +) are concatenated to the previous token.
225
- """
226
  output = []
227
- special_tokens = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
228
-
229
- for i, token in enumerate(tokens):
230
- # 1. Handle Special Tokens (keep them, surround with spaces if needed)
231
- if token in special_tokens:
232
- if output and output[-1] != " ":
233
- output.append(" ")
234
- output.append(token)
235
-
236
- # 2. Handle Start of New Move (Insert space before)
237
- elif self._is_start_of_move(token):
238
- # Add a space if we aren't at the very start and the previous char isn't already a space
239
- if output and output[-1] != " ":
240
- output.append(" ")
241
- output.append(token)
242
-
243
- # 3. Handle Continuations (Squares 'e2', Suffixes '(x)') -> Concatenate
244
  else:
245
  output.append(token)
246
-
247
- return "".join(output).strip()
248
 
249
- def save_vocabulary(
250
- self,
251
- save_directory: str,
252
- filename_prefix: Optional[str] = None,
253
- ) -> tuple:
254
- """
255
- Save the vocabulary to a JSON file.
256
-
257
- Args:
258
- save_directory: Directory to save the vocabulary.
259
- filename_prefix: Optional prefix for the filename.
260
-
261
- Returns:
262
- Tuple containing the path to the saved vocabulary file.
263
- """
264
  if not os.path.isdir(save_directory):
265
  os.makedirs(save_directory, exist_ok=True)
266
-
267
  vocab_file = os.path.join(
268
- save_directory,
269
- (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
270
  )
271
-
272
  with open(vocab_file, "w", encoding="utf-8") as f:
273
  json.dump(self._vocab, f, ensure_ascii=False, indent=2)
274
-
275
  return (vocab_file,)
276
-
277
-
278
- def count_vocab_from_dataset(
279
- dataset_name: str = "dlouapre/lichess_2025-01_1M",
280
- split: str = "train",
281
- column: str = "text",
282
- max_samples: Optional[int] = 10000,
283
- ) -> Dict[str, int]:
284
- """
285
- Count token frequencies in a dataset (useful for vocabulary analysis).
286
-
287
- Args:
288
- dataset_name: Name of the dataset on Hugging Face Hub.
289
- split: Dataset split to use.
290
- column: Column containing the game strings.
291
- max_samples: Maximum number of samples to process.
292
-
293
- Returns:
294
- Dictionary mapping tokens to their frequencies.
295
- """
296
- from collections import Counter
297
- from datasets import load_dataset
298
 
299
- dataset = load_dataset(dataset_name, split=split)
300
-
301
- if max_samples is not None:
302
- dataset = dataset.select(range(min(max_samples, len(dataset))))
303
-
304
- token_counts = Counter()
305
-
306
- for example in dataset:
307
- moves = example[column].strip().split()
308
- token_counts.update(moves)
 
 
309
 
310
- return dict(token_counts)
 
 
24
 
25
  class ChessTokenizer(PreTrainedTokenizer):
26
  """
27
+ A robust chess tokenizer using a 72-token vocabulary.
28
 
29
+ It handles raw Extended UCI notation (e.g., "WPa7a8(Q)", "BQd8h4(+*)")
30
+ by automatically cleaning and extracting only the necessary board moves.
31
+
32
+ Vocabulary:
33
+ - 4 Special: [PAD], [BOS], [EOS], [UNK]
34
+ - 64 Squares: a1...h8
35
+ - 4 Promotions: q, r, b, n
36
  """
37
 
38
  model_input_names = ["input_ids", "attention_mask"]
 
39
 
40
  # Special tokens
41
  PAD_TOKEN = "[PAD]"
 
49
  vocab: Optional[Dict[str, int]] = None,
50
  **kwargs,
51
  ):
 
 
 
 
52
  self._pad_token = self.PAD_TOKEN
53
  self._bos_token = self.BOS_TOKEN
54
  self._eos_token = self.EOS_TOKEN
55
  self._unk_token = self.UNK_TOKEN
56
 
 
57
  kwargs.pop("pad_token", None)
58
  kwargs.pop("bos_token", None)
59
  kwargs.pop("eos_token", None)
60
  kwargs.pop("unk_token", None)
61
 
62
+ # Regex to find Squares (a1-h8) OR lowercase promotion letters (qrbn)
63
+ self.token_pattern = re.compile(r'[a-h][1-8]|[qrbn]')
 
 
 
 
 
 
 
 
 
64
 
 
65
  if vocab is not None:
66
  self._vocab = vocab
67
  elif vocab_file is not None and os.path.exists(vocab_file):
68
  with open(vocab_file, "r", encoding="utf-8") as f:
69
  self._vocab = json.load(f)
70
  else:
 
 
71
  self._vocab = self._create_default_vocab()
72
 
 
73
  self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
74
 
75
  super().__init__(
 
81
  )
82
 
83
  def _create_default_vocab(self) -> Dict[str, int]:
 
 
 
 
 
 
84
  special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
85
  vocab = {token: idx for idx, token in enumerate(special_tokens)}
86
  idx = len(vocab)
87
+
88
+ # Squares (4-67)
89
+ for f in 'abcdefgh':
90
+ for r in '12345678':
91
+ vocab[f"{f}{r}"] = idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  idx += 1
93
 
94
+ # Promotions (68-71)
95
+ for p in ['q', 'r', 'b', 'n']:
96
+ vocab[p] = idx
97
+ idx += 1
98
  return vocab
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def _tokenize(self, text: str) -> List[str]:
101
  """
102
+ Tokenizes text by first normalizing specific chess patterns
103
+ and then extracting squares/promotions.
 
 
 
 
 
104
  """
105
+ # 1. NORMALIZE: Handle the bracketed promotions found in your dataset
106
+ # Convert (Q) -> q, (N) -> n, etc.
107
+ text = (text.replace("(Q)", "q")
108
+ .replace("(R)", "r")
109
+ .replace("(B)", "b")
110
+ .replace("(N)", "n"))
111
+
112
+ # 2. EXTRACT: Use regex to find valid tokens.
113
+ # The regex r'[a-h][1-8]|[qrbn]' will:
114
+ # - Match 'a7', 'a8', 'q' (from "WPa7a8q")
115
+ # - Ignore 'W', 'P', 'B', 'Q', '(', '+', '*' (Garbage)
116
  return self.token_pattern.findall(text)
117
 
118
  def _convert_token_to_id(self, token: str) -> int:
119
  """Convert a token to its ID."""
120
+ return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
121
 
122
  def _convert_id_to_token(self, index: int) -> str:
123
  """Convert an ID to its token."""
124
  return self._ids_to_tokens.get(index, self.UNK_TOKEN)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
127
+ """Reconstructs standard UCI string (e.g. "e2e4 a7a8q")"""
128
+ special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
129
+ clean_tokens = [t for t in tokens if t not in special]
130
 
 
 
 
 
131
  output = []
132
+ for token in clean_tokens:
133
+ # Append promotion to previous move
134
+ if token in ['q', 'r', 'b', 'n'] and output:
135
+ output[-1] += token
136
+ # Append 2nd square to 1st square (e2 + e4 -> e2e4)
137
+ elif output and len(output[-1]) == 2 and output[-1][0] in 'abcdefgh':
138
+ output[-1] += token
139
+ # Start new move
 
 
 
 
 
 
 
 
 
140
  else:
141
  output.append(token)
142
+
143
+ return " ".join(output)
144
 
145
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  if not os.path.isdir(save_directory):
147
  os.makedirs(save_directory, exist_ok=True)
 
148
  vocab_file = os.path.join(
149
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
 
150
  )
 
151
  with open(vocab_file, "w", encoding="utf-8") as f:
152
  json.dump(self._vocab, f, ensure_ascii=False, indent=2)
 
153
  return (vocab_file,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
+ # --- Dummy Compatibility Methods ---
156
+ @classmethod
157
+ def build_vocab_from_iterator(cls, iterator, min_frequency=1):
158
+ return cls() # Vocab is static
159
+
160
+ @classmethod
161
+ def build_vocab_from_dataset(cls, **kwargs):
162
+ return cls() # Vocab is static
163
+
164
+ @property
165
+ def vocab_size(self) -> int:
166
+ return len(self._vocab)
167
 
168
+ def get_vocab(self) -> Dict[str, int]:
169
+ return dict(self._vocab)
vocab.json CHANGED
@@ -3,85 +3,72 @@
3
  "[BOS]": 1,
4
  "[EOS]": 2,
5
  "[UNK]": 3,
6
- "WP": 4,
7
- "WR": 5,
8
- "WN": 6,
9
- "WB": 7,
10
- "WQ": 8,
11
- "WK": 9,
12
- "BP": 10,
13
- "BR": 11,
14
- "BN": 12,
15
- "BB": 13,
16
- "BQ": 14,
17
- "BK": 15,
18
- "a1": 16,
19
- "a2": 17,
20
- "a3": 18,
21
- "a4": 19,
22
- "a5": 20,
23
- "a6": 21,
24
- "a7": 22,
25
- "a8": 23,
26
- "b1": 24,
27
- "b2": 25,
28
- "b3": 26,
29
- "b4": 27,
30
- "b5": 28,
31
- "b6": 29,
32
- "b7": 30,
33
- "b8": 31,
34
- "c1": 32,
35
- "c2": 33,
36
- "c3": 34,
37
- "c4": 35,
38
- "c5": 36,
39
- "c6": 37,
40
- "c7": 38,
41
- "c8": 39,
42
- "d1": 40,
43
- "d2": 41,
44
- "d3": 42,
45
- "d4": 43,
46
- "d5": 44,
47
- "d6": 45,
48
- "d7": 46,
49
- "d8": 47,
50
- "e1": 48,
51
- "e2": 49,
52
- "e3": 50,
53
- "e4": 51,
54
- "e5": 52,
55
- "e6": 53,
56
- "e7": 54,
57
- "e8": 55,
58
- "f1": 56,
59
- "f2": 57,
60
- "f3": 58,
61
- "f4": 59,
62
- "f5": 60,
63
- "f6": 61,
64
- "f7": 62,
65
- "f8": 63,
66
- "g1": 64,
67
- "g2": 65,
68
- "g3": 66,
69
- "g4": 67,
70
- "g5": 68,
71
- "g6": 69,
72
- "g7": 70,
73
- "g8": 71,
74
- "h1": 72,
75
- "h2": 73,
76
- "h3": 74,
77
- "h4": 75,
78
- "h5": 76,
79
- "h6": 77,
80
- "h7": 78,
81
- "h8": 79,
82
- "(O)": 80,
83
- "(o)": 81,
84
- "(x)": 82,
85
- "(+)": 83,
86
- "(+*)": 84
87
  }
 
3
  "[BOS]": 1,
4
  "[EOS]": 2,
5
  "[UNK]": 3,
6
+ "a1": 4,
7
+ "a2": 5,
8
+ "a3": 6,
9
+ "a4": 7,
10
+ "a5": 8,
11
+ "a6": 9,
12
+ "a7": 10,
13
+ "a8": 11,
14
+ "b1": 12,
15
+ "b2": 13,
16
+ "b3": 14,
17
+ "b4": 15,
18
+ "b5": 16,
19
+ "b6": 17,
20
+ "b7": 18,
21
+ "b8": 19,
22
+ "c1": 20,
23
+ "c2": 21,
24
+ "c3": 22,
25
+ "c4": 23,
26
+ "c5": 24,
27
+ "c6": 25,
28
+ "c7": 26,
29
+ "c8": 27,
30
+ "d1": 28,
31
+ "d2": 29,
32
+ "d3": 30,
33
+ "d4": 31,
34
+ "d5": 32,
35
+ "d6": 33,
36
+ "d7": 34,
37
+ "d8": 35,
38
+ "e1": 36,
39
+ "e2": 37,
40
+ "e3": 38,
41
+ "e4": 39,
42
+ "e5": 40,
43
+ "e6": 41,
44
+ "e7": 42,
45
+ "e8": 43,
46
+ "f1": 44,
47
+ "f2": 45,
48
+ "f3": 46,
49
+ "f4": 47,
50
+ "f5": 48,
51
+ "f6": 49,
52
+ "f7": 50,
53
+ "f8": 51,
54
+ "g1": 52,
55
+ "g2": 53,
56
+ "g3": 54,
57
+ "g4": 55,
58
+ "g5": 56,
59
+ "g6": 57,
60
+ "g7": 58,
61
+ "g8": 59,
62
+ "h1": 60,
63
+ "h2": 61,
64
+ "h3": 62,
65
+ "h4": 63,
66
+ "h5": 64,
67
+ "h6": 65,
68
+ "h7": 66,
69
+ "h8": 67,
70
+ "q": 68,
71
+ "r": 69,
72
+ "b": 70,
73
+ "n": 71
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  }