Tanguy85 commited on
Commit
df324be
·
verified ·
1 Parent(s): 643d192

Chess Challenge submission by Tanguy85

Browse files
Files changed (7) hide show
  1. README.md +26 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. special_tokens_map.json +6 -0
  5. tokenizer.py +327 -0
  6. tokenizer_config.json +51 -0
  7. vocab.json +75 -0
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
+ ---
9
+
10
+ # chess-carlsen
11
+
12
+ Chess model submitted to the LLM Course Chess Challenge.
13
+
14
+ ## Submission Info
15
+
16
+ - **Submitted by**: [Tanguy85](https://huggingface.co/Tanguy85)
17
+ - **Parameters**: 982,196
18
+ - **Organization**: LLM-course
19
+
20
+ ## Model Details
21
+
22
+ - **Architecture**: Chess Transformer (GPT-style)
23
+ - **Vocab size**: 73
24
+ - **Embedding dim**: 128
25
+ - **Layers**: 6
26
+ - **Heads**: 8
config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ChessForCausalLM"
4
+ ],
5
+ "bos_token_id": 1,
6
+ "dropout": 0.05,
7
+ "dtype": "float32",
8
+ "eos_token_id": 2,
9
+ "layer_norm_epsilon": 1e-05,
10
+ "model_type": "chess_transformer",
11
+ "n_ctx": 256,
12
+ "n_embd": 128,
13
+ "n_head": 8,
14
+ "n_inner": 350,
15
+ "n_layer": 6,
16
+ "pad_token_id": 0,
17
+ "tie_weights": true,
18
+ "transformers_version": "4.57.3",
19
+ "vocab_size": 73
20
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5cac79454b4a4b53f214becc812557ab985c17474cf338a657ae8384ce3d91d
3
+ size 3935232
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[BOS]",
3
+ "eos_token": "[EOS]",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Chess Tokenizer for the Chess Challenge.
3
+
4
+ This tokenizer supports TWO tokenization modes:
5
+
6
+ 1) tokenization_mode="move" (original)
7
+ - Each move is a single token using the extended UCI notation
8
+ from the Lichess dataset (e.g., WPe2e4, BNg8f6, WPe7e8=Q(x+), ...).
9
+ - Vocabulary is usually built from the dataset (frequency threshold).
10
+
11
+ 2) tokenization_mode="uci_square" (recommended for good legal-move performance with small vocab)
12
+ - Each move is decomposed into 3 tokens:
13
+ [from_square, to_square, promotion_or_-]
14
+ Example:
15
+ "WPe2e4" -> ["e2", "e4", "-"]
16
+ "WPe7e8=Q(+)" -> ["e7", "e8", "q"]
17
+ - Fixed vocabulary that can express ANY UCI move:
18
+ specials (4) + squares (64) + promo tokens (5) = 73 tokens.
19
+
20
+ Why uci_square helps:
21
+ - You can keep vocab tiny (70-150 range) WITHOUT losing expressivity,
22
+ so the model can still output any move.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import os
29
+ import re
30
+ from typing import Dict, List, Optional
31
+
32
+ from transformers import PreTrainedTokenizer
33
+
34
+
35
class ChessTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer for chess moves.

    Two modes are supported, selected via the ``tokenization_mode`` kwarg:

    - "move": each extended-UCI dataset token (e.g. "WPe2e4") is one token.
    - "uci_square": each move is split into three tokens
      [from_square, to_square, promotion_or_"-"], so a tiny fixed
      vocabulary (73 entries) can express any UCI move.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    # Special tokens
    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    # Compiled once: a board square like "e4". _tokenize runs this per move
    # token, so hoisting the pattern avoids recompiling in the hot path.
    _SQUARE_RE = re.compile(r"[a-h][1-8]")

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        """
        Initialize the chess tokenizer.

        Args:
            vocab_file: Path to a JSON file containing the vocabulary mapping.
            vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
            kwargs:
                - tokenization_mode: "move" (default) or "uci_square"
                - plus usual HF tokenizer kwargs

        Raises:
            ValueError: if ``tokenization_mode`` is neither "move" nor
                "uci_square".
        """
        # Initialize special tokens before super().__init__ so the parent
        # sees them during its own setup.
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # Read tokenization_mode from kwargs (and keep it for save/load)
        tokenization_mode = kwargs.pop("tokenization_mode", "move")
        if tokenization_mode not in ("move", "uci_square"):
            raise ValueError(f"Unknown tokenization_mode={tokenization_mode!r}")
        self.tokenization_mode = tokenization_mode

        # Remove any duplicate special-token entries passed through kwargs
        # to avoid "multiple values for keyword" errors when loading from disk
        # (tokenizer_config.json also carries these keys).
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # Load or create vocabulary
        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            # Minimal fallback with just the special tokens; build a real
            # vocab with build_vocab_from_dataset / build_uci_square_vocab.
            self._vocab = self._create_default_vocab()

        # Create reverse mapping (id -> token) for decoding
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # Ensure tokenization_mode is saved in tokenizer_config.json
        kwargs["tokenization_mode"] = self.tokenization_mode

        # Call parent init AFTER setting up vocab — the parent may query
        # vocab_size / get_vocab while registering the special tokens.
        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    def _create_default_vocab(self) -> Dict[str, int]:
        """
        Return a minimal default vocabulary containing only the four
        special tokens, with ids 0..3 in PAD/BOS/EOS/UNK order.
        """
        special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        vocab = {token: idx for idx, token in enumerate(special_tokens)}
        return vocab

    @classmethod
    def build_vocab_from_iterator(
        cls,
        iterator,
        min_frequency: int = 1,
    ) -> "ChessTokenizer":
        """
        Build a "move" tokenizer vocabulary from an iterator of game strings.

        Args:
            iterator: yields game strings (space-separated moves).
            min_frequency: minimum frequency for a token to be included.

        Returns:
            ChessTokenizer(tokenization_mode="move") with the built vocabulary.
        """
        from collections import Counter

        token_counts = Counter()

        for game in iterator:
            moves = game.strip().split()
            token_counts.update(moves)

        # Sorted for a deterministic token -> id assignment across runs.
        tokens = [token for token, count in token_counts.items() if count >= min_frequency]
        tokens = sorted(tokens)

        special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
        vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}

        return cls(vocab=vocab, tokenization_mode="move")

    @classmethod
    def build_vocab_from_dataset(
        cls,
        dataset_name: str = "dlouapre/lichess_2025-01_1M",
        split: str = "train",
        column: str = "text",
        min_frequency: int = 500,
        max_samples: Optional[int] = 100000,
    ) -> "ChessTokenizer":
        """
        Build a "move" tokenizer vocabulary from a Hugging Face dataset.

        Args:
            dataset_name: dataset on HF Hub.
            split: dataset split.
            column: column containing game strings.
            min_frequency: minimum frequency for a token to be included.
            max_samples: max number of samples to process (None = all).

        Returns:
            ChessTokenizer(tokenization_mode="move") with the built vocabulary.
        """
        from datasets import load_dataset

        dataset = load_dataset(dataset_name, split=split)

        if max_samples is not None:
            dataset = dataset.select(range(min(max_samples, len(dataset))))

        def game_iterator():
            for example in dataset:
                yield example[column]

        return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)

    @classmethod
    def build_uci_square_vocab(cls) -> "ChessTokenizer":
        """
        Build a fixed tiny vocab that can express ANY UCI move using 3 tokens:
        [from_square, to_square, promotion_or_-].

        Vocab:
            - 4 specials
            - 64 squares (a1..h8)
            - 5 promo tokens: "-", "q", "r", "b", "n"
        Total = 73 tokens.
        """
        special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]

        files = "abcdefgh"
        ranks = "12345678"
        # Rank-major order (a1, b1, ..., h1, a2, ...) to match vocab.json.
        squares = [f"{f}{r}" for r in ranks for f in files]  # 64

        promo = ["-", "q", "r", "b", "n"]  # 5

        vocab = {tok: i for i, tok in enumerate(special + squares + promo)}
        return cls(vocab=vocab, tokenization_mode="uci_square")

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize a string.

        - mode="move": split on spaces (original dataset tokens like "WPe2e4").
        - mode="uci_square": each dataset move token -> [from_sq, to_sq, promo_or_-]
          Example: "WPe2e4"   -> ["e2", "e4", "-"]
                   "WPe7e8=Q" -> ["e7", "e8", "q"]
        """
        tokens = text.strip().split()

        if self.tokenization_mode != "uci_square":
            return tokens

        out: List[str] = []
        for tok in tokens:
            # Keep special tokens (and any already-known token) as-is
            if tok in self._vocab:
                out.append(tok)
                continue

            # Typical dataset format:
            #   [W|B][Piece][from_sq][to_sq]... possibly "(x)" "(+)" "(o)" "=Q" etc.
            # Examples:
            #   WPe2e4
            #   BNg8f6
            #   WPe7e8=Q(+)
            #   WPe5d6(x)
            if len(tok) >= 6 and tok[0] in ("W", "B"):
                from_sq = tok[2:4]
                to_sq = tok[4:6]

                if self._SQUARE_RE.fullmatch(from_sq) and self._SQUARE_RE.fullmatch(to_sq):
                    promo = "-"
                    if "=" in tok:
                        i = tok.index("=")
                        if i + 1 < len(tok):
                            p = tok[i + 1].lower()
                            if p in ("q", "r", "b", "n"):
                                promo = p
                    out.extend([from_sq, to_sq, promo])
                    continue

            # Fallback: find two squares anywhere in the token.
            # BUGFIX: only look for a promotion letter AFTER the destination
            # square — scanning the whole token misread square file letters
            # "b"/"n" (e.g. plain-UCI "b1c3" or "g1f3") as promotions.
            matches = list(self._SQUARE_RE.finditer(tok))
            if len(matches) >= 2:
                promo = "-"
                tail = tok[matches[1].end():]
                m = re.search(r"=?([qrbnQRBN])", tail)
                if m:
                    promo = m.group(1).lower()
                out.extend([matches[0].group(0), matches[1].group(0), promo])
            else:
                out.append(self.UNK_TOKEN)

        return out

    def _convert_token_to_id(self, token: str) -> int:
        # Unknown tokens map to [UNK]; id 0 is a last-resort default if the
        # vocab somehow lacks [UNK].
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Filter out special tokens for cleaner output
        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        return " ".join(t for t in tokens if t not in special)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> tuple:
        """
        Write the vocabulary to ``<save_directory>/[<prefix>-]vocab.json``
        and return the file path as a 1-tuple (HF convention).
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
302
+
303
+
304
def count_vocab_from_dataset(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
    max_samples: Optional[int] = 10000,
) -> Dict[str, int]:
    """
    Count token frequencies in a dataset (useful for vocabulary analysis).

    Args:
        dataset_name: dataset on the HF Hub.
        split: dataset split to load.
        column: column holding the space-separated game strings.
        max_samples: cap on the number of rows scanned (None = all).

    Returns:
        Plain dict mapping each move token to its frequency.
    """
    from collections import Counter
    from datasets import load_dataset

    ds = load_dataset(dataset_name, split=split)

    if max_samples is not None:
        limit = min(max_samples, len(ds))
        ds = ds.select(range(limit))

    # Single pass: feed every token from every row straight into the Counter.
    frequencies = Counter(
        token
        for example in ds
        for token in example[column].strip().split()
    )
    return dict(frequencies)
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[BOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[EOS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.ChessTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "bos_token": "[BOS]",
43
+ "clean_up_tokenization_spaces": false,
44
+ "eos_token": "[EOS]",
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 1000000000000000019884624838656,
47
+ "pad_token": "[PAD]",
48
+ "tokenization_mode": "uci_square",
49
+ "tokenizer_class": "ChessTokenizer",
50
+ "unk_token": "[UNK]"
51
+ }
vocab.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[BOS]": 1,
4
+ "[EOS]": 2,
5
+ "[UNK]": 3,
6
+ "a1": 4,
7
+ "b1": 5,
8
+ "c1": 6,
9
+ "d1": 7,
10
+ "e1": 8,
11
+ "f1": 9,
12
+ "g1": 10,
13
+ "h1": 11,
14
+ "a2": 12,
15
+ "b2": 13,
16
+ "c2": 14,
17
+ "d2": 15,
18
+ "e2": 16,
19
+ "f2": 17,
20
+ "g2": 18,
21
+ "h2": 19,
22
+ "a3": 20,
23
+ "b3": 21,
24
+ "c3": 22,
25
+ "d3": 23,
26
+ "e3": 24,
27
+ "f3": 25,
28
+ "g3": 26,
29
+ "h3": 27,
30
+ "a4": 28,
31
+ "b4": 29,
32
+ "c4": 30,
33
+ "d4": 31,
34
+ "e4": 32,
35
+ "f4": 33,
36
+ "g4": 34,
37
+ "h4": 35,
38
+ "a5": 36,
39
+ "b5": 37,
40
+ "c5": 38,
41
+ "d5": 39,
42
+ "e5": 40,
43
+ "f5": 41,
44
+ "g5": 42,
45
+ "h5": 43,
46
+ "a6": 44,
47
+ "b6": 45,
48
+ "c6": 46,
49
+ "d6": 47,
50
+ "e6": 48,
51
+ "f6": 49,
52
+ "g6": 50,
53
+ "h6": 51,
54
+ "a7": 52,
55
+ "b7": 53,
56
+ "c7": 54,
57
+ "d7": 55,
58
+ "e7": 56,
59
+ "f7": 57,
60
+ "g7": 58,
61
+ "h7": 59,
62
+ "a8": 60,
63
+ "b8": 61,
64
+ "c8": 62,
65
+ "d8": 63,
66
+ "e8": 64,
67
+ "f8": 65,
68
+ "g8": 66,
69
+ "h8": 67,
70
+ "-": 68,
71
+ "q": 69,
72
+ "r": 70,
73
+ "b": 71,
74
+ "n": 72
75
+ }