Vadim38 committed (verified)
Commit 756590c · 1 Parent(s): 09281e1

Chess Challenge submission by Vadim38

Files changed (4)
  1. README.md +26 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. tokenizer.py +484 -0
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ library_name: transformers
+ tags:
+ - chess
+ - llm-course
+ - chess-challenge
+ license: mit
+ ---
+
+ # chess-gpt-char-level-v7
+
+ Chess model submitted to the LLM Course Chess Challenge.
+
+ ## Submission Info
+
+ - **Submitted by**: [Vadim38](https://huggingface.co/Vadim38)
+ - **Parameters**: 997,764
+ - **Organization**: LLM-course
+
+ ## Model Details
+
+ - **Architecture**: Chess Transformer (GPT-style)
+ - **Vocab size**: 30
+ - **Embedding dim**: 128
+ - **Layers**: 7
+ - **Heads**: 8
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "architectures": [
+     "ChessForCausalLM"
+   ],
+   "bos_token_id": 28,
+   "dropout": 0.1,
+   "dtype": "float32",
+   "eos_token_id": 29,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "chess_transformer",
+   "n_ctx": 1024,
+   "n_embd": 128,
+   "n_head": 8,
+   "n_inner": 220,
+   "n_layer": 7,
+   "pad_token_id": 27,
+   "tie_weights": true,
+   "transformers_version": "4.57.6",
+   "vocab_size": 30
+ }
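
For reference, the parameter count quoted in the README (997,764) is consistent with these hyperparameters. The sketch below assumes a GPT-2-style block layout (learned position embeddings, biased linear layers, two LayerNorms per block, tied input/output embeddings); the ChessForCausalLM implementation itself is not part of this commit, so treat this as an illustration rather than the model's actual code.

```python
# Back-of-the-envelope parameter count from config.json (assumption: GPT-2-style
# layout; the ChessForCausalLM code itself is not included in this commit).
vocab_size, n_embd, n_layer, n_inner, n_ctx = 30, 128, 7, 220, 1024

embeddings = vocab_size * n_embd + n_ctx * n_embd   # token + position tables
attention = (3 * n_embd * n_embd + 3 * n_embd) + (n_embd * n_embd + n_embd)   # QKV + output proj
mlp = (n_embd * n_inner + n_inner) + (n_inner * n_embd + n_embd)              # up + down proj
layer_norms = 2 * (2 * n_embd)                       # two LayerNorms per block
block = attention + mlp + layer_norms

total = embeddings + n_layer * block + 2 * n_embd    # + final LayerNorm; lm_head is tied
print(total)  # 997764
```

At float32 this is roughly 3.99 MB of weights, in line with the 3,998,520-byte model.safetensors file below.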
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:722049aec3c84823bdd7cb2ed42a8a48961951518bc8fd79564f9b2e5a1a0507
+ size 3998520
tokenizer.py ADDED
@@ -0,0 +1,484 @@
+ from __future__ import annotations
+ import torch
+ import json
+ import os
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union
+
+ from transformers import PreTrainedTokenizer
+
+
+
+
+
+
+
+ """
+ Custom Chess Tokenizer (Character Level)
+ Compatible with HF Trainer & Evaluator (BatchEncoding support)
+ """
+
+ class BatchEncoding(dict):
+     """
+     Wraps the output dictionary so that it supports the .to(device) method.
+     """
+     def to(self, device):
+         new_obj = BatchEncoding()
+         for k, v in self.items():
+             if hasattr(v, "to"):
+                 new_obj[k] = v.to(device)
+             else:
+                 new_obj[k] = v
+         return new_obj
+
+ class ChessTokenizer:
+     def __init__(self):
+         # Static vocabulary
+         self.chars = list("abcdefgh12345678PRNBQKxoO-=") + ["<pad>", "<s>", "</s>"]
+
+         self.vocab = {ch: i for i, ch in enumerate(self.chars)}
+         self.id_to_char = {i: ch for i, ch in enumerate(self.chars)}
+
+         # Special tokens
+         self.pad_token = "<pad>"
+         self.bos_token = "<s>"
+         self.eos_token = "</s>"
+         self.unk_token = "<pad>"
+
+         self.pad_token_id = self.vocab["<pad>"]
+         self.bos_token_id = self.vocab["<s>"]
+         self.eos_token_id = self.vocab["</s>"]
+         self.vocab_size = len(self.vocab)
+
+         self.model_max_length = 1024
+         self.padding_side = "right"
+
+     @classmethod
+     def build_vocab_from_dataset(cls, *args, **kwargs):
+         return cls()
+
+     def encode(self, text):
+         return [self.vocab.get(c, self.pad_token_id) for c in text]
+
+     # --- FIX: **kwargs added so that 'skip_special_tokens' is accepted ---
+     def decode(self, token_ids, skip_special_tokens=False, **kwargs):
+         if isinstance(token_ids, torch.Tensor):
+             token_ids = token_ids.tolist()
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+
+         tokens = [self.id_to_char.get(i, "") for i in token_ids]
+         # Special tokens are always stripped, regardless of the argument
+         return "".join(tokens).replace("<pad>", "").replace("<s>", "").replace("</s>", "")
+
+     def __call__(self, text, max_length=None, padding=False, truncation=False, return_tensors=None, **kwargs):
+         # 1. Encoding
+         ids = self.encode(text)
+
+         # 2. Truncation
+         if truncation and max_length is not None:
+             ids = ids[:max_length]
+
+         # 3. Padding + mask
+         attention_mask = [1] * len(ids)
+
+         if padding == "max_length" and max_length is not None:
+             if len(ids) < max_length:
+                 pad_len = max_length - len(ids)
+                 ids = ids + [self.pad_token_id] * pad_len
+                 attention_mask = attention_mask + [0] * pad_len
+
+         # 4. Smart return type (BatchEncoding)
+         if return_tensors == "pt":
+             return BatchEncoding({
+                 "input_ids": torch.tensor([ids], dtype=torch.long),
+                 "attention_mask": torch.tensor([attention_mask], dtype=torch.long)
+             })
+
+         return {
+             "input_ids": ids,
+             "attention_mask": attention_mask
+         }
+
+     def save_pretrained(self, save_directory):
+         pass
+
+     @classmethod
+     def from_pretrained(cls, save_directory):
+         return cls()
+
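+
+ # Illustrative usage sketch (an assumption for clarity, not part of the original submission):
+ #
+ #     tok = ChessTokenizer()
+ #     batch = tok("e2e4", max_length=8, padding="max_length",
+ #                 truncation=True, return_tensors="pt")
+ #     batch["input_ids"].shape           # torch.Size([1, 8])
+ #     batch = batch.to("cpu")            # BatchEncoding forwards .to() to each tensor
+ #     tok.decode(batch["input_ids"][0])  # -> "e2e4" (padding stripped)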
+
+
+
+
+
+
+
+
+
+
+
+
+ '''"""
+ Custom Chess Tokenizer (Character Level) - Fully Compatible with HF Trainer
+ """
+
+ class ChessTokenizer:
+     def __init__(self):
+         # Static vocabulary
+         self.chars = list("abcdefgh12345678PRNBQKxoO-=") + ["<pad>", "<s>", "</s>"]
+
+         self.vocab = {ch: i for i, ch in enumerate(self.chars)}
+         self.id_to_char = {i: ch for i, ch in enumerate(self.chars)}
+
+         # Special tokens (string form)
+         self.pad_token = "<pad>"
+         self.bos_token = "<s>"
+         self.eos_token = "</s>"
+         self.unk_token = "<pad>"
+
+         # Special tokens (ID form)
+         self.pad_token_id = self.vocab["<pad>"]
+         self.bos_token_id = self.vocab["<s>"]
+         self.eos_token_id = self.vocab["</s>"]
+         self.vocab_size = len(self.vocab)
+
+         # Default configuration
+         self.model_max_length = 1024
+         self.padding_side = "right"
+
+     @classmethod
+     def build_vocab_from_dataset(cls, *args, **kwargs):
+         print("⚡ Using the char-level tokenizer (static vocabulary) ⚡")
+         return cls()
+
+     def encode(self, text):
+         return [self.vocab.get(c, self.pad_token_id) for c in text]
+
+     def decode(self, token_ids):
+         if isinstance(token_ids, torch.Tensor):
+             token_ids = token_ids.tolist()
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+
+         tokens = [self.id_to_char.get(i, "") for i in token_ids]
+         return "".join(tokens).replace("<pad>", "").replace("<s>", "").replace("</s>", "")
+
+     def __call__(self, text, max_length=None, padding=False, truncation=False, return_tensors=None, **kwargs):
+         """
+         This method is the crux of the issue. It mimics the behaviour
+         of a standard Hugging Face tokenizer (padding, truncation, tensors).
+         """
+         # 1. Raw encoding
+         ids = self.encode(text)
+
+         # 2. Truncation (cut if too long)
+         if truncation and max_length is not None:
+             ids = ids[:max_length]
+
+         # 3. Padding (fill if too short)
+         # The attention mask is built at the same time (1 for real tokens, 0 for padding)
+         attention_mask = [1] * len(ids)
+
+         if padding == "max_length" and max_length is not None:
+             if len(ids) < max_length:
+                 pad_len = max_length - len(ids)
+                 ids = ids + [self.pad_token_id] * pad_len
+                 attention_mask = attention_mask + [0] * pad_len
+
+         # 4. Conversion to PyTorch tensors
+         if return_tensors == "pt":
+             # data.py expects a batch dimension [1, seq_len] so that it can call .squeeze(0)
+             return {
+                 "input_ids": torch.tensor([ids], dtype=torch.long),
+                 "attention_mask": torch.tensor([attention_mask], dtype=torch.long)
+             }
+
+         # Fallback (plain lists)
+         return {
+             "input_ids": ids,
+             "attention_mask": attention_mask
+         }
+
+     def save_pretrained(self, save_directory):
+         pass
+
+     @classmethod
+     def from_pretrained(cls, save_directory):
+         return cls()
+ '''
+
+
+
+
+
+ """
+ Custom Chess Tokenizer for the Chess Challenge.
+
+ This tokenizer treats each move as a single token using the extended UCI notation
+ from the Lichess dataset (e.g., WPe2e4, BNg8f6).
+
+ The dataset format uses:
+ - W/B prefix for White/Black
+ - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
+ - Source and destination squares (e.g., e2e4)
+ - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
+ """
+
+
+
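+ # Illustrative reading of the notation above (assumed from the docstring's examples):
+ # "WPe2e4" = White Pawn e2->e4, "BNg8f6" = Black Knight g8->f6. The (commented-out)
+ # move-level tokenizer below splits a game string on whitespace, so each such move
+ # becomes a single token:
+ #
+ #     "WPe2e4 BPe7e5 WNg1f3".split()  # -> ["WPe2e4", "BPe7e5", "WNg1f3"]
+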
+ '''class ChessTokenizer(PreTrainedTokenizer):
+     """
+     A custom tokenizer for chess moves using extended UCI notation.
+
+     This tokenizer maps each possible chess move to a unique token ID.
+     The vocabulary is built from the training dataset to ensure all moves
+     encountered during training have a corresponding token.
+
+     Example:
+         >>> tokenizer = ChessTokenizer()
+         >>> tokenizer.encode("WPe2e4 BPe7e5")
+         [1, 42, 87, 2]  # [BOS, e2e4, e7e5, EOS]
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     # Special tokens
+     PAD_TOKEN = "[PAD]"
+     BOS_TOKEN = "[BOS]"
+     EOS_TOKEN = "[EOS]"
+     UNK_TOKEN = "[UNK]"
+
+     def __init__(
+         self,
+         vocab_file: Optional[str] = None,
+         vocab: Optional[Dict[str, int]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize the chess tokenizer.
+
+         Args:
+             vocab_file: Path to a JSON file containing the vocabulary mapping.
+             vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
+             **kwargs: Additional arguments passed to PreTrainedTokenizer.
+         """
+         # Initialize special tokens
+         self._pad_token = self.PAD_TOKEN
+         self._bos_token = self.BOS_TOKEN
+         self._eos_token = self.EOS_TOKEN
+         self._unk_token = self.UNK_TOKEN
+
+         # Remove any duplicate special-token entries passed through kwargs
+         # to avoid "multiple values for keyword" errors when loading from disk.
+         kwargs.pop("pad_token", None)
+         kwargs.pop("bos_token", None)
+         kwargs.pop("eos_token", None)
+         kwargs.pop("unk_token", None)
+
+         # Load or create vocabulary
+         if vocab is not None:
+             self._vocab = vocab
+         elif vocab_file is not None and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self._vocab = json.load(f)
+         else:
+             # Create a minimal vocabulary with just special tokens
+             # The full vocabulary should be built from the dataset
+             self._vocab = self._create_default_vocab()
+
+         # Create reverse mapping
+         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+
+         # Call parent init AFTER setting up vocab
+         super().__init__(
+             pad_token=self._pad_token,
+             bos_token=self._bos_token,
+             eos_token=self._eos_token,
+             unk_token=self._unk_token,
+             **kwargs,
+         )
+
+     def _create_default_vocab(self) -> Dict[str, int]:
+         """
+         Create a minimal default vocabulary with just special tokens.
+
+         For the full vocabulary, use `build_vocab_from_dataset()`.
+         This minimal vocab is just a placeholder - you should build from data.
+         """
+         special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens)}
+         return vocab
+
+     @classmethod
+     def build_vocab_from_iterator(
+         cls,
+         iterator,
+         min_frequency: int = 1,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from an iterator of game strings.
+
+         Args:
+             iterator: An iterator yielding game strings (space-separated moves).
+             min_frequency: Minimum frequency for a token to be included.
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from collections import Counter
+
+         token_counts = Counter()
+
+         for game in iterator:
+             moves = game.strip().split()
+             token_counts.update(moves)
+
+         # Filter by frequency
+         tokens = [
+             token for token, count in token_counts.items()
+             if count >= min_frequency
+         ]
+
+         # Sort for reproducibility
+         tokens = sorted(tokens)
+
+         # Build vocabulary
+         special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+         vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}
+
+         return cls(vocab=vocab)
+
+     @classmethod
+     def build_vocab_from_dataset(
+         cls,
+         dataset_name: str = "dlouapre/lichess_2025-01_1M",
+         split: str = "train",
+         column: str = "text",
+         min_frequency: int = 500,
+         max_samples: Optional[int] = 100000,
+     ) -> "ChessTokenizer":
+         """
+         Build a tokenizer vocabulary from a Hugging Face dataset.
+
+         Args:
+             dataset_name: Name of the dataset on Hugging Face Hub.
+             split: Dataset split to use.
+             column: Column containing the game strings.
+             min_frequency: Minimum frequency for a token to be included (default: 500).
+             max_samples: Maximum number of samples to process (default: 100k).
+
+         Returns:
+             A ChessTokenizer with the built vocabulary.
+         """
+         from datasets import load_dataset
+
+         dataset = load_dataset(dataset_name, split=split)
+
+         if max_samples is not None:
+             dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+         def game_iterator():
+             for example in dataset:
+                 yield example[column]
+
+         return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
+
+     @property
+     def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
+         return len(self._vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """Return the vocabulary as a dictionary."""
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize a string of moves into a list of tokens.
+
+         Args:
+             text: A string of space-separated moves.
+
+         Returns:
+             List of move tokens.
+         """
+         return text.strip().split()
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID."""
+         return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token."""
+         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Convert a list of tokens back to a string."""
+         # Filter out special tokens for cleaner output
+         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+         return " ".join(t for t in tokens if t not in special)
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> tuple:
+         """
+         Save the vocabulary to a JSON file.
+
+         Args:
+             save_directory: Directory to save the vocabulary.
+             filename_prefix: Optional prefix for the filename.
+
+         Returns:
+             Tuple containing the path to the saved vocabulary file.
+         """
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)'''
+
+
+ def count_vocab_from_dataset(
+     dataset_name: str = "dlouapre/lichess_2025-01_1M",
+     split: str = "train",
+     column: str = "text",
+     max_samples: Optional[int] = 10000,
+ ) -> Dict[str, int]:
+     """
+     Count token frequencies in a dataset (useful for vocabulary analysis).
+
+     Args:
+         dataset_name: Name of the dataset on Hugging Face Hub.
+         split: Dataset split to use.
+         column: Column containing the game strings.
+         max_samples: Maximum number of samples to process.
+
+     Returns:
+         Dictionary mapping tokens to their frequencies.
+     """
+     from collections import Counter
+     from datasets import load_dataset
+
+     dataset = load_dataset(dataset_name, split=split)
+
+     if max_samples is not None:
+         dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+     token_counts = Counter()
+
+     for example in dataset:
+         moves = example[column].strip().split()
+         token_counts.update(moves)
+
+     return dict(token_counts)
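+
+
+ # Illustrative usage of the helper above (a sketch, not part of the original submission):
+ #
+ #     counts = count_vocab_from_dataset(max_samples=1000)
+ #     top = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
+ #     print(len(counts), top)   # number of distinct move tokens and the 10 most frequent ones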