gagliardidf commited on
Commit
065d0f2
·
verified ·
1 Parent(s): 3d28620

Chess Challenge submission by gagliardidf

Browse files
Files changed (7) hide show
  1. README.md +26 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. special_tokens_map.json +6 -0
  5. tokenizer.py +469 -0
  6. tokenizer_config.json +50 -0
  7. vocab.json +87 -0
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
+ ---
9
+
10
+ # gagliardidf-chess-model-v2
11
+
12
+ Chess model submitted to the LLM Course Chess Challenge.
13
+
14
+ ## Submission Info
15
+
16
+ - **Submitted by**: [gagliardidf](https://huggingface.co/gagliardidf)
17
+ - **Parameters**: 836,992
18
+ - **Organization**: LLM-course
19
+
20
+ ## Model Details
21
+
22
+ - **Architecture**: Chess Transformer (GPT-style)
23
+ - **Vocab size**: 85
24
+ - **Embedding dim**: 128
25
+ - **Layers**: 4
26
+ - **Heads**: 4
config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ChessForCausalLM"
4
+ ],
5
+ "bos_token_id": 1,
6
+ "dropout": 0.1,
7
+ "dtype": "float32",
8
+ "eos_token_id": 2,
9
+ "layer_norm_epsilon": 1e-05,
10
+ "model_type": "chess_transformer",
11
+ "n_ctx": 256,
12
+ "n_embd": 128,
13
+ "n_head": 4,
14
+ "n_inner": 512,
15
+ "n_layer": 4,
16
+ "pad_token_id": 0,
17
+ "tie_weights": true,
18
+ "transformers_version": "4.57.6",
19
+ "vocab_size": 85
20
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5a9a13b659d52d0a4907503f7fe38b8ac8cd58f45682cbfb465e9422a33bbbe
3
+ size 3352376
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[BOS]",
3
+ "eos_token": "[EOS]",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Chess Tokenizer for the Chess Challenge.
3
+
4
+ This tokenizer uses 85 tokens total:
5
+ - 64 Squares: Each square (a1–h8) is one token
6
+ - 6 Pieces: P, N, B, R, Q, K
7
+ - 10 Special Indicators: Capture (x), Check (+), Checkmate (#), Kingside Castle (O-O),
8
+ Queenside Castle (O-O-O), and Promotion pieces (q, r, b, n)
9
+ - 5 Control Tokens: [PAD], [BOS], [EOS], [WHITE], [BLACK]
10
+
11
+ The dataset format uses extended UCI notation (e.g., WPe2e4, BNg8f6(x)):
12
+ - W/B prefix maps to [WHITE]/[BLACK] tokens
13
+ - Piece letter (P, N, B, R, Q, K) is tokenized
14
+ - Source and destination squares are tokenized
15
+ - Special suffixes are extracted and tokenized
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ import re
23
+ from pathlib import Path
24
+ from typing import Dict, List, Optional
25
+
26
+ from transformers import PreTrainedTokenizer
27
+
28
+
29
class ChessTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer for chess moves using a fixed 85-token vocabulary.

    Each move is decomposed into component tokens:
    - Color token ([WHITE] or [BLACK])
    - Piece token (P, N, B, R, Q, K)
    - Source square token (a1-h8)
    - Destination square token (a1-h8)
    - Optional special indicator tokens (x, +, #, O-O, O-O-O, q/r/b/n)

    Example:
        >>> tokenizer = ChessTokenizer()
        >>> tokenizer._tokenize("WPe2e4 BNg8f6(x)")
        ['[WHITE]', 'P', 'e2', 'e4', '[BLACK]', 'N', 'g8', 'f6', 'x']
        >>> [tokenizer._convert_token_to_id(t) for t in _]
        [3, 69, 38, 40, 4, 70, 60, 50, 75]
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    # Special tokens
    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    WHITE_TOKEN = "[WHITE]"
    BLACK_TOKEN = "[BLACK]"
    UNK_TOKEN = "[UNK]"

    # Token groups shared by parsing and decoding.
    _FILES = "abcdefgh"
    _RANKS = "12345678"
    _PIECES = ("P", "N", "B", "R", "Q", "K")
    _PROMOTIONS = ("q", "r", "b", "n")
    _INDICATORS = ("x", "+", "#", "O-O", "O-O-O", "q", "r", "b", "n")

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        """
        Initialize the chess tokenizer.

        Args:
            vocab_file: Path to a JSON file containing the vocabulary mapping.
            vocab: Dictionary mapping tokens to IDs (alternative to vocab_file).
            **kwargs: Additional arguments passed to PreTrainedTokenizer.
        """
        # Initialize special tokens
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # Remove any duplicate special-token entries passed through kwargs
        # to avoid "multiple values for keyword" errors when loading from disk.
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # Load or create vocabulary. Precedence: explicit dict, then file,
        # then the fixed built-in 85-token vocabulary.
        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._create_default_vocab()

        # Create reverse mapping for id -> token lookups.
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # Call parent init AFTER setting up vocab: PreTrainedTokenizer may
        # query vocab_size / convert tokens during its own __init__.
        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    def _create_default_vocab(self) -> Dict[str, int]:
        """
        Create the fixed 85-token vocabulary.

        Vocabulary structure (IDs assigned in order):
        - 5 Control Tokens: [PAD], [BOS], [EOS], [WHITE], [BLACK] (0-4)
        - 64 Square Tokens: a1, a2, ..., h8 (5-68, file-major order)
        - 6 Piece Tokens: P, N, B, R, Q, K (69-74)
        - 10 Special Indicator Tokens: x, +, #, O-O, O-O-O, q, r, b, n,
          and [UNK] (75-84)
        """
        vocab: Dict[str, int] = {}
        idx = 0

        # 5 Control Tokens
        for token in (
            self.PAD_TOKEN,
            self.BOS_TOKEN,
            self.EOS_TOKEN,
            self.WHITE_TOKEN,
            self.BLACK_TOKEN,
        ):
            vocab[token] = idx
            idx += 1

        # 64 Square Tokens (a1-h8), file-major to match vocab.json
        for file in self._FILES:
            for rank in self._RANKS:
                vocab[f"{file}{rank}"] = idx
                idx += 1

        # 6 Piece Tokens
        for piece in self._PIECES:
            vocab[piece] = idx
            idx += 1

        # 10 Special Indicator Tokens ([UNK] kept last for compatibility)
        for indicator in (
            'x',            # Capture
            '+',            # Check
            '#',            # Checkmate
            'O-O',          # Kingside castle
            'O-O-O',        # Queenside castle
            'q',            # Queen promotion
            'r',            # Rook promotion
            'b',            # Bishop promotion
            'n',            # Knight promotion
            self.UNK_TOKEN,
        ):
            vocab[indicator] = idx
            idx += 1

        return vocab

    @classmethod
    def build_vocab_from_iterator(
        cls,
        iterator,
        min_frequency: int = 1,
    ) -> "ChessTokenizer":
        """
        Build a tokenizer vocabulary from an iterator of game strings.

        Args:
            iterator: An iterator yielding game strings (space-separated moves).
            min_frequency: Minimum frequency for a token to be included
                (unused: the vocabulary is fixed).

        Returns:
            A ChessTokenizer with the fixed 85-token vocabulary.
        """
        # The vocabulary is fixed: 85 tokens
        return cls()

    @classmethod
    def build_vocab_from_dataset(
        cls,
        dataset_name: str = "dlouapre/lichess_2025-01_1M",
        split: str = "train",
        column: str = "text",
        min_frequency: int = 500,
        max_samples: Optional[int] = 100000,
    ) -> "ChessTokenizer":
        """
        Build a tokenizer vocabulary from a Hugging Face dataset.

        Args:
            dataset_name: Name of the dataset on Hugging Face Hub.
            split: Dataset split to use.
            column: Column containing the game strings.
            min_frequency: Minimum frequency for a token to be included
                (unused: the vocabulary is fixed).
            max_samples: Maximum number of samples to process (default: 100k).

        Returns:
            A ChessTokenizer with the fixed 85-token vocabulary.
        """
        # The vocabulary is fixed: 85 tokens
        return cls()

    @property
    def vocab_size(self) -> int:
        """Return the size of the vocabulary (85 tokens)."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the vocabulary as a dictionary."""
        return dict(self._vocab)

    def _parse_move(self, move: str) -> List[str]:
        """
        Parse a single move string into component tokens.

        Moves are in extended UCI format: [W|B][Piece][from_sq][to_sq][suffixes]
        Examples:
            "WPe2e4"       -> [WHITE, P, e2, e4]
            "BNg8f6(x)"    -> [BLACK, N, g8, f6, x]
            "WPe7e8=Q(+*)" -> [WHITE, P, e7, e8, q, #]
            "WKe1g1(o)"    -> [WHITE, K, e1, g1, O-O]

        Args:
            move: Move string in extended UCI format.

        Returns:
            List of tokens representing the move; [UNK] marks unparseable parts.
        """
        # A well-formed move has at least color + piece + two squares = 6 chars.
        if len(move) < 6:
            return [self.UNK_TOKEN]

        tokens: List[str] = []

        # Extract color (W or B); anything else makes the move unparseable.
        color = move[0]
        if color == 'W':
            tokens.append(self.WHITE_TOKEN)
        elif color == 'B':
            tokens.append(self.BLACK_TOKEN)
        else:
            tokens.append(self.UNK_TOKEN)
            return tokens

        # Extract piece
        piece = move[1]
        if piece in self._PIECES:
            tokens.append(piece)
        else:
            tokens.append(self.UNK_TOKEN)
            return tokens

        # Extract source and destination squares, validating each (a1-h8).
        from_sq = move[2:4]
        to_sq = move[4:6]
        for square in (from_sq, to_sq):
            if square[0] in self._FILES and square[1] in self._RANKS:
                tokens.append(square)
            else:
                tokens.append(self.UNK_TOKEN)

        # Check for castling indicators (o) or (O)
        if '(o)' in move or '(O)' in move:
            # Side is determined by the king's destination square:
            # kingside ends on g1/g8, queenside on c1/c8.
            if to_sq in ('c1', 'c8'):
                tokens.append('O-O-O')
            else:
                # g1/g8, or default to kingside if unclear
                tokens.append('O-O')
        else:
            # Check for promotion (=Q, =R, =B, =N) -> lowercase promo token
            promotion_match = re.search(r'=([QRBN])', move)
            if promotion_match:
                tokens.append(promotion_match.group(1).lower())

            # Check for capture: (x) anywhere, or a bare x in the suffix
            if '(x)' in move or 'x' in move[6:]:
                tokens.append('x')

            # Checkmate is encoded as (+*) / (x+*); plain check as (+) / (x+)
            if '(+*)' in move or '(x+*' in move:
                tokens.append('#')
            elif '(+)' in move or '(x+)' in move:
                tokens.append('+')

        return tokens

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize a string of moves into a list of tokens.

        Each move is decomposed into its component tokens:
        [WHITE/BLACK] + [Piece] + [from_sq] + [to_sq] + [special indicators...]

        Args:
            text: A string of space-separated moves in extended UCI format.

        Returns:
            List of tokens representing all moves.
        """
        tokens: List[str] = []
        for move in text.strip().split():
            tokens.extend(self._parse_move(move))
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to its ID, falling back to the [UNK] ID."""
        # Last-resort constant only triggers if [UNK] is absent from a
        # custom vocabulary; the shipped vocab maps [UNK] to 84.
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 3))

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token, falling back to [UNK]."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Convert a list of tokens back to a string representation.

        Reconstructs moves in simplified UCI format (from_sq + to_sq, plus
        "=Q"-style promotion suffix) from component tokens. The color token
        is optional in the stream: a move is anchored on its piece token.

        Args:
            tokens: List of tokens representing moves.

        Returns:
            String representation of moves in space-separated format.
        """
        # Filter out special control tokens (keep WHITE/BLACK for move boundaries)
        control_tokens = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN}
        filtered = [t for t in tokens if t not in control_tokens]

        moves: List[str] = []
        i = 0
        n = len(filtered)
        while i < n:
            # Optional color token
            if filtered[i] in (self.WHITE_TOKEN, self.BLACK_TOKEN):
                i += 1

            # Piece token anchors a move; anything else is skipped.
            if i < n and filtered[i] in self._PIECES:
                i += 1
            else:
                i += 1
                continue

            # Need both source and destination squares.
            if i + 1 >= n:
                break
            from_sq, to_sq = filtered[i], filtered[i + 1]
            if not (len(from_sq) == 2 and from_sq[0] in self._FILES
                    and from_sq[1] in self._RANKS
                    and len(to_sq) == 2 and to_sq[0] in self._FILES
                    and to_sq[1] in self._RANKS):
                i += 2
                continue
            i += 2

            # Collect trailing special indicator tokens.
            indicators: List[str] = []
            while i < n and filtered[i] in self._INDICATORS:
                indicators.append(filtered[i])
                i += 1

            # Reconstruct move string (simplified UCI format). Squares are
            # referenced directly rather than by position in a parts list,
            # so a missing color token cannot shift the indexing.
            move_str = from_sq + to_sq
            for promo in self._PROMOTIONS:
                if promo in indicators:
                    move_str += f"={promo.upper()}"
                    break
            moves.append(move_str)

        return " ".join(moves)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> tuple:
        """
        Save the vocabulary to a JSON file.

        Args:
            save_directory: Directory to save the vocabulary (created if needed).
            filename_prefix: Optional prefix for the filename.

        Returns:
            Tuple containing the path to the saved vocabulary file.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
435
+
436
+
437
def count_vocab_from_dataset(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
    max_samples: Optional[int] = 10000,
) -> Dict[str, int]:
    """
    Count token frequencies in a dataset (useful for vocabulary analysis).

    Args:
        dataset_name: Name of the dataset on Hugging Face Hub.
        split: Dataset split to use.
        column: Column containing the game strings.
        max_samples: Maximum number of samples to process.

    Returns:
        Dictionary mapping tokens to their frequencies.
    """
    from collections import Counter
    from datasets import load_dataset

    ds = load_dataset(dataset_name, split=split)

    # Optionally restrict to a prefix of the dataset.
    if max_samples is not None:
        sample_count = min(max_samples, len(ds))
        ds = ds.select(range(sample_count))

    # Whitespace-separated move strings are the "tokens" counted here.
    frequencies = Counter()
    for record in ds:
        frequencies.update(record[column].strip().split())

    return dict(frequencies)
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[BOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[EOS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "84": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.ChessTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "bos_token": "[BOS]",
43
+ "clean_up_tokenization_spaces": false,
44
+ "eos_token": "[EOS]",
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 1000000000000000019884624838656,
47
+ "pad_token": "[PAD]",
48
+ "tokenizer_class": "ChessTokenizer",
49
+ "unk_token": "[UNK]"
50
+ }
vocab.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[BOS]": 1,
4
+ "[EOS]": 2,
5
+ "[WHITE]": 3,
6
+ "[BLACK]": 4,
7
+ "a1": 5,
8
+ "a2": 6,
9
+ "a3": 7,
10
+ "a4": 8,
11
+ "a5": 9,
12
+ "a6": 10,
13
+ "a7": 11,
14
+ "a8": 12,
15
+ "b1": 13,
16
+ "b2": 14,
17
+ "b3": 15,
18
+ "b4": 16,
19
+ "b5": 17,
20
+ "b6": 18,
21
+ "b7": 19,
22
+ "b8": 20,
23
+ "c1": 21,
24
+ "c2": 22,
25
+ "c3": 23,
26
+ "c4": 24,
27
+ "c5": 25,
28
+ "c6": 26,
29
+ "c7": 27,
30
+ "c8": 28,
31
+ "d1": 29,
32
+ "d2": 30,
33
+ "d3": 31,
34
+ "d4": 32,
35
+ "d5": 33,
36
+ "d6": 34,
37
+ "d7": 35,
38
+ "d8": 36,
39
+ "e1": 37,
40
+ "e2": 38,
41
+ "e3": 39,
42
+ "e4": 40,
43
+ "e5": 41,
44
+ "e6": 42,
45
+ "e7": 43,
46
+ "e8": 44,
47
+ "f1": 45,
48
+ "f2": 46,
49
+ "f3": 47,
50
+ "f4": 48,
51
+ "f5": 49,
52
+ "f6": 50,
53
+ "f7": 51,
54
+ "f8": 52,
55
+ "g1": 53,
56
+ "g2": 54,
57
+ "g3": 55,
58
+ "g4": 56,
59
+ "g5": 57,
60
+ "g6": 58,
61
+ "g7": 59,
62
+ "g8": 60,
63
+ "h1": 61,
64
+ "h2": 62,
65
+ "h3": 63,
66
+ "h4": 64,
67
+ "h5": 65,
68
+ "h6": 66,
69
+ "h7": 67,
70
+ "h8": 68,
71
+ "P": 69,
72
+ "N": 70,
73
+ "B": 71,
74
+ "R": 72,
75
+ "Q": 73,
76
+ "K": 74,
77
+ "x": 75,
78
+ "+": 76,
79
+ "#": 77,
80
+ "O-O": 78,
81
+ "O-O-O": 79,
82
+ "q": 80,
83
+ "r": 81,
84
+ "b": 82,
85
+ "n": 83,
86
+ "[UNK]": 84
87
+ }