totobobo1111 committed on
Commit 82d1bb5 · verified · 1 Parent(s): e073dd3

Chess Challenge submission by totobobo1111

Files changed (7)
  1. README.md +26 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. special_tokens_map.json +6 -0
  5. tokenizer.py +190 -0
  6. tokenizer_config.json +50 -0
  7. vocab.json +80 -0
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ library_name: transformers
+ tags:
+ - chess
+ - llm-course
+ - chess-challenge
+ license: mit
+ ---
+
+ # chess-troy
+
+ Chess model submitted to the LLM Course Chess Challenge.
+
+ ## Submission Info
+
+ - **Submitted by**: [totobobo1111](https://huggingface.co/totobobo1111)
+ - **Parameters**: 923,472
+ - **Organization**: LLM-course
+
+ ## Model Details
+
+ - **Architecture**: Chess Transformer (GPT-style)
+ - **Vocab size**: 78
+ - **Embedding dim**: 132
+ - **Layers**: 5
+ - **Heads**: 4
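
A minimal sketch of loading the submitted tokenizer from the Hub, assuming the repo id `LLM-course/chess-troy` (inferred from the organization and model name above) and that inputs follow the `[side][piece][from][to]` move notation expected by `tokenizer.py`; neither assumption is confirmed by this commit.

```python
from transformers import AutoTokenizer

# trust_remote_code=True lets AutoTokenizer import ChessTokenizer from
# tokenizer.py via the auto_map entry in tokenizer_config.json.
# "LLM-course/chess-troy" is an assumed repo id, not stated in this commit.
tok = AutoTokenizer.from_pretrained("LLM-course/chess-troy", trust_remote_code=True)

ids = tok("WPe2e4 BPe7e5 WNg1f3")["input_ids"]
print(tok.convert_ids_to_tokens(ids))
# e.g. ['[W]', '[P]', '[e2]', '[e4]', '[B]', '[P]', '[e7]', '[e5]', '[W]', '[N]', '[g1]', '[f3]']
```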
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "architectures": [
+     "ChessForCausalLM"
+   ],
+   "bos_token_id": 1,
+   "dropout": 0.1,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "chess_transformer",
+   "n_ctx": 256,
+   "n_embd": 132,
+   "n_head": 4,
+   "n_inner": 396,
+   "n_layer": 5,
+   "pad_token_id": 0,
+   "tie_weights": true,
+   "transformers_version": "4.57.6",
+   "vocab_size": 78
+ }
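
As a sanity check, the README's parameter count can be reproduced from these config values under the assumption of a standard GPT-2-style block (LayerNorm, fused QKV attention, two-layer MLP, all with biases) and a weight-tied LM head (`tie_weights: true`); the actual `ChessForCausalLM` code is not part of this commit, so the layout below is an assumption.

```python
# Back-of-the-envelope parameter count from config.json (assumed GPT-2-style block).
n_embd, n_inner, n_layer, n_ctx, vocab_size = 132, 396, 5, 256, 78

embeddings = vocab_size * n_embd + n_ctx * n_embd       # token + position tables
per_layer = (
    2 * n_embd                                          # ln_1 (weight + bias)
    + (n_embd * 3 * n_embd + 3 * n_embd)                # fused QKV projection
    + (n_embd * n_embd + n_embd)                        # attention output projection
    + 2 * n_embd                                        # ln_2
    + (n_embd * n_inner + n_inner)                      # MLP up-projection
    + (n_inner * n_embd + n_embd)                       # MLP down-projection
)
total = embeddings + n_layer * per_layer + 2 * n_embd   # + final LayerNorm; LM head is tied
print(total)  # 923472 -- matches the README's "Parameters: 923,472"
```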
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e081e90c1b60660d8f3f6149216c53601f447a30e215b0ba0643796b18304d8
+ size 3699312
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[BOS]",
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer.py ADDED
@@ -0,0 +1,190 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ from typing import Dict, List, Optional
+ import re
+
+ from transformers import PreTrainedTokenizer
+
+
+ class ChessTokenizer(PreTrainedTokenizer):
+     """
+     Chess tokenizer with structured move tokens:
+     Each move is split into: [side][piece][from][to][suffixes].
+     Example:
+         "WPe2e4 BNg8f6x+" -> [W][P][e2][e4] [B][N][g8][f6][x][+]
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     # Special tokens
+     PAD_TOKEN = "[PAD]"
+     BOS_TOKEN = "[BOS]"
+     EOS_TOKEN = "[EOS]"
+     UNK_TOKEN = "[UNK]"
+
+     # A move is [side][piece][from-square][to-square] followed by optional
+     # suffixes (capture, check, mate, promotion).
+     MOVE_RE = re.compile(
+         r"^(?P<side>[WB])"
+         r"(?P<piece>[PNBRQK])"
+         r"(?P<src>[a-h][1-8])"
+         r"(?P<dst>[a-h][1-8])"
+         r"(?P<suffix>.*)$"
+     )
+
+     def __init__(
+         self,
+         vocab_file: Optional[str] = None,
+         vocab: Optional[Dict[str, int]] = None,
+         **kwargs,
+     ):
+         self._pad_token = self.PAD_TOKEN
+         self._bos_token = self.BOS_TOKEN
+         self._eos_token = self.EOS_TOKEN
+         self._unk_token = self.UNK_TOKEN
+
+         # Drop any special-token kwargs so they cannot conflict with the
+         # class defaults passed to super().__init__ below.
+         kwargs.pop("pad_token", None)
+         kwargs.pop("bos_token", None)
+         kwargs.pop("eos_token", None)
+         kwargs.pop("unk_token", None)
+
+         # Load or create vocab
+         if vocab is not None:
+             self._vocab = vocab
+         elif vocab_file is not None and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self._vocab = json.load(f)
+         else:
+             self._vocab = self._create_default_vocab()
+
+         # Reverse mapping
+         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+
+         super().__init__(
+             pad_token=self._pad_token,
+             bos_token=self._bos_token,
+             eos_token=self._eos_token,
+             unk_token=self._unk_token,
+             **kwargs,
+         )
+
+     def _create_default_vocab(self) -> Dict[str, int]:
+         special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+
+         sides = ["[W]", "[B]"]
+         pieces = ["[P]", "[N]", "[B]", "[R]", "[Q]", "[K]"]
+         squares = [f"[{f}{r}]" for f in "abcdefgh" for r in "12345678"]
+         suffixes = ["[x]", "[+]", "[#]", "[O-O]", "[O-O-O]",
+                     "[prom_Q]", "[prom_R]", "[prom_B]", "[prom_N]"]
+
+         vocab_list = special + sides + pieces + squares + suffixes
+         return {tok: i for i, tok in enumerate(vocab_list)}
+
+     @classmethod
+     def build_vocab_from_iterator(cls, iterator, min_frequency: int = 1) -> "ChessTokenizer":
+         from collections import Counter
+
+         token_counts = Counter()
+         tokenizer = cls()
+
+         for game in iterator:
+             tokens = tokenizer._tokenize(game)
+             token_counts.update(tokens)
+
+         special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+
+         # Keep tokens meeting the frequency threshold; exclude special tokens
+         # (e.g. [UNK] emitted for unparsable moves) so their ids are not duplicated.
+         tokens = [t for t, c in token_counts.items() if c >= min_frequency and t not in special]
+         tokens = sorted(tokens)
+
+         vocab = {tok: i for i, tok in enumerate(special + tokens)}
+
+         return cls(vocab=vocab)
+
+     @classmethod
+     def build_vocab_from_dataset(
+         cls,
+         dataset_name: str = "dlouapre/lichess_2025-01_1M",
+         split: str = "train",
+         column: str = "text",
+         min_frequency: int = 500,
+         max_samples: Optional[int] = 100000,
+     ) -> "ChessTokenizer":
+         from datasets import load_dataset
+
+         dataset = load_dataset(dataset_name, split=split)
+         if max_samples is not None:
+             dataset = dataset.select(range(min(max_samples, len(dataset))))
+
+         def game_iterator():
+             for example in dataset:
+                 yield example[column]
+
+         return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self._vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:
+         tokens: List[str] = []
+
+         moves = text.strip().split()
+         for move in moves:
+             # Castling (queenside checked first, since "O-O" is a substring of "O-O-O")
+             if "O-O-O" in move:
+                 tokens.append("[W]" if move.startswith("W") else "[B]")
+                 tokens.append("[O-O-O]")
+                 continue
+             if "O-O" in move:
+                 tokens.append("[W]" if move.startswith("W") else "[B]")
+                 tokens.append("[O-O]")
+                 continue
+
+             m = self.MOVE_RE.match(move)
+             if not m:
+                 tokens.append(self.UNK_TOKEN)
+                 continue
+
+             tokens.append(f"[{m.group('side')}]")
+             tokens.append(f"[{m.group('piece')}]")
+             tokens.append(f"[{m.group('src')}]")
+             tokens.append(f"[{m.group('dst')}]")
+
+             suffix = m.group("suffix")
+             if "x" in suffix:
+                 tokens.append("[x]")
+             if "+" in suffix:
+                 tokens.append("[+]")
+             if "#" in suffix or "*" in suffix:  # checkmate marker
+                 tokens.append("[#]")
+             if "=" in suffix:
+                 promo = suffix.split("=")[-1].upper()
+                 tokens.append(f"[prom_{promo}]")
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
+         return " ".join(t for t in tokens if t not in special)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+         return (vocab_file,)
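
A quick local check of the tokenizer above, assuming the file is saved as `tokenizer.py` next to the script; the sample moves are an illustrative guess at the `[side][piece][from][to]` notation, not taken from the training data.

```python
from tokenizer import ChessTokenizer

tok = ChessTokenizer()  # no vocab file given, so it falls back to _create_default_vocab()

tokens = tok.tokenize("WPe2e4 BPe7e5 WNg1f3 BNb8c6")
print(tokens)   # each move expands to side, piece, from-square and to-square tokens

ids = tok.convert_tokens_to_ids(tokens)
print(ids)
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(ids)))
```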
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.ChessTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "ChessTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "[PAD]": 0,
+   "[BOS]": 1,
+   "[EOS]": 2,
+   "[UNK]": 3,
+   "[#]": 4,
+   "[+]": 5,
+   "[B]": 6,
+   "[K]": 7,
+   "[N]": 8,
+   "[P]": 9,
+   "[Q]": 10,
+   "[R]": 11,
+   "[W]": 12,
+   "[a1]": 13,
+   "[a2]": 14,
+   "[a3]": 15,
+   "[a4]": 16,
+   "[a5]": 17,
+   "[a6]": 18,
+   "[a7]": 19,
+   "[a8]": 20,
+   "[b1]": 21,
+   "[b2]": 22,
+   "[b3]": 23,
+   "[b4]": 24,
+   "[b5]": 25,
+   "[b6]": 26,
+   "[b7]": 27,
+   "[b8]": 28,
+   "[c1]": 29,
+   "[c2]": 30,
+   "[c3]": 31,
+   "[c4]": 32,
+   "[c5]": 33,
+   "[c6]": 34,
+   "[c7]": 35,
+   "[c8]": 36,
+   "[d1]": 37,
+   "[d2]": 38,
+   "[d3]": 39,
+   "[d4]": 40,
+   "[d5]": 41,
+   "[d6]": 42,
+   "[d7]": 43,
+   "[d8]": 44,
+   "[e1]": 45,
+   "[e2]": 46,
+   "[e3]": 47,
+   "[e4]": 48,
+   "[e5]": 49,
+   "[e6]": 50,
+   "[e7]": 51,
+   "[e8]": 52,
+   "[f1]": 53,
+   "[f2]": 54,
+   "[f3]": 55,
+   "[f4]": 56,
+   "[f5]": 57,
+   "[f6]": 58,
+   "[f7]": 59,
+   "[f8]": 60,
+   "[g1]": 61,
+   "[g2]": 62,
+   "[g3]": 63,
+   "[g4]": 64,
+   "[g5]": 65,
+   "[g6]": 66,
+   "[g7]": 67,
+   "[g8]": 68,
+   "[h1]": 69,
+   "[h2]": 70,
+   "[h3]": 71,
+   "[h4]": 72,
+   "[h5]": 73,
+   "[h6]": 74,
+   "[h7]": 75,
+   "[h8]": 76,
+   "[x]": 77
+ }
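
The ids above should stay consistent with the special-token ids and declared `vocab_size` in `config.json`; below is a small consistency check, assuming the files from this commit are in the working directory.

```python
import json

# Cross-check vocab.json against config.json from this commit.
with open("vocab.json") as f:
    vocab = json.load(f)
with open("config.json") as f:
    cfg = json.load(f)

assert vocab["[PAD]"] == cfg["pad_token_id"] == 0
assert vocab["[BOS]"] == cfg["bos_token_id"] == 1
assert vocab["[EOS]"] == cfg["eos_token_id"] == 2
assert len(vocab) == cfg["vocab_size"] == 78
print("vocab.json and config.json agree on special tokens and vocab size")
```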