CHU-ZP committed
Commit 24c4a71 · verified · 1 Parent(s): 5d4b393

Chess Challenge submission by CHU-ZP

Files changed (7)
  1. README.md +26 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. special_tokens_map.json +6 -0
  5. tokenizer.py +150 -0
  6. tokenizer_config.json +50 -0
  7. vocab.json +1 -0
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ library_name: transformers
+ tags:
+ - chess
+ - llm-course
+ - chess-challenge
+ license: mit
+ ---
+
+ # chess-bot-v3
+
+ Chess model submitted to the LLM Course Chess Challenge.
+
+ ## Submission Info
+
+ - **Submitted by**: [CHU-ZP](https://huggingface.co/CHU-ZP)
+ - **Parameters**: 932,720
+ - **Organization**: LLM-course
+
+ ## Model Details
+
+ - **Architecture**: Chess Transformer (GPT-style)
+ - **Vocab size**: 81
+ - **Embedding dim**: 128
+ - **Layers**: 6
+ - **Heads**: 8
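
For reference, a minimal loading sketch. The repo id `LLM-course/chess-bot-v3` is assumed from the organization and model name above, and `trust_remote_code=True` is needed because `tokenizer_config.json` maps `AutoTokenizer` to the custom `tokenizer.ChessTokenizer`. The `ChessForCausalLM` model class is not part of this commit, so loading the weights presumably requires the challenge's own modeling code:

```python
from transformers import AutoTokenizer

# Assumed repo id, composed from the organization and model name above.
repo_id = "LLM-course/chess-bot-v3"

# trust_remote_code resolves the auto_map entry in tokenizer_config.json
# to the tokenizer.ChessTokenizer class shipped with this commit.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

print(tokenizer.tokenize("WPe2e4 BPe7e5"))
# ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
```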
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "architectures": [
+     "ChessForCausalLM"
+   ],
+   "bos_token_id": 1,
+   "dropout": 0.1,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "chess_transformer",
+   "n_ctx": 512,
+   "n_embd": 128,
+   "n_head": 8,
+   "n_inner": 296,
+   "n_layer": 6,
+   "pad_token_id": 0,
+   "tie_weights": true,
+   "transformers_version": "4.57.6",
+   "vocab_size": 81
+ }
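
The README's 932,720 parameter figure is consistent with these hyperparameters under a GPT-2-style block layout with tied embeddings. One plausible breakdown (an assumption; the actual `ChessForCausalLM` implementation is not in this diff):

```python
n_embd, n_inner, n_layer, n_ctx, vocab = 128, 296, 6, 512, 81

tok_emb = vocab * n_embd                      # 10,368 (tied with the LM head)
pos_emb = n_ctx * n_embd                      # 65,536
attn  = n_embd * 3 * n_embd + 3 * n_embd      # QKV projection: 49,536
attn += n_embd * n_embd + n_embd              # output projection: 16,512
mlp   = n_embd * n_inner + n_inner            # up projection: 38,184
mlp  += n_inner * n_embd + n_embd             # down projection: 38,016
norms = 2 * 2 * n_embd                        # two LayerNorms per block: 512
per_layer = attn + mlp + norms                # 142,760

total = tok_emb + pos_emb + n_layer * per_layer + 2 * n_embd  # + final LayerNorm
print(total)  # 932720 -- matches the README
```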
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c22888a9e80379921956d8cf0b212e25be081d5b466d1e8f4022fc4d0abc235
+ size 3737320
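
The pointer's byte size also lines up with the parameter count: 932,720 float32 weights plus a few kilobytes of safetensors header:

```python
params, file_size = 932_720, 3_737_320
print(params * 4)              # 3730880 bytes of float32 weights
print(file_size - params * 4)  # 6440 bytes left for safetensors header/metadata
```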
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[BOS]",
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer.py ADDED
@@ -0,0 +1,150 @@
+ """
+ Custom atomic chess tokenizer for the Chess Challenge.
+ Strategy: component-level tokenization (W, P, e2, e4) to keep the vocabulary small.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ from typing import Dict, List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer
+
+
+ class ChessTokenizer(PreTrainedTokenizer):
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(self, vocab_file: Optional[str] = None, **kwargs):
+         # 1. Define the atomic vocabulary.
+         self.special_tokens = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
+         self.colors = ["W", "B"]
+         self.pieces = ["P", "N", "B", "R", "Q", "K"]
+         self.squares = [f"{c}{r}" for c in "abcdefgh" for r in range(1, 9)]  # a1 ... h8
+         self.suffixes = ["x", "+", "#", "=", "O-O", "O-O-O"]  # captures, checks, promotion, castling
+
+         # 2. Merge all tokens. Note that "B" appears in both colors and
+         # pieces, so the dict comprehension below keeps only its second
+         # index and id 5 ends up unassigned (visible in vocab.json).
+         all_tokens = self.special_tokens + self.colors + self.pieces + self.squares + self.suffixes
+
+         # 3. Build the in-memory lookup tables.
+         self.vocab = {t: i for i, t in enumerate(all_tokens)}
+         self.ids_to_tokens = {i: t for t, i in self.vocab.items()}
+
+         kwargs.pop("pad_token", None)
+         kwargs.pop("bos_token", None)
+         kwargs.pop("eos_token", None)
+         kwargs.pop("unk_token", None)
+
+         # 4. Initialize the parent class.
+         super().__init__(
+             pad_token="[PAD]",
+             bos_token="[BOS]",
+             eos_token="[EOS]",
+             unk_token="[UNK]",
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self.vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         return dict(self.vocab)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Input:  "WPe2e4 BNg8f6"
+         Output: ['W', 'P', 'e2', 'e4', 'B', 'N', 'g8', 'f6']
+         """
+         tokens = []
+         moves = text.strip().split()
+
+         for move in moves:
+             # 1. Handle castling as a special case.
+             if "O-O" in move:
+                 tokens.append(move)
+                 continue
+
+             # 2. Greedy left-to-right scan: repeatedly slice the longest
+             # matching token off the front of the string. Apart from
+             # castling (already handled), the longest ordinary token is
+             # two characters (the squares a1-h8), so try length 2 first.
+             remaining = move
+             while remaining:
+                 # Prefer two-character matches (mainly the squares a1-h8).
+                 if len(remaining) >= 2 and remaining[:2] in self.vocab:
+                     tokens.append(remaining[:2])
+                     remaining = remaining[2:]
+                     continue
+
+                 # Fall back to single characters (W, B, P, N, x, +, ...).
+                 if remaining[:1] in self.vocab:
+                     tokens.append(remaining[:1])
+                     remaining = remaining[1:]
+                     continue
+
+                 # Nothing matched: the input is malformed. Consume one
+                 # character to avoid an infinite loop; during training you
+                 # could instead emit tokens.append(self.unk_token).
+                 remaining = remaining[1:]
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     # --- Key method 1: persist the vocabulary ---
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Write vocab.json to the target directory; save_pretrained fails without this.
+         """
+         if not os.path.isdir(save_directory):
+             os.makedirs(save_directory, exist_ok=True)
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self.vocab, f, ensure_ascii=False)
+
+         return (vocab_file,)
+
+     # --- Key method 2: restore the string form ---
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """
+         Reassemble a token list into a move string.
+         Input:  ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
+         Output: "WPe2e4 BPe7e5"
+         """
+         out_string = []
+         for t in tokens:
+             # Skip special tokens.
+             if t in self.special_tokens:
+                 continue
+
+             # A color token ('W'/'B') or a castling token ('O-O', 'O-O-O')
+             # starts a new move, so prepend a space unless it is the very
+             # first token of the string.
+             if t in self.colors or "O-O" in t:
+                 if out_string:  # not the first move
+                     out_string.append(" ")
+
+             out_string.append(t)
+
+         return "".join(out_string).strip()
+
+     # Optional: a class method kept for interface compatibility (the
+     # vocabulary is hard-coded, so there is nothing to build from data).
+     @classmethod
+     def build_vocab_from_dataset(cls, *args, **kwargs):
+         return cls()
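
A quick round trip through the class above (a usage sketch, run directly against this file):

```python
tokenizer = ChessTokenizer()

tokens = tokenizer.tokenize("WPe2e4 BNg8f6")
print(tokens)                                      # ['W', 'P', 'e2', 'e4', 'B', 'N', 'g8', 'f6']

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)                                         # [4, 6, 45, 47, 8, 7, 67, 57]

print(tokenizer.convert_tokens_to_string(tokens))  # "WPe2e4 BNg8f6"
```

Note that the color `B` and the bishop `B` are the same string, so they necessarily share id 8; the model cannot distinguish them at the embedding level.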
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.ChessTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "ChessTokenizer",
+   "unk_token": "[UNK]"
+ }
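
One detail worth flagging: the enormous `model_max_length` is just the `transformers` sentinel for "unset". Since `config.json` caps the context at `n_ctx: 512`, a caller may want to clamp it after loading (a suggestion, not part of the commit):

```python
tokenizer.model_max_length = 512  # match n_ctx in config.json
```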
vocab.json ADDED
@@ -0,0 +1 @@
+ {"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3, "W": 4, "B": 8, "P": 6, "N": 7, "R": 9, "Q": 10, "K": 11, "a1": 12, "a2": 13, "a3": 14, "a4": 15, "a5": 16, "a6": 17, "a7": 18, "a8": 19, "b1": 20, "b2": 21, "b3": 22, "b4": 23, "b5": 24, "b6": 25, "b7": 26, "b8": 27, "c1": 28, "c2": 29, "c3": 30, "c4": 31, "c5": 32, "c6": 33, "c7": 34, "c8": 35, "d1": 36, "d2": 37, "d3": 38, "d4": 39, "d5": 40, "d6": 41, "d7": 42, "d8": 43, "e1": 44, "e2": 45, "e3": 46, "e4": 47, "e5": 48, "e6": 49, "e7": 50, "e8": 51, "f1": 52, "f2": 53, "f3": 54, "f4": 55, "f5": 56, "f6": 57, "f7": 58, "f8": 59, "g1": 60, "g2": 61, "g3": 62, "g4": 63, "g5": 64, "g6": 65, "g7": 66, "g8": 67, "h1": 68, "h2": 69, "h3": 70, "h4": 71, "h5": 72, "h6": 73, "h7": 74, "h8": 75, "x": 76, "+": 77, "#": 78, "=": 79, "O-O": 80, "O-O-O": 81}