Chess Challenge submission by CHU-ZP
- README.md +26 -0
- config.json +20 -0
- model.safetensors +3 -0
- special_tokens_map.json +6 -0
- tokenizer.py +150 -0
- tokenizer_config.json +50 -0
- vocab.json +1 -0
README.md
ADDED
@@ -0,0 +1,26 @@
---
library_name: transformers
tags:
- chess
- llm-course
- chess-challenge
license: mit
---

# chess-czp-v2

Chess model submitted to the LLM Course Chess Challenge.

## Submission Info

- **Submitted by**: [CHU-ZP](https://huggingface.co/CHU-ZP)
- **Parameters**: 704,896
- **Organization**: LLM-course

## Model Details

- **Architecture**: Chess Transformer (GPT-style)
- **Vocab size**: 81
- **Embedding dim**: 128
- **Layers**: 4
- **Heads**: 4
config.json
ADDED
@@ -0,0 +1,20 @@
{
  "architectures": [
    "ChessForCausalLM"
  ],
  "bos_token_id": 1,
  "dropout": 0.1,
  "dtype": "float32",
  "eos_token_id": 2,
  "layer_norm_epsilon": 1e-05,
  "model_type": "chess_transformer",
  "n_ctx": 256,
  "n_embd": 128,
  "n_head": 4,
  "n_inner": 384,
  "n_layer": 4,
  "pad_token_id": 0,
  "tie_weights": true,
  "transformers_version": "4.57.6",
  "vocab_size": 81
}
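These hyperparameters are consistent with the README's figure of 704,896 parameters, assuming a standard GPT-2-style block layout (learned position embeddings, fused qkv projection, two LayerNorms per layer, and an output head tied to the token embeddings per `tie_weights`). A back-of-the-envelope check, not part of the submission:

```python
# Hyperparameters from config.json above.
vocab, n_ctx, d, n_inner, n_layer = 81, 256, 128, 384, 4

tok_emb = vocab * d              # 10,368  token embeddings
pos_emb = n_ctx * d              # 32,768  learned position embeddings
per_layer = (
    2 * d                        # LayerNorm 1 (weight + bias)
    + d * 3 * d + 3 * d          # fused qkv projection
    + d * d + d                  # attention output projection
    + 2 * d                      # LayerNorm 2
    + d * n_inner + n_inner      # MLP up-projection
    + n_inner * d + d            # MLP down-projection
)
final_ln = 2 * d
total = tok_emb + pos_emb + n_layer * per_layer + final_ln
print(total)  # 704896 -- matches the README; the lm_head adds nothing (tied)
```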
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7faa5448b8b6fd51db110040971f925b03e146fdca4cf447a6b717bdb8d1a48
size 2823984
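The weights are stored as a Git LFS pointer, so the downloaded blob can be checked against the pointer's `oid` and `size` fields. A minimal sketch, assuming the file sits in the current directory:

```python
import hashlib
import os

# Hypothetical local path to the downloaded weights file.
path = "model.safetensors"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

# Both expected values come from the LFS pointer above.
assert sha.hexdigest() == "c7faa5448b8b6fd51db110040971f925b03e146fdca4cf447a6b717bdb8d1a48"
assert os.path.getsize(path) == 2823984
```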
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "pad_token": "[PAD]",
  "unk_token": "[UNK]"
}
tokenizer.py
ADDED
@@ -0,0 +1,150 @@
"""
Custom atomic chess tokenizer for the Chess Challenge.
Strategy: component-level tokenization (W, P, e2, e4) to keep the vocabulary small.
"""

from __future__ import annotations

import json
import os
from typing import Dict, List, Optional, Tuple

from transformers import PreTrainedTokenizer


class ChessTokenizer(PreTrainedTokenizer):
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file: Optional[str] = None, **kwargs):
        # 1. Define the atomic vocabulary.
        self.special_tokens = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
        self.colors = ["W", "B"]
        self.pieces = ["P", "N", "B", "R", "Q", "K"]
        self.squares = [f"{c}{r}" for c in "abcdefgh" for r in range(1, 9)]  # a1...h8
        self.suffixes = ["x", "+", "#", "=", "O-O", "O-O-O"]  # captures, checks, promotion, castling

        # 2. Merge all tokens. Note that "B" appears both as a color and as
        # the bishop, so the dict comprehension below keeps only its later
        # index (8) and id 5 ends up unused.
        all_tokens = self.special_tokens + self.colors + self.pieces + self.squares + self.suffixes

        # 3. Build the in-memory lookup tables.
        self.vocab = {t: i for i, t in enumerate(all_tokens)}
        self.ids_to_tokens = {i: t for t, i in self.vocab.items()}

        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # 4. Initialize the parent class.
        super().__init__(
            pad_token="[PAD]",
            bos_token="[BOS]",
            eos_token="[EOS]",
            unk_token="[UNK]",
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        return dict(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Input:  "WPe2e4 BNg8f6"
        Output: ['W', 'P', 'e2', 'e4', 'B', 'N', 'g8', 'f6']
        """
        tokens = []
        moves = text.strip().split()

        for move in moves:
            # 1. Handle castling as a single token.
            if "O-O" in move:
                tokens.append(move)
                continue

            # 2. Greedy longest-match scan: repeatedly slice the longest
            # known token off the front of the string.
            remaining = move
            while remaining:
                # Prefer 2-character tokens (mainly the squares a1-h8); apart
                # from castling (handled above), no ordinary token is longer.
                if len(remaining) >= 2 and remaining[:2] in self.vocab:
                    tokens.append(remaining[:2])
                    remaining = remaining[2:]
                    continue

                # Fall back to 1-character tokens (W, B, P, N, x, +, ...).
                if remaining[:1] in self.vocab:
                    tokens.append(remaining[:1])
                    remaining = remaining[1:]
                    continue

                # Neither matched, so the input is malformed. Consume one
                # character to avoid an infinite loop; during training you
                # could instead emit tokens.append(self.unk_token).
                remaining = remaining[1:]

        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        return self.ids_to_tokens.get(index, self.unk_token)

    # --- Key method 1: persist the vocabulary ---
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write vocab.json into the target directory; without this,
        save_pretrained would fail.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)

    # --- Key method 2: reassemble move strings ---
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Turn a token list back into a move string.
        Input:  ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
        Output: "WPe2e4 BPe7e5"
        """
        out_string = []
        for t in tokens:
            # Skip special tokens.
            if t in self.special_tokens:
                continue

            # A color token ('W'/'B') or a castling token marks the start of
            # a new move, so prepend a space unless it is the first output.
            # (Caveat: bishop moves also contain a 'B' token, which this
            # check cannot distinguish from the color token.)
            if t in self.colors or "O-O" in t:
                if out_string:
                    out_string.append(" ")

            out_string.append(t)

        return "".join(out_string).strip()

    # Optional: a classmethod constructor for interface compatibility
    # (the vocabulary here is hard-coded anyway).
    @classmethod
    def build_vocab_from_dataset(cls, *args, **kwargs):
        return cls()
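A quick round-trip with the class above (usage sketch; assumes it runs from a checkout of this repo so `tokenizer.py` is importable):

```python
from tokenizer import ChessTokenizer

tok = ChessTokenizer()
print(tok.vocab_size)   # 81

tokens = tok.tokenize("WPe2e4 BNg8f6")
print(tokens)           # ['W', 'P', 'e2', 'e4', 'B', 'N', 'g8', 'f6']

ids = tok.convert_tokens_to_ids(tokens)
print(tok.decode(ids))  # 'WPe2e4 BNg8f6'
```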
tokenizer_config.json
ADDED
@@ -0,0 +1,50 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[BOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[EOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenizer.ChessTokenizer",
      null
    ]
  },
  "bos_token": "[BOS]",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "tokenizer_class": "ChessTokenizer",
  "unk_token": "[UNK]"
}
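The `auto_map` entry is what lets `AutoTokenizer` resolve the custom class from the Hub. A loading sketch; the repo id `LLM-course/chess-czp-v2` is inferred from the README and may differ from the actual Hub path:

```python
from transformers import AutoTokenizer

# trust_remote_code is required because auto_map points at the custom
# tokenizer.ChessTokenizer class shipped in this repo.
tok = AutoTokenizer.from_pretrained("LLM-course/chess-czp-v2", trust_remote_code=True)
print(tok.tokenize("WPe2e4 BPe7e5"))  # ['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
```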
vocab.json
ADDED
@@ -0,0 +1 @@
{"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3, "W": 4, "B": 8, "P": 6, "N": 7, "R": 9, "Q": 10, "K": 11, "a1": 12, "a2": 13, "a3": 14, "a4": 15, "a5": 16, "a6": 17, "a7": 18, "a8": 19, "b1": 20, "b2": 21, "b3": 22, "b4": 23, "b5": 24, "b6": 25, "b7": 26, "b8": 27, "c1": 28, "c2": 29, "c3": 30, "c4": 31, "c5": 32, "c6": 33, "c7": 34, "c8": 35, "d1": 36, "d2": 37, "d3": 38, "d4": 39, "d5": 40, "d6": 41, "d7": 42, "d8": 43, "e1": 44, "e2": 45, "e3": 46, "e4": 47, "e5": 48, "e6": 49, "e7": 50, "e8": 51, "f1": 52, "f2": 53, "f3": 54, "f4": 55, "f5": 56, "f6": 57, "f7": 58, "f8": 59, "g1": 60, "g2": 61, "g3": 62, "g4": 63, "g5": 64, "g6": 65, "g7": 66, "g8": 67, "h1": 68, "h2": 69, "h3": 70, "h4": 71, "h5": 72, "h6": 73, "h7": 74, "h8": 75, "x": 76, "+": 77, "#": 78, "=": 79, "O-O": 80, "O-O-O": 81}
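Note that the ids are not contiguous: `"B"` serves as both the color Black and the bishop in `tokenizer.py`, so the later index wins, id 5 is never assigned, and ids run up to 81. A small consistency check against the in-memory tokenizer (sketch, run from a checkout of this repo):

```python
import json
from tokenizer import ChessTokenizer

with open("vocab.json", encoding="utf-8") as f:
    on_disk = json.load(f)

tok = ChessTokenizer()
assert on_disk == tok.get_vocab()       # hard-coded vocab matches the saved file

ids = set(on_disk.values())
assert 5 not in ids and max(ids) == 81  # the "B" collision leaves id 5 unused
print(len(on_disk))                     # 81 entries
```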