Chess Challenge submission by swdo
Browse files- model.safetensors +1 -1
- tokenizer.py +122 -107
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3490096
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02aaa4f1c65a9dc94d710f1071176353e932da7f89b161579755aa4c93adcc71
|
| 3 |
size 3490096
|
tokenizer.py
CHANGED
|
@@ -1,16 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Decomposed Chess Tokenizer v2 for the Chess Challenge.
|
| 3 |
|
| 4 |
-
This tokenizer decomposes moves into structural components:
|
| 5 |
-
- Color (W/B)
|
| 6 |
-
- Piece (P/N/B/R/Q/K)
|
| 7 |
-
- From square (a1-h8)
|
| 8 |
-
- To square (a1-h8)
|
| 9 |
-
- Modifiers (capture, check, checkmate, promotion, castling)
|
| 10 |
-
|
| 11 |
-
This allows the model to learn chess structure and generalize better
|
| 12 |
-
while using a much smaller vocabulary (~90 tokens vs ~1200+).
|
| 13 |
-
"""
|
| 14 |
|
| 15 |
from __future__ import annotations
|
| 16 |
|
|
@@ -25,18 +13,12 @@ from transformers import PreTrainedTokenizer
|
|
| 25 |
|
| 26 |
class ChessTokenizer(PreTrainedTokenizer):
|
| 27 |
"""
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
Breaks moves into structural components for better learning.
|
| 31 |
|
| 32 |
Example:
|
| 33 |
>>> tokenizer = ChessTokenizer()
|
| 34 |
-
>>>
|
| 35 |
-
|
| 36 |
-
['W', 'P', 'e2', 'e4', 'B', 'P', 'e7', 'e5']
|
| 37 |
-
|
| 38 |
-
>>> tokenizer.encode("WNg1f3(+)")
|
| 39 |
-
[1, 5, 8, 39, 29, 12, 2] # [BOS, W, N, g1, f3, +, EOS]
|
| 40 |
"""
|
| 41 |
|
| 42 |
model_input_names = ["input_ids", "attention_mask"]
|
|
@@ -47,18 +29,14 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 47 |
BOS_TOKEN = "[BOS]"
|
| 48 |
EOS_TOKEN = "[EOS]"
|
| 49 |
UNK_TOKEN = "[UNK]"
|
| 50 |
-
SEP_TOKEN = "[SEP]" # Optional: separate moves
|
| 51 |
|
| 52 |
# Chess components
|
| 53 |
-
# Use [W] and [B] for colors to avoid collision with piece 'B' (Bishop)
|
| 54 |
COLORS = ["[W]", "[B]"]
|
| 55 |
PIECES = ["P", "N", "B", "R", "Q", "K"]
|
| 56 |
FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
|
| 57 |
RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
|
| 58 |
-
# Generate all 64 squares
|
| 59 |
SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
|
| 60 |
|
| 61 |
-
# Modifiers
|
| 62 |
MODIFIERS = [
|
| 63 |
"x", # Capture
|
| 64 |
"+", # Check
|
|
@@ -74,8 +52,6 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 74 |
"O", # Queenside castling (dataset format)
|
| 75 |
]
|
| 76 |
|
| 77 |
-
# Regex pattern to parse extended UCI moves
|
| 78 |
-
# Format: [W|B][Piece][from_sq][to_sq][promotion]?[suffixes]?
|
| 79 |
MOVE_PATTERN = re.compile(
|
| 80 |
r'^([WB])' # Color
|
| 81 |
r'([PNBRQK])' # Piece
|
|
@@ -89,24 +65,16 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 89 |
self,
|
| 90 |
vocab_file: Optional[str] = None,
|
| 91 |
vocab: Optional[Dict[str, int]] = None,
|
| 92 |
-
add_move_separator: bool = False,
|
| 93 |
**kwargs,
|
| 94 |
):
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
Args:
|
| 99 |
-
vocab_file: Path to vocabulary JSON file.
|
| 100 |
-
vocab: Pre-built vocabulary dictionary.
|
| 101 |
-
add_move_separator: Whether to add [SEP] between moves.
|
| 102 |
-
"""
|
| 103 |
self._pad_token = self.PAD_TOKEN
|
| 104 |
self._bos_token = self.BOS_TOKEN
|
| 105 |
self._eos_token = self.EOS_TOKEN
|
| 106 |
self._unk_token = self.UNK_TOKEN
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
# Remove duplicates from kwargs
|
| 110 |
kwargs.pop("pad_token", None)
|
| 111 |
kwargs.pop("bos_token", None)
|
| 112 |
kwargs.pop("eos_token", None)
|
|
@@ -119,9 +87,10 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 119 |
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 120 |
self._vocab = json.load(f)
|
| 121 |
else:
|
| 122 |
-
|
|
|
|
| 123 |
|
| 124 |
-
#
|
| 125 |
self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
|
| 126 |
|
| 127 |
super().__init__(
|
|
@@ -132,14 +101,17 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 132 |
**kwargs,
|
| 133 |
)
|
| 134 |
|
| 135 |
-
def
|
| 136 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
tokens = []
|
| 138 |
|
| 139 |
# Special tokens first
|
| 140 |
tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
|
| 141 |
-
if self.add_move_separator:
|
| 142 |
-
tokens.append(self.SEP_TOKEN)
|
| 143 |
|
| 144 |
# Colors
|
| 145 |
tokens.extend(self.COLORS)
|
|
@@ -155,6 +127,56 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 155 |
|
| 156 |
return {token: idx for idx, token in enumerate(tokens)}
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
@property
|
| 159 |
def vocab_size(self) -> int:
|
| 160 |
return len(self._vocab)
|
|
@@ -175,12 +197,11 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 175 |
match = self.MOVE_PATTERN.match(move)
|
| 176 |
|
| 177 |
if not match:
|
| 178 |
-
# Fallback: return as unknown
|
| 179 |
return [self.UNK_TOKEN]
|
| 180 |
|
| 181 |
tokens = []
|
| 182 |
|
| 183 |
-
# Color - map 'W' -> '[W]' and 'B' -> '[B]'
|
| 184 |
color = match.group(1)
|
| 185 |
tokens.append(f"[{color}]")
|
| 186 |
|
|
@@ -195,15 +216,13 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 195 |
|
| 196 |
# Promotion (optional)
|
| 197 |
if match.group(5):
|
| 198 |
-
tokens.append(match.group(5))
|
| 199 |
|
| 200 |
# Parse suffixes (optional)
|
| 201 |
if match.group(6):
|
| 202 |
-
suffix = match.group(6)
|
| 203 |
-
# Remove parentheses
|
| 204 |
suffix_content = suffix[1:-1]
|
| 205 |
|
| 206 |
-
# Parse individual modifiers
|
| 207 |
if "x" in suffix_content:
|
| 208 |
tokens.append("x")
|
| 209 |
if "+*" in suffix_content:
|
|
@@ -219,7 +238,7 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 219 |
|
| 220 |
def _tokenize(self, text: str) -> List[str]:
|
| 221 |
"""
|
| 222 |
-
Tokenize a string of moves.
|
| 223 |
|
| 224 |
Args:
|
| 225 |
text: Space-separated moves in extended UCI format.
|
|
@@ -230,13 +249,9 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 230 |
tokens = []
|
| 231 |
moves = text.strip().split()
|
| 232 |
|
| 233 |
-
for
|
| 234 |
move_tokens = self._parse_move(move)
|
| 235 |
tokens.extend(move_tokens)
|
| 236 |
-
|
| 237 |
-
# Add separator between moves (optional)
|
| 238 |
-
if self.add_move_separator and i < len(moves) - 1:
|
| 239 |
-
tokens.append(self.SEP_TOKEN)
|
| 240 |
|
| 241 |
return tokens
|
| 242 |
|
|
@@ -252,7 +267,7 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 252 |
|
| 253 |
Reconstructs moves from component tokens.
|
| 254 |
"""
|
| 255 |
-
special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN
|
| 256 |
|
| 257 |
result = []
|
| 258 |
current_move = []
|
|
@@ -288,7 +303,6 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 288 |
tokens[2] in self.SQUARES and
|
| 289 |
tokens[3] in self.SQUARES):
|
| 290 |
|
| 291 |
-
# Check if next token would start a new move
|
| 292 |
if len(tokens) == 4:
|
| 293 |
return True
|
| 294 |
|
|
@@ -296,7 +310,7 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 296 |
remaining = tokens[4:]
|
| 297 |
for t in remaining:
|
| 298 |
if t in self.COLORS:
|
| 299 |
-
return True
|
| 300 |
if t not in self.MODIFIERS and not t.startswith("="):
|
| 301 |
return True
|
| 302 |
|
|
@@ -309,12 +323,11 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 309 |
if not tokens:
|
| 310 |
return ""
|
| 311 |
|
| 312 |
-
# Basic structure: Color + Piece + From + To
|
| 313 |
if len(tokens) >= 4:
|
| 314 |
# Convert [W] -> W and [B] -> B for colors
|
| 315 |
color = tokens[0]
|
| 316 |
if color in self.COLORS:
|
| 317 |
-
color = color[1]
|
| 318 |
|
| 319 |
move = color + "".join(tokens[1:4])
|
| 320 |
|
|
@@ -338,6 +351,16 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 338 |
save_directory: str,
|
| 339 |
filename_prefix: Optional[str] = None,
|
| 340 |
) -> Tuple[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
if not os.path.isdir(save_directory):
|
| 342 |
os.makedirs(save_directory, exist_ok=True)
|
| 343 |
|
|
@@ -349,50 +372,42 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 349 |
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 350 |
json.dump(self._vocab, f, ensure_ascii=False, indent=2)
|
| 351 |
|
| 352 |
-
# Also save config with auto_map for HuggingFace to find our custom tokenizer
|
| 353 |
-
# Format: (slow_tokenizer_class, fast_tokenizer_class) - we don't have a fast version
|
| 354 |
-
config = {
|
| 355 |
-
"tokenizer_class": "ChessTokenizer",
|
| 356 |
-
"auto_map": {
|
| 357 |
-
"AutoTokenizer": ["tokenizer.ChessTokenizer", None]
|
| 358 |
-
},
|
| 359 |
-
"add_move_separator": self.add_move_separator,
|
| 360 |
-
"vocab_size": self.vocab_size,
|
| 361 |
-
}
|
| 362 |
-
config_file = os.path.join(save_directory, "tokenizer_config.json")
|
| 363 |
-
with open(config_file, "w", encoding="utf-8") as f:
|
| 364 |
-
json.dump(config, f, indent=2)
|
| 365 |
-
|
| 366 |
return (vocab_file,)
|
| 367 |
-
|
| 368 |
-
@classmethod
|
| 369 |
-
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
| 370 |
-
"""Load tokenizer from directory or hub."""
|
| 371 |
-
path = Path(pretrained_model_name_or_path)
|
| 372 |
-
|
| 373 |
-
if path.is_dir():
|
| 374 |
-
vocab_file = path / "vocab.json"
|
| 375 |
-
config_file = path / "tokenizer_config.json"
|
| 376 |
-
|
| 377 |
-
add_move_separator = False
|
| 378 |
-
if config_file.exists():
|
| 379 |
-
with open(config_file, "r") as f:
|
| 380 |
-
config = json.load(f)
|
| 381 |
-
add_move_separator = config.get("add_move_separator", False)
|
| 382 |
-
|
| 383 |
-
return cls(
|
| 384 |
-
vocab_file=str(vocab_file) if vocab_file.exists() else None,
|
| 385 |
-
add_move_separator=add_move_separator,
|
| 386 |
-
**kwargs,
|
| 387 |
-
)
|
| 388 |
-
|
| 389 |
-
# Fallback to HuggingFace hub
|
| 390 |
-
from huggingface_hub import hf_hub_download
|
| 391 |
-
|
| 392 |
-
vocab_file = hf_hub_download(
|
| 393 |
-
repo_id=pretrained_model_name_or_path,
|
| 394 |
-
filename="vocab.json",
|
| 395 |
-
)
|
| 396 |
-
|
| 397 |
-
return cls(vocab_file=vocab_file, **kwargs)
|
| 398 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 13 |
|
| 14 |
class ChessTokenizer(PreTrainedTokenizer):
|
| 15 |
"""
|
| 16 |
+
A custom tokenizer that breaks chess moves into component tokens.
|
|
|
|
|
|
|
| 17 |
|
| 18 |
Example:
|
| 19 |
>>> tokenizer = ChessTokenizer()
|
| 20 |
+
>>> tokenizer.encode("WPe2e4 BPe7e5")
|
| 21 |
+
[1, 4, 6, 45, 47, 5, 6, 50, 48, 2] # [BOS, components..., EOS]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
"""
|
| 23 |
|
| 24 |
model_input_names = ["input_ids", "attention_mask"]
|
|
|
|
| 29 |
BOS_TOKEN = "[BOS]"
|
| 30 |
EOS_TOKEN = "[EOS]"
|
| 31 |
UNK_TOKEN = "[UNK]"
|
|
|
|
| 32 |
|
| 33 |
# Chess components
|
|
|
|
| 34 |
COLORS = ["[W]", "[B]"]
|
| 35 |
PIECES = ["P", "N", "B", "R", "Q", "K"]
|
| 36 |
FILES = ["a", "b", "c", "d", "e", "f", "g", "h"]
|
| 37 |
RANKS = ["1", "2", "3", "4", "5", "6", "7", "8"]
|
|
|
|
| 38 |
SQUARES = [f + r for f in FILES for r in ["1", "2", "3", "4", "5", "6", "7", "8"]]
|
| 39 |
|
|
|
|
| 40 |
MODIFIERS = [
|
| 41 |
"x", # Capture
|
| 42 |
"+", # Check
|
|
|
|
| 52 |
"O", # Queenside castling (dataset format)
|
| 53 |
]
|
| 54 |
|
|
|
|
|
|
|
| 55 |
MOVE_PATTERN = re.compile(
|
| 56 |
r'^([WB])' # Color
|
| 57 |
r'([PNBRQK])' # Piece
|
|
|
|
| 65 |
self,
|
| 66 |
vocab_file: Optional[str] = None,
|
| 67 |
vocab: Optional[Dict[str, int]] = None,
|
|
|
|
| 68 |
**kwargs,
|
| 69 |
):
|
| 70 |
+
|
| 71 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
self._pad_token = self.PAD_TOKEN
|
| 73 |
self._bos_token = self.BOS_TOKEN
|
| 74 |
self._eos_token = self.EOS_TOKEN
|
| 75 |
self._unk_token = self.UNK_TOKEN
|
| 76 |
+
|
| 77 |
+
# Remove any duplicate special-token entries from kwargs
|
|
|
|
| 78 |
kwargs.pop("pad_token", None)
|
| 79 |
kwargs.pop("bos_token", None)
|
| 80 |
kwargs.pop("eos_token", None)
|
|
|
|
| 87 |
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 88 |
self._vocab = json.load(f)
|
| 89 |
else:
|
| 90 |
+
# Create the fixed decomposed vocabulary
|
| 91 |
+
self._vocab = self._create_default_vocab()
|
| 92 |
|
| 93 |
+
# Create reverse mapping
|
| 94 |
self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
|
| 95 |
|
| 96 |
super().__init__(
|
|
|
|
| 101 |
**kwargs,
|
| 102 |
)
|
| 103 |
|
| 104 |
+
def _create_default_vocab(self) -> Dict[str, int]:
|
| 105 |
+
"""
|
| 106 |
+
Create the fixed vocabulary from chess components.
|
| 107 |
+
|
| 108 |
+
Unlike the standard tokenizer, this creates a small fixed vocab
|
| 109 |
+
of ~88 tokens for decomposed move representation.
|
| 110 |
+
"""
|
| 111 |
tokens = []
|
| 112 |
|
| 113 |
# Special tokens first
|
| 114 |
tokens.extend([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
|
|
|
|
|
|
|
| 115 |
|
| 116 |
# Colors
|
| 117 |
tokens.extend(self.COLORS)
|
|
|
|
| 127 |
|
| 128 |
return {token: idx for idx, token in enumerate(tokens)}
|
| 129 |
|
| 130 |
+
@classmethod
def build_vocab_from_iterator(
    cls,
    iterator,
    min_frequency: int = 1,
) -> "ChessTokenizer":
    """
    Return a tokenizer that uses the fixed decomposed vocabulary.

    The decomposed vocabulary is not derived from training data, so both
    arguments exist only for API compatibility; neither is read and the
    iterator is never consumed.

    Args:
        iterator: Iterable yielding game strings (ignored).
        min_frequency: Minimum token frequency (ignored).

    Returns:
        A new instance built by calling ``cls()`` with no arguments.
    """
    # No data-dependent vocabulary building: a default instance is enough.
    tokenizer = cls()
    return tokenizer
|
| 151 |
+
|
| 152 |
+
@classmethod
def build_vocab_from_dataset(
    cls,
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "moves",
    min_frequency: int = 1,
    max_samples: Optional[int] = None,
) -> "ChessTokenizer":
    """
    Build a tokenizer vocabulary from a Hugging Face dataset.

    Note: For the decomposed tokenizer, no dataset is loaded; every
    argument is ignored and the fixed vocabulary is used. Provided for
    API compatibility.

    Args:
        dataset_name: Name of the dataset on Hugging Face Hub (ignored).
        split: Dataset split to use (ignored).
        column: Column containing move strings (ignored).
        min_frequency: Minimum frequency for inclusion (ignored).
        max_samples: Maximum samples to process (ignored).

    Returns:
        A new instance built by calling ``cls()`` with no arguments.
    """
    # Plain string literal: the message has no placeholders, so the
    # f-prefix the original carried was unnecessary.
    print("Note: Decomposed tokenizer uses fixed vocabulary (~88 tokens)")
    return cls()
|
| 179 |
+
|
| 180 |
@property
|
| 181 |
def vocab_size(self) -> int:
|
| 182 |
return len(self._vocab)
|
|
|
|
| 197 |
match = self.MOVE_PATTERN.match(move)
|
| 198 |
|
| 199 |
if not match:
|
|
|
|
| 200 |
return [self.UNK_TOKEN]
|
| 201 |
|
| 202 |
tokens = []
|
| 203 |
|
| 204 |
+
# Color - map 'W' -> '[W]' and 'B' -> '[B]'
|
| 205 |
color = match.group(1)
|
| 206 |
tokens.append(f"[{color}]")
|
| 207 |
|
|
|
|
| 216 |
|
| 217 |
# Promotion (optional)
|
| 218 |
if match.group(5):
|
| 219 |
+
tokens.append(match.group(5))
|
| 220 |
|
| 221 |
# Parse suffixes (optional)
|
| 222 |
if match.group(6):
|
| 223 |
+
suffix = match.group(6)
|
|
|
|
| 224 |
suffix_content = suffix[1:-1]
|
| 225 |
|
|
|
|
| 226 |
if "x" in suffix_content:
|
| 227 |
tokens.append("x")
|
| 228 |
if "+*" in suffix_content:
|
|
|
|
| 238 |
|
| 239 |
def _tokenize(self, text: str) -> List[str]:
|
| 240 |
"""
|
| 241 |
+
Tokenize a string of moves into component tokens.
|
| 242 |
|
| 243 |
Args:
|
| 244 |
text: Space-separated moves in extended UCI format.
|
|
|
|
| 249 |
tokens = []
|
| 250 |
moves = text.strip().split()
|
| 251 |
|
| 252 |
+
for move in moves:
|
| 253 |
move_tokens = self._parse_move(move)
|
| 254 |
tokens.extend(move_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
return tokens
|
| 257 |
|
|
|
|
| 267 |
|
| 268 |
Reconstructs moves from component tokens.
|
| 269 |
"""
|
| 270 |
+
special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
|
| 271 |
|
| 272 |
result = []
|
| 273 |
current_move = []
|
|
|
|
| 303 |
tokens[2] in self.SQUARES and
|
| 304 |
tokens[3] in self.SQUARES):
|
| 305 |
|
|
|
|
| 306 |
if len(tokens) == 4:
|
| 307 |
return True
|
| 308 |
|
|
|
|
| 310 |
remaining = tokens[4:]
|
| 311 |
for t in remaining:
|
| 312 |
if t in self.COLORS:
|
| 313 |
+
return True
|
| 314 |
if t not in self.MODIFIERS and not t.startswith("="):
|
| 315 |
return True
|
| 316 |
|
|
|
|
| 323 |
if not tokens:
|
| 324 |
return ""
|
| 325 |
|
|
|
|
| 326 |
if len(tokens) >= 4:
|
| 327 |
# Convert [W] -> W and [B] -> B for colors
|
| 328 |
color = tokens[0]
|
| 329 |
if color in self.COLORS:
|
| 330 |
+
color = color[1]
|
| 331 |
|
| 332 |
move = color + "".join(tokens[1:4])
|
| 333 |
|
|
|
|
| 351 |
save_directory: str,
|
| 352 |
filename_prefix: Optional[str] = None,
|
| 353 |
) -> Tuple[str]:
|
| 354 |
+
"""
|
| 355 |
+
Save the vocabulary to a file.
|
| 356 |
+
|
| 357 |
+
Args:
|
| 358 |
+
save_directory: Directory to save the vocabulary.
|
| 359 |
+
filename_prefix: Optional prefix for the vocabulary file.
|
| 360 |
+
|
| 361 |
+
Returns:
|
| 362 |
+
Tuple containing the path to the saved vocabulary file.
|
| 363 |
+
"""
|
| 364 |
if not os.path.isdir(save_directory):
|
| 365 |
os.makedirs(save_directory, exist_ok=True)
|
| 366 |
|
|
|
|
| 372 |
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 373 |
json.dump(self._vocab, f, ensure_ascii=False, indent=2)
|
| 374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
return (vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
+
|
| 378 |
+
def count_vocab_from_dataset(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "moves",
    max_samples: Optional[int] = None,
) -> Dict[str, int]:
    """
    Count token frequencies in a dataset.

    Note: For the decomposed tokenizer, this counts component frequencies
    rather than whole-move frequencies.

    Args:
        dataset_name: Name of the dataset passed to ``load_dataset``.
        split: Dataset split passed to ``load_dataset``.
        column: Key of each example that holds the move string.
        max_samples: Upper bound on the number of examples processed.
            ``None`` (the default) processes the whole split; ``0``
            processes nothing.

    Returns:
        Dictionary mapping each token to the number of times it occurred.
    """
    # Imported lazily so that importing this module does not require
    # the `datasets` package.
    from collections import Counter
    from datasets import load_dataset

    tokenizer = ChessTokenizer()

    dataset = load_dataset(dataset_name, split=split)
    # Compare against None (not truthiness) so an explicit limit of 0 is
    # honoured instead of silently falling back to the full dataset.
    if max_samples is not None:
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    counts = Counter()
    for example in dataset:
        counts.update(tokenizer.tokenize(example[column]))

    return dict(counts)
|