Chiensaucisse67 committed on
Commit
04796f5
·
verified ·
1 Parent(s): f22aba4

Chess Challenge submission by Chiensaucisse67

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +3 -2
  2. tokenizer_custom.py +288 -1
tokenizer_config.json CHANGED
@@ -35,7 +35,7 @@
35
  },
36
  "auto_map": {
37
  "AutoTokenizer": [
38
- "tokenizer_custom.ChessTokenizer",
39
  null
40
  ]
41
  },
@@ -45,6 +45,7 @@
45
  "extra_special_tokens": {},
46
  "model_max_length": 1000000000000000019884624838656,
47
  "pad_token": "[PAD]",
48
- "tokenizer_class": "ChessTokenizer",
 
49
  "unk_token": "[UNK]"
50
  }
 
35
  },
36
  "auto_map": {
37
  "AutoTokenizer": [
38
+ "tokenizer_custom.CoordinateTokenizer",
39
  null
40
  ]
41
  },
 
45
  "extra_special_tokens": {},
46
  "model_max_length": 1000000000000000019884624838656,
47
  "pad_token": "[PAD]",
48
+ "tokenizer_class": "CoordinateTokenizer",
49
+ "truncation_side": "left",
50
  "unk_token": "[UNK]"
51
  }
tokenizer_custom.py CHANGED
@@ -16,10 +16,11 @@ from __future__ import annotations
16
  import json
17
  import os
18
  from pathlib import Path
 
19
  from typing import Dict, List, Optional
20
 
21
  from transformers import PreTrainedTokenizer
22
-
23
 
24
  class ChessTokenizer(PreTrainedTokenizer):
25
  """
@@ -276,3 +277,289 @@ def count_vocab_from_dataset(
276
  token_counts.update(moves)
277
 
278
  return dict(token_counts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  import json
17
  import os
18
  from pathlib import Path
19
+ from token import OP
20
  from typing import Dict, List, Optional
21
 
22
  from transformers import PreTrainedTokenizer
23
+ import re
24
 
25
  class ChessTokenizer(PreTrainedTokenizer):
26
  """
 
277
  token_counts.update(moves)
278
 
279
  return dict(token_counts)
280
+
281
+
282
+
283
class CoordinateTokenizer(ChessTokenizer):
    """Tokenizer that splits each move into from-square / to-square tokens.

    Fixed 72-entry vocabulary: 4 control tokens ([PAD], [BOS], [EOS],
    [UNK]), 64 board squares (a1-h8, file-major order), and 4 promotion
    pieces (q, r, b, n).
    """

    def __init__(self, **kwargs):
        squares = [f"{f}{r}" for f in "abcdefgh" for r in "12345678"]
        promotions = ["q", "r", "b", "n"]
        control = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
        vocab_list = control + squares + promotions
        self._vocab = {t: i for i, t in enumerate(vocab_list)}
        self._ids_to_token = {i: t for t, i in self._vocab.items()}

        super().__init__(
            vocab=self._vocab,
            pad_token="[PAD]",
            bos_token="[BOS]",
            eos_token="[EOS]",
            unk_token="[UNK]",
            truncation_side="left",
            **kwargs
        )

    def _tokenize(self, text: str) -> List[str]:
        """Split space-separated moves into square / promotion tokens.

        Example: "WPe2e4 WPa7a8q" -> ["e2", "e4", "a7", "a8", "q"].
        """
        raw_moves = text.strip().split()
        tokens = []
        for raw_move in raw_moves:
            squares = re.findall(r'[a-h][1-8]', raw_move)
            tokens.extend(squares)
            if "=" in raw_move:
                # SAN-style promotion marker, e.g. "a8=Q" -> "q".
                idx = raw_move.index("=")
                if idx + 1 < len(raw_move):
                    tokens.append(raw_move[idx + 1].lower())
            elif squares and raw_move[-1].lower() in "qrbn":
                # UCI-style promotion suffix, e.g. "a7a8q".
                # BUG FIX: the previous check ('"q" in raw_move[-2:]')
                # missed r/b/n promotions entirely and could append a
                # wrong trailing character (e.g. '+' for "a7a8q+",
                # since 'q' was in the last two chars but not last).
                tokens.append(raw_move[-1].lower())
        return tokens
315
+
316
+
317
class CoordinateChessTokenizer(PreTrainedTokenizer):
    """
    Tokenizer that decomposes chess moves into coordinate components.

    Example:
        WPe2e4 -> ['e2', 'e4']
        WPa7a8q -> ['a7', 'a8', 'q']  # pawn promotion

    Vocabulary size: 72 tokens
        - 64 squares (a1-h8)
        - 4 promotions (q, r, b, n)
        - 4 special tokens
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    # from-square, to-square, optional promotion piece
    MOVE_PATTERN = re.compile(r'([a-h][1-8])([a-h][1-8])([qrbn])?')

    def __init__(self, vocab_file: Optional[str] = None, **kwargs):
        # Drop any caller-supplied special tokens; this class fixes them.
        for dup in ("pad_token", "bos_token", "eos_token", "unk_token"):
            kwargs.pop(dup, None)

        # Load a persisted vocabulary if one exists, else build the fixed one.
        if vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._create_vocab()

        self._ids_to_tokens = {idx: tok for tok, idx in self._vocab.items()}

        super().__init__(
            pad_token=self.PAD_TOKEN,
            bos_token=self.BOS_TOKEN,
            eos_token=self.EOS_TOKEN,
            unk_token=self.UNK_TOKEN,
            **kwargs,
        )

    def _create_vocab(self) -> Dict[str, int]:
        """Build the fixed 72-token vocabulary (specials, squares, promos)."""
        ordered = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        ordered += [f + r for f in 'abcdefgh' for r in '12345678']
        ordered += ['q', 'r', 'b', 'n']
        return {tok: idx for idx, tok in enumerate(ordered)}

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize a move string into coordinate components.

        Args:
            text: Space-separated moves like "WPe2e4 BNg8f6"

        Returns:
            List of coordinate tokens: ['e2', 'e4', 'g8', 'f6']
        """
        out: List[str] = []
        for move in text.strip().split():
            hit = self.MOVE_PATTERN.search(move)
            if hit is None:
                continue
            src, dst, promo = hit.groups()
            out.append(src)
            out.append(dst)
            if promo:
                out.append(promo)
        return out

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])

    def _convert_id_to_token(self, index: int) -> str:
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Reconstruct a space-separated move string from coordinate tokens."""
        specials = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        payload = [tok for tok in tokens if tok not in specials]

        pieces: List[str] = []
        pos = 0
        total = len(payload)
        # Greedily pair squares into moves; a trailing unpaired token
        # (no partner square) is silently dropped.
        while pos + 1 < total:
            move = payload[pos] + payload[pos + 1]
            pos += 2
            if pos < total and payload[pos] in ('q', 'r', 'b', 'n'):
                move += payload[pos]
                pos += 1
            pieces.append(move)

        return " ".join(pieces)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        name = (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        vocab_file = os.path.join(save_directory, name)

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
457
+
458
+
459
class EnhancedCoordinateTokenizer(CoordinateChessTokenizer):
    """
    Extended version that reserves vocabulary slots for color/piece markers.

    Vocabulary: 79 tokens — the base 72 plus 7 uppercase markers
    W, B, P, N, R, Q, K. (FIX: the old docstring claimed 76 and listed
    'B' twice; the lowercase promotion letters q/r/b/n are separate
    entries, so all 7 uppercase markers are genuinely new.)

    By default `_tokenize` still emits only coordinates; the markers are
    reserved so piece-aware tokenization can be enabled later without
    shifting any vocabulary ids.
    """

    def _create_vocab(self) -> Dict[str, int]:
        """Base vocabulary plus color/piece marker tokens, appended in order."""
        vocab = super()._create_vocab()

        piece_tokens = ['W', 'B', 'P', 'N', 'R', 'Q', 'K']

        next_id = len(vocab)
        for token in piece_tokens:
            # Guard against duplicates in case the base vocab ever changes.
            if token not in vocab:
                vocab[token] = next_id
                next_id += 1

        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """
        Emit coordinate tokens only: "WPe2e4" -> ['e2', 'e4'].

        NOTE(review): color/piece prefixes matching [WB][PNBRQK] are
        currently discarded (the original had a dead `if ...: pass`
        branch here). To emit them, extend tokens with move[0] and
        move[1] before the coordinate match — this increases sequence
        length.
        """
        tokens = []
        for move in text.strip().split():
            match = self.MOVE_PATTERN.search(move)
            if match:
                from_sq, to_sq, promotion = match.groups()
                tokens.append(from_sq)
                tokens.append(to_sq)
                if promotion:
                    tokens.append(promotion)
        return tokens
506
+
507
+
508
+
509
class SanitizedChessTokenizer(ChessTokenizer):
    """Move-level tokenizer that strips decoration around the pure UCI move.

    Rather than peeling prefixes (color/piece letters) and suffixes
    ((, ), x, +, *, o, O, E) character by character, a single regex pulls
    the square-to-square core — plus optional promotion piece — out of
    each raw move, which safely ignores all surrounding noise.
    """

    # Matches e.g. "e2e4" or "a7a8q" anywhere inside a decorated move string.
    MOVE_PATTERN = re.compile(r'([a-h][1-8][a-h][1-8][qrbn]?)')

    def _sanitize(self, text: str) -> str:
        """Return the pure move found in *text*, or the unk token if none."""
        found = self.MOVE_PATTERN.search(text)
        return found.group(1) if found else self.unk_token

    def _tokenize(self, text: str) -> List[str]:
        """One token per move; chunks with no recognizable move are skipped."""
        sanitized = (self._sanitize(chunk) for chunk in text.strip().split())
        return [move for move in sanitized if move != self.unk_token]

    @classmethod
    def build_vocab_from_iterator(cls, iterator, min_frequency: int = 1) -> "SanitizedChessTokenizer":
        """Build a frequency-filtered vocabulary from an iterator of games.

        NOTE(review): relies on PAD_TOKEN / BOS_TOKEN / EOS_TOKEN /
        UNK_TOKEN being defined on ChessTokenizer (or a parent) — confirm
        against the base class.
        """
        from collections import Counter

        counts = Counter()
        for game in iterator:
            # Count only the pure UCI part of each raw move.
            for raw in game.strip().split():
                hit = cls.MOVE_PATTERN.search(raw)
                if hit:
                    counts[hit.group(1)] += 1

        # Frequency filter, then deterministic (sorted) ordering.
        kept = sorted(tok for tok, cnt in counts.items() if cnt >= min_frequency)

        specials = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
        vocab = {tok: idx for idx, tok in enumerate(specials + kept)}

        return cls(vocab=vocab)