Update tokenizer.py

tokenizer.py  CHANGED  (+64 −84)
@@ -7,8 +7,7 @@ import torch
 
 class ChessTokenizer(PreTrainedTokenizer):
     """
-
-    Vocabulary size: 149 (4 special + 12 pieces + 64 from_sq + 64 to_sq + 5 suffix)
+    vocab size: 149 (4 special + 12 pieces + 64 from_sq + 64 to_sq + 5 suffix)
     """
 
     model_input_names = ["input_ids", "attention_mask"]
@@ -54,98 +53,79 @@ class ChessTokenizer(PreTrainedTokenizer):
 
     @property
     def vocab_size(self) -> int:
-        """Return the size of the vocabulary."""
         return len(self._vocab)
-
+
     def get_vocab(self) -> Dict[str, int]:
-        """Return the vocabulary as a dictionary."""
         return dict(self._vocab)
-
+
     def _tokenize(self, text: str) -> List[str]:
-
-
-
-
-
-
-
-
-
-
-
+        tokens = []
+        parts = text.strip().split()
+        for part in parts:
+            if part in self._vocab:
+                tokens.append(part)
+            elif len(part) >= 6:
+                piece, f_sq, t_sq = part[:2], part[2:4] + "_f", part[4:6] + "_t"
+                if piece in self._vocab: tokens.append(piece)
+                if f_sq in self._vocab: tokens.append(f_sq)
+                if t_sq in self._vocab: tokens.append(t_sq)
+                if len(part) > 6 and part[6:] in self.suffixes:
+                    tokens.append(part[6:])
+        return tokens
+
+    def _convert_id_to_token(self, index: int) -> str:
+        token = self._ids_to_tokens.get(index, self.UNK_TOKEN)
+        if token in [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]:
+            return ""
+        return token.replace("_f", "").replace("_t", "")
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        res = []
+        for t in tokens:
+            if not t: continue
+            # a piece token starts a new move, so add a space
+            if len(t) == 2 and (t.startswith('W') or t.startswith('B')):
+                res.append(" " + t)
+            else:
+                res.append(t)
+        return "".join(res).strip()
     def _convert_token_to_id(self, token: str) -> int:
-
-        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))
-
+        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))
     def _convert_id_to_token(self, index: int) -> str:
-
-
-
+        token = self._ids_to_tokens.get(index, self.UNK_TOKEN)
+        if token in [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]:
+            return ""
+        if token in self.suffixes:
+            return token
+        return token.replace("_f", "").replace("_t", "")
+
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        ""
-
-
-
-
-
-
-
-
-
-
-
-
-
-            save_directory: Directory to save the vocabulary.
-            filename_prefix: Optional prefix for the filename.
-
-        Returns:
-            Tuple containing the path to the saved vocabulary file.
-        """
+        return "".join([t for t in tokens if t])
+
+    def decode(self, token_ids, skip_special_tokens=True, **kwargs) -> str:
+        if hasattr(token_ids, "tolist"):
+            ids = token_ids.tolist()
+        elif isinstance(token_ids, (int, torch.LongTensor, torch.IntTensor)):
+            ids = [int(token_ids)] if isinstance(token_ids, int) else token_ids.tolist()
+        else:
+            ids = token_ids
+
+        tokens = [self._convert_id_to_token(i) for i in ids]
+        return self.convert_tokens_to_string(tokens)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory, exist_ok=True)
-
-        vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
-        )
-
+        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json")
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self._vocab, f, ensure_ascii=False, indent=2)
-
         return (vocab_file,)
 
-
-    def
-
-
-
-
-
-
-        Count token frequencies in a dataset (useful for vocabulary analysis).
-
-        Args:
-            dataset_name: Name of the dataset on Hugging Face Hub.
-            split: Dataset split to use.
-            column: Column containing the game strings.
-            max_samples: Maximum number of samples to process.
-
-        Returns:
-            Dictionary mapping tokens to their frequencies.
-        """
-        from collections import Counter
-        from datasets import load_dataset
-
-        dataset = load_dataset(dataset_name, split=split)
-
-        if max_samples is not None:
-            dataset = dataset.select(range(min(max_samples, len(dataset))))
-
-        token_counts = Counter()
-
-        for example in dataset:
-            moves = example[column].strip().split()
-            token_counts.update(moves)
-
-        return dict(token_counts)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "ChessTokenizer":
+        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
+        if not os.path.exists(vocab_file):
+            return cls()
+        with open(vocab_file, "r", encoding="utf-8") as f:
+            vocab = json.load(f)
+        return cls(vocab=vocab, **kwargs)
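Taken together, the new _tokenize encodes each whitespace-separated move as a two-character piece code, a from-square tagged "_f", a to-square tagged "_t", and an optional suffix, which matches the 4 + 12 + 64 + 64 + 5 = 149 vocabulary breakdown in the class docstring. A minimal round-trip sketch, assuming the default constructor builds that vocabulary and that "WP"/"BP" and "e2"/"e4"-style codes are the actual piece and square names (the move strings below are illustrative, not taken from the commit):

# Hypothetical usage sketch; move format assumed to be piece + from + to (+ suffix).
from tokenizer import ChessTokenizer

tok = ChessTokenizer()

tokens = tok._tokenize("WPe2e4 BPe7e5")
# from- and to-squares are distinct vocab entries:
# ["WP", "e2_f", "e4_t", "BP", "e7_f", "e5_t"]

ids = [tok._convert_token_to_id(t) for t in tokens]
text = tok.decode(ids)
# Note: the class body now defines convert_tokens_to_string twice, and Python
# keeps the later plain "".join version, so decode() returns "WPe2e4BPe7e5"
# rather than the space-separated string the earlier (shadowed) definition
# with the piece-token spacing logic would produce.

Both _convert_id_to_token definitions map special tokens to the empty string, so decode() effectively always skips special tokens regardless of the skip_special_tokens flag it accepts.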
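The decode override normalizes its input before converting ids: anything with a tolist() method (e.g. a torch.Tensor), a bare int, or a plain list. A short sketch exercising all three branches, with ids derived from the same hypothetical move format as above:

# Sketch of the input types decode() accepts; the move string is illustrative.
import torch
from tokenizer import ChessTokenizer

tok = ChessTokenizer()
ids = [tok._convert_token_to_id(t) for t in tok._tokenize("WNg1f3")]

print(tok.decode(ids))                # plain list: falls through to the else branch
print(tok.decode(torch.tensor(ids)))  # tensor: taken by the hasattr(..., "tolist") branch
print(tok.decode(ids[0]))             # single int: the isinstance branch wraps it in a list

The isinstance check against torch.LongTensor/torch.IntTensor is redundant in practice: any tensor already has a tolist() method, so the first branch catches it before the elif is reached.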