Update tokenizer.py
tokenizer.py · CHANGED · +83 -57
@@ -54,72 +54,98 @@ class ChessTokenizer(PreTrainedTokenizer):

Removed: the previous bodies of `_tokenize`, `_convert_id_to_token`, and `convert_tokens_to_string`, along with the single-line `save_vocabulary` signature. The removed lines include:

```python
        return ""
        return token.replace("_f", "").replace("_t", "")
        tokens = [self._convert_id_to_token(i) for i in ids]
        return self.convert_tokens_to_string(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
```
After the change, the updated `ChessTokenizer` methods read:

```python
    @property
    def vocab_size(self) -> int:
        """Return the size of the vocabulary."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary as a dictionary."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize a string of moves into a list of tokens.

        Args:
            text: A string of space-separated moves.

        Returns:
            List of move tokens.
        """
        return text.strip().split()

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to its ID."""
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a list of tokens back to a string."""
        # Filter out special tokens for cleaner output
        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        return " ".join(t for t in tokens if t not in special)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> tuple:
        """
        Save the vocabulary to a JSON file.

        Args:
            save_directory: Directory to save the vocabulary.
            filename_prefix: Optional prefix for the filename.

        Returns:
            Tuple containing the path to the saved vocabulary file.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
```
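For orientation, here is a minimal usage sketch of the updated methods. It is hypothetical: the `ChessTokenizer` constructor and the import path are outside this diff, so the no-argument construction, the module name, and the example move string are assumptions for illustration only.

```python
# Hypothetical usage sketch; the constructor is not part of this diff, so the
# no-argument ChessTokenizer() call and the import path are assumptions.
from tokenizer import ChessTokenizer  # assumed module path

tokenizer = ChessTokenizer()  # assumed to build self._vocab / self._ids_to_tokens

game = "e2e4 e7e5 g1f3 b8c6"  # example space-separated move string

# tokenize() dispatches to _tokenize(), which simply splits on whitespace.
tokens = tokenizer.tokenize(game)

# convert_tokens_to_ids() uses _convert_token_to_id(); unknown moves map to the UNK id.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Round trip: convert_tokens_to_string() drops PAD/BOS/EOS/UNK before joining.
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids)))

# Persist the vocabulary; with a prefix this writes "<dir>/chess-vocab.json".
print(tokenizer.save_vocabulary("./chess_tokenizer", filename_prefix="chess"))
```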
The update also adds a module-level helper:

```python
def count_vocab_from_dataset(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
    max_samples: Optional[int] = 10000,
) -> Dict[str, int]:
    """
    Count token frequencies in a dataset (useful for vocabulary analysis).

    Args:
        dataset_name: Name of the dataset on Hugging Face Hub.
        split: Dataset split to use.
        column: Column containing the game strings.
        max_samples: Maximum number of samples to process.

    Returns:
        Dictionary mapping tokens to their frequencies.
    """
    from collections import Counter
    from datasets import load_dataset

    dataset = load_dataset(dataset_name, split=split)

    if max_samples is not None:
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    token_counts = Counter()

    for example in dataset:
        moves = example[column].strip().split()
        token_counts.update(moves)

    return dict(token_counts)
```
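A possible way to exercise the new helper, assuming the `datasets` package is installed and the default Hugging Face dataset is reachable (the import path is again an assumption):

```python
from collections import Counter

from tokenizer import count_vocab_from_dataset  # assumed import path

# Count move frequencies over a small sample of the default Lichess dump.
counts = count_vocab_from_dataset(max_samples=1000)

# Counter.most_common() gives a quick view of the most frequent moves.
for move, n in Counter(counts).most_common(20):
    print(f"{move}\t{n}")

print(f"distinct tokens in sample: {len(counts)}")
```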