File size: 335 Bytes
0c354cf
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
from tokenizers import Tokenizer

class FastTokenizer:
    """Thin wrapper around a `tokenizers.Tokenizer` that mimics
    `transformers.AutoTokenizer.tokenize()`: return the sub-word tokens
    of a text *without* any special tokens ([CLS], [SEP], ...).
    """

    def __init__(self, tokenizer_path: str):
        """Load a serialized tokenizer (a ``tokenizer.json`` file) from *tokenizer_path*."""
        self.tokenizer = Tokenizer.from_file(tokenizer_path)

    def tokenize(self, text: str) -> list:
        """Return the sub-word tokens of *text*, excluding special tokens.

        Encoding with ``add_special_tokens=False`` replaces the previous
        ``tokens[1:-1]`` slice, which assumed the post-processor adds exactly
        one leading and one trailing special token; when it adds none (or a
        different template), that slice silently dropped two real tokens.
        """
        return self.tokenizer.encode(text, add_special_tokens=False).tokens