| from tokenizers import Tokenizer | |
class FastTokenizer:
    """Thin wrapper around a ``tokenizers.Tokenizer`` loaded from a file.

    Exposes a ``tokenize`` method intended to emulate the behavior of
    ``AutoTokenizer.tokenize()``: it returns the token strings for a text
    without any special tokens.
    """

    def __init__(self, tokenizer_path: str):
        """Load the tokenizer definition from *tokenizer_path*.

        Args:
            tokenizer_path: Path passed directly to ``Tokenizer.from_file``.
        """
        self.tokenizer = Tokenizer.from_file(tokenizer_path)

    def tokenize(self, text: str) -> list:
        """Return the token strings for *text*, without special tokens.

        Args:
            text: The input string to tokenize.

        Returns:
            A list of token strings produced by the underlying tokenizer.
        """
        # Ask the tokenizer not to add special tokens at all instead of
        # slicing off the first and last token afterwards: slicing assumes
        # exactly one leading and one trailing special token (e.g. [CLS] /
        # [SEP]) and silently drops real tokens when the tokenizer's
        # post-processor adds none, or a different number of them.
        encoding = self.tokenizer.encode(text, add_special_tokens=False)
        return encoding.tokens