File size: 335 Bytes
0c354cf |
1 2 3 4 5 6 7 8 9 |
from tokenizers import Tokenizer
class FastTokenizer:
    """Thin wrapper around a `tokenizers.Tokenizer` that mimics the string-level
    output of `AutoTokenizer.tokenize()` (i.e. no special tokens in the result)."""

    def __init__(self, tokenizer_path):
        # Load a serialized fast tokenizer (tokenizer.json) from disk.
        self.tokenizer = Tokenizer.from_file(tokenizer_path)

    def tokenize(self, text):
        """Return the list of token strings for *text*, excluding special tokens.

        Mimics the behavior of `AutoTokenizer.tokenize()`.

        Fix: the previous implementation sliced `[1:-1]` to strip [CLS]/[SEP],
        which silently drops real tokens when the tokenizer's post-processor
        adds no (or differently placed) special tokens. Disabling special-token
        insertion at encode time is correct for any tokenizer configuration.
        """
        return self.tokenizer.encode(text, add_special_tokens=False).tokens