HY-2012's picture
First commit
0c354cf verified
from tokenizers import Tokenizer
class FastTokenizer:
def __init__(self, tokenizer_path):
self.tokenizer = Tokenizer.from_file(tokenizer_path)
def tokenize(self, text):
"""完全模拟 AutoTokenizer.tokenize() 的行为"""
return self.tokenizer.encode(text).tokens[1:-1] # 去掉[CLS]和[SEP]