# File size: 841 Bytes
# 8d85ec0
import sentencepiece as spm
from transformers import PreTrainedTokenizer
class IndicTokenizer(PreTrainedTokenizer):
    """Slow (pure-Python) tokenizer backed by a SentencePiece model.

    The SentencePiece model supplies the base vocabulary; tokens registered
    later through the ``PreTrainedTokenizer`` add-token APIs are layered on
    top of it and are reflected in :meth:`get_vocab` and ``len(tokenizer)``.
    :attr:`vocab_size` always reports the base SentencePiece size only.

    NOTE(review): ``save_vocabulary`` / ``vocab_files_names`` are not defined
    here, so ``save_pretrained`` / ``from_pretrained`` round-trips presumably
    will not work until the on-disk filename convention is decided.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model at *vocab_file* and initialise the base class.

        Args:
            vocab_file: Path to a serialized SentencePiece model file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The model is loaded *before* delegating to the base class: the base
        # initialiser may already query the vocabulary (e.g. while registering
        # special tokens), which requires ``sp_model`` to exist.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self) -> dict:
        """Return the full token -> id mapping, including added tokens."""
        vocab = {
            self.sp_model.id_to_piece(index): index
            for index in range(self.sp_model.get_piece_size())
        }
        # Added tokens must be visible here: the base class derives the ids of
        # newly added tokens from this mapping, so omitting them can make
        # successive additions collide on the same id.
        # ``getattr`` guards against the attribute not existing yet if this is
        # called part-way through base-class initialisation.
        vocab.update(getattr(self, "added_tokens_encoder", {}))
        return vocab

    def __len__(self) -> int:
        """Size of the full vocabulary, i.e. base pieces plus added tokens."""
        # Derived from get_vocab() so the two can never disagree; added tokens
        # that already exist in the base model are not counted twice.
        return len(self.get_vocab())

    def _tokenize(self, text):
        """Split *text* into SentencePiece piece strings."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Map a piece string to its id in the SentencePiece model."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Map an id in the SentencePiece model to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens) -> str:
        """Join piece strings back into plain text.

        Without this override the base class joins pieces with single spaces,
        which leaves SentencePiece word-boundary markers in decoded output.
        """
        return self.sp_model.decode_pieces(tokens)
|