from typing import Dict, List

import sentencepiece as spm
from transformers import PreTrainedTokenizer

# SentencePiece prefixes a piece with U+2581 when it was preceded by
# whitespace in the original text.
_SPIECE_UNDERLINE = "\u2581"


class IndicTokenizer(PreTrainedTokenizer):
    """Slow (pure-Python) tokenizer backed by a SentencePiece model.

    The SentencePiece model supplies the base vocabulary; tokens registered
    later through the ``PreTrainedTokenizer`` API (``add_tokens``,
    ``add_special_tokens``) are layered on top of it.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model and initialise the base tokenizer.

        Args:
            vocab_file: Path to a serialized SentencePiece model file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        # The base initialiser may query the vocabulary (e.g. to register
        # special tokens), so the model has to be loaded before it runs.
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self) -> Dict[str, int]:
        """Return the token -> id mapping, including added tokens."""
        vocab = {
            self.sp_model.id_to_piece(i): i
            for i in range(self.sp_model.get_piece_size())
        }
        # Without the added tokens the base class would compute a stale
        # "next free id" and hand the same id to successive additions.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __len__(self) -> int:
        """Size of the full vocabulary, added tokens included."""
        return len(self.get_vocab())

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into SentencePiece pieces."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a piece to its id in the SentencePiece model."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id in the SentencePiece model back to its piece."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join pieces back into text.

        The inherited default joins tokens with single spaces, which leaves
        the word-boundary markers in the output and inserts spaces inside
        words. Pieces are concatenated directly instead, and each marker is
        turned back into the space it stands for.
        """
        return "".join(tokens).replace(_SPIECE_UNDERLINE, " ").strip()