| import sentencepiece as spm | |
| from transformers import PreTrainedTokenizer | |
class IndicTokenizer(PreTrainedTokenizer):
    """SentencePiece-backed tokenizer exposing the ``PreTrainedTokenizer`` API.

    Tokenisation and token/id lookups are delegated to a
    ``sentencepiece.SentencePieceProcessor`` loaded from ``vocab_file``.
    Tokens registered later through the base-class ``add_tokens`` machinery
    are reflected in :meth:`get_vocab` and ``len(tokenizer)``, while
    :attr:`vocab_size` reports the size of the SentencePiece model alone.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model and initialise the base tokenizer.

        Args:
            vocab_file: Path to the serialized SentencePiece model file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Remember where the model came from so it can be located again later.
        self.vocab_file = vocab_file
        # The processor must be ready before the base initialiser runs, since
        # the base class may query the vocabulary (e.g. when it registers
        # special tokens).
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model (added tokens excluded).

        Declared as a property because the base class defines ``vocab_size``
        as one; consumers read ``tokenizer.vocab_size`` as a plain ``int``.
        """
        return self.sp_model.get_piece_size()

    def get_vocab(self) -> dict:
        """Return the token -> id mapping, including tokens added at runtime."""
        vocab = {
            self.sp_model.id_to_piece(idx): idx
            for idx in range(self.sp_model.get_piece_size())
        }
        # Tokens added after loading live in the base class, not in sp_model.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __len__(self) -> int:
        """Return the full vocabulary size, counting added tokens once."""
        # Going through get_vocab() avoids double-counting added tokens whose
        # text already exists as a SentencePiece piece.
        return len(self.get_vocab())

    def _tokenize(self, text: str) -> list:
        """Split ``text`` into SentencePiece pieces (as strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a piece string to its SentencePiece id."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Map a SentencePiece id back to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens) -> str:
        """Join pieces back into text using the SentencePiece decoder.

        The base implementation joins tokens with spaces, which would leave
        the SentencePiece word-boundary markers in the decoded output.
        """
        return self.sp_model.decode(list(tokens))