indic-tokenizer-v2-2 / tokenizer.py
PraneetNS's picture
Upload folder using huggingface_hub
8d85ec0 verified
import sentencepiece as spm
from transformers import PreTrainedTokenizer
class IndicTokenizer(PreTrainedTokenizer):
    """Slow tokenizer that delegates segmentation to a SentencePiece model.

    The SentencePiece model supplies the base vocabulary; tokens registered
    later through the ``PreTrainedTokenizer`` API are layered on top of it and
    are reflected in ``get_vocab()`` and ``len(tokenizer)``.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model, then initialise the base tokenizer.

        Args:
            vocab_file: Path to the serialized SentencePiece model file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The model is loaded before the base initialiser runs so the
        # vocabulary accessors below are usable if the base class calls them
        # while it registers special tokens.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return the full token -> id mapping, including added tokens."""
        vocab = {
            self.sp_model.id_to_piece(index): index
            for index in range(self.sp_model.get_piece_size())
        }
        # Without this, tokens registered via add_tokens()/add_special_tokens()
        # would be invisible to callers inspecting the vocabulary.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __len__(self):
        """Return the size of the full vocabulary, added tokens included.

        Reporting only the SentencePiece piece count would under-size
        embedding matrices resized with ``len(tokenizer)`` after tokens are
        added. Going through ``get_vocab()`` counts each distinct token once,
        even when an added token duplicates an existing piece.
        """
        return len(self.get_vocab())

    def _tokenize(self, text):
        """Split ``text`` into a list of SentencePiece piece strings."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Map a piece string to its id in the SentencePiece model."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Map an id in the SentencePiece model back to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens):
        """Merge a sequence of piece strings back into plain text.

        Decoding through the SentencePiece model removes the word-boundary
        markers and re-joins sub-word pieces, which a plain string join of the
        pieces would not do.
        """
        return self.sp_model.decode_pieces(list(tokens))