# File size: 841 Bytes
# 8d85ec0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import sentencepiece as spm
from transformers import PreTrainedTokenizer

class IndicTokenizer(PreTrainedTokenizer):
    """Slow tokenizer that delegates segmentation to a SentencePiece model.

    The SentencePiece model supplies the base vocabulary.  Tokens registered
    later through the ``PreTrainedTokenizer`` API (``add_tokens`` /
    ``add_special_tokens``) are tracked by the base class; they are included
    in ``get_vocab()`` and ``len(tokenizer)`` but not in ``vocab_size``.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model and initialise the base tokenizer.

        Args:
            vocab_file: Path to the serialized SentencePiece model file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The model is loaded *before* calling the base initializer so that
        # the vocabulary accessors below are usable if the base class
        # consults them while it sets itself up.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self) -> dict:
        """Return the full token -> id mapping, including added tokens."""
        vocab = {
            self.sp_model.id_to_piece(index): index
            for index in range(self.sp_model.get_piece_size())
        }
        # Tokens added after loading live in the base class, not in the
        # SentencePiece model; merge them so lookups through get_vocab()
        # agree with convert_tokens_to_ids().
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __len__(self) -> int:
        """Return the size of the full vocabulary, added tokens included."""
        # Deliberately NOT the raw piece count: callers commonly size the
        # embedding matrix with len(tokenizer), which must account for added
        # tokens.  The base class already keeps that total.
        return super().__len__()

    def _tokenize(self, text: str) -> list:
        """Split ``text`` into SentencePiece pieces (as strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a piece string to its id in the SentencePiece model."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id in the SentencePiece model back to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens) -> str:
        """Join a sequence of pieces back into plain text.

        Runs of ordinary pieces are detokenized by SentencePiece so that the
        word-boundary markers are turned back into spaces.  Special tokens are
        emitted verbatim, because SentencePiece does not know tokens that
        were registered only on the tokenizer.
        """
        special_tokens = set(self.all_special_tokens)
        text_parts = []
        pending_pieces = []
        for token in tokens:
            if token in special_tokens:
                # Flush the ordinary pieces collected so far, then keep the
                # special token exactly as written.
                if pending_pieces:
                    text_parts.append(self.sp_model.decode_pieces(pending_pieces))
                    pending_pieces = []
                text_parts.append(token)
            else:
                pending_pieces.append(token)
        if pending_pieces:
            text_parts.append(self.sp_model.decode_pieces(pending_pieces))
        return "".join(text_parts)