File size: 820 Bytes
9e25bfd be8f2be 9fc7c74 be8f2be 9e25bfd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | from transformers import PreTrainedTokenizer
# Maps the name "AutoTokenizer" to the dotted "module.Class" path of the
# tokenizer class.
# NOTE(review): presumably read by the transformers Auto* loading machinery
# for custom (remote) code — confirm against the repo's config files.
AUTO_MAP = {"AutoTokenizer": "tokenization_makemore.MakemoreTokenizer"}
class MakemoreTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer over ``'.'`` and the lowercase letters a-z.

    ``'.'`` has id 0 and ``'a'``..``'z'`` have ids 1..26. Input text is
    lowercased and split into single characters. Any character that is not
    in the vocabulary is mapped to id 0, i.e. the same id as ``'.'``.
    """

    def __init__(self, **kwargs):
        """Build the fixed lookup tables, then initialise the base class.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The tables are assigned BEFORE super().__init__() is called.
        # NOTE(review): presumably because the base initialiser may query the
        # vocabulary (get_vocab / vocab_size) — keep this order unless verified.
        self._stoi = {'.': 0, **{chr(ord('a') + i): i + 1 for i in range(26)}}
        self._itos = {v: k for k, v in self._stoi.items()}
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary (27: ``'.'`` plus a-z)."""
        # Derived from the table so the two can never disagree.
        return len(self._stoi)

    def get_vocab(self) -> dict:
        """Return a copy of the token -> id mapping.

        A copy is returned so callers cannot mutate the internal table.
        """
        return dict(self._stoi)

    def _tokenize(self, text):
        """Lowercase ``text`` and split it into a list of single characters."""
        return list(text.lower())

    def _convert_token_to_id(self, token) -> int:
        """Return the id of ``token``; unknown tokens fall back to id 0."""
        return self._stoi.get(token, 0)

    def _convert_id_to_token(self, index) -> str:
        """Return the token for ``index``; unknown ids fall back to ``'.'``."""
        return self._itos.get(index, '.')

    def convert_tokens_to_string(self, tokens) -> str:
        """Concatenate single-character tokens back into a string.

        Overrides the base-class implementation, which (per the transformers
        documentation) joins tokens with a space. For a character-level
        vocabulary that would decode ``['e', 'm', 'm', 'a']`` as ``"e m m a"``
        instead of ``"emma"``.
        """
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None) -> tuple:
        """Write no files and return an empty tuple.

        The vocabulary is built in ``__init__`` rather than loaded from disk,
        so there is nothing to serialise here.

        Args:
            save_directory: Unused.
            filename_prefix: Unused.

        Returns:
            An empty tuple (no vocabulary files were written).
        """
        return ()
|