| from transformers import PreTrainedTokenizer | |
# Maps an auto-class name to the "<module>.<class>" path of the tokenizer
# defined in this file.
# NOTE(review): presumably read by Hugging Face auto-class loading for custom
# code — confirm against whatever consumes AUTO_MAP.
AUTO_MAP = dict(
    AutoTokenizer="tokenization_makemore.MakemoreTokenizer",
)
class MakemoreTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer over ``'.'`` and the lowercase letters a-z.

    The vocabulary is fixed and built in code: ``'.'`` has id 0 and
    ``'a'``..``'z'`` have ids 1..26. Text is lowercased and split into single
    characters; any character outside the vocabulary is encoded as id 0.
    """

    def __init__(self, **kwargs) -> None:
        """Build the fixed lookup tables, then run the base constructor.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The tables are assigned before the base constructor runs (original
        # order preserved).
        # NOTE(review): presumably so the vocabulary is available if the base
        # constructor queries it — confirm before reordering.
        self._stoi = {'.': 0, **{chr(ord('a') + i): i + 1 for i in range(26)}}
        self._itos = {v: k for k, v in self._stoi.items()}
        super().__init__(**kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of entries in the fixed vocabulary (27).

        Exposed as a property because the base class declares ``vocab_size``
        as a property; as a plain method, ``tokenizer.vocab_size`` would
        evaluate to a bound method instead of an integer.
        """
        # Derived from the table so the size cannot drift from the vocabulary.
        return len(self._stoi)

    def get_vocab(self) -> dict:
        """Return a copy of the token-to-id mapping."""
        # Copy so callers cannot mutate the tokenizer's internal table.
        return dict(self._stoi)

    def _tokenize(self, text: str) -> list:
        """Lowercase ``text`` and split it into a list of single characters."""
        return list(text.lower())

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id for ``token``; tokens not in the vocabulary map to 0."""
        return self._stoi.get(token, 0)

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``; ids not in the vocabulary map to ``'.'``."""
        return self._itos.get(index, '.')

    def save_vocabulary(self, save_directory, filename_prefix=None) -> tuple:
        """Write nothing and return an empty tuple.

        The vocabulary is constructed in ``__init__`` rather than loaded from
        a file, so both arguments are ignored.
        """
        return ()