| """Character-level tokenizer for nanoGPT TinyStories model.""" | |
| import json, os | |
# Directory containing this module; vocab.json is resolved relative to it
# rather than to the current working directory.
_dir = os.path.dirname(os.path.abspath(__file__))
_vocab_path = os.path.join(_dir, "vocab.json")

# Forward table loaded from JSON. encode() looks up individual characters
# in it, so keys are presumably single characters mapped to integer ids
# (verify against vocab.json).
with open(_vocab_path, "r", encoding="utf-8") as f:
    _vocab = json.load(f)

# Inverse table (value -> key) used by decode(). If two keys share a value,
# the key that appears later in _vocab wins.
_ivocab = dict(zip(_vocab.values(), _vocab.keys()))
def encode(text: str) -> list[int]:
    """Convert *text* to a list of token ids, one per character.

    Each character is looked up in the module-level ``_vocab`` table.
    Characters missing from the table are replaced by the id stored for
    ``"?"``, or by ``0`` when ``"?"`` itself is not in the table.

    Args:
        text: The string to tokenize. An empty string yields an empty list.

    Returns:
        A list with exactly ``len(text)`` ids.
    """
    # The fallback id does not depend on the character, so resolve it once
    # instead of re-evaluating it as a default argument on every lookup.
    unknown_id = _vocab.get("?", 0)
    return [_vocab.get(ch, unknown_id) for ch in text]
def decode(ids: list[int]) -> str:
    """Convert a sequence of token ids back into a string.

    Each id is looked up in the module-level ``_ivocab`` table (the inverse
    of ``_vocab``). Ids that are not in the table are rendered as ``"?"``.

    Args:
        ids: Token ids to convert. An empty sequence yields ``""``.

    Returns:
        The concatenation of the looked-up entries, in input order.
    """
    pieces = []
    for token_id in ids:
        pieces.append(_ivocab.get(token_id, "?"))
    return "".join(pieces)
# Hard-coded vocabulary size; it is not derived from the loaded vocab.json.
# NOTE(review): presumably this must match the model's vocabulary dimension;
# confirm it agrees with the number of entries in vocab.json.
VOCAB_SIZE: int = 93