"""Character-level tokenizer for nanoGPT TinyStories model.""" import json, os _dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(_dir, "vocab.json"), "r", encoding="utf-8") as f: _vocab = json.load(f) _ivocab = {v: k for k, v in _vocab.items()} def encode(text: str) -> list[int]: return [_vocab.get(ch, _vocab.get("?", 0)) for ch in text] def decode(ids: list[int]) -> str: return "".join(_ivocab.get(i, "?") for i in ids) VOCAB_SIZE = 93