nanogpt-tinystories / char_tokenizer.py
ncncomplete's picture
Upload nanoGPT TinyStories model + char tokenizer
9f1960e verified
"""Character-level tokenizer for nanoGPT TinyStories model."""
import json, os
# Directory containing this module; vocab.json is read from the same place.
_dir = os.path.dirname(os.path.abspath(__file__))
_vocab_path = os.path.join(_dir, "vocab.json")

# Forward table loaded from JSON (presumably character -> integer id;
# the file's schema is not visible from this module).
with open(_vocab_path, "r", encoding="utf-8") as f:
    _vocab = json.load(f)

# Inverse table (value -> key) used for decoding. If two keys share a value,
# the key that appears later in the JSON object wins.
_ivocab = dict(zip(_vocab.values(), _vocab.keys()))
def encode(text: str) -> list[int]:
    """Convert *text* into a list of token ids, one per character.

    Characters missing from the vocabulary map to the id of ``"?"`` when
    the vocabulary has one, and to ``0`` otherwise.
    """
    # The fallback id is the same for every character, so resolve it once
    # instead of repeating the lookup inside the comprehension.
    unknown_id = _vocab.get("?", 0)
    return [_vocab.get(ch, unknown_id) for ch in text]
def decode(ids: list[int]) -> str:
    """Turn a sequence of token ids back into a string.

    Any id that has no entry in the inverse vocabulary is rendered as
    ``"?"``.
    """
    lookup = _ivocab.get
    pieces = [lookup(token_id, "?") for token_id in ids]
    return "".join(pieces)
# Vocabulary size exposed to callers. Hard-coded rather than derived from
# the loaded table.
# NOTE(review): presumably this should equal len(_vocab) -- confirm against
# the shipped vocab.json and the model's embedding size.
VOCAB_SIZE: int = 93