File size: 479 Bytes
9f1960e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
"""Character-level tokenizer for nanoGPT TinyStories model."""
import json, os

# Load the character -> id table from the vocab.json that sits next to this
# module, so the lookup works regardless of the current working directory.
# NOTE(review): assumes vocab.json is a flat object mapping single characters
# to integer ids -- confirm against the file.
_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(_dir, "vocab.json"), "r", encoding="utf-8") as f:
    _vocab = json.loads(f.read())

# Inverse table (id -> character) used by decode(); if two characters share an
# id, the one appearing later in vocab.json wins.
_ivocab = dict(zip(_vocab.values(), _vocab.keys()))

def encode(text: str) -> list[int]:
    """Convert *text* into a list of token ids, one id per character.

    Characters that are not present in the vocabulary are mapped to the id
    of ``"?"`` when the vocabulary defines one, and to ``0`` otherwise.

    Args:
        text: The string to tokenize; an empty string yields an empty list.

    Returns:
        A list with exactly ``len(text)`` integer ids.
    """
    # Resolve the fallback id once: as a default argument to dict.get it would
    # otherwise be re-evaluated for every single character.
    unknown_id = _vocab.get("?", 0)
    return [_vocab.get(ch, unknown_id) for ch in text]

def decode(ids: list[int]) -> str:
    """Convert a sequence of token ids back into a string.

    Each id is looked up in the inverse vocabulary; ids that have no entry
    are rendered as ``"?"``.

    Args:
        ids: Token ids to translate; an empty sequence yields ``""``.

    Returns:
        The concatenation of the characters for each id, in order.
    """
    pieces = [_ivocab.get(token_id, "?") for token_id in ids]
    return "".join(pieces)

# Hard-coded vocabulary size; it is not derived from the loaded vocab.json.
# NOTE(review): presumably this should equal len(_vocab) -- confirm against
# vocab.json and whatever model configuration consumes this value.
VOCAB_SIZE = 93