""" Two tokenizers: - BPETokenizer : wraps tiktoken (GPT-2 vocab, 50257 tokens) — default for multi-dataset training - CharTokenizer : original character-level fallback """ import json class BPETokenizer: """Thin wrapper around tiktoken's GPT-2 encoding.""" def __init__(self): import tiktoken self._enc = tiktoken.get_encoding("gpt2") self.vocab_size = self._enc.n_vocab # 50257 def encode(self, text): return self._enc.encode_ordinary(text) def decode(self, ids): return self._enc.decode(ids) def save(self, path): with open(path, "w") as f: json.dump({"type": "bpe", "encoding": "gpt2"}, f) @classmethod def load(cls, path): with open(path) as f: meta = json.load(f) assert meta.get("type") == "bpe", "Not a BPE tokenizer file" return cls() class CharTokenizer: def __init__(self, text=None, chars=None): if chars is not None: self.chars = sorted(chars) elif text is not None: self.chars = sorted(set(text)) else: raise ValueError("Provide either text or chars") self.vocab_size = len(self.chars) self.stoi = {ch: i for i, ch in enumerate(self.chars)} self.itos = {i: ch for ch, i in self.stoi.items()} def encode(self, text): return [self.stoi[ch] for ch in text if ch in self.stoi] def decode(self, ids): return "".join(self.itos.get(i, "") for i in ids) def save(self, path): with open(path, "w") as f: json.dump({"type": "char", "chars": self.chars}, f) @classmethod def load(cls, path): with open(path) as f: data = json.load(f) if data.get("type") == "bpe": return BPETokenizer() return cls(chars=data["chars"]) def load_tokenizer(path): with open(path) as f: meta = json.load(f) if meta.get("type") == "bpe": return BPETokenizer() return CharTokenizer(chars=meta["chars"])