# File size: 2,051 Bytes
"""
Two tokenizers:
  - BPETokenizer  : wraps tiktoken (GPT-2 vocab, 50257 tokens) — default for multi-dataset training
  - CharTokenizer : original character-level fallback
"""
import json


class BPETokenizer:
    """Thin wrapper around a tiktoken encoding (GPT-2 by default).

    Attributes:
        encoding_name: Name handed to ``tiktoken.get_encoding``.
        vocab_size: The encoding's ``n_vocab`` (50257 for "gpt2").
    """

    def __init__(self, encoding="gpt2"):
        """Build the tokenizer for the named tiktoken encoding.

        Args:
            encoding: tiktoken encoding name. Defaults to "gpt2".
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require tiktoken.
        import tiktoken

        self.encoding_name = encoding
        self._enc = tiktoken.get_encoding(encoding)
        self.vocab_size = self._enc.n_vocab  # 50257 for "gpt2"

    def encode(self, text):
        """Return the token ids for *text* via ``encode_ordinary``.

        NOTE(review): per tiktoken's docs ``encode_ordinary`` does not give
        special-token strings any special treatment — confirm if that matters.
        """
        return self._enc.encode_ordinary(text)

    def decode(self, ids):
        """Return the text that the token ids *ids* decode to."""
        return self._enc.decode(ids)

    def save(self, path):
        """Write a small JSON descriptor (type + encoding name) to *path*.

        Only metadata is stored; the vocabulary itself comes from tiktoken.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "bpe", "encoding": self.encoding_name}, f)

    @classmethod
    def load(cls, path):
        """Recreate a tokenizer from a descriptor written by :meth:`save`.

        Args:
            path: Path to the JSON descriptor.

        Returns:
            A new instance using the encoding named in the file, or "gpt2"
            when the file does not name one.

        Raises:
            ValueError: If the file's "type" is not "bpe".
        """
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
        # An explicit raise (not ``assert``) so the check survives ``python -O``.
        if meta.get("type") != "bpe":
            raise ValueError("Not a BPE tokenizer file")
        return cls(meta.get("encoding", "gpt2"))


class CharTokenizer:
    """Character-level tokenizer over a sorted character vocabulary.

    Attributes:
        chars: Sorted list of vocabulary characters.
        vocab_size: ``len(chars)``.
        stoi: Mapping from character to integer id.
        itos: Mapping from integer id back to character.
    """

    def __init__(self, text=None, chars=None):
        """Build the vocabulary from an explicit character list or from text.

        Args:
            text: Corpus whose distinct characters form the vocabulary.
                Used only when *chars* is None.
            chars: Iterable of characters to use as the vocabulary. It is
                sorted but otherwise taken as given (not deduplicated).

        Raises:
            ValueError: If both *text* and *chars* are None.
        """
        if chars is not None:
            self.chars = sorted(chars)
        elif text is not None:
            self.chars = sorted(set(text))
        else:
            raise ValueError("Provide either text or chars")

        self.vocab_size = len(self.chars)
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}

    def encode(self, text):
        """Return the ids for *text*; characters outside the vocabulary are skipped."""
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids):
        """Return the string for *ids*; ids with no known character are skipped."""
        return "".join(self.itos.get(i, "") for i in ids)

    def save(self, path):
        """Write the vocabulary to *path* as JSON (``type`` = "char")."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "char", "chars": self.chars}, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file written by a tokenizer's ``save``.

        Args:
            path: Path to the JSON file.

        Returns:
            A BPETokenizer when the file's "type" is "bpe"; otherwise an
            instance of this class built from the file's "chars" list.

        Raises:
            KeyError: If the file is not a BPE file and has no "chars" entry.
        """
        # Explicit UTF-8 so vocabularies with non-ASCII characters load the
        # same way regardless of the platform's default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if data.get("type") == "bpe":
            # Delegate so BPE files are interpreted by BPETokenizer itself.
            return BPETokenizer.load(path)
        return cls(chars=data["chars"])


def load_tokenizer(path):
    """Load whichever tokenizer was saved at *path*.

    Args:
        path: Path to a JSON file written by ``BPETokenizer.save`` or
            ``CharTokenizer.save``.

    Returns:
        A BPETokenizer when the file's "type" is "bpe"; otherwise a
        CharTokenizer built from the file's "chars" list.

    Raises:
        KeyError: If the file is not a BPE file and has no "chars" entry.
    """
    # Explicit UTF-8 so the file reads the same regardless of the platform's
    # default encoding.
    with open(path, encoding="utf-8") as f:
        meta = json.load(f)
    if meta.get("type") == "bpe":
        # Delegate so BPE files are interpreted by BPETokenizer itself.
        return BPETokenizer.load(path)
    return CharTokenizer(chars=meta["chars"])