| """ |
| Two tokenizers: |
| - BPETokenizer : wraps tiktoken (GPT-2 vocab, 50257 tokens) — default for multi-dataset training |
| - CharTokenizer : original character-level fallback |
| """ |
| import json |
|
|
|
|
class BPETokenizer:
    """Thin wrapper around tiktoken's GPT-2 encoding.

    Attributes:
        vocab_size: Token count reported by the encoding's ``n_vocab``.
    """

    def __init__(self):
        # Imported here rather than at module level, so tiktoken is only
        # needed when a BPETokenizer is actually constructed.
        import tiktoken

        self._enc = tiktoken.get_encoding("gpt2")
        self.vocab_size = self._enc.n_vocab

    def encode(self, text):
        """Return the token ids for *text* using ``encode_ordinary``."""
        return self._enc.encode_ordinary(text)

    def decode(self, ids):
        """Return the text for the token ids in *ids*."""
        return self._enc.decode(ids)

    def save(self, path):
        """Write a small JSON marker file to *path*.

        Only the tokenizer type and encoding name are stored; the vocabulary
        itself is not written.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "bpe", "encoding": "gpt2"}, f)

    @classmethod
    def load(cls, path):
        """Check that *path* describes a BPE tokenizer and return a new one.

        Raises:
            AssertionError: If the file's ``"type"`` field is not ``"bpe"``.
        """
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
        # Raised explicitly instead of through an ``assert`` statement so the
        # check still runs under ``python -O``; the exception type and message
        # are unchanged for existing callers.
        if meta.get("type") != "bpe":
            raise AssertionError("Not a BPE tokenizer file")
        return cls()
|
|
|
|
class CharTokenizer:
    """Character-level tokenizer: each distinct character gets one integer id.

    Ids are the positions of the characters in the sorted vocabulary list.

    Attributes:
        chars: Sorted list of the distinct characters in the vocabulary.
        vocab_size: Number of entries in ``chars``.
        stoi: Mapping from character to id.
        itos: Mapping from id to character.
    """

    def __init__(self, text=None, chars=None):
        """Build the vocabulary from *chars* or, failing that, from *text*.

        Args:
            text: String whose distinct characters form the vocabulary.
                Only consulted when *chars* is None.
            chars: Iterable of characters; takes precedence over *text*.

        Raises:
            ValueError: If both *text* and *chars* are None.
        """
        if chars is not None:
            source = chars
        elif text is not None:
            source = text
        else:
            raise ValueError("Provide either text or chars")

        # Deduplicate in both branches. Without this, duplicate entries in
        # *chars* made vocab_size larger than the stoi/itos mappings and left
        # some ids in range(vocab_size) that decode to nothing.
        self.chars = sorted(set(source))
        self.vocab_size = len(self.chars)
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}

    def encode(self, text):
        """Return the ids for *text*; characters not in the vocabulary are skipped."""
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids):
        """Return the string for *ids*; ids not in the vocabulary are skipped."""
        return "".join(self.itos.get(i, "") for i in ids)

    def save(self, path):
        """Write the vocabulary to *path* as JSON, tagged with type ``"char"``."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "char", "chars": self.chars}, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from the JSON file at *path*.

        If the file is tagged with type ``"bpe"``, a ``BPETokenizer`` is
        returned instead of an instance of this class.

        Raises:
            KeyError: If the file is not tagged ``"bpe"`` and has no
                ``"chars"`` entry.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if data.get("type") == "bpe":
            return BPETokenizer()
        return cls(chars=data["chars"])
|
|
|
|
def load_tokenizer(path):
    """Return the tokenizer described by the JSON file at *path*.

    A file tagged with type ``"bpe"`` yields a ``BPETokenizer``; any other
    file is treated as a character vocabulary and yields a ``CharTokenizer``
    built from its ``"chars"`` entry.

    Raises:
        KeyError: If the file is not tagged ``"bpe"`` and has no ``"chars"``
            entry.
    """
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # text encoding, which differs between platforms.
    with open(path, encoding="utf-8") as f:
        meta = json.load(f)
    if meta.get("type") == "bpe":
        return BPETokenizer()
    return CharTokenizer(chars=meta["chars"])
|
|