File size: 2,051 Bytes
0158205 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | """
Two tokenizers:
- BPETokenizer : wraps tiktoken (GPT-2 vocab, 50257 tokens) — default for multi-dataset training
- CharTokenizer : original character-level fallback
"""
import json
class BPETokenizer:
    """Thin wrapper around tiktoken's GPT-2 encoding.

    Attributes:
        vocab_size: Number of tokens reported by the encoding
            (50257 for GPT-2).
    """

    def __init__(self):
        # Imported here rather than at module level so this module can still
        # be imported (e.g. to use CharTokenizer) when tiktoken is missing.
        import tiktoken

        self._enc = tiktoken.get_encoding("gpt2")
        self.vocab_size = self._enc.n_vocab  # 50257

    def encode(self, text):
        """Return the token ids for *text* via tiktoken's ``encode_ordinary``."""
        return self._enc.encode_ordinary(text)

    def decode(self, ids):
        """Return the text decoded from the token ids in *ids*."""
        return self._enc.decode(ids)

    def save(self, path):
        """Write a small JSON descriptor identifying this tokenizer to *path*.

        Only the tokenizer kind and encoding name are stored; the vocabulary
        itself comes from tiktoken.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "bpe", "encoding": "gpt2"}, f)

    @classmethod
    def load(cls, path):
        """Build a tokenizer from a descriptor written by :meth:`save`.

        Raises:
            ValueError: if the JSON at *path* is not tagged ``"type": "bpe"``.
        """
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
        # Explicit raise instead of ``assert``: assertions are removed under
        # ``python -O``, which would let a non-BPE file load silently.
        if meta.get("type") != "bpe":
            raise ValueError("Not a BPE tokenizer file")
        return cls()
class CharTokenizer:
    """Character-level tokenizer: each distinct character gets one integer id.

    Attributes:
        chars: Sorted list of the distinct characters in the vocabulary.
        vocab_size: Number of entries in ``chars``.
        stoi: Mapping from character to id (its index in ``chars``).
        itos: Inverse mapping from id to character.
    """

    def __init__(self, text=None, chars=None):
        """Build the vocabulary from *chars* if given, otherwise from *text*.

        Args:
            text: String whose distinct characters form the vocabulary.
            chars: Iterable of characters to use as the vocabulary; takes
                precedence over *text* when both are supplied.

        Raises:
            ValueError: if neither *text* nor *chars* is provided.
        """
        if chars is not None:
            alphabet = chars
        elif text is not None:
            alphabet = text
        else:
            raise ValueError("Provide either text or chars")
        # Deduplicate on both paths: a repeated character in ``chars`` would
        # otherwise make vocab_size larger than the real mapping and leave
        # gaps in ``itos``.
        self.chars = sorted(set(alphabet))
        self.vocab_size = len(self.chars)
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}

    def encode(self, text):
        """Return the ids for *text*; characters outside the vocabulary are skipped."""
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids):
        """Return the string for *ids*; ids outside the vocabulary are skipped."""
        return "".join(self.itos.get(i, "") for i in ids)

    def save(self, path):
        """Write the vocabulary to *path* as JSON tagged ``"type": "char"``."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "char", "chars": self.chars}, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from the JSON file at *path*.

        A file tagged ``"type": "bpe"`` yields a ``BPETokenizer`` instead;
        any other file is expected to carry a ``"chars"`` list.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if data.get("type") == "bpe":
            return BPETokenizer()
        return cls(chars=data["chars"])
def load_tokenizer(path):
    """Read the JSON descriptor at *path* and return the matching tokenizer.

    A descriptor tagged ``"type": "bpe"`` produces a ``BPETokenizer``;
    anything else is treated as a character vocabulary and must contain a
    ``"chars"`` entry, which is handed to ``CharTokenizer``.
    """
    with open(path) as handle:
        spec = json.load(handle)
    kind = spec.get("type")
    if kind != "bpe":
        return CharTokenizer(chars=spec["chars"])
    return BPETokenizer()
|