my-gpt-from-scratch / tokenizer.py
edgemindroboticslabs's picture
Upload tokenizer.py with huggingface_hub
0158205 verified
"""
Two tokenizers:
- BPETokenizer : wraps tiktoken (GPT-2 vocab, 50257 tokens) — default for multi-dataset training
- CharTokenizer : original character-level fallback
"""
import json
class BPETokenizer:
    """Thin wrapper around tiktoken's GPT-2 encoding.

    Attributes:
        vocab_size: Token count reported by the encoding's ``n_vocab``.
    """

    def __init__(self):
        # tiktoken is imported here rather than at module level, so it is
        # only required when a BPETokenizer is actually constructed.
        import tiktoken

        self._enc = tiktoken.get_encoding("gpt2")
        self.vocab_size = self._enc.n_vocab  # 50257

    def encode(self, text):
        """Return the token ids for *text* using tiktoken's ``encode_ordinary``."""
        return self._enc.encode_ordinary(text)

    def decode(self, ids):
        """Return the text that tiktoken decodes from the token ids *ids*."""
        return self._enc.decode(ids)

    def save(self, path):
        """Write a small JSON descriptor of this tokenizer to *path*.

        Only the tokenizer kind and encoding name are stored; no vocabulary
        data is written.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "bpe", "encoding": "gpt2"}, f)

    @classmethod
    def load(cls, path):
        """Build a tokenizer from a descriptor previously written by ``save``.

        Raises:
            ValueError: If the file's ``"type"`` field is not ``"bpe"``.
        """
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
        # Raise explicitly instead of using ``assert``: assertions are removed
        # under ``python -O``, which would let a non-BPE file load silently.
        if meta.get("type") != "bpe":
            raise ValueError("Not a BPE tokenizer file")
        return cls()
class CharTokenizer:
    """Character-level tokenizer: one integer id per distinct character.

    Attributes:
        chars: Sorted list of the distinct characters in the vocabulary.
        vocab_size: Number of entries in ``chars``.
        stoi: Mapping from character to its index in ``chars``.
        itos: Inverse mapping from index to character.
    """

    def __init__(self, text=None, chars=None):
        """Build the vocabulary from *chars* if given, otherwise from *text*.

        Args:
            text: A string whose distinct characters form the vocabulary.
            chars: An iterable of characters to use as the vocabulary;
                takes precedence over *text* when both are supplied.

        Raises:
            ValueError: If both *text* and *chars* are ``None``.
        """
        if chars is not None:
            symbols = chars
        elif text is not None:
            symbols = text
        else:
            raise ValueError("Provide either text or chars")
        # De-duplicate in both branches. With repeated entries in ``chars``
        # the list would be longer than ``stoi``, so ``vocab_size`` would
        # overcount and some ids would never be produced.
        self.chars = sorted(set(symbols))
        self.vocab_size = len(self.chars)
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}

    def encode(self, text):
        """Return the ids for *text*; characters not in the vocabulary are skipped."""
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids):
        """Return the string for *ids*; ids not in the vocabulary contribute nothing."""
        return "".join(self.itos.get(i, "") for i in ids)

    def save(self, path):
        """Write the vocabulary to *path* as JSON (``{"type": "char", "chars": [...]}``)."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"type": "char", "chars": self.chars}, f)

    @classmethod
    def load(cls, path):
        """Build a tokenizer from the JSON file at *path*.

        A file whose ``"type"`` is ``"bpe"`` yields a ``BPETokenizer``
        instead; any other file must provide a ``"chars"`` list.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if data.get("type") == "bpe":
            return BPETokenizer()
        return cls(chars=data["chars"])
def load_tokenizer(path):
    """Read the JSON descriptor at *path* and return the tokenizer it names.

    A descriptor whose ``"type"`` is ``"bpe"`` produces a ``BPETokenizer``;
    anything else is treated as a character vocabulary and must contain a
    ``"chars"`` entry, which is handed to ``CharTokenizer``.
    """
    with open(path) as handle:
        spec = json.load(handle)
    wants_bpe = spec.get("type") == "bpe"
    return BPETokenizer() if wants_bpe else CharTokenizer(chars=spec["chars"])