class CharTokenizer:
    """Character-level tokenizer mapping each distinct character to an integer id.

    Attributes are plain dicts:
      * ``stoi`` maps character -> id.
      * ``itos`` maps id -> character (the inverse of ``stoi``).

    Both start empty and are populated by :meth:`fit` or
    :meth:`from_state_dict`.
    """

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    @property
    def vocab_size(self) -> int:
        """Return the number of characters currently in the vocabulary."""
        return len(self.stoi)

    def fit(self, text: str) -> "CharTokenizer":
        """Build the vocabulary from the distinct characters of *text*.

        Ids are assigned by enumerating the characters in sorted order, and
        any previously fitted vocabulary is replaced.

        Returns:
            ``self``, so the call can be chained.
        """
        chars = sorted(set(text))
        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
        self.itos = {idx: ch for ch, idx in self.stoi.items()}
        return self

    def encode(self, text: str) -> list:
        """Return the list of ids for the characters of *text*.

        Characters that are not in the vocabulary are skipped rather than
        raising, so the result may be shorter than *text*.
        """
        stoi = self.stoi
        return [stoi[ch] for ch in text if ch in stoi]

    def decode(self, ids) -> str:
        """Return the string formed by the characters for *ids*.

        Each element is converted with ``int()`` before lookup; ids that are
        not in the vocabulary contribute an empty string.
        """
        itos = self.itos
        return "".join(itos.get(int(idx), "") for idx in ids)

    def state_dict(self) -> dict:
        """Return a serializable snapshot of the vocabulary.

        Returns:
            ``{"stoi": <copy of the character -> id mapping>}``.
        """
        # Hand out a copy: returning the live dict would let a caller mutate
        # ``stoi`` without ``itos`` being rebuilt, leaving the two out of sync.
        return {"stoi": dict(self.stoi)}

    @classmethod
    def from_state_dict(cls, state) -> "CharTokenizer":
        """Create a tokenizer from a mapping produced by :meth:`state_dict`.

        Args:
            state: Mapping with a ``"stoi"`` entry (character -> id).

        Raises:
            KeyError: If *state* has no ``"stoi"`` entry.
        """
        tok = cls()
        # Copy so the new tokenizer does not share storage with ``state``.
        tok.stoi = dict(state["stoi"])
        tok.itos = {idx: ch for ch, idx in tok.stoi.items()}
        return tok