class CharTokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    The vocabulary is the sorted set of characters in the text passed to the
    constructor; each character's token id is its position in that sorted
    order.

    Attributes:
        chars: Sorted list of the distinct vocabulary characters.
        stoi: Mapping from character to integer token id.
        itos: Mapping from integer token id back to its character.
    """

    def __init__(self, text: str):
        """Build the vocabulary from *text*.

        Args:
            text: Source text whose distinct characters form the vocabulary.

        Raises:
            ValueError: If *text* is empty, since no vocabulary can be built.
        """
        chars = sorted(set(text))
        if not chars:
            raise ValueError("Tokenizer cannot be built from empty text.")
        self.chars = chars
        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
        self.itos = dict(enumerate(chars))

    @property
    def vocab_size(self) -> int:
        """Return the number of distinct characters in the vocabulary."""
        return len(self.chars)

    def encode(self, text: str, *, strict: bool = False) -> list[int]:
        """Convert *text* into a list of token ids.

        Args:
            text: Text to encode.
            strict: When false (the default), characters that are not in the
                vocabulary are skipped, so the result may be shorter than
                *text*. When true, such characters raise instead.

        Returns:
            Token ids of the in-vocabulary characters, in input order.

        Raises:
            ValueError: If *strict* is true and *text* contains characters
                outside the vocabulary.
        """
        stoi = self.stoi
        if strict:
            # Report every offending character at once, in a stable order.
            unknown = sorted(set(text).difference(stoi))
            if unknown:
                raise ValueError(
                    f"Text contains characters outside the vocabulary: {unknown!r}"
                )
        return [stoi[ch] for ch in text if ch in stoi]

    def decode(self, tokens: list[int]) -> str:
        """Convert token ids back into a string.

        Args:
            tokens: Token ids to decode.

        Returns:
            The concatenation of the characters for each id, in order.

        Raises:
            KeyError: If any id is not present in the vocabulary.
        """
        itos = self.itos
        return "".join(itos[token] for token in tokens)