class CharTokenizer:
    """Character-level tokenizer whose vocabulary is derived from a seed text.

    The vocabulary is the sorted set of unique characters in ``text``.
    Encoding maps each known character to its index; characters absent
    from the vocabulary are silently skipped. Decoding maps indices back
    to characters (an unknown index raises ``KeyError``).
    """

    def __init__(self, text: str):
        vocabulary = sorted(set(text))
        if not vocabulary:
            # An empty vocabulary would make encode/decode meaningless.
            raise ValueError("Tokenizer cannot be built from empty text.")

        self.chars = vocabulary
        # Forward map: character -> integer id.
        self.stoi = dict(zip(vocabulary, range(len(vocabulary))))
        # Reverse map: integer id -> character.
        self.itos = dict(enumerate(vocabulary))

    @property
    def vocab_size(self) -> int:
        """Number of distinct characters in the vocabulary."""
        return len(self.chars)

    def encode(self, text: str) -> list[int]:
        """Convert *text* to token ids, dropping out-of-vocabulary characters."""
        known = self.stoi
        return [known[ch] for ch in text if ch in known]

    def decode(self, tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        lookup = self.itos
        return "".join(lookup[tok] for tok in tokens)