Spaces:
Sleeping
Sleeping
class CharTokenizer:
    """Character-level tokenizer mapping single characters to integer ids.

    The vocabulary is the sorted set of distinct characters seen by
    :meth:`fit`; ids are the positions of the characters in that ordering.
    """

    def __init__(self):
        """Create an empty tokenizer with no vocabulary."""
        self.stoi = {}  # character -> id
        self.itos = {}  # id -> character (inverse of ``stoi``)

    def vocab_size(self) -> int:
        """Return the number of distinct characters in the vocabulary."""
        return len(self.stoi)

    def fit(self, text: str):
        """Build the vocabulary from *text*, replacing any existing one.

        Args:
            text: Text whose distinct characters become the vocabulary.

        Returns:
            This tokenizer, so calls can be chained.
        """
        # Sorting makes the id assignment deterministic for a given text.
        chars = sorted(set(text))
        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
        self.itos = {idx: ch for ch, idx in self.stoi.items()}
        return self

    def encode(self, text: str):
        """Convert *text* to a list of ids.

        Characters that are not in the vocabulary are silently skipped, so
        the result may be shorter than *text*.
        """
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids):
        """Convert an iterable of ids back to a string.

        Each id is passed through ``int()`` before lookup; ids that are not
        in the vocabulary contribute an empty string.
        """
        return "".join(self.itos.get(int(idx), "") for idx in ids)

    def state_dict(self):
        """Return a serializable snapshot of the vocabulary.

        Returns:
            A dict ``{"stoi": {char: id, ...}}``. The mapping is a shallow
            copy, so mutating it does not affect this tokenizer.
        """
        return {"stoi": dict(self.stoi)}

    @classmethod
    def from_state_dict(cls, state):
        """Rebuild a tokenizer from the output of :meth:`state_dict`.

        Args:
            state: Mapping with a ``"stoi"`` entry of character -> id.

        Returns:
            A new tokenizer whose ``stoi`` is a copy of ``state["stoi"]``
            and whose ``itos`` is its inverse.

        Raises:
            KeyError: If *state* has no ``"stoi"`` entry.
        """
        tok = cls()
        tok.stoi = dict(state["stoi"])
        tok.itos = {idx: ch for ch, idx in tok.stoi.items()}
        return tok