abersbail's picture
Add local small LLM Python Space
740c342 verified
raw
history blame contribute delete
829 Bytes
class CharTokenizer:
    """Character-level tokenizer mapping single characters to integer ids.

    The vocabulary is held in two dictionaries that mirror each other:
    ``stoi`` (character -> id) and ``itos`` (id -> character).  Both start
    empty and are populated by :meth:`fit` or :meth:`from_state_dict`.
    """

    def __init__(self):
        """Create a tokenizer with an empty vocabulary."""
        self.stoi = {}  # character -> integer id
        self.itos = {}  # integer id -> character (inverse of ``stoi``)

    @property
    def vocab_size(self) -> int:
        """Number of distinct characters currently in the vocabulary."""
        return len(self.stoi)

    def fit(self, text: str) -> "CharTokenizer":
        """Build the vocabulary from the distinct characters of *text*.

        Any previous vocabulary is replaced.  Ids are assigned by position
        in the sorted list of unique characters, so the same text always
        yields the same mapping.

        Returns:
            ``self``, so calls can be chained.
        """
        chars = sorted(set(text))
        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
        self.itos = {idx: ch for ch, idx in self.stoi.items()}
        return self

    def encode(self, text: str) -> list:
        """Return the list of ids for the characters in *text*.

        Characters that are not in the vocabulary are skipped, so the
        result may be shorter than *text*.
        """
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, ids) -> str:
        """Return the string spelled by the iterable *ids*.

        Each element is converted with ``int()`` before lookup; ids that
        are not in the vocabulary contribute nothing to the output.
        """
        return "".join(self.itos.get(int(idx), "") for idx in ids)

    def state_dict(self) -> dict:
        """Return a snapshot of the vocabulary as ``{"stoi": {...}}``.

        The mapping is a copy: mutating the returned dictionary must not
        alter ``stoi`` behind the tokenizer's back, which would leave it
        out of sync with ``itos``.
        """
        return {"stoi": dict(self.stoi)}

    @classmethod
    def from_state_dict(cls, state) -> "CharTokenizer":
        """Rebuild a tokenizer from a mapping shaped like :meth:`state_dict`.

        ``state["stoi"]`` is copied, and ``itos`` is derived by inverting it.

        Raises:
            KeyError: if *state* has no ``"stoi"`` entry.
        """
        tok = cls()
        tok.stoi = dict(state["stoi"])
        tok.itos = {idx: ch for ch, idx in tok.stoi.items()}
        return tok