Spaces:

abersbail
/

local-small-llm-python

Sleeping

File size: 829 Bytes

740c342

class CharTokenizer:
    """Character-level tokenizer mapping each distinct character to an integer id.

    The vocabulary is built by ``fit`` (or restored by ``from_state_dict``) and
    held in two mirror dictionaries: ``stoi`` (character -> id) and ``itos``
    (id -> character).
    """

    def __init__(self):
        # Both tables start empty; they are populated by fit() or
        # from_state_dict().
        self.stoi = {}  # character -> integer id
        self.itos = {}  # integer id -> character

    @property
    def vocab_size(self) -> int:
        """Return the number of distinct characters in the vocabulary."""
        return len(self.stoi)

    def fit(self, text: str) -> "CharTokenizer":
        """Build the vocabulary from the distinct characters of *text*.

        Characters are sorted before ids are assigned, so the same set of
        characters always yields the same mapping. Any previously fitted
        vocabulary is replaced.

        Returns:
            ``self``, so calls can be chained.
        """
        chars = sorted(set(text))
        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
        self.itos = dict(enumerate(chars))
        return self

    def encode(self, text: str) -> list:
        """Return the list of ids for the characters of *text*.

        Characters that are not in the vocabulary are skipped silently, so
        the result may be shorter than *text*.
        """
        stoi = self.stoi
        return [stoi[ch] for ch in text if ch in stoi]

    def decode(self, ids) -> str:
        """Return the string spelled by *ids*.

        Each id is passed through ``int()`` before lookup. Ids that are not
        in the vocabulary contribute nothing to the output.
        """
        itos = self.itos
        return "".join(itos.get(int(idx), "") for idx in ids)

    def state_dict(self) -> dict:
        """Return a serialisable snapshot of the vocabulary.

        The mapping is copied so that callers editing the returned state
        cannot alter this tokenizer's vocabulary behind its back.
        """
        return {"stoi": dict(self.stoi)}

    @classmethod
    def from_state_dict(cls, state) -> "CharTokenizer":
        """Rebuild a tokenizer from a mapping produced by ``state_dict``.

        Args:
            state: mapping with a ``"stoi"`` entry (character -> id).

        Indices are normalised with ``int()`` so that the rebuilt ``itos``
        keys match the ``int(idx)`` lookup performed by ``decode`` even if
        the ids were stored in another numeric-like form (e.g. strings).
        """
        tok = cls()
        tok.stoi = {ch: int(idx) for ch, idx in dict(state["stoi"]).items()}
        tok.itos = {idx: ch for ch, idx in tok.stoi.items()}
        return tok