Spaces:

abersbail
/

local-small-llm-python

Sleeping

Add local small LLM Python Space

740c342 verified 2 months ago

829 Bytes

	class CharTokenizer:
	    """Character-level tokenizer mapping single characters to integer ids.

	    The vocabulary is built by :meth:`fit` from the distinct characters of a
	    text; ids are assigned in sorted character order, starting at 0.
	    """

	    def __init__(self):
	        """Create a tokenizer with an empty vocabulary."""
	        # stoi: character -> id, itos: id -> character (inverse of stoi).
	        self.stoi = {}
	        self.itos = {}

	    @property
	    def vocab_size(self) -> int:
	        """Return the number of characters currently in the vocabulary."""
	        return len(self.stoi)

	    def fit(self, text: str):
	        """Rebuild the vocabulary from the distinct characters of *text*.

	        Any previous vocabulary is replaced. Returns ``self`` so calls can
	        be chained, e.g. ``CharTokenizer().fit(corpus)``.
	        """
	        # Sorting makes the id assignment deterministic for a given text.
	        chars = sorted(set(text))
	        self.stoi = {ch: idx for idx, ch in enumerate(chars)}
	        self.itos = {idx: ch for ch, idx in self.stoi.items()}
	        return self

	    def encode(self, text: str) -> list:
	        """Return the list of ids for the characters of *text*.

	        Characters that are not in the vocabulary are skipped silently, so
	        the result may be shorter than *text*.
	        """
	        stoi = self.stoi
	        return [stoi[ch] for ch in text if ch in stoi]

	    def decode(self, ids) -> str:
	        """Return the string spelled by the iterable of ids *ids*.

	        Each id is passed through ``int()`` before lookup; ids with no
	        vocabulary entry contribute nothing to the output.
	        """
	        itos = self.itos
	        return "".join(itos.get(int(idx), "") for idx in ids)

	    def state_dict(self) -> dict:
	        """Return the vocabulary as ``{"stoi": {char: id, ...}}``.

	        The mapping is a shallow copy, so mutating the returned state does
	        not alter this tokenizer.
	        """
	        return {"stoi": dict(self.stoi)}

	    @classmethod
	    def from_state_dict(cls, state):
	        """Build a tokenizer from a mapping produced by :meth:`state_dict`.

	        Only ``state["stoi"]`` is read; the id -> character table is
	        derived from it. Raises ``KeyError`` if ``"stoi"`` is missing.
	        """
	        tok = cls()
	        # Copy so the new tokenizer does not share the caller's dict.
	        tok.stoi = dict(state["stoi"])
	        tok.itos = {idx: ch for ch, idx in tok.stoi.items()}
	        return tok