Spaces:

abersbail
/

llm

Runtime error

llm / tokenizer.py

Upload 16 files

7fc99b0 verified 7 days ago

640 Bytes

	class CharTokenizer:
	def __init__(self, text: str):
	chars = sorted(set(text))
	if not chars:
	raise ValueError("Tokenizer cannot be built from empty text.")

	self.chars = chars
	self.stoi = {ch: idx for idx, ch in enumerate(chars)}
	self.itos = {idx: ch for idx, ch in enumerate(chars)}

	@property
	def vocab_size(self) -> int:
	return len(self.chars)

	def encode(self, text: str) -> list[int]:
	return [self.stoi[ch] for ch in text if ch in self.stoi]

	def decode(self, tokens: list[int]) -> str:
	return "".join(self.itos[token] for token in tokens)