llm / tokenizer.py
abersbail's picture
Upload 16 files
7fc99b0 verified
raw
history blame contribute delete
640 Bytes
class CharTokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    The vocabulary is the sorted list of unique characters found in the
    text passed to the constructor. Each character's token id is its
    position in that sorted list.

    Attributes are plain and public:
      chars -- sorted list of the distinct characters (the vocabulary)
      stoi  -- dict mapping character -> integer id
      itos  -- dict mapping integer id -> character
    """

    def __init__(self, text: str):
        """Build the vocabulary and both lookup tables from *text*.

        Raises:
            ValueError: if *text* contains no characters at all.
        """
        alphabet = sorted(set(text))
        if not alphabet:
            raise ValueError("Tokenizer cannot be built from empty text.")

        self.chars = alphabet
        # Ids are simply positions in the sorted alphabet; the reverse
        # table is derived from the forward one so the two always agree.
        self.itos = dict(enumerate(alphabet))
        self.stoi = {symbol: index for index, symbol in self.itos.items()}

    @property
    def vocab_size(self) -> int:
        """Number of distinct characters in the vocabulary."""
        return len(self.chars)

    def encode(self, text: str) -> list[int]:
        """Convert *text* into a list of token ids.

        Characters that are not part of the vocabulary are skipped, so
        the result may be shorter than *text*.
        """
        table = self.stoi
        ids = []
        for symbol in text:
            if symbol in table:
                ids.append(table[symbol])
        return ids

    def decode(self, tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string.

        Raises:
            KeyError: if any id in *tokens* is not in the vocabulary.
        """
        table = self.itos
        pieces = [table[token] for token in tokens]
        return "".join(pieces)