# slm-tiny-stories / tokenizer.py
import json
def build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_words.json", pad_token="[PAD]"):
    """Build a word-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    words = all_text.split()
    unique_words = sorted(set(words))
    # Reserve index 0 for the padding token if it is not already in the vocabulary.
    if pad_token not in unique_words:
        unique_words.insert(0, pad_token)
    stoi = {word: i for i, word in enumerate(unique_words)}
    itos = {i: word for word, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Word-level tokenizer saved to {save_path} ({len(stoi)} tokens)")
    # Return the mappings so callers can use them without reloading from disk.
    return tokenizer
def build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer.json", pad_token="[PAD]"):
    """Build a character-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    unique_chars = sorted(set(all_text))
    # Reserve index 0 for the padding token if it is not already in the vocabulary.
    if pad_token not in unique_chars:
        unique_chars.insert(0, pad_token)
    stoi = {ch: i for i, ch in enumerate(unique_chars)}
    itos = {i: ch for ch, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Character-level tokenizer saved to {save_path} ({len(stoi)} tokens)")
    # Return the mappings so callers can use them without reloading from disk.
    return tokenizer
def load_tokenizer(path="tokenizer.json", level="char"):
    """Load a saved tokenizer; `level` ("char" or "word") must match how it was
    built. Returns (stoi, itos, encode, decode, pad_token_id)."""
    with open(path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)
    stoi = tokenizer["stoi"]
    # JSON serializes dict keys as strings; convert itos keys back to ints.
    itos = {int(k): v for k, v in tokenizer["itos"].items()}
    # The original loader always split on whitespace, which silently breaks
    # character-level tokenizers; pick encode/decode based on `level` instead.
    if level == "word":
        encode = lambda s: [stoi[word] for word in s.split()]
        decode = lambda ids: " ".join(itos[i] for i in ids)
    else:
        encode = lambda s: [stoi[ch] for ch in s]
        decode = lambda ids: "".join(itos[i] for i in ids)
    pad_token = tokenizer.get("pad_token", "[PAD]")
    pad_token_id = stoi[pad_token]
    return stoi, itos, encode, decode, pad_token_id
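# Minimal usage sketch (not part of the original module): builds both
# tokenizers from two toy sentences, reloads them, and round-trips a string
# through encode/decode. The sample texts are illustrative assumptions; the
# save paths match the defaults above.
if __name__ == "__main__":
    sample_texts = ["once upon a time", "there was a tiny story"]

    # Character-level round trip.
    build_tokenizer_char_by_char_from_texts(sample_texts, save_path="tokenizer.json")
    _, _, encode, decode, pad_id = load_tokenizer("tokenizer.json", level="char")
    assert decode(encode("a tiny story")) == "a tiny story"

    # Word-level round trip.
    build_tokenizer_word_by_word_from_texts(sample_texts, save_path="tokenizer_words.json")
    _, _, encode_w, decode_w, _ = load_tokenizer("tokenizer_words.json", level="word")
    assert decode_w(encode_w("a tiny story")) == "a tiny story"

    print("round-trip OK, char-level pad id:", pad_id)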