import json


def build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_words.json", pad_token="[PAD]"):
    """Build a word-level tokenizer from a list of texts and save it as JSON."""
    # Vocabulary = every unique whitespace-separated word, sorted for stable ids.
    all_text = " ".join(texts)
    words = all_text.split()
    unique_words = sorted(set(words))
    # Reserve index 0 for the padding token.
    if pad_token not in unique_words:
        unique_words.insert(0, pad_token)
    stoi = {word: i for i, word in enumerate(unique_words)}
    itos = {i: word for word, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Word-by-word tokenizer saved to {save_path} ({len(stoi)} tokens)")


def build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer.json", pad_token="[PAD]"):
    """Build a character-level tokenizer from a list of texts and save it as JSON."""
    # Vocabulary = every unique character, sorted for stable ids.
    all_text = " ".join(texts)
    unique_chars = sorted(set(all_text))
    # Reserve index 0 for the padding token.
    if pad_token not in unique_chars:
        unique_chars.insert(0, pad_token)
    stoi = {ch: i for i, ch in enumerate(unique_chars)}
    itos = {i: ch for ch, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Character-by-character tokenizer saved to {save_path} ({len(stoi)} tokens)")


def load_tokenizer(path="tokenizer.json"):
    """Load a saved tokenizer and return its mappings plus encode/decode helpers.

    Note: encode/decode split and join on whitespace, so they assume a
    word-level vocabulary such as the one produced by
    build_tokenizer_word_by_word_from_texts.
    """
    with open(path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)
    stoi = tokenizer["stoi"]
    # JSON keys are always strings, so restore the integer keys of itos.
    itos = {int(k): v for k, v in tokenizer["itos"].items()}
    encode = lambda s: [stoi[word] for word in s.split()]
    decode = lambda l: " ".join(itos[i] for i in l)
    pad_token = tokenizer.get("pad_token", "[PAD]")
    pad_token_id = stoi[pad_token]
    return stoi, itos, encode, decode, pad_token_id
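

# Minimal usage sketch (not part of the original module): the sample texts and
# file paths below are illustrative assumptions, chosen only to show how the
# two builders and load_tokenizer fit together.
if __name__ == "__main__":
    sample_texts = [
        "the quick brown fox",
        "the lazy dog sleeps",
    ]

    # Word-level tokenizer: round-trips cleanly through load_tokenizer's
    # whitespace-based encode/decode helpers.
    build_tokenizer_word_by_word_from_texts(sample_texts, save_path="tokenizer_words.json")
    stoi, itos, encode, decode, pad_token_id = load_tokenizer("tokenizer_words.json")
    ids = encode("the quick dog")
    print(ids, "->", decode(ids), "| pad id:", pad_token_id)

    # Character-level tokenizer: encode with stoi directly, since
    # load_tokenizer's helpers assume a word-level vocabulary.
    build_tokenizer_char_by_char_from_texts(sample_texts, save_path="tokenizer.json")
    char_stoi, char_itos, _, _, _ = load_tokenizer("tokenizer.json")
    char_ids = [char_stoi[ch] for ch in "the dog"]
    print(char_ids, "->", "".join(char_itos[i] for i in char_ids))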