from datasets import load_dataset
from tokenizer import build_tokenizer_char_by_char_from_texts, build_tokenizer_word_by_word_from_texts

# Training texts: French TinyStories by default; swap in the commented-out
# line to train on the larger French Wikipedia sample instead.
texts = load_dataset("iproskurina/TinyStories-French")["train"]["french-tinystories"]
# texts = load_dataset("CATIE-AQ/wikipedia_fr_2022_250K")["train"]["text"]
# Save each tokenizer under its own path: reusing a single file would let the
# word-by-word build silently overwrite the char-by-char one. The char-by-char
# filename below is a suggested name mirroring the existing "wtw" convention.
build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer_ctc_tinystories.json")
build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_wtw_tinystories.json")
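
# Optional sanity check, a sketch only: this assumes build_tokenizer_*_from_texts
# writes a Hugging Face `tokenizers`-format JSON file (tokenizer.py may use a
# different on-disk format; adjust the loader accordingly).
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer_wtw_tinystories.json")
# Encode a sample sentence and inspect the produced tokens.
print(tok.encode("Il était une fois une petite fille.").tokens)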