"""Build character-level and word-level tokenizers from the French TinyStories corpus.

Loads the training split of the TinyStories-French dataset and builds two
tokenizers from its texts, saving each to its own JSON file.
"""

from datasets import load_dataset

from tokenizer import (
    build_tokenizer_char_by_char_from_texts,
    build_tokenizer_word_by_word_from_texts,
)

texts = load_dataset("iproskurina/TinyStories-French")["train"]["french-tinystories"]
# Alternative training corpus (French Wikipedia sample); swap in if desired:
# texts = load_dataset("CATIE-AQ/wikipedia_fr_2022_250K")["train"]["text"]

# Fix: the original passed the same save_path to both builders, so the
# word-by-word tokenizer overwrote the char-by-char one. Each tokenizer now
# gets its own output file; the word-by-word build keeps the original
# "wtw" path so downstream consumers of that file are unaffected.
build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer_cbc_tinystories.json")
build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_wtw_tinystories.json")