slm-tiny-stories / build_tokenizer.py
from datasets import load_dataset
from tokenizer import build_tokenizer_char_by_char_from_texts, build_tokenizer_word_by_word_from_texts

# Training corpus: French TinyStories (uncomment the line below to train on a
# French Wikipedia sample instead).
texts = load_dataset("iproskurina/TinyStories-French")["train"]["french-tinystories"]
# texts = load_dataset("CATIE-AQ/wikipedia_fr_2022_250K")["train"]["text"]

# Save each tokenizer under its own path so the char-by-char vocabulary is not
# overwritten by the word-by-word one.
build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer_cbc_tinystories.json")
build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_wtw_tinystories.json")
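
# Sanity-check sketch: assuming the local tokenizer module (not shown here)
# serializes to the Hugging Face `tokenizers` JSON format, the saved file can
# be loaded back and used to encode text. If the module writes a custom JSON
# layout instead, this check does not apply.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer_wtw_tinystories.json")
print(tok.encode("Il était une fois une petite fille.").ids)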