File size: 463 Bytes
c64cf6f
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
"""Train and save tokenizers from the French TinyStories corpus.

Loads the corpus once, then builds both a character-level and a
word-level tokenizer, saving each to its own JSON file.
"""
from datasets import load_dataset

from tokenizer import build_tokenizer_char_by_char_from_texts, build_tokenizer_word_by_word_from_texts

# French TinyStories corpus; swap in the commented-out Wikipedia corpus
# below for a larger, more varied vocabulary.
texts = load_dataset("iproskurina/TinyStories-French")["train"]["french-tinystories"]
#texts = load_dataset("CATIE-AQ/wikipedia_fr_2022_250K")["train"]["text"]

# BUG FIX: the original passed the same save_path to both builders, so the
# word-by-word tokenizer overwrote the char-by-char one. Each tokenizer now
# gets its own output file (cbc = char-by-char, wtw = word-by-word).
build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer_cbc_tinystories.json")
build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_wtw_tinystories.json")