# Quark-0.5M / train_tokenizer.py
# Author: LH-Tech-AI — "Create train_tokenizer.py" (commit 2155c30, verified)
from itertools import islice

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
# Stream the FineWeb-Edu 10B-token sample: with streaming=True samples are
# pulled lazily over the network, so the dataset is never fully downloaded
# or held in memory.
dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", streaming=True)
def get_training_corpus(source=None, limit=50000):
    """Yield up to *limit* raw text strings for tokenizer training.

    Args:
        source: Iterable of dict-like samples with a ``"text"`` key.
            Defaults to the module-level streaming ``dataset``.
        limit: Maximum number of samples to yield (default 50000, the
            original hard-coded count).

    Yields:
        str: The ``"text"`` field of each sample.

    Note:
        The original implementation called ``next()`` 50,000 times inside
        the generator; if the stream held fewer samples, the resulting
        ``StopIteration`` would surface as a ``RuntimeError`` (PEP 479).
        ``islice`` simply stops early instead.
    """
    if source is None:
        source = dataset
    for sample in islice(source, limit):
        yield sample["text"]
# Byte-level BPE operates on raw UTF-8 bytes, so any input text can be
# tokenized without unknown-character fallout.
bpe = ByteLevelBPETokenizer()

# Fit the vocabulary on the streamed corpus. vocab_size=500 is very small —
# presumably matched to the tiny Quark-0.5M model this repo targets; confirm
# against the model config before reuse.
bpe.train_from_iterator(
    get_training_corpus(),
    vocab_size=500,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Emits vocab/merges files prefixed "custom_llama_tokenizer" into the
# current working directory.
bpe.save_model(".", "custom_llama_tokenizer")
print("Tokenizer training complete!")