"""Train a byte-level BPE tokenizer on a sample of the FineWeb-Edu corpus.

Streams the ``sample-10BT`` split of HuggingFaceFW/fineweb-edu, trains a
ByteLevelBPETokenizer on the first 50,000 documents, and writes the vocab
and merges files to the current directory.
"""

from itertools import islice

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

# streaming=True avoids downloading the full 10B-token sample; documents
# are fetched lazily as the training iterator consumes them.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    streaming=True,
)


def get_training_corpus():
    """Yield the ``text`` field of the first 50,000 streamed documents.

    Uses ``itertools.islice`` so the generator stops cleanly if the stream
    is exhausted early; the previous ``next(iter(...))`` pattern would raise
    StopIteration inside a generator, which PEP 479 turns into RuntimeError.
    """
    for example in islice(dataset, 50000):
        yield example["text"]


tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    get_training_corpus(),
    vocab_size=500,
    min_frequency=2,
    # Bug fix: the special tokens were five empty strings (stripped markup).
    # Restore the standard five-token ByteLevelBPE (RoBERTa-style) set that
    # downstream tooling expects.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Writes custom_llama_tokenizer-vocab.json and custom_llama_tokenizer-merges.txt
# into the current working directory.
tokenizer.save_model(".", "custom_llama_tokenizer")
print("Tokenizer training complete!")