"""Train a byte-level BPE tokenizer on a streamed sample of FineWeb-Edu.

The script streams the ``sample-10BT`` configuration of
``HuggingFaceFW/fineweb-edu``, feeds the ``text`` field of the first
``NUM_TRAINING_SAMPLES`` records to a ``ByteLevelBPETokenizer`` and saves the
resulting model into the current directory under the
``custom_llama_tokenizer`` prefix.
"""

# Printed before the third-party imports on purpose, so the user sees
# feedback while those libraries are loading.
print("[*] Loading libraries...")

from itertools import islice

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

# Number of streamed documents used to train the tokenizer.
NUM_TRAINING_SAMPLES = 500_000
# Target vocabulary size and minimum pair frequency for BPE merges.
VOCAB_SIZE = 32_000
MIN_FREQUENCY = 2
# NOTE(review): the original list held five empty strings, which looks like
# angle-bracket tokens lost to markup stripping. These are the conventional
# byte-level BPE special tokens -- confirm they match the intended model.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# streaming=True makes the dataset an iterable that is consumed record by
# record instead of being loaded as a whole.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    streaming=True,
)


def get_training_corpus(num_samples=NUM_TRAINING_SAMPLES):
    """Yield the ``text`` field of up to ``num_samples`` streamed records.

    Args:
        num_samples: Maximum number of records to take from ``dataset``.

    Yields:
        The value stored under the ``"text"`` key of each record.

    If the stream ends before ``num_samples`` records were produced, the
    generator simply stops early.
    """
    # islice ends cleanly on a short stream; calling next() on the raw
    # iterator inside a generator would surface StopIteration as a
    # RuntimeError (PEP 479).
    limited_records = islice(dataset, num_samples)
    for record in tqdm(limited_records, total=num_samples, desc="Feeding data"):
        yield record["text"]


tokenizer = ByteLevelBPETokenizer()

print("[*] Training tokenizer...")
tokenizer.train_from_iterator(
    get_training_corpus(),
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    show_progress=True,
    special_tokens=SPECIAL_TOKENS,
)

# First argument is the output directory, second the file-name prefix.
tokenizer.save_model(".", "custom_llama_tokenizer")
print("[*] Tokenizer training complete!")