from itertools import islice

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
| dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", streaming=True) | |
| def get_training_corpus(): | |
| dataset_iter = iter(dataset) | |
| for _ in range(50000): | |
| yield next(dataset_iter)["text"] | |
| tokenizer = ByteLevelBPETokenizer() | |
| tokenizer.train_from_iterator( | |
| get_training_corpus(), | |
| vocab_size=500, | |
| min_frequency=2, | |
| special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"] | |
| ) | |
| tokenizer.save_model(".", "custom_llama_tokenizer") | |
| print("Tokenizer training complete!") |