import os

import datasets
import tokenizers
from tokenizers import (
    ByteLevelBPETokenizer,
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    processors,
)
from tokenizers.models import BPE, WordPiece
from tokenizers.pre_tokenizers import Punctuation, Sequence, Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from transformers import AutoTokenizer

# Base tokenizer whose algorithm and special-token layout we retrain on our
# own corpus (the vocabulary itself is relearned below).
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Corpus previously written with Dataset.save_to_disk(); must expose a
# "text" column — see get_training_corpus below.
input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)
def get_training_corpus(ds=None, batch_size=10000):
    """Yield successive batches of raw text for tokenizer training.

    Args:
        ds: Sliceable dataset whose integer slices return a mapping with a
            ``"text"`` column (e.g. a ``datasets.Dataset``). Defaults to the
            module-level ``dataset`` so existing ``get_training_corpus()``
            callers are unaffected.
        batch_size: Number of examples per yielded batch.

    Yields:
        The ``"text"`` column of each consecutive slice (a list of strings
        for a ``datasets.Dataset``; the final batch may be shorter).
    """
    if ds is None:
        ds = dataset  # module-level dataset loaded at import time
    for start_idx in range(0, len(ds), batch_size):
        # Slicing a datasets.Dataset returns a dict of columns for the range.
        samples = ds[start_idx : start_idx + batch_size]
        yield samples["text"]
print("start")
# Retrain the GPT-2 tokenization algorithm on our corpus: same model type and
# special tokens, new 50k-entry vocabulary learned from the iterator.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000)
print("end")
# save_pretrained persists the complete fast-tokenizer state (tokenizer.json
# plus tokenizer_config.json etc.), so the result can be reloaded with
# AutoTokenizer.from_pretrained("/tokenizer_location"); save_vocabulary alone
# would emit only the raw vocab/merges files without the configuration.
tokenizer.save_pretrained("/tokenizer_location")