from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Create a byte-level BPE tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Train on your text data
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    # Standard RoBERTa-style special tokens for a byte-level BPE model
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    # Keep all 256 byte-level symbols in the base vocabulary
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Replace 'train.txt' with your text file containing all training data
tokenizer.train(files=["train.txt"], trainer=trainer)

# Save the tokenizer.json
tokenizer.save("tokenizer.json")
print("tokenizer.json is ready!")
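
# Optional sanity check: a minimal sketch that reloads the saved tokenizer.json
# and round-trips a sample sentence. The sample text is illustrative only and
# not part of the original example.
loaded = Tokenizer.from_file("tokenizer.json")
encoding = loaded.encode("Hello, world!")
print(encoding.tokens)               # byte-level subword tokens
print(loaded.decode(encoding.ids))   # round-trips the text through the ByteLevel decoder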