from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
# Create a BPE tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()  # split input into byte-level pieces so any text can be encoded
tokenizer.decoder = decoders.ByteLevel()              # merge byte-level tokens back into readable text
# Train on your text data
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    # Common byte-level BPE special tokens; adjust these to match your model
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
# Replace 'train.txt' with your text file containing all training data
tokenizer.train(files=["train.txt"], trainer=trainer)
# Save the trained tokenizer to tokenizer.json
tokenizer.save("tokenizer.json")
print("tokenizer.json is ready!")
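# Optional sanity check: reload the saved tokenizer and round-trip a sample
# sentence. This sketch assumes tokenizer.json was written by the script above;
# the sample text is only an illustration.
reloaded = Tokenizer.from_file("tokenizer.json")
encoding = reloaded.encode("Hello, world!")
print(encoding.tokens)                # byte-level BPE subword tokens
print(encoding.ids)                   # corresponding vocabulary ids
print(reloaded.decode(encoding.ids))  # should reproduce the original text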