File size: 597 Bytes
c37bfa2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Build a byte-level BPE tokenizer (GPT-2/RoBERTa style). The ByteLevel
# pre-tokenizer maps raw bytes to printable unicode characters before BPE
# runs, and the matching ByteLevel decoder reverses that mapping on decode.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Configure training. Seeding the trainer with the full ByteLevel alphabet
# (all 256 byte symbols) guarantees every possible byte is encodable even if
# some bytes never appear in the training corpus — without it, unseen bytes
# fall through to <unk>, defeating the purpose of byte-level BPE.
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Replace 'train.txt' with your text file containing all training data.
tokenizer.train(files=["train.txt"], trainer=trainer)

# Persist the trained tokenizer (vocab, merges, pipeline config) to a
# single self-contained JSON file loadable via Tokenizer.from_file(...).
tokenizer.save("tokenizer.json")
print("tokenizer.json is ready!")