import tokenizers.models
import tokenizers
import tokenizers.trainers
import tokenizers.pre_tokenizers
import tokenizers.decoders
# Build a byte-level BPE tokenizer: BPE model with ByteLevel
# pre-tokenization/decoding so every byte round-trips losslessly.
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# Train on every code point 0-255 so the base vocabulary covers all
# single-byte characters.
training_corpus = [chr(code_point) for code_point in range(256)]
trainer = tokenizers.trainers.BpeTrainer()
tokenizer.train_from_iterator(training_corpus, trainer)

# Persist the trained tokenizer to disk in the standard JSON format.
tokenizer.save("tokenizer.json")