qikp's picture
Upload 3 files
80cf276 verified
import tokenizers.models
import tokenizers
import tokenizers.trainers
import tokenizers.pre_tokenizers
import tokenizers.decoders
# Build a byte-level BPE tokenizer and write it to "tokenizer.json".
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

# ByteLevel maps every UTF-8 byte of the input to a printable stand-in
# character before BPE runs; the matching decoder reverses that mapping.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# The training corpus below is the 256 code points U+0000..U+00FF, not the
# 256 byte values: code points 128..255 are encoded as two UTF-8 bytes
# (lead byte 0xC2 or 0xC3), so bytes 0xC0, 0xC1 and 0xC4..0xFF never occur
# in the corpus. Seeding the trainer with the full ByteLevel alphabet
# guarantees that every byte symbol is in the vocabulary, so text outside
# Latin-1 can still be tokenized instead of hitting out-of-vocabulary bytes.
trainer = tokenizers.trainers.BpeTrainer(
    initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
)

# One single-character string per code point 0..255.
tokenizer.train_from_iterator([chr(x) for x in range(256)], trainer)
tokenizer.save("tokenizer.json")