"""Build and save a minimal byte-level BPE tokenizer.

Constructs an empty BPE model with a byte-level pre-tokenizer and the
matching byte-level decoder, trains it on the 256 one-character strings
chr(0)..chr(255), and writes the result to ``tokenizer.json``.
"""
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

# Empty BPE model wrapped in a Tokenizer; pre-tokenizer and decoder are
# the paired ByteLevel components so encoding round-trips through bytes.
tok = Tokenizer(models.BPE())
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tok.decoder = decoders.ByteLevel()

# One training "document" per character value 0..255.
corpus = [chr(code) for code in range(256)]
tok.train_from_iterator(corpus, trainers.BpeTrainer())

tok.save("tokenizer.json")