"""Train a minimal byte-level BPE tokenizer and save it to ``tokenizer.json``.

The training corpus is the 256 one-character strings ``chr(0)``..``chr(255)``,
which seeds the vocabulary with the byte-level base alphabet; with such a
corpus the BPE trainer learns essentially no merges beyond that alphabet.
"""
import tokenizers
import tokenizers.decoders
import tokenizers.models
import tokenizers.pre_tokenizers
import tokenizers.trainers

# BPE model starts empty; the trainer populates the vocabulary below.
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
trainer = tokenizers.trainers.BpeTrainer()

# Byte-level pre-tokenization with a matching byte-level decoder so that
# encode/decode round-trips arbitrary input. add_prefix_space=False leaves
# the first token of a sequence unprefixed.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# Each "document" is a single character covering code points 0-255.
tokenizer.train_from_iterator([chr(x) for x in range(256)], trainer)

# Serialize the trained tokenizer (model + pre-tokenizer + decoder) to disk.
tokenizer.save("tokenizer.json")