File size: 475 Bytes
80cf276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import tokenizers.models
import tokenizers
import tokenizers.trainers
import tokenizers.pre_tokenizers
import tokenizers.decoders

# Build, train, and persist a byte-level BPE tokenizer.
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

# Seed the trainer with the full 256-symbol byte-level alphabet.
# Without `initial_alphabet`, the vocabulary only contains bytes that
# actually occur in the UTF-8-encoded training corpus; bytes such as
# 0xFE/0xFF never appear in valid UTF-8, which would leave the tokenizer
# unable to represent arbitrary byte sequences.
trainer = tokenizers.trainers.BpeTrainer(
    initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet()
)

# ByteLevel maps every input byte to a printable unicode character, so the
# BPE model operates on bytes; add_prefix_space=False keeps input verbatim.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# Train on the 256 single-character strings chr(0)..chr(255); combined with
# the seeded alphabet this yields a minimal, byte-complete vocabulary.
tokenizer.train_from_iterator([chr(x) for x in range(256)], trainer)

# Persist vocab, merges, and the full pipeline configuration as JSON.
tokenizer.save("tokenizer.json")