"""Train a byte-level BPE tokenizer and save it as ``aoban_tokenizer.json``.

The script reads ``training_data.txt``, trains a BPE model on it behind a
byte-level pre-tokenizer, configures a post-processor that appends
``<|endoftext|>`` to every single-sequence encoding, and writes the result
to disk.
"""

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import TemplateProcessing

# NOTE(review): in the original script the unknown token and the first two
# special tokens were empty strings, which cannot serve as token names
# (they look like angle-bracket tokens lost to markup stripping).
# "<unk>" / "<pad>" are the conventional names -- confirm against whatever
# model consumes this tokenizer.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
EOS_TOKEN = "<|endoftext|>"

TRAINING_FILES = ["training_data.txt"]
OUTPUT_PATH = "aoban_tokenizer.json"

tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=5000,
    min_frequency=1,  # Lowered to combine words more easily
    special_tokens=[UNK_TOKEN, PAD_TOKEN, EOS_TOKEN],
    # Seed the vocabulary with every byte-level symbol so that bytes absent
    # from the training data can still be encoded.
    initial_alphabet=ByteLevel.alphabet(),
)

tokenizer.train(files=TRAINING_FILES, trainer=trainer)

# The post-processor needs the numeric id of the end-of-text token; fail
# loudly here rather than passing None into TemplateProcessing.
eos_token_id = tokenizer.token_to_id(EOS_TOKEN)
if eos_token_id is None:
    raise RuntimeError(f"{EOS_TOKEN!r} is missing from the trained vocabulary")

tokenizer.post_processor = TemplateProcessing(
    single=f"$A {EOS_TOKEN}",
    special_tokens=[(EOS_TOKEN, eos_token_id)],
)

tokenizer.save(OUTPUT_PATH)
print("Tokenizer fixed and saved.")