aithing / aoban_tokenizer.py
Aobangaming's picture
Upload 3 files
359d4bb verified
raw
history blame contribute delete
935 Bytes
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import TemplateProcessing
# Build, train and save a byte-level BPE tokenizer.
#
# Input:  training_data.txt      (training corpus, path relative to the CWD)
# Output: aoban_tokenizer.json   (serialized tokenizer, path relative to the CWD)
tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=5000,
    min_frequency=1,  # Lowered to combine words more easily
    special_tokens=["<PAD>", "<UNK>", "<|endoftext|>"],
    # Seed the vocabulary with the complete byte-level alphabet. Without this,
    # only the byte symbols that actually occur in the training file enter the
    # vocabulary, and any other byte seen at inference time becomes <UNK>,
    # which the byte-level decoder cannot turn back into the original text.
    initial_alphabet=ByteLevel.alphabet(),
)
tokenizer.train(
    files=["training_data.txt"],
    trainer=trainer,
)

# Append <|endoftext|> to every single-sequence encoding. The id is looked up
# after training because it is only assigned once the trainer has built the
# vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single="$A <|endoftext|>",
    special_tokens=[
        ("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>")),
    ],
)

tokenizer.save("aoban_tokenizer.json")
print("Tokenizer fixed and saved.")