"""Train a byte-level BPE tokenizer with the Hugging Face `tokenizers` library."""
from pathlib import Path
import argparse

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers

parser = argparse.ArgumentParser(description="Train a byte-level BPE tokenizer.")
parser.add_argument("--vocab_size", type=int, default=16000)
parser.add_argument("--input", type=str, default="data/corpus_raw.txt")
parser.add_argument("--out", type=str, default="out/tokenizer.json")
args = parser.parse_args()

# Create the output directory for wherever --out points, not just the default "out/".
Path(args.out).parent.mkdir(parents=True, exist_ok=True)

tok = Tokenizer(models.BPE(unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel()
# A byte-level decoder is needed so decode() reverses the Ġ-style byte markers
# introduced by the ByteLevel pre-tokenizer.
tok.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=args.vocab_size,
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],
    # Seed the vocabulary with all 256 byte symbols so any input stays encodable.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tok.train(files=[args.input], trainer=trainer)

# Wrap every encoded sequence in [BOS] ... [EOS]; the special-token ids can only
# be looked up after training, so the post-processor is attached here.
tok.post_processor = processors.TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        ("[BOS]", tok.token_to_id("[BOS]")),
        ("[EOS]", tok.token_to_id("[EOS]")),
    ],
)

tok.save(args.out)
print(f"tokenizer saved to {args.out}")
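
# A minimal sanity check, reloading the saved file and round-tripping a sample
# sentence (the sample string below is illustrative, not from the corpus): this
# confirms the post-processor and byte-level decoder behave as expected.
reloaded = Tokenizer.from_file(args.out)
enc = reloaded.encode("a quick round-trip check")
print("tokens:", enc.tokens)  # includes the [BOS]/[EOS] added by the post-processor
print("decoded:", reloaded.decode(enc.ids))  # special tokens are skipped by default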