from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders
from pathlib import Path
import argparse

# CLI flags: BPE vocabulary size, raw-corpus path, and output path for the trained tokenizer.
parser = argparse.ArgumentParser()
parser.add_argument("--vocab_size", type=int, default=16000)
parser.add_argument("--input", type=str, default="data/corpus_raw.txt")
parser.add_argument("--out", type=str, default="out/tokenizer.json")
args = parser.parse_args()
| Path("out").mkdir(exist_ok=True) | |

# Byte-level BPE: the ByteLevel pre-tokenizer maps raw bytes to printable
# symbols; the matching decoder reverses that mapping so decode() round-trips.
tok = Tokenizer(models.BPE(unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel()
tok.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=args.vocab_size,
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],
    # Seed the vocabulary with all 256 byte symbols so every input byte is
    # representable and [UNK] is never actually produced.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tok.train(files=[args.input], trainer=trainer)

# Wrap every encoded sequence in [BOS] ... [EOS]. The ids are looked up after
# training, once the special tokens actually exist in the vocabulary.
tok.post_processor = processors.TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[("[BOS]", tok.token_to_id("[BOS]")), ("[EOS]", tok.token_to_id("[EOS]"))],
)

tok.save(args.out)
print(f"tokenizer saved to {args.out}")