# Physics-Tutor-Model — train/build_tokenizer.py
# Trains a byte-level BPE tokenizer for the physics tutor corpus.
# (Provenance: uploaded by adityashisharma, commit 707323a, verified.)
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from pathlib import Path
import argparse
parser = argparse.ArgumentParser(description="Train a byte-level BPE tokenizer.")
parser.add_argument("--vocab_size", type=int, default=16000,
                    help="Target vocabulary size for the BPE trainer.")
parser.add_argument("--input", type=str, default="data/corpus_raw.txt",
                    help="Path to the plain-text training corpus.")
parser.add_argument("--out", type=str, default="out/tokenizer.json",
                    help="Destination path for the trained tokenizer JSON.")
args = parser.parse_args()

# Bug fix: derive the output directory from --out instead of the hard-coded
# "out"; previously any non-default --out path (e.g. models/tokenizer.json)
# made tok.save() fail because its directory was never created.
# parents=True also handles nested destinations.
Path(args.out).parent.mkdir(parents=True, exist_ok=True)

tok = Tokenizer(models.BPE(unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=args.vocab_size,
    # Reserved specials get the first vocab ids, in this order.
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],
    # NOTE(review): with a ByteLevel pre-tokenizer the trainer is usually
    # also given initial_alphabet=pre_tokenizers.ByteLevel.alphabet() so
    # every byte is representable; left unchanged here to keep the trained
    # vocabulary identical — confirm whether that was intentional.
)
tok.train(files=[args.input], trainer=trainer)

# Wrap every encoded sequence as "[BOS] <tokens> [EOS]".
tok.post_processor = processors.TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        ("[BOS]", tok.token_to_id("[BOS]")),
        ("[EOS]", tok.token_to_id("[EOS]")),
    ],
)

tok.save(args.out)
print(f"tokenizer saved to {args.out}")