adityashisharma committed on
Commit
707323a
·
verified ·
1 Parent(s): ada26e1

Create train/build_tokenizer.py

Browse files
Files changed (1) hide show
  1. train/build_tokenizer.py +24 -0
train/build_tokenizer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
2
+ from pathlib import Path
3
+ import argparse
4
+
5
def parse_args(argv=None):
    """Parse the command-line options for tokenizer training.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``vocab_size`` (int), ``input`` (str) and ``out`` (str).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_size", type=int, default=16000)
    parser.add_argument("--input", type=str, default="data/corpus_raw.txt")
    parser.add_argument("--out", type=str, default="out/tokenizer.json")
    return parser.parse_args(argv)


def ensure_parent_dir(out_path):
    """Create the directory that will contain ``out_path`` and return it.

    The directory is derived from the requested output path rather than
    being hard-coded, so ``--out some/other/dir/tokenizer.json`` works too.
    Missing intermediate directories are created; an existing directory is
    left alone.
    """
    parent = Path(out_path).parent
    parent.mkdir(parents=True, exist_ok=True)
    return parent


def build_tokenizer(input_path, vocab_size):
    """Train a BPE tokenizer on ``input_path`` and return it.

    The tokenizer uses a ByteLevel pre-tokenizer, is trained with the
    special tokens ``[PAD]``, ``[BOS]``, ``[EOS]`` and ``[UNK]``, and gets a
    template post-processor that wraps single sequences as
    ``[BOS] ... [EOS]``.
    """
    tok = Tokenizer(models.BPE(unk_token="[UNK]"))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[BOS]", "[EOS]", "[UNK]"],
    )
    tok.train(files=[input_path], trainer=trainer)
    # The special-token ids only exist after training, so the post-processor
    # has to be attached afterwards.
    tok.post_processor = processors.TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=[
            ("[BOS]", tok.token_to_id("[BOS]")),
            ("[EOS]", tok.token_to_id("[EOS]")),
        ],
    )
    return tok


def main(argv=None):
    """Train the tokenizer described by the CLI options and save it.

    Raises:
        FileNotFoundError: if the ``--input`` corpus file does not exist.
    """
    args = parse_args(argv)
    # Fail early with a clear message instead of inside the trainer.
    if not Path(args.input).is_file():
        raise FileNotFoundError(f"input corpus not found: {args.input}")
    ensure_parent_dir(args.out)
    tok = build_tokenizer(args.input, args.vocab_size)
    tok.save(args.out)
    print(f"tokenizer saved to {args.out}")


if __name__ == "__main__":
    main()