Mike369williams committed on
Commit
3f6c465
·
verified ·
1 Parent(s): 79dd5bc

Create tokenizer/train_tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer/train_tokenizer.py +37 -0
tokenizer/train_tokenizer.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+ import argparse
3
+ import os
4
+
5
def train_tokenizer(input_file, vocab_size=50000, model_prefix="sanchari_spm",
                    num_threads=8):
    """Train a SentencePiece unigram tokenizer on a text file.

    Args:
        input_file: Path to the training text file.
        vocab_size: Vocabulary size passed to the trainer.
        model_prefix: Prefix for the output files; the trainer is expected to
            produce ``<model_prefix>.model`` and ``<model_prefix>.vocab``.
        num_threads: Number of threads passed to the trainer. Defaults to 8,
            the value that was previously hard-coded.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        IsADirectoryError: If ``input_file`` is a directory rather than a file.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file does not exist: {input_file}")
    # os.path.exists() is also true for directories; reject them here so the
    # caller gets a clear error instead of a failure inside the trainer.
    if os.path.isdir(input_file):
        raise IsADirectoryError(
            f"Input path is a directory, not a file: {input_file}"
        )

    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type="unigram",
        character_coverage=1.0,
        num_threads=num_threads,
        normalization_rule_name="nmt_nfkc",
        # Special-token ids: <unk>=0, <s>=1, </s>=2.
        bos_id=1,
        eos_id=2,
        unk_id=0,
    )

    print("Tokenizer training complete.")
    print(f"Generated: {model_prefix}.model, {model_prefix}.vocab")
24
+
25
if __name__ == "__main__":
    # Command-line entry point: read the corpus path and the optional
    # vocabulary size / output prefix, then run the training routine.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--input",
        required=True,
        help="Path to training text file",
    )
    cli.add_argument("--vocab_size", type=int, default=50000)
    cli.add_argument("--model_prefix", default="sanchari_spm")
    options = cli.parse_args()

    train_tokenizer(
        input_file=options.input,
        vocab_size=options.vocab_size,
        model_prefix=options.model_prefix,
    )