File size: 1,126 Bytes
3f6c465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sentencepiece as spm
import argparse
import os

def train_tokenizer(
    input_file: str,
    vocab_size: int = 50000,
    model_prefix: str = "sanchari_spm",
    *,
    model_type: str = "unigram",
    character_coverage: float = 1.0,
    num_threads: int = 8,
    normalization_rule_name: str = "nmt_nfkc",
) -> None:
    """Train a SentencePiece tokenizer model from a plain-text corpus.

    Writes two files next to the working directory: ``{model_prefix}.model``
    and ``{model_prefix}.vocab``.

    Args:
        input_file: Path to the training text file (one sentence per line).
        vocab_size: Target vocabulary size for the trained model.
        model_prefix: Basename for the generated .model/.vocab files.
        model_type: SentencePiece algorithm ("unigram", "bpe", "char", "word").
        character_coverage: Fraction of characters covered by the model;
            1.0 keeps every character seen in the corpus.
        num_threads: Worker threads used during training.
        normalization_rule_name: Text-normalization rule applied before
            training (default "nmt_nfkc").

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file does not exist: {input_file}")

    # Special-token ids are fixed so downstream consumers can rely on them:
    # unk=0, bos=1, eos=2 (pad is left at SentencePiece's default of -1).
    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        character_coverage=character_coverage,
        num_threads=num_threads,
        normalization_rule_name=normalization_rule_name,
        bos_id=1,
        eos_id=2,
        unk_id=0,
    )

    print("Tokenizer training complete.")
    print(f"Generated: {model_prefix}.model, {model_prefix}.vocab")

if __name__ == "__main__":
    # CLI entry point: parse corpus path and tokenizer options, then train.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", required=True, help="Path to training text file")
    cli.add_argument("--vocab_size", type=int, default=50000)
    cli.add_argument("--model_prefix", default="sanchari_spm")
    opts = cli.parse_args()

    train_tokenizer(
        opts.input,
        vocab_size=opts.vocab_size,
        model_prefix=opts.model_prefix,
    )