import argparse
import os

import sentencepiece as spm


def train_tokenizer(input_file, vocab_size=50000, model_prefix="sanchari_spm"):
    """Train a SentencePiece unigram tokenizer and write {model_prefix}.model / .vocab."""
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file does not exist: {input_file}")

    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type="unigram",
        character_coverage=1.0,  # cover every character seen in training data
        num_threads=8,
        normalization_rule_name="nmt_nfkc",
        bos_id=1,  # ids follow the SentencePiece defaults: unk=0, bos=1, eos=2
        eos_id=2,
        unk_id=0,
    )
    print("Tokenizer training complete.")
    print(f"Generated: {model_prefix}.model, {model_prefix}.vocab")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Path to training text file")
    parser.add_argument("--vocab_size", type=int, default=50000)
    parser.add_argument("--model_prefix", default="sanchari_spm")
    args = parser.parse_args()

    train_tokenizer(
        input_file=args.input,
        vocab_size=args.vocab_size,
        model_prefix=args.model_prefix,
    )
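
# Usage sketch (assumption: training above has already produced
# "sanchari_spm.model" in the working directory; the model name is simply this
# script's default prefix). Commented out so the script itself stays unchanged;
# it loads the trained model and round-trips a sample sentence:
#
#   import sentencepiece as spm
#
#   sp = spm.SentencePieceProcessor(model_file="sanchari_spm.model")
#   pieces = sp.encode("Hello, world!", out_type=str)  # subword strings
#   ids = sp.encode("Hello, world!", out_type=int)     # vocabulary ids
#   print(pieces)
#   print(ids)
#   print(sp.decode(ids))  # reconstructs the original text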