import argparse
import os

import sentencepiece as spm


def train_tokenizer(
    input_file,
    vocab_size=50000,
    model_prefix="sanchari_spm",
    model_type="unigram",
    character_coverage=1.0,
    num_threads=8,
):
    """Train a SentencePiece tokenizer on a plain-text corpus.

    Produces ``{model_prefix}.model`` and ``{model_prefix}.vocab`` in the
    current working directory.

    Args:
        input_file: Path to a plain-text training file (one sentence per line).
        vocab_size: Target vocabulary size for the trained model.
        model_prefix: Output filename prefix for the .model/.vocab files.
        model_type: SentencePiece algorithm ("unigram", "bpe", "char", "word").
        character_coverage: Fraction of characters the model must cover;
            1.0 is appropriate for scripts with small character sets.
        num_threads: Number of trainer threads.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist or is not a
            regular file.
    """
    # isfile (not just exists) so a directory path fails here with a clear
    # message instead of deep inside the native trainer.
    if not os.path.isfile(input_file):
        raise FileNotFoundError(f"Input file does not exist: {input_file}")

    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        character_coverage=character_coverage,
        num_threads=num_threads,
        normalization_rule_name="nmt_nfkc",
        # Explicit special-token ids: unk=0, bos=1, eos=2 (pad left disabled,
        # the SentencePiece default).
        bos_id=1,
        eos_id=2,
        unk_id=0,
    )

    print("Tokenizer training complete.")
    print(f"Generated: {model_prefix}.model, {model_prefix}.vocab")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train a SentencePiece tokenizer from a text corpus."
    )
    parser.add_argument("--input", required=True, help="Path to training text file")
    parser.add_argument("--vocab_size", type=int, default=50000,
                        help="Target vocabulary size (default: 50000)")
    parser.add_argument("--model_prefix", default="sanchari_spm",
                        help="Output filename prefix (default: sanchari_spm)")
    args = parser.parse_args()

    train_tokenizer(
        input_file=args.input,
        vocab_size=args.vocab_size,
        model_prefix=args.model_prefix,
    )