# tokenizer/train_tokenizer.py — Sanchari SentencePiece tokenizer trainer
import sentencepiece as spm
import argparse
import os
def train_tokenizer(
    input_file,
    vocab_size=50000,
    model_prefix="sanchari_spm",
    model_type="unigram",
    character_coverage=1.0,
    num_threads=8,
):
    """Train a SentencePiece tokenizer on a plain-text corpus.

    Writes ``<model_prefix>.model`` and ``<model_prefix>.vocab`` to the
    current working directory.

    Args:
        input_file: Path to a UTF-8 text file (one sentence per line is the
            conventional SentencePiece input format — confirm against corpus).
        vocab_size: Target vocabulary size for the trained model.
        model_prefix: Output file prefix for the .model/.vocab pair.
        model_type: SentencePiece algorithm ("unigram", "bpe", "char", "word").
            Previously hard-coded to "unigram"; the default preserves that.
        character_coverage: Fraction of characters the model must cover.
            1.0 keeps every character, which suits multilingual/Indic corpora.
        num_threads: Worker threads used during training.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file does not exist: {input_file}")
    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        character_coverage=character_coverage,
        num_threads=num_threads,
        # NFKC normalization tuned for NMT-style text cleanup.
        normalization_rule_name="nmt_nfkc",
        # Fixed special-token ids: <unk>=0, <s>=1, </s>=2 (no pad token).
        bos_id=1,
        eos_id=2,
        unk_id=0,
    )
    print("Tokenizer training complete.")
    print(f"Generated: {model_prefix}.model, {model_prefix}.vocab")
if __name__ == "__main__":
    # CLI entry point: collect training options and hand off to the trainer.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", required=True, help="Path to training text file")
    cli.add_argument("--vocab_size", type=int, default=50000)
    cli.add_argument("--model_prefix", default="sanchari_spm")
    opts = cli.parse_args()

    train_tokenizer(
        input_file=opts.input,
        vocab_size=opts.vocab_size,
        model_prefix=opts.model_prefix,
    )