# STT_Model/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/train_tokenizer.py
"""Train a SentencePiece BPE tokenizer on a Malayalam-English medical dataset.

Downloads the dataset from the Hugging Face Hub, dumps both language
columns to a plain-text corpus file, trains a 2,500-vocab BPE model, and
moves the resulting ``.model``/``.vocab`` files into a local output folder.
"""
from datasets import load_dataset
import sentencepiece as spm
import os
import shutil

# Single-source the filenames/prefixes the pipeline shares.
CORPUS_FILE = "tokenizer_corpus.txt"
MODEL_PREFIX = "custom_tokenizer"
# NOTE(review): hard-coded machine-specific path — consider a CLI argument.
OUTPUT_DIR = "/Users/apple/Desktop/indictrans2/indictrans2/huggingface_interface/indictranstoolkit/tokenizer_training/my_tokenizer/"  # Change to any folder you want


def build_corpus(corpus_file: str = CORPUS_FILE) -> str:
    """Download the dataset and write every ml+en sentence to *corpus_file*.

    Returns the path of the written corpus file (one sentence per line).
    Requires network access to the Hugging Face Hub.
    """
    # 1) Load dataset
    dataset = load_dataset('nitikdias/malayala-english_medical_dataset')

    # 2) Extract texts and 3) combine both language columns into one corpus.
    ml_texts = dataset['train']['ml']
    en_texts = dataset['train']['en']
    all_texts = ml_texts + en_texts

    # 4) Save combined corpus to a UTF-8 text file.
    with open(corpus_file, "w", encoding="utf-8") as f:
        for line in all_texts:
            f.write(line.strip() + "\n")
    print(f"Corpus saved at: {corpus_file}")
    return corpus_file


def train_spm(corpus_file: str, model_prefix: str = MODEL_PREFIX) -> None:
    """Train a SentencePiece BPE tokenizer on *corpus_file*.

    Produces ``<model_prefix>.model`` and ``<model_prefix>.vocab`` in the
    current working directory.
    """
    # 5) Train SentencePiece tokenizer
    spm.SentencePieceTrainer.train(
        input=corpus_file,
        model_prefix=model_prefix,
        vocab_size=2500,
        character_coverage=1.0,
        model_type='bpe'  # 'unigram' can also be used
    )
    print("Tokenizer training completed.")
    print("Files generated:")
    print(f"- {model_prefix}.model")
    print(f"- {model_prefix}.vocab")


def export_tokenizer(model_prefix: str = MODEL_PREFIX,
                     output_dir: str = OUTPUT_DIR) -> None:
    """Move the trained tokenizer files into *output_dir* (created if absent)."""
    # 6) OPTIONAL: move tokenizer files to the desired local folder.
    os.makedirs(output_dir, exist_ok=True)
    for suffix in (".model", ".vocab"):
        filename = model_prefix + suffix
        shutil.move(filename, os.path.join(output_dir, filename))
    print(f"Tokenizer files saved locally at: {output_dir}")


def main() -> None:
    """Run the full build-corpus → train → export pipeline."""
    corpus = build_corpus()
    train_spm(corpus)
    export_tokenizer()


if __name__ == "__main__":
    main()