# src/train_nepali.py
import os

from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)


def train_nepali_model():
    """
    Fine-tunes a pre-trained NLLB model on the Nepali-English parallel dataset.
    """
    # --- 1. Configuration ---
    MODEL_CHECKPOINT = "facebook/nllb-200-distilled-600M"
    DATA_DIR = "data/processed"
    MODEL_OUTPUT_DIR = "D:\\SIH\\models\\nllb-finetuned-nepali-en"

    # --- 2. Load Tokenizer and Model ---
    print("Loading tokenizer and model...")
    # NLLB-200 uses FLORES-200 language codes: Nepali is "npi_Deva",
    # English is "eng_Latn".
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_CHECKPOINT, src_lang="npi_Deva", tgt_lang="eng_Latn"
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

    # --- 3. Load and Preprocess Data ---
    print("Loading and preprocessing data...")
    nepali_dataset = load_dataset(
        "text", data_files=os.path.join(DATA_DIR, "nepali.ne")
    )["train"]
    english_dataset = load_dataset(
        "text", data_files=os.path.join(DATA_DIR, "nepali.en")
    )["train"]

    # Rename each 'text' column so the merged dataset has 'ne' and 'en' columns.
    nepali_dataset = nepali_dataset.rename_column("text", "ne")
    english_dataset = english_dataset.rename_column("text", "en")

    # Join the two line-aligned files column-wise (axis=1), so row i pairs the
    # i-th Nepali line with the i-th English line. Both files must contain the
    # same number of lines.
    raw_datasets = concatenate_datasets([nepali_dataset, english_dataset], axis=1)

    # Hold out 5% of the sentence pairs for validation.
    split_datasets = raw_datasets.train_test_split(train_size=0.95, seed=42)
    split_datasets["validation"] = split_datasets.pop("test")

    def preprocess_function(examples):
        inputs = examples["ne"]
        targets = examples["en"]
        # text_target tokenizes the labels in the same call, using the
        # tokenizer's tgt_lang setting.
        model_inputs = tokenizer(
            inputs, text_target=targets, max_length=128, truncation=True
        )
        return model_inputs

    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=split_datasets["train"].column_names,
    )

    # --- 4. Set Up Training Arguments ---
    print("Setting up training arguments...")
    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,  # Reduced for faster training; can be increased
        predict_with_generate=True,
        fp16=False,  # Set to True if you have a compatible GPU
    )

    # --- 5. Create the Trainer ---
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 6. Start Training ---
    print("\n--- Starting model fine-tuning for Nepali-English ---")
    trainer.train()
    print("--- Training complete ---")

    # --- 7. Save the Final Model ---
    print(f"Saving final model to {MODEL_OUTPUT_DIR}")
    trainer.save_model()
    print("Model saved successfully!")


if __name__ == "__main__":
    train_nepali_model()
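

# --- Optional smoke test: a minimal sketch, assuming the run above saved a
# model to MODEL_OUTPUT_DIR. The function name, default path, and sample
# sentence are illustrative; call it manually (e.g. from a REPL) after
# training finishes.
def smoke_test_nepali_model(model_dir="D:\\SIH\\models\\nllb-finetuned-nepali-en"):
    """Translate one hard-coded Nepali sentence with the fine-tuned model."""
    tokenizer = AutoTokenizer.from_pretrained(model_dir, src_lang="npi_Deva")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    inputs = tokenizer("नेपाल एक सुन्दर देश हो।", return_tensors="pt")
    # NLLB generates into the language whose code token is forced as the
    # beginning-of-sequence token.
    generated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=128,
    )
    print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])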