Dyno1307 committed on
Commit
5bdd8f4
·
verified ·
1 Parent(s): fd2f49a

Upload 7 files

Browse files
src/__init__.py ADDED
File without changes
src/__pycache__/evaluate.cpython-313.pyc ADDED
Binary file (3.88 kB). View file
 
src/evaluate_sinhala.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/evaluate_sinhala.py
2
+
3
+ import torch
4
+ import evaluate # The new, preferred Hugging Face library for metrics
5
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
+ from tqdm import tqdm # A library to create smart progress bars
7
+
8
def evaluate_model():
    """
    Loads a fine-tuned model and evaluates its performance on the test set using the BLEU score.

    The model checkpoint and the test-set paths are fixed in the configuration
    section below. Each line of the source file is translated one sentence at
    a time and scored against the reference on the same line of the target
    file with the ``sacrebleu`` metric. The score is printed, not returned.

    Raises:
        ValueError: If the source and reference files do not contain the same
            number of lines.
    """
    # --- 1. Configuration ---
    MODEL_PATH = "thilina/mt5-sinhalese-english"
    TEST_DIR = "data/test_sets"
    SOURCE_LANG_FILE = f"{TEST_DIR}/test.si"
    TARGET_LANG_FILE = f"{TEST_DIR}/test.en"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- 2. Load Test Data ---
    # Read and validate the data before the slow model load so that a missing
    # or misaligned test set fails immediately instead of after generation.
    with open(SOURCE_LANG_FILE, "r", encoding="utf-8") as f:
        source_sentences = [line.strip() for line in f]
    with open(TARGET_LANG_FILE, "r", encoding="utf-8") as f:
        # The BLEU metric expects references to be a list of lists
        reference_translations = [[line.strip()] for line in f]

    if len(source_sentences) != len(reference_translations):
        raise ValueError(
            f"Test set is misaligned: {SOURCE_LANG_FILE} has "
            f"{len(source_sentences)} lines but {TARGET_LANG_FILE} has "
            f"{len(reference_translations)} lines."
        )

    # --- 3. Load Model, Tokenizer, and Metric ---
    print("Loading model, tokenizer, and evaluation metric...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE)
    bleu_metric = evaluate.load("sacrebleu")

    # --- 4. Generate Predictions ---
    print(f"Generating translations for {len(source_sentences)} test sentences...")
    predictions = []
    for sentence in tqdm(source_sentences):
        inputs = tokenizer(sentence, return_tensors="pt").to(DEVICE)

        generated_tokens = model.generate(
            **inputs,
            max_length=128
        )

        # generate() returns a batch; only one sentence was submitted.
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        predictions.append(translation)

    # --- 5. Compute BLEU Score ---
    print("Calculating BLEU score...")
    results = bleu_metric.compute(predictions=predictions, references=reference_translations)

    # The result is a dictionary. The 'score' key holds the main BLEU score.
    bleu_score = results["score"]

    print("\n--- Evaluation Complete ---")
    print(f"BLEU Score: {bleu_score:.2f}")
    print("---------------------------")


if __name__ == "__main__":
    evaluate_model()
src/evaluation.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/evaluation.py
2
+
3
+ import torch
4
+ import evaluate # The new, preferred Hugging Face library for metrics
5
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
+ from tqdm import tqdm # A library to create smart progress bars
7
+ import argparse
8
+
9
def evaluate_model():
    """
    Loads a fine-tuned model and evaluates its performance on the test set using the BLEU score.

    Configuration comes from the command line:
        --model_path: path to the fine-tuned model directory.
        --source_lang_file: source-language test file, one sentence per line.
        --target_lang_file: reference file, line-aligned with the source file.
        --source_lang_tokenizer: source language code assigned to the
            tokenizer's ``src_lang`` (e.g. 'nep_Npan').

    Generation is forced to start with the ``eng_Latn`` language token. The
    BLEU score is printed, not returned.

    Raises:
        ValueError: If the source and reference files do not contain the same
            number of lines.
    """
    parser = argparse.ArgumentParser(description="Evaluate a translation model.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--source_lang_file", type=str, required=True, help="Path to the source language test file")
    parser.add_argument("--target_lang_file", type=str, required=True, help="Path to the target language test file")
    parser.add_argument("--source_lang_tokenizer", type=str, required=True, help="Source language code for tokenizer (e.g., 'nep_Npan')")
    args = parser.parse_args()

    # --- 1. Configuration ---
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- 2. Load Test Data ---
    # Read and validate the data before the slow model load so that a missing
    # or misaligned test set fails immediately instead of after generation.
    with open(args.source_lang_file, "r", encoding="utf-8") as f:
        source_sentences = [line.strip() for line in f]
    with open(args.target_lang_file, "r", encoding="utf-8") as f:
        # The BLEU metric expects references to be a list of lists
        reference_translations = [[line.strip()] for line in f]

    if len(source_sentences) != len(reference_translations):
        raise ValueError(
            f"Test set is misaligned: {args.source_lang_file} has "
            f"{len(source_sentences)} lines but {args.target_lang_file} has "
            f"{len(reference_translations)} lines."
        )

    # --- 3. Load Model, Tokenizer, and Metric ---
    print("Loading model, tokenizer, and evaluation metric...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_path).to(DEVICE)
    bleu_metric = evaluate.load("sacrebleu")

    # Both values are the same for every sentence, so compute them once
    # instead of on each loop iteration. convert_tokens_to_ids is the same
    # lookup src/translate.py uses for the target-language token.
    tokenizer.src_lang = args.source_lang_tokenizer
    english_bos_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    # --- 4. Generate Predictions ---
    print(f"Generating translations for {len(source_sentences)} test sentences...")
    predictions = []
    for sentence in tqdm(source_sentences):
        inputs = tokenizer(sentence, return_tensors="pt").to(DEVICE)

        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=english_bos_id,
            max_length=128
        )

        # generate() returns a batch; only one sentence was submitted.
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        predictions.append(translation)

    # --- 5. Compute BLEU Score ---
    print("Calculating BLEU score...")
    results = bleu_metric.compute(predictions=predictions, references=reference_translations)

    # The result is a dictionary. The 'score' key holds the main BLEU score.
    bleu_score = results["score"]

    print("\n--- Evaluation Complete ---")
    print(f"BLEU Score: {bleu_score:.2f}")
    print("---------------------------")


if __name__ == "__main__":
    evaluate_model()
src/train.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/train.py
2
+
3
+ import os
4
+ import argparse
5
+ from datasets import Dataset
6
+ from transformers import (
7
+ AutoModelForSeq2SeqLM,
8
+ AutoTokenizer,
9
+ DataCollatorForSeq2Seq,
10
+ Seq2SeqTrainingArguments,
11
+ Seq2SeqTrainer,
12
+ )
13
+
14
def train_model():
    """
    Fine-tunes a pre-trained NLLB model on a parallel dataset.

    Configuration comes from the command line (see the argument definitions
    below). The two training files must be line-aligned: line *i* of the
    source file is paired with line *i* of the target file. 95% of the pairs
    are used for training and the remaining 5% for per-epoch validation.
    The final model is saved to ``--output_dir``.

    Raises:
        ValueError: Raised by the example generator when the source and target
            training files have different line counts. The ``datasets``
            library may surface this wrapped in its own generation error.
    """
    parser = argparse.ArgumentParser(description="Fine-tune a translation model.")
    parser.add_argument("--model_checkpoint", type=str, default="facebook/nllb-200-distilled-600M")
    parser.add_argument("--source_lang", type=str, required=True, help="Source language code (e.g., 'ne')")
    parser.add_argument("--target_lang", type=str, default="en")
    parser.add_argument("--source_lang_tokenizer", type=str, required=True, help="Source language code for tokenizer (e.g., 'nep_Npan')")
    parser.add_argument("--train_file_source", type=str, required=True, help="Path to the source language training file")
    parser.add_argument("--train_file_target", type=str, required=True, help="Path to the target language training file")
    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the fine-tuned model")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=8)

    args = parser.parse_args()

    # --- 1. Configuration ---
    MODEL_CHECKPOINT = args.model_checkpoint
    SOURCE_LANG = args.source_lang
    TARGET_LANG = args.target_lang
    MODEL_OUTPUT_DIR = args.output_dir

    # --- 2. Load Tokenizer and Model ---
    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_CHECKPOINT, src_lang=args.source_lang_tokenizer, tgt_lang="eng_Latn"
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

    # --- 3. Load and Preprocess Data (Memory-Efficiently) ---
    print("Loading and preprocessing data...")

    def generate_examples():
        """Yield one {"translation": {src: ..., tgt: ...}} record per aligned line pair."""
        with open(args.train_file_source, "r", encoding="utf-8") as f_src, \
                open(args.train_file_target, "r", encoding="utf-8") as f_tgt:
            # strict=True raises ValueError when one file runs out before the
            # other; a plain zip would silently drop the surplus lines and
            # hide a misaligned corpus.
            for src_line, tgt_line in zip(f_src, f_tgt, strict=True):
                yield {"translation": {SOURCE_LANG: src_line.strip(), TARGET_LANG: tgt_line.strip()}}

    dataset = Dataset.from_generator(generate_examples)

    # Hold out 5% for validation; train_test_split names that part "test".
    split_datasets = dataset.train_test_split(train_size=0.95, seed=42)
    split_datasets["validation"] = split_datasets.pop("test")

    def preprocess_function(examples):
        """Tokenize a batch of translation pairs into model inputs and labels."""
        inputs = [ex[SOURCE_LANG] for ex in examples["translation"]]
        targets = [ex[TARGET_LANG] for ex in examples["translation"]]

        model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
        return model_inputs

    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=split_datasets["train"].column_names,
    )

    # --- 4. Set Up Training Arguments ---
    print("Setting up training arguments...")
    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=args.epochs,
        predict_with_generate=True,
        fp16=False,  # Set to True if you have a compatible GPU
    )

    # --- 5. Create the Trainer ---
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 6. Start Training ---
    print("\n--- Starting model fine-tuning ---")
    trainer.train()
    print("--- Training complete ---")

    # --- 7. Save the Final Model ---
    print(f"Saving final model to {MODEL_OUTPUT_DIR}")
    trainer.save_model()
    print("Model saved successfully!")


if __name__ == "__main__":
    train_model()
src/train_nepali.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/train_nepali.py
2
+
3
+ import os
4
+ from datasets import load_dataset, DatasetDict, concatenate_datasets
5
+ from transformers import (
6
+ AutoModelForSeq2SeqLM,
7
+ AutoTokenizer,
8
+ DataCollatorForSeq2Seq,
9
+ Seq2SeqTrainingArguments,
10
+ Seq2SeqTrainer,
11
+ )
12
+
13
def train_nepali_model(
    model_checkpoint="facebook/nllb-200-distilled-600M",
    data_dir="data/processed",
    model_output_dir="D:\\SIH\\models\\nllb-finetuned-nepali-en",
):
    """
    Fine-tunes a pre-trained NLLB model on the Nepali parallel dataset.

    Reads ``nepali.ne`` and ``nepali.en`` from ``data_dir``, pairs them row by
    row, trains on 95% of the pairs with the remaining 5% used for per-epoch
    validation, and saves the final model to ``model_output_dir``.

    Args:
        model_checkpoint: Name or path of the pre-trained checkpoint to start from.
        data_dir: Directory containing ``nepali.ne`` and ``nepali.en``.
        model_output_dir: Directory where checkpoints and the final model are
            written. The default is the original machine-specific Windows
            path; pass a different location on other machines.
    """
    # --- 1. Configuration ---
    MODEL_CHECKPOINT = model_checkpoint
    DATA_DIR = data_dir
    MODEL_OUTPUT_DIR = model_output_dir

    # --- 2. Load Tokenizer and Model ---
    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_CHECKPOINT, src_lang="nep_Npan", tgt_lang="eng_Latn"
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

    # --- 3. Load and Preprocess Data ---
    print("Loading and preprocessing data...")
    nepali_dataset = load_dataset("text", data_files=os.path.join(DATA_DIR, "nepali.ne"))["train"]
    english_dataset = load_dataset("text", data_files=os.path.join(DATA_DIR, "nepali.en"))["train"]

    # rename the 'text' column to 'ne' and 'en'
    nepali_dataset = nepali_dataset.rename_column("text", "ne")
    english_dataset = english_dataset.rename_column("text", "en")

    # combine the datasets column-wise so each row holds one ne/en pair
    raw_datasets = concatenate_datasets([nepali_dataset, english_dataset], axis=1)

    # Hold out 5% for validation; train_test_split names that part "test".
    split_datasets = raw_datasets.train_test_split(train_size=0.95, seed=42)
    split_datasets["validation"] = split_datasets.pop("test")

    def preprocess_function(examples):
        """Tokenize a batch of Nepali inputs and English targets."""
        inputs = examples["ne"]
        targets = examples["en"]

        model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
        return model_inputs

    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=split_datasets["train"].column_names,
    )

    # --- 4. Set Up Training Arguments ---
    print("Setting up training arguments...")
    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,  # Reduced for faster training, can be increased
        predict_with_generate=True,
        fp16=False,  # Set to True if you have a compatible GPU
    )

    # --- 5. Create the Trainer ---
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 6. Start Training ---
    print("\n--- Starting model fine-tuning for Nepali-English ---")
    trainer.train()
    print("--- Training complete ---")

    # --- 7. Save the Final Model ---
    print(f"Saving final model to {MODEL_OUTPUT_DIR}")
    trainer.save_model()
    print("Model saved successfully!")


if __name__ == "__main__":
    train_nepali_model()
src/translate.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/translate.py
2
+
3
+ # src/translate.py
4
+
5
+ import torch
6
+ from transformers import MBartForConditionalGeneration, NllbTokenizer
7
+ import argparse
8
+
9
# --- 1. Configuration ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. Load Models and Tokenizers ---
# The fine-tuned checkpoint comes from an NLLB model, which the training
# scripts load through AutoModelForSeq2SeqLM. NLLB uses the M2M100
# architecture, so forcing the MBart class does not match the saved weights.
# The Auto class picks the right model class from the checkpoint's config.
# Imported here because the file header only imports the MBart class.
from transformers import AutoModelForSeq2SeqLM

print(f"Loading models on {DEVICE.upper()}...")
# Both dictionaries are keyed by the source-language name accepted by --lang.
models = {
    "nepali": AutoModelForSeq2SeqLM.from_pretrained("models/nllb-finetuned-nepali-en").to(DEVICE)
}
tokenizers = {
    "nepali": NllbTokenizer.from_pretrained("models/nllb-finetuned-nepali-en")
}
print("All models loaded successfully!")
21
+
22
def translate_text(text_to_translate: str, source_language: str) -> str:
    """
    Translate one string into English with the model registered for a language.

    Args:
        text_to_translate: The text to translate.
        source_language: Key into the module-level ``models`` and
            ``tokenizers`` dictionaries (e.g. "nepali").

    Returns:
        The decoded English translation with special tokens removed.

    Raises:
        KeyError: If no model is registered under ``source_language``.
    """
    selected_model = models[source_language]
    selected_tokenizer = tokenizers[source_language]

    # Tag the input as Nepali for the tokenizer.
    selected_tokenizer.src_lang = "nep_Npan"
    encoded = selected_tokenizer(text_to_translate, return_tensors="pt")
    encoded = encoded.to(DEVICE)

    # Forcing the first generated token to the English language code selects
    # English as the output language.
    english_bos_id = selected_tokenizer.convert_tokens_to_ids("eng_Latn")
    output_ids = selected_model.generate(
        **encoded,
        forced_bos_token_id=english_bos_id,
        max_length=128
    )

    # generate() returns a batch; a single input yields a single output.
    decoded_batch = selected_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded_batch[0]
41
+
42
# --- 3. Example Usage ---
if __name__ == "__main__":
    # Command-line entry point: translate --text from the --lang source
    # language and print the original alongside the English result.
    parser = argparse.ArgumentParser(description="Translate text using a fine-tuned model.")
    # Both flags are required strings; only the per-flag extras differ.
    argument_specs = (
        ("--text", {"help": "Text to translate."}),
        ("--lang", {"choices": ["nepali"], "help": "Source language: 'nepali'."}),
    )
    for flag, extras in argument_specs:
        parser.add_argument(flag, type=str, required=True, **extras)
    args = parser.parse_args()

    translated_sentence = translate_text(args.text, args.lang)

    print(f"\nOriginal ({args.lang}): {args.text}")
    print(f"Translated (en): {translated_sentence}")