import torch
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm


def evaluate_model():
    """
    Load the fine-tuned Sinhala-to-English model and evaluate it on the test
    set, reporting a corpus-level BLEU score computed with sacreBLEU.
    """
    MODEL_PATH = "thilina/mt5-sinhalese-english"
    TEST_DIR = "data/test_sets"
    SOURCE_LANG_FILE = f"{TEST_DIR}/test.si"  # Sinhala source sentences, one per line
    TARGET_LANG_FILE = f"{TEST_DIR}/test.en"  # English references, line-aligned with the source file
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model, tokenizer, and evaluation metric...") |
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) |
|
|
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE) |
|
|
bleu_metric = evaluate.load("sacrebleu") |
|
|
|
|
|
|
|
|
    # Read the line-aligned test files. sacreBLEU expects each reference as a
    # list (one entry per available reference translation), hence the nested lists.
    with open(SOURCE_LANG_FILE, "r", encoding="utf-8") as f:
        source_sentences = [line.strip() for line in f]
    with open(TARGET_LANG_FILE, "r", encoding="utf-8") as f:
        reference_translations = [[line.strip()] for line in f]

print(f"Generating translations for {len(source_sentences)} test sentences...") |
|
|
predictions = [] |
|
|
for sentence in tqdm(source_sentences): |
|
|
inputs = tokenizer(sentence, return_tensors="pt").to(DEVICE) |
|
|
|
|
|
generated_tokens = model.generate( |
|
|
**inputs, |
|
|
max_length=128 |
|
|
) |
|
|
|
|
|
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] |
|
|
predictions.append(translation) |
|
|
|
|
|
|
|
|
print("Calculating BLEU score...") |
|
|
results = bleu_metric.compute(predictions=predictions, references=reference_translations) |
|
|
|
|
|
|
|
|
bleu_score = results["score"] |
|
|
|
|
|
print("\n--- Evaluation Complete ---") |
|
|
print(f"BLEU Score: {bleu_score:.2f}") |
|
|
print("---------------------------") |
|
|
|
|
|
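
# Optional sketch: translating one sentence at a time keeps the loop above simple
# but underuses the GPU. A batched variant along these lines is a common speed-up;
# the helper name, batch_size, and padding settings below are illustrative
# assumptions, not values taken from this project's setup.
def translate_in_batches(model, tokenizer, sentences, device, batch_size=16, max_length=128):
    """Translate `sentences` in mini-batches and return the decoded strings."""
    translations = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # Pad/truncate so every batch forms a rectangular tensor.
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(device)
        with torch.no_grad():
            generated = model.generate(**inputs, max_length=max_length)
        translations.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return translations
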

if __name__ == "__main__":
    evaluate_model()