File size: 2,268 Bytes
5bdd8f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# src/evaluate_sinhala.py

import torch
import evaluate # The new, preferred Hugging Face library for metrics
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm # A library to create smart progress bars

def evaluate_model():
    """
    Evaluate a fine-tuned translation model on the test set using BLEU.

    Reads the parallel test files ``data/test_sets/test.si`` (source side)
    and ``data/test_sets/test.en`` (reference side), loads the tokenizer and
    seq2seq model named by ``MODEL_PATH``, translates the source sentences
    one at a time, and scores the output with the ``sacrebleu`` metric from
    the ``evaluate`` library. Progress and the final score are printed.

    Returns:
        The ``"score"`` entry of the metric result (the BLEU score).

    Raises:
        ValueError: If the test set is empty, or if the source and reference
            files do not contain the same number of lines.
    """
    # --- 1. Configuration ---
    MODEL_PATH = "thilina/mt5-sinhalese-english"
    TEST_DIR = "data/test_sets"
    SOURCE_LANG_FILE = f"{TEST_DIR}/test.si"
    TARGET_LANG_FILE = f"{TEST_DIR}/test.en"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- 2. Load Test Data ---
    # Read and validate the data before the expensive model load so that a
    # missing or misaligned test set fails immediately.
    with open(SOURCE_LANG_FILE, "r", encoding="utf-8") as f:
        source_sentences = [line.strip() for line in f]
    with open(TARGET_LANG_FILE, "r", encoding="utf-8") as f:
        # The BLEU metric expects references to be a list of lists
        reference_translations = [[line.strip()] for line in f]

    if not source_sentences:
        raise ValueError(f"No test sentences found in {SOURCE_LANG_FILE}")
    if len(source_sentences) != len(reference_translations):
        # The files are aligned line by line; a count mismatch means every
        # prediction after the gap would be scored against the wrong reference.
        raise ValueError(
            f"Line count mismatch: {SOURCE_LANG_FILE} has "
            f"{len(source_sentences)} lines but {TARGET_LANG_FILE} has "
            f"{len(reference_translations)}"
        )

    # --- 3. Load Model, Tokenizer, and Metric ---
    print("Loading model, tokenizer, and evaluation metric...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE)
    bleu_metric = evaluate.load("sacrebleu")

    # --- 4. Generate Predictions ---
    print(f"Generating translations for {len(source_sentences)} test sentences...")
    predictions = []
    for sentence in tqdm(source_sentences):
        inputs = tokenizer(sentence, return_tensors="pt").to(DEVICE)

        generated_tokens = model.generate(
            **inputs,
            max_length=128
        )

        # One input sentence yields a batch of one decoded string.
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        predictions.append(translation)

    # --- 5. Compute BLEU Score ---
    print("Calculating BLEU score...")
    results = bleu_metric.compute(predictions=predictions, references=reference_translations)

    # The result is a dictionary. The 'score' key holds the main BLEU score.
    bleu_score = results["score"]

    print("\n--- Evaluation Complete ---")
    print(f"BLEU Score: {bleu_score:.2f}")
    print("---------------------------")

    # Hand the score back so callers can log or compare it programmatically.
    return bleu_score

# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    evaluate_model()