#!/usr/bin/env python3
"""Evaluate ByT5 on Indo NLP test sets - file-based logging version."""

import sys
import os
from pathlib import Path

project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Note: setting PYTHONUNBUFFERED at runtime only affects child processes; this
# interpreter relies on log() printing with flush=True instead.
os.environ['PYTHONUNBUFFERED'] = '1'

import torch
import pandas as pd
from datetime import datetime

from core.decoder import BeamSearchDecoder

# Silence stderr so the decoder's tqdm progress bars don't garble the log.
# Note: this also hides any tracebacks that would normally print to stderr.
sys.stderr = open(os.devnull, 'w')

LOG_FILE = Path("misc/eval_progress.log")
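# log() appends here; follow a run live with: tail -f misc/eval_progress.log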

def log(msg):
    """Log to file and stdout."""
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        timestamp = datetime.now().strftime("%H:%M:%S")
        f.write(f"[{timestamp}] {msg}\n")
    print(msg, flush=True)

def load_test_set(filepath, max_samples=None):
    """Load an IndoNLP test set.

    Files alternate lines: the romanized (Singlish) input, then the expected
    Sinhala output. Blank lines are skipped.
    """
    samples = []
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    # Pair consecutive lines: even index = input, odd index = reference.
    for i in range(0, len(lines), 2):
        if i + 1 < len(lines):
            samples.append({'singlish': lines[i], 'expected': lines[i + 1]})
            if max_samples and len(samples) >= max_samples:
                break
    return samples

def compute_metrics(predicted, expected):
    """Compute approximate CER, WER, a unigram-overlap "BLEU", and exact match.

    CER and WER are SequenceMatcher-based (1 - similarity ratio), which tracks
    but does not equal true edit-distance rates; "BLEU" here is unigram overlap
    with the reference, not the standard n-gram BLEU.
    """
    from difflib import SequenceMatcher

    # Character-level similarity as a CER proxy.
    matcher_char = SequenceMatcher(None, predicted, expected)
    cer = 1.0 - matcher_char.ratio() if expected else (1.0 if predicted else 0.0)

    # Word-level similarity as a WER proxy.
    pred_words = predicted.split()
    exp_words = expected.split()
    matcher_word = SequenceMatcher(None, pred_words, exp_words)
    wer = 1.0 - matcher_word.ratio() if exp_words else (1.0 if pred_words else 0.0)

    # Count predicted tokens that appear anywhere in the reference, divided by
    # reference length (duplicate hits are counted more than once).
    if exp_words:
        matches = sum(1 for t in pred_words if t in exp_words)
        bleu = matches / len(exp_words)
    else:
        bleu = 1.0 if not pred_words else 0.0

    em = 1 if predicted == expected else 0

    return {'cer': cer, 'wer': wer, 'bleu': bleu, 'em': em}

def main():
    # Start with a fresh log (create the misc/ directory first if needed).
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    LOG_FILE.write_text("")
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    log(f"Device: {device}")
    
    max_formal = int(sys.argv[1]) if len(sys.argv) > 1 else None
    max_informal = int(sys.argv[2]) if len(sys.argv) > 2 else None
    log(f"Max formal: {max_formal}, Max informal: {max_informal}")
    
    log("\nLoading decoder...")
    try:
        decoder = BeamSearchDecoder(device=device)
        log("Decoder loaded!")
    except Exception as e:
        log(f"ERROR loading decoder: {e}")
        return
    
    # Load test sets
    test_dir = Path("IndoNLP-2025-Shared-Task/Test Dataset/Sinhala")
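    # Relative to the working directory, so run this from the project root.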
    
    log(f"\nLoading test sets...")
    formal_samples = load_test_set(test_dir / "Sinhala Test set 1.txt", max_samples=max_formal)
    informal_samples = load_test_set(test_dir / "Sinhala Test set 2.txt", max_samples=max_informal)
    log(f"Formal: {len(formal_samples)}, Informal: {len(informal_samples)}")
    
    all_results = []
    
    # Evaluate formal
    log(f"\n>>> EVALUATING FORMAL ({len(formal_samples)} samples)...")
    for idx, sample in enumerate(formal_samples):
        try:
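            # decode() returns the prediction plus two extra values unused here.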
            predicted, _, _ = decoder.decode(sample['singlish'])
            metrics = compute_metrics(predicted, sample['expected'])
            result = {**sample, 'predicted': predicted, 'subset': 'formal', **metrics}
            all_results.append(result)
            
            if (idx+1) % 10 == 0:
                log(f"  Formal {idx+1}/{len(formal_samples)}: EM={metrics['em']} CER={metrics['cer']:.3f}")
        except Exception as e:
            log(f"  ERROR at formal {idx+1}: {str(e)[:100]}")
            result = {**sample, 'predicted': '[ERROR]', 'subset': 'formal', 'cer': 1.0, 'wer': 1.0, 'bleu': 0.0, 'em': 0}
            all_results.append(result)
    
    log(f"Formal complete: {len([r for r in all_results if r['subset']=='formal'])} results")
    
    # Evaluate informal
    log(f"\n>>> EVALUATING INFORMAL ({len(informal_samples)} samples)...")
    for idx, sample in enumerate(informal_samples):
        try:
            predicted, _, _ = decoder.decode(sample['singlish'])
            metrics = compute_metrics(predicted, sample['expected'])
            result = {**sample, 'predicted': predicted, 'subset': 'informal', **metrics}
            all_results.append(result)
            
            if (idx+1) % 10 == 0:
                log(f"  Informal {idx+1}/{len(informal_samples)}: EM={metrics['em']} CER={metrics['cer']:.3f}")
        except Exception as e:
            log(f"  ERROR at informal {idx+1}: {str(e)[:100]}")
            result = {**sample, 'predicted': '[ERROR]', 'subset': 'informal', 'cer': 1.0, 'wer': 1.0, 'bleu': 0.0, 'em': 0}
            all_results.append(result)
    
    log(f"Informal complete: {len([r for r in all_results if r['subset']=='informal'])} results")
    
    # Summary
    log(f"\n>>> SUMMARY...")
    all_df = pd.DataFrame(all_results)
    
    # 'None' falls through to the whole frame, producing the overall summary row.
    for subset in ['formal', 'informal', None]:
        if subset:
            df = all_df[all_df['subset'] == subset]
            label = subset.upper()
        else:
            df = all_df
            label = "OVERALL"
        
        cer_mean = df['cer'].mean()
        wer_mean = df['wer'].mean()
        bleu_mean = df['bleu'].mean()
        em_sum = int(df['em'].sum())
        
        log(f"{label:20s} n={len(df):5d} | CER={cer_mean:.4f} WER={wer_mean:.4f} BLEU={bleu_mean:.4f} EM={em_sum}/{len(df)}")
    
    # Save per-sample results for later inspection.
    out_path = "misc/indo_nlp_results.csv"
    all_df.to_csv(out_path, index=False)
    log(f"\nResults saved: {out_path}")
    
    log("\nDONE!")

if __name__ == "__main__":
    main()