#!/usr/bin/env python3
"""
Evaluate ByT5 + XLM-RoBERTa reranker on Indo NLP Sinhala test sets.
Test Set 1: 10K formal sentences
Test Set 2: 5K informal sentences (ad-hoc, colloquial)
"""
import sys
import os
from pathlib import Path
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
import torch
import pandas as pd
import numpy as np
from collections import defaultdict
# Import our models
from core.decoder import BeamSearchDecoder
def load_test_set(filepath, max_samples=None):
    """
    Read an Indo NLP test file into a list of input/reference pairs.

    Blank lines are discarded and the remaining lines are stripped, then
    consumed two at a time: the first line of each pair is the Singlish
    input and the second is the expected Sinhala output. A trailing line
    without a partner is ignored.

    Args:
        filepath: Path of the UTF-8 text file to read.
        max_samples: Stop once this many pairs have been collected. A falsy
            value (None or 0) disables the limit.

    Returns:
        List of dicts with the keys 'singlish' and 'expected'.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        content = [text for text in stripped if text]

    samples = []
    # Even positions hold the inputs, odd positions the references; zip()
    # drops an unpaired final line.
    for source_text, target_text in zip(content[0::2], content[1::2]):
        samples.append({'singlish': source_text, 'expected': target_text})
        if max_samples and len(samples) >= max_samples:
            break
    return samples
def _edit_distance(hypothesis, reference):
    """
    Return the Levenshtein distance between two sequences.

    Works on any indexable sequence of comparable items (a string for
    characters, a list of tokens for words). Uses the two-row dynamic
    programming formulation, so memory is proportional to len(reference).
    """
    previous = list(range(len(reference) + 1))
    for row, hyp_item in enumerate(hypothesis, start=1):
        current = [row]
        for col, ref_item in enumerate(reference, start=1):
            substitution = previous[col - 1] + (hyp_item != ref_item)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(substitution, deletion, insertion))
        previous = current
    return previous[-1]


def compute_cer(predicted, expected):
    """
    Character Error Rate.

    Computed as the character-level edit distance between *predicted* and
    *expected* divided by the number of characters in *expected*. Characters
    are Python string elements (Unicode code points).

    Args:
        predicted: Model output string.
        expected: Reference string.

    Returns:
        A float >= 0.0 (0.0 is a perfect match). The value can exceed 1.0
        when the prediction is much longer than the reference. If
        *expected* is empty, returns 1.0 for a non-empty prediction and 0.0
        otherwise.
    """
    if not expected:
        return 1.0 if predicted else 0.0
    return _edit_distance(predicted, expected) / len(expected)


def compute_wer(predicted, expected):
    """
    Word Error Rate (space-separated tokens).

    Computed as the token-level edit distance between the whitespace-split
    *predicted* and *expected* divided by the number of reference tokens.

    Args:
        predicted: Model output string.
        expected: Reference string.

    Returns:
        A float >= 0.0 (0.0 is a perfect match). The value can exceed 1.0
        when the prediction has many more tokens than the reference. If
        *expected* has no tokens, returns 1.0 for a prediction with tokens
        and 0.0 otherwise.
    """
    pred_words = predicted.split()
    exp_words = expected.split()
    if not exp_words:
        return 1.0 if pred_words else 0.0
    return _edit_distance(pred_words, exp_words) / len(exp_words)
def compute_em(predicted, expected):
    """Exact Match: 1.0 when both strings are equal, otherwise 0.0."""
    if predicted == expected:
        return 1.0
    return 0.0
def compute_bleu(predicted, expected, n=4):
    """
    Simple BLEU approximation (clipped unigram overlap).

    Counts how many whitespace-separated tokens of *predicted* also occur
    in *expected*, crediting each reference token at most as often as it
    appears in the reference, and divides by the number of reference
    tokens. Clipping keeps the score within [0.0, 1.0] even when the
    prediction repeats a matching token.

    Args:
        predicted: Model output string.
        expected: Reference string.
        n: Unused; kept so existing callers that pass it keep working.

    Returns:
        A float in [0.0, 1.0]. If *expected* has no tokens, returns 1.0 for
        a prediction without tokens and 0.0 otherwise.
    """
    from collections import Counter

    pred_tokens = predicted.split()
    exp_tokens = expected.split()
    if not exp_tokens:
        return 1.0 if not pred_tokens else 0.0
    # Multiset intersection keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(exp_tokens)
    return sum(overlap.values()) / len(exp_tokens)
def evaluate_samples(decoder, samples, device, batch_size=8):
    """
    Run the decoder over every sample and score each prediction.

    Samples are decoded one at a time via ``decoder.decode``; the first
    element of its returned triple is taken as the prediction. Any exception
    raised while decoding or scoring a sample is reported on stdout and
    recorded as a worst-case row, so one bad sample does not abort the run.

    Args:
        decoder: Object exposing ``decode(text)`` that returns a 3-tuple.
        samples: List of dicts with 'singlish' and 'expected' keys.
        device: Accepted for interface compatibility; not used here.
        batch_size: Accepted for interface compatibility; not used here.

    Returns:
        List of dicts, one per sample, with the keys 'singlish',
        'expected', 'predicted', 'cer', 'wer', 'bleu' and 'em'.
    """
    sample_count = len(samples)
    collected = []

    for position, entry in enumerate(samples):
        source_text = entry['singlish']
        reference = entry['expected']

        # Report progress on every tenth sample.
        if position % 10 == 0:
            print(f" Progress: {position}/{sample_count}", flush=True)

        try:
            # BeamSearchDecoder covers both ByT5 generation and MLM reranking.
            hypothesis, trace_logs, _ = decoder.decode(source_text)
            record = {
                'singlish': source_text,
                'expected': reference,
                'predicted': hypothesis,
                'cer': compute_cer(hypothesis, reference),
                'wer': compute_wer(hypothesis, reference),
                'bleu': compute_bleu(hypothesis, reference),
                'em': compute_em(hypothesis, reference),
            }
        except Exception as err:
            print(f" Error at {position}/{sample_count} processing '{source_text}': {err}")
            # Failed samples count as the worst possible score on every metric.
            record = {
                'singlish': source_text,
                'expected': reference,
                'predicted': '[ERROR]',
                'cer': 1.0,
                'wer': 1.0,
                'bleu': 0.0,
                'em': 0,
            }
        collected.append(record)

    print(f" Completed: {sample_count}/{sample_count}", flush=True)
    return collected
def print_metrics(results, subset_name):
    """
    Print a metrics summary for one subset to stdout.

    Shows mean and standard deviation for CER, WER and BLEU, the mean and
    raw count for exact match, and up to three rows whose 'em' value is 0.

    Args:
        results: List of per-sample result dicts with the keys 'singlish',
            'expected', 'predicted', 'cer', 'wer', 'bleu' and 'em'.
        subset_name: Label printed in the header.
    """
    if not results:
        print(f"{subset_name}: No results")
        return

    frame = pd.DataFrame(results)
    sample_count = len(results)
    rule = '=' * 60

    print(f"\n{rule}")
    print(f"Subset: {subset_name} (n={sample_count})")
    print(rule)

    spread_rows = (
        ("CER (lower is better)", 'cer'),
        ("WER (lower is better)", 'wer'),
        ("BLEU (higher is better)", 'bleu'),
    )
    for label, column in spread_rows:
        values = frame[column]
        print(f"{label}: {values.mean():.4f} ± {values.std():.4f}")

    exact = frame['em']
    print(f"EM (higher is better): {exact.mean():.4f} ({int(exact.sum())} / {sample_count})")

    # Show the first few non-exact predictions.
    missed = frame.loc[exact == 0].head(3)
    if missed.empty:
        return

    print(f"\nSample Failures (first 3):")
    columns = (missed['singlish'], missed['expected'], missed['predicted'])
    for source_text, reference, hypothesis in zip(*columns):
        print(f" Input: {source_text}")
        print(f" Expected: {reference}")
        print(f" Got: {hypothesis}")
        print()
def _format_subset_mean(records, metric):
    """Return the mean of *metric* over *records* as a 4-decimal string ('nan' if empty)."""
    if not records:
        # An empty DataFrame has no metric columns; report NaN instead of raising.
        return f"{float('nan'):.4f}"
    return f"{pd.DataFrame(records)[metric].mean():.4f}"


def main():
    """
    Evaluate the decoder on both Indo NLP Sinhala test sets and save results.

    Optional command line arguments:
        argv[1]: maximum number of formal samples to evaluate.
        argv[2]: maximum number of informal samples to evaluate.

    Reads the test sets from "IndoNLP-2025-Shared-Task/Test Dataset/Sinhala"
    and writes per-sample results and a summary table as CSV files under
    "misc/", both relative to the current working directory.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Parse command line args for sample limits
    max_formal = int(sys.argv[1]) if len(sys.argv) > 1 else None
    max_informal = int(sys.argv[2]) if len(sys.argv) > 2 else None

    # Make sure the output directory exists before the long evaluation runs,
    # so results are not lost to a missing directory at save time.
    output_dir = Path("misc")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize model
    print("Loading BeamSearchDecoder (ByT5 + MLM reranker)...")
    decoder = BeamSearchDecoder(device=device)

    # Load test sets
    test_dir = Path("IndoNLP-2025-Shared-Task/Test Dataset/Sinhala")
    print("\nLoading Test Set 1 (formal, 10K)...")
    formal_samples = load_test_set(test_dir / "Sinhala Test set 1.txt", max_samples=max_formal)
    print(f"Loaded {len(formal_samples)} formal samples")
    print("Loading Test Set 2 (informal, 5K)...")
    informal_samples = load_test_set(test_dir / "Sinhala Test set 2.txt", max_samples=max_informal)
    print(f"Loaded {len(informal_samples)} informal samples")

    # Evaluate
    print("\n" + "="*60)
    print(f"EVALUATING FORMAL SUBSET ({len(formal_samples)} samples)")
    print("="*60)
    formal_results = evaluate_samples(decoder, formal_samples, device)
    print("\n" + "="*60)
    print(f"EVALUATING INFORMAL SUBSET ({len(informal_samples)} samples)")
    print("="*60)
    informal_results = evaluate_samples(decoder, informal_samples, device)

    # Print results
    print_metrics(formal_results, f"Formal ({len(formal_results)})")
    print_metrics(informal_results, f"Informal ({len(informal_results)})")

    # Overall
    all_results = formal_results + informal_results
    print_metrics(all_results, f"OVERALL ({len(all_results)} samples)")

    # Save detailed results
    results_df = pd.DataFrame(all_results)
    results_df.to_csv(output_dir / "indo_nlp_eval_results.csv", index=False)
    print(f"\nDetailed results saved to: misc/indo_nlp_eval_results.csv")

    # Save summary: one row per subset, one column per metric.
    subsets = [
        (f'Formal ({len(formal_results)})', formal_results),
        (f'Informal ({len(informal_results)})', informal_results),
        (f'Overall ({len(all_results)})', all_results),
    ]
    summary = {'Subset': [label for label, _ in subsets]}
    for column, metric in (('CER', 'cer'), ('WER', 'wer'), ('BLEU', 'bleu'), ('EM', 'em')):
        summary[column] = [_format_subset_mean(records, metric) for _, records in subsets]
    summary_df = pd.DataFrame(summary)
    summary_df.to_csv(output_dir / "indo_nlp_eval_summary.csv", index=False)
    print(f"Summary saved to: misc/indo_nlp_eval_summary.csv")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|