Update src/evaluation.py

src/evaluation.py CHANGED (+73 -296)
```diff
@@ -8,24 +8,21 @@ from collections import defaultdict
 from transformers.models.whisper.english_normalizer import BasicTextNormalizer
 from typing import Dict, List, Tuple, Optional
 from scipy import stats
-from scipy.stats import bootstrap
 import warnings
 from config import (
     ALL_UG40_LANGUAGES,
     GOOGLE_SUPPORTED_LANGUAGES,
     METRICS_CONFIG,
-    STATISTICAL_CONFIG,
     EVALUATION_TRACKS,
     MODEL_CATEGORIES,
-    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 from src.utils import get_all_language_pairs
 
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 
 
 def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
-    """Calculate all metrics for a single sentence pair
+    """Calculate all metrics for a single sentence pair."""
 
     # Handle empty predictions
     if not prediction or not isinstance(prediction, str):
@@ -75,28 +72,17 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     except:
         metrics["wer"] = 1.0
 
-    # Length ratio
-    try:
-        if len(ref_norm) > 0:
-            metrics["len_ratio"] = len(pred_norm) / len(ref_norm)
-        else:
-            metrics["len_ratio"] = 1.0 if len(pred_norm) == 0 else float("inf")
-    except:
-        metrics["len_ratio"] = 1.0
-
     # ROUGE scores
     try:
         scorer = rouge_scorer.RougeScorer(
-            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
+            ["rouge1", "rougeL"], use_stemmer=True
         )
         rouge_scores = scorer.score(ref_norm, pred_norm)
 
         metrics["rouge1"] = rouge_scores["rouge1"].fmeasure
-        metrics["rouge2"] = rouge_scores["rouge2"].fmeasure
         metrics["rougeL"] = rouge_scores["rougeL"].fmeasure
     except:
         metrics["rouge1"] = 0.0
-        metrics["rouge2"] = 0.0
         metrics["rougeL"] = 0.0
 
     # Quality score (composite metric)
@@ -116,130 +102,53 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
     return metrics
 
 
-def
-    """Calculate
+def calculate_confidence_interval(values: List[float], confidence_level: float = 0.95) -> Tuple[float, float, float]:
+    """Calculate mean and confidence interval for a list of values."""
 
     if not values or len(values) == 0:
-        return {
-            "mean": 0.0,
-            "std": 0.0,
-            "median": 0.0,
-            "ci_lower": 0.0,
-            "ci_upper": 0.0,
-            "n_samples": 0,
-        }
+        return 0.0, 0.0, 0.0
 
     values = np.array(values)
     values = values[~np.isnan(values)]  # Remove NaN values
 
     if len(values) == 0:
-        return {
-            "mean": 0.0,
-            "std": 0.0,
-            "median": 0.0,
-            "ci_lower": 0.0,
-            "ci_upper": 0.0,
-            "n_samples": 0,
-        }
+        return 0.0, 0.0, 0.0
 
-    stats_dict = {
-        "mean": float(np.mean(values)),
-        "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
-        "median": float(np.median(values)),
-        "n_samples": len(values),
-    }
-
-    # Calculate confidence intervals using bootstrap if enough samples
-    if len(values) >= STATISTICAL_CONFIG["min_samples_for_ci"]:
-        try:
-            confidence_level = STATISTICAL_CONFIG["confidence_level"]
-
-            # Bootstrap confidence interval
-            def mean_func(x):
-                return np.mean(x)
-
-            res = bootstrap(
-                (values,),
-                mean_func,
-                n_resamples=STATISTICAL_CONFIG["bootstrap_samples"],
-                confidence_level=confidence_level,
-                random_state=42,
-            )
-
-            stats_dict["ci_lower"] = float(res.confidence_interval.low)
-            stats_dict["ci_upper"] = float(res.confidence_interval.high)
-
-        except Exception as e:
-            # Fallback to t-distribution CI
-            try:
-                alpha = 1 - confidence_level
-                t_val = stats.t.ppf(1 - alpha / 2, len(values) - 1)
-                margin = t_val * stats_dict["std"] / np.sqrt(len(values))
-                stats_dict["ci_lower"] = stats_dict["mean"] - margin
-                stats_dict["ci_upper"] = stats_dict["mean"] + margin
-            except:
-                stats_dict["ci_lower"] = stats_dict["mean"]
-                stats_dict["ci_upper"] = stats_dict["mean"]
-    else:
-        stats_dict["ci_lower"] = stats_dict["mean"]
-        stats_dict["ci_upper"] = stats_dict["mean"]
-
-    return stats_dict
-
-
-def perform_significance_test(
-    values1: List[float], values2: List[float], metric_name: str
-) -> Dict[str, float]:
-    """Perform statistical significance test between two groups."""
+    mean_val = float(np.mean(values))
 
-    if len(
-
-
-    values1 = np.array(values1)
-    values2 = np.array(values2)
-
-    # Remove NaN values
-    values1 = values1[~np.isnan(values1)]
-    values2 = values2[~np.isnan(values2)]
-
-    if len(values1) < 2 or len(values2) < 2:
-        return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
+    if len(values) < METRICS_CONFIG["min_samples_for_ci"]:
+        # Not enough samples for meaningful CI
+        return mean_val, mean_val, mean_val
 
     try:
-        #
-
+        # Bootstrap confidence interval
+        n_bootstrap = min(METRICS_CONFIG["bootstrap_samples"], 1000)
+        bootstrap_means = []
 
-
-
-        (
-            (len(values2) - 1) * np.var(values2, ddof=1)) /
-            (len(values1) + len(values2) - 2)
-        )
+        for _ in range(n_bootstrap):
+            bootstrap_sample = np.random.choice(values, size=len(values), replace=True)
+            bootstrap_means.append(np.mean(bootstrap_sample))
 
-
-
-
-        effect_size = 0.0
+        alpha = 1 - confidence_level
+        ci_lower = np.percentile(bootstrap_means, 100 * alpha / 2)
+        ci_upper = np.percentile(bootstrap_means, 100 * (1 - alpha / 2))
 
-
-        significance_level = EVALUATION_TRACKS["google_comparable"]["significance_level"]
-        significant = p_value < significance_level
+        return mean_val, float(ci_lower), float(ci_upper)
 
-
-
-
-
-
-
-
-
-        return {"p_value": 1.0, "effect_size": 0.0, "significant": False}
+    except Exception:
+        # Fallback to t-distribution CI
+        try:
+            std_err = stats.sem(values)
+            h = std_err * stats.t.ppf((1 + confidence_level) / 2, len(values) - 1)
+            return mean_val, mean_val - h, mean_val + h
+        except:
+            return mean_val, mean_val, mean_val
 
 
 def evaluate_predictions_by_track(
     predictions: pd.DataFrame, test_set: pd.DataFrame, track: str
 ) -> Dict:
-    """Evaluate predictions for a specific track
+    """Evaluate predictions for a specific track."""
 
     print(f"🔄 Evaluating for {track} track...")
 
@@ -277,7 +186,7 @@ def evaluate_predictions_by_track(
 
     sample_df = pd.DataFrame(sample_metrics)
 
-    # Aggregate by language pairs
+    # Aggregate by language pairs
     pair_metrics = {}
     overall_metrics = defaultdict(list)
 
@@ -292,36 +201,44 @@ def evaluate_predictions_by_track(
             (sample_df["target_language"] == tgt_lang)
         ]
 
-        if len(pair_data) >=
+        if len(pair_data) >= MIN_SAMPLES_PER_PAIR:
             pair_key = f"{src_lang}_to_{tgt_lang}"
             pair_metrics[pair_key] = {}
 
-            # Calculate
-            for metric in (
-                METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]
-            ):
+            # Calculate statistics for each metric
+            for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
                 if metric in pair_data.columns:
                     values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
 
                     if len(values) > 0:
-
-                        pair_metrics[pair_key][metric] =
+                        mean_val, ci_lower, ci_upper = calculate_confidence_interval(values.tolist())
+                        pair_metrics[pair_key][metric] = {
+                            "mean": mean_val,
+                            "ci_lower": ci_lower,
+                            "ci_upper": ci_upper,
+                            "std": float(np.std(values)) if len(values) > 1 else 0.0,
+                            "count": len(values)
+                        }
 
                         # Add to overall metrics for track-level statistics
-                        overall_metrics[metric].append(
+                        overall_metrics[metric].append(mean_val)
 
             pair_metrics[pair_key]["sample_count"] = len(pair_data)
-            pair_metrics[pair_key]["languages"] = f"{src_lang}-{tgt_lang}"
 
     # Calculate track-level aggregated statistics
     track_averages = {}
-    track_statistics = {}
+    track_confidence = {}
 
     for metric in overall_metrics:
         if overall_metrics[metric]:
-
-            track_averages[metric] =
-
+            mean_val, ci_lower, ci_upper = calculate_confidence_interval(overall_metrics[metric])
+            track_averages[metric] = mean_val
+            track_confidence[metric] = {
+                "mean": mean_val,
+                "ci_lower": ci_lower,
+                "ci_upper": ci_upper,
+                "std": float(np.std(overall_metrics[metric])) if len(overall_metrics[metric]) > 1 else 0.0
+            }
 
     # Generate evaluation summary
     summary = {
@@ -331,15 +248,12 @@ def evaluate_predictions_by_track(
         "language_pairs_evaluated": len([k for k in pair_metrics if pair_metrics[k].get("sample_count", 0) > 0]),
         "languages_covered": len(set(sample_df["source_language"]) | set(sample_df["target_language"])),
         "min_samples_per_pair": track_config["min_samples_per_pair"],
-        "statistical_power": track_config["statistical_power"],
-        "significance_level": track_config["significance_level"],
     }
 
     return {
-        "sample_metrics": sample_df,
         "pair_metrics": pair_metrics,
         "track_averages": track_averages,
-        "track_statistics": track_statistics,
+        "track_confidence": track_confidence,
         "summary": summary,
         "evaluated_samples": len(sample_df),
         "track": track,
@@ -347,12 +261,12 @@ def evaluate_predictions_by_track(
     }
 
 
-def evaluate_predictions_scientific(
+def evaluate_predictions(
     predictions: pd.DataFrame, test_set: pd.DataFrame, model_category: str = "community"
 ) -> Dict:
-    """Comprehensive evaluation across all tracks
+    """Comprehensive evaluation across all tracks."""
 
-    print("🔬 Starting
+    print("🔬 Starting evaluation...")
 
     # Validate model category
     if model_category not in MODEL_CATEGORIES:
@@ -362,8 +276,7 @@ def evaluate_predictions_scientific(
         "model_category": model_category,
         "category_info": MODEL_CATEGORIES[model_category],
         "tracks": {},
-        "
-        "scientific_metadata": {
+        "metadata": {
            "evaluation_timestamp": pd.Timestamp.now().isoformat(),
            "total_samples_submitted": len(predictions),
            "total_samples_available": len(test_set),
@@ -375,120 +288,24 @@ def evaluate_predictions_scientific(
         track_result = evaluate_predictions_by_track(predictions, test_set, track_name)
         evaluation_results["tracks"][track_name] = track_result
 
-    # Cross-track consistency analysis
-    evaluation_results["cross_track_analysis"] = analyze_cross_track_consistency(
-        evaluation_results["tracks"]
-    )
-
     return evaluation_results
 
 
-def analyze_cross_track_consistency(track_results: Dict) -> Dict:
-    """
-
-    consistency_analysis = {
-        "track_correlations": {},
-        "performance_stability": {},
-        "language_coverage_analysis": {},
-    }
-
-    # Extract quality scores from each track for correlation analysis
-    track_scores = {}
-    for track_name, track_data in track_results.items():
-        if track_data.get("track_averages") and "quality_score" in track_data["track_averages"]:
-            track_scores[track_name] = track_data["track_averages"]["quality_score"]
-
-    # Calculate pairwise correlations (would need more data points for meaningful correlation)
-    if len(track_scores) >= 2:
-        track_names = list(track_scores.keys())
-        for i, track1 in enumerate(track_names):
-            for track2 in track_names[i + 1:]:
-                # This would be more meaningful with multiple models
-                consistency_analysis["track_correlations"][f"{track1}_vs_{track2}"] = {
-                    "score_difference": abs(track_scores[track1] - track_scores[track2]),
-                    "relative_performance": track_scores[track1] / max(track_scores[track2], 0.001),
-                }
-
-    # Language coverage analysis
-    for track_name, track_data in track_results.items():
-        if track_data.get("summary"):
-            summary = track_data["summary"]
-            consistency_analysis["language_coverage_analysis"][track_name] = {
-                "coverage_rate": summary["language_pairs_evaluated"] / max(summary.get("total_possible_pairs", 1), 1),
-                "samples_per_pair": summary["total_samples"] / max(summary["language_pairs_evaluated"], 1),
-                "statistical_adequacy": summary["total_samples"] >= EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary["language_pairs_evaluated"],
-            }
-
-    return consistency_analysis
-
-
-def compare_models_statistically(
-    model1_results: Dict, model2_results: Dict, track: str = "google_comparable"
-) -> Dict:
-    """Perform statistical comparison between two models on a specific track."""
-
-    if track not in model1_results.get("tracks", {}) or track not in model2_results.get("tracks", {}):
-        return {"error": f"Track {track} not available for both models"}
-
-    track1_data = model1_results["tracks"][track]
-    track2_data = model2_results["tracks"][track]
-
-    if track1_data.get("error") or track2_data.get("error"):
-        return {"error": "One or both models have evaluation errors"}
-
-    comparison_results = {
-        "track": track,
-        "model1_category": model1_results.get("model_category", "unknown"),
-        "model2_category": model2_results.get("model_category", "unknown"),
-        "metric_comparisons": {},
-        "language_pair_comparisons": {},
-        "overall_significance": {},
-    }
-
-    # Compare each metric
-    for metric in METRICS_CONFIG["primary_metrics"] + METRICS_CONFIG["secondary_metrics"]:
-        if (metric in track1_data.get("track_statistics", {}) and
-            metric in track2_data.get("track_statistics", {})):
-
-            # Extract sample-level data for this metric from both models
-            # This would require access to the original sample metrics
-            # For now, we'll use the aggregated statistics
-
-            stats1 = track1_data["track_statistics"][metric]
-            stats2 = track2_data["track_statistics"][metric]
-
-            # Create comparison summary
-            comparison_results["metric_comparisons"][metric] = {
-                "model1_mean": stats1["mean"],
-                "model1_ci": [stats1["ci_lower"], stats1["ci_upper"]],
-                "model2_mean": stats2["mean"],
-                "model2_ci": [stats2["ci_lower"], stats2["ci_upper"]],
-                "difference": stats1["mean"] - stats2["mean"],
-                "ci_overlap": not (stats1["ci_upper"] < stats2["ci_lower"] or
-                                   stats2["ci_upper"] < stats1["ci_lower"]),
-            }
-
-    return comparison_results
-
-
-def generate_scientific_report(
-    results: Dict, model_name: str = "", baseline_results: Dict = None
-) -> str:
-    """Generate a comprehensive scientific evaluation report."""
+def generate_evaluation_report(results: Dict, model_name: str = "") -> str:
+    """Generate a comprehensive evaluation report."""
 
     if any(track_data.get("error") for track_data in results.get("tracks", {}).values()):
-        return f"❌ **Evaluation Error**: Unable to complete
+        return f"❌ **Evaluation Error**: Unable to complete evaluation"
 
     report = []
 
     # Header
-    report.append(f"
+    report.append(f"### 🔬 Evaluation Report: {model_name or 'Model'}")
     report.append("")
 
     # Model categorization
     category_info = results.get("category_info", {})
     report.append(f"**Model Category**: {category_info.get('name', 'Unknown')}")
-    report.append(f"**Category Description**: {category_info.get('description', 'N/A')}")
     report.append("")
 
     # Track-by-track analysis
@@ -498,73 +315,33 @@ def generate_scientific_report(
 
         track_config = EVALUATION_TRACKS[track_name]
         summary = track_data.get("summary", {})
-
+        track_averages = track_data.get("track_averages", {})
+        track_confidence = track_data.get("track_confidence", {})
 
-        report.append(f"
-        report.append(f"*{track_config['description']}*")
+        report.append(f"#### {track_config['name']}")
         report.append("")
 
         # Summary statistics
-        report.append("
+        report.append("**Summary Statistics:**")
         report.append(f"- **Samples Evaluated**: {summary.get('total_samples', 0):,}")
         report.append(f"- **Language Pairs**: {summary.get('language_pairs_evaluated', 0)}")
         report.append(f"- **Languages Covered**: {summary.get('languages_covered', 0)}")
-        report.append(f"- **Statistical Power**: {track_config['statistical_power']}")
         report.append("")
 
         # Primary metrics with confidence intervals
-        report.append("
+        report.append("**Primary Metrics (95% Confidence Intervals):**")
         for metric in METRICS_CONFIG["primary_metrics"]:
-            if metric in
-                stats =
+            if metric in track_confidence:
+                stats = track_confidence[metric]
                 mean_val = stats["mean"]
                 ci_lower = stats["ci_lower"]
                 ci_upper = stats["ci_upper"]
 
                 report.append(f"- **{metric.upper()}**: {mean_val:.4f} [{ci_lower:.4f}, {ci_upper:.4f}]")
         report.append("")
-
-        # Statistical adequacy assessment
-        min_required = track_config["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        adequacy = "✅ Adequate" if summary.get("total_samples", 0) >= min_required else "⚠️ Limited"
-        report.append(f"**Statistical Adequacy**: {adequacy}")
-        report.append("")
-
-    # Cross-track analysis
-    cross_track = results.get("cross_track_analysis", {})
-    if cross_track:
-        report.append("## 🔄 Cross-Track Consistency Analysis")
-
-        coverage_analysis = cross_track.get("language_coverage_analysis", {})
-        for track_name, coverage_info in coverage_analysis.items():
-            adequacy = "✅ Statistically adequate" if coverage_info.get("statistical_adequacy") else "⚠️ Limited statistical power"
-            report.append(f"- **{track_name}**: {adequacy}")
-
-        report.append("")
-
-    # Baseline comparison if available
-    if baseline_results:
-        report.append("## 📈 Baseline Comparison")
-        # This would include detailed statistical comparisons
-        report.append("*Statistical comparison with baseline models*")
-        report.append("")
-
-    # Scientific recommendations
-    report.append("## 💡 Scientific Recommendations")
 
-
-
-
-
-
-
-    if total_samples < SAMPLE_SIZE_RECOMMENDATIONS["publication_quality"]:
-        report.append("- ⚠️ Consider collecting more evaluation samples for publication-quality results")
-
-    google_track = results.get("tracks", {}).get("google_comparable", {})
-    if not google_track.get("error") and google_track.get("summary", {}).get("total_samples", 0) > 100:
-        report.append("- ✅ Sufficient data for comparison with commercial systems")
-
-    report.append("")
-
-    return "\n".join(report)
+    return "\n".join(report)
+
+
+# Backwards compatibility
+MIN_SAMPLES_PER_PAIR = 10
```
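For readers skimming the diff: the new `calculate_confidence_interval` replaces the SciPy-backed statistics helper with a hand-rolled percentile bootstrap. Below is a minimal standalone sketch of that technique; the sample scores and the seeded `np.random.default_rng` are illustrative additions (the committed code draws resamples with an unseeded `np.random.choice` and reads its resample count from `METRICS_CONFIG`).

```python
import numpy as np

def bootstrap_ci(values, confidence_level=0.95, n_bootstrap=1000, seed=42):
    """Percentile bootstrap CI for the mean, mirroring the new helper."""
    values = np.asarray(values, dtype=float)
    values = values[~np.isnan(values)]          # drop NaNs, as the helper does
    mean_val = float(np.mean(values))

    rng = np.random.default_rng(seed)           # seeded here only for reproducibility
    # Resample with replacement and collect the mean of each resample.
    boot_means = [
        float(np.mean(rng.choice(values, size=len(values), replace=True)))
        for _ in range(n_bootstrap)
    ]
    alpha = 1 - confidence_level
    return (
        mean_val,
        float(np.percentile(boot_means, 100 * alpha / 2)),
        float(np.percentile(boot_means, 100 * (1 - alpha / 2))),
    )

# Example with made-up per-sentence quality scores:
mean, lo, hi = bootstrap_ci([0.61, 0.72, 0.58, 0.69, 0.75, 0.66, 0.70, 0.63])
print(f"{mean:.3f} [{lo:.3f}, {hi:.3f}]")
```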
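If the bootstrap raises, the new helper falls back to a classic Student-t interval built from `stats.sem` and `stats.t.ppf`. A quick sketch (numbers invented) confirming that this is the textbook margin of error:

```python
import numpy as np
from scipy import stats

values = np.array([0.61, 0.72, 0.58, 0.69, 0.75])
confidence_level = 0.95

# As in the new fallback: standard error times the two-sided t critical value.
std_err = stats.sem(values)                      # sample std (ddof=1) / sqrt(n)
h = std_err * stats.t.ppf((1 + confidence_level) / 2, len(values) - 1)

# Hand-rolled equivalent for comparison.
margin = np.std(values, ddof=1) / np.sqrt(len(values)) * stats.t.ppf(0.975, len(values) - 1)
assert np.isclose(h, margin)

print(np.mean(values) - h, np.mean(values) + h)
```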
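For comparison, the deleted implementation delegated the same job to `scipy.stats.bootstrap`. A sketch of that call pattern follows, with illustrative values and `n_resamples=1000` standing in for the old `STATISTICAL_CONFIG["bootstrap_samples"]`; note that SciPy's default interval method is BCa rather than the plain percentile interval, so its bounds can differ slightly from the manual loop above.

```python
import numpy as np
from scipy.stats import bootstrap

values = np.array([0.61, 0.72, 0.58, 0.69, 0.75, 0.66, 0.70, 0.63])

res = bootstrap(
    (values,),              # data is passed as a sequence of samples
    np.mean,                # statistic, as in the removed mean_func
    n_resamples=1000,
    confidence_level=0.95,
    random_state=42,        # the removed code also pinned random_state=42
)
print(res.confidence_interval.low, res.confidence_interval.high)
```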