Update src/evaluation.py
src/evaluation.py  (+52 -36)
@@ -11,7 +11,7 @@ from config import ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
 from src.utils import get_all_language_pairs, get_google_comparable_pairs
 
 def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
-    """Calculate all metrics for a single sentence pair."""
+    """Calculate all metrics for a single sentence pair - Fixed to match reference implementation."""
 
     # Handle empty predictions
     if not prediction or not isinstance(prediction, str):
@@ -27,14 +27,14 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
 
     metrics = {}
 
-    # BLEU score
+    # BLEU score (keep as 0-100 scale initially)
     try:
         bleu = BLEU(effective_order=True)
         metrics['bleu'] = bleu.sentence_score(pred_norm, [ref_norm]).score
     except:
         metrics['bleu'] = 0.0
 
-    # ChrF score
+    # ChrF score (normalize to 0-1)
     try:
         chrf = CHRF()
         metrics['chrf'] = chrf.sentence_score(pred_norm, [ref_norm]).score / 100.0
@@ -83,19 +83,30 @@ def calculate_sentence_metrics(reference: str, prediction: str) -> Dict[str, float]:
         metrics['rouge2'] = 0.0
         metrics['rougeL'] = 0.0
 
-    # Quality score (composite metric)
+    # Quality score (composite metric) - Fixed to match reference
     try:
         quality_components = [
             metrics['bleu'] / 100.0,  # Normalize BLEU to 0-1
-            metrics['chrf'],
+            metrics['chrf'],  # Already 0-1
             1.0 - min(metrics['cer'], 1.0),  # Invert error rates
             1.0 - min(metrics['wer'], 1.0),
             metrics['rouge1'],
             metrics['rougeL']
         ]
         metrics['quality_score'] = np.mean(quality_components)
-    except:
-        metrics['quality_score'] = 0.0
+    except Exception as e:
+        # Fallback without ROUGE
+        print(f"Error calculating quality score: {e}")
+        try:
+            fallback_components = [
+                metrics['bleu'] / 100.0,
+                metrics['chrf'],
+                1.0 - min(metrics['cer'], 1.0),
+                1.0 - min(metrics['wer'], 1.0)
+            ]
+            metrics['quality_score'] = np.mean(fallback_components)
+        except:
+            metrics['quality_score'] = 0.0
 
     return metrics
 
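The scales above matter: sacrebleu reports both BLEU and chrF on a 0-100 scale, while the composite averages 0-1 components. Below is a minimal standalone sketch of that idea (not the project's code), built on sacrebleu only; the CER/WER/ROUGE terms are omitted, so it roughly mirrors the fallback branch, and the function name is made up.

import numpy as np
from sacrebleu.metrics import BLEU, CHRF

def quality_score_sketch(reference: str, prediction: str) -> float:
    # sentence_score() returns scores on a 0-100 scale
    bleu = BLEU(effective_order=True).sentence_score(prediction, [reference]).score
    chrf = CHRF().sentence_score(prediction, [reference]).score / 100.0  # normalize to 0-1
    # Average the 0-1 components, as quality_score does (CER/WER/ROUGE left out here)
    return float(np.mean([bleu / 100.0, chrf]))

print(quality_score_sketch("the cat sat on the mat", "the cat is on the mat"))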
@@ -132,7 +143,7 @@ def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
 
     sample_df = pd.DataFrame(sample_metrics)
 
-    # Aggregate by language pairs
+    # Aggregate by language pairs - Fixed aggregation
     pair_metrics = {}
     overall_metrics = defaultdict(list)
     google_comparable_metrics = defaultdict(list)
@@ -153,16 +164,19 @@ def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
         # Calculate averages for this pair
         for metric in METRICS_CONFIG['primary_metrics'] + METRICS_CONFIG['secondary_metrics']:
             if metric in pair_data.columns:
+                # Filter out invalid values
+                valid_values = pair_data[metric].replace([np.inf, -np.inf], np.nan).dropna()
+                if len(valid_values) > 0:
+                    avg_value = float(valid_values.mean())
+                    pair_metrics[pair_key][metric] = avg_value
+
+                    # Add to overall averages
+                    overall_metrics[metric].append(avg_value)
+
+                    # Add to Google comparable if applicable
+                    if (src_lang in GOOGLE_SUPPORTED_LANGUAGES and
+                        tgt_lang in GOOGLE_SUPPORTED_LANGUAGES):
+                        google_comparable_metrics[metric].append(avg_value)
 
         pair_metrics[pair_key]['sample_count'] = len(pair_data)
 
@@ -185,11 +199,12 @@ def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
     # Generate evaluation summary
     summary = {
         'total_samples': len(sample_df),
-        'language_pairs_covered': len([k for k in pair_metrics if pair_metrics[k]
+        'language_pairs_covered': len([k for k in pair_metrics if pair_metrics[k].get('sample_count', 0) > 0]),
         'google_comparable_pairs': len([k for k in pair_metrics
                                         if '_to_' in k and
                                         k.split('_to_')[0] in GOOGLE_SUPPORTED_LANGUAGES and
-                                        k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES
+                                        k.split('_to_')[1] in GOOGLE_SUPPORTED_LANGUAGES and
+                                        pair_metrics[k].get('sample_count', 0) > 0]),
         'primary_metrics': {metric: averages.get(metric, 0.0)
                             for metric in METRICS_CONFIG['primary_metrics']},
         'secondary_metrics': {metric: averages.get(metric, 0.0)
@@ -206,6 +221,7 @@ def evaluate_predictions(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
         'error': None
     }
 
+# Keep the rest of the functions unchanged...
 def compare_with_baseline(results: Dict, baseline_results: Dict = None) -> Dict:
     """Compare results with baseline (e.g., Google Translate)."""
 
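The new per-pair aggregation follows a standard pandas pattern: replace ±inf with NaN, drop NaN, then average per language pair. A small self-contained sketch of that pattern; the column names and language codes are illustrative, not necessarily the project's schema.

import numpy as np
import pandas as pd

# Toy per-sample metrics table (schema assumed for illustration)
sample_df = pd.DataFrame({
    "src_lang": ["lug", "lug", "ach"],
    "tgt_lang": ["eng", "eng", "eng"],
    "chrf": [0.41, np.inf, 0.38],
})

for (src, tgt), pair_data in sample_df.groupby(["src_lang", "tgt_lang"]):
    # Same filtering idea as the diff: keep only finite values before averaging
    valid = pair_data["chrf"].replace([np.inf, -np.inf], np.nan).dropna()
    if len(valid) > 0:
        print(f"{src}_to_{tgt}: chrf={float(valid.mean()):.3f} (n={len(pair_data)})")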
@@ -290,20 +306,19 @@ def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
     report = []
 
     # Header
-    report.append(f"
-    report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    report.append(f"## Evaluation Report: {model_name or 'Submission'}")
     report.append("")
 
     # Summary
     summary = results['summary']
-    report.append("
+    report.append("### Summary")
     report.append(f"- **Total Samples Evaluated**: {summary['total_samples']:,}")
     report.append(f"- **Language Pairs Covered**: {summary['language_pairs_covered']}")
     report.append(f"- **Google Comparable Pairs**: {summary['google_comparable_pairs']}")
     report.append("")
 
     # Primary metrics
-    report.append("
+    report.append("### Primary Metrics")
     for metric, value in summary['primary_metrics'].items():
         formatted_value = f"{value:.4f}" if metric != 'bleu' else f"{value:.2f}"
         report.append(f"- **{metric.upper()}**: {formatted_value}")
@@ -323,7 +338,7 @@ def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
     report.append("")
 
     # Secondary metrics
-    report.append("
+    report.append("### Secondary Metrics")
     for metric, value in summary['secondary_metrics'].items():
         formatted_value = f"{value:.4f}"
         report.append(f"- **{metric.upper()}**: {formatted_value}")
@@ -339,26 +354,27 @@ def generate_evaluation_report(results: Dict, model_name: str = "", comparison: Dict = None) -> str:
         reverse=True
     )
 
-        report.append(f"- **{src} → {tgt}**: {score:.3f}")
-    if len(sorted_pairs) > 5:
-        report.append("")
-        report.append("## Challenging Language Pairs")
-        for pair, score in sorted_pairs[-3:]:
+    if sorted_pairs:
+        report.append("### Best Performing Language Pairs")
+        for pair, score in sorted_pairs[:5]:
             src, tgt = pair.replace('_to_', ' → ').split(' → ')
             report.append(f"- **{src} → {tgt}**: {score:.3f}")
+
+        if len(sorted_pairs) > 5:
+            report.append("")
+            report.append("### Challenging Language Pairs")
+            for pair, score in sorted_pairs[-3:]:
+                src, tgt = pair.replace('_to_', ' → ').split(' → ')
+                report.append(f"- **{src} → {tgt}**: {score:.3f}")
 
     # Comparison with baseline
     if comparison and comparison.get('comparison_available'):
         report.append("")
-        report.append("
+        report.append("### Comparison with Baseline")
 
         better_count = len(comparison.get('better_pairs', []))
         worse_count = len(comparison.get('worse_pairs', []))
-        total_comparable =
+        total_comparable = len(comparison.get('pair_comparisons', {}))
 
         if total_comparable > 0:
             report.append(f"- **Better than baseline**: {better_count}/{total_comparable} pairs")
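To see how the updated report section renders, here is a toy sketch with invented pair scores; only the <src>_to_<tgt> key convention and the markdown shape come from the diff.

# Invented scores, illustrative only
pair_quality = {"lug_to_eng": 0.62, "eng_to_lug": 0.55, "ach_to_eng": 0.31}
sorted_pairs = sorted(pair_quality.items(), key=lambda kv: kv[1], reverse=True)

report = ["### Best Performing Language Pairs"]
for pair, score in sorted_pairs[:5]:
    src, tgt = pair.replace('_to_', ' → ').split(' → ')
    report.append(f"- **{src} → {tgt}**: {score:.3f}")

print("\n".join(report))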