Spaces:
Sleeping
Sleeping
| # src/leaderboard.py | |
| import pandas as pd | |
| from datasets import Dataset, load_dataset | |
| import json | |
| import datetime | |
| from typing import Dict, List, Optional, Tuple | |
| import os | |
| import numpy as np | |
| from config import ( | |
| LEADERBOARD_DATASET, | |
| HF_TOKEN, | |
| EVALUATION_TRACKS, | |
| MODEL_CATEGORIES, | |
| METRICS_CONFIG, | |
| ) | |
| from src.utils import create_submission_id, sanitize_model_name | |
def initialize_leaderboard() -> pd.DataFrame:
    """Return an empty leaderboard DataFrame carrying the full column schema.

    The column order defined here is the canonical schema; load_leaderboard()
    uses it to backfill columns missing from older stored datasets.
    """
    schema = (
        # Basic submission information
        "submission_id",
        "model_name",
        "author",
        "submission_date",
        "model_category",
        "description",
        # Per-track quality scores
        "google_comparable_quality",
        "ug40_complete_quality",
        # Per-track BLEU scores
        "google_comparable_bleu",
        "ug40_complete_bleu",
        # Per-track ChrF scores
        "google_comparable_chrf",
        "ug40_complete_chrf",
        # 95% confidence-interval bounds on the quality score
        "google_comparable_ci_lower",
        "google_comparable_ci_upper",
        "ug40_complete_ci_lower",
        "ug40_complete_ci_upper",
        # Coverage: evaluated sample / language-pair counts
        "google_comparable_samples",
        "ug40_complete_samples",
        "google_comparable_pairs",
        "ug40_complete_pairs",
        # Full per-pair results, stored as JSON strings
        "detailed_google_comparable",
        "detailed_ug40_complete",
        # Metadata
        "evaluation_date",
    )
    return pd.DataFrame({name: [] for name in schema})
def load_leaderboard() -> pd.DataFrame:
    """Fetch the leaderboard dataset from the HuggingFace Hub as a DataFrame.

    Missing schema columns are backfilled with type-appropriate defaults,
    metric columns are coerced to float, and text columns to str. On any
    failure (network, auth, missing dataset) an empty leaderboard is returned
    instead of raising.
    """
    try:
        print("📥 Loading leaderboard...")
        board = load_dataset(
            LEADERBOARD_DATASET, split="train", token=HF_TOKEN
        ).to_pandas()

        # Backfill any schema columns the stored dataset predates.
        for col in initialize_leaderboard().columns:
            if col in board.columns:
                continue
            if any(tag in col for tag in ("quality", "bleu", "chrf", "ci_")):
                board[col] = 0.0
            elif "samples" in col or "pairs" in col:
                board[col] = 0
            else:
                board[col] = ""

        # Robustly coerce every metric/coverage column to float; bad cells
        # become 0.0 rather than poisoning later sorts and comparisons.
        metric_cols = [
            col
            for col in board.columns
            if any(tag in col for tag in ("quality", "bleu", "chrf", "ci_", "samples", "pairs"))
        ]
        for col in metric_cols:
            try:
                board[col] = (
                    pd.to_numeric(board[col], errors='coerce').fillna(0.0).astype(float)
                )
            except Exception as e:
                print(f"Warning: Could not convert column {col} to numeric: {e}")
                board[col] = 0.0

        # Normalize free-text columns: no NaN, always str.
        for col in (
            "model_name",
            "author",
            "model_category",
            "description",
            "submission_date",
            "evaluation_date",
        ):
            if col in board.columns:
                board[col] = board[col].fillna("").astype(str)

        print(f"✅ Loaded leaderboard with {len(board)} entries")
        return board
    except Exception as e:
        # Best-effort: a broken/absent remote dataset degrades to an empty board.
        print(f"⚠️ Could not load leaderboard: {e}")
        print("🔄 Initializing empty leaderboard...")
        return initialize_leaderboard()
def save_leaderboard(df: pd.DataFrame) -> bool:
    """Push the leaderboard DataFrame to the HuggingFace Hub.

    Works on a copy, coercing every metric/coverage column to clean numeric
    values first so the Arrow conversion cannot fail on mixed dtypes.
    Returns True on success, False on any error (never raises).
    """
    try:
        snapshot = df.copy()

        # Sanitize numeric columns before handing the frame to Arrow.
        for col in snapshot.columns:
            if any(tag in col for tag in ("quality", "bleu", "chrf", "ci_", "samples", "pairs")):
                snapshot[col] = pd.to_numeric(snapshot[col], errors="coerce").fillna(0.0)

        stamp = datetime.datetime.now().isoformat()[:19]  # seconds precision
        Dataset.from_pandas(snapshot).push_to_hub(
            LEADERBOARD_DATASET,
            token=HF_TOKEN,
            commit_message=f"Update leaderboard - {stamp}",
        )
        print("✅ Leaderboard saved successfully!")
        return True
    except Exception as e:
        print(f"❌ Error saving leaderboard: {e}")
        return False
def add_model_to_leaderboard(
    model_name: str,
    author: str,
    evaluation_results: Dict,
    model_category: str = "community",
    description: str = "",
) -> pd.DataFrame:
    """Add (or replace) a model's evaluation results on the leaderboard.

    Args:
        model_name: Raw model name; stored in sanitized form.
        author: Display name, truncated to 100 chars ("Anonymous" if empty).
        evaluation_results: Full evaluation dict; per-track data is read from
            its "tracks" key.
        model_category: One of MODEL_CATEGORIES; anything else falls back to
            "community".
        description: Free text, truncated to 500 chars.

    Returns:
        The updated leaderboard DataFrame (also saved to the Hub as a
        side effect via save_leaderboard).
    """
    df = load_leaderboard()

    # BUG FIX: the old code matched existing rows against the RAW name, but
    # stored rows under the SANITIZED name — so a model whose name gets
    # rewritten by sanitize_model_name() was never deduplicated and piled up
    # duplicate rows on every resubmission. Sanitize once and dedupe against
    # both spellings.
    clean_name = sanitize_model_name(model_name)
    existing_mask = df["model_name"].isin({model_name, clean_name})
    if existing_mask.any():
        df = df[~existing_mask]

    tracks = evaluation_results.get("tracks", {})
    now = datetime.datetime.now().isoformat()  # one timestamp for both date fields

    new_entry = {
        "submission_id": create_submission_id(),
        "model_name": clean_name,
        "author": author[:100] if author else "Anonymous",
        "submission_date": now,
        "model_category": model_category if model_category in MODEL_CATEGORIES else "community",
        "description": description[:500] if description else "",
        # Track-specific primary metrics (quality / BLEU / ChrF)
        **extract_track_metrics(tracks),
        # Confidence-interval bounds for the quality score
        **extract_confidence_intervals(tracks),
        # Sample / language-pair coverage counts
        **extract_coverage_information(tracks),
        # Detailed per-pair results as JSON strings
        **serialize_detailed_results(tracks),
        "evaluation_date": now,
    }

    updated_df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
    save_leaderboard(updated_df)
    return updated_df
def extract_track_metrics(tracks: Dict) -> Dict:
    """Flatten each track's primary averages into leaderboard columns.

    Produces "<track>_quality", "<track>_bleu" and "<track>_chrf" for every
    configured track, defaulting to 0.0 when a track or metric is absent.
    """
    # column-suffix -> key inside the track's "track_averages" dict
    column_sources = {"quality": "quality_score", "bleu": "bleu", "chrf": "chrf"}

    flattened = {}
    for track in EVALUATION_TRACKS:
        averages = tracks.get(track, {}).get("track_averages", {})
        for suffix, source_key in column_sources.items():
            flattened[f"{track}_{suffix}"] = float(averages.get(source_key, 0.0))
    return flattened
def extract_confidence_intervals(tracks: Dict) -> Dict:
    """Flatten each track's quality-score CI bounds into leaderboard columns.

    Produces "<track>_ci_lower" and "<track>_ci_upper" for every configured
    track, defaulting to 0.0 when the track or its stats are absent.
    """
    bounds = {}
    for track in EVALUATION_TRACKS:
        stats = (
            tracks.get(track, {})
            .get("track_confidence", {})
            .get("quality_score", {})
        )
        for side in ("lower", "upper"):
            bounds[f"{track}_ci_{side}"] = float(stats.get(f"ci_{side}", 0.0))
    return bounds
def extract_coverage_information(tracks: Dict) -> Dict:
    """Flatten each track's coverage counts into leaderboard columns.

    Produces "<track>_samples" and "<track>_pairs" for every configured
    track, defaulting to 0 when the track or its summary is absent.
    """
    # (column suffix, key inside the track's "summary" dict)
    fields = (("samples", "total_samples"), ("pairs", "language_pairs_evaluated"))

    coverage = {}
    for track in EVALUATION_TRACKS:
        summary = tracks.get(track, {}).get("summary", {})
        for suffix, source_key in fields:
            coverage[f"{track}_{suffix}"] = int(summary.get(source_key, 0))
    return coverage
def serialize_detailed_results(tracks: Dict) -> Dict:
    """JSON-encode a trimmed view of each track's results for flat storage.

    Produces "detailed_<track>" columns holding JSON strings of the four
    sections the UI needs; missing sections serialize as empty objects.
    """
    sections = ("pair_metrics", "track_averages", "track_confidence", "summary")

    serialized = {}
    for track in EVALUATION_TRACKS:
        data = tracks.get(track, {})
        payload = {section: data.get(section, {}) for section in sections}
        serialized[f"detailed_{track}"] = json.dumps(payload)
    return serialized
def get_track_leaderboard(
    df: pd.DataFrame,
    track: str,
    metric: str = "quality",
    category_filter: str = "all"
) -> pd.DataFrame:
    """Return the rows of ``df`` that competed on ``track``, ranked by ``metric``.

    Optionally restricts to one model category, coerces the track's metric
    columns to numeric, drops rows whose track score is not strictly positive,
    and sorts descending. Returns an empty DataFrame when the metric column is
    missing or anything goes wrong.
    """
    print(f"Getting track leaderboard for {track}, input df has {len(df)} rows")
    if df.empty:
        print("Input DataFrame is empty")
        return df

    track_quality_col = f"{track}_{metric}"
    if track_quality_col not in df.columns:
        print(f"Warning: Missing column {track_quality_col} for track {track}")
        print(f"Available columns: {list(df.columns)}")
        return pd.DataFrame()

    try:
        # Never mutate the caller's frame.
        working = df.copy()
        print(f"Created copy with {len(working)} rows")

        if category_filter != "all":
            before = len(working)
            working = working[working["model_category"] == category_filter]
            print(f"After category filter '{category_filter}': {len(working)} rows (was {before})")

        numeric_columns = [
            f"{track}_{suffix}"
            for suffix in ("quality", "bleu", "chrf", "ci_lower", "ci_upper", "samples", "pairs")
        ]
        print(f"Converting numeric columns: {[col for col in numeric_columns if col in working.columns]}")
        for col in numeric_columns:
            if col not in working.columns:
                continue
            try:
                print(f"Column {col} dtype: {working[col].dtype}, sample values: {working[col].head(3).tolist()}")
                working[col] = pd.to_numeric(working[col], errors='coerce').fillna(0.0)
                print(f"Column {col} converted successfully")
            except Exception as e:
                print(f"Error converting column {col}: {e}")
                working[col] = 0.0

        # Only models that actually ran this track (score strictly > 0) rank.
        before = len(working)
        working = working[working[track_quality_col] > 0]
        print(f"After quality filter (>{track_quality_col} > 0): {len(working)} rows (was {before})")
        if working.empty:
            print("No models found with quality > 0 for this track")
            return working

        print(f"Sorting by {track_quality_col}")
        working = working.sort_values(track_quality_col, ascending=False).reset_index(drop=True)
        print(f"Sorted successfully, final result has {len(working)} rows")
        return working
    except Exception as e:
        print(f"Error in get_track_leaderboard: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()
def prepare_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
    """Shape a track leaderboard into the human-readable table shown in the UI.

    Keeps only the base + track columns, string-formats the numeric metrics,
    merges the CI bounds into a single "[lower, upper]" column, shortens the
    submission date to YYYY-MM-DD, and renames columns to display labels.
    """
    if df.empty:
        return df

    wanted = ["model_name", "author", "submission_date", "model_category"] + [
        f"{track}_{suffix}"
        for suffix in ("quality", "bleu", "chrf", "ci_lower", "ci_upper", "samples", "pairs")
    ]
    display_df = df[[col for col in wanted if col in df.columns]].copy()

    def _format_score(value, precision=4):
        """Render one numeric cell; bad/missing values become a zero string."""
        try:
            if pd.isna(value) or value is None:
                return "0.0000" if precision == 4 else "0.00"
            return f"{float(value):.{precision}f}"
        except (ValueError, TypeError):
            return "0.0000" if precision == 4 else "0.00"

    # BLEU is conventionally shown with 2 decimals; everything else with 4.
    precisions = {
        f"{track}_quality": 4,
        f"{track}_bleu": 2,
        f"{track}_chrf": 4,
        f"{track}_ci_lower": 4,
        f"{track}_ci_upper": 4,
    }
    for col, prec in precisions.items():
        if col in display_df.columns:
            display_df[col] = display_df[col].apply(lambda v, p=prec: _format_score(v, p))

    # Merge the two CI bound columns into one "[lower, upper]" display column.
    lo, hi = f"{track}_ci_lower", f"{track}_ci_upper"
    if lo in display_df.columns and hi in display_df.columns:
        display_df[f"{track}_confidence_interval"] = (
            "[" + display_df[lo] + ", " + display_df[hi] + "]"
        )
        display_df = display_df.drop(columns=[lo, hi])

    if "submission_date" in display_df.columns:
        display_df["submission_date"] = pd.to_datetime(display_df["submission_date"]).dt.strftime("%Y-%m-%d")

    # First word of the configured track name, e.g. "Google" / "UG40".
    track_name = EVALUATION_TRACKS[track]["name"].split()[0]
    return display_df.rename(columns={
        "model_name": "Model Name",
        "author": "Author",
        "submission_date": "Submitted",
        "model_category": "Category",
        f"{track}_quality": f"{track_name} Quality",
        f"{track}_bleu": f"{track_name} BLEU",
        f"{track}_chrf": f"{track_name} ChrF",
        f"{track}_confidence_interval": "95% CI",
        f"{track}_samples": "Samples",
        f"{track}_pairs": "Pairs",
    })