# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from typing import Optional, Tuple


def setup_salt():
    """Clone and set up the SALT library (mirrors the Colab setup)."""
    try:
        import salt.dataset
        print("SALT library already available")
        return True
    except ImportError:
        pass

    print("Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"Added {salt_path} to Python path")

        import salt.dataset
        print("SALT library set up successfully")
        return True
    except Exception as e:
        print(f"Failed to set up SALT: {e}")
        return False


# Setup SALT on startup
print("Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("Cannot continue without SALT library")
    sys.exit(1)

import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *

# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard
    try:
        print("Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print("Initialization complete!")
        print(f"  - Test set: {len(public_test_set):,} samples")
        print(f"  - Current models: {len(current_leaderboard)}")
        return True
    except Exception as e:
        print(f"Initialization failed: {e}")
        traceback.print_exc()
        return False


def download_test_set() -> Tuple[Optional[str], str]:
    """Create a downloadable test set and return the file path and an info message."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## SALT Test Set Downloaded Successfully!

### Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### Track Breakdown:
"""
        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += """
### File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""
        return download_path, info_msg
    except Exception as e:
        error_msg = f"Error creating test set download: {str(e)}"
        return None, error_msg


def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate an uploaded predictions file."""
    try:
        if file is None:
            return "Please upload a predictions file", None, "community"
        if not model_name.strip():
            return "Please provide a model name", None, "community"

        # Handle the different file input types Gradio can pass
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")
        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category
    except Exception as e:
        return f"Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"


def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)
        if evaluation_results.get('error'):
            return f"Evaluation error: {evaluation_results['error']}", None, None, None

        print("Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## Evaluation Complete!

### Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""
        return success_msg, display_leaderboard, summary_plot, None
    except Exception as e:
        error_msg = f"Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None


def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh the leaderboard for a specific track, applying filters."""
    try:
        print(f"Refreshing {track} leaderboard...")
        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get the track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply the search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare the display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate the stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]
            best_model = "None"
            best_score = 0.0
            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text
    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg


def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)
        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Build the language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]
        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue
                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})
                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)
        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create the visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)
        return pairs_df, comparison_plot
    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None


# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }
    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }
    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }
    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }
    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
    """
) as demo:
    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); color: white !important; border-radius: 10px;">
        <h1 style="color: white !important;">SALT Translation Leaderboard</h1>
        <p style="color: white !important;"><strong>Rigorous Evaluation of Translation Models on Ugandan Languages</strong></p>
        <p style="color: white !important;">Three-tier evaluation • Statistical confidence intervals • Research-grade analysis</p>
    </div>
    """)

    # Status indicator
    if initialization_success:
        status_msg = "System initialized successfully"
    else:
        status_msg = "System initialization failed - some features may not work"
    gr.Markdown(f"**System Status**: {status_msg}")

    with gr.Tabs():
        # Tab 1: Download Test Set
        with gr.Tab("Download Test Set", id="download"):
            gr.Markdown("""
## Get the SALT Test Set

Download our test set for translation model evaluation.
""")

            download_btn = gr.Button("Download Test Set", variant="primary", size="lg")
            with gr.Row():
                with gr.Column():
                    download_file = gr.File(label="Test Set File", interactive=False)
                with gr.Column():
                    download_info = gr.Markdown()

        # Tab 2: Submit Predictions
        with gr.Tab("Submit Predictions", id="submit"):
            gr.Markdown("""
## Submit Your Model's Predictions

Upload predictions for evaluation across all tracks.
""")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Model Information")
                    model_name_input = gr.Textbox(
                        label="Model Name",
                        placeholder="e.g., MyTranslator-v2.0",
                        info="Unique name for your model"
                    )
                    author_input = gr.Textbox(
                        label="Author/Organization",
                        placeholder="Your name or organization",
                        value="Anonymous"
                    )
                    description_input = gr.Textbox(
                        label="Model Description",
                        placeholder="Architecture, training data, special features...",
                        lines=4
                    )
                    predictions_file = gr.File(
                        label="Predictions File",
                        file_types=[".csv", ".tsv", ".json"]
                    )
                    validate_btn = gr.Button("Validate Submission", variant="secondary")
                    submit_btn = gr.Button("Submit for Evaluation", variant="primary", interactive=False)
                with gr.Column(scale=1):
                    validation_output = gr.Markdown()
                    gr.Markdown("### Evaluation Results")
                    evaluation_output = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    submission_plot = gr.Plot(label="Performance Analysis")
                with gr.Column():
                    results_table = gr.Dataframe(label="Updated Leaderboard", interactive=False)

        # Tab 3: Google-Comparable Track
        with gr.Tab("Google-Comparable Track", id="google_track"):
            gr.Markdown(f"""
## {EVALUATION_TRACKS['google_comparable']['name']}

**{EVALUATION_TRACKS['google_comparable']['description']}**

This track evaluates models on language pairs supported by Google Translate,
enabling direct comparison with commercial baselines.
""")

            with gr.Row():
                with gr.Column(scale=2):
                    google_search = gr.Textbox(label="Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    google_category = gr.Dropdown(
                        label="Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    google_refresh = gr.Button("Refresh", variant="secondary")

            google_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    google_ranking_plot = gr.Plot(label="Rankings")
                with gr.Column():
                    google_comparison_plot = gr.Plot(label="Performance Comparison")
            google_leaderboard = gr.Dataframe(label="Google-Comparable Leaderboard", interactive=False)

        # Tab 4: UG40-Complete Track
        with gr.Tab("UG40-Complete Track", id="ug40_track"):
            gr.Markdown(f"""
## {EVALUATION_TRACKS['ug40_complete']['name']}

**{EVALUATION_TRACKS['ug40_complete']['description']}**

This track evaluates models on all UG40 language pairs,
providing comprehensive assessment of Ugandan language translation capabilities.
""")

            with gr.Row():
                with gr.Column(scale=2):
                    ug40_search = gr.Textbox(label="Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    ug40_category = gr.Dropdown(
                        label="Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    ug40_refresh = gr.Button("Refresh", variant="secondary")

            ug40_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    ug40_ranking_plot = gr.Plot(label="Rankings")
                with gr.Column():
                    ug40_comparison_plot = gr.Plot(label="Performance Comparison")
            ug40_leaderboard = gr.Dataframe(label="UG40-Complete Leaderboard", interactive=False)

        # Tab 5: Language Pair Analysis
        with gr.Tab("Language Pair Analysis", id="pairs_analysis"):
            gr.Markdown("""
## Language Pair Performance Analysis

Compare model performance across individual language pairs with detailed breakdowns.
""")

            with gr.Row():
                with gr.Column(scale=1):
                    pairs_track_select = gr.Dropdown(
                        label="Select Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable"
                    )
                with gr.Column(scale=1):
                    pairs_refresh = gr.Button("Analyze Language Pairs", variant="primary")

            pairs_comparison_plot = gr.Plot(label="Language Pair Comparison")
            pairs_table = gr.Dataframe(label="Language Pair Performance", interactive=False)

        # Tab 6: Documentation
        with gr.Tab("Documentation", id="docs"):
            gr.Markdown(f"""
# SALT Translation Leaderboard Documentation

## Overview
The SALT Translation Leaderboard provides rigorous evaluation of translation models
on Ugandan languages using three different tracks for fair comparison.

## Evaluation Tracks

**1. Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Purpose**: Fair comparison with commercial translation systems
- **Language Pairs**: {len([1 for src in GOOGLE_SUPPORTED_LANGUAGES for tgt in GOOGLE_SUPPORTED_LANGUAGES if src != tgt])}

**2. UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Language Pairs**: {len([1 for src in ALL_UG40_LANGUAGES for tgt in ALL_UG40_LANGUAGES if src != tgt])}

## Evaluation Metrics

### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)

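For a quick local sanity check before submitting, here is a minimal sketch using the `sacrebleu` library (an illustration only; the leaderboard's own pipeline may differ in tokenization and normalization):

```python
import sacrebleu

hypotheses = ["Amakuru ensi"]     # your model's outputs, one string per sample
references = [["Amakuru ensi"]]   # gold translations: one inner list per reference set

bleu = sacrebleu.corpus_bleu(hypotheses, references)   # 0-100 scale
chrf = sacrebleu.corpus_chrf(hypotheses, references)   # sacrebleu reports ChrF on 0-100
print("BLEU:", round(bleu.score, 2), "ChrF:", round(chrf.score / 100, 4))  # /100 for the 0-1 scale above
```
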
### Model Categories
Models are automatically categorized for fair comparison:
- **Commercial**: Production translation systems
- **Research**: Academic and research institution models
- **Baseline**: Simple baseline and reference models
- **Community**: User-submitted models

## Submission Process

### Step 1: Download Test Set
1. Click "Download Test Set" in the first tab
2. Save the test set file

### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction` (see the sketch under File Formats below)

### Step 3: Submit & Evaluate
1. Fill in model information
2. Upload your predictions file
3. Review the validation report
4. Submit for evaluation

## File Formats

### Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable
salt_000001,"Hello world",eng,lug,general,true
salt_000002,"How are you?",eng,ach,conversation,true
```

### Predictions Format
```csv
sample_id,prediction
salt_000001,"Amakuru ensi"
salt_000002,"Ibino nining?"
```

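A minimal sketch of producing this file from the downloaded test set (`translate` below is a hypothetical placeholder for your own model's inference call, and the input filename is an assumption):

```python
import pandas as pd

def translate(text, src_lang, tgt_lang):
    # Hypothetical helper - replace with your own model's inference.
    raise NotImplementedError

test_set = pd.read_csv("salt_test_set.csv")  # assumed filename of the downloaded test set
rows = []
for _, row in test_set.iterrows():
    prediction = translate(row["source_text"], row["source_language"], row["target_language"])
    rows.append((row["sample_id"], prediction))

pd.DataFrame(rows, columns=["sample_id", "prediction"]).to_csv("predictions.csv", index=False)
```
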
## Contributing
This leaderboard is designed for the research community. When using results:
1. Consider the appropriate track for your comparison
2. Report confidence intervals when available
3. Acknowledge the model category in comparisons

---
*For questions, contact the team at research@sunbird.ai*
""")

    # Event handlers
    predictions_validated = gr.State(value=None)
    detected_category_state = gr.State(value="community")

    # Download test set
    download_btn.click(
        fn=download_test_set,
        outputs=[download_file, download_info]
    )

    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions, category = validate_submission_file(file, model_name, author, description)
        can_evaluate = predictions is not None
        if can_evaluate:
            button_status = "\n\n**Ready to submit for evaluation!**"
        else:
            button_status = "\n\n**Please fix the issues above before evaluation**"
        enhanced_report = report + button_status
        return (
            enhanced_report,
            predictions,
            category,
            gr.update(interactive=can_evaluate)
        )

    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
    )

    # Submit for evaluation
    submit_btn.click(
        fn=evaluate_submission,
        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
        # The function's fourth return value is always None; a hidden Plot absorbs it.
        outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
    )

    # Track leaderboard refresh functions
    google_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
        inputs=[google_search, google_category],
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

    ug40_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
        inputs=[ug40_search, ug40_category],
        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
    )

    # Language pair analysis
    pairs_refresh.click(
        fn=get_language_pair_comparison,
        inputs=[pairs_track_select],
        outputs=[pairs_table, pairs_comparison_plot]
    )

    # Load initial data and update dropdowns
    def load_initial_data():
        try:
            print("Loading initial data...")
            global current_leaderboard
            # Make sure we have a leaderboard
            if current_leaderboard is None:
                current_leaderboard = load_leaderboard()
            print(f"Current leaderboard has {len(current_leaderboard)} entries")
            # Try to load the Google track data
            try:
                google_data = refresh_track_leaderboard("google_comparable", "", "all")
                print("Successfully loaded Google track data")
                return google_data
            except Exception as e:
                print(f"Error loading Google track: {e}")
                # Return empty data if there's an error
                empty_df = pd.DataFrame()
                return (empty_df, None, None, "No data available")
        except Exception as e:
            print(f"Error in load_initial_data: {e}")
            empty_df = pd.DataFrame()
            return (empty_df, None, None, "Error loading data")

    demo.load(
        fn=load_initial_data,
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )


# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )