Spaces:

akera
/

leaderboard

Sleeping

App Files Files Community

akera committed on Jun 16, 2025

Commit

2158319

verified ·

1 Parent(s): 37b3c92

Update app.py

Browse files

Files changed (1) hide show

app.py +856 -388

app.py CHANGED Viewed

@@ -51,7 +51,7 @@ def setup_salt():
         return False
 # Setup SALT on startup
-print("🚀 Starting SALT Translation Leaderboard...")
 if not setup_salt():
     print("❌ Cannot continue without SALT library")
     print("💡 Please check that git is available and GitHub is accessible")
@@ -62,458 +62,711 @@ import pandas as pd
 import json
 import traceback
 from datetime import datetime
-from typing import Optional, Dict, Tuple
-# Import our modules
-from src.test_set import get_public_test_set, get_complete_test_set, create_test_set_download, validate_test_set_integrity
-from src.validation import validate_submission_complete
-from src.evaluation import evaluate_predictions, generate_evaluation_report, get_google_translate_baseline
 from src.leaderboard import (
-    load_leaderboard, add_model_to_leaderboard, get_leaderboard_stats,
-    filter_leaderboard, export_leaderboard, get_model_comparison, prepare_leaderboard_display
 )
 from src.plotting import (
-    create_leaderboard_ranking_plot, create_metrics_comparison_plot,
-    create_language_pair_heatmap, create_coverage_analysis_plot,
-    create_model_performance_timeline, create_google_comparison_plot,
-    create_detailed_model_analysis, create_submission_summary_plot
 )
-from src.utils import sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs
 from config import *
 # Global variables for caching
 current_leaderboard = None
 public_test_set = None
 complete_test_set = None
-def initialize_data():
-    """Initialize test sets and leaderboard data."""
-    global public_test_set, complete_test_set, current_leaderboard
     try:
-        print("🔄 Initializing SALT Translation Leaderboard...")
-        # Load test sets
-        print("📥 Loading test sets...")
-        public_test_set = get_public_test_set()
-        complete_test_set = get_complete_test_set()
-        # Load leaderboard
-        print("🏆 Loading leaderboard...")
-        current_leaderboard = load_leaderboard()
-        print(f"✅ Initialization complete!")
         print(f"   - Test set: {len(public_test_set):,} samples")
-        print(f"   - Language pairs: {len(get_all_language_pairs())}")
         print(f"   - Current models: {len(current_leaderboard)}")
         return True
     except Exception as e:
-        print(f"❌ Initialization failed: {e}")
         traceback.print_exc()
         return False
-def download_test_set() -> Tuple[str, str]:
-    """Create downloadable test set and return file path and info."""
     try:
         global public_test_set
         if public_test_set is None:
-            public_test_set = get_public_test_set()
         # Create download file
-        download_path, stats = create_test_set_download()
-        # Create info message
         info_msg = f"""
-## 📥 SALT Test Set Downloaded Successfully!
-### Dataset Statistics:
 - **Total Samples**: {stats['total_samples']:,}
-- **Language Pairs**: {stats['language_pairs']}
-- **Google Comparable**: {stats['google_comparable_samples']:,} samples
-- **Languages**: {', '.join(stats['languages'])}
-### File Format:
 - `sample_id`: Unique identifier for each sample
 - `source_text`: Text to be translated
 - `source_language`: Source language code
 - `target_language`: Target language code
 - `domain`: Content domain (if available)
 - `google_comparable`: Whether this pair can be compared with Google Translate
-### Next Steps:
-1. Run your model on the source texts
-2. Create a CSV/JSON file with columns: `sample_id`, `prediction`
-3. Upload your predictions using the "Submit Predictions" tab
         """
         return download_path, info_msg
     except Exception as e:
-        error_msg = f"❌ Error creating test set download: {str(e)}"
         return None, error_msg
-def validate_submission(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame]]:
-    """Validate uploaded prediction file, supporting str paths, bytes, and Gradio wrappers."""
     try:
         if file is None:
-            return "❌ Please upload a predictions file", None
         if not model_name.strip():
-            return "❌ Please provide a model name", None
-        # 1) Determine raw bytes
         if isinstance(file, bytes):
             file_content = file
         elif isinstance(file, str):
-            # could be a path or raw text
             if os.path.exists(file):
                 with open(file, "rb") as f:
                     file_content = f.read()
             else:
                 file_content = file.encode("utf-8")
         elif hasattr(file, "name") and os.path.exists(file.name):
-            # tempfile._TemporaryFileWrapper from Gradio
             with open(file.name, "rb") as f:
                 file_content = f.read()
         else:
-            return "❌ Could not read uploaded file", None
-        # 2) Infer filename for format-sniffing
         filename = (
             getattr(file, "name", None)
             or getattr(file, "filename", None)
             or "predictions.csv"
         )
-        # 3) Load test set if needed
         global complete_test_set
         if complete_test_set is None:
-            complete_test_set = get_complete_test_set()
-        # 4) Run existing validation pipeline
-        validation_result = validate_submission_complete(
-            file_content, filename, complete_test_set, model_name
         )
         if validation_result["valid"]:
-            return validation_result["report"], validation_result["predictions"]
         else:
-            return validation_result["report"], None
     except Exception as e:
         return (
             f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
             None,
         )
-def evaluate_submission(
-    predictions_df: pd.DataFrame,
-    model_name: str,
-    author: str,
     description: str,
-    validation_info: Dict
-) -> Tuple[str, pd.DataFrame, object, object]:
-    """Evaluate validated predictions and update leaderboard."""
     try:
         if predictions_df is None:
-            return "❌ No valid predictions to evaluate", None, None, None
         # Get complete test set with targets
         global complete_test_set, current_leaderboard
         if complete_test_set is None:
-            complete_test_set = get_complete_test_set()
-        # Run evaluation
-        print(f"🔄 Evaluating {model_name}...")
-        evaluation_results = evaluate_predictions(predictions_df, complete_test_set)
-        if evaluation_results.get('error'):
-            return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None
-        # Add to leaderboard
-        print("🏆 Adding to leaderboard...")
-        model_type = "user_submission"  # Could be enhanced to detect model type
-        updated_leaderboard = add_model_to_leaderboard(
             model_name=sanitize_model_name(model_name),
-            author=author or "Anonymous",
             evaluation_results=evaluation_results,
-            validation_info=validation_info,
-            model_type=model_type,
             description=description or ""
         )
         # Update global leaderboard
         current_leaderboard = updated_leaderboard
-        # Generate evaluation report
-        report = generate_evaluation_report(evaluation_results, model_name)
-        # Create visualization plots
-        summary_plot = create_submission_summary_plot(validation_info, evaluation_results)
-        ranking_plot = create_leaderboard_ranking_plot(updated_leaderboard)
-        # Format success message
-        rank = updated_leaderboard[updated_leaderboard['model_name'] == sanitize_model_name(model_name)].index[0] + 1
-        total_models = len(updated_leaderboard)
         success_msg = f"""
-## 🎉 Evaluation Complete!
-### Your Results:
 - **Model**: {model_name}
-- **Rank**: #{rank} out of {total_models} models
-- **Quality Score**: {evaluation_results['averages'].get('quality_score', 0):.4f}
-- **BLEU**: {evaluation_results['averages'].get('bleu', 0):.2f}
-- **ChrF**: {evaluation_results['averages'].get('chrf', 0):.4f}
-### Coverage:
-- **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}
-- **Language Pairs**: {evaluation_results['summary']['language_pairs_covered']}
-- **Google Comparable**: {evaluation_results['summary']['google_comparable_pairs']} pairs
 {report}
         """
-        return success_msg, prepare_leaderboard_display(updated_leaderboard), summary_plot, ranking_plot
     except Exception as e:
-        error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        return error_msg, None, None, None
-def refresh_leaderboard_display(
     search_query: str = "",
-    model_type_filter: str = "all",
-    min_coverage: float = 0.0,
-    google_only: bool = False
 ) -> Tuple[pd.DataFrame, object, object, str]:
-    """Refresh and filter leaderboard display."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
-            current_leaderboard = load_leaderboard()
-        # Apply filters
-        filtered_df = filter_leaderboard(
-            current_leaderboard,
-            search_query=search_query,
-            model_type=model_type_filter,
-            min_coverage=min_coverage,
-            google_comparable_only=google_only
         )
-        # Prepare for display (removes detailed_metrics column)
-        display_df = prepare_leaderboard_display(filtered_df)
         # Create plots
-        ranking_plot = create_leaderboard_ranking_plot(filtered_df)
-        comparison_plot = create_metrics_comparison_plot(filtered_df)
-        # Get stats
-        stats = get_leaderboard_stats(filtered_df)
         stats_text = f"""
-### 📊 Leaderboard Statistics
-- **Total Models**: {stats['total_models']}
-- **Average Quality Score**: {stats['avg_quality_score']:.4f}
-- **Google Comparable Models**: {stats['google_comparable_models']}
-**Best Model**: {stats['best_model']['name'] if stats['best_model'] else 'None'}
-**Latest Submission**: {stats['latest_submission'][:10] if stats['latest_submission'] else 'None'}
         """
         return display_df, ranking_plot, comparison_plot, stats_text
     except Exception as e:
-        error_msg = f"Error loading leaderboard: {str(e)}"
         empty_df = pd.DataFrame()
         return empty_df, None, None, error_msg
-def get_model_details(model_name: str) -> Tuple[str, object]:
-    """Get detailed analysis for a specific model."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
-            return "Leaderboard not loaded", None
         # Find model
         model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
         if model_row.empty:
-            return f"Model '{model_name}' not found", None
         model_info = model_row.iloc[0]
-        # Parse detailed metrics
         try:
-            detailed_results = json.loads(model_info['detailed_metrics'])
         except:
             detailed_results = {}
-        # Create detailed plot
-        detail_plot = create_detailed_model_analysis(detailed_results, model_name)
-        # Format model details
         details_text = f"""
-## 🔍 Model Details: {model_name}
-### Basic Information:
 - **Author**: {model_info['author']}
 - **Submission Date**: {model_info['submission_date'][:10]}
-- **Model Type**: {model_info['model_type']}
 - **Description**: {model_info['description'] or 'No description provided'}
-### Performance Metrics:
-- **Quality Score**: {model_info['quality_score']:.4f}
-- **BLEU**: {model_info['bleu']:.2f}
-- **ChrF**: {model_info['chrf']:.4f}
-- **ROUGE-1**: {model_info['rouge1']:.4f}
-- **ROUGE-L**: {model_info['rougeL']:.4f}
-### Coverage Information:
-- **Total Samples**: {model_info['total_samples']:,}
-- **Language Pairs Covered**: {model_info['language_pairs_covered']}
-- **Google Comparable Pairs**: {model_info['google_pairs_covered']}
-- **Coverage Rate**: {model_info['coverage_rate']:.1%}
-### Google Translate Comparison:
-- **Google Quality Score**: {model_info['google_quality_score']:.4f}
-- **Google BLEU**: {model_info['google_bleu']:.2f}
-- **Google ChrF**: {model_info['google_chrf']:.4f}
         """
-        return details_text, detail_plot
     except Exception as e:
         error_msg = f"Error getting model details: {str(e)}"
         return error_msg, None
 # Initialize data on startup
-print("🚀 Starting SALT Translation Leaderboard...")
-initialization_success = initialize_data()
-# Create Gradio interface
 with gr.Blocks(
-    title=TITLE,
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
-        max-width: 1400px !important;
         margin: 0 auto;
     }
-    .main-header {
         text-align: center;
         margin-bottom: 2rem;
         padding: 2rem;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
         color: white;
         border-radius: 10px;
     }
     .metric-box {
-        background: #f8f9fa;
         padding: 1rem;
         border-radius: 8px;
         margin: 0.5rem 0;
-        border-left: 4px solid #007bff;
     }
-    .error-box {
-        background: #f8d7da;
-        color: #721c24;
-        padding: 1rem;
         border-radius: 8px;
-        border-left: 4px solid #dc3545;
-    }
-    .success-box {
-        background: #d4edda;
-        color: #155724;
         padding: 1rem;
-        border-radius: 8px;
-        border-left: 4px solid #28a745;
     }
     """
 ) as demo:
-    # Header
     gr.HTML(f"""
-    <div class="main-header">
-    <h1>{TITLE}</h1>
-    <p>{DESCRIPTION}</p>
     <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
     </div>
     """)
     # Status indicator
     if initialization_success:
-        status_msg = "✅ System initialized successfully"
     else:
         status_msg = "❌ System initialization failed - some features may not work"
-    gr.Markdown(f"**Status**: {status_msg}")
     with gr.Tabs():
-        # Tab 1: Get Test Set
         with gr.Tab("📥 Download Test Set", id="download"):
             gr.Markdown("""
-            ## 📋 Get the SALT Translation Test Set
-            Download the standardized test set to evaluate your translation model.
-            The test set contains source texts in multiple Ugandan languages that you need to translate.
             """)
             with gr.Row():
-                download_btn = gr.Button("📥 Download Test Set", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
                     download_file = gr.File(label="📂 Test Set File", interactive=False)
                 with gr.Column():
                     download_info = gr.Markdown(label="ℹ️ Test Set Information")
-            gr.Markdown("""
-            ### 📖 Instructions
-            1. **Download** the test set using the button above
-            2. **Run your model** on the source texts to generate translations
-            3. **Create a predictions file** with your model's outputs
-            4. **Submit** your predictions using the "Submit Predictions" tab
-            ### 📋 Required Prediction Format
-            Your predictions file must be a CSV/TSV/JSON with these columns:
-            - `sample_id`: The unique identifier from the test set
-            - `prediction`: Your model's translation for that sample
-            **Example CSV:**
-            ```
-            sample_id,prediction
-            salt_000001,Oli otya mukwano gwange?
-            salt_000002,Webale nyo olukya
-            ...
-            ```
-            """)
-        # Tab 2: Submit Predictions
         with gr.Tab("🚀 Submit Predictions", id="submit"):
             gr.Markdown("""
-            ## 🎯 Submit Your Model's Predictions
-            Upload your model's predictions on the SALT test set for evaluation.
             """)
             with gr.Row():
                 with gr.Column(scale=1):
-                    # Model information
                     gr.Markdown("### 📝 Model Information")
                     model_name_input = gr.Textbox(
                         label="🤖 Model Name",
-                        placeholder="e.g., MyTranslator-v1.0",
                         info="Unique name for your model"
                     )
@@ -524,313 +777,528 @@ with gr.Blocks(
                     )
                     description_input = gr.Textbox(
-                        label="📄 Description (Optional)",
-                        placeholder="Brief description of your model",
-                        lines=3
                     )
-                    # File upload
                     gr.Markdown("### 📤 Upload Predictions")
-                    gr.Markdown("Upload a CSV/TSV/JSON file with your model's predictions")
                     predictions_file = gr.File(
                         label="📂 Predictions File",
                         file_types=[".csv", ".tsv", ".json"]
                     )
                     validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
-                    submit_btn = gr.Button("🚀 Submit for Evaluation", variant="primary", interactive=False)
                 with gr.Column(scale=1):
                     gr.Markdown("### 📊 Validation Results")
                     validation_output = gr.Markdown()
             # Results section
-            gr.Markdown("### 🏆 Evaluation Results")
             with gr.Row():
                 evaluation_output = gr.Markdown()
             with gr.Row():
                 with gr.Column():
-                    submission_plot = gr.Plot(label="📈 Your Submission Analysis")
                 with gr.Column():
-                    updated_leaderboard_plot = gr.Plot(label="🏆 Updated Leaderboard")
             with gr.Row():
-                results_table = gr.Dataframe(label="📊 Updated Leaderboard", interactive=False)
-        # Tab 3: Leaderboard
-        with gr.Tab("🏆 Leaderboard", id="leaderboard"):
             with gr.Row():
-                with gr.Column(scale=3):
-                    search_input = gr.Textbox(
-                        label="🔍 Search Models",
-                        placeholder="Search by model name, author...",
-                    )
                 with gr.Column(scale=1):
-                    model_type_dropdown = gr.Dropdown(
-                        label="🔧 Model Type",
-                        choices=["all", "user_submission", "baseline"],
                         value="all"
                     )
                 with gr.Column(scale=1):
-                    min_coverage_slider = gr.Slider(
-                        label="📊 Min Coverage",
-                        minimum=0.0,
-                        maximum=1.0,
-                        value=0.0,
-                        step=0.1
                     )
                 with gr.Column(scale=1):
-                    google_only_checkbox = gr.Checkbox(
-                        label="🤖 Google Comparable Only",
-                        value=False
                     )
             with gr.Row():
-                refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
-                leaderboard_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
-                    leaderboard_plot = gr.Plot(label="🏆 Rankings")
                 with gr.Column():
-                    comparison_plot = gr.Plot(label="📊 Multi-Metric Comparison")
             with gr.Row():
-                leaderboard_table = gr.Dataframe(
-                    label="📈 Full Leaderboard",
-                    interactive=False,
-                    wrap=True
-                )
-        # Tab 4: Model Analysis
-        with gr.Tab("🔍 Model Analysis", id="analysis"):
             with gr.Row():
-                model_select = gr.Dropdown(
-                    label="🤖 Select Model",
-                    choices=[],
-                    value=None,
-                    info="Choose a model for detailed analysis"
-                )
-                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Row():
                 model_details = gr.Markdown()
             with gr.Row():
-                model_analysis_plot = gr.Plot(label="📊 Detailed Performance Analysis")
-        # Tab 5: Documentation
-        with gr.Tab("📚 Documentation", id="docs"):
             gr.Markdown(f"""
-            # 📖 SALT Translation Leaderboard Documentation
             ## 🎯 Overview
-            The SALT Translation Leaderboard is a scientific evaluation platform for translation models on Ugandan languages.
-            Submit your model's predictions on our standardized test set to see how it compares with other models.
-            ## 🗣️ Supported Languages
-            **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
-            {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in ALL_UG40_LANGUAGES])}
-            **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
-            {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in GOOGLE_SUPPORTED_LANGUAGES])}
             ## 📊 Evaluation Metrics
             ### Primary Metrics
-            - **Quality Score**: Composite metric (0-1, higher better) combining multiple metrics
-            - **BLEU**: Translation quality score (0-100, higher better)
-            - **ChrF**: Character-level F-score (0-1, higher better)
             ### Secondary Metrics
-            - **ROUGE-1/ROUGE-L**: Recall-oriented metrics (0-1, higher better)
-            - **CER/WER**: Character/Word Error Rate (0-1, lower better)
             - **Length Ratio**: Prediction/reference length ratio
             ## 🔄 Submission Process
-            ### Step 1: Download Test Set
-            1. Go to "Download Test Set" tab
-            2. Click "Download Test Set" button
-            3. Save the `salt_test_set.csv` file
             ### Step 2: Generate Predictions
-            1. Load the test set in your code
             2. For each row, translate `source_text` from `source_language` to `target_language`
             3. Save results as CSV with columns: `sample_id`, `prediction`
             ### Step 3: Submit & Evaluate
-            1. Go to "Submit Predictions" tab
-            2. Fill in model information
-            3. Upload your predictions file
-            4. Validate and submit for evaluation
-            ## 📋 File Formats
-            ### Test Set Format
             ```csv
-            sample_id,source_text,source_language,target_language,domain,google_comparable
-            salt_000001,"Hello world",eng,lug,general,true
-            salt_000002,"How are you?",eng,ach,conversation,true
             ```
             ### Predictions Format
             ```csv
-            sample_id,prediction
-            salt_000001,"Amakuru ensi"
-            salt_000002,"Ibino nining?"
             ```
-            ## 🏆 Leaderboard Types
-            ### 1. Full UG40 Leaderboard
-            - Includes all {len(get_all_language_pairs())} language pairs
-            - Complete evaluation across all Ugandan languages
-            - Primary ranking system
-            ### 2. Google Translate Comparable
-            - Limited to {len(get_google_comparable_pairs())} pairs
-            - Only languages supported by Google Translate
-            - Allows direct comparison with Google Translate baseline
-            ## 🔬 Scientific Rigor
-            - **Standardized Evaluation**: Same test set for all models
-            - **Multiple Metrics**: Comprehensive evaluation beyond just BLEU
-            - **Coverage Tracking**: Transparency about what each model covers
-            - **Reproducible**: All evaluation code and data available
-            ## 🤝 Contributing
-            This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
-            **Contact**: [research@sunbird.ai](mailto:research@sunbird.ai)
-            **GitHub**: [Sunbird AI GitHub](https://github.com/sunbirdai)
             ## 📄 Citation
             If you use this leaderboard in your research, please cite:
             ```bibtex
-            @misc{{salt_leaderboard_2024,
-              title={{SALT Translation Leaderboard: Evaluation of Translation Models on Ugandan Languages}},
               author={{Sunbird AI}},
               year={{2024}},
-              url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
             }}
             ```
             ## 🔗 Related Resources
             - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
-            - **Sunbird AI Models**: [Sunbird Organization](https://huggingface.co/Sunbird)
-            - **Research Papers**: [Sunbird AI Publications](https://sunbird.ai/research)
             """)
-    # Event handlers with state management
     predictions_validated = gr.State(value=None)
     validation_info_state = gr.State(value=None)
     # Download test set
     download_btn.click(
-        fn=download_test_set,
         outputs=[download_file, download_info]
     )
     # Validate predictions
-    def handle_validation(file, model_name, author, description):
-        report, predictions = validate_submission(file, model_name, author, description)
         valid = predictions is not None
-        # Build the four returns:
-        if valid:
-            return (
-                report,
-                predictions,                   # predictions_validated state
-                predictions,                   # validation_info_state (you can store whatever you like here)
-                gr.update(interactive=True)
-            )
-        else:
-            return (
-                report,
-                None,
-                None,
-                gr.update(interactive=False)   # <— this *disables* the button
-            )
     validate_btn.click(
-        fn=handle_validation,
         inputs=[predictions_file, model_name_input, author_input, description_input],
-        outputs=[validation_output, predictions_validated, validation_info_state, submit_btn]
     )
     # Submit for evaluation
-    def handle_submission(predictions, model_name, author, description, validation_info):
         if predictions is None:
-            return "❌ Please validate your submission first", None, None, None
-        # Extract validation info dict
-        validation_dict = {
-            'coverage': getattr(validation_info, 'coverage', 0.8) if hasattr(validation_info, 'coverage') else 0.8,
-            'report': 'Validation passed'
-        }
-        return evaluate_submission(predictions, model_name, author, description, validation_dict)
     submit_btn.click(
-        fn=handle_submission,
-        inputs=[predictions_validated, model_name_input, author_input, description_input, validation_info_state],
-        outputs=[evaluation_output, results_table, submission_plot, updated_leaderboard_plot]
     )
-    # Refresh leaderboard
-    def update_leaderboard_and_dropdown(*args):
-        table, plot1, plot2, stats = refresh_leaderboard_display(*args)
-        # Update model dropdown choices
-        if current_leaderboard is not None and not current_leaderboard.empty:
-            model_choices = current_leaderboard['model_name'].tolist()
-        else:
-            model_choices = []
-        return table, plot1, plot2, stats, gr.Dropdown(choices=model_choices)
-    refresh_btn.click(
-        fn=update_leaderboard_and_dropdown,
-        inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
-        outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
     )
-    # Auto-refresh on filter changes
-    for input_component in [search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox]:
-        input_component.change(
-            fn=update_leaderboard_and_dropdown,
-            inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
-            outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
-        )
     # Model analysis
     analyze_btn.click(
-        fn=get_model_details,
-        inputs=[model_select],
-        outputs=[model_details, model_analysis_plot]
     )
-    # Load initial data
     demo.load(
-        fn=update_leaderboard_and_dropdown,
-        inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
-        outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
     )
-# Launch the application
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",

         return False
 # Setup SALT on startup
+print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
 if not setup_salt():
     print("❌ Cannot continue without SALT library")
     print("💡 Please check that git is available and GitHub is accessible")
 import json
 import traceback
 from datetime import datetime
+from typing import Optional, Dict, Tuple, List
+# Import our enhanced modules
+from src.test_set import (
+    get_public_test_set_scientific,
+    get_complete_test_set_scientific,
+    create_test_set_download_scientific,
+    validate_test_set_integrity_scientific,
+    get_track_test_set
+)
+from src.validation import validate_submission_scientific
+from src.evaluation import (
+    evaluate_predictions_scientific,
+    generate_scientific_report,
+    compare_models_statistically
+)
 from src.leaderboard import (
+    load_scientific_leaderboard,
+    add_model_to_scientific_leaderboard,
+    get_scientific_leaderboard_stats,
+    get_track_leaderboard,
+    prepare_track_leaderboard_display,
+    perform_fair_comparison,
+    export_scientific_leaderboard
 )
 from src.plotting import (
+    create_scientific_leaderboard_plot,
+    create_language_pair_heatmap_scientific,
+    create_statistical_comparison_plot,
+    create_category_comparison_plot,
+    create_adequacy_analysis_plot,
+    create_cross_track_analysis_plot,
+    create_scientific_model_detail_plot
+)
+from src.utils import (
+    sanitize_model_name,
+    get_all_language_pairs,
+    get_google_comparable_pairs,
+    get_track_language_pairs,
+    format_metric_value
 )
 from config import *
 # Global variables for caching
 current_leaderboard = None
 public_test_set = None
 complete_test_set = None
+test_set_stats = None
+def initialize_scientific_data():
+    """Initialize scientific test sets and leaderboard data."""
+    global public_test_set, complete_test_set, current_leaderboard, test_set_stats
     try:
+        print("🔬 Initializing SALT Translation Leaderboard - Scientific Edition...")
+        # Load scientific test sets
+        print("📥 Loading scientific test sets...")
+        public_test_set = get_public_test_set_scientific()
+        complete_test_set = get_complete_test_set_scientific()
+        # Load scientific leaderboard
+        print("🏆 Loading scientific leaderboard...")
+        current_leaderboard = load_scientific_leaderboard()
+        # Validate test set integrity
+        print("🔍 Validating test set integrity...")
+        test_set_stats = validate_test_set_integrity_scientific()
+        print(f"✅ Scientific initialization complete!")
         print(f"   - Test set: {len(public_test_set):,} samples")
+        print(f"   - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
+        print(f"   - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
         print(f"   - Current models: {len(current_leaderboard)}")
         return True
     except Exception as e:
+        print(f"❌ Scientific initialization failed: {e}")
         traceback.print_exc()
         return False
+def download_scientific_test_set() -> Tuple[str, str]:
+    """Create downloadable scientific test set and return file path and info."""
     try:
         global public_test_set
         if public_test_set is None:
+            public_test_set = get_public_test_set_scientific()
         # Create download file
+        download_path, stats = create_test_set_download_scientific()
+        # Create comprehensive info message
+        adequacy = stats.get('adequacy_assessment', 'unknown')
+        adequacy_emoji = {
+            'excellent': '🟢',
+            'good': '🟡',
+            'fair': '🟠',
+            'insufficient': '🔴',
+            'unknown': '⚪'
+        }.get(adequacy, '⚪')
         info_msg = f"""
+## 📥 SALT Scientific Test Set Downloaded Successfully!
+### 🔬 Scientific Edition Features:
+- **Stratified Sampling**: Ensures representative coverage across domains
+- **Statistical Weighting**: Samples weighted by track importance
+- **Track Balancing**: Optimized for fair cross-track comparison
+- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
+### 📊 Dataset Statistics:
 - **Total Samples**: {stats['total_samples']:,}
+- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
+- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
+- **Domains**: {', '.join(stats.get('domains', ['general']))}
+### 🏁 Track Breakdown:
+"""
+        track_breakdown = stats.get('track_breakdown', {})
+        for track_name, track_info in track_breakdown.items():
+            status_emoji = '✅' if track_info.get('statistical_adequacy', False) else '⚠️'
+            info_msg += f"""
+**{status_emoji} {track_info.get('name', track_name)}**:
+- Samples: {track_info.get('total_samples', 0):,}
+- Language Pairs: {track_info.get('language_pairs', 0)}
+- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
+- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
+"""
+        info_msg += f"""
+### 📋 Enhanced File Format:
 - `sample_id`: Unique identifier for each sample
 - `source_text`: Text to be translated
 - `source_language`: Source language code
 - `target_language`: Target language code
 - `domain`: Content domain (if available)
 - `google_comparable`: Whether this pair can be compared with Google Translate
+- `tracks_included`: Comma-separated list of tracks that include this sample
+- `statistical_weight`: Statistical importance weight (1.0-5.0)
+### 🔬 Next Steps for Scientific Evaluation:
+1. **Run your model** on the source texts to generate translations
+2. **Create a predictions file** with columns: `sample_id`, `prediction`
+3. **Optional**: Add `category` column to help with model classification
+4. **Submit** your predictions using the appropriate track tab
+5. **Analyze** results with statistical confidence intervals
+### 💡 Tips for Best Results:
+- Ensure coverage of all language pairs for chosen track
+- Include confidence scores if available
+- Provide detailed model description for proper categorization
+- Consider submitting to multiple tracks for comprehensive evaluation
         """
         return download_path, info_msg
     except Exception as e:
+        error_msg = f"❌ Error creating scientific test set download: {str(e)}"
         return None, error_msg
+def validate_scientific_submission(
+    file, model_name: str, author: str, description: str
+) -> Tuple[str, Optional[pd.DataFrame], str]:
+    """Validate uploaded prediction file with scientific rigor."""
     try:
         if file is None:
+            return "❌ Please upload a predictions file", None, "community"
         if not model_name.strip():
+            return "❌ Please provide a model name", None, "community"
+        # Handle different file input types
         if isinstance(file, bytes):
             file_content = file
         elif isinstance(file, str):
             if os.path.exists(file):
                 with open(file, "rb") as f:
                     file_content = f.read()
             else:
                 file_content = file.encode("utf-8")
         elif hasattr(file, "name") and os.path.exists(file.name):
             with open(file.name, "rb") as f:
                 file_content = f.read()
         else:
+            return "❌ Could not read uploaded file", None, "community"
+        # Determine filename
         filename = (
             getattr(file, "name", None)
             or getattr(file, "filename", None)
             or "predictions.csv"
         )
+        # Load test set if needed
         global complete_test_set
         if complete_test_set is None:
+            complete_test_set = get_complete_test_set_scientific()
+        # Run enhanced scientific validation
+        validation_result = validate_submission_scientific(
+            file_content, filename, complete_test_set, model_name, author, description
         )
+        detected_category = validation_result.get("category", "community")
         if validation_result["valid"]:
+            return validation_result["report"], validation_result["predictions"], detected_category
         else:
+            return validation_result["report"], None, detected_category
     except Exception as e:
         return (
             f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
             None,
+            "community"
         )
+def evaluate_scientific_submission(
+    predictions_df: pd.DataFrame,
+    model_name: str,
+    author: str,
     description: str,
+    detected_category: str,
+    validation_info: Dict,
+) -> Tuple[str, pd.DataFrame, object, object, object]:
+    """Evaluate validated predictions using scientific methodology."""
     try:
         if predictions_df is None:
+            return "❌ No valid predictions to evaluate", None, None, None, None
         # Get complete test set with targets
         global complete_test_set, current_leaderboard
         if complete_test_set is None:
+            complete_test_set = get_complete_test_set_scientific()
+        # Run scientific evaluation across all tracks
+        print(f"🔬 Starting scientific evaluation for {model_name}...")
+        evaluation_results = evaluate_predictions_scientific(
+            predictions_df, complete_test_set, detected_category
+        )
+        if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
+            errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
+            return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None, None
+        # Add to scientific leaderboard
+        print("🏆 Adding to scientific leaderboard...")
+        updated_leaderboard = add_model_to_scientific_leaderboard(
             model_name=sanitize_model_name(model_name),
+            author=author or "Anonymous",
             evaluation_results=evaluation_results,
+            model_category=detected_category,
             description=description or ""
         )
         # Update global leaderboard
         current_leaderboard = updated_leaderboard
+        # Generate scientific report
+        report = generate_scientific_report(evaluation_results, model_name)
+        # Create visualizations
+        summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
+        cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
+        # Prepare display leaderboard (Google-comparable track by default)
+        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
+        display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
+        # Format success message with track-specific results
         success_msg = f"""
+## 🎉 Scientific Evaluation Complete!
+### 📊 Model Information:
 - **Model**: {model_name}
+- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
+- **Author**: {author or 'Anonymous'}
+### 🏆 Track Performance Summary:
+"""
+        tracks = evaluation_results.get('tracks', {})
+        for track_name, track_data in tracks.items():
+            if not track_data.get('error'):
+                track_config = EVALUATION_TRACKS[track_name]
+                track_averages = track_data.get('track_averages', {})
+                summary = track_data.get('summary', {})
+                # Get rank in this track
+                track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
+                if not track_leaderboard.empty:
+                    model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
+                    rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
+                    total_models = len(track_leaderboard)
+                else:
+                    rank = "N/A"
+                    total_models = 0
+                quality_score = track_averages.get('quality_score', 0)
+                bleu_score = track_averages.get('bleu', 0)
+                samples = summary.get('total_samples', 0)
+                success_msg += f"""
+**🏁 {track_config['name']}**:
+- Rank: #{rank} out of {total_models} models
+- Quality Score: {quality_score:.4f}
+- BLEU: {bleu_score:.2f}
+- Samples: {samples:,}
+"""
+        success_msg += f"""
+### 🔬 Scientific Adequacy:
+- **Cross-Track Consistency**: Available in detailed analysis
+- **Statistical Confidence**: 95% confidence intervals computed
+- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
 {report}
         """
+        return success_msg, display_leaderboard, summary_plot, cross_track_plot, updated_leaderboard
     except Exception as e:
+        error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        return error_msg, None, None, None, None
+def refresh_track_leaderboard(
+    track: str,
     search_query: str = "",
+    category_filter: str = "all",
+    min_adequacy: float = 0.0,
+    show_ci: bool = True
 ) -> Tuple[pd.DataFrame, object, object, str]:
+    """Refresh leaderboard for a specific track with filters."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
+            current_leaderboard = load_scientific_leaderboard()
+        # Get track-specific leaderboard
+        track_leaderboard = get_track_leaderboard(
+            current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
         )
+        # Apply search filter
+        if search_query:
+            query_lower = search_query.lower()
+            mask = (
+                track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
+                track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
+            )
+            track_leaderboard = track_leaderboard[mask]
+        # Prepare for display
+        display_df = prepare_track_leaderboard_display(track_leaderboard, track)
         # Create plots
+        ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
+        comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
+        # Get track statistics
+        track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
+        track_config = EVALUATION_TRACKS[track]
         stats_text = f"""
+### 📊 {track_config['name']} Statistics
+- **Total Models**: {track_stats.get('total_models', 0)}
+- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
+- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
+**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
+**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
+### 🔬 Scientific Notes:
+- All metrics include 95% confidence intervals
+- Statistical adequacy verified for reliable comparisons
+- {track_config['description']}
         """
         return display_df, ranking_plot, comparison_plot, stats_text
     except Exception as e:
+        error_msg = f"Error loading {track} leaderboard: {str(e)}"
         empty_df = pd.DataFrame()
         return empty_df, None, None, error_msg
+def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
+    """Get detailed scientific analysis for a specific model."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
+            return "Leaderboard not loaded", None, None
         # Find model
         model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
         if model_row.empty:
+            return f"Model '{model_name}' not found", None, None
         model_info = model_row.iloc[0]
+        # Parse detailed metrics for the requested track
         try:
+            detailed_results = json.loads(model_info[f'detailed_{track}'])
         except:
             detailed_results = {}
+        # Create detailed plots
+        detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
+        # Create language pair heatmap
+        heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
+        # Format model details with scientific information
+        track_config = EVALUATION_TRACKS[track]
+        category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
+        # Extract track-specific metrics
+        quality_col = f"{track}_quality"
+        bleu_col = f"{track}_bleu"
+        chrf_col = f"{track}_chrf"
+        ci_lower_col = f"{track}_ci_lower"
+        ci_upper_col = f"{track}_ci_upper"
+        samples_col = f"{track}_samples"
+        pairs_col = f"{track}_pairs"
+        adequate_col = f"{track}_adequate"
         details_text = f"""
+## 🔬 Scientific Model Analysis: {model_name}
+### 📋 Basic Information:
 - **Author**: {model_info['author']}
+- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
 - **Submission Date**: {model_info['submission_date'][:10]}
 - **Description**: {model_info['description'] or 'No description provided'}
+### 🏁 {track_config['name']} Performance:
+- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
+- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
+- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
+### 📊 Coverage Information:
+- **Total Samples**: {model_info.get(samples_col, 0):,}
+- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
+- **Statistical Adequacy**: {'✅ Yes' if model_info.get(adequate_col, False) else '❌ No'}
+### 🔬 Statistical Metadata:
+- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
+- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
+- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
+### 📈 Cross-Track Performance:
+"""
+        # Add other track performances for comparison
+        for other_track in EVALUATION_TRACKS.keys():
+            if other_track != track:
+                other_quality_col = f"{other_track}_quality"
+                other_adequate_col = f"{other_track}_adequate"
+                if model_info.get(other_adequate_col, False):
+                    other_quality = model_info.get(other_quality_col, 0)
+                    details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
+                else:
+                    details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
+        details_text += f"""
+### 💡 Scientific Interpretation:
+- Performance metrics include 95% confidence intervals for reliability
+- Statistical adequacy ensures meaningful comparisons with other models
+- Cross-track analysis reveals model strengths across different language sets
+- Category classification helps contextualize performance expectations
         """
+        return details_text, detail_plot, heatmap_plot
     except Exception as e:
         error_msg = f"Error getting model details: {str(e)}"
+        return error_msg, None, None
+def perform_model_comparison(
+    model_names: List[str], track: str, comparison_type: str = "statistical"
+) -> Tuple[str, object]:
+    """Perform scientific comparison between selected models."""
+    try:
+        global current_leaderboard
+        if current_leaderboard is None:
+            return "Leaderboard not loaded", None
+        if len(model_names) < 2:
+            return "Please select at least 2 models for comparison", None
+        # Get models
+        models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
+        if len(models) < 2:
+            return "Selected models not found in leaderboard", None
+        # Perform fair comparison
+        comparison_result = perform_fair_comparison(current_leaderboard, model_names)
+        if comparison_result.get('error'):
+            return f"Comparison error: {comparison_result['error']}", None
+        # Create comparison visualization
+        if comparison_type == "statistical":
+            comparison_plot = create_statistical_comparison_plot(models, track)
+        else:
+            comparison_plot = create_category_comparison_plot(models, track)
+        # Format comparison report
+        track_config = EVALUATION_TRACKS[track]
+        comparison_text = f"""
+## 🔬 Scientific Model Comparison - {track_config['name']}
+### 📊 Models Compared:
+"""
+        quality_col = f"{track}_quality"
+        ci_lower_col = f"{track}_ci_lower"
+        ci_upper_col = f"{track}_ci_upper"
+        # Sort models by performance
+        models_sorted = models.sort_values(quality_col, ascending=False)
+        for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
+            category_info = MODEL_CATEGORIES.get(model['model_category'], {})
+            comparison_text += f"""
+**#{i}. {model['model_name']}**
+- Category: {category_info.get('name', 'Unknown')}
+- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
+- Author: {model['author']}
+"""
+        # Add statistical analysis
+        track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
+        if track_comparison:
+            comparison_text += f"""
+### 🔬 Statistical Analysis:
+- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
+- **Confidence intervals available**: Yes (95% level)
+- **Fair comparison possible**: {'✅ Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
+"""
+            # Check for statistical significance (simplified)
+            quality_scores = list(track_comparison.get('quality_scores', {}).values())
+            if len(quality_scores) >= 2:
+                score_range = max(quality_scores) - min(quality_scores)
+                if score_range > 0.05:  # 5% difference threshold
+                    comparison_text += "- **Performance differences**: Potentially significant\n"
+                else:
+                    comparison_text += "- **Performance differences**: Minimal\n"
+        # Add recommendations
+        recommendations = comparison_result.get('recommendations', [])
+        if recommendations:
+            comparison_text += "\n### 💡 Recommendations:\n"
+            for rec in recommendations:
+                comparison_text += f"- {rec}\n"
+        return comparison_text, comparison_plot
+    except Exception as e:
+        error_msg = f"Error performing comparison: {str(e)}"
         return error_msg, None
 # Initialize data on startup
+print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
+initialization_success = initialize_scientific_data()
+# Create Gradio interface with scientific design
 with gr.Blocks(
+    title=UI_CONFIG["title"],
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
+        max-width: 1600px !important;
         margin: 0 auto;
     }
+    .scientific-header {
         text-align: center;
         margin-bottom: 2rem;
         padding: 2rem;
+        background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
         color: white;
         border-radius: 10px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .track-tab {
+        border-radius: 8px;
+        margin: 0.5rem;
+        padding: 1rem;
+        border: 2px solid transparent;
+    }
+    .track-tab.google-comparable {
+        border-color: #1f77b4;
+        background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
+    }
+    .track-tab.ug40-complete {
+        border-color: #ff7f0e;
+        background: linear-gradient(45deg, #fff7ed, #fed7aa);
+    }
+    .track-tab.language-pair-matrix {
+        border-color: #2ca02c;
+        background: linear-gradient(45deg, #f0fdf4, #dcfce7);
     }
     .metric-box {
+        background: #f8fafc;
         padding: 1rem;
         border-radius: 8px;
         margin: 0.5rem 0;
+        border-left: 4px solid #3b82f6;
     }
+    .scientific-note {
+        background: #fef3c7;
+        border: 1px solid #f59e0b;
         border-radius: 8px;
         padding: 1rem;
+        margin: 1rem 0;
     }
+    .adequacy-excellent { border-left-color: #22c55e; }
+    .adequacy-good { border-left-color: #eab308; }
+    .adequacy-fair { border-left-color: #f97316; }
+    .adequacy-insufficient { border-left-color: #ef4444; }
     """
 ) as demo:
+    # Scientific Header
     gr.HTML(f"""
+    <div class="scientific-header">
+    <h1>🏆 SALT Translation Leaderboard - Scientific Edition</h1>
+    <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
+    <p>Three-tier evaluation tracks • 95% Confidence intervals • Research-grade analysis</p>
     <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
     </div>
     """)
     # Status indicator
     if initialization_success:
+        status_msg = "✅ Scientific system initialized successfully"
+        adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
+        status_msg += f" | Test set adequacy: {adequacy_info.title()}"
     else:
         status_msg = "❌ System initialization failed - some features may not work"
+    gr.Markdown(f"**System Status**: {status_msg}")
+    # Add scientific overview
+    gr.Markdown("""
+    ## 🔬 Scientific Evaluation Framework
+    This leaderboard implements rigorous scientific methodology for translation model evaluation:
+    - **Three Evaluation Tracks**: Fair comparison across different model capabilities
+    - **Statistical Significance**: 95% confidence intervals and effect size analysis
+    - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
+    - **Cross-Track Consistency**: Validate model performance across language sets
+    """)
     with gr.Tabs():
+        # Tab 1: Download Test Set
         with gr.Tab("📥 Download Test Set", id="download"):
             gr.Markdown("""
+            ## 📋 Get the SALT Scientific Test Set
+            Download our scientifically designed test set with stratified sampling and statistical weighting.
             """)
             with gr.Row():
+                download_btn = gr.Button("📥 Download Scientific Test Set", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
                     download_file = gr.File(label="📂 Test Set File", interactive=False)
                 with gr.Column():
                     download_info = gr.Markdown(label="ℹ️ Test Set Information")
+        # Tab 2: Submit Predictions
         with gr.Tab("🚀 Submit Predictions", id="submit"):
             gr.Markdown("""
+            ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
+            Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
             """)
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.Markdown("### 📝 Model Information")
                     model_name_input = gr.Textbox(
                         label="🤖 Model Name",
+                        placeholder="e.g., MyTranslator-v2.0",
                         info="Unique name for your model"
                     )
                     )
                     description_input = gr.Textbox(
+                        label="📄 Model Description",
+                        placeholder="Architecture, training data, special features...",
+                        lines=4,
+                        info="Detailed description helps with proper categorization"
                     )
                     gr.Markdown("### 📤 Upload Predictions")
                     predictions_file = gr.File(
                         label="📂 Predictions File",
                         file_types=[".csv", ".tsv", ".json"]
                     )
                     validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
+                    submit_btn = gr.Button("🚀 Submit for Scientific Evaluation", variant="primary", interactive=False)
                 with gr.Column(scale=1):
                     gr.Markdown("### 📊 Validation Results")
                     validation_output = gr.Markdown()
             # Results section
+            gr.Markdown("### 🏆 Scientific Evaluation Results")
             with gr.Row():
                 evaluation_output = gr.Markdown()
             with gr.Row():
                 with gr.Column():
+                    submission_plot = gr.Plot(label="📈 Submission Analysis")
                 with gr.Column():
+                    cross_track_plot = gr.Plot(label="🔄 Cross-Track Analysis")
             with gr.Row():
+                results_table = gr.Dataframe(label="📊 Updated Leaderboard (Google-Comparable Track)", interactive=False)
+        # Tab 3: Google-Comparable Track
+        with gr.Tab("🤖 Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
+            gr.Markdown(f"""
+            ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
+            **Fair comparison with commercial translation systems**
+            This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
+            enabling direct comparison with commercial baselines.
+            - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
+            - **Purpose**: Commercial system comparison and baseline establishment
+            - **Statistical Power**: High (optimized sample sizes)
+            """)
             with gr.Row():
+                with gr.Column(scale=2):
+                    google_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                 with gr.Column(scale=1):
+                    google_category = gr.Dropdown(
+                        label="🏷️ Category Filter",
+                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                         value="all"
                     )
                 with gr.Column(scale=1):
+                    google_adequacy = gr.Slider(
+                        label="📊 Min Adequacy",
+                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
+                    )
+                with gr.Column(scale=1):
+                    google_refresh = gr.Button("🔄 Refresh", variant="secondary")
+            with gr.Row():
+                google_stats = gr.Markdown()
+            with gr.Row():
+                with gr.Column():
+                    google_ranking_plot = gr.Plot(label="🏆 Google-Comparable Rankings")
+                with gr.Column():
+                    google_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
+            with gr.Row():
+                google_leaderboard = gr.Dataframe(label="📈 Google-Comparable Leaderboard", interactive=False)
+        # Tab 4: UG40-Complete Track
+        with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
+            gr.Markdown(f"""
+            ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
+            **Comprehensive evaluation across all Ugandan languages**
+            This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
+            providing the most comprehensive assessment of Ugandan language translation capabilities.
+            - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
+            - **Purpose**: Comprehensive Ugandan language capability assessment
+            - **Coverage**: Complete linguistic landscape of Uganda
+            """)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    ug40_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
+                with gr.Column(scale=1):
+                    ug40_category = gr.Dropdown(
+                        label="🏷️ Category Filter",
+                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
+                        value="all"
                     )
                 with gr.Column(scale=1):
+                    ug40_adequacy = gr.Slider(
+                        label="📊 Min Adequacy",
+                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
                     )
+                with gr.Column(scale=1):
+                    ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
+                ug40_stats = gr.Markdown()
             with gr.Row():
+                with gr.Column():
+                    ug40_ranking_plot = gr.Plot(label="🏆 UG40-Complete Rankings")
+                with gr.Column():
+                    ug40_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
+            with gr.Row():
+                ug40_leaderboard = gr.Dataframe(label="📈 UG40-Complete Leaderboard", interactive=False)
+        # Tab 5: Language-Pair Matrix
+        with gr.Tab("📊 Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
+            gr.Markdown(f"""
+            ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
+            **Detailed language pair analysis with statistical significance**
+            This view provides granular analysis of model performance across individual language pairs
+            with statistical significance testing and effect size analysis.
+            - **Resolution**: Individual language pair performance
+            - **Purpose**: Detailed linguistic analysis and model diagnostics
+            - **Statistics**: Pairwise significance testing available
+            """)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    matrix_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
+                with gr.Column(scale=1):
+                    matrix_category = gr.Dropdown(
+                        label="🏷️ Category Filter",
+                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
+                        value="all"
+                    )
+                with gr.Column(scale=1):
+                    matrix_adequacy = gr.Slider(
+                        label="📊 Min Adequacy",
+                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
+                    )
+                with gr.Column(scale=1):
+                    matrix_refresh = gr.Button("🔄 Refresh", variant="secondary")
+            with gr.Row():
+                matrix_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
+                    matrix_ranking_plot = gr.Plot(label="🏆 Language-Pair Matrix Rankings")
                 with gr.Column():
+                    matrix_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
+                matrix_leaderboard = gr.Dataframe(label="📈 Language-Pair Matrix Leaderboard", interactive=False)
+        # Tab 6: Model Analysis
+        with gr.Tab("🔍 Scientific Model Analysis", id="analysis"):
+            gr.Markdown("""
+            ## 🔬 Detailed Scientific Model Analysis
+            Comprehensive analysis of individual models with statistical confidence intervals,
+            cross-track performance, and detailed language pair breakdowns.
+            """)
             with gr.Row():
+                with gr.Column(scale=2):
+                    model_select = gr.Dropdown(
+                        label="🤖 Select Model",
+                        choices=[],
+                        value=None,
+                        info="Choose a model for detailed scientific analysis"
+                    )
+                with gr.Column(scale=1):
+                    track_select = gr.Dropdown(
+                        label="🏁 Analysis Track",
+                        choices=list(EVALUATION_TRACKS.keys()),
+                        value="google_comparable",
+                        info="Track for detailed analysis"
+                    )
+                with gr.Column(scale=1):
+                    analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Row():
                 model_details = gr.Markdown()
             with gr.Row():
+                with gr.Column():
+                    model_analysis_plot = gr.Plot(label="📊 Detailed Performance Analysis")
+                with gr.Column():
+                    model_heatmap_plot = gr.Plot(label="🗺️ Language Pair Heatmap")
+        # Tab 7: Model Comparison
+        with gr.Tab("⚖️ Scientific Model Comparison", id="comparison"):
+            gr.Markdown("""
+            ## 🔬 Scientific Model Comparison
+            Compare multiple models with statistical significance testing and fair comparison analysis.
+            Only models evaluated on the same language pairs are compared for scientific validity.
+            """)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    comparison_models = gr.CheckboxGroup(
+                        label="🤖 Select Models to Compare",
+                        choices=[],
+                        value=[],
+                        info="Select 2-6 models for comparison"
+                    )
+                with gr.Column(scale=1):
+                    comparison_track = gr.Dropdown(
+                        label="🏁 Comparison Track",
+                        choices=list(EVALUATION_TRACKS.keys()),
+                        value="google_comparable"
+                    )
+                    comparison_type = gr.Radio(
+                        label="📊 Comparison Type",
+                        choices=["statistical", "category"],
+                        value="statistical"
+                    )
+                    compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
+            with gr.Row():
+                comparison_output = gr.Markdown()
+            with gr.Row():
+                comparison_plot = gr.Plot(label="📊 Model Comparison Analysis")
+        # Tab 8: Documentation
+        with gr.Tab("📚 Scientific Documentation", id="docs"):
             gr.Markdown(f"""
+            # 📖 SALT Translation Leaderboard - Scientific Edition Documentation
             ## 🎯 Overview
+            The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
+            for translation models on Ugandan languages, designed for research publication and scientific analysis.
+            ## 🔬 Scientific Methodology
+            ### Three-Tier Evaluation System
+            **1. 🤖 Google-Comparable Track**
+            - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
+            - **Pairs**: {len(get_google_comparable_pairs())} language pairs
+            - **Purpose**: Fair comparison with commercial translation systems
+            - **Statistical Power**: High (≥200 samples per pair recommended)
+            **2. 🌍 UG40-Complete Track**
+            - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
+            - **Pairs**: {len(get_all_language_pairs())} language pairs
+            - **Purpose**: Comprehensive Ugandan language capability assessment
+            - **Statistical Power**: Moderate (≥100 samples per pair recommended)
+            **3. 📊 Language-Pair Matrix**
+            - **Resolution**: Individual language pair analysis
+            - **Purpose**: Detailed linguistic analysis and model diagnostics
+            - **Statistics**: Pairwise significance testing with multiple comparison correction
+            ### Statistical Rigor
+            - **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
+            - **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
+            - **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
+            - **Statistical Power**: Estimated based on sample sizes and effect sizes
+            ### Model Categories
+            Models are automatically categorized for fair comparison:
+            - **🏢 Commercial**: Production translation systems (Google Translate, Azure, etc.)
+            - **🔬 Research**: Academic and research institution models (NLLB, M2M-100, etc.)
+            - **📊 Baseline**: Simple baseline and reference models
+            - **👥 Community**: User-submitted models and fine-tuned variants
             ## 📊 Evaluation Metrics
             ### Primary Metrics
+            - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
+            - **BLEU**: Bilingual Evaluation Understudy (0-100)
+            - **ChrF**: Character-level F-score (0-1)
             ### Secondary Metrics
+            - **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
+            - **CER/WER**: Character/Word Error Rate (lower is better)
             - **Length Ratio**: Prediction/reference length ratio
+            All metrics include 95% confidence intervals for statistical reliability.
             ## 🔄 Submission Process
+            ### Step 1: Download Scientific Test Set
+            1. Click "Download Scientific Test Set" in the first tab
+            2. Review test set adequacy and track breakdown
+            3. Save the enhanced test set with statistical weights
             ### Step 2: Generate Predictions
+            1. Load the test set in your evaluation pipeline
             2. For each row, translate `source_text` from `source_language` to `target_language`
             3. Save results as CSV with columns: `sample_id`, `prediction`
+            4. Optional: Add `category` column for automatic classification
             ### Step 3: Submit & Evaluate
+            1. Fill in detailed model information (improves categorization)
+            2. Upload your predictions file
+            3. Review validation report with track-specific adequacy assessment
+            4. Submit for scientific evaluation across all tracks
+            ## 📋 Enhanced File Formats
+            ### Scientific Test Set Format
             ```csv
+            sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
+            salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
+            salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
+            salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
             ```
             ### Predictions Format
             ```csv
+            sample_id,prediction,category
+            salt_000001,"Amakuru ensi","community"
+            salt_000002,"Ibino nining?","community"
+            salt_000003,"Ejok nanu","community"
             ```
+            ## 🏆 Scientific Leaderboard Features
+            ### Fair Comparison
+            - Models only compared within the same category and track
+            - Statistical significance testing prevents misleading rankings
+            - Confidence intervals show measurement uncertainty
+            ### Cross-Track Analysis
+            - Consistency analysis across evaluation tracks
+            - Identification of model strengths and weaknesses
+            - Language-specific performance patterns
+            ### Publication Quality
+            - All visualizations include error bars and statistical annotations
+            - Comprehensive methodology documentation
+            - Reproducible evaluation pipeline
+            ## 🔬 Statistical Interpretation Guide
+            ### Confidence Intervals
+            - **Non-overlapping CIs**: Likely significant difference
+            - **Overlapping CIs**: May or may not be significant (requires formal testing)
+            - **Wide CIs**: High uncertainty (need more data)
+            ### Effect Sizes
+            - **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
+            - **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
+            - **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
+            - **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
+            ### Statistical Adequacy
+            - **Excellent**: High statistical power (>0.8) for all comparisons
+            - **Good**: Adequate power for most comparisons
+            - **Fair**: Limited power, interpret with caution
+            - **Insufficient**: Results not reliable for scientific conclusions
+            ## 🤝 Contributing to Science
+            This leaderboard is designed for the research community. When using results:
+            1. **Always report confidence intervals** along with point estimates
+            2. **Acknowledge statistical adequacy** when interpreting results
+            3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
+            4. **Consider effect sizes** not just statistical significance
             ## 📄 Citation
             If you use this leaderboard in your research, please cite:
             ```bibtex
+            @misc{{salt_leaderboard_scientific_2024,
+              title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
               author={{Sunbird AI}},
               year={{2024}},
+              url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
+              note={{Three-tier evaluation system with statistical significance testing}}
             }}
             ```
             ## 🔗 Related Resources
             - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
+            - **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
+            - **Statistical Methodology**: See our technical paper on rigorous MT evaluation
+            - **Open Source Code**: Available on GitHub for reproducibility
+            ---
+            *For questions about scientific methodology or statistical interpretation, contact our research team at research@sunbird.ai*
             """)
+    # Event handlers with enhanced scientific functionality
     predictions_validated = gr.State(value=None)
     validation_info_state = gr.State(value=None)
+    detected_category_state = gr.State(value="community")
     # Download test set
     download_btn.click(
+        fn=download_scientific_test_set,
         outputs=[download_file, download_info]
     )
     # Validate predictions
     def handle_scientific_validation(file, model_name, author, description):
         """Run submission validation and shape the result for the UI.

         Delegates to ``validate_scientific_submission`` and returns, in order:
         the validation report, the predictions object, an info dict holding the
         category and a pass flag, the category on its own, and an update that
         makes the submit button interactive only when validation passed.
         Validation counts as passed when the returned predictions are not ``None``.
         """
         outcome = validate_scientific_submission(file, model_name, author, description)
         report, predictions, category = outcome
         passed = predictions is not None
         info = {"category": category, "validation_passed": passed}
         submit_toggle = gr.update(interactive=passed)
         return report, predictions, info, category, submit_toggle
     validate_btn.click(
         handle_scientific_validation,
         inputs=[predictions_file, model_name_input, author_input, description_input],
         outputs=[
             validation_output,
             predictions_validated,
             validation_info_state,
             detected_category_state,
             submit_btn,
         ],
     )
     # Submit for evaluation
     def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
         """Evaluate a previously validated submission and return values for the UI.

         Returns exactly four values, one per wired output component: the
         evaluation report, the results table, the submission plot and the
         cross-track plot. When ``predictions`` is ``None`` (nothing validated
         yet) an error message is returned and the other three outputs are cleared.
         """
         if predictions is None:
             return "❌ Please validate your submission first", None, None, None
         results = evaluate_scientific_submission(
             predictions, model_name, author, description, category, validation_info
         )
         # Only the first four values map to UI components. Any trailing value is dropped
         # here because there is no component to receive it.
         # NOTE(review): presumably the trailing value is the updated leaderboard
         # DataFrame — confirm evaluate_scientific_submission refreshes the
         # module-level cache itself.
         return tuple(results)[:4]
     submit_btn.click(
         fn=handle_scientific_submission,
         inputs=[
             predictions_validated,
             model_name_input,
             author_input,
             description_input,
             detected_category_state,
             validation_info_state,
         ],
         # `current_leaderboard` is the module-level cache variable, not a Gradio
         # component, so it must not appear in `outputs`.
         outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
     )
+    # Track leaderboard refresh functions
+    def refresh_google_track(*args):
+        return refresh_track_leaderboard("google_comparable", *args)
+    def refresh_ug40_track(*args):
+        return refresh_track_leaderboard("ug40_complete", *args)
+    def refresh_matrix_track(*args):
+        return refresh_track_leaderboard("language_pair_matrix", *args)
+    # Google-Comparable Track
+    google_refresh.click(
+        fn=refresh_google_track,
+        inputs=[google_search, google_category, google_adequacy],
+        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
     )
+    # UG40-Complete Track
+    ug40_refresh.click(
+        fn=refresh_ug40_track,
+        inputs=[ug40_search, ug40_category, ug40_adequacy],
+        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
+    )
+    # Language-Pair Matrix Track
+    matrix_refresh.click(
+        fn=refresh_matrix_track,
+        inputs=[matrix_search, matrix_category, matrix_adequacy],
+        outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
+    )
     # Model analysis: selected model + track in, details text and two plots out
     analyze_btn.click(
         get_scientific_model_details,
         inputs=[model_select, track_select],
         outputs=[model_details, model_analysis_plot, model_heatmap_plot],
     )
     # Model comparison: selected models, track and comparison type in, report and plot out
     compare_btn.click(
         perform_model_comparison,
         inputs=[comparison_models, comparison_track, comparison_type],
         outputs=[comparison_output, comparison_plot],
     )
+    # Update dropdown choices when leaderboard changes
+    def update_dropdown_choices():
+        if current_leaderboard is not None and not current_leaderboard.empty:
+            model_choices = current_leaderboard['model_name'].tolist()
+        else:
+            model_choices = []
+        return (
+            gr.Dropdown(choices=model_choices),
+            gr.CheckboxGroup(choices=model_choices)
+        )
+    # Load initial data and update dropdowns
     demo.load(
+        fn=lambda: (
+            refresh_google_track("", "all", 0.0),
+            update_dropdown_choices()
+        ),
+        outputs=[
+            [google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats],
+            [model_select, comparison_models]
+        ]
     )
+# Launch the scientific application
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",