"""Evaluation Leaderboard - Gradio Interface

Displays model evaluation results from HuggingFace datasets.
"""

import gradio as gr
import pandas as pd
from pathlib import Path  # NOTE(review): appears unused here — kept in case of external reliance

from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    get_eval_metadata,
    build_leaderboard_table,
    clear_cache,
    search_model_across_leaderboards,
    get_all_model_names,
    DATA_DIR,
)
from ui_components import (
    get_theme,
    get_custom_css,
    format_leaderboard_header,
    format_metric_details,
    format_model_card,
    format_model_comparison,
)

# Number of rows shown per leaderboard page.
PAGE_SIZE = 50


def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1,
                             sort_column=None, selected_columns=None,
                             progress=gr.Progress()):
    """Load, filter, sort, and paginate data for the selected leaderboard.

    Args:
        selected_leaderboard: Name of the leaderboard to display (falsy -> empty UI).
        search_query: Case-insensitive substring filter applied across all cells.
        current_page: 1-based page index; clamped into the valid range.
        sort_column: Column to sort by descending (NaNs last); None keeps source order.
        selected_columns: Columns to display ("Model" is always included);
            None/empty means show all columns.
        progress: Gradio progress tracker (gr.Progress default is the Gradio idiom,
            not a mutable-default bug).

    Returns:
        A 9-tuple matching the event outputs: (paginated DataFrame, header HTML,
        metric-details HTML, page-dropdown update, prev-button update,
        next-button update, sort-dropdown update, "page / total" string,
        column-selector update).
    """
    if not selected_leaderboard:
        # Nothing selected: blank table and disabled pagination controls.
        return (
            pd.DataFrame(),
            format_leaderboard_header(None, {}),
            format_metric_details(None, {}),
            gr.update(choices=[], value=None),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(choices=[], value=None),
            "0 / 0",
            gr.update(choices=[], value=[]),
        )

    metadata = get_eval_metadata(selected_leaderboard)

    def progress_callback(value, desc):
        progress(value, desc=desc)

    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)

    # Capture the full column list BEFORE filtering so the column selector
    # always offers every available column.
    all_available_columns = list(df.columns) if not df.empty else []

    # Restrict to the selected columns (None/empty -> show everything).
    if selected_columns:
        # "Model" is always shown regardless of the user's selection.
        base_cols = ["Model"]
        available_cols = list(df.columns)
        cols_to_show = [col for col in base_cols if col in available_cols]
        cols_to_show.extend(
            col for col in selected_columns
            if col in available_cols and col not in cols_to_show
        )
        if cols_to_show:
            df = df[cols_to_show]

    if search_query and not df.empty:
        # Row matches when ANY cell contains the query (case-insensitive).
        mask = df.astype(str).apply(
            lambda row: row.str.contains(search_query, case=False, na=False).any(),
            axis=1,
        )
        df = df[mask]

    filtered_count = len(df)

    if sort_column and sort_column in df.columns and not df.empty:
        df = df.sort_values(by=sort_column, ascending=False, na_position='last')

    # Ceiling division for page count; at least one page even when empty.
    total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE) if filtered_count > 0 else 1
    current_page = max(1, min(current_page, total_pages))
    start_idx = (current_page - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df

    page_choices = [str(i) for i in range(1, total_pages + 1)]
    page_dropdown = gr.update(choices=page_choices, value=str(current_page))
    prev_btn = gr.update(interactive=(current_page > 1))
    next_btn = gr.update(interactive=(current_page < total_pages))
    page_info = f"{current_page} / {total_pages}"

    # Sort choices come from the (possibly column-filtered) frame; default to
    # the current sort, else "Average", else the first column.
    sort_choices = list(df.columns) if not df.empty else []
    default_sort = (
        sort_column if sort_column and sort_column in sort_choices
        else ("Average" if "Average" in sort_choices
              else (sort_choices[0] if sort_choices else None))
    )
    sort_column_update = gr.update(choices=sort_choices, value=default_sort)

    # Column selector uses the unfiltered list; "Model" is excluded because it
    # is always displayed.
    column_choices = [col for col in all_available_columns if col != "Model"]
    if not selected_columns:
        column_value = column_choices
    else:
        # Preserve the user's selection, dropping anything no longer valid.
        column_value = [col for col in selected_columns if col in column_choices]
    column_selector_update = gr.update(choices=column_choices, value=column_value)

    return (
        df_paginated,
        format_leaderboard_header(selected_leaderboard, metadata),
        format_metric_details(selected_leaderboard, metadata),
        page_dropdown,
        prev_btn,
        next_btn,
        sort_column_update,
        page_info,
        column_selector_update,
    )


def search_model(model_query):
    """Search for a model and return a formatted HTML card for the first match."""
    if not model_query or len(model_query) < 2:
        return """

Search for a model

Enter a model name to see its benchmarks across all leaderboards

"""
    results, _ = search_model_across_leaderboards(model_query)
    if not results:
        return f"""

No results for "{model_query}"

Try a different model name or check the spelling

"""
    # Use the first matching model
    model_name = list(results.keys())[0]
    model_data = results[model_name]
    return format_model_card(model_name, model_data)


def compare_models(selected_models):
    """Compare multiple selected models.

    Returns a single-model card for one match, a side-by-side comparison for
    several, and placeholder HTML when nothing is selected or found.
    """
    if not selected_models:
        return """

Select models to compare

Choose multiple models from the dropdown to see a side-by-side comparison

"""
    # Gather data for every selected model.
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            # Use the first matching model (exact match preferred)
            matched_model = list(results.keys())[0]
            all_results[matched_model] = results[matched_model]

    if len(all_results) == 1:
        # Single model - show card view
        model_name = list(all_results.keys())[0]
        return format_model_card(model_name, all_results[model_name])
    elif len(all_results) > 1:
        # Multiple models - show comparison
        return format_model_comparison(list(all_results.keys()), all_results)
    else:
        return """

No results found

Try selecting different models

"""


def get_model_suggestions(query):
    """Get model name suggestions for autocomplete.

    NOTE(review): defined but not wired to any event below — kept for
    potential external use; confirm before removing.
    """
    if not query or len(query) < 2:
        return gr.update(choices=[])
    _, matches = search_model_across_leaderboards(query)
    return gr.update(choices=matches[:15])


# Load data at startup
load_hf_dataset_on_startup()

# Build interface
with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
    # Header
    gr.HTML("""
Eยณ

Every Eval Ever

Browse and compare model benchmarks
beta
""")

    with gr.Tabs():
        # === TAB 1: Leaderboard View ===
        with gr.TabItem("๐Ÿ“Š Leaderboards"):
            with gr.Row(elem_classes="controls-bar"):
                initial_choices = get_available_leaderboards()
                initial_value = initial_choices[0] if initial_choices else None
                with gr.Column(scale=2, min_width=200):
                    leaderboard_selector = gr.Dropdown(
                        choices=initial_choices,
                        value=initial_value,
                        label="Leaderboard",
                        interactive=True,
                    )
                with gr.Column(scale=3, min_width=250):
                    search_box = gr.Textbox(
                        label="Filter",
                        placeholder="Filter models...",
                        show_label=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    refresh_btn = gr.Button("โ†ป Refresh", variant="secondary", size="sm")

            # Compute the initial view once so all components render populated.
            (init_df, init_header, init_metrics, init_page_dropdown, init_prev,
             init_next, init_sort_cols, init_page_info,
             init_column_selector) = update_leaderboard_table(initial_value, "", 1, "Average", None)

            header_view = gr.HTML(value=init_header)

            # Hidden sort state (default to Average). gr.update() returns a
            # dict here, hence the hasattr/.get extraction.
            sort_column_dropdown = gr.Dropdown(
                choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, 'get') else [],
                value=init_sort_cols.get("value") if hasattr(init_sort_cols, 'get') else None,
                visible=False,
            )

            # Column selector
            with gr.Row(elem_classes="controls-bar"):
                column_selector = gr.CheckboxGroup(
                    choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
                    value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
                    label="Columns to Display",
                    interactive=True,
                    show_label=True,
                )

            leaderboard_table = gr.Dataframe(
                value=init_df,
                label=None,
                interactive=False,
                wrap=False,
                elem_classes="dataframe",
            )

            # Pagination below table - centered
            with gr.Row(elem_classes="pagination-bar"):
                prev_btn = gr.Button("โ†", variant="secondary", size="sm", min_width=60)
                page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
                next_btn = gr.Button("โ†’", variant="secondary", size="sm", min_width=60)

            # Extract choices and value from gr.update() dict, ensuring value
            # is in choices
            if isinstance(init_page_dropdown, dict):
                page_choices = init_page_dropdown.get("choices", ["1"])
                page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
                # Ensure value exists in choices
                if page_value not in page_choices:
                    page_value = page_choices[0] if page_choices else "1"
                if not page_choices:
                    page_choices = ["1"]
            else:
                page_choices = ["1"]
                page_value = "1"

            page_dropdown = gr.Dropdown(
                choices=page_choices,
                value=page_value,
                visible=False,
            )

            metrics_view = gr.HTML(value=init_metrics)

        # === TAB 2: Model View ===
        with gr.TabItem("๐Ÿ” Model Lookup"):
            gr.Markdown("### Find and compare models across all leaderboards")

            selected_models_state = gr.State(value=[])

            default_compare_html = """

Search for models to compare

Type in the dropdown above, then click a model to add it

"""

            with gr.Row(elem_classes="controls-bar"):
                with gr.Column(scale=4):
                    all_models = get_all_model_names()
                    model_dropdown = gr.Dropdown(
                        choices=all_models,
                        label="Search models to add",
                        interactive=True,
                        allow_custom_value=False,
                        filterable=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")

            selected_models_group = gr.CheckboxGroup(
                choices=[],
                value=[],
                label="Selected Models (click to remove)",
                interactive=True,
                elem_classes="selected-models-group",
            )

            model_card_view = gr.HTML(value=default_compare_html)

    # Submission guide
    with gr.Accordion("๐Ÿ“ค How to Submit Data", open=False):
        gr.Markdown(""" **Submit via GitHub Pull Request:** 1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever) 2. Add JSON files to `data////` 3. Open a PR โ€” automated validation runs on submission 4. After merge, data syncs to HuggingFace automatically [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) ยท [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) """)

    # === State ===
    current_page_state = gr.State(value=1)
    sort_column_state = gr.State(value="Average")

    def go_prev(current):
        """Step back one page, never below page 1."""
        return max(1, current - 1)

    def go_next(current):
        """Step forward one page (clamped later by update_leaderboard_table)."""
        return current + 1

    def reset_page():
        """Return to the first page."""
        return 1

    def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
        """Update table without modifying column selector (for column changes)."""
        result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
        # Return all outputs except the last one (column_selector)
        return result[:-1]

    # === Leaderboard Events ===
    leaderboard_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    search_box.input(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    sort_column_dropdown.change(
        fn=lambda col: col, inputs=[sort_column_dropdown], outputs=[sort_column_state]
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    column_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    page_dropdown.change(
        fn=lambda p: int(p) if p else 1, inputs=[page_dropdown], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    prev_btn.click(
        fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    next_btn.click(
        fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    # FIX: clear the cache BEFORE repopulating the leaderboard dropdown.
    # Previously the dropdown choices were computed from the stale cache and
    # only then was the cache cleared, so a refresh could show outdated
    # leaderboard names until the next refresh.
    refresh_btn.click(
        fn=lambda: clear_cache()
    ).then(
        fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
        outputs=[leaderboard_selector],
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown,
                 prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    # === Model Search Events ===
    def add_model_and_compare(selected_model, current_selected):
        """Add a model and auto-compare."""
        if not selected_model:
            comparison_html = compare_models(current_selected) if current_selected else default_compare_html
            return (
                current_selected,
                gr.update(value=None),
                gr.update(choices=current_selected, value=current_selected),
                comparison_html,
            )
        if current_selected is None:
            current_selected = []
        if selected_model not in current_selected:
            current_selected = current_selected + [selected_model]
        comparison_html = compare_models(current_selected)
        return (
            current_selected,
            gr.update(value=None),
            gr.update(choices=current_selected, value=current_selected),
            comparison_html,
        )

    def update_selection(selected_list):
        """Update selection from checkbox changes."""
        selected_list = selected_list or []
        comparison_html = compare_models(selected_list) if selected_list else default_compare_html
        return selected_list, comparison_html

    def clear_all_models():
        """Clear all selected models."""
        return (
            [],
            gr.update(value=None),
            gr.update(choices=[], value=[]),
            default_compare_html,
        )

    # Select from dropdown adds model and auto-compares
    model_dropdown.select(
        fn=add_model_and_compare,
        inputs=[model_dropdown, selected_models_state],
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view],
    )

    selected_models_group.change(
        fn=update_selection,
        inputs=[selected_models_group],
        outputs=[selected_models_state, model_card_view],
    )

    clear_models_btn.click(
        fn=clear_all_models,
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view],
    )

# Ensure the data directory exists before serving.
# NOTE(review): mkdir(exist_ok=True) still fails if DATA_DIR's parent is
# missing — confirm whether parents=True is needed.
DATA_DIR.mkdir(exist_ok=True)

if __name__ == "__main__":
    demo.launch()