import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd

from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    get_eval_metadata,
    build_leaderboard_table,
    clear_cache,
    search_model_across_leaderboards,
    get_model_suggestions_fast,
    DATA_DIR,
)
from ui_components import (
    get_theme,
    get_custom_css,
    format_leaderboard_header,
    format_metric_details,
    format_model_card,
    format_model_comparison,
    create_radar_plot,
)

PAGE_SIZE = 50


def get_leaderboard_data(selected_leaderboard, progress=gr.Progress()):
    if not selected_leaderboard:
        return pd.DataFrame(), {}
    metadata = get_eval_metadata(selected_leaderboard)

    def progress_callback(value, desc):
        progress(value, desc=desc)

    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
    return df, metadata


def filter_and_paginate(df, search_query, sort_column, selected_columns, current_page):
    if df.empty:
        return df.copy(), 1, 1
    df = df.copy()
    all_columns = list(df.columns)
    if selected_columns:
        cols = ["Model"] + [c for c in all_columns if c in selected_columns and c != "Model"]
        df = df[cols]
    if search_query:
        mask = df.astype(str).apply(
            lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1
        )
        df = df[mask]
    if sort_column and sort_column in df.columns:
        df = df.sort_values(by=sort_column, ascending=False, na_position="last")
    total_rows = len(df)
    total_pages = max(1, (total_rows + PAGE_SIZE - 1) // PAGE_SIZE)  # ceiling division
    current_page = max(1, min(current_page, total_pages))
    start = (current_page - 1) * PAGE_SIZE
    end = start + PAGE_SIZE
    return df.iloc[start:end], current_page, total_pages


def search_model(model_query):
    if not model_query or len(model_query) < 2:
        return """
        <h3>Search for a model</h3>
        <p>Enter a model name to see its benchmarks across all leaderboards</p>
        """
    results, _ = search_model_across_leaderboards(model_query)
    if not results:
        return f"""
        <h3>No results for "{model_query}"</h3>
        <p>Try a different model name or check the spelling</p>
        """
    model_name = list(results.keys())[0]
    model_data = results[model_name]
    return format_model_card(model_name, model_data)


def compare_models(selected_models):
    if not selected_models:
        return """
        <h3>Select models to compare</h3>
        <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        """, None
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            matched_model = list(results.keys())[0]
            all_results[matched_model] = results[matched_model]
    plot = create_radar_plot(list(all_results.keys()), all_results)
    if len(all_results) == 1:
        model_name = list(all_results.keys())[0]
        return format_model_card(model_name, all_results[model_name]), plot
    elif len(all_results) > 1:
        return format_model_comparison(list(all_results.keys()), all_results), plot
    else:
        return """
        <h3>No results found</h3>
        <p>Try selecting different models</p>
        """, None


def get_model_suggestions(value):
    query = value or ""
    if not query or len(query) < 2:
        return gr.update(choices=[], value=[])
    matches = get_model_suggestions_fast(query, limit=10)
    return gr.update(choices=matches, value=[])


def export_leaderboard_to_csv(full_df, selected_leaderboard, search_query, selected_columns):
    """Export the current leaderboard view (columns and filter applied) to CSV."""
    if full_df.empty:
        return None
    df = full_df.copy()
    # Apply column selection
    if selected_columns:
        cols = ["Model"] + [c for c in df.columns if c in selected_columns and c != "Model"]
        df = df[cols]
    # Apply search filter
    if search_query:
        mask = df.astype(str).apply(
            lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1
        )
        df = df[mask]
    # Save to CSV under the system temp directory so the path is absolute
    temp_dir = Path(tempfile.gettempdir())
    filename = temp_dir / f"{selected_leaderboard.replace(' ', '_')}_leaderboard.csv"
    df.to_csv(filename, index=False)
    return str(filename)


def export_comparison_to_csv(selected_models):
    """Export the model comparison to CSV."""
    if not selected_models:
        return None
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            matched_model = list(results.keys())[0]
            all_results[matched_model] = results[matched_model]
    if not all_results:
        return None
    # Build the comparison table: one row per (model, leaderboard) pair
    rows = []
    for model_name, model_data in all_results.items():
        for leaderboard_name, data in model_data.items():
            results = data.get("results", {})
            row = {
                "Model": model_name,
                "Leaderboard": leaderboard_name,
                "Developer": data.get("developer"),
                "Params (B)": data.get("params"),
                "Architecture": data.get("architecture"),
                "Precision": data.get("precision"),
            }
            row.update(results)
            rows.append(row)
    df = pd.DataFrame(rows)
    temp_dir = Path(tempfile.gettempdir())
    filename = temp_dir / "model_comparison.csv"
    df.to_csv(filename, index=False)
    return str(filename)


load_hf_dataset_on_startup()
initial_leaderboards = get_available_leaderboards()
initial_leaderboard = initial_leaderboards[0] if initial_leaderboards else None

if initial_leaderboard:
    _init_df, _init_metadata = get_leaderboard_data(initial_leaderboard)
    _init_columns = [c for c in _init_df.columns if c != "Model"] if not _init_df.empty else []
    _init_df_display, _, _init_total_pages = filter_and_paginate(_init_df, "", "Average", None, 1)
else:
    _init_df = pd.DataFrame()
    _init_metadata = {}
    _init_columns = []
    _init_df_display = pd.DataFrame()
    _init_total_pages = 1

with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
    full_df_state = gr.State(value=_init_df)
    metadata_state = gr.State(value=_init_metadata)
    current_page_state = gr.State(value=1)

    gr.HTML("""
    <div>
        <span>E³</span>
        <h1>Every Eval Ever</h1>
        <p>Browse and compare model benchmarks</p>
        <span>beta</span>
    </div>
    """)
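
    # Two tabs: "Leaderboards" renders a single leaderboard as a paged,
    # filterable table; "Model Lookup" searches across every leaderboard and
    # compares the selected models (model cards plus a radar plot).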

    with gr.Tabs():
        with gr.TabItem("Leaderboards"):
            with gr.Column(elem_classes="controls-bar"):
                with gr.Row():
                    with gr.Column(scale=4, min_width=260):
                        leaderboard_selector = gr.Dropdown(
                            choices=initial_leaderboards,
                            value=initial_leaderboard,
                            label="Leaderboard",
                            interactive=True,
                        )
                    with gr.Column(scale=1, min_width=120):
                        refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
                    with gr.Column(scale=1, min_width=120):
                        export_btn = gr.DownloadButton("šŸ“„ Export CSV", variant="secondary", size="sm")
                search_box = gr.Textbox(
                    label="Filter",
                    placeholder="Filter models...",
                    show_label=True,
                )

            header_view = gr.HTML(value=format_leaderboard_header(initial_leaderboard, _init_metadata))

            with gr.Row(elem_classes="column-selector-bar"):
                with gr.Column(scale=5, min_width=320):
                    column_selector = gr.Dropdown(
                        choices=_init_columns,
                        value=_init_columns,
                        label="Columns to Display",
                        multiselect=True,
                        interactive=True,
                        elem_classes="column-selector-dropdown",
                    )

            leaderboard_table = gr.Dataframe(
                value=_init_df_display,
                label=None,
                interactive=False,
                wrap=False,
                elem_classes="dataframe",
            )

            with gr.Row(elem_classes="pagination-bar"):
                prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
                page_info = gr.Markdown(value=f"1 / {_init_total_pages}", elem_classes="page-info")
                next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)

            metrics_view = gr.HTML(value=format_metric_details(initial_leaderboard, _init_metadata))

        with gr.TabItem("šŸ” Model Lookup"):
            gr.Markdown("### Find and compare models across all leaderboards")
            selected_models_state = gr.State(value=[])
            default_compare_html = """
            <h3>Search for models to compare</h3>
            <p>Type a model name to search, then select a match to add it</p>
            """

            with gr.Row(elem_classes="controls-bar"):
                with gr.Column(scale=3):
                    model_search_box = gr.Textbox(
                        label="Search models",
                        placeholder="Type model name...",
                        interactive=True,
                    )
                with gr.Column(scale=1, min_width=120):
                    search_button = gr.Button("Search", variant="secondary")
                with gr.Column(scale=3):
                    search_results = gr.CheckboxGroup(
                        choices=[],
                        label="Top matches (select to add)",
                        interactive=True,
                        value=[],
                        elem_classes=["match-pills"],
                    )
                with gr.Column(scale=1, min_width=80):
                    clear_models_btn = gr.Button("Clear", variant="secondary", size="sm")

            selected_models_group = gr.CheckboxGroup(
                choices=[],
                value=[],
                label="Selected Models (click to remove)",
                interactive=True,
                elem_classes="selected-models-group",
            )

            with gr.Row():
                with gr.Column(scale=4):
                    pass
                with gr.Column(scale=1, min_width=120):
                    export_comparison_btn = gr.DownloadButton("šŸ“„ Export CSV", variant="secondary", size="sm")

            radar_view = gr.Plot(label="Radar Comparison")
            model_card_view = gr.HTML(value=default_compare_html)

            with gr.Accordion("šŸ“¤ How to Submit Data", open=False):
                gr.Markdown("""
                Submit via GitHub Pull Request:

                1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
                2. Add JSON files to `data////`
                3. Open a PR - automated validation runs on submission
                4. After merge, data syncs to HuggingFace automatically

                [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) - [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
                """)

    def load_leaderboard(leaderboard_name):
        df, metadata = get_leaderboard_data(leaderboard_name)
        columns = [c for c in df.columns if c != "Model"] if not df.empty else []
        df_display, page, total_pages = filter_and_paginate(df, "", "Average", None, 1)
        return (
            df,  # full_df_state
            metadata,  # metadata_state
            1,  # current_page_state
            df_display,  # leaderboard_table
            format_leaderboard_header(leaderboard_name, metadata),  # header_view
            format_metric_details(leaderboard_name, metadata),  # metrics_view
            gr.update(choices=columns, value=columns),  # column_selector
            f"1 / {total_pages}",  # page_info
        )

    def update_table(full_df, search_query, selected_columns, current_page):
        df_display, page, total_pages = filter_and_paginate(
            full_df, search_query, "Average", selected_columns, current_page
        )
        return df_display, f"{page} / {total_pages}", page

    def go_page(full_df, search_query, selected_columns, current_page, delta):
        new_page = max(1, current_page + delta)
        df_display, page, total_pages = filter_and_paginate(
            full_df, search_query, "Average", selected_columns, new_page
        )
        return df_display, f"{page} / {total_pages}", page

    leaderboard_selector.change(
        fn=load_leaderboard,
        inputs=[leaderboard_selector],
        outputs=[full_df_state, metadata_state, current_page_state, leaderboard_table,
                 header_view, metrics_view, column_selector, page_info],
    )
    search_box.input(
        fn=lambda df, q, cols: update_table(df, q, cols, 1),
        inputs=[full_df_state, search_box, column_selector],
        outputs=[leaderboard_table, page_info, current_page_state],
    )

    def on_column_change(df, q, cols):
        if not cols:
            cols = [c for c in df.columns if c != "Model"]
        return update_table(df, q, cols, 1)

    column_selector.change(
        fn=on_column_change,
        inputs=[full_df_state, search_box, column_selector],
        outputs=[leaderboard_table, page_info, current_page_state],
    )
    prev_btn.click(
        fn=lambda df, q, cols, p: go_page(df, q, cols, p, -1),
        inputs=[full_df_state, search_box, column_selector, current_page_state],
        outputs=[leaderboard_table, page_info, current_page_state],
    )
    next_btn.click(
        fn=lambda df, q, cols, p: go_page(df, q, cols, p, 1),
        inputs=[full_df_state, search_box, column_selector, current_page_state],
        outputs=[leaderboard_table, page_info, current_page_state],
    )
    refresh_btn.click(
        fn=lambda: (clear_cache(), gr.update(choices=get_available_leaderboards()))[1],
        outputs=[leaderboard_selector],
    )
    export_btn.click(
        fn=export_leaderboard_to_csv,
        inputs=[full_df_state, leaderboard_selector, search_box, column_selector],
        outputs=[export_btn],
    )

    def add_models_from_search(selected_from_results, current_selected):
        selected_from_results = selected_from_results or []
        current_selected = current_selected or []
        # Merge while preserving order and dropping duplicates
        merged = list(dict.fromkeys(current_selected + selected_from_results))
        comparison_html, plot = compare_models(merged) if merged else (default_compare_html, None)
        return (
            merged,
            gr.update(choices=[], value=[]),
            gr.update(choices=merged, value=merged),
            comparison_html,
            plot,
        )

    def update_selection(selected_list):
        comparison_html, plot = compare_models(selected_list) if selected_list else (default_compare_html, None)
        return selected_list, gr.update(choices=selected_list, value=selected_list), comparison_html, plot

    def clear_all_models():
        return (
            [],
            gr.update(value=""),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            default_compare_html,
            None,
        )

    search_button.click(
        fn=get_model_suggestions,
        inputs=[model_search_box],
        outputs=[search_results],
        queue=False,
    )
    model_search_box.submit(
        fn=get_model_suggestions,
        inputs=[model_search_box],
        outputs=[search_results],
        queue=False,
    )
    search_results.change(
        fn=add_models_from_search,
        inputs=[search_results, selected_models_state],
        outputs=[selected_models_state, search_results, selected_models_group,
                 model_card_view, radar_view],
    )
    selected_models_group.change(
        fn=update_selection,
        inputs=[selected_models_group],
        outputs=[selected_models_state, selected_models_group, model_card_view, radar_view],
    )
    clear_models_btn.click(
        fn=clear_all_models,
        outputs=[selected_models_state, model_search_box, search_results,
                 selected_models_group, model_card_view, radar_view],
    )
    export_comparison_btn.click(
        fn=export_comparison_to_csv,
        inputs=[selected_models_state],
        outputs=[export_comparison_btn],
    )

DATA_DIR.mkdir(exist_ok=True)

if __name__ == "__main__":
    demo.launch()
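
# launch() accepts the usual Gradio options when the defaults don't fit,
# e.g. demo.launch(server_name="0.0.0.0") for LAN access or
# demo.launch(share=True) for a temporary public link.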