import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd

from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    get_eval_metadata,
    build_leaderboard_table,
    clear_cache,
    search_model_across_leaderboards,
    get_all_model_names,
    DATA_DIR,
)
from ui_components import (
    get_theme,
    get_custom_css,
    format_leaderboard_header,
    format_metric_details,
    format_model_card,
    format_model_comparison,
    create_radar_plot,
)

# Number of rows shown per page of the leaderboard table.
PAGE_SIZE = 50
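
# --- Data helpers: table building, filtering, and model search ---
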
def get_leaderboard_data(selected_leaderboard, progress=gr.Progress()):
    """Load the full table and metadata for a leaderboard, reporting progress."""
    if not selected_leaderboard:
        return pd.DataFrame(), {}
    metadata = get_eval_metadata(selected_leaderboard)

    def progress_callback(value, desc):
        progress(value, desc=desc)

    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
    return df, metadata

def filter_and_paginate(df, search_query, sort_column, selected_columns, current_page):
    """Filter rows by a search string, keep selected columns, sort, and paginate."""
    if df.empty:
        return df.copy(), 1, 1
    df = df.copy()
    all_columns = list(df.columns)
    if selected_columns:
        cols = ["Model"] + [c for c in all_columns if c in selected_columns and c != "Model"]
        df = df[cols]
    if search_query:
        # Plain substring match; regex=False so queries like "c++" don't raise.
        mask = df.astype(str).apply(
            lambda row: row.str.contains(search_query, case=False, na=False, regex=False).any(),
            axis=1,
        )
        df = df[mask]
    if sort_column and sort_column in df.columns:
        df = df.sort_values(by=sort_column, ascending=False, na_position="last")
    total_rows = len(df)
    total_pages = max(1, (total_rows + PAGE_SIZE - 1) // PAGE_SIZE)
    current_page = max(1, min(current_page, total_pages))
    start = (current_page - 1) * PAGE_SIZE
    end = start + PAGE_SIZE
    return df.iloc[start:end], current_page, total_pages

def search_model(model_query):
    """Render a model card for the best match across all leaderboards."""
    if not model_query or len(model_query) < 2:
        return """
        <div class="no-results">
            <h3>Search for a model</h3>
            <p>Enter a model name to see its benchmarks across all leaderboards</p>
        </div>
        """
    results, _ = search_model_across_leaderboards(model_query)
    if not results:
        return f"""
        <div class="no-results">
            <h3>No results for "{model_query}"</h3>
            <p>Try a different model name or check the spelling</p>
        </div>
        """
    model_name = next(iter(results))
    model_data = results[model_name]
    return format_model_card(model_name, model_data)

def compare_models(selected_models):
    """Build the comparison card and radar plot for the selected models."""
    if not selected_models:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """, None
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            matched_model = next(iter(results))
            all_results[matched_model] = results[matched_model]
    if not all_results:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try selecting different models</p>
        </div>
        """, None
    # Only build the radar plot once we know at least one model matched.
    plot = create_radar_plot(list(all_results.keys()), all_results)
    if len(all_results) == 1:
        model_name = next(iter(all_results))
        return format_model_card(model_name, all_results[model_name]), plot
    return format_model_comparison(list(all_results.keys()), all_results), plot

def get_model_suggestions(query):
    """Return up to 15 matching model names for the suggestions dropdown."""
    if not query or len(query) < 2:
        return gr.update(choices=[])
    _, matches = search_model_across_leaderboards(query)
    return gr.update(choices=matches[:15])
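
# --- CSV export helpers (files are written to the system temp directory) ---
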
def export_leaderboard_to_csv(full_df, selected_leaderboard, search_query, selected_columns):
    """Export the current leaderboard view (column selection and filter applied) to CSV."""
    if full_df.empty:
        return None
    df = full_df.copy()
    # Apply column selection
    if selected_columns:
        cols = ["Model"] + [c for c in df.columns if c in selected_columns and c != "Model"]
        df = df[cols]
    # Apply search filter
    if search_query:
        mask = df.astype(str).apply(
            lambda row: row.str.contains(search_query, case=False, na=False, regex=False).any(),
            axis=1,
        )
        df = df[mask]
    # Save to CSV with an absolute path in the system temp directory
    temp_dir = Path(tempfile.gettempdir())
    filename = temp_dir / f"{selected_leaderboard.replace(' ', '_')}_leaderboard.csv"
    df.to_csv(filename, index=False)
    return str(filename)

def export_comparison_to_csv(selected_models):
    """Export the model comparison as a long-format CSV (one row per model/leaderboard)."""
    if not selected_models:
        return None
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            matched_model = next(iter(results))
            all_results[matched_model] = results[matched_model]
    if not all_results:
        return None
    # Build comparison table
    rows = []
    for model_name, model_data in all_results.items():
        for leaderboard_name, data in model_data.items():
            results = data.get("results", {})
            row = {
                "Model": model_name,
                "Leaderboard": leaderboard_name,
                "Developer": data.get("developer"),
                "Params (B)": data.get("params"),
                "Architecture": data.get("architecture"),
                "Precision": data.get("precision"),
            }
            row.update(results)
            rows.append(row)
    df = pd.DataFrame(rows)
    temp_dir = Path(tempfile.gettempdir())
    filename = temp_dir / "model_comparison.csv"
    df.to_csv(filename, index=False)
    return str(filename)
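
# --- Startup: load the dataset and precompute the initial leaderboard view ---
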
load_hf_dataset_on_startup()
initial_leaderboards = get_available_leaderboards()
initial_leaderboard = initial_leaderboards[0] if initial_leaderboards else None

if initial_leaderboard:
    _init_df, _init_metadata = get_leaderboard_data(initial_leaderboard)
    _init_columns = [c for c in _init_df.columns if c != "Model"] if not _init_df.empty else []
    _init_df_display, _, _init_total_pages = filter_and_paginate(_init_df, "", "Average", None, 1)
else:
    _init_df = pd.DataFrame()
    _init_metadata = {}
    _init_columns = []
    _init_df_display = pd.DataFrame()
    _init_total_pages = 1
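
# --- UI layout ---
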
with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
    full_df_state = gr.State(value=_init_df)
    metadata_state = gr.State(value=_init_metadata)
    current_page_state = gr.State(value=1)

    gr.HTML("""
    <div class="app-header">
        <div class="logo-mark">E³</div>
        <div class="brand">
            <h1>Every Eval Ever</h1>
            <span class="tagline">Browse and compare model benchmarks</span>
        </div>
        <div class="header-right">
            <span class="version-badge">beta</span>
        </div>
    </div>
    """)
    with gr.Tabs():
        with gr.TabItem("Leaderboards"):
            with gr.Column(elem_classes="controls-bar"):
                with gr.Row():
                    with gr.Column(scale=4, min_width=260):
                        leaderboard_selector = gr.Dropdown(
                            choices=initial_leaderboards,
                            value=initial_leaderboard,
                            label="Leaderboard",
                            interactive=True,
                        )
                    with gr.Column(scale=1, min_width=120):
                        refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
                    with gr.Column(scale=1, min_width=120):
                        export_btn = gr.DownloadButton("📥 Export CSV", variant="secondary", size="sm")
                search_box = gr.Textbox(
                    label="Filter",
                    placeholder="Filter models...",
                    show_label=True,
                )
            header_view = gr.HTML(value=format_leaderboard_header(initial_leaderboard, _init_metadata))
            with gr.Row(elem_classes="column-selector-bar"):
                with gr.Column(scale=5, min_width=320):
                    column_selector = gr.Dropdown(
                        choices=_init_columns,
                        value=_init_columns,
                        label="Columns to Display",
                        multiselect=True,
                        interactive=True,
                        elem_classes="column-selector-dropdown",
                    )
            leaderboard_table = gr.Dataframe(
                value=_init_df_display,
                label=None,
                interactive=False,
                wrap=False,
                elem_classes="dataframe",
            )
            with gr.Row(elem_classes="pagination-bar"):
                prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
                page_info = gr.Markdown(value=f"1 / {_init_total_pages}", elem_classes="page-info")
                next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)
            metrics_view = gr.HTML(value=format_metric_details(initial_leaderboard, _init_metadata))
with gr.TabItem("🔍 Model Lookup"):
gr.Markdown("### Find and compare models across all leaderboards")
selected_models_state = gr.State(value=[])
default_compare_html = """
<div class="no-results">
<h3>Search for models to compare</h3>
<p>Type in the dropdown above, then click a model to add it</p>
</div>
"""
model_search_box = gr.Textbox(
label="Type to search for models",
placeholder="Start typing model name (e.g., gpt, llama, claude)...",
interactive=True,
)
with gr.Row(elem_classes="controls-bar"):
with gr.Column(scale=4):
model_dropdown = gr.Dropdown(
choices=[],
label="Select from search results",
interactive=True,
allow_custom_value=False,
)
with gr.Column(scale=1, min_width=100):
clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")
selected_models_group = gr.CheckboxGroup(
choices=[],
value=[],
label="Selected Models (click to remove)",
interactive=True,
elem_classes="selected-models-group"
)
with gr.Row():
with gr.Column(scale=4):
pass
with gr.Column(scale=1, min_width=120):
export_comparison_btn = gr.DownloadButton("📥 Export CSV", variant="secondary", size="sm")
radar_view = gr.Plot(label="Radar Comparison")
model_card_view = gr.HTML(value=default_compare_html)
with gr.Accordion("📤 How to Submit Data", open=False):
gr.Markdown("""
Submit via GitHub Pull Request:
1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
3. Open a PR - automated validation runs on submission
4. After merge, data syncs to HuggingFace automatically
[Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) - [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
""")
    def load_leaderboard(leaderboard_name):
        """Reload the table, header, metrics, and column choices for a leaderboard."""
        df, metadata = get_leaderboard_data(leaderboard_name)
        columns = [c for c in df.columns if c != "Model"] if not df.empty else []
        df_display, _, total_pages = filter_and_paginate(df, "", "Average", None, 1)
        return (
            df,                                                     # full_df_state
            metadata,                                               # metadata_state
            1,                                                      # current_page_state
            df_display,                                             # leaderboard_table
            format_leaderboard_header(leaderboard_name, metadata),  # header_view
            format_metric_details(leaderboard_name, metadata),      # metrics_view
            gr.update(choices=columns, value=columns),              # column_selector
            f"1 / {total_pages}",                                   # page_info
        )

    def update_table(full_df, search_query, selected_columns, current_page):
        """Re-filter and re-paginate the current leaderboard table."""
        df_display, page, total_pages = filter_and_paginate(
            full_df, search_query, "Average", selected_columns, current_page
        )
        return df_display, f"{page} / {total_pages}", page

    def go_page(full_df, search_query, selected_columns, current_page, delta):
        """Move one page forward or back; filter_and_paginate clamps out-of-range pages."""
        new_page = max(1, current_page + delta)
        df_display, page, total_pages = filter_and_paginate(
            full_df, search_query, "Average", selected_columns, new_page
        )
        return df_display, f"{page} / {total_pages}", page

    leaderboard_selector.change(
        fn=load_leaderboard,
        inputs=[leaderboard_selector],
        outputs=[
            full_df_state, metadata_state, current_page_state, leaderboard_table,
            header_view, metrics_view, column_selector, page_info,
        ],
    )
    search_box.input(
        fn=lambda df, q, cols: update_table(df, q, cols, 1),  # editing the filter resets to page 1
        inputs=[full_df_state, search_box, column_selector],
        outputs=[leaderboard_table, page_info, current_page_state],
    )

    def on_column_change(df, q, cols):
        """Treat an empty column selection as 'show all columns'."""
        if not cols:
            cols = [c for c in df.columns if c != "Model"]
        return update_table(df, q, cols, 1)

    column_selector.change(
        fn=on_column_change,
        inputs=[full_df_state, search_box, column_selector],
        outputs=[leaderboard_table, page_info, current_page_state],
    )
    prev_btn.click(
        fn=lambda df, q, cols, p: go_page(df, q, cols, p, -1),
        inputs=[full_df_state, search_box, column_selector, current_page_state],
        outputs=[leaderboard_table, page_info, current_page_state],
    )
    next_btn.click(
        fn=lambda df, q, cols, p: go_page(df, q, cols, p, 1),
        inputs=[full_df_state, search_box, column_selector, current_page_state],
        outputs=[leaderboard_table, page_info, current_page_state],
    )

    def refresh_leaderboards():
        """Clear cached data and reload the list of available leaderboards."""
        clear_cache()
        return gr.update(choices=get_available_leaderboards())

    refresh_btn.click(
        fn=refresh_leaderboards,
        outputs=[leaderboard_selector],
    )
    export_btn.click(
        fn=export_leaderboard_to_csv,
        inputs=[full_df_state, leaderboard_selector, search_box, column_selector],
        outputs=[export_btn],
    )
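
    # --- Model Lookup tab: callbacks and event wiring ---
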
    def add_model_and_compare(selected_model, current_selected):
        """Add a model picked from the dropdown to the selection and refresh the comparison."""
        if selected_model and selected_model not in current_selected:
            current_selected = current_selected + [selected_model]
        comparison_html, plot = compare_models(current_selected) if current_selected else (default_compare_html, None)
        return (
            current_selected,                                             # selected_models_state
            gr.update(value=""),                                          # clear the search box
            gr.update(value=None, choices=[]),                            # reset the suggestions dropdown
            gr.update(choices=current_selected, value=current_selected),  # selected_models_group
            comparison_html,                                              # model_card_view
            plot,                                                         # radar_view
        )

    def update_selection(selected_list):
        """Sync the checkbox group with the state and refresh the comparison."""
        comparison_html, plot = compare_models(selected_list) if selected_list else (default_compare_html, None)
        return selected_list, gr.update(choices=selected_list, value=selected_list), comparison_html, plot

    def clear_all_models():
        """Reset the lookup tab to its empty state."""
        return (
            [],                                 # selected_models_state
            gr.update(value=""),                # model_search_box
            gr.update(value=None, choices=[]),  # model_dropdown
            gr.update(choices=[], value=[]),    # selected_models_group
            default_compare_html,               # model_card_view
            None,                               # radar_view
        )

    model_search_box.input(
        fn=get_model_suggestions,
        inputs=[model_search_box],
        outputs=[model_dropdown],
    )
    model_dropdown.select(
        fn=add_model_and_compare,
        inputs=[model_dropdown, selected_models_state],
        outputs=[
            selected_models_state, model_search_box, model_dropdown,
            selected_models_group, model_card_view, radar_view,
        ],
    )
    selected_models_group.change(
        fn=update_selection,
        inputs=[selected_models_group],
        outputs=[selected_models_state, selected_models_group, model_card_view, radar_view],
    )
    clear_models_btn.click(
        fn=clear_all_models,
        outputs=[
            selected_models_state, model_search_box, model_dropdown,
            selected_models_group, model_card_view, radar_view,
        ],
    )
    export_comparison_btn.click(
        fn=export_comparison_to_csv,
        inputs=[selected_models_state],
        outputs=[export_comparison_btn],
    )
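
# Make sure the data directory exists before the app serves requests.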
DATA_DIR.mkdir(exist_ok=True)
if __name__ == "__main__":
    demo.launch()