| import sys |
| import time |
| from datetime import datetime, timezone |
|
|
| import gradio as gr |
| from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns |
| import pandas as pd |
| import plotly.express as px |
| from apscheduler.schedulers.background import BackgroundScheduler |
| from huggingface_hub import snapshot_download |
|
|
| from src.about import ( |
| CITATION_BUTTON_LABEL, |
| CITATION_BUTTON_TEXT, |
| EVALUATION_QUEUE_TEXT, |
| INTRODUCTION_TEXT, |
| LLM_BENCHMARKS_TEXT, |
| TITLE, |
| ) |
| from src.display.css_html_js import custom_css |
| from src.display.utils import ( |
| BENCHMARK_COLS, |
| COLS, |
| EVAL_COLS, |
| EVAL_TYPES, |
| AutoEvalColumn, |
| ModelType, |
| fields, |
| WeightType, |
| Precision, |
| ) |
| from src.envs import ( |
| API, |
| EVAL_REQUESTS_PATH, |
| EVAL_RESULTS_PATH, |
| QUEUE_REPO, |
| REPO_ID, |
| RESULTS_REPO, |
| TOKEN, |
| WORKFLOWS, |
| get_eval_results_path, |
| get_eval_requests_path, |
| ) |
| from src.leaderboard.read_evals import clear_eval_cache |
| from src.populate import ( |
| get_evaluation_queue_df, |
| get_leaderboard_df, |
| get_trend_summary_df, |
| get_trend_history_df, |
| get_combined_trend_history_df, |
| get_combined_trend_summary_df, |
| ) |
| from src.submission.submit import add_new_eval |
|
|
| |
# When launched as `python app.py --local`, skip all HF Hub downloads and the
# periodic restart scheduler and work purely from local directories.
LOCAL_MODE = "--local" in sys.argv
|
|
|
|
def restart_space():
    """Restart this Space via the HF Hub API (last-resort recovery after repeated failures)."""
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
| |
|
|
# Number of snapshot_download attempts before giving up and restarting the Space.
MAX_DOWNLOAD_RETRIES = 3
|
|
|
|
def download_with_retry(repo_id: str, local_dir: str, label: str) -> None:
    """Download a HF Hub dataset snapshot, retrying on failure.

    Makes up to ``MAX_DOWNLOAD_RETRIES`` attempts, sleeping with a linearly
    growing back-off (10s, 20s, ...) between attempts. Only once every
    attempt has failed is the Space restarted as a last resort.
    """
    attempt = 0
    while attempt < MAX_DOWNLOAD_RETRIES:
        attempt += 1
        print(f"Downloading {label} ({repo_id}) β attempt {attempt}/{MAX_DOWNLOAD_RETRIES}")
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN,
            )
        except Exception as err:
            print(f"WARNING: Failed to download {label}: {err}")
            if attempt >= MAX_DOWNLOAD_RETRIES:
                # All retries exhausted — recover by restarting the Space.
                print(f"ERROR: All {MAX_DOWNLOAD_RETRIES} download attempts failed for {label}. Restarting space.")
                restart_space()
            else:
                wait = 10 * attempt
                print(f" Retrying in {wait}s...")
                time.sleep(wait)
        else:
            # Success: snapshot is on disk, nothing more to do.
            return
|
|
|
|
# On the hosted Space, sync both eval datasets from the Hub at startup.
# In local mode we rely on whatever is already present on disk.
if not LOCAL_MODE:
    download_with_retry(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
    download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
else:
    print("LOCAL MODE: skipping HF Hub downloads, using local eval-results/ and eval-queue/")
|
|
| |
# Workflow-specific results/requests directories derived from the base paths.
SINGLE_AGENT_RESULTS = get_eval_results_path("single_agent")
SINGLE_AGENT_REQUESTS = get_eval_requests_path("single_agent")
MULTI_AGENT_RESULTS = get_eval_results_path("multi_agent")
MULTI_AGENT_REQUESTS = get_eval_requests_path("multi_agent")

# Single-agent leaderboard table. "T" is filled with a 1-based sequential
# index per row (presumably a display rank column — confirm against
# src.display.utils.AutoEvalColumn).
LEADERBOARD_DF = get_leaderboard_df(SINGLE_AGENT_RESULTS, SINGLE_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF.empty:
    LEADERBOARD_DF["T"] = range(1, len(LEADERBOARD_DF) + 1)

# Multi-agent leaderboard table, built the same way.
LEADERBOARD_DF_MULTI = get_leaderboard_df(MULTI_AGENT_RESULTS, MULTI_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF_MULTI.empty:
    LEADERBOARD_DF_MULTI["T"] = range(1, len(LEADERBOARD_DF_MULTI) + 1)
|
|
| |
# Initial trend data. Failures degrade to empty frames so the app still boots;
# the Trends tab then renders its "no data" placeholder.
try:
    TREND_SUMMARY_DF = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
    TREND_HISTORY_DF = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
except Exception as e:
    print(f"WARNING: Failed to load trend data: {e}")
    TREND_SUMMARY_DF = pd.DataFrame()
    TREND_HISTORY_DF = pd.DataFrame()
|
|
# Load the three evaluation-queue views (finished / running / pending).
# Failures degrade to empty frames with the expected columns so the Submit
# tab's accordions still render.
try:
    (
        finished_eval_queue_df,
        running_eval_queue_df,
        pending_eval_queue_df,
    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
except Exception as e:
    print(f"WARNING: Failed to load evaluation queue: {e}")
    _empty_queue = pd.DataFrame(columns=EVAL_COLS)
    # Give each view its own copy — previously `finished_eval_queue_df`
    # aliased `_empty_queue` itself while the other two were copies, so a
    # later mutation of one frame could leak into the template.
    finished_eval_queue_df = _empty_queue.copy()
    running_eval_queue_df = _empty_queue.copy()
    pending_eval_queue_df = _empty_queue.copy()
|
|
|
|
def build_trend_chart(history_df: pd.DataFrame, workflow_filter: str = "All"):
    """Build a Plotly line chart showing model scores over time.

    Parameters
    ----------
    history_df : pd.DataFrame
        Combined history with an optional ``workflow`` column.
    workflow_filter : str
        One of "All", "single_agent", or "multi_agent".

    Returns
    -------
    plotly.graph_objects.Figure
        Always returns a figure; empty/filtered-out data yields a
        placeholder chart so the UI never shows a broken plot.
    """

    def _placeholder(title: str):
        # Shared empty-state figure (previously duplicated inline twice).
        fig = px.line(title=title)
        fig.update_layout(xaxis_title="Date", yaxis_title="Average Score (%)")
        return fig

    if history_df.empty:
        return _placeholder("No historical data available yet")

    df = history_df.copy()

    # Narrow to a single workflow when requested and the column exists.
    if workflow_filter != "All" and "workflow" in df.columns:
        df = df[df["workflow"] == workflow_filter]

    if df.empty:
        return _placeholder(f"No data available for workflow: {workflow_filter}")

    # NOTE: the original also built an unused `df["label"]` column here
    # (model + workflow); it was dead code — px.line below is given explicit
    # x/y/color/line_dash columns — so it has been removed.
    if "workflow" in df.columns and workflow_filter == "All":
        # Color encodes the model; dash style distinguishes the workflow
        # (solid vs dashed), matching the explanation in the Trends tab.
        fig = px.line(
            df,
            x="eval_date",
            y="average",
            color="model",
            line_dash="workflow",
            markers=True,
            title="Model Performance Over Time",
            labels={
                "eval_date": "Evaluation Date",
                "average": "Average Score (%)",
                "model": "Model",
                "workflow": "Workflow",
            },
        )
    else:
        fig = px.line(
            df,
            x="eval_date",
            y="average",
            color="model",
            markers=True,
            title="Model Performance Over Time",
            labels={
                "eval_date": "Evaluation Date",
                "average": "Average Score (%)",
                "model": "Model",
            },
        )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Average Score (%)",
        legend_title="Model",
        hovermode="x unified",
        # Fixed y-range keeps charts comparable across refreshes; 105 leaves
        # headroom so markers at 100% are not clipped.
        yaxis=dict(range=[0, 105]),
    )
    return fig
|
|
|
|
def filter_trend_data(workflow_filter: str = "All"):
    """Filter the already-loaded trend data in memory (no re-download).

    Used by the workflow dropdown so that switching filters is instant.

    Parameters
    ----------
    workflow_filter : str
        One of "All", "single_agent", or "multi_agent".

    Returns
    -------
    tuple
        ``(figure, summary_df, timestamp)`` matching the Gradio outputs
        (trend chart, summary table, "last updated" textbox).
    """
    # This function only *reads* the module-level frames, so the original
    # `global` declaration was unnecessary and has been dropped.
    summary_df = TREND_SUMMARY_DF.copy() if not TREND_SUMMARY_DF.empty else pd.DataFrame()
    history_df = TREND_HISTORY_DF

    # The summary table uses a capitalized "Workflow" column (display-facing),
    # unlike the lowercase "workflow" column in the history frame.
    if workflow_filter != "All" and not summary_df.empty and "Workflow" in summary_df.columns:
        summary_df = summary_df[summary_df["Workflow"] == workflow_filter]

    chart = build_trend_chart(history_df, workflow_filter)
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    return chart, summary_df, timestamp
|
|
|
|
def refresh_trend_data(workflow_filter: str = "All"):
    """Re-download eval results from HF Hub and recompute trend data.

    Returns updated values for the trend chart, summary table, and a
    last-updated timestamp string. Errors during download or computation
    are caught so the UI never crashes.
    """
    global TREND_HISTORY_DF, TREND_SUMMARY_DF

    try:
        # In local mode the on-disk results are used as-is.
        if not LOCAL_MODE:
            download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")

        # Drop memoized eval results so the recomputation below sees the
        # freshly downloaded files.
        clear_eval_cache()

        summary = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
        history = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
    except Exception as e:
        print(f"WARNING: Failed to refresh trend data: {e}")
        summary = pd.DataFrame()
        history = pd.DataFrame()

    TREND_SUMMARY_DF = summary
    TREND_HISTORY_DF = history

    # Reuse the in-memory filter path to produce the three Gradio outputs.
    return filter_trend_data(workflow_filter)
|
|
|
|
def init_leaderboard(dataframe):
    """Build a Leaderboard component from *dataframe*.

    A ``None`` or empty frame is replaced by an empty frame with the
    expected columns so the component still renders.
    """
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=COLS)

    # Resolve the column metadata once and derive the display groupings.
    column_fields = fields(AutoEvalColumn)
    shown_by_default = [c.name for c in column_fields if c.displayed_by_default]
    always_shown = [c.name for c in column_fields if c.never_hidden]
    hidden_cols = [c.name for c in column_fields if c.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in column_fields],
        select_columns=SelectColumns(
            default_selection=shown_by_default,
            cant_deselect=always_shown,
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_cols,
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name,
                type="boolean",
                label="Deleted/incomplete",
                default=False,
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Component creation order inside the `with` blocks determines
# layout, so the structure below is intentionally left untouched.
# ---------------------------------------------------------------------------
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text", elem_id="cg-intro-block")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: single-agent leaderboard table.
        with gr.TabItem("π Single-Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        # Tab 1: multi-agent leaderboard table.
        with gr.TabItem("π€ Multi-Agent Benchmark", elem_id="multi-agent-benchmark-tab-table", id=1):
            leaderboard_multi = init_leaderboard(LEADERBOARD_DF_MULTI)

        # Tab 2: performance-over-time charts with a workflow filter and a
        # manual refresh button.
        with gr.TabItem("π Trends", elem_id="llm-benchmark-tab-trends", id=2):
            gr.Markdown(
                "### Performance Trends\n"
                "Track how model scores change over time across workflows. "
                "Averages are computed over available evaluation days within each window. "
                "The *(N/M)* annotation shows how many days of data were available.\n\n"
                "When viewing **All** workflows, single-agent results are shown with solid lines "
                "and multi-agent results with dashed lines.",
                elem_classes="markdown-text",
                elem_id="cg-trends-header",
            )
            with gr.Row(elem_id="cg-trend-controls"):
                workflow_filter = gr.Dropdown(
                    choices=["All", "single_agent", "multi_agent"],
                    value="All",
                    label="Workflow",
                    interactive=True,
                    scale=0,
                    min_width=180,
                )
                refresh_btn = gr.Button("π Refresh", scale=0, min_width=120, elem_id="cg-refresh-btn")
                last_updated_box = gr.Textbox(
                    label="Last updated",
                    value=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
                    interactive=False,
                    scale=1,
                )
            trend_chart = gr.Plot(value=build_trend_chart(TREND_HISTORY_DF), elem_id="cg-trend-chart")
            gr.Markdown("### Summary: 1-Day / 3-Day / 7-Day Averages", elem_id="cg-trend-summary-label")
            trend_table = gr.Dataframe(
                value=TREND_SUMMARY_DF,
                interactive=False,
                elem_id="cg-trend-summary",
            )

            # Dropdown change filters the in-memory data (instant, no download).
            workflow_filter.change(
                fn=filter_trend_data,
                inputs=[workflow_filter],
                outputs=[trend_chart, trend_table, last_updated_box],
            )

            # Refresh button re-downloads results and recomputes everything.
            refresh_btn.click(
                fn=refresh_trend_data,
                inputs=[workflow_filter],
                outputs=[trend_chart, trend_table, last_updated_box],
            )

        # Tab 3: static documentation about the benchmarks.
        with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text", elem_id="cg-about-content")

        # Tab 4: submission form plus the three queue-status accordions.
        with gr.TabItem("π Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text", elem_id="cg-submit-guide")

                with gr.Column():
                    # Queue counts are computed once at startup; the tables are
                    # not live-updated until the app restarts.
                    with gr.Accordion(
                        f"β Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"π Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"β³ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# βοΈβ¨ Submit your model here!", elem_classes="markdown-text", elem_id="cg-submit-heading")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
                    workflow_type = gr.Dropdown(
                        choices=["Both", "single_agent", "multi_agent"],
                        label="Workflow",
                        multiselect=False,
                        value="Both",
                        interactive=True,
                    )

            submit_button = gr.Button("Submit Eval", elem_id="cg-submit-btn")
            submission_result = gr.Markdown()
            # Positional arg order must match add_new_eval's signature.
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    workflow_type,
                ],
                submission_result,
            )

    # Footer: copyable citation block shared by all tabs.
    with gr.Row(elem_id="cg-citation-section"):
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
|
|
# On the hosted Space, restart every 21600s (6 hours) — presumably so the
# startup path re-downloads fresh eval data; confirm against Space docs.
if not LOCAL_MODE:
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=21600)
    scheduler.start()

# Queue limits concurrent event handlers (e.g. simultaneous refresh clicks).
demo.queue(default_concurrency_limit=4).launch()
|
|