| import os |
| import re |
| from pathlib import Path |
|
|
|
|
def patch_gradio_leaderboard():
    """Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x.

    Locates the installed ``gradio_leaderboard`` package and rewrites its
    bundled ``templates/component/Index-CzS_eGV6.js`` in place, swapping each
    known snippet for a guarded equivalent. The function returns silently
    when that bundle file does not exist, and leaves the file untouched when
    none of the snippets are present, so calling it again after a successful
    patch does not write anything.

    Raises:
        ImportError: If ``gradio_leaderboard`` is not installed.
        OSError: If the bundle cannot be read or written.
    """
    import gradio_leaderboard

    pkg_dir = Path(gradio_leaderboard.__file__).parent
    js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
    if not js_file.exists():
        return

    # Decode/encode explicitly so the bundle round-trips identically no
    # matter what the platform's default locale encoding is.
    original = js_file.read_text(encoding="utf-8")

    patches = [
        # Only apply the filter callback when slot 39 actually holds one.
        (
            'r[0].filter(\n /*func*/\n r[39]\n ).map(qd)',
            '(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
        ),
        (
            'a[0].filter(\n /*func*/\n a[39]\n ).map(qd))',
            '(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
        ),
        # Fall back to the global Boolean when Rx (or Rx.Boolean) is missing.
        (
            '{ Boolean: Lx } = Rx,',
            'Lx = (Rx && Rx.Boolean) || Boolean,',
        ),
    ]

    patched = original
    for old, new in patches:
        # Whitespace between JS tokens is insignificant, so accept any
        # amount of it wherever the snippet has some; this keeps the patch
        # working if the bundle is indented differently.
        pattern = r"\s+".join(re.escape(piece) for piece in old.split())
        # A callable replacement inserts `new` verbatim (no escape handling).
        patched = re.sub(pattern, lambda _match, new=new: new, patched)

    if patched != original:
        js_file.write_text(patched, encoding="utf-8")
|
|
|
|
# Apply the on-disk JS patch at import time, ahead of the gradio and
# gradio_leaderboard imports that follow.
patch_gradio_leaderboard()
|
|
| import gradio as gr |
| from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns |
| from apscheduler.schedulers.background import BackgroundScheduler |
| from huggingface_hub import HfApi |
|
|
| from src.leaderboard import get_leaderboard_df, get_benchmark_run_df |
| from src.display.text_blocks import ( |
| INTRODUCTION_TEXT, |
| LLM_BENCHMARKS_TEXT, |
| ) |
|
|
# Repository id of the Space that restart_space() restarts.
REPO_ID = "taagarwa/coding-agent-leaderboard"
# Hub access token taken from the environment; None when HF_TOKEN is unset.
TOKEN = os.getenv("HF_TOKEN")
# Hub client shared by the module.
API = HfApi(token=TOKEN)
|
|
def restart_space() -> None:
    """Ask the Hub, through the shared ``API`` client, to restart ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
# Both tables are built once, when the module is imported, and then handed
# to the UI builders below.
LEADERBOARD_DF = get_leaderboard_df()
BENCHMARK_RUN_DF = get_benchmark_run_df()
|
|
def extract_body(s: str):
    """Return the text inside the leading ``[...]`` of *s*.

    For a markdown link such as ``[name](url)`` this is ``name``. Only a
    bracketed segment at the very start of the string counts, and the
    shortest such segment is taken. A string that does not start with one
    (for example plain, un-linked text) is returned unchanged rather than
    raising ``AttributeError`` on the failed match.
    """
    match = re.match(r'\[(.*?)\]', s)
    if match is None:
        return s
    return match.group(1)
|
|
|
|
def build_header_html(df):
    """Build the HTML banner (title, tagline and summary counts) for the page.

    Args:
        df: Table with ``Model``, ``Harness`` and ``Benchmark`` columns.
            ``Harness`` and ``Benchmark`` cells are reduced with
            ``extract_body`` before distinct values are counted; ``Model``
            cells are counted as they are.

    Returns:
        An HTML fragment as a string.
    """
    n_results = len(df)
    n_models = df["Model"].nunique()
    n_harnesses = df["Harness"].apply(extract_body).nunique()
    n_benchmarks = df["Benchmark"].apply(extract_body).nunique()

    # The separators were mis-decoded characters (a UTF-8 middle dot read
    # with the wrong codec); the HTML entity is immune to file encoding.
    return f"""
    <base target="_blank">
    <div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
        <h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
            Coding Agent Leaderboard
        </h1>
        <div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
        <p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
            Compare coding agents across models and harnesses
        </p>
        <div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
            <span style="font-weight: 600;">{n_results} Results</span>
            <span>&middot;</span>
            <span style="font-weight: 600;">{n_models} Models</span>
            <span>&middot;</span>
            <span style="font-weight: 600;">{n_harnesses} Harnesses</span>
            <span>&middot;</span>
            <span style="font-weight: 600;">{n_benchmarks} Benchmarks</span>
        </div>
    </div>
    """
| |
def init_leaderboard(dataframe):
    """Create the ``Leaderboard`` component for the aggregated results table.

    Every column that is not one of the fixed metadata columns is treated as
    a benchmark column and shown by default next to the category marker,
    harness and model.

    Args:
        dataframe: Table to display. It is indexed by ``Model`` and
            ``Harness`` here, and the filters reference the `` ``,
            ``Model Num Params (B)`` and ``Precision`` columns.

    Returns:
        A non-interactive ``Leaderboard`` with search, column selection and
        filters configured.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # The category markers had been mis-decoded (UTF-8 emoji read with the
    # wrong codec). Escapes keep them intact whatever the file encoding is.
    # NOTE(review): filter values presumably have to equal the markers stored
    # in the " " column -- confirm against the data loader.
    foss_marker = "\U0001F7E0"  # large orange circle
    proprietary_marker = "\U0001F536"  # large orange diamond
    label_choices = [
        (f"{foss_marker} Fully FOSS", foss_marker),
        (f"{proprietary_marker} Proprietary", proprietary_marker),
    ]

    meta_columns = {
        " ",
        "Harness",
        "Model",
        "Harness License",
        "Model License",
        "Model Num Params (B)",
        "Precision",
    }
    # Iterating the frame's own columns preserves their display order.
    benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]

    # (label, value) pairs: the label is the bracketed text, the value the raw cell.
    model_choices = sorted({(extract_body(v), v) for v in dataframe["Model"]})
    harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})

    default_columns = [" ", "Harness", "Model"] + benchmark_columns
    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=default_columns,
            label="Select Columns to Display:",
        ),
        datatype="markdown",
        search_columns=["Harness", "Model"],
        filter_columns=[
            ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
            ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_choices),
            ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
            ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
            ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
        ],
        interactive=False,
    )
|
|
def init_benchmark_runs(dataframe):
    """Create the ``Leaderboard`` component for the per-run results table.

    Args:
        dataframe: Table to display. It is indexed by ``Benchmark`` here, and
            the component configuration references the `` ``, ``Model``,
            ``Harness``, ``Score``, ``Avg Cost Per Task (USD)``,
            ``Model Num Params (B)`` and ``Precision`` columns.

    Returns:
        A non-interactive ``Leaderboard`` with search, column selection and
        filters configured.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        # The message used to name the leaderboard table (copy-paste slip).
        raise ValueError("Benchmark runs DataFrame is empty or None.")

    # The category markers had been mis-decoded (UTF-8 emoji read with the
    # wrong codec). Escapes keep them intact whatever the file encoding is.
    # NOTE(review): filter values presumably have to equal the markers stored
    # in the " " column -- confirm against the data loader.
    foss_marker = "\U0001F7E0"  # large orange circle
    proprietary_marker = "\U0001F536"  # large orange diamond
    label_choices = [
        (f"{foss_marker} Fully FOSS", foss_marker),
        (f"{proprietary_marker} Proprietary", proprietary_marker),
    ]

    # (label, value) pairs: the label is the bracketed text, the value the raw cell.
    benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})

    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=[
                " ",
                "Model",
                "Harness",
                "Benchmark",
                "Score",
                "Avg Cost Per Task (USD)",
            ],
            label="Select Columns to Display:",
        ),
        datatype="markdown",
        search_columns=[
            "Benchmark",
            "Harness",
            "Model",
        ],
        filter_columns=[
            ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
            ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
            ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
            ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
        ],
        interactive=False,
    )
|
|
# Page layout: header banner, introduction, then one tab per view.
demo = gr.Blocks(theme="citrus")
with demo:
    # The header counts are computed from the per-run table.
    gr.HTML(build_header_html(BENCHMARK_RUN_DF))
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Tab icons are written as escapes: the literals had been mis-decoded
    # (UTF-8 emoji read with the wrong codec).
    with gr.Tabs():
        with gr.Tab("\U0001F3C6 Leaderboard"):  # trophy
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.Tab("\U0001F3C3 Benchmark Runs"):  # runner
            benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)

        with gr.Tab("\U0001F4DD About"):  # memo
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Restart the Space every 1800 seconds (30 minutes) from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
|
|