import os
import re
from pathlib import Path
def patch_gradio_leaderboard():
    """Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x.

    Locates ``templates/component/Index-CzS_eGV6.js`` inside the installed
    ``gradio_leaderboard`` package and rewrites a fixed set of snippets in
    place. Nothing is written when the file is missing or when none of the
    snippets occur in it (e.g. the file has already been patched), so the
    function can be called on every start-up.
    """
    import gradio_leaderboard

    pkg_dir = Path(gradio_leaderboard.__file__).parent
    js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
    if not js_file.exists():
        return

    # Read and write with an explicit encoding so the round-trip does not
    # depend on the platform's locale default, which could otherwise mangle
    # non-ASCII characters in the bundle.
    original = js_file.read_text(encoding="utf-8")

    patches = (
        # Fix 1 & 2: Guard r[39]/a[39] filter callback (undefined during Svelte outro)
        (
            'r[0].filter(\n /*func*/\n r[39]\n ).map(qd)',
            '(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
        ),
        (
            'a[0].filter(\n /*func*/\n a[39]\n ).map(qd))',
            '(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
        ),
        # Fix 3: Lx (Boolean) extracted from Rx (globals) which is undefined in Gradio 5
        (
            '{ Boolean: Lx } = Rx,',
            'Lx = (Rx && Rx.Boolean) || Boolean,',
        ),
    )

    # str.replace leaves the text untouched when the snippet is absent, so a
    # simple before/after comparison tells us whether anything was patched.
    patched = original
    for old, new in patches:
        patched = patched.replace(old, new)

    if patched != original:
        js_file.write_text(patched, encoding="utf-8")
# Apply the on-disk JS patch as soon as the module is loaded.
# NOTE(review): presumably this must run before the gradio_leaderboard
# components are imported/served below so the patched bundle is the one
# that gets used — confirm.
patch_gradio_leaderboard()
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
from src.display.text_blocks import (
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
)
# Hub repository id handed to API.restart_space() by restart_space().
REPO_ID = "taagarwa/coding-agent-leaderboard"
# Access token taken from the HF_TOKEN environment variable (None when unset).
TOKEN = os.environ.get("HF_TOKEN")
# Shared Hub client used for the periodic restart.
API = HfApi(token=TOKEN)
def restart_space():
    """Request a restart of the Space identified by ``REPO_ID`` via the Hub client."""
    API.restart_space(repo_id=REPO_ID)
# Both tables are loaded once, at import time, and reused when the UI is built.
# Aggregated table shown in the "Leaderboard" tab.
LEADERBOARD_DF = get_leaderboard_df()
# Per-run table shown in the "Benchmark Runs" tab and summarised in the header.
BENCHMARK_RUN_DF = get_benchmark_run_df()
def extract_body(s: str) -> str:
    """Return the text enclosed by a leading ``[...]`` in *s*.

    The match is anchored at the start of the string and is non-greedy, so
    only the first bracketed group is used (``"[a][b]"`` yields ``"a"``).
    NOTE(review): cells are presumably Markdown links of the form
    ``[label](url)`` — confirm against the data loaders.

    Args:
        s: The cell text to inspect.

    Returns:
        The bracketed text, or *s* unchanged when it does not start with a
        ``[...]`` group. Previously such input raised ``AttributeError``
        because ``re.match`` returned ``None``.
    """
    match = re.match(r'\[(.*?)\]', s)
    if match is None:
        # Plain (non-link) cell: use the raw text as its own label.
        return s
    return match.group(1)
def build_header_html(df):
    """Build the HTML banner shown at the top of the page.

    The banner contains the title, a gradient rule, a tagline and four
    summary counters computed from *df*.

    Args:
        df: Table with at least the columns ``"Model"``, ``"Harness"`` and
            ``"Benchmark"``. Harness and benchmark cells are reduced to their
            bracketed label with ``extract_body`` before distinct values are
            counted; model cells are counted as-is.

    Returns:
        An HTML fragment as a string.
    """
    n_results = len(df)
    n_models = df["Model"].nunique()
    # Pass the function directly; wrapping it in a lambda adds nothing.
    n_harnesses = df["Harness"].apply(extract_body).nunique()
    n_benchmarks = df["Benchmark"].apply(extract_body).nunique()
    # The separators use the &middot; entity rather than a literal character:
    # the literal had been mis-decoded into a CJK glyph, and an entity renders
    # correctly regardless of how this file is encoded.
    return f"""
<base target="_blank">
<div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
<h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
Coding Agent Leaderboard
</h1>
<div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
<p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
Compare coding agents across models and harnesses
</p>
<div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
<span style="font-weight: 600;">{n_results} Results</span>
<span>&middot;</span>
<span style="font-weight: 600;">{n_models} Models</span>
<span>&middot;</span>
<span style="font-weight: 600;">{n_harnesses} Harnesses</span>
<span>&middot;</span>
<span style="font-weight: 600;">{n_benchmarks} Benchmarks</span>
</div>
</div>
"""
def init_leaderboard(dataframe):
    """Create the ``Leaderboard`` component for the aggregated results table.

    Every column that is not one of the fixed metadata columns is treated as
    a benchmark column and shown by default next to the category, harness and
    model columns.

    Args:
        dataframe: The table to display.

    Returns:
        A non-interactive ``Leaderboard`` with column selection, search on
        harness/model, and category/model/harness/size/precision filters.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # NOTE(review): the glyphs below look like mis-decoded emoji; they are
    # kept verbatim because the second tuple element is a filter value —
    # confirm the file encoding before touching them.
    category_options = [("馃煚 Fully FOSS", "馃煚"), ("馃敹 Proprietary", "馃敹")]
    metadata = [" ", "Harness", "Model", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
    score_columns = [name for name in dataframe.columns if name not in metadata]

    def labelled_choices(column):
        # (label, raw cell) pairs, de-duplicated and ordered by label.
        pairs = {(extract_body(cell), cell) for cell in dataframe[column]}
        return sorted(pairs)

    model_options = labelled_choices("Model")
    harness_options = labelled_choices("Harness")

    column_picker = SelectColumns(
        default_selection=[" ", "Harness", "Model"] + score_columns,
        label="Select Columns to Display:",
    )
    filters = [
        ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=category_options),
        ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_options),
        ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_options),
        ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
        ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
    ]
    return Leaderboard(
        value=dataframe,
        select_columns=column_picker,
        datatype="markdown",
        search_columns=["Harness", "Model"],
        filter_columns=filters,
        interactive=False,
    )
def init_benchmark_runs(dataframe):
    """Create the ``Leaderboard`` component for the per-run results table.

    Args:
        dataframe: The benchmark-run table to display.

    Returns:
        A non-interactive ``Leaderboard`` with column selection, search on
        benchmark/harness/model, and category/benchmark/size/precision
        filters.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty. The message names
            the benchmark-runs table; it previously repeated the leaderboard
            wording, which pointed at the wrong input.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Benchmark runs DataFrame is empty or None.")

    # Make ColumnFilter choices
    # NOTE(review): the glyphs below look like mis-decoded emoji; they are
    # kept verbatim because the second tuple element is a filter value —
    # confirm the file encoding before touching them.
    label_choices = [("馃煚 Fully FOSS", "馃煚"), ("馃敹 Proprietary", "馃敹")]
    # (label, raw cell) pairs, de-duplicated and ordered by label.
    benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})

    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=[
                " ",
                "Model",
                "Harness",
                "Benchmark",
                "Score",
                "Avg Cost Per Task (USD)",
            ],
            label="Select Columns to Display:",
        ),
        datatype="markdown",
        search_columns=[
            "Benchmark",
            "Harness",
            "Model",
        ],
        filter_columns=[
            ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
            ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
            ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
            ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
        ],
        interactive=False,
    )
# Build the page: header banner, introduction, then one tab per view.
# NOTE(review): the tab-label prefixes look like mis-decoded emoji; they are
# kept verbatim here — confirm the file encoding.
with gr.Blocks(theme="citrus") as demo:
    gr.HTML(build_header_html(BENCHMARK_RUN_DF))
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs():
        with gr.Tab("馃弳 Leaderboard"):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.Tab("馃弮 Benchmark Runs"):
            benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)

        with gr.Tab("馃摑 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Restart the Space from a background thread every 30 minutes.
RESTART_INTERVAL_SECONDS = 30 * 60
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=RESTART_INTERVAL_SECONDS)
scheduler.start()

# Enable the request queue, then serve the app.
queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()
|