Spaces:

taagarwa
/

coding-agent-leaderboard

Running

File size: 6,697 Bytes

fa0576d
 
abb343c
 
 
 
ff489b1
abb343c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0576d
 
 
 
 
 
067ad94
fa0576d
 
 
 
 
 
 
 
 
 
 
 
 
 
067ad94
fa0576d
 
 
 
 
7a6725b
 
d0aedab
 
067ad94
7a6725b
 
 
 
 
 
 
 
 
37f1252
7a6725b
 
 
 
 
 
 
 
067ad94
7a6725b
 
 
067ad94
 
 
 
 
 
77a435c
067ad94
77a435c
067ad94
7a6725b
77a435c
067ad94
 
 
 
 
 
 
77a435c
067ad94
 
77a435c
067ad94
4b9a7ba
067ad94
 
 
 
7a6725b
067ad94
fa0576d
 
 
be7275a
 
067ad94
fa0576d
 
 
 
067ad94
 
 
77a435c
 
067ad94
0b3694d
067ad94
fa0576d
 
 
067ad94
 
 
 
 
fa0576d
be7275a
067ad94
fc97436
d0aedab
fa0576d
 
 
 
7a6725b
fa0576d
067ad94
fa0576d
 
5dfc258
067ad94
abb343c
5dfc258
067ad94
abb343c
fa0576d
067ad94
fa0576d

import os
import re
from pathlib import Path


def patch_gradio_leaderboard():
    """Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x.

    Locates the installed ``gradio_leaderboard`` package, reads one specific
    bundled JS file and applies a fixed list of literal string replacements.
    The file is written back only when at least one replacement changed its
    content. If the target file does not exist the function returns without
    doing anything.

    The file is read and written explicitly as UTF-8 so the result does not
    depend on the process locale.
    """
    import gradio_leaderboard
    pkg_dir = Path(gradio_leaderboard.__file__).parent
    js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
    if not js_file.exists():
        return

    original = js_file.read_text(encoding="utf-8")

    # (old, new) pairs of literal snippets; each is applied only if present.
    patches = [
        # Fix 1 & 2: Guard r[39]/a[39] filter callback (undefined during Svelte outro)
        (
            'r[0].filter(\n        /*func*/\n        r[39]\n      ).map(qd)',
            '(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
        ),
        (
            'a[0].filter(\n          /*func*/\n          a[39]\n        ).map(qd))',
            '(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
        ),
        # Fix 3: Lx (Boolean) extracted from Rx (globals) which is undefined in Gradio 5
        (
            '{ Boolean: Lx } = Rx,',
            'Lx = (Rx && Rx.Boolean) || Boolean,',
        ),
    ]

    patched = original
    for old, new in patches:
        # str.replace is a no-op when `old` is absent, so no membership test
        # is needed first.
        patched = patched.replace(old, new)

    # Skip the write when nothing matched (e.g. the file was already patched).
    if patched != original:
        js_file.write_text(patched, encoding="utf-8")


# Apply the on-disk JS patch before gradio / gradio_leaderboard are imported
# at module level below.
# NOTE(review): presumably the ordering ensures the patched bundle is the one
# that gets served — confirm.
patch_gradio_leaderboard()

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
from src.display.text_blocks import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
)

# Repo id handed to the Hub API when restarting (see restart_space).
REPO_ID = "taagarwa/coding-agent-leaderboard"
# Token read from the HF_TOKEN environment variable; None when it is unset.
TOKEN = os.environ.get("HF_TOKEN")
# Hub client created once at import time with that token.
API = HfApi(token=TOKEN)

def restart_space() -> None:
    """Ask the Hub API to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)


# Both tables are loaded once, at import time, and reused when the UI is built.
# NOTE(review): presumably pandas DataFrames (callers use `.empty`, column
# indexing and `.nunique()`) — confirm against src.leaderboard.
LEADERBOARD_DF = get_leaderboard_df()
BENCHMARK_RUN_DF = get_benchmark_run_df()

def extract_body(s: str) -> str:
    """Return the text inside the leading ``[...]`` of *s*.

    The pattern is anchored at the start of the string and is non-greedy, so
    only the first bracketed group is returned (``"[a][b]"`` gives ``"a"``).

    Args:
        s: Cell text, e.g. a markdown link such as ``"[label](url)"``.

    Returns:
        The text between the first ``[`` and the next ``]``. When *s* does
        not start with a bracketed group, *s* itself is returned unchanged so
        that plain-text cells do not raise.
    """
    match = re.match(r'\[(.*?)\]', s)
    if match is None:
        # No leading "[...]": treat the whole cell as its own label instead of
        # failing with AttributeError on `None.group`.
        return s
    return match.group(1)


def build_header_html(df):
    """Render the page header HTML with summary counts taken from *df*.

    Args:
        df: Table providing ``"Model"``, ``"Harness"`` and ``"Benchmark"``
            columns. Harness and Benchmark cells are reduced with
            ``extract_body`` before distinct values are counted.

    Returns:
        An HTML string containing the title, a subtitle and the counts of
        rows, distinct models, distinct harnesses and distinct benchmarks.
    """
    total_rows = len(df)
    distinct_models = df["Model"].nunique()
    # Count by bracketed label rather than by the raw cell text.
    distinct_harnesses = df["Harness"].apply(extract_body).nunique()
    distinct_benchmarks = df["Benchmark"].apply(extract_body).nunique()

    header_html = f"""
    <base target="_blank">
    <div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
        <h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
            Coding Agent Leaderboard
        </h1>
        <div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
        <p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
            Compare coding agents across models and harnesses
        </p>
        <div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
            <span style="font-weight: 600;">{total_rows} Results</span>
            <span>·</span>
            <span style="font-weight: 600;">{distinct_models} Models</span>
            <span>·</span>
            <span style="font-weight: 600;">{distinct_harnesses} Harnesses</span>
            <span>·</span>
            <span style="font-weight: 600;">{distinct_benchmarks} Benchmarks</span>
        </div>
    </div>
    """
    return header_html
    
def init_leaderboard(dataframe):
    """Build the main ``Leaderboard`` component from *dataframe*.

    Every column that is not one of the fixed metadata columns is treated as
    a benchmark column and shown by default next to the category marker,
    Harness and Model columns.

    Args:
        dataframe: Table to display; must not be ``None`` or empty.

    Returns:
        A non-interactive ``Leaderboard`` with column selection, search on
        Harness/Model, and filters for category, model, harness, parameter
        count and precision.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    category_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
    non_benchmark = (
        " ",
        "Harness",
        "Model",
        "Harness License",
        "Model License",
        "Model Num Params (B)",
        "Precision",
    )
    score_columns = [name for name in dataframe.columns if name not in non_benchmark]

    def _choices_for(column):
        # Unique (label, raw cell) pairs, ordered by label then raw cell.
        return sorted({(extract_body(cell), cell) for cell in dataframe[column]})

    model_options = _choices_for("Model")
    harness_options = _choices_for("Harness")

    column_picker = SelectColumns(
        default_selection=[" ", "Harness", "Model"] + score_columns,
        label="Select Columns to Display:",
    )
    filters = [
        ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=category_choices),
        ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_options),
        ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_options),
        ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
        ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
    ]

    return Leaderboard(
        value=dataframe,
        select_columns=column_picker,
        datatype="markdown",
        search_columns=["Harness", "Model"],
        filter_columns=filters,
        interactive=False,
    )

def init_benchmark_runs(dataframe):
    """Build the per-run ``Leaderboard`` component from *dataframe*.

    Args:
        dataframe: Table of individual benchmark runs; must not be ``None``
            or empty.

    Returns:
        A non-interactive ``Leaderboard`` with column selection, search on
        Benchmark/Harness/Model, and filters for category, benchmark,
        parameter count and precision.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        # Name the benchmark-run table so this failure can be told apart from
        # the one raised for the main leaderboard table.
        raise ValueError("Benchmark runs DataFrame is empty or None.")

    # Make ColumnFilter choices
    label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
    # Unique (label, raw cell) pairs, sorted by label then raw cell.
    benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})

    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=[
                " ",
                "Model",
                "Harness",
                "Benchmark",
                "Score",
                "Avg Cost Per Task (USD)",
            ],
            label="Select Columns to Display:",
        ),
        datatype="markdown",
        search_columns=[
            "Benchmark",
            "Harness",
            "Model",
        ],
        filter_columns=[
            ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
            ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
            ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
            ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
        ],
        interactive=False,
    )

# Assemble the UI: header, introduction text, then three tabs.
demo = gr.Blocks(theme="citrus")
with demo:
    # Header counts are computed from the per-run table, not the leaderboard table.
    gr.HTML(build_header_html(BENCHMARK_RUN_DF))
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs():
        with gr.Tab("🏆 Leaderboard"):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.Tab("🏃 Benchmark Runs"):
            benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)

        with gr.Tab("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Schedule restart_space to run every 1800 seconds (30 minutes) in the background.
# NOTE(review): presumably the restart is how the import-time tables get
# refreshed — confirm.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
# Enable queuing with a default concurrency limit of 40, then launch the app.
demo.queue(default_concurrency_limit=40).launch()