File size: 6,697 Bytes
fa0576d
 
abb343c
 
 
 
ff489b1
abb343c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0576d
 
 
 
 
 
067ad94
fa0576d
 
 
 
 
 
 
 
 
 
 
 
 
 
067ad94
fa0576d
 
 
 
 
7a6725b
 
d0aedab
 
067ad94
7a6725b
 
 
 
 
 
 
 
 
37f1252
7a6725b
 
 
 
 
 
 
 
067ad94
7a6725b
 
 
067ad94
 
 
 
 
 
77a435c
067ad94
77a435c
067ad94
7a6725b
77a435c
067ad94
 
 
 
 
 
 
77a435c
067ad94
 
77a435c
067ad94
4b9a7ba
067ad94
 
 
 
7a6725b
067ad94
fa0576d
 
 
be7275a
 
067ad94
fa0576d
 
 
 
067ad94
 
 
77a435c
 
067ad94
0b3694d
067ad94
fa0576d
 
 
067ad94
 
 
 
 
fa0576d
be7275a
067ad94
fc97436
d0aedab
fa0576d
 
 
 
7a6725b
fa0576d
067ad94
fa0576d
 
5dfc258
067ad94
abb343c
5dfc258
067ad94
abb343c
fa0576d
067ad94
fa0576d
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os
import re
from pathlib import Path


def patch_gradio_leaderboard():
    """Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x.

    Locates ``templates/component/Index-CzS_eGV6.js`` inside the installed
    ``gradio_leaderboard`` package and applies a fixed list of literal
    search-and-replace patches to it. The file is rewritten only when at
    least one patch actually changed the text, so running this again on an
    already-patched bundle is a no-op. If the bundle file does not exist
    the function returns without doing anything.

    The bundle is read and written explicitly as UTF-8 so the result does
    not depend on the platform's default locale encoding.

    Returns:
        None.
    """
    import gradio_leaderboard

    pkg_dir = Path(gradio_leaderboard.__file__).parent
    js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
    if not js_file.exists():
        return

    original_src = js_file.read_text(encoding="utf-8")

    # Each entry is (exact text to find, replacement text).
    patches = [
        # Fix 1 & 2: Guard r[39]/a[39] filter callback (undefined during Svelte outro)
        (
            'r[0].filter(\n        /*func*/\n        r[39]\n      ).map(qd)',
            '(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
        ),
        (
            'a[0].filter(\n          /*func*/\n          a[39]\n        ).map(qd))',
            '(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
        ),
        # Fix 3: Lx (Boolean) extracted from Rx (globals) which is undefined in Gradio 5
        (
            '{ Boolean: Lx } = Rx,',
            'Lx = (Rx && Rx.Boolean) || Boolean,',
        ),
    ]

    patched_src = original_src
    for old, new in patches:
        # str.replace leaves the text untouched when `old` is absent.
        patched_src = patched_src.replace(old, new)

    # Only touch the file on disk when something was actually replaced.
    if patched_src != original_src:
        js_file.write_text(patched_src, encoding="utf-8")


# Apply the JS patch at import time, before the gradio / gradio_leaderboard
# components are imported below.
patch_gradio_leaderboard()

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
from src.display.text_blocks import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
)

# Repository id passed to API.restart_space() by restart_space().
REPO_ID = "taagarwa/coding-agent-leaderboard"
# Token read from the HF_TOKEN environment variable; None when it is unset.
TOKEN = os.environ.get("HF_TOKEN")
# Shared HfApi client constructed with that token.
API = HfApi(token=TOKEN)

def restart_space():
    """Request a restart of the Space identified by ``REPO_ID`` via ``API``."""
    API.restart_space(repo_id=REPO_ID)


# Both tables are loaded once at import time; the UI built further down reads
# these module-level frames.
LEADERBOARD_DF = get_leaderboard_df()
BENCHMARK_RUN_DF = get_benchmark_run_df()

def extract_body(s: str):
    """Return the text inside the leading ``[...]`` of *s*.

    The match is anchored at the start of the string and is non-greedy, so
    only the first bracket pair is used (``"[a][b]"`` yields ``"a"``).

    Args:
        s: Text that is expected to start with a bracketed label, such as a
            markdown link ``[label](url)``.

    Returns:
        The text between the first ``[`` and the next ``]``. If *s* does not
        start with a bracketed label, *s* is returned unchanged instead of
        failing on the missing match.
    """
    match = re.match(r'\[(.*?)\]', s)
    # re.match returns None when there is no leading "[...]"; treat the whole
    # string as its own body in that case rather than raising AttributeError.
    return match.group(1) if match else s


def build_header_html(df):
    """Render the page header banner as an HTML string.

    The banner shows the title, a tagline and four summary counts taken from
    *df*: number of rows, distinct ``Model`` values, and distinct bracketed
    labels (via ``extract_body``) in the ``Harness`` and ``Benchmark`` columns.

    Args:
        df: Table with ``Model``, ``Harness`` and ``Benchmark`` columns.

    Returns:
        The header markup as a single string.
    """
    n_results = len(df)
    n_models = df["Model"].nunique()

    # Harness and Benchmark cells are reduced to their bracketed label before
    # counting distinct values.
    harness_labels = df["Harness"].apply(extract_body)
    n_harnesses = harness_labels.nunique()
    benchmark_labels = df["Benchmark"].apply(extract_body)
    n_benchmarks = benchmark_labels.nunique()

    return f"""
    <base target="_blank">
    <div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
        <h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
            Coding Agent Leaderboard
        </h1>
        <div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
        <p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
            Compare coding agents across models and harnesses
        </p>
        <div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
            <span style="font-weight: 600;">{n_results} Results</span>
            <span>路</span>
            <span style="font-weight: 600;">{n_models} Models</span>
            <span>路</span>
            <span style="font-weight: 600;">{n_harnesses} Harnesses</span>
            <span>路</span>
            <span style="font-weight: 600;">{n_benchmarks} Benchmarks</span>
        </div>
    </div>
    """
    
def init_leaderboard(dataframe):
    """Build the main ``Leaderboard`` component from *dataframe*.

    Every column that is not one of the fixed metadata columns is treated as
    a benchmark column and shown by default next to the category, harness
    and model columns. Filter choices for ``Model`` and ``Harness`` are the
    sorted, de-duplicated ``(label, cell value)`` pairs found in the data,
    where the label comes from ``extract_body``.

    Args:
        dataframe: Table to display.

    Returns:
        The configured, non-interactive ``Leaderboard`` component.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    def link_choices(column):
        # A set removes duplicate cells; sorting gives a stable option order.
        pairs = {(extract_body(cell), cell) for cell in dataframe[column]}
        return sorted(pairs)

    category_options = [("馃煚 Fully FOSS", "馃煚"), ("馃敹 Proprietary", "馃敹")]
    non_benchmark = (
        " ",
        "Harness",
        "Model",
        "Harness License",
        "Model License",
        "Model Num Params (B)",
        "Precision",
    )
    score_columns = [name for name in dataframe.columns if name not in non_benchmark]
    model_options = link_choices("Model")
    harness_options = link_choices("Harness")

    column_picker = SelectColumns(
        default_selection=[" ", "Harness", "Model", *score_columns],
        label="Select Columns to Display:",
    )
    filters = [
        ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=category_options),
        ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_options),
        ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_options),
        ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
        ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
    ]

    return Leaderboard(
        value=dataframe,
        select_columns=column_picker,
        datatype="markdown",
        search_columns=["Harness", "Model"],
        filter_columns=filters,
        interactive=False,
    )

def init_benchmark_runs(dataframe):
    """Build the per-run ``Leaderboard`` component from *dataframe*.

    The table shows the category, model, harness, benchmark, score and
    average cost columns by default. Filter choices for ``Benchmark`` are
    the sorted, de-duplicated ``(label, cell value)`` pairs found in the
    data, where the label comes from ``extract_body``.

    Args:
        dataframe: Table of individual benchmark runs to display.

    Returns:
        The configured, non-interactive ``Leaderboard`` component.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        # Name the benchmark-runs table so the failure is not confused with
        # the main leaderboard table.
        raise ValueError("Benchmark runs DataFrame is empty or None.")

    # Make ColumnFilter choices
    label_choices = [("馃煚 Fully FOSS", "馃煚"), ("馃敹 Proprietary", "馃敹")]
    benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})

    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=[
                " ",
                "Model",
                "Harness",
                "Benchmark",
                "Score",
                "Avg Cost Per Task (USD)",
            ],
            label="Select Columns to Display:",
        ),
        datatype="markdown",
        search_columns=[
            "Benchmark",
            "Harness",
            "Model",
        ],
        filter_columns=[
            ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
            ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
            ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
            ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
        ],
        interactive=False,
    )

# Assemble the page: header banner, introduction text, then three tabs.
demo = gr.Blocks(theme="citrus")
with demo:
    # Header counts are computed from the per-run table.
    gr.HTML(build_header_html(BENCHMARK_RUN_DF))
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs():
        # Main leaderboard table.
        with gr.Tab("馃弳 Leaderboard"):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        # Table of individual benchmark runs.
        with gr.Tab("馃弮 Benchmark Runs"):
            benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)

        # Static markdown text.
        with gr.Tab("馃摑 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Call restart_space() every 1800 seconds (30 minutes) from a background
# scheduler. NOTE(review): presumably this is how the tables loaded at import
# time get refreshed -- confirm.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
# Enable the request queue with a default concurrency limit of 40, then start
# serving the app.
demo.queue(default_concurrency_limit=40).launch()