Add baseline qwen3.6

#2
by taagarwa - opened
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
- *.png filter=lfs diff=lfs merge=lfs -text
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 
.github/workflows/sync-to-hf-space-stage.yml DELETED
@@ -1,101 +0,0 @@
1
- name: Sync stage to HF Space (staging)
2
-
3
- # Mirrors every push to `stage` on GitHub into the HF Space git remote so
4
- # that the staging Space (https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage)
5
- # always tracks the stage branch.
6
- #
7
- # Required repository secrets (Settings -> Secrets and variables -> Actions):
8
- # HF_TOKEN Hugging Face access token with write permission to the Space.
9
- # HF_USERNAME Optional fallback username if token introspection fails.
10
-
11
- on:
12
- push:
13
- branches: [stage]
14
- workflow_dispatch:
15
-
16
- concurrency:
17
- group: sync-to-hf-space-stage
18
- cancel-in-progress: false
19
-
20
- jobs:
21
- mirror:
22
- runs-on: ubuntu-latest
23
- timeout-minutes: 10
24
- steps:
25
- - name: Checkout GitHub stage (full history + LFS)
26
- uses: actions/checkout@v4
27
- with:
28
- fetch-depth: 0
29
- lfs: true
30
-
31
- - name: Verify required secrets
32
- env:
33
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
34
- run: |
35
- if [ -z "$HF_TOKEN" ]; then
36
- echo "::error::HF_TOKEN repository secret must be set."
37
- exit 1
38
- fi
39
-
40
- - name: Ensure HF Space exists
41
- id: hf
42
- env:
43
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
44
- HF_USERNAME: ${{ secrets.HF_USERNAME }}
45
- run: |
46
- set -euo pipefail
47
- python -m pip install --quiet 'huggingface_hub>=0.24,<2'
48
- python - <<'PY'
49
- import os
50
-
51
- from huggingface_hub import HfApi
52
-
53
- token = os.environ["HF_TOKEN"]
54
- space_id = "taagarwa/coding-agent-leaderboard-stage"
55
- fallback_username = os.environ.get("HF_USERNAME", "").strip()
56
-
57
- api = HfApi(token=token)
58
- username = fallback_username
59
- try:
60
- info = api.whoami(token=token)
61
- username = str(info.get("name") or username).strip()
62
- except Exception as exc:
63
- if not username:
64
- raise RuntimeError("HF_USERNAME fallback is required when token introspection fails") from exc
65
-
66
- api.create_repo(
67
- repo_id=space_id,
68
- repo_type="space",
69
- space_sdk="docker",
70
- token=token,
71
- exist_ok=True,
72
- )
73
-
74
- with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output:
75
- output.write(f"username={username}\n")
76
- print(f"HF Space ready: {space_id}")
77
- PY
78
-
79
- - name: Push to HF Space remote
80
- env:
81
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
82
- HF_USERNAME: ${{ steps.hf.outputs.username }}
83
- run: |
84
- set -euo pipefail
85
- HF_REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage"
86
-
87
- git config user.name "github-actions[bot]"
88
- git config user.email "github-actions[bot]@users.noreply.github.com"
89
-
90
- echo "Pushing $(git rev-parse --short HEAD) to taagarwa/coding-agent-leaderboard-stage..."
91
- git push --force "${HF_REMOTE}" HEAD:main
92
- echo "Sync complete."
93
-
94
- - name: Summary
95
- if: success()
96
- run: |
97
- echo "### HF Space mirror (staging)" >> "$GITHUB_STEP_SUMMARY"
98
- echo "" >> "$GITHUB_STEP_SUMMARY"
99
- echo "Pushed \`$(git rev-parse --short HEAD)\` to \`taagarwa/coding-agent-leaderboard-stage\` Space." >> "$GITHUB_STEP_SUMMARY"
100
- echo "" >> "$GITHUB_STEP_SUMMARY"
101
- echo "View the Space: <https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage>" >> "$GITHUB_STEP_SUMMARY"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/workflows/sync-to-hf-space.yml DELETED
@@ -1,124 +0,0 @@
1
- name: Sync main to HF Space
2
-
3
- # Mirrors every push to `main` on GitHub into the HF Space git remote so
4
- # that the public coding-agent-leaderboard Space (https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard)
5
- # always tracks the source-of-truth repo.
6
- #
7
- # Required repository secrets (Settings -> Secrets and variables -> Actions):
8
- # HF_TOKEN Hugging Face access token with write permission to the Space.
9
- # Create at https://huggingface.co/settings/tokens
10
- # (token type "Write" is sufficient; no organization scope needed).
11
- # HF_USERNAME Optional fallback username if token introspection fails.
12
- #
13
- # Optional: set HF_SPACE_ID as a repo variable (not secret) to point the
14
- # workflow at a different Space; defaults to "taagarwa/coding-agent-leaderboard".
15
-
16
- on:
17
- push:
18
- branches: [main]
19
- # Manual dispatch lets you re-mirror the latest main on demand from
20
- # the Actions tab without pushing a new commit.
21
- workflow_dispatch:
22
-
23
- # Only one mirror job at a time so we never race ourselves into
24
- # non-fast-forward pushes on the Space remote.
25
- concurrency:
26
- group: sync-to-hf-space
27
- cancel-in-progress: false
28
-
29
- jobs:
30
- mirror:
31
- runs-on: ubuntu-latest
32
- timeout-minutes: 10
33
- steps:
34
- - name: Checkout GitHub main (full history + LFS)
35
- uses: actions/checkout@v4
36
- with:
37
- fetch-depth: 0
38
- lfs: true
39
-
40
- - name: Verify required secrets
41
- env:
42
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
43
- run: |
44
- if [ -z "$HF_TOKEN" ]; then
45
- echo "::error::HF_TOKEN repository secret must be set."
46
- echo " Create HF_TOKEN at https://huggingface.co/settings/tokens (type: Write)"
47
- exit 1
48
- fi
49
-
50
- - name: Ensure HF Space exists
51
- id: hf
52
- env:
53
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
54
- HF_USERNAME: ${{ secrets.HF_USERNAME }}
55
- HF_SPACE_ID: ${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}
56
- run: |
57
- set -euo pipefail
58
- python -m pip install --quiet 'huggingface_hub>=0.24,<2'
59
- python - <<'PY'
60
- import os
61
-
62
- from huggingface_hub import HfApi
63
-
64
- token = os.environ["HF_TOKEN"]
65
- space_id = os.environ["HF_SPACE_ID"]
66
- fallback_username = os.environ.get("HF_USERNAME", "").strip()
67
-
68
- api = HfApi(token=token)
69
- username = fallback_username
70
- try:
71
- info = api.whoami(token=token)
72
- username = str(info.get("name") or username).strip()
73
- except Exception as exc:
74
- if not username:
75
- raise RuntimeError("HF_USERNAME fallback is required when token introspection fails") from exc
76
-
77
- api.create_repo(
78
- repo_id=space_id,
79
- repo_type="space",
80
- space_sdk="docker",
81
- token=token,
82
- exist_ok=True,
83
- )
84
-
85
- with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output:
86
- output.write(f"username={username}\n")
87
- print(f"HF Space ready: {space_id}")
88
- PY
89
-
90
- - name: Push to HF Space remote
91
- env:
92
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
93
- HF_USERNAME: ${{ steps.hf.outputs.username }}
94
- HF_SPACE_ID: ${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}
95
- run: |
96
- set -euo pipefail
97
- # Authenticate via token in the URL. HF Spaces accept the
98
- # username + token basic-auth format over HTTPS git.
99
- HF_REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_ID}"
100
-
101
- # Configure identity for any metadata operations. The actual
102
- # commits come from GitHub unchanged; we only push refs.
103
- git config user.name "github-actions[bot]"
104
- git config user.email "github-actions[bot]@users.noreply.github.com"
105
-
106
- echo "Pushing $(git rev-parse --short HEAD) to ${HF_SPACE_ID}..."
107
-
108
- # --force is intentional: GitHub is the single source of truth
109
- # for the Space's git history. Anything on the Space side that
110
- # wasn't committed via GitHub is overwritten on the next sync.
111
- # This prevents the drift situation where someone edits files
112
- # in the HF Space UI and creates commits only visible there.
113
- git push --force "${HF_REMOTE}" HEAD:main
114
-
115
- echo "Sync complete."
116
-
117
- - name: Summary
118
- if: success()
119
- run: |
120
- echo "### HF Space mirror" >> "$GITHUB_STEP_SUMMARY"
121
- echo "" >> "$GITHUB_STEP_SUMMARY"
122
- echo "Pushed \`$(git rev-parse --short HEAD)\` to \`${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}\` Space." >> "$GITHUB_STEP_SUMMARY"
123
- echo "" >> "$GITHUB_STEP_SUMMARY"
124
- echo "View the Space: <https://huggingface.co/spaces/${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}>" >> "$GITHUB_STEP_SUMMARY"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,181 +1,73 @@
1
  import os
2
- import re
3
- from pathlib import Path
4
-
5
-
6
- def patch_gradio_leaderboard():
7
- """Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x."""
8
- import gradio_leaderboard
9
- pkg_dir = Path(gradio_leaderboard.__file__).parent
10
- js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
11
- if not js_file.exists():
12
- return
13
-
14
- src = js_file.read_text()
15
-
16
- patches = [
17
- # Fix 1 & 2: Guard r[39]/a[39] filter callback (undefined during Svelte outro)
18
- (
19
- 'r[0].filter(\n /*func*/\n r[39]\n ).map(qd)',
20
- '(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
21
- ),
22
- (
23
- 'a[0].filter(\n /*func*/\n a[39]\n ).map(qd))',
24
- '(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
25
- ),
26
- # Fix 3: Lx (Boolean) extracted from Rx (globals) which is undefined in Gradio 5
27
- (
28
- '{ Boolean: Lx } = Rx,',
29
- 'Lx = (Rx && Rx.Boolean) || Boolean,',
30
- ),
31
- ]
32
-
33
- patched = False
34
- for old, new in patches:
35
- if old in src:
36
- src = src.replace(old, new)
37
- patched = True
38
-
39
- if patched:
40
- js_file.write_text(src)
41
-
42
-
43
- patch_gradio_leaderboard()
44
 
45
  import gradio as gr
46
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
47
  from apscheduler.schedulers.background import BackgroundScheduler
48
  from huggingface_hub import HfApi
49
 
50
- from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
51
  from src.display.text_blocks import (
 
52
  INTRODUCTION_TEXT,
53
  LLM_BENCHMARKS_TEXT,
 
 
54
  )
55
 
56
  REPO_ID = "taagarwa/coding-agent-leaderboard"
57
  TOKEN = os.environ.get("HF_TOKEN")
58
  API = HfApi(token=TOKEN)
59
 
 
60
  def restart_space():
61
  API.restart_space(repo_id=REPO_ID)
62
 
63
 
64
  LEADERBOARD_DF = get_leaderboard_df()
65
- BENCHMARK_RUN_DF = get_benchmark_run_df()
66
 
67
- def extract_body(s: str):
68
- return re.match(r'\[(.*?)\]', s).group(1)
69
 
70
-
71
- def build_header_html(df):
72
- n_results = len(df)
73
- n_models = df["Model"].nunique()
74
- n_harnesses = df["Harness"].apply(lambda s: extract_body(s)).nunique()
75
- n_benchmarks = df["Benchmark"].apply(lambda s: extract_body(s)).nunique()
76
-
77
- return f"""
78
- <base target="_blank">
79
- <div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
80
- <h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
81
- Coding Agent Leaderboard
82
- </h1>
83
- <div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
84
- <p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
85
- Compare coding agents across models and harnesses
86
- </p>
87
- <div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
88
- <span style="font-weight: 600;">{n_results} Results</span>
89
- <span>·</span>
90
- <span style="font-weight: 600;">{n_models} Models</span>
91
- <span>·</span>
92
- <span style="font-weight: 600;">{n_harnesses} Harnesses</span>
93
- <span>·</span>
94
- <span style="font-weight: 600;">{n_benchmarks} Benchmarks</span>
95
- </div>
96
- </div>
97
- """
98
-
99
  def init_leaderboard(dataframe):
100
  if dataframe is None or dataframe.empty:
101
  raise ValueError("Leaderboard DataFrame is empty or None.")
102
-
103
- label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
104
- meta_columns = [" ", "Harness", "Model", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
105
- benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
106
- model_choices = sorted({(extract_body(v), v) for v in dataframe["Model"]})
107
- harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
108
-
109
- default_columns = [" ", "Harness", "Model"] + benchmark_columns
110
  return Leaderboard(
111
  value=dataframe,
112
  select_columns=SelectColumns(
113
- default_selection=default_columns,
114
  label="Select Columns to Display:",
115
  ),
116
- datatype="markdown",
117
- search_columns=["Harness", "Model"],
118
  filter_columns=[
119
- ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
120
- ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_choices),
121
- ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
122
- ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
123
- ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
124
  ],
125
  interactive=False,
126
  )
127
 
128
- def init_benchmark_runs(dataframe):
129
- if dataframe is None or dataframe.empty:
130
- raise ValueError("Leaderboard DataFrame is empty or None.")
131
-
132
- # Make ColumnFilter choices
133
- label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
134
- benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})
135
-
136
- return Leaderboard(
137
- value=dataframe,
138
- select_columns=SelectColumns(
139
- default_selection=[
140
- " ",
141
- "Model",
142
- "Harness",
143
- "Benchmark",
144
- "Score",
145
- "Avg Cost Per Task (USD)",
146
- ],
147
- label="Select Columns to Display:",
148
- ),
149
- datatype="markdown",
150
- search_columns=[
151
- "Benchmark",
152
- "Harness",
153
- "Model",
154
- ],
155
- filter_columns=[
156
- ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
157
- ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
158
- ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
159
- ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
160
- ],
161
- interactive=False,
162
- )
163
 
164
- demo = gr.Blocks(theme="citrus")
165
  with demo:
166
- gr.HTML(build_header_html(BENCHMARK_RUN_DF))
167
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
168
 
169
- with gr.Tabs():
170
- with gr.Tab("🏆 Leaderboard"):
171
  leaderboard = init_leaderboard(LEADERBOARD_DF)
 
172
 
173
- with gr.Tab("🏃 Benchmark Runs"):
174
- benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)
175
-
176
- with gr.Tab("📝 About"):
177
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
178
 
 
 
 
 
 
 
 
 
 
 
179
  scheduler = BackgroundScheduler()
180
  scheduler.add_job(restart_space, "interval", seconds=1800)
181
  scheduler.start()
 
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import gradio as gr
4
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import HfApi
7
 
8
+ from src.leaderboard import get_leaderboard_df, DISPLAY_BY_DEFAULT, SEARCH_COLUMNS
9
  from src.display.text_blocks import (
10
+ TITLE,
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
+ CITATION_BUTTON_LABEL,
14
+ CITATION_BUTTON_TEXT,
15
  )
16
 
17
  REPO_ID = "taagarwa/coding-agent-leaderboard"
18
  TOKEN = os.environ.get("HF_TOKEN")
19
  API = HfApi(token=TOKEN)
20
 
21
+
22
  def restart_space():
23
  API.restart_space(repo_id=REPO_ID)
24
 
25
 
26
  LEADERBOARD_DF = get_leaderboard_df()
 
27
 
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def init_leaderboard(dataframe):
30
  if dataframe is None or dataframe.empty:
31
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
 
 
 
 
 
 
 
32
  return Leaderboard(
33
  value=dataframe,
34
  select_columns=SelectColumns(
35
+ default_selection=DISPLAY_BY_DEFAULT,
36
  label="Select Columns to Display:",
37
  ),
38
+ search_columns=SEARCH_COLUMNS,
 
39
  filter_columns=[
40
+ ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
41
+ ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
42
+ ColumnFilter(label="Precision", column="precision", type="checkboxgroup"),
 
 
43
  ],
44
  interactive=False,
45
  )
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ demo = gr.Blocks()
49
  with demo:
50
+ gr.HTML(TITLE)
51
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
52
 
53
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
54
+ with gr.TabItem("🏅 Coding Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
55
  leaderboard = init_leaderboard(LEADERBOARD_DF)
56
+ gr.Markdown("\* `internal` refers to internal benchmarks performed by the model provider where the harness/environment were not made public")
57
 
58
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
 
 
 
59
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
60
 
61
+ with gr.Row():
62
+ with gr.Accordion("📙 Citation", open=False):
63
+ citation_button = gr.Textbox(
64
+ value=CITATION_BUTTON_TEXT,
65
+ label=CITATION_BUTTON_LABEL,
66
+ lines=20,
67
+ elem_id="citation-button",
68
+ show_copy_button=True,
69
+ )
70
+
71
  scheduler = BackgroundScheduler()
72
  scheduler.add_job(restart_space, "interval", seconds=1800)
73
  scheduler.start()
requirements.txt CHANGED
@@ -3,7 +3,7 @@ black
3
  datasets
4
  gradio
5
  gradio[oauth]
6
- gradio_leaderboard
7
  gradio_client
8
  huggingface-hub>=0.18.0
9
  matplotlib
 
3
  datasets
4
  gradio
5
  gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
  gradio_client
8
  huggingface-hub>=0.18.0
9
  matplotlib
results/qwen3-6-35b-internal.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "name": "swe-bench-verified",
4
+ "repo": "SWE-bench/SWE-bench_Verified",
5
+ "num_tasks": 500
6
+ },
7
+ "harness": {
8
+ "name": "internal",
9
+ "skills": []
10
+ },
11
+ "model": {
12
+ "name": "Qwen3.6-35B-A3B",
13
+ "repo": "Qwen/Qwen3.6-35B-A3B",
14
+ "is_oss": true,
15
+ "num_params": 35,
16
+ "precision": "bf16"
17
+ },
18
+ "environment": {
19
+ "name": "internal"
20
+ },
21
+ "metrics": {
22
+ "score": 0.734,
23
+ "time": null,
24
+ "costUSD": null
25
+ }
26
+ }
results/{swe-bench-verified-claude-sonnet-4-6-claude-code.json → qwen3-6-35b-nvfp4-claude-code.json} RENAMED
@@ -1,23 +1,19 @@
1
  {
2
- "benchmark": {
3
  "name": "swe-bench-verified",
4
  "repo": "SWE-bench/SWE-bench_Verified",
5
- "num_tasks": 500,
6
- "url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
7
  },
8
  "harness": {
9
- "name": "Claude Code",
10
- "skills": [],
11
- "is_oss": false,
12
- "url": "https://github.com/anthropics/claude-code"
13
  },
14
  "model": {
15
- "name": "Sonnet 4.6",
16
- "repo": "Sonnet 4.6",
17
  "is_oss": true,
18
- "num_params": 1000,
19
- "precision": "bf16",
20
- "url": "https://www.anthropic.com/news/claude-sonnet-4-6"
21
  },
22
  "environment": {
23
  "name": "harbor",
@@ -33,10 +29,11 @@
33
  "task_names": null,
34
  "exclude_task_names": null,
35
  "n_tasks": null
36
- },
37
- "url": "https://github.com/harbor-framework/harbor"
38
  },
39
  "metrics": {
40
- "score": 0.796
 
 
41
  }
42
  }
 
1
  {
2
+ "dataset": {
3
  "name": "swe-bench-verified",
4
  "repo": "SWE-bench/SWE-bench_Verified",
5
+ "num_tasks": 500
 
6
  },
7
  "harness": {
8
+ "name": "claude-code",
9
+ "skills": []
 
 
10
  },
11
  "model": {
12
+ "name": "Qwen3.6-35B-A3B",
13
+ "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
14
  "is_oss": true,
15
+ "num_params": 35,
16
+ "precision": "nvfp4"
 
17
  },
18
  "environment": {
19
  "name": "harbor",
 
29
  "task_names": null,
30
  "exclude_task_names": null,
31
  "n_tasks": null
32
+ }
 
33
  },
34
  "metrics": {
35
+ "score": 0.632,
36
+ "time": 21600,
37
+ "costUSD": 48.00
38
  }
39
  }
results/swe-bench-pro--ansible-claude-sonnet-4-6-claude-code.json DELETED
@@ -1,60 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-pro--ansible",
4
- "repo": "ScaleAI/SWE-bench_Pro",
5
- "num_tasks": 96,
6
- "url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
7
- },
8
- "harness": {
9
- "name": "Claude Code",
10
- "skills": [],
11
- "is_oss": false,
12
- "url": "https://github.com/anthropics/claude-code"
13
- },
14
- "model": {
15
- "name": "Sonnet 4.6",
16
- "repo": "Sonnet 4.6",
17
- "is_oss": true,
18
- "num_params": 1000,
19
- "precision": "bf16",
20
- "url": "https://www.anthropic.com/news/claude-sonnet-4-6"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "scale-ai/swe-bench-pro",
27
- "version": null,
28
- "ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": [
34
- "*ansible*"
35
- ],
36
- "exclude_task_names": null,
37
- "n_tasks": null
38
- },
39
- "url": "https://github.com/harbor-framework/harbor"
40
- },
41
- "metrics": {
42
- "n_tasks": 96,
43
- "n_errors": 1,
44
- "score": 0.5,
45
- "n_input_tokens": 190672390,
46
- "n_cache_tokens": 184409111,
47
- "n_output_tokens": 1593112,
48
- "n_total_tokens": 376674613,
49
- "agent_time_seconds": 40527,
50
- "total_time_seconds": 49734,
51
- "cost_usd": 184.42824125000004,
52
- "mean_input_tokens_per_task": 1986170,
53
- "mean_cache_tokens_per_task": 1920928,
54
- "mean_output_tokens_per_task": 16594,
55
- "mean_tokens_per_task": 3923693,
56
- "mean_cost_usd_per_task": 1.92,
57
- "mean_total_time_seconds_per_task": 518,
58
- "mean_agent_time_seconds_per_task": 422
59
- }
60
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-claude-code.json DELETED
@@ -1,60 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-pro--ansible",
4
- "repo": "ScaleAI/SWE-bench_Pro",
5
- "num_tasks": 96,
6
- "url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
7
- },
8
- "harness": {
9
- "name": "Claude Code",
10
- "skills": [],
11
- "is_oss": false,
12
- "url": "https://github.com/anthropics/claude-code"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "scale-ai/swe-bench-pro",
27
- "version": null,
28
- "ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": [
34
- "*ansible*"
35
- ],
36
- "exclude_task_names": null,
37
- "n_tasks": null
38
- },
39
- "url": "https://github.com/harbor-framework/harbor"
40
- },
41
- "metrics": {
42
- "n_tasks": 96,
43
- "n_errors": 6,
44
- "score": 0.458,
45
- "n_input_tokens": 367897697,
46
- "n_cache_tokens": 0,
47
- "n_output_tokens": 1694885,
48
- "n_total_tokens": 369592582,
49
- "agent_time_seconds": 39024,
50
- "total_time_seconds": 46758,
51
- "cost_usd": 9.64,
52
- "mean_input_tokens_per_task": 3832267,
53
- "mean_cache_tokens_per_task": 0,
54
- "mean_output_tokens_per_task": 17655,
55
- "mean_tokens_per_task": 3849922,
56
- "mean_cost_usd_per_task": 0.1,
57
- "mean_total_time_seconds_per_task": 487,
58
- "mean_agent_time_seconds_per_task": 406
59
- }
60
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-opencode.json DELETED
@@ -1,60 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-pro--ansible",
4
- "repo": "ScaleAI/SWE-bench_Pro",
5
- "num_tasks": 96,
6
- "url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
7
- },
8
- "harness": {
9
- "name": "OpenCode",
10
- "skills": [],
11
- "is_oss": true,
12
- "url": "https://github.com/anomalyco/opencode"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "scale-ai/swe-bench-pro",
27
- "version": null,
28
- "ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": [
34
- "*ansible*"
35
- ],
36
- "exclude_task_names": null,
37
- "n_tasks": null
38
- },
39
- "url": "https://github.com/harbor-framework/harbor"
40
- },
41
- "metrics": {
42
- "n_tasks": 96,
43
- "n_errors": 4,
44
- "score": 0.375,
45
- "n_input_tokens": 207164679,
46
- "n_cache_tokens": 0,
47
- "n_output_tokens": 1598703,
48
- "n_total_tokens": 208763382,
49
- "agent_time_seconds": 49450,
50
- "total_time_seconds": 57287,
51
- "cost_usd": 12.21,
52
- "mean_input_tokens_per_task": 2157965,
53
- "mean_cache_tokens_per_task": 0,
54
- "mean_output_tokens_per_task": 16653,
55
- "mean_tokens_per_task": 2174618,
56
- "mean_cost_usd_per_task": 0.13,
57
- "mean_total_time_seconds_per_task": 596,
58
- "mean_agent_time_seconds_per_task": 515
59
- }
60
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-pro--ansible-qwen3-6-36b-nvfp4-pi.json DELETED
@@ -1,60 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-pro--ansible",
4
- "repo": "ScaleAI/SWE-bench_Pro",
5
- "num_tasks": 96,
6
- "url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
7
- },
8
- "harness": {
9
- "name": "Pi",
10
- "skills": [],
11
- "is_oss": true,
12
- "url": "https://github.com/earendil-works/pi/tree/main"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "scale-ai/swe-bench-pro",
27
- "version": null,
28
- "ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": [
34
- "*ansible*"
35
- ],
36
- "exclude_task_names": null,
37
- "n_tasks": null
38
- },
39
- "url": "https://github.com/harbor-framework/harbor"
40
- },
41
- "metrics": {
42
- "n_tasks": 96,
43
- "n_errors": 1,
44
- "score": 0.479,
45
- "n_input_tokens": 742491363,
46
- "n_cache_tokens": 0,
47
- "n_output_tokens": 2387609,
48
- "n_total_tokens": 744878972,
49
- "agent_time_seconds": 54543,
50
- "total_time_seconds": 62422,
51
- "cost_usd": 13.47,
52
- "mean_input_tokens_per_task": 7734285,
53
- "mean_cache_tokens_per_task": 0,
54
- "mean_output_tokens_per_task": 24870,
55
- "mean_tokens_per_task": 7759155,
56
- "mean_cost_usd_per_task": 0.14,
57
- "mean_total_time_seconds_per_task": 650,
58
- "mean_agent_time_seconds_per_task": 568
59
- }
60
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-verified-qwen3-6-35b-nvfp4-claude-code.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-verified",
4
- "repo": "SWE-bench/SWE-bench_Verified",
5
- "num_tasks": 500,
6
- "url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
7
- },
8
- "harness": {
9
- "name": "Claude Code",
10
- "skills": [],
11
- "is_oss": false,
12
- "url": "https://github.com/anthropics/claude-code"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "swe-bench/swe-bench-verified",
27
- "version": null,
28
- "ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": null,
34
- "exclude_task_names": null,
35
- "n_tasks": null
36
- },
37
- "url": "https://github.com/harbor-framework/harbor"
38
- },
39
- "metrics": {
40
- "n_tasks": 500,
41
- "n_errors": 1,
42
- "score": 0.632,
43
- "n_input_tokens": 1106618897,
44
- "n_cache_tokens": 0,
45
- "n_output_tokens": 5733245,
46
- "n_total_tokens": 1112352142,
47
- "agent_time_seconds": 122808,
48
- "total_time_seconds": 171897,
49
- "cost_usd": 34.11,
50
- "mean_input_tokens_per_task": 2213237,
51
- "mean_cache_tokens_per_task": 0,
52
- "mean_output_tokens_per_task": 11466,
53
- "mean_tokens_per_task": 2224704,
54
- "mean_cost_usd_per_task": 0.07,
55
- "mean_total_time_seconds_per_task": 343,
56
- "mean_agent_time_seconds_per_task": 245
57
- }
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-verified-qwen3-6-35b-nvfp4-opencode.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-verified",
4
- "repo": "SWE-bench/SWE-bench_Verified",
5
- "num_tasks": 500,
6
- "url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
7
- },
8
- "harness": {
9
- "name": "OpenCode",
10
- "skills": [],
11
- "is_oss": true,
12
- "url": "https://github.com/anomalyco/opencode"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "swe-bench/swe-bench-verified",
27
- "version": null,
28
- "ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": null,
34
- "exclude_task_names": null,
35
- "n_tasks": null,
36
- "accelerated_images": true
37
- },
38
- "url": "https://github.com/harbor-framework/harbor"
39
- },
40
- "metrics": {
41
- "n_tasks": 500,
42
- "n_errors": 4,
43
- "score": 0.548,
44
- "n_input_tokens": 469806650,
45
- "n_cache_tokens": 0,
46
- "n_output_tokens": 4937761,
47
- "n_total_tokens": 474744411,
48
- "agent_time_seconds": 120473,
49
- "total_time_seconds": 185168,
50
- "cost_usd": 29.75,
51
- "mean_input_tokens_per_task": 939613,
52
- "mean_cache_tokens_per_task": 0,
53
- "mean_output_tokens_per_task": 9875,
54
- "mean_tokens_per_task": 949488,
55
- "mean_cost_usd_per_task": 0.06,
56
- "mean_total_time_seconds_per_task": 370,
57
- "mean_agent_time_seconds_per_task": 240
58
- }
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/swe-bench-verified-qwen3-6-36b-nvfp4-pi.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "benchmark": {
3
- "name": "swe-bench-verified",
4
- "repo": "SWE-bench/SWE-bench_Verified",
5
- "num_tasks": 500,
6
- "url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
7
- },
8
- "harness": {
9
- "name": "Pi",
10
- "skills": [],
11
- "is_oss": true,
12
- "url": "https://github.com/earendil-works/pi/tree/main"
13
- },
14
- "model": {
15
- "name": "Qwen3.6-35B-A3B",
16
- "repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
17
- "is_oss": true,
18
- "num_params": 35,
19
- "precision": "nvfp4",
20
- "url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
21
- },
22
- "environment": {
23
- "name": "harbor",
24
- "config": {
25
- "path": null,
26
- "name": "swe-bench/swe-bench-verified",
27
- "version": null,
28
- "ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
29
- "registry_url": null,
30
- "registry_path": null,
31
- "overwrite": false,
32
- "download_dir": null,
33
- "task_names": null,
34
- "exclude_task_names": null,
35
- "n_tasks": null,
36
- "accelerated_images": true
37
- },
38
- "url": "https://github.com/harbor-framework/harbor"
39
- },
40
- "metrics": {
41
- "n_tasks": 500,
42
- "n_errors": 6,
43
- "score": 0.65,
44
- "n_input_tokens": 791183735,
45
- "n_cache_tokens": 0,
46
- "n_output_tokens": 6333798,
47
- "n_total_tokens": 797517533,
48
- "agent_time_seconds": 154531,
49
- "total_time_seconds": 218988,
50
- "cost_usd": 38.16,
51
- "mean_input_tokens_per_task": 1582367,
52
- "mean_cache_tokens_per_task": 0,
53
- "mean_output_tokens_per_task": 12667,
54
- "mean_tokens_per_task": 1595035,
55
- "mean_cost_usd_per_task": 0.08,
56
- "mean_total_time_seconds_per_task": 437,
57
- "mean_agent_time_seconds_per_task": 309
58
- }
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/text_blocks.py CHANGED
@@ -1,34 +1,19 @@
 
 
1
  INTRODUCTION_TEXT = """
2
- A **Coding Agent** is more than just a model - it's the combination of a **Model** and a **Harness** (the tool/framework driving the model).
3
- This leaderboard tracks how these components work together, because the same model can perform very differently depending on the harness it's paired with.
4
  """
5
 
6
  LLM_BENCHMARKS_TEXT = """
7
- ## What is a Coding Agent?
8
-
9
- A coding agent is a system that autonomously solves software engineering tasks - reading code, reasoning about bugs, and writing patches. Its performance depends on two components:
10
-
11
- - **Model** - The underlying language model (e.g. Claude Opus 4.7, Qwen3.6-35B)
12
- - **Harness** - The framework or tool that orchestrates the model's actions (e.g. Claude Code, OpenCode, Pi)
13
 
14
- ## How to Read the Table
15
 
16
- | Column | Description |
17
- |--------|-------------|
18
- | **Benchmark** | The benchmark used for evaluation (e.g. SWE-bench Verified - 500 real GitHub issues) |
19
- | **Harness** | The agent framework driving the model. |
20
- | **Model** | The language model being evaluated |
21
- | **Skills** | The set of instructions guiding the agent's behavior |
22
- | **Score** | Outcome of the benchmark, often the fraction of tasks solved correctly (higher is better) |
23
- | **Precision** | Model weight format (e.g. bf16, fp4) - affects speed, memory footprint, and quality |
24
 
25
- ## Key Concepts
26
-
27
- - **FOSS vs Proprietary** - Filters let you compare fully open-source agents against proprietary ones. A FOSS model with a FOSS harness means anyone can reproduce the result
28
- - **Skills** - Some harnesses augment the model with extra capabilities (tools, retrieval, etc.). Listed in the "skills" column when present
29
- - **Internal results (`*`)** - Benchmarks run by the model provider where the harness and environment were not made public. These are useful reference points but are not independently reproducible
30
 
31
- ## Learn More
32
 
33
- Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
34
- """
 
1
+ TITLE = """<h1 align="center" id="space-title">Coding Agent Leaderboard</h1>"""
2
+
3
  INTRODUCTION_TEXT = """
4
+ Welcome to the Coding Agent Leaderboard!
 
5
  """
6
 
7
  LLM_BENCHMARKS_TEXT = """
8
+ ## About
 
 
 
 
 
9
 
10
+ Evaluate and compare Coding Agents.
11
 
12
+ Coding Agent = Model + Harness + Skills.
 
 
 
 
 
 
 
13
 
14
+ Visit our [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for more details about the project.
15
+ """
 
 
 
16
 
17
+ CITATION_BUTTON_TEXT = "TBD"
18
 
19
+ CITATION_BUTTON_LABEL = "Citation"
 
src/leaderboard.py CHANGED
@@ -2,10 +2,27 @@ from pathlib import Path
2
  import json
3
  import pandas as pd
4
 
5
- from src.models import Result, Model, Harness
6
 
7
  RESULTS_DIR = Path(__file__).parent.parent / "results"
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def format_time(seconds: int):
10
  if seconds is None:
11
  return None
@@ -14,9 +31,6 @@ def format_time(seconds: int):
14
  return f"{h}h{m}m{s}s"
15
 
16
 
17
- def get_benchmark_names(results: list[Result]):
18
- return {r.benchmark.name for r in results}
19
-
20
  def get_leaderboard_df():
21
  results: list[Result] = []
22
  for file in RESULTS_DIR.glob("*.json"):
@@ -24,73 +38,25 @@ def get_leaderboard_df():
24
  data = json.load(f)
25
  result = Result(**data)
26
  results.append(result)
27
-
28
- # Collect benchmark scores for each model-harness pair, and convert to percent out of 100
29
- benchmark_lookup: dict[tuple[str, str], dict[str, float]] = {}
30
- model_lookup: dict[str, Model] = {}
31
- harness_lookup: dict[str, Harness] = {}
32
- for result in results:
33
- pair = (result.model.repo, result.harness.name)
34
- harness_lookup[result.harness.name] = result.harness
35
- model_lookup[result.model.repo] = result.model
36
- if pair not in benchmark_lookup:
37
- benchmark_lookup[pair] = {}
38
- benchmark_lookup[pair][result.benchmark.name] = round(result.metrics.score * 100, 1)
39
-
40
- # Collect results into df rows
41
- rows = []
42
- benchmark_names = get_benchmark_names(results=results)
43
- for pair, benchmarks in benchmark_lookup.items():
44
- model = model_lookup[pair[0]]
45
- harness = harness_lookup[pair[1]]
46
- avg_score = sum(benchmarks.values()) / len(benchmarks)
47
- row = {
48
- " ": "🟠" if model.is_oss and harness.is_oss else "🔶",
49
- "Model": f'[{model.repo}]({model.url})',
50
- "Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
51
- "Precision": model.precision,
52
- "Model License": "FOSS" if model.is_oss else "Proprietary",
53
- "Harness License": "FOSS" if harness.is_oss else "Proprietary",
54
- "Model Num Params (B)": model.num_params,
55
- "Avg Score": round(avg_score, 1),
56
- }
57
- for benchmark_name in sorted(benchmark_names, key=lambda x: (0 if x == "swe-bench-verified" else 1)):
58
- row[benchmark_name] = benchmarks.get(benchmark_name, "")
59
- rows.append(row)
60
-
61
- leaderboard_df = pd.DataFrame(rows).sort_values("Avg Score", ascending=False).fillna("")
62
- return leaderboard_df
63
-
64
-
65
- def get_benchmark_run_df():
66
- results: list[Result] = []
67
- for file in RESULTS_DIR.glob("*.json"):
68
- with open(file, "r") as f:
69
- data = json.load(f)
70
- result = Result(**data)
71
- results.append(result)
72
 
73
  rows = []
74
  for result in results:
75
  rows.append(
76
  {
77
- " ": "🟠" if result.model.is_oss and result.harness.is_oss else "🔶",
78
- "Model": f'[{result.model.repo}]({result.model.url})',
79
- "Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
80
- "Benchmark": f'[{result.benchmark.name}]({result.benchmark.url})',
81
- "Base Model": result.model.name,
82
- "Precision": result.model.precision,
83
- "Skills": str(result.harness.skills) if result.harness.skills else "None",
84
- "Score": round(result.metrics.score * 100, 1),
85
- "Avg Cost Per Task (USD)": result.metrics.mean_cost_usd_per_task,
86
- "Avg Seconds Per Task": result.metrics.mean_agent_time_seconds_per_task,
87
- "Avg Input Tokens Per Task": result.metrics.mean_input_tokens_per_task,
88
- "Avg Output Tokens Per Task": result.metrics.mean_output_tokens_per_task,
89
- "Model License": "FOSS" if result.model.is_oss else "Proprietary",
90
- "Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
91
- "Model Num Params (B)": result.model.num_params,
92
  }
93
  )
94
 
95
- benchmark_run_df = pd.DataFrame(rows).sort_values(by=["Benchmark", "Score"], ascending=False).fillna("")
96
- return benchmark_run_df
 
2
  import json
3
  import pandas as pd
4
 
5
+ from src.models import Result
6
 
7
  RESULTS_DIR = Path(__file__).parent.parent / "results"
8
 
9
+ DISPLAY_BY_DEFAULT = [
10
+ "dataset",
11
+ "model",
12
+ "precision",
13
+ "harness",
14
+ "skills",
15
+ "environment",
16
+ "score",
17
+ ]
18
+
19
+ SEARCH_COLUMNS = [
20
+ "dataset",
21
+ "model",
22
+ "harness",
23
+ ]
24
+
25
+
26
  def format_time(seconds: int):
27
  if seconds is None:
28
  return None
 
31
  return f"{h}h{m}m{s}s"
32
 
33
 
 
 
 
34
  def get_leaderboard_df():
35
  results: list[Result] = []
36
  for file in RESULTS_DIR.glob("*.json"):
 
38
  data = json.load(f)
39
  result = Result(**data)
40
  results.append(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  rows = []
43
  for result in results:
44
  rows.append(
45
  {
46
+ "dataset": result.dataset.name,
47
+ "model": result.model.name,
48
+ "model_id": result.model.repo,
49
+ "precision": result.model.precision,
50
+ "harness": result.harness.name,
51
+ "skills": str(result.harness.skills) if result.harness.skills else "None",
52
+ "environment": result.environment.name,
53
+ "score": result.metrics.score,
54
+ "costUSD": result.metrics.costUSD,
55
+ "time": format_time(result.metrics.time),
56
+ "model_is_oss": result.model.is_oss,
57
+ "model_num_params": result.model.num_params,
 
 
 
58
  }
59
  )
60
 
61
+ leaderboard_df = pd.DataFrame(rows)
62
+ return leaderboard_df
src/models.py CHANGED
@@ -3,18 +3,15 @@ from typing import Any, Optional
3
  from pydantic import BaseModel
4
 
5
 
6
- class Benchmark(BaseModel):
7
  name: str
8
  repo: str
9
  num_tasks: int
10
- url: str
11
 
12
 
13
  class Harness(BaseModel):
14
  name: str
15
  skills: list[str]
16
- is_oss: bool
17
- url: str
18
 
19
 
20
  class Model(BaseModel):
@@ -23,38 +20,21 @@ class Model(BaseModel):
23
  is_oss: bool
24
  num_params: int
25
  precision: str
26
- url: str
27
 
28
 
29
  class Environment(BaseModel):
30
  name: str
31
  config: Optional[dict[str, Any]] = None
32
- url: str
33
 
34
 
35
  class Metrics(BaseModel):
36
-
37
  score: float
38
- n_tasks: Optional[int] = None
39
- n_errors: Optional[int] = None
40
- n_input_tokens: Optional[int] = None
41
- n_cache_tokens: Optional[int] = None
42
- n_output_tokens: Optional[int] = None
43
- n_total_tokens: Optional[int] = None
44
- total_time_seconds: Optional[int] = None
45
- agent_time_seconds: Optional[int] = None
46
- cost_usd: Optional[float] = None
47
- mean_input_tokens_per_task: Optional[int] = None
48
- mean_cache_tokens_per_task: Optional[int] = None
49
- mean_output_tokens_per_task: Optional[int] = None
50
- mean_tokens_per_task: Optional[int] = None
51
- mean_cost_usd_per_task: Optional[float] = None
52
- mean_total_time_seconds_per_task: Optional[int] = None
53
- mean_agent_time_seconds_per_task: Optional[int] = None
54
 
55
 
56
  class Result(BaseModel):
57
- benchmark: Benchmark
58
  harness: Harness
59
  model: Model
60
  environment: Environment
 
3
  from pydantic import BaseModel
4
 
5
 
6
+ class Dataset(BaseModel):
7
  name: str
8
  repo: str
9
  num_tasks: int
 
10
 
11
 
12
  class Harness(BaseModel):
13
  name: str
14
  skills: list[str]
 
 
15
 
16
 
17
  class Model(BaseModel):
 
20
  is_oss: bool
21
  num_params: int
22
  precision: str
 
23
 
24
 
25
  class Environment(BaseModel):
26
  name: str
27
  config: Optional[dict[str, Any]] = None
 
28
 
29
 
30
  class Metrics(BaseModel):
 
31
  score: float
32
+ time: Optional[int] = None
33
+ costUSD: Optional[float] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  class Result(BaseModel):
37
+ dataset: Dataset
38
  harness: Harness
39
  model: Model
40
  environment: Environment