Add baseline qwen3.6
#2
by taagarwa - opened
- .gitattributes +0 -1
- .github/workflows/sync-to-hf-space-stage.yml +0 -101
- .github/workflows/sync-to-hf-space.yml +0 -124
- app.py +26 -134
- requirements.txt +1 -1
- results/qwen3-6-35b-internal.json +26 -0
- results/{swe-bench-verified-claude-sonnet-4-6-claude-code.json → qwen3-6-35b-nvfp4-claude-code.json} +12 -15
- results/swe-bench-pro--ansible-claude-sonnet-4-6-claude-code.json +0 -60
- results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-claude-code.json +0 -60
- results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-opencode.json +0 -60
- results/swe-bench-pro--ansible-qwen3-6-36b-nvfp4-pi.json +0 -60
- results/swe-bench-verified-qwen3-6-35b-nvfp4-claude-code.json +0 -58
- results/swe-bench-verified-qwen3-6-35b-nvfp4-opencode.json +0 -59
- results/swe-bench-verified-qwen3-6-36b-nvfp4-pi.json +0 -59
- src/display/text_blocks.py +10 -25
- src/leaderboard.py +32 -66
- src/models.py +4 -24
.gitattributes
CHANGED
|
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
|
|
.github/workflows/sync-to-hf-space-stage.yml
DELETED
|
@@ -1,101 +0,0 @@
|
|
| 1 |
-
name: Sync stage to HF Space (staging)
|
| 2 |
-
|
| 3 |
-
# Mirrors every push to `stage` on GitHub into the HF Space git remote so
|
| 4 |
-
# that the staging Space (https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage)
|
| 5 |
-
# always tracks the stage branch.
|
| 6 |
-
#
|
| 7 |
-
# Required repository secrets (Settings -> Secrets and variables -> Actions):
|
| 8 |
-
# HF_TOKEN Hugging Face access token with write permission to the Space.
|
| 9 |
-
# HF_USERNAME Optional fallback username if token introspection fails.
|
| 10 |
-
|
| 11 |
-
on:
|
| 12 |
-
push:
|
| 13 |
-
branches: [stage]
|
| 14 |
-
workflow_dispatch:
|
| 15 |
-
|
| 16 |
-
concurrency:
|
| 17 |
-
group: sync-to-hf-space-stage
|
| 18 |
-
cancel-in-progress: false
|
| 19 |
-
|
| 20 |
-
jobs:
|
| 21 |
-
mirror:
|
| 22 |
-
runs-on: ubuntu-latest
|
| 23 |
-
timeout-minutes: 10
|
| 24 |
-
steps:
|
| 25 |
-
- name: Checkout GitHub stage (full history + LFS)
|
| 26 |
-
uses: actions/checkout@v4
|
| 27 |
-
with:
|
| 28 |
-
fetch-depth: 0
|
| 29 |
-
lfs: true
|
| 30 |
-
|
| 31 |
-
- name: Verify required secrets
|
| 32 |
-
env:
|
| 33 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 34 |
-
run: |
|
| 35 |
-
if [ -z "$HF_TOKEN" ]; then
|
| 36 |
-
echo "::error::HF_TOKEN repository secret must be set."
|
| 37 |
-
exit 1
|
| 38 |
-
fi
|
| 39 |
-
|
| 40 |
-
- name: Ensure HF Space exists
|
| 41 |
-
id: hf
|
| 42 |
-
env:
|
| 43 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 44 |
-
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 45 |
-
run: |
|
| 46 |
-
set -euo pipefail
|
| 47 |
-
python -m pip install --quiet 'huggingface_hub>=0.24,<2'
|
| 48 |
-
python - <<'PY'
|
| 49 |
-
import os
|
| 50 |
-
|
| 51 |
-
from huggingface_hub import HfApi
|
| 52 |
-
|
| 53 |
-
token = os.environ["HF_TOKEN"]
|
| 54 |
-
space_id = "taagarwa/coding-agent-leaderboard-stage"
|
| 55 |
-
fallback_username = os.environ.get("HF_USERNAME", "").strip()
|
| 56 |
-
|
| 57 |
-
api = HfApi(token=token)
|
| 58 |
-
username = fallback_username
|
| 59 |
-
try:
|
| 60 |
-
info = api.whoami(token=token)
|
| 61 |
-
username = str(info.get("name") or username).strip()
|
| 62 |
-
except Exception as exc:
|
| 63 |
-
if not username:
|
| 64 |
-
raise RuntimeError("HF_USERNAME fallback is required when token introspection fails") from exc
|
| 65 |
-
|
| 66 |
-
api.create_repo(
|
| 67 |
-
repo_id=space_id,
|
| 68 |
-
repo_type="space",
|
| 69 |
-
space_sdk="docker",
|
| 70 |
-
token=token,
|
| 71 |
-
exist_ok=True,
|
| 72 |
-
)
|
| 73 |
-
|
| 74 |
-
with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output:
|
| 75 |
-
output.write(f"username={username}\n")
|
| 76 |
-
print(f"HF Space ready: {space_id}")
|
| 77 |
-
PY
|
| 78 |
-
|
| 79 |
-
- name: Push to HF Space remote
|
| 80 |
-
env:
|
| 81 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 82 |
-
HF_USERNAME: ${{ steps.hf.outputs.username }}
|
| 83 |
-
run: |
|
| 84 |
-
set -euo pipefail
|
| 85 |
-
HF_REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage"
|
| 86 |
-
|
| 87 |
-
git config user.name "github-actions[bot]"
|
| 88 |
-
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 89 |
-
|
| 90 |
-
echo "Pushing $(git rev-parse --short HEAD) to taagarwa/coding-agent-leaderboard-stage..."
|
| 91 |
-
git push --force "${HF_REMOTE}" HEAD:main
|
| 92 |
-
echo "Sync complete."
|
| 93 |
-
|
| 94 |
-
- name: Summary
|
| 95 |
-
if: success()
|
| 96 |
-
run: |
|
| 97 |
-
echo "### HF Space mirror (staging)" >> "$GITHUB_STEP_SUMMARY"
|
| 98 |
-
echo "" >> "$GITHUB_STEP_SUMMARY"
|
| 99 |
-
echo "Pushed \`$(git rev-parse --short HEAD)\` to \`taagarwa/coding-agent-leaderboard-stage\` Space." >> "$GITHUB_STEP_SUMMARY"
|
| 100 |
-
echo "" >> "$GITHUB_STEP_SUMMARY"
|
| 101 |
-
echo "View the Space: <https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard-stage>" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.github/workflows/sync-to-hf-space.yml
DELETED
|
@@ -1,124 +0,0 @@
|
|
| 1 |
-
name: Sync main to HF Space
|
| 2 |
-
|
| 3 |
-
# Mirrors every push to `main` on GitHub into the HF Space git remote so
|
| 4 |
-
# that the public coding-agent-leaderboard Space (https://huggingface.co/spaces/taagarwa/coding-agent-leaderboard)
|
| 5 |
-
# always tracks the source-of-truth repo.
|
| 6 |
-
#
|
| 7 |
-
# Required repository secrets (Settings -> Secrets and variables -> Actions):
|
| 8 |
-
# HF_TOKEN Hugging Face access token with write permission to the Space.
|
| 9 |
-
# Create at https://huggingface.co/settings/tokens
|
| 10 |
-
# (token type "Write" is sufficient; no organization scope needed).
|
| 11 |
-
# HF_USERNAME Optional fallback username if token introspection fails.
|
| 12 |
-
#
|
| 13 |
-
# Optional: set HF_SPACE_ID as a repo variable (not secret) to point the
|
| 14 |
-
# workflow at a different Space; defaults to "taagarwa/coding-agent-leaderboard".
|
| 15 |
-
|
| 16 |
-
on:
|
| 17 |
-
push:
|
| 18 |
-
branches: [main]
|
| 19 |
-
# Manual dispatch lets you re-mirror the latest main on demand from
|
| 20 |
-
# the Actions tab without pushing a new commit.
|
| 21 |
-
workflow_dispatch:
|
| 22 |
-
|
| 23 |
-
# Only one mirror job at a time so we never race ourselves into
|
| 24 |
-
# non-fast-forward pushes on the Space remote.
|
| 25 |
-
concurrency:
|
| 26 |
-
group: sync-to-hf-space
|
| 27 |
-
cancel-in-progress: false
|
| 28 |
-
|
| 29 |
-
jobs:
|
| 30 |
-
mirror:
|
| 31 |
-
runs-on: ubuntu-latest
|
| 32 |
-
timeout-minutes: 10
|
| 33 |
-
steps:
|
| 34 |
-
- name: Checkout GitHub main (full history + LFS)
|
| 35 |
-
uses: actions/checkout@v4
|
| 36 |
-
with:
|
| 37 |
-
fetch-depth: 0
|
| 38 |
-
lfs: true
|
| 39 |
-
|
| 40 |
-
- name: Verify required secrets
|
| 41 |
-
env:
|
| 42 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 43 |
-
run: |
|
| 44 |
-
if [ -z "$HF_TOKEN" ]; then
|
| 45 |
-
echo "::error::HF_TOKEN repository secret must be set."
|
| 46 |
-
echo " Create HF_TOKEN at https://huggingface.co/settings/tokens (type: Write)"
|
| 47 |
-
exit 1
|
| 48 |
-
fi
|
| 49 |
-
|
| 50 |
-
- name: Ensure HF Space exists
|
| 51 |
-
id: hf
|
| 52 |
-
env:
|
| 53 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 54 |
-
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 55 |
-
HF_SPACE_ID: ${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}
|
| 56 |
-
run: |
|
| 57 |
-
set -euo pipefail
|
| 58 |
-
python -m pip install --quiet 'huggingface_hub>=0.24,<2'
|
| 59 |
-
python - <<'PY'
|
| 60 |
-
import os
|
| 61 |
-
|
| 62 |
-
from huggingface_hub import HfApi
|
| 63 |
-
|
| 64 |
-
token = os.environ["HF_TOKEN"]
|
| 65 |
-
space_id = os.environ["HF_SPACE_ID"]
|
| 66 |
-
fallback_username = os.environ.get("HF_USERNAME", "").strip()
|
| 67 |
-
|
| 68 |
-
api = HfApi(token=token)
|
| 69 |
-
username = fallback_username
|
| 70 |
-
try:
|
| 71 |
-
info = api.whoami(token=token)
|
| 72 |
-
username = str(info.get("name") or username).strip()
|
| 73 |
-
except Exception as exc:
|
| 74 |
-
if not username:
|
| 75 |
-
raise RuntimeError("HF_USERNAME fallback is required when token introspection fails") from exc
|
| 76 |
-
|
| 77 |
-
api.create_repo(
|
| 78 |
-
repo_id=space_id,
|
| 79 |
-
repo_type="space",
|
| 80 |
-
space_sdk="docker",
|
| 81 |
-
token=token,
|
| 82 |
-
exist_ok=True,
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output:
|
| 86 |
-
output.write(f"username={username}\n")
|
| 87 |
-
print(f"HF Space ready: {space_id}")
|
| 88 |
-
PY
|
| 89 |
-
|
| 90 |
-
- name: Push to HF Space remote
|
| 91 |
-
env:
|
| 92 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 93 |
-
HF_USERNAME: ${{ steps.hf.outputs.username }}
|
| 94 |
-
HF_SPACE_ID: ${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}
|
| 95 |
-
run: |
|
| 96 |
-
set -euo pipefail
|
| 97 |
-
# Authenticate via token in the URL. HF Spaces accept the
|
| 98 |
-
# username + token basic-auth format over HTTPS git.
|
| 99 |
-
HF_REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_ID}"
|
| 100 |
-
|
| 101 |
-
# Configure identity for any metadata operations. The actual
|
| 102 |
-
# commits come from GitHub unchanged; we only push refs.
|
| 103 |
-
git config user.name "github-actions[bot]"
|
| 104 |
-
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 105 |
-
|
| 106 |
-
echo "Pushing $(git rev-parse --short HEAD) to ${HF_SPACE_ID}..."
|
| 107 |
-
|
| 108 |
-
# --force is intentional: GitHub is the single source of truth
|
| 109 |
-
# for the Space's git history. Anything on the Space side that
|
| 110 |
-
# wasn't committed via GitHub is overwritten on the next sync.
|
| 111 |
-
# This prevents the drift situation where someone edits files
|
| 112 |
-
# in the HF Space UI and creates commits only visible there.
|
| 113 |
-
git push --force "${HF_REMOTE}" HEAD:main
|
| 114 |
-
|
| 115 |
-
echo "Sync complete."
|
| 116 |
-
|
| 117 |
-
- name: Summary
|
| 118 |
-
if: success()
|
| 119 |
-
run: |
|
| 120 |
-
echo "### HF Space mirror" >> "$GITHUB_STEP_SUMMARY"
|
| 121 |
-
echo "" >> "$GITHUB_STEP_SUMMARY"
|
| 122 |
-
echo "Pushed \`$(git rev-parse --short HEAD)\` to \`${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}\` Space." >> "$GITHUB_STEP_SUMMARY"
|
| 123 |
-
echo "" >> "$GITHUB_STEP_SUMMARY"
|
| 124 |
-
echo "View the Space: <https://huggingface.co/spaces/${{ vars.HF_SPACE_ID || 'taagarwa/coding-agent-leaderboard' }}>" >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,181 +1,73 @@
|
|
| 1 |
import os
|
| 2 |
-
import re
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def patch_gradio_leaderboard():
|
| 7 |
-
"""Patch gradio_leaderboard JS to fix crash on tab switch with Gradio 5.x."""
|
| 8 |
-
import gradio_leaderboard
|
| 9 |
-
pkg_dir = Path(gradio_leaderboard.__file__).parent
|
| 10 |
-
js_file = pkg_dir / "templates" / "component" / "Index-CzS_eGV6.js"
|
| 11 |
-
if not js_file.exists():
|
| 12 |
-
return
|
| 13 |
-
|
| 14 |
-
src = js_file.read_text()
|
| 15 |
-
|
| 16 |
-
patches = [
|
| 17 |
-
# Fix 1 & 2: Guard r[39]/a[39] filter callback (undefined during Svelte outro)
|
| 18 |
-
(
|
| 19 |
-
'r[0].filter(\n /*func*/\n r[39]\n ).map(qd)',
|
| 20 |
-
'(r[39] ? r[0].filter(r[39]) : r[0]).map(qd)',
|
| 21 |
-
),
|
| 22 |
-
(
|
| 23 |
-
'a[0].filter(\n /*func*/\n a[39]\n ).map(qd))',
|
| 24 |
-
'(a[39] ? a[0].filter(a[39]) : a[0]).map(qd))',
|
| 25 |
-
),
|
| 26 |
-
# Fix 3: Lx (Boolean) extracted from Rx (globals) which is undefined in Gradio 5
|
| 27 |
-
(
|
| 28 |
-
'{ Boolean: Lx } = Rx,',
|
| 29 |
-
'Lx = (Rx && Rx.Boolean) || Boolean,',
|
| 30 |
-
),
|
| 31 |
-
]
|
| 32 |
-
|
| 33 |
-
patched = False
|
| 34 |
-
for old, new in patches:
|
| 35 |
-
if old in src:
|
| 36 |
-
src = src.replace(old, new)
|
| 37 |
-
patched = True
|
| 38 |
-
|
| 39 |
-
if patched:
|
| 40 |
-
js_file.write_text(src)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
patch_gradio_leaderboard()
|
| 44 |
|
| 45 |
import gradio as gr
|
| 46 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
| 47 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 48 |
from huggingface_hub import HfApi
|
| 49 |
|
| 50 |
-
from src.leaderboard import get_leaderboard_df,
|
| 51 |
from src.display.text_blocks import (
|
|
|
|
| 52 |
INTRODUCTION_TEXT,
|
| 53 |
LLM_BENCHMARKS_TEXT,
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
|
| 56 |
REPO_ID = "taagarwa/coding-agent-leaderboard"
|
| 57 |
TOKEN = os.environ.get("HF_TOKEN")
|
| 58 |
API = HfApi(token=TOKEN)
|
| 59 |
|
|
|
|
| 60 |
def restart_space():
|
| 61 |
API.restart_space(repo_id=REPO_ID)
|
| 62 |
|
| 63 |
|
| 64 |
LEADERBOARD_DF = get_leaderboard_df()
|
| 65 |
-
BENCHMARK_RUN_DF = get_benchmark_run_df()
|
| 66 |
|
| 67 |
-
def extract_body(s: str):
|
| 68 |
-
return re.match(r'\[(.*?)\]', s).group(1)
|
| 69 |
|
| 70 |
-
|
| 71 |
-
def build_header_html(df):
|
| 72 |
-
n_results = len(df)
|
| 73 |
-
n_models = df["Model"].nunique()
|
| 74 |
-
n_harnesses = df["Harness"].apply(lambda s: extract_body(s)).nunique()
|
| 75 |
-
n_benchmarks = df["Benchmark"].apply(lambda s: extract_body(s)).nunique()
|
| 76 |
-
|
| 77 |
-
return f"""
|
| 78 |
-
<base target="_blank">
|
| 79 |
-
<div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
|
| 80 |
-
<h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
|
| 81 |
-
Coding Agent Leaderboard
|
| 82 |
-
</h1>
|
| 83 |
-
<div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
|
| 84 |
-
<p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
|
| 85 |
-
Compare coding agents across models and harnesses
|
| 86 |
-
</p>
|
| 87 |
-
<div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
|
| 88 |
-
<span style="font-weight: 600;">{n_results} Results</span>
|
| 89 |
-
<span>·</span>
|
| 90 |
-
<span style="font-weight: 600;">{n_models} Models</span>
|
| 91 |
-
<span>·</span>
|
| 92 |
-
<span style="font-weight: 600;">{n_harnesses} Harnesses</span>
|
| 93 |
-
<span>·</span>
|
| 94 |
-
<span style="font-weight: 600;">{n_benchmarks} Benchmarks</span>
|
| 95 |
-
</div>
|
| 96 |
-
</div>
|
| 97 |
-
"""
|
| 98 |
-
|
| 99 |
def init_leaderboard(dataframe):
|
| 100 |
if dataframe is None or dataframe.empty:
|
| 101 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 102 |
-
|
| 103 |
-
label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
|
| 104 |
-
meta_columns = [" ", "Harness", "Model", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
|
| 105 |
-
benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
|
| 106 |
-
model_choices = sorted({(extract_body(v), v) for v in dataframe["Model"]})
|
| 107 |
-
harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
|
| 108 |
-
|
| 109 |
-
default_columns = [" ", "Harness", "Model"] + benchmark_columns
|
| 110 |
return Leaderboard(
|
| 111 |
value=dataframe,
|
| 112 |
select_columns=SelectColumns(
|
| 113 |
-
default_selection=
|
| 114 |
label="Select Columns to Display:",
|
| 115 |
),
|
| 116 |
-
|
| 117 |
-
search_columns=["Harness", "Model"],
|
| 118 |
filter_columns=[
|
| 119 |
-
ColumnFilter(label="
|
| 120 |
-
ColumnFilter(label="
|
| 121 |
-
ColumnFilter(label="
|
| 122 |
-
ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
|
| 123 |
-
ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
|
| 124 |
],
|
| 125 |
interactive=False,
|
| 126 |
)
|
| 127 |
|
| 128 |
-
def init_benchmark_runs(dataframe):
|
| 129 |
-
if dataframe is None or dataframe.empty:
|
| 130 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 131 |
-
|
| 132 |
-
# Make ColumnFilter choices
|
| 133 |
-
label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
|
| 134 |
-
benchmark_choices = sorted({(extract_body(v), v) for v in dataframe["Benchmark"]})
|
| 135 |
-
|
| 136 |
-
return Leaderboard(
|
| 137 |
-
value=dataframe,
|
| 138 |
-
select_columns=SelectColumns(
|
| 139 |
-
default_selection=[
|
| 140 |
-
" ",
|
| 141 |
-
"Model",
|
| 142 |
-
"Harness",
|
| 143 |
-
"Benchmark",
|
| 144 |
-
"Score",
|
| 145 |
-
"Avg Cost Per Task (USD)",
|
| 146 |
-
],
|
| 147 |
-
label="Select Columns to Display:",
|
| 148 |
-
),
|
| 149 |
-
datatype="markdown",
|
| 150 |
-
search_columns=[
|
| 151 |
-
"Benchmark",
|
| 152 |
-
"Harness",
|
| 153 |
-
"Model",
|
| 154 |
-
],
|
| 155 |
-
filter_columns=[
|
| 156 |
-
ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
|
| 157 |
-
ColumnFilter(label="Benchmark", column="Benchmark", type="checkboxgroup", choices=benchmark_choices),
|
| 158 |
-
ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
|
| 159 |
-
ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
|
| 160 |
-
],
|
| 161 |
-
interactive=False,
|
| 162 |
-
)
|
| 163 |
|
| 164 |
-
demo = gr.Blocks(
|
| 165 |
with demo:
|
| 166 |
-
gr.HTML(
|
| 167 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 168 |
|
| 169 |
-
with gr.Tabs():
|
| 170 |
-
with gr.
|
| 171 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
|
|
|
| 172 |
|
| 173 |
-
with gr.
|
| 174 |
-
benchmark_runs = init_benchmark_runs(BENCHMARK_RUN_DF)
|
| 175 |
-
|
| 176 |
-
with gr.Tab("📝 About"):
|
| 177 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
scheduler = BackgroundScheduler()
|
| 180 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 181 |
scheduler.start()
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
| 5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 6 |
from huggingface_hub import HfApi
|
| 7 |
|
| 8 |
+
from src.leaderboard import get_leaderboard_df, DISPLAY_BY_DEFAULT, SEARCH_COLUMNS
|
| 9 |
from src.display.text_blocks import (
|
| 10 |
+
TITLE,
|
| 11 |
INTRODUCTION_TEXT,
|
| 12 |
LLM_BENCHMARKS_TEXT,
|
| 13 |
+
CITATION_BUTTON_LABEL,
|
| 14 |
+
CITATION_BUTTON_TEXT,
|
| 15 |
)
|
| 16 |
|
| 17 |
REPO_ID = "taagarwa/coding-agent-leaderboard"
|
| 18 |
TOKEN = os.environ.get("HF_TOKEN")
|
| 19 |
API = HfApi(token=TOKEN)
|
| 20 |
|
| 21 |
+
|
| 22 |
def restart_space():
|
| 23 |
API.restart_space(repo_id=REPO_ID)
|
| 24 |
|
| 25 |
|
| 26 |
LEADERBOARD_DF = get_leaderboard_df()
|
|
|
|
| 27 |
|
|
|
|
|
|
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def init_leaderboard(dataframe):
|
| 30 |
if dataframe is None or dataframe.empty:
|
| 31 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
return Leaderboard(
|
| 33 |
value=dataframe,
|
| 34 |
select_columns=SelectColumns(
|
| 35 |
+
default_selection=DISPLAY_BY_DEFAULT,
|
| 36 |
label="Select Columns to Display:",
|
| 37 |
),
|
| 38 |
+
search_columns=SEARCH_COLUMNS,
|
|
|
|
| 39 |
filter_columns=[
|
| 40 |
+
ColumnFilter(label="Dataset", column="dataset", type="checkboxgroup"),
|
| 41 |
+
ColumnFilter(label="Number of Parameters (B)", column="model_num_params", type="slider", min=0.5, max=150),
|
| 42 |
+
ColumnFilter(label="Precision", column="precision", type="checkboxgroup"),
|
|
|
|
|
|
|
| 43 |
],
|
| 44 |
interactive=False,
|
| 45 |
)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
demo = gr.Blocks()
|
| 49 |
with demo:
|
| 50 |
+
gr.HTML(TITLE)
|
| 51 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 52 |
|
| 53 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 54 |
+
with gr.TabItem("🏅 Coding Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 55 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 56 |
+
gr.Markdown("\* `internal` refers to internal benchmarks performed by the model provider where the harness/environment were not made public")
|
| 57 |
|
| 58 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
|
|
|
|
|
|
|
|
|
| 59 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 60 |
|
| 61 |
+
with gr.Row():
|
| 62 |
+
with gr.Accordion("📙 Citation", open=False):
|
| 63 |
+
citation_button = gr.Textbox(
|
| 64 |
+
value=CITATION_BUTTON_TEXT,
|
| 65 |
+
label=CITATION_BUTTON_LABEL,
|
| 66 |
+
lines=20,
|
| 67 |
+
elem_id="citation-button",
|
| 68 |
+
show_copy_button=True,
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
scheduler = BackgroundScheduler()
|
| 72 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 73 |
scheduler.start()
|
requirements.txt
CHANGED
|
@@ -3,7 +3,7 @@ black
|
|
| 3 |
datasets
|
| 4 |
gradio
|
| 5 |
gradio[oauth]
|
| 6 |
-
gradio_leaderboard
|
| 7 |
gradio_client
|
| 8 |
huggingface-hub>=0.18.0
|
| 9 |
matplotlib
|
|
|
|
| 3 |
datasets
|
| 4 |
gradio
|
| 5 |
gradio[oauth]
|
| 6 |
+
gradio_leaderboard==0.0.13
|
| 7 |
gradio_client
|
| 8 |
huggingface-hub>=0.18.0
|
| 9 |
matplotlib
|
results/qwen3-6-35b-internal.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset": {
|
| 3 |
+
"name": "swe-bench-verified",
|
| 4 |
+
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
+
"num_tasks": 500
|
| 6 |
+
},
|
| 7 |
+
"harness": {
|
| 8 |
+
"name": "internal",
|
| 9 |
+
"skills": []
|
| 10 |
+
},
|
| 11 |
+
"model": {
|
| 12 |
+
"name": "Qwen3.6-35B-A3B",
|
| 13 |
+
"repo": "Qwen/Qwen3.6-35B-A3B",
|
| 14 |
+
"is_oss": true,
|
| 15 |
+
"num_params": 35,
|
| 16 |
+
"precision": "bf16"
|
| 17 |
+
},
|
| 18 |
+
"environment": {
|
| 19 |
+
"name": "internal"
|
| 20 |
+
},
|
| 21 |
+
"metrics": {
|
| 22 |
+
"score": 0.734,
|
| 23 |
+
"time": null,
|
| 24 |
+
"costUSD": null
|
| 25 |
+
}
|
| 26 |
+
}
|
results/{swe-bench-verified-claude-sonnet-4-6-claude-code.json → qwen3-6-35b-nvfp4-claude-code.json}
RENAMED
|
@@ -1,23 +1,19 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
"name": "swe-bench-verified",
|
| 4 |
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
-
"num_tasks": 500
|
| 6 |
-
"url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
|
| 7 |
},
|
| 8 |
"harness": {
|
| 9 |
-
"name": "
|
| 10 |
-
"skills": []
|
| 11 |
-
"is_oss": false,
|
| 12 |
-
"url": "https://github.com/anthropics/claude-code"
|
| 13 |
},
|
| 14 |
"model": {
|
| 15 |
-
"name": "
|
| 16 |
-
"repo": "
|
| 17 |
"is_oss": true,
|
| 18 |
-
"num_params":
|
| 19 |
-
"precision": "
|
| 20 |
-
"url": "https://www.anthropic.com/news/claude-sonnet-4-6"
|
| 21 |
},
|
| 22 |
"environment": {
|
| 23 |
"name": "harbor",
|
|
@@ -33,10 +29,11 @@
|
|
| 33 |
"task_names": null,
|
| 34 |
"exclude_task_names": null,
|
| 35 |
"n_tasks": null
|
| 36 |
-
}
|
| 37 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 38 |
},
|
| 39 |
"metrics": {
|
| 40 |
-
"score": 0.
|
|
|
|
|
|
|
| 41 |
}
|
| 42 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"dataset": {
|
| 3 |
"name": "swe-bench-verified",
|
| 4 |
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
+
"num_tasks": 500
|
|
|
|
| 6 |
},
|
| 7 |
"harness": {
|
| 8 |
+
"name": "claude-code",
|
| 9 |
+
"skills": []
|
|
|
|
|
|
|
| 10 |
},
|
| 11 |
"model": {
|
| 12 |
+
"name": "Qwen3.6-35B-A3B",
|
| 13 |
+
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 14 |
"is_oss": true,
|
| 15 |
+
"num_params": 35,
|
| 16 |
+
"precision": "nvfp4"
|
|
|
|
| 17 |
},
|
| 18 |
"environment": {
|
| 19 |
"name": "harbor",
|
|
|
|
| 29 |
"task_names": null,
|
| 30 |
"exclude_task_names": null,
|
| 31 |
"n_tasks": null
|
| 32 |
+
}
|
|
|
|
| 33 |
},
|
| 34 |
"metrics": {
|
| 35 |
+
"score": 0.632,
|
| 36 |
+
"time": 21600,
|
| 37 |
+
"costUSD": 48.00
|
| 38 |
}
|
| 39 |
}
|
results/swe-bench-pro--ansible-claude-sonnet-4-6-claude-code.json
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-pro--ansible",
|
| 4 |
-
"repo": "ScaleAI/SWE-bench_Pro",
|
| 5 |
-
"num_tasks": 96,
|
| 6 |
-
"url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "Claude Code",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": false,
|
| 12 |
-
"url": "https://github.com/anthropics/claude-code"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Sonnet 4.6",
|
| 16 |
-
"repo": "Sonnet 4.6",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 1000,
|
| 19 |
-
"precision": "bf16",
|
| 20 |
-
"url": "https://www.anthropic.com/news/claude-sonnet-4-6"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "scale-ai/swe-bench-pro",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": [
|
| 34 |
-
"*ansible*"
|
| 35 |
-
],
|
| 36 |
-
"exclude_task_names": null,
|
| 37 |
-
"n_tasks": null
|
| 38 |
-
},
|
| 39 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 40 |
-
},
|
| 41 |
-
"metrics": {
|
| 42 |
-
"n_tasks": 96,
|
| 43 |
-
"n_errors": 1,
|
| 44 |
-
"score": 0.5,
|
| 45 |
-
"n_input_tokens": 190672390,
|
| 46 |
-
"n_cache_tokens": 184409111,
|
| 47 |
-
"n_output_tokens": 1593112,
|
| 48 |
-
"n_total_tokens": 376674613,
|
| 49 |
-
"agent_time_seconds": 40527,
|
| 50 |
-
"total_time_seconds": 49734,
|
| 51 |
-
"cost_usd": 184.42824125000004,
|
| 52 |
-
"mean_input_tokens_per_task": 1986170,
|
| 53 |
-
"mean_cache_tokens_per_task": 1920928,
|
| 54 |
-
"mean_output_tokens_per_task": 16594,
|
| 55 |
-
"mean_tokens_per_task": 3923693,
|
| 56 |
-
"mean_cost_usd_per_task": 1.92,
|
| 57 |
-
"mean_total_time_seconds_per_task": 518,
|
| 58 |
-
"mean_agent_time_seconds_per_task": 422
|
| 59 |
-
}
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-claude-code.json
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-pro--ansible",
|
| 4 |
-
"repo": "ScaleAI/SWE-bench_Pro",
|
| 5 |
-
"num_tasks": 96,
|
| 6 |
-
"url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "Claude Code",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": false,
|
| 12 |
-
"url": "https://github.com/anthropics/claude-code"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "scale-ai/swe-bench-pro",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": [
|
| 34 |
-
"*ansible*"
|
| 35 |
-
],
|
| 36 |
-
"exclude_task_names": null,
|
| 37 |
-
"n_tasks": null
|
| 38 |
-
},
|
| 39 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 40 |
-
},
|
| 41 |
-
"metrics": {
|
| 42 |
-
"n_tasks": 96,
|
| 43 |
-
"n_errors": 6,
|
| 44 |
-
"score": 0.458,
|
| 45 |
-
"n_input_tokens": 367897697,
|
| 46 |
-
"n_cache_tokens": 0,
|
| 47 |
-
"n_output_tokens": 1694885,
|
| 48 |
-
"n_total_tokens": 369592582,
|
| 49 |
-
"agent_time_seconds": 39024,
|
| 50 |
-
"total_time_seconds": 46758,
|
| 51 |
-
"cost_usd": 9.64,
|
| 52 |
-
"mean_input_tokens_per_task": 3832267,
|
| 53 |
-
"mean_cache_tokens_per_task": 0,
|
| 54 |
-
"mean_output_tokens_per_task": 17655,
|
| 55 |
-
"mean_tokens_per_task": 3849922,
|
| 56 |
-
"mean_cost_usd_per_task": 0.1,
|
| 57 |
-
"mean_total_time_seconds_per_task": 487,
|
| 58 |
-
"mean_agent_time_seconds_per_task": 406
|
| 59 |
-
}
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-pro--ansible-qwen3-6-35b-nvfp4-opencode.json
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-pro--ansible",
|
| 4 |
-
"repo": "ScaleAI/SWE-bench_Pro",
|
| 5 |
-
"num_tasks": 96,
|
| 6 |
-
"url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "OpenCode",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": true,
|
| 12 |
-
"url": "https://github.com/anomalyco/opencode"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "scale-ai/swe-bench-pro",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": [
|
| 34 |
-
"*ansible*"
|
| 35 |
-
],
|
| 36 |
-
"exclude_task_names": null,
|
| 37 |
-
"n_tasks": null
|
| 38 |
-
},
|
| 39 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 40 |
-
},
|
| 41 |
-
"metrics": {
|
| 42 |
-
"n_tasks": 96,
|
| 43 |
-
"n_errors": 4,
|
| 44 |
-
"score": 0.375,
|
| 45 |
-
"n_input_tokens": 207164679,
|
| 46 |
-
"n_cache_tokens": 0,
|
| 47 |
-
"n_output_tokens": 1598703,
|
| 48 |
-
"n_total_tokens": 208763382,
|
| 49 |
-
"agent_time_seconds": 49450,
|
| 50 |
-
"total_time_seconds": 57287,
|
| 51 |
-
"cost_usd": 12.21,
|
| 52 |
-
"mean_input_tokens_per_task": 2157965,
|
| 53 |
-
"mean_cache_tokens_per_task": 0,
|
| 54 |
-
"mean_output_tokens_per_task": 16653,
|
| 55 |
-
"mean_tokens_per_task": 2174618,
|
| 56 |
-
"mean_cost_usd_per_task": 0.13,
|
| 57 |
-
"mean_total_time_seconds_per_task": 596,
|
| 58 |
-
"mean_agent_time_seconds_per_task": 515
|
| 59 |
-
}
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-pro--ansible-qwen3-6-36b-nvfp4-pi.json
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-pro--ansible",
|
| 4 |
-
"repo": "ScaleAI/SWE-bench_Pro",
|
| 5 |
-
"num_tasks": 96,
|
| 6 |
-
"url": "https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "Pi",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": true,
|
| 12 |
-
"url": "https://github.com/earendil-works/pi/tree/main"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "scale-ai/swe-bench-pro",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": [
|
| 34 |
-
"*ansible*"
|
| 35 |
-
],
|
| 36 |
-
"exclude_task_names": null,
|
| 37 |
-
"n_tasks": null
|
| 38 |
-
},
|
| 39 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 40 |
-
},
|
| 41 |
-
"metrics": {
|
| 42 |
-
"n_tasks": 96,
|
| 43 |
-
"n_errors": 1,
|
| 44 |
-
"score": 0.479,
|
| 45 |
-
"n_input_tokens": 742491363,
|
| 46 |
-
"n_cache_tokens": 0,
|
| 47 |
-
"n_output_tokens": 2387609,
|
| 48 |
-
"n_total_tokens": 744878972,
|
| 49 |
-
"agent_time_seconds": 54543,
|
| 50 |
-
"total_time_seconds": 62422,
|
| 51 |
-
"cost_usd": 13.47,
|
| 52 |
-
"mean_input_tokens_per_task": 7734285,
|
| 53 |
-
"mean_cache_tokens_per_task": 0,
|
| 54 |
-
"mean_output_tokens_per_task": 24870,
|
| 55 |
-
"mean_tokens_per_task": 7759155,
|
| 56 |
-
"mean_cost_usd_per_task": 0.14,
|
| 57 |
-
"mean_total_time_seconds_per_task": 650,
|
| 58 |
-
"mean_agent_time_seconds_per_task": 568
|
| 59 |
-
}
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-verified-qwen3-6-35b-nvfp4-claude-code.json
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-verified",
|
| 4 |
-
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
-
"num_tasks": 500,
|
| 6 |
-
"url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "Claude Code",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": false,
|
| 12 |
-
"url": "https://github.com/anthropics/claude-code"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "swe-bench/swe-bench-verified",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": null,
|
| 34 |
-
"exclude_task_names": null,
|
| 35 |
-
"n_tasks": null
|
| 36 |
-
},
|
| 37 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 38 |
-
},
|
| 39 |
-
"metrics": {
|
| 40 |
-
"n_tasks": 500,
|
| 41 |
-
"n_errors": 1,
|
| 42 |
-
"score": 0.632,
|
| 43 |
-
"n_input_tokens": 1106618897,
|
| 44 |
-
"n_cache_tokens": 0,
|
| 45 |
-
"n_output_tokens": 5733245,
|
| 46 |
-
"n_total_tokens": 1112352142,
|
| 47 |
-
"agent_time_seconds": 122808,
|
| 48 |
-
"total_time_seconds": 171897,
|
| 49 |
-
"cost_usd": 34.11,
|
| 50 |
-
"mean_input_tokens_per_task": 2213237,
|
| 51 |
-
"mean_cache_tokens_per_task": 0,
|
| 52 |
-
"mean_output_tokens_per_task": 11466,
|
| 53 |
-
"mean_tokens_per_task": 2224704,
|
| 54 |
-
"mean_cost_usd_per_task": 0.07,
|
| 55 |
-
"mean_total_time_seconds_per_task": 343,
|
| 56 |
-
"mean_agent_time_seconds_per_task": 245
|
| 57 |
-
}
|
| 58 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-verified-qwen3-6-35b-nvfp4-opencode.json
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-verified",
|
| 4 |
-
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
-
"num_tasks": 500,
|
| 6 |
-
"url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "OpenCode",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": true,
|
| 12 |
-
"url": "https://github.com/anomalyco/opencode"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "swe-bench/swe-bench-verified",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": null,
|
| 34 |
-
"exclude_task_names": null,
|
| 35 |
-
"n_tasks": null,
|
| 36 |
-
"accelerated_images": true
|
| 37 |
-
},
|
| 38 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
-
},
|
| 40 |
-
"metrics": {
|
| 41 |
-
"n_tasks": 500,
|
| 42 |
-
"n_errors": 4,
|
| 43 |
-
"score": 0.548,
|
| 44 |
-
"n_input_tokens": 469806650,
|
| 45 |
-
"n_cache_tokens": 0,
|
| 46 |
-
"n_output_tokens": 4937761,
|
| 47 |
-
"n_total_tokens": 474744411,
|
| 48 |
-
"agent_time_seconds": 120473,
|
| 49 |
-
"total_time_seconds": 185168,
|
| 50 |
-
"cost_usd": 29.75,
|
| 51 |
-
"mean_input_tokens_per_task": 939613,
|
| 52 |
-
"mean_cache_tokens_per_task": 0,
|
| 53 |
-
"mean_output_tokens_per_task": 9875,
|
| 54 |
-
"mean_tokens_per_task": 949488,
|
| 55 |
-
"mean_cost_usd_per_task": 0.06,
|
| 56 |
-
"mean_total_time_seconds_per_task": 370,
|
| 57 |
-
"mean_agent_time_seconds_per_task": 240
|
| 58 |
-
}
|
| 59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/swe-bench-verified-qwen3-6-36b-nvfp4-pi.json
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"benchmark": {
|
| 3 |
-
"name": "swe-bench-verified",
|
| 4 |
-
"repo": "SWE-bench/SWE-bench_Verified",
|
| 5 |
-
"num_tasks": 500,
|
| 6 |
-
"url": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified"
|
| 7 |
-
},
|
| 8 |
-
"harness": {
|
| 9 |
-
"name": "Pi",
|
| 10 |
-
"skills": [],
|
| 11 |
-
"is_oss": true,
|
| 12 |
-
"url": "https://github.com/earendil-works/pi/tree/main"
|
| 13 |
-
},
|
| 14 |
-
"model": {
|
| 15 |
-
"name": "Qwen3.6-35B-A3B",
|
| 16 |
-
"repo": "RedHatAI/Qwen3.6-35B-A3B-NVFP4",
|
| 17 |
-
"is_oss": true,
|
| 18 |
-
"num_params": 35,
|
| 19 |
-
"precision": "nvfp4",
|
| 20 |
-
"url": "https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
| 21 |
-
},
|
| 22 |
-
"environment": {
|
| 23 |
-
"name": "harbor",
|
| 24 |
-
"config": {
|
| 25 |
-
"path": null,
|
| 26 |
-
"name": "swe-bench/swe-bench-verified",
|
| 27 |
-
"version": null,
|
| 28 |
-
"ref": "sha256:235d6032d549851a936db3b5fe08807c4d385c12ee10e7be9c9786a1ff60563c",
|
| 29 |
-
"registry_url": null,
|
| 30 |
-
"registry_path": null,
|
| 31 |
-
"overwrite": false,
|
| 32 |
-
"download_dir": null,
|
| 33 |
-
"task_names": null,
|
| 34 |
-
"exclude_task_names": null,
|
| 35 |
-
"n_tasks": null,
|
| 36 |
-
"accelerated_images": true
|
| 37 |
-
},
|
| 38 |
-
"url": "https://github.com/harbor-framework/harbor"
|
| 39 |
-
},
|
| 40 |
-
"metrics": {
|
| 41 |
-
"n_tasks": 500,
|
| 42 |
-
"n_errors": 6,
|
| 43 |
-
"score": 0.65,
|
| 44 |
-
"n_input_tokens": 791183735,
|
| 45 |
-
"n_cache_tokens": 0,
|
| 46 |
-
"n_output_tokens": 6333798,
|
| 47 |
-
"n_total_tokens": 797517533,
|
| 48 |
-
"agent_time_seconds": 154531,
|
| 49 |
-
"total_time_seconds": 218988,
|
| 50 |
-
"cost_usd": 38.16,
|
| 51 |
-
"mean_input_tokens_per_task": 1582367,
|
| 52 |
-
"mean_cache_tokens_per_task": 0,
|
| 53 |
-
"mean_output_tokens_per_task": 12667,
|
| 54 |
-
"mean_tokens_per_task": 1595035,
|
| 55 |
-
"mean_cost_usd_per_task": 0.08,
|
| 56 |
-
"mean_total_time_seconds_per_task": 437,
|
| 57 |
-
"mean_agent_time_seconds_per_task": 309
|
| 58 |
-
}
|
| 59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/text_blocks.py
CHANGED
|
@@ -1,34 +1,19 @@
|
|
|
|
|
|
|
|
| 1 |
INTRODUCTION_TEXT = """
|
| 2 |
-
|
| 3 |
-
This leaderboard tracks how these components work together, because the same model can perform very differently depending on the harness it's paired with.
|
| 4 |
"""
|
| 5 |
|
| 6 |
LLM_BENCHMARKS_TEXT = """
|
| 7 |
-
##
|
| 8 |
-
|
| 9 |
-
A coding agent is a system that autonomously solves software engineering tasks - reading code, reasoning about bugs, and writing patches. Its performance depends on two components:
|
| 10 |
-
|
| 11 |
-
- **Model** - The underlying language model (e.g. Claude Opus 4.7, Qwen3.6-35B)
|
| 12 |
-
- **Harness** - The framework or tool that orchestrates the model's actions (e.g. Claude Code, OpenCode, Pi)
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|--------|-------------|
|
| 18 |
-
| **Benchmark** | The benchmark used for evaluation (e.g. SWE-bench Verified - 500 real GitHub issues) |
|
| 19 |
-
| **Harness** | The agent framework driving the model. |
|
| 20 |
-
| **Model** | The language model being evaluated |
|
| 21 |
-
| **Skills** | The set of instructions guiding the agent's behavior |
|
| 22 |
-
| **Score** | Outcome of the benchmark, often the fraction of tasks solved correctly (higher is better) |
|
| 23 |
-
| **Precision** | Model weight format (e.g. bf16, fp4) - affects speed, memory footprint, and quality |
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
- **FOSS vs Proprietary** - Filters let you compare fully open-source agents against proprietary ones. A FOSS model with a FOSS harness means anyone can reproduce the result
|
| 28 |
-
- **Skills** - Some harnesses augment the model with extra capabilities (tools, retrieval, etc.). Listed in the "skills" column when present
|
| 29 |
-
- **Internal results (`*`)** - Benchmarks run by the model provider where the harness and environment were not made public. These are useful reference points but are not independently reproducible
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
| 34 |
-
"""
|
|
|
|
| 1 |
+
TITLE = """<h1 align="center" id="space-title">Coding Agent Leaderboard</h1>"""
|
| 2 |
+
|
| 3 |
INTRODUCTION_TEXT = """
|
| 4 |
+
Welcome to the Coding Agent Leaderboard!
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
LLM_BENCHMARKS_TEXT = """
|
| 8 |
+
## About
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
Evaluate and compare Coding Agents.
|
| 11 |
|
| 12 |
+
Coding Agent = Model + Harness + Skills.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
Visit our [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for more details about the project.
|
| 15 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
CITATION_BUTTON_TEXT = "TBD"
|
| 18 |
|
| 19 |
+
CITATION_BUTTON_LABEL = "Citation"
|
|
|
src/leaderboard.py
CHANGED
|
@@ -2,10 +2,27 @@ from pathlib import Path
|
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
-
from src.models import Result
|
| 6 |
|
| 7 |
RESULTS_DIR = Path(__file__).parent.parent / "results"
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def format_time(seconds: int):
|
| 10 |
if seconds is None:
|
| 11 |
return None
|
|
@@ -14,9 +31,6 @@ def format_time(seconds: int):
|
|
| 14 |
return f"{h}h{m}m{s}s"
|
| 15 |
|
| 16 |
|
| 17 |
-
def get_benchmark_names(results: list[Result]):
|
| 18 |
-
return {r.benchmark.name for r in results}
|
| 19 |
-
|
| 20 |
def get_leaderboard_df():
|
| 21 |
results: list[Result] = []
|
| 22 |
for file in RESULTS_DIR.glob("*.json"):
|
|
@@ -24,73 +38,25 @@ def get_leaderboard_df():
|
|
| 24 |
data = json.load(f)
|
| 25 |
result = Result(**data)
|
| 26 |
results.append(result)
|
| 27 |
-
|
| 28 |
-
# Collect benchmark scores for each model-harness pair, and convert to percent out of 100
|
| 29 |
-
benchmark_lookup: dict[tuple[str, str], dict[str, float]] = {}
|
| 30 |
-
model_lookup: dict[str, Model] = {}
|
| 31 |
-
harness_lookup: dict[str, Harness] = {}
|
| 32 |
-
for result in results:
|
| 33 |
-
pair = (result.model.repo, result.harness.name)
|
| 34 |
-
harness_lookup[result.harness.name] = result.harness
|
| 35 |
-
model_lookup[result.model.repo] = result.model
|
| 36 |
-
if pair not in benchmark_lookup:
|
| 37 |
-
benchmark_lookup[pair] = {}
|
| 38 |
-
benchmark_lookup[pair][result.benchmark.name] = round(result.metrics.score * 100, 1)
|
| 39 |
-
|
| 40 |
-
# Collect results into df rows
|
| 41 |
-
rows = []
|
| 42 |
-
benchmark_names = get_benchmark_names(results=results)
|
| 43 |
-
for pair, benchmarks in benchmark_lookup.items():
|
| 44 |
-
model = model_lookup[pair[0]]
|
| 45 |
-
harness = harness_lookup[pair[1]]
|
| 46 |
-
avg_score = sum(benchmarks.values()) / len(benchmarks)
|
| 47 |
-
row = {
|
| 48 |
-
" ": "🟠" if model.is_oss and harness.is_oss else "🔶",
|
| 49 |
-
"Model": f'[{model.repo}]({model.url})',
|
| 50 |
-
"Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
|
| 51 |
-
"Precision": model.precision,
|
| 52 |
-
"Model License": "FOSS" if model.is_oss else "Proprietary",
|
| 53 |
-
"Harness License": "FOSS" if harness.is_oss else "Proprietary",
|
| 54 |
-
"Model Num Params (B)": model.num_params,
|
| 55 |
-
"Avg Score": round(avg_score, 1),
|
| 56 |
-
}
|
| 57 |
-
for benchmark_name in sorted(benchmark_names, key=lambda x: (0 if x == "swe-bench-verified" else 1)):
|
| 58 |
-
row[benchmark_name] = benchmarks.get(benchmark_name, "")
|
| 59 |
-
rows.append(row)
|
| 60 |
-
|
| 61 |
-
leaderboard_df = pd.DataFrame(rows).sort_values("Avg Score", ascending=False).fillna("")
|
| 62 |
-
return leaderboard_df
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def get_benchmark_run_df():
|
| 66 |
-
results: list[Result] = []
|
| 67 |
-
for file in RESULTS_DIR.glob("*.json"):
|
| 68 |
-
with open(file, "r") as f:
|
| 69 |
-
data = json.load(f)
|
| 70 |
-
result = Result(**data)
|
| 71 |
-
results.append(result)
|
| 72 |
|
| 73 |
rows = []
|
| 74 |
for result in results:
|
| 75 |
rows.append(
|
| 76 |
{
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"
|
| 80 |
-
"
|
| 81 |
-
"
|
| 82 |
-
"
|
| 83 |
-
"
|
| 84 |
-
"
|
| 85 |
-
"
|
| 86 |
-
"
|
| 87 |
-
"
|
| 88 |
-
"
|
| 89 |
-
"Model License": "FOSS" if result.model.is_oss else "Proprietary",
|
| 90 |
-
"Harness License": "FOSS" if result.harness.is_oss else "Proprietary",
|
| 91 |
-
"Model Num Params (B)": result.model.num_params,
|
| 92 |
}
|
| 93 |
)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
return
|
|
|
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
from src.models import Result
|
| 6 |
|
| 7 |
RESULTS_DIR = Path(__file__).parent.parent / "results"
|
| 8 |
|
| 9 |
+
DISPLAY_BY_DEFAULT = [
|
| 10 |
+
"dataset",
|
| 11 |
+
"model",
|
| 12 |
+
"precision",
|
| 13 |
+
"harness",
|
| 14 |
+
"skills",
|
| 15 |
+
"environment",
|
| 16 |
+
"score",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
SEARCH_COLUMNS = [
|
| 20 |
+
"dataset",
|
| 21 |
+
"model",
|
| 22 |
+
"harness",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
def format_time(seconds: int):
|
| 27 |
if seconds is None:
|
| 28 |
return None
|
|
|
|
| 31 |
return f"{h}h{m}m{s}s"
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
def get_leaderboard_df():
|
| 35 |
results: list[Result] = []
|
| 36 |
for file in RESULTS_DIR.glob("*.json"):
|
|
|
|
| 38 |
data = json.load(f)
|
| 39 |
result = Result(**data)
|
| 40 |
results.append(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
rows = []
|
| 43 |
for result in results:
|
| 44 |
rows.append(
|
| 45 |
{
|
| 46 |
+
"dataset": result.dataset.name,
|
| 47 |
+
"model": result.model.name,
|
| 48 |
+
"model_id": result.model.repo,
|
| 49 |
+
"precision": result.model.precision,
|
| 50 |
+
"harness": result.harness.name,
|
| 51 |
+
"skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 52 |
+
"environment": result.environment.name,
|
| 53 |
+
"score": result.metrics.score,
|
| 54 |
+
"costUSD": result.metrics.costUSD,
|
| 55 |
+
"time": format_time(result.metrics.time),
|
| 56 |
+
"model_is_oss": result.model.is_oss,
|
| 57 |
+
"model_num_params": result.model.num_params,
|
|
|
|
|
|
|
|
|
|
| 58 |
}
|
| 59 |
)
|
| 60 |
|
| 61 |
+
leaderboard_df = pd.DataFrame(rows)
|
| 62 |
+
return leaderboard_df
|
src/models.py
CHANGED
|
@@ -3,18 +3,15 @@ from typing import Any, Optional
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
|
| 6 |
-
class
|
| 7 |
name: str
|
| 8 |
repo: str
|
| 9 |
num_tasks: int
|
| 10 |
-
url: str
|
| 11 |
|
| 12 |
|
| 13 |
class Harness(BaseModel):
|
| 14 |
name: str
|
| 15 |
skills: list[str]
|
| 16 |
-
is_oss: bool
|
| 17 |
-
url: str
|
| 18 |
|
| 19 |
|
| 20 |
class Model(BaseModel):
|
|
@@ -23,38 +20,21 @@ class Model(BaseModel):
|
|
| 23 |
is_oss: bool
|
| 24 |
num_params: int
|
| 25 |
precision: str
|
| 26 |
-
url: str
|
| 27 |
|
| 28 |
|
| 29 |
class Environment(BaseModel):
|
| 30 |
name: str
|
| 31 |
config: Optional[dict[str, Any]] = None
|
| 32 |
-
url: str
|
| 33 |
|
| 34 |
|
| 35 |
class Metrics(BaseModel):
|
| 36 |
-
|
| 37 |
score: float
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
n_input_tokens: Optional[int] = None
|
| 41 |
-
n_cache_tokens: Optional[int] = None
|
| 42 |
-
n_output_tokens: Optional[int] = None
|
| 43 |
-
n_total_tokens: Optional[int] = None
|
| 44 |
-
total_time_seconds: Optional[int] = None
|
| 45 |
-
agent_time_seconds: Optional[int] = None
|
| 46 |
-
cost_usd: Optional[float] = None
|
| 47 |
-
mean_input_tokens_per_task: Optional[int] = None
|
| 48 |
-
mean_cache_tokens_per_task: Optional[int] = None
|
| 49 |
-
mean_output_tokens_per_task: Optional[int] = None
|
| 50 |
-
mean_tokens_per_task: Optional[int] = None
|
| 51 |
-
mean_cost_usd_per_task: Optional[float] = None
|
| 52 |
-
mean_total_time_seconds_per_task: Optional[int] = None
|
| 53 |
-
mean_agent_time_seconds_per_task: Optional[int] = None
|
| 54 |
|
| 55 |
|
| 56 |
class Result(BaseModel):
|
| 57 |
-
|
| 58 |
harness: Harness
|
| 59 |
model: Model
|
| 60 |
environment: Environment
|
|
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
|
| 6 |
+
class Dataset(BaseModel):
|
| 7 |
name: str
|
| 8 |
repo: str
|
| 9 |
num_tasks: int
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class Harness(BaseModel):
|
| 13 |
name: str
|
| 14 |
skills: list[str]
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class Model(BaseModel):
|
|
|
|
| 20 |
is_oss: bool
|
| 21 |
num_params: int
|
| 22 |
precision: str
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class Environment(BaseModel):
|
| 26 |
name: str
|
| 27 |
config: Optional[dict[str, Any]] = None
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
class Metrics(BaseModel):
|
|
|
|
| 31 |
score: float
|
| 32 |
+
time: Optional[int] = None
|
| 33 |
+
costUSD: Optional[float] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class Result(BaseModel):
|
| 37 |
+
dataset: Dataset
|
| 38 |
harness: Harness
|
| 39 |
model: Model
|
| 40 |
environment: Environment
|