Upload folder using huggingface_hub
Browse files- README.md +27 -8
- app.py +195 -0
- requirements.txt +2 -0
README.md
CHANGED
|
@@ -1,13 +1,32 @@
|
|
| 1 |
---
|
| 2 |
-
title: ClawBench
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
python_version: '3.13'
|
| 9 |
app_file: app.py
|
| 10 |
-
pinned:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ClawBench Leaderboard
|
| 3 |
+
emoji: π¦
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.15.0
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: Can AI agents complete everyday online tasks?
|
| 12 |
+
tags:
|
| 13 |
+
- leaderboard
|
| 14 |
+
- benchmark
|
| 15 |
+
- web-agents
|
| 16 |
+
- browser-automation
|
| 17 |
+
- agent-evaluation
|
| 18 |
+
- llm-evaluation
|
| 19 |
---
|
| 20 |
|
| 21 |
+
# ClawBench — Leaderboard
|
| 22 |
+
|
| 23 |
+
Live results for the [ClawBench](https://huggingface.co/datasets/TIGER-Lab/ClawBench) web-agent benchmark — backed by [`leaderboard/results.csv`](https://huggingface.co/datasets/TIGER-Lab/ClawBench/blob/main/leaderboard/results.csv) in the dataset repo. Submit your model by opening a PR there.
|
| 24 |
+
|
| 25 |
+
| Resource | Link |
|
| 26 |
+
|---|---|
|
| 27 |
+
| π Paper | https://arxiv.org/abs/2604.08523 |
|
| 28 |
+
| π» GitHub | https://github.com/reacher-z/ClawBench |
|
| 29 |
+
| π Dataset | https://huggingface.co/datasets/TIGER-Lab/ClawBench |
|
| 30 |
+
| π Traces (V1) | https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace |
|
| 31 |
+
| π Traces (V2) | https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace |
|
| 32 |
+
| π Website | https://claw-bench.com |
|
app.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ClawBench leaderboard Space β reads results.csv from the TIGER-Lab/ClawBench dataset.
|
| 2 |
+
|
| 3 |
+
Two-stage scoring per https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md:
|
| 4 |
+
- Intercepted (Stage 1) = fraction of runs whose final HTTP request hit the per-task URL/method schema.
|
| 5 |
+
- Reward (Stage 2) = fraction that also passed an LLM judge on the intercepted payload. Headline metric.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import io
|
| 9 |
+
import urllib.request
|
| 10 |
+
|
| 11 |
+
import gradio as gr
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
+
# Raw-file ("resolve") URL of the leaderboard CSV hosted in the ClawBench dataset repo.
RESULTS_URL = (
    "https://huggingface.co/datasets/TIGER-Lab/ClawBench/resolve/main/leaderboard/results.csv"
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
CITATION = """@article{zhang2026clawbench,
|
| 20 |
+
title = {ClawBench: Can AI Agents Complete Everyday Online Tasks?},
|
| 21 |
+
author = {Zhang, Xiaochen and others},
|
| 22 |
+
year = {2026},
|
| 23 |
+
eprint = {2604.08523},
|
| 24 |
+
archivePrefix = {arXiv},
|
| 25 |
+
}"""
|
| 26 |
+
|
| 27 |
+
INTRO = """# π ClawBench β Web Agent Benchmark
|
| 28 |
+
|
| 29 |
+
**Can AI agents complete everyday online tasks?** ClawBench scores agents on real, live websites (booking flights, ordering groceries, submitting job applications). Two corpora: **V1** — 153 tasks across 144 websites · **V2** — 130 newer tasks across 63 platforms. Every run is graded twice: a deterministic HTTP-request *interception* check (Stage 1), then an LLM *judge* on the intercepted payload (Stage 2 — the headline `Reward`).
|
| 30 |
+
|
| 31 |
+
[**π Paper**](https://arxiv.org/abs/2604.08523) Β· [**π» GitHub**](https://github.com/reacher-z/ClawBench) Β· [**π Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) Β· [**π Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) Β· [**π Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) Β· [**π Site**](https://claw-bench.com)
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
TABLE_INTRO = """**Intercepted** = agent's final HTTP request matched the task's URL/method schema. **Reward** = Intercepted AND passed the LLM judge on the payload (default judge: `deepseek/deepseek-v4-pro`). Rows are ranked by Reward, then Intercepted as tiebreak. `—` means no Stage-2 data available."""
|
| 35 |
+
|
| 36 |
+
ABOUT = """## About ClawBench
|
| 37 |
+
|
| 38 |
+
### Why a new benchmark?
|
| 39 |
+
Existing browser-agent benchmarks either run on synthetic / sandboxed websites (WebArena, VisualWebArena) or only check whether the agent *reached* the endpoint (WebVoyager). ClawBench runs on **live, real-world websites** and verifies the *payload* the agent submitted — so an agent that types the wrong delivery address into Uber Eats fails, even if its last HTTP request hit the correct endpoint.
|
| 40 |
+
|
| 41 |
+
### Two corpora
|
| 42 |
+
|
| 43 |
+
- **V1** — 153 tasks across 144 real websites (the paper).
|
| 44 |
+
- **V2** — 130 newer everyday tasks across 63 platforms, expanded coverage of e-commerce / form-filling / authentication-walled flows.
|
| 45 |
+
|
| 46 |
+
### Two-stage scoring
|
| 47 |
+
|
| 48 |
+
| Stage | What it checks | Output |
|
| 49 |
+
|---|---|---|
|
| 50 |
+
| 1. **Interception** | Did the final HTTP request match the task's URL + method + canonical body schema? | `intercepted ∈ {true, false}` |
|
| 51 |
+
| 2. **Judge** | Given the natural-language instruction and the intercepted payload, did the agent submit the *right* thing? | `match ∈ {true, false, null}` |
|
| 52 |
+
|
| 53 |
+
`Reward = Intercepted ∧ Match`. Full prompt + judge model details: [eval/scoring.md →](https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md)
|
| 54 |
+
|
| 55 |
+
### What ships with every run
|
| 56 |
+
|
| 57 |
+
A **5-layer trace bundle** (downloadable from the Traces datasets above):
|
| 58 |
+
|
| 59 |
+
- `recording.mp4` — full browser session video
|
| 60 |
+
- `actions.jsonl` — every click / type / scroll
|
| 61 |
+
- `agent-messages.jsonl` — model inputs & outputs (incl. reasoning)
|
| 62 |
+
- `requests.jsonl` — every HTTP request the page made
|
| 63 |
+
- `interception.json` — graded final request
|
| 64 |
+
- `run-meta.json` — model, harness, scores, timing
|
| 65 |
+
|
| 66 |
+
### Reproducing
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
pip install clawbench-eval
|
| 70 |
+
clawbench run --model <your-model> --harness hermes --corpus v2
|
| 71 |
+
python scripts/clawbench_rescore.py --judge-model deepseek-v4-pro --only-batch <your-batch-dir>
|
| 72 |
+
```
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
SUBMIT = """## π Submit your model
|
| 76 |
+
|
| 77 |
+
Submissions are accepted as **PRs to the leaderboard CSV** in the dataset repo:
|
| 78 |
+
|
| 79 |
+
[**Open the CSV in the dataset repo →**](https://huggingface.co/datasets/TIGER-Lab/ClawBench/blob/main/leaderboard/results.csv)
|
| 80 |
+
|
| 81 |
+
### Required steps
|
| 82 |
+
|
| 83 |
+
1. **Run the benchmark** — run `pip install clawbench-eval`, then `clawbench run --model <your-model> --harness hermes --corpus v2` (or `v1`). Use the included harnesses (hermes / openclaw) so traces follow the standard 5-layer format.
|
| 84 |
+
2. **Score** — `python scripts/clawbench_rescore.py --judge-model deepseek-v4-pro --only-batch <your-batch-dir>` produces `rescore-summary.json` with the cells you'll need.
|
| 85 |
+
3. **Upload traces** (recommended) — push the 5-layer run bundles to `TIGER-Lab/ClawBenchV2Trace` (or `NAIL-Group/ClawBenchV1Trace`) so others can audit.
|
| 86 |
+
4. **Open a PR** — add one row per `(model, harness, corpus)` to `leaderboard/results.csv` with columns: `model,harness,dataset,passed,total,pass_rate,reward_rate,wall_hours`. Link the trace bundle in the PR description.
|
| 87 |
+
|
| 88 |
+
We re-run a sample of your submitted traces with our judge before merging — to keep the table honest.
|
| 89 |
+
|
| 90 |
+
For step-by-step instructions, see [`eval/scoring.md`](https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md).
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _format_pct(v) -> str:
|
| 95 |
+
return "β" if pd.isna(v) else f"{v:.2f}%"
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _format_wall(v) -> str:
|
| 99 |
+
return "β" if pd.isna(v) else f"{v:.2f}"
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def load_results() -> pd.DataFrame:
    """Download the leaderboard CSV and return it ranked and formatted for display.

    The CSV at ``RESULTS_URL`` is read into a DataFrame, sorted by ``dataset``
    (ascending), then ``reward_rate`` and ``pass_rate`` (both descending, missing
    values last). A per-dataset 1-based ``rank`` is added, the rate and
    wall-hour columns are turned into display strings, and the columns are
    renamed to their table headers.

    Returns:
        A DataFrame with the columns ``Rank, Model, Harness, Corpus,
        Intercepted, Reward, Pass, Total, Wall (h)``.

    Raises:
        urllib.error.URLError: If the CSV cannot be fetched.
        KeyError: If the CSV lacks a required column other than ``reward_rate``.
    """
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(RESULTS_URL, timeout=30) as response:
        raw = response.read()
    df = pd.read_csv(io.BytesIO(raw))

    # Files without Stage-2 scores get an all-missing reward column.
    if "reward_rate" not in df.columns:
        df["reward_rate"] = pd.NA

    df = df.sort_values(
        ["dataset", "reward_rate", "pass_rate"],
        ascending=[True, False, False],
        na_position="last",
    ).reset_index(drop=True)

    # Rank restarts at 1 within each dataset, following the sort order above.
    df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)

    # Convert numeric cells to display strings only after sorting, so the sort is numeric.
    df["pass_rate"] = df["pass_rate"].map(_format_pct)
    df["reward_rate"] = df["reward_rate"].map(_format_pct)
    df["wall_hours"] = df["wall_hours"].map(_format_wall)

    df = df.rename(
        columns={
            "model": "Model",
            "harness": "Harness",
            "dataset": "Corpus",
            "passed": "Pass",
            "total": "Total",
            "pass_rate": "Intercepted",
            "reward_rate": "Reward",
            "wall_hours": "Wall (h)",
            "rank": "Rank",
        }
    )
    return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward", "Pass", "Total", "Wall (h)"]]
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def filter_df(query: str, corpus: str, harness_filter: list[str]) -> pd.DataFrame:
    """Reload the leaderboard and narrow it to the rows matching the UI controls.

    Args:
        query: Free-text search; rows are kept when the lower-cased ``Model``
            contains the stripped, lower-cased text. Empty means no search.
        corpus: Corpus name compared case-insensitively against ``Corpus``;
            ``"all"`` or an empty value disables the corpus filter.
        harness_filter: Harness names to keep; an empty list disables the
            harness filter.

    Returns:
        The filtered rows of ``load_results()`` with a fresh 0-based index.
    """
    df = load_results()
    if corpus and corpus != "all":
        df = df[df["Corpus"].str.lower() == corpus.lower()]
    if harness_filter:
        df = df[df["Harness"].isin(harness_filter)]
    if query:
        needle = query.strip().lower()
        # regex=False: the search box holds literal text, so input such as "gpt-4("
        # must not be compiled as a regular expression (which would raise).
        df = df[df["Model"].str.lower().str.contains(needle, na=False, regex=False)]
    return df.reset_index(drop=True)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def all_harnesses() -> list[str]:
    """Return the sorted, distinct harness names found in the leaderboard.

    Returns:
        The sorted unique non-missing values of the ``Harness`` column, or
        ``["hermes", "openclaw"]`` when the leaderboard cannot be loaded.
    """
    try:
        df = load_results()
        return sorted(df["Harness"].dropna().unique().tolist())
    except Exception:
        # Deliberate best-effort: fall back to the two known harnesses on any load failure.
        return ["hermes", "openclaw"]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Gradio UI: a leaderboard tab (citation, legend, filters, table, refresh) plus two
# static markdown tabs.
# NOTE(review): nesting was reconstructed from a scrape that lost indentation — confirm
# the layout against the original file. A leading "π" in a label is a mis-decoded
# emoji that could not be recovered; restore it from the original.
with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(INTRO)

    with gr.Tabs():
        with gr.TabItem("π Leaderboard"):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    gr.Textbox(value=CITATION, label="BibTeX", lines=8, interactive=False)
            gr.Markdown(TABLE_INTRO)

            # Resolve the harness list once: each call re-downloads the CSV, and
            # choices, default selection and the initial table must agree.
            harnesses = all_harnesses()

            with gr.Row():
                search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
                corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value="v2", label="Corpus", scale=2)
                harness_choice = gr.CheckboxGroup(
                    choices=harnesses,
                    value=harnesses,
                    label="Harness",
                )
            df_init = filter_df("", "v2", harnesses)
            table = gr.Dataframe(
                value=df_init,
                interactive=False,
                wrap=True,
                column_widths=["60px", "260px", "100px", "70px", "110px", "100px", "60px", "60px", "80px"],
            )
            refresh = gr.Button("π Refresh from dataset")

            # Any change to a filter control re-queries and redraws the table.
            for control in (search_bar, corpus_choice, harness_choice):
                control.change(
                    fn=filter_df,
                    inputs=[search_bar, corpus_choice, harness_choice],
                    outputs=table,
                )
            refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)

        with gr.TabItem("π About"):
            gr.Markdown(ABOUT)

        with gr.TabItem("π Submit here"):
            gr.Markdown(SUBMIT)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.15.0
|
| 2 |
+
pandas
|