"""ClawBench leaderboard Space – reads results.csv from the TIGER-Lab/ClawBench dataset.
Two-stage scoring per https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md:
- Intercepted (Stage 1) = fraction of runs whose final HTTP request matched the per-task URL/method schema. Primary ranking key.
- Reward (Stage 2) = fraction that also passed the LLM judge on the intercepted payload. Tiebreak.
"""
import io
import urllib.request
import gradio as gr
import pandas as pd
RESULTS_URL = (
"https://huggingface.co/datasets/TIGER-Lab/ClawBench/resolve/main/leaderboard/results.csv"
)
CITATION = """@misc{zhang2026clawbench,
title = {ClawBench: Can AI Agents Complete Everyday Online Tasks?},
author = {Yuxuan Zhang and Yubo Wang and Yipeng Zhu and Penghui Du and Junwen Miao and Xuan Lu and Wendong Xu and Yunzhuo Hao and Songcheng Cai and Xiaochen Wang and Huaisong Zhang and Xian Wu and Yi Lu and Minyi Lei and Kai Zou and Huifeng Yin and Ping Nie and Liang Chen and Dongfu Jiang and Wenhu Chen and Kelsey R. Allen},
year = {2026},
eprint = {2604.08523},
archivePrefix = {arXiv},
primaryClass = {cs.AI},
url = {https://arxiv.org/abs/2604.08523}
}"""
INTRO = """# π ClawBench β Web Agent Benchmark
**Can AI agents complete everyday online tasks?** ClawBench scores agents on real, live websites (booking flights, ordering groceries, submitting job applications). Two corpora: **V1** β 153 tasks across 144 websites Β· **V2** β 130 newer tasks across 63 platforms. Every run is graded twice: a deterministic HTTP-request *interception* check (Stage 1, the sort key) β then an LLM *judge* on the intercepted payload (Stage 2 = `Reward`).
[**π Paper**](https://arxiv.org/abs/2604.08523) Β· [**π» GitHub**](https://github.com/reacher-z/ClawBench) Β· [**π Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) Β· [**π Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) Β· [**π Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) Β· [**π Site**](https://claw-bench.com)
"""
TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema β Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction β Stage 2. Rows are ranked by Intercepted (corpus-normalized: `intercepted / 130` for V2 so partials don't outrank complete batches) with Reward as tiebreak. `β` = no Stage-2 data yet."""
ABOUT = """## About ClawBench
### Why a new benchmark?
Existing browser-agent benchmarks either run on synthetic / sandboxed websites (WebArena, VisualWebArena) or only check whether the agent *reached* the endpoint (WebVoyager). ClawBench runs on **live, real-world websites** and verifies the *payload* the agent submitted – so an agent that types the wrong delivery address into Uber Eats fails, even if its last HTTP request hit the correct endpoint.
### Two corpora
- **V1** – 153 tasks across 144 real websites (the paper).
- **V2** – 130 newer everyday tasks across 63 platforms, with expanded coverage of e-commerce / form-filling / authentication-walled flows.
### Two-stage scoring
| Stage | What it checks | Output |
|---|---|---|
| 1. **Interception** | Did the final HTTP request match the task's URL + method + canonical body schema? | `intercepted ∈ {true, false}` |
| 2. **Judge** | Given the natural-language instruction and the intercepted payload, did the agent submit the *right* thing? | `match ∈ {true, false, null}` |
`Reward = Intercepted ∧ Match`. Full prompt + judge-model details: [eval/scoring.md →](https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md)
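In code, the per-run score reduces to the conjunction below (a minimal sketch for intuition, not the official scorer – how an unjudged run is represented is an assumption here):
```python
def run_reward(intercepted, match):
    # Stage 1 gate: if the final request never matched the schema, there is no reward.
    if not intercepted:
        return False
    # Stage 2 pending: intercepted but no judge verdict yet, so left unscored
    # (surfaced as "–" in the Reward column). Assumed handling; see eval/scoring.md.
    if match is None:
        return None
    # Stage 2 verdict decides the reward: Reward = Intercepted ∧ Match.
    return bool(match)
```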
### What ships with every run
A **5-layer trace bundle** (downloadable from the Traces datasets above):
- `recording.mp4` – full browser session video
- `actions.jsonl` – every click / type / scroll
- `agent-messages.jsonl` – model inputs & outputs (incl. reasoning)
- `requests.jsonl` – every HTTP request the page made
- `interception.json` – graded final request
- `run-meta.json` – model, harness, scores, timing
### Reproducing
```bash
pip install clawbench-eval
clawbench run --model <your-model> --harness hermes --corpus v2
python scripts/clawbench_rescore.py --judge-model deepseek-v4-pro --only-batch <your-batch-dir>
```
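To sanity-check your numbers against the table, you can also load the public leaderboard CSV directly – a small sketch assuming only `pandas` (this is the same file the Space reads):
```python
import pandas as pd

RESULTS_URL = (
    "https://huggingface.co/datasets/TIGER-Lab/ClawBench/resolve/main/leaderboard/results.csv"
)
# Raw leaderboard rows, one per (model, harness, corpus).
df = pd.read_csv(RESULTS_URL)
print(df[["model", "harness", "dataset", "passed", "total", "pass_rate", "reward_rate"]])
```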
"""
SUBMIT = """## π Submit your model
Submissions are accepted as **PRs to the leaderboard CSV** in the dataset repo:
[**Open the CSV in the dataset repo →**](https://huggingface.co/datasets/TIGER-Lab/ClawBench/blob/main/leaderboard/results.csv)
### Required steps
1. **Run the benchmark** – install with `pip install clawbench-eval`, then run `clawbench run --model <your-model> --harness hermes --corpus v2` (or `v1`). Use the included harnesses (hermes / openclaw) so traces follow the standard 5-layer format.
2. **Score** – `python scripts/clawbench_rescore.py --judge-model deepseek-v4-pro --only-batch <your-batch-dir>` produces `rescore-summary.json` with the cells you'll need.
3. **Upload traces** (recommended) – push the 5-layer run bundles to `TIGER-Lab/ClawBenchV2Trace` (or `NAIL-Group/ClawBenchV1Trace`) so others can audit.
4. **Open a PR** – add one row per `(model, harness, corpus)` to `leaderboard/results.csv` with columns `model,harness,dataset,passed,total,pass_rate,reward_rate,wall_hours` (an illustrative row is shown below this list). Link the trace bundle in the PR description.
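For illustration only, a hypothetical V2 row – every value here is a placeholder, not a real result:
```csv
model,harness,dataset,passed,total,pass_rate,reward_rate,wall_hours
your-model-name,hermes,v2,91,130,74.62,70.00,3.41
```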
We re-run a sample of your submitted traces with our judge before merging, to keep the table honest.
For step-by-step instructions, see [`eval/scoring.md`](https://github.com/reacher-z/ClawBench/blob/main/eval/scoring.md).
"""
def _format_pct(v) -> str:
    return "–" if pd.isna(v) else f"{v:.2f}%"
def _format_wall(v) -> str:
    return "–" if pd.isna(v) else f"{v:.2f}"
CORPUS_SIZE = {"v1": 153, "v2": 130}
def load_results() -> pd.DataFrame:
    """Fetch results.csv from the dataset repo and return the display-ready table."""
    raw = urllib.request.urlopen(RESULTS_URL, timeout=30).read()
    df = pd.read_csv(io.BytesIO(raw))
    if "reward_rate" not in df.columns:
        df["reward_rate"] = pd.NA
    # Rank by corpus interception rate (intercepted_count / full_corpus_size) as
    # the headline metric – Stage 1 is deterministic (URL/method match) and
    # universally comparable. Tiebreak by corpus reward (passed / corpus_size)
    # so partial batches don't outrank complete ones with lower rates.
    df["_corpus_size"] = df["dataset"].map(CORPUS_SIZE).fillna(df["total"])
    # `pass_rate` in our CSV is the Stage-1 intercept rate (%) over attempted.
    # Convert it to a fraction over the full corpus.
    df["_intercepted_count"] = (df["pass_rate"].astype(float) / 100.0 * df["total"]).round().astype(int)
    df["_corpus_intercepted"] = df["_intercepted_count"] / df["_corpus_size"]
    df["_corpus_reward"] = df["passed"] / df["_corpus_size"]
    df = df.sort_values(
        ["dataset", "_corpus_intercepted", "_corpus_reward"],
        ascending=[True, False, False],
        na_position="last",
    ).reset_index(drop=True)
    df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)
    df = df.drop(columns=["_corpus_size", "_corpus_reward", "_intercepted_count", "_corpus_intercepted"])
    df["pass_rate"] = df["pass_rate"].map(_format_pct)
    df["reward_rate"] = df["reward_rate"].map(_format_pct)
    df["wall_hours"] = df["wall_hours"].map(_format_wall)
    df.rename(
        columns={
            "model": "Model",
            "harness": "Harness",
            "dataset": "Corpus",
            "passed": "Pass",
            "total": "Total",
            "pass_rate": "Intercepted",
            "reward_rate": "Reward",
            "wall_hours": "Wall (h)",
            "rank": "Rank",
        },
        inplace=True,
    )
    return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward", "Pass", "Total", "Wall (h)"]]
def filter_df(query: str, corpus: str, harness_filter: list[str]):
    """Apply the corpus / harness / search-box filters to a freshly loaded table."""
    df = load_results()
    if corpus and corpus != "all":
        df = df[df["Corpus"].str.lower() == corpus.lower()]
    if harness_filter:
        df = df[df["Harness"].isin(harness_filter)]
    if query:
        q = query.strip().lower()
        df = df[df["Model"].str.lower().str.contains(q, na=False)]
    return df.reset_index(drop=True)
def all_harnesses() -> list[str]:
    try:
        df = load_results()
        return sorted(df["Harness"].dropna().unique().tolist())
    except Exception:
        # Dataset unreachable at startup: fall back to the known harness names.
        return ["hermes", "openclaw"]
with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    gr.Textbox(value=CITATION, label="BibTeX", lines=8, interactive=False)
            gr.Markdown(TABLE_INTRO)
            with gr.Row():
                search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
                corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value="v2", label="Corpus", scale=2)
                harness_choice = gr.CheckboxGroup(
                    choices=all_harnesses(),
                    value=all_harnesses(),
                    label="Harness",
                )
            df_init = filter_df("", "v2", all_harnesses())
            table = gr.Dataframe(
                value=df_init,
                interactive=False,
                wrap=True,
                column_widths=["60px", "260px", "100px", "70px", "110px", "100px", "60px", "60px", "80px"],
            )
            refresh = gr.Button("Refresh from dataset")
            for control in (search_bar, corpus_choice, harness_choice):
                control.change(
                    fn=filter_df,
                    inputs=[search_bar, corpus_choice, harness_choice],
                    outputs=table,
                )
            refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
        with gr.TabItem("About"):
            gr.Markdown(ABOUT)
        with gr.TabItem("Submit here"):
            gr.Markdown(SUBMIT)
if __name__ == "__main__":
    demo.launch()