Spaces:

TIGER-Lab
/

ClawBench

Running

App Files Community

AgPerry committed 19 days ago

Commit

a478c75

verified ·

1 Parent(s): e77a483

Sync Space INTRO+TABLE_INTRO+citation: Intercepted is sort key, full author list in BibTeX

Browse files

Files changed (1) hide show

app.py +9 -7

app.py CHANGED Viewed

@@ -16,22 +16,24 @@ RESULTS_URL = (
 )
-CITATION = """@article{zhang2026clawbench,
-  title  = {ClawBench: Can AI Agents Complete Everyday Online Tasks?},
-  author = {Zhang, Xiaochen and others},
-  year   = {2026},
-  eprint = {2604.08523},
   archivePrefix = {arXiv},
 }"""
 INTRO = """# 🏆 ClawBench — Web Agent Benchmark
-**Can AI agents complete everyday online tasks?** ClawBench scores agents on real, live websites (booking flights, ordering groceries, submitting job applications). Two corpora: **V1** — 153 tasks across 144 websites · **V2** — 130 newer tasks across 63 platforms. Every run is graded twice: a deterministic HTTP-request *interception* check (Stage 1), then an LLM *judge* on the intercepted payload (Stage 2 — the headline `Reward`).
 [**📖 Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**🗂 Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**🎞 Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**🎞 Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**🌐 Site**](https://claw-bench.com)
 """
-TABLE_INTRO = """**Intercepted** = agent's final HTTP request matched the task's URL/method schema. **Reward** = AND passed the LLM judge on the payload (default judge: `deepseek/deepseek-v4-pro`). Rows are ranked by Reward, then Intercepted as tiebreak. `—` means no Stage-2 data available."""
 ABOUT = """## About ClawBench

 )
+CITATION = """@misc{zhang2026clawbench,
+  title         = {ClawBench: Can AI Agents Complete Everyday Online Tasks?},
+  author        = {Yuxuan Zhang and Yubo Wang and Yipeng Zhu and Penghui Du and Junwen Miao and Xuan Lu and Wendong Xu and Yunzhuo Hao and Songcheng Cai and Xiaochen Wang and Huaisong Zhang and Xian Wu and Yi Lu and Minyi Lei and Kai Zou and Huifeng Yin and Ping Nie and Liang Chen and Dongfu Jiang and Wenhu Chen and Kelsey R. Allen},
+  year          = {2026},
+  eprint        = {2604.08523},
   archivePrefix = {arXiv},
+  primaryClass  = {cs.AI},
+  url           = {https://arxiv.org/abs/2604.08523}
 }"""
 INTRO = """# 🏆 ClawBench — Web Agent Benchmark
+**Can AI agents complete everyday online tasks?** ClawBench scores agents on real, live websites (booking flights, ordering groceries, submitting job applications). Two corpora: **V1** — 153 tasks across 144 websites · **V2** — 130 newer tasks across 63 platforms. Every run is graded twice: a deterministic HTTP-request *interception* check (Stage 1, the sort key) — then an LLM *judge* on the intercepted payload (Stage 2 = `Reward`).
 [**📖 Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**🗂 Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**🎞 Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**🎞 Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**🌐 Site**](https://claw-bench.com)
 """
+TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema — Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction — Stage 2. Rows are ranked by Intercepted (corpus-normalized: `intercepted / 130` for V2 so partials don't outrank complete batches) with Reward as tiebreak. `—` = no Stage-2 data yet."""
 ABOUT = """## About ClawBench