Remove hardcoded stats: load all config dynamically from test.raw.json
- Include test.raw.json in Space deployment (was excluded by .gitignore)
- Replace hardcoded fallbacks with FileNotFoundError in both schema.py files
- Replace hardcoded "3 web applications" with len(WEB_APPLICATIONS)
- Update README stats to match actual data (375 tasks, 3005 policies)
- README.md +2 -2
- app.py +1 -1
- data/test.raw.json +0 -0
- validation/schema.py +4 -2
README.md
CHANGED
|
@@ -23,7 +23,7 @@ short_description: "Safety & Trustworthiness Leaderboard for Web Agents"
|
|
| 23 |
|
| 24 |
**Evaluating Safety & Trustworthiness in Web Agents — ICLR 2025**
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
## Key Metrics
|
| 29 |
|
|
@@ -37,7 +37,7 @@ short_description: "Safety & Trustworthiness Leaderboard for Web Agents"
|
|
| 37 |
|
| 38 |
## How to Submit
|
| 39 |
|
| 40 |
-
1. Run the full benchmark on all
|
| 41 |
2. Generate your submission:
|
| 42 |
|
| 43 |
```bash
|
|
|
|
| 23 |
|
| 24 |
**Evaluating Safety & Trustworthiness in Web Agents — ICLR 2025**
|
| 25 |
|
| 26 |
+
375 tasks | 3,005 policies | 6 safety dimensions | 3 web applications
|
| 27 |
|
| 28 |
## Key Metrics
|
| 29 |
|
|
|
|
| 37 |
|
| 38 |
## How to Submit
|
| 39 |
|
| 40 |
+
1. Run the full benchmark on all 375 tasks
|
| 41 |
2. Generate your submission:
|
| 42 |
|
| 43 |
```bash
|
app.py
CHANGED
|
@@ -2114,7 +2114,7 @@ contact details.
|
|
| 2114 |
gr.Markdown(
|
| 2115 |
f"## About ST-WebAgentBench\n\n"
|
| 2116 |
f"**{EXPECTED_TASK_COUNT} tasks** | **{EXPECTED_POLICY_COUNT:,} policies** "
|
| 2117 |
-
f"| **{len(SAFETY_DIMENSIONS)} safety dimensions** | **
|
| 2118 |
"**Accepted at ICLR 2025** — ST-WebAgentBench evaluates web agents on both "
|
| 2119 |
"task completion **and** safety policy adherence — the first benchmark to "
|
| 2120 |
"systematically measure the safety-performance tradeoff in autonomous web agents.\n\n"
|
|
|
|
| 2114 |
gr.Markdown(
|
| 2115 |
f"## About ST-WebAgentBench\n\n"
|
| 2116 |
f"**{EXPECTED_TASK_COUNT} tasks** | **{EXPECTED_POLICY_COUNT:,} policies** "
|
| 2117 |
+
f"| **{len(SAFETY_DIMENSIONS)} safety dimensions** | **{len(WEB_APPLICATIONS)} web applications**\n\n"
|
| 2118 |
"**Accepted at ICLR 2025** — ST-WebAgentBench evaluates web agents on both "
|
| 2119 |
"task completion **and** safety policy adherence — the first benchmark to "
|
| 2120 |
"systematically measure the safety-performance tradeoff in autonomous web agents.\n\n"
|
data/test.raw.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validation/schema.py
CHANGED
|
@@ -34,8 +34,10 @@ def _load_benchmark_config() -> tuple:
|
|
| 34 |
web_applications, tier_config).
|
| 35 |
"""
|
| 36 |
if not _TASKS_DATA_PATH.exists():
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
|
| 40 |
with open(_TASKS_DATA_PATH) as f:
|
| 41 |
tasks = json.load(f)
|
|
|
|
| 34 |
web_applications, tier_config).
|
| 35 |
"""
|
| 36 |
if not _TASKS_DATA_PATH.exists():
|
| 37 |
+
raise FileNotFoundError(
|
| 38 |
+
f"test.raw.json not found at {_TASKS_DATA_PATH}. "
|
| 39 |
+
"This file must be included in the Space deployment."
|
| 40 |
+
)
|
| 41 |
|
| 42 |
with open(_TASKS_DATA_PATH) as f:
|
| 43 |
tasks = json.load(f)
|