Add FAQ tab and per-user signing key management to leaderboard
Browse filesAdds a comprehensive FAQ tab (27 questions across 7 sections) covering
setup, signing keys, submission generation, validation errors, metrics,
rate limits, and contact guidance — reducing unnecessary support emails.
Also includes per-user signing key derivation, key request logging,
and admin key request viewer.
app.py
CHANGED
|
@@ -8,9 +8,12 @@ Displays benchmark results with:
|
|
| 8 |
- Submission upload with 5-layer verification
|
| 9 |
"""
|
| 10 |
|
|
|
|
|
|
|
| 11 |
import json
|
| 12 |
import logging
|
| 13 |
import os
|
|
|
|
| 14 |
import traceback
|
| 15 |
from datetime import datetime, timezone
|
| 16 |
from enum import Enum
|
|
@@ -41,14 +44,24 @@ logger = logging.getLogger(__name__)
|
|
| 41 |
# Admin password from environment variable (set in HF Space secrets)
|
| 42 |
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "")
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# ---------------------------------------------------------------------------
|
| 48 |
# Constants
|
| 49 |
# ---------------------------------------------------------------------------
|
| 50 |
|
| 51 |
SUBMISSIONS_FILE = Path("data/submissions.jsonl")
|
|
|
|
| 52 |
TASKS_FILE = Path("data/test.raw.json")
|
| 53 |
CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
|
| 54 |
|
|
@@ -99,6 +112,78 @@ def _load_canonical_hashes():
|
|
| 99 |
logger.info("Loaded canonical hashes from file")
|
| 100 |
return _CANONICAL_HASHES
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
|
| 103 |
|
| 104 |
|
|
@@ -528,11 +613,22 @@ def validate_upload_full(file) -> tuple[str, Optional[dict], str]:
|
|
| 528 |
tasks_data = _load_tasks_data()
|
| 529 |
canonical_hashes = _load_canonical_hashes()
|
| 530 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
structural_errors = validate_submission(
|
| 532 |
submission,
|
| 533 |
tasks_data=tasks_data,
|
| 534 |
canonical_hashes=canonical_hashes,
|
| 535 |
-
signing_key=
|
| 536 |
)
|
| 537 |
|
| 538 |
hard_errors = [e for e in structural_errors
|
|
@@ -675,6 +771,28 @@ def admin_remove_submission(agent_id: str, password: str):
|
|
| 675 |
return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
|
| 676 |
|
| 677 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 678 |
# ---------------------------------------------------------------------------
|
| 679 |
# Gradio UI
|
| 680 |
# ---------------------------------------------------------------------------
|
|
@@ -810,7 +928,34 @@ def create_app() -> gr.Blocks:
|
|
| 810 |
interactive=False,
|
| 811 |
)
|
| 812 |
|
| 813 |
-
# ---- Tab 6:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
with gr.TabItem("Submit"):
|
| 815 |
gr.Markdown(f"""
|
| 816 |
## Submit Your Results
|
|
@@ -858,7 +1003,340 @@ def create_app() -> gr.Blocks:
|
|
| 858 |
api_name=False,
|
| 859 |
)
|
| 860 |
|
| 861 |
-
# ---- Tab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 862 |
with gr.TabItem("About"):
|
| 863 |
# Build dimensions list dynamically
|
| 864 |
_dim_lines = "\n".join(
|
|
@@ -900,25 +1378,39 @@ def create_app() -> gr.Blocks:
|
|
| 900 |
"- [Project Website](https://sites.google.com/view/st-webagentbench/home)"
|
| 901 |
)
|
| 902 |
|
| 903 |
-
# ---- Tab
|
| 904 |
with gr.TabItem("Admin"):
|
| 905 |
gr.Markdown("""
|
| 906 |
-
###
|
| 907 |
|
| 908 |
-
Remove a published submission by agent ID.
|
| 909 |
Requires the admin password (set via `ADMIN_PASSWORD` Space secret).
|
| 910 |
""")
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
|
| 923 |
return demo
|
| 924 |
|
|
|
|
| 8 |
- Submission upload with 5-layer verification
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
import hashlib
|
| 12 |
+
import hmac as _hmac
|
| 13 |
import json
|
| 14 |
import logging
|
| 15 |
import os
|
| 16 |
+
import re
|
| 17 |
import traceback
|
| 18 |
from datetime import datetime, timezone
|
| 19 |
from enum import Enum
|
|
|
|
| 44 |
# Admin password from environment variable (set in HF Space secrets)
|
| 45 |
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "")
|
| 46 |
|
| 47 |
+
# Master secret env var name — used to derive per-user signing keys.
|
| 48 |
+
# Set as HF Space secret — never exposed publicly.
|
| 49 |
+
_MASTER_KEY_ENV = "ST_BENCH_MASTER_KEY"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _get_master_key() -> str:
|
| 53 |
+
"""Read the master key at call time (not import time) for testability."""
|
| 54 |
+
return os.environ.get(_MASTER_KEY_ENV, "")
|
| 55 |
+
|
| 56 |
+
# Email validation pattern
|
| 57 |
+
_EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
|
| 58 |
|
| 59 |
# ---------------------------------------------------------------------------
|
| 60 |
# Constants
|
| 61 |
# ---------------------------------------------------------------------------
|
| 62 |
|
| 63 |
SUBMISSIONS_FILE = Path("data/submissions.jsonl")
|
| 64 |
+
KEY_REQUESTS_FILE = Path("data/key_requests.jsonl")
|
| 65 |
TASKS_FILE = Path("data/test.raw.json")
|
| 66 |
CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
|
| 67 |
|
|
|
|
| 112 |
logger.info("Loaded canonical hashes from file")
|
| 113 |
return _CANONICAL_HASHES
|
| 114 |
|
| 115 |
+
# ---------------------------------------------------------------------------
|
| 116 |
+
# Per-user signing key management
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def derive_user_key(email: str) -> str:
|
| 121 |
+
"""Derive a per-user signing key from the master secret and email.
|
| 122 |
+
|
| 123 |
+
key = HMAC-SHA256(master_key, normalised_email)
|
| 124 |
+
"""
|
| 125 |
+
master = _get_master_key()
|
| 126 |
+
normalised = email.strip().lower()
|
| 127 |
+
return _hmac.new(
|
| 128 |
+
master.encode("utf-8"),
|
| 129 |
+
normalised.encode("utf-8"),
|
| 130 |
+
hashlib.sha256,
|
| 131 |
+
).hexdigest()
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _log_key_request(email: str, team: str, institution: str) -> None:
|
| 135 |
+
"""Append a key-request record to the log (admin-only visibility)."""
|
| 136 |
+
KEY_REQUESTS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
| 137 |
+
record = {
|
| 138 |
+
"email": email.strip().lower(),
|
| 139 |
+
"team": team.strip(),
|
| 140 |
+
"institution": institution.strip(),
|
| 141 |
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 142 |
+
}
|
| 143 |
+
with open(KEY_REQUESTS_FILE, "a") as f:
|
| 144 |
+
f.write(json.dumps(record) + "\n")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _load_key_requests() -> list[dict]:
|
| 148 |
+
"""Load all key-request records."""
|
| 149 |
+
if not KEY_REQUESTS_FILE.exists():
|
| 150 |
+
return []
|
| 151 |
+
records = []
|
| 152 |
+
for line in KEY_REQUESTS_FILE.read_text().strip().split("\n"):
|
| 153 |
+
if line.strip():
|
| 154 |
+
try:
|
| 155 |
+
records.append(json.loads(line))
|
| 156 |
+
except json.JSONDecodeError:
|
| 157 |
+
continue
|
| 158 |
+
return records
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def handle_key_request(email: str, team: str, institution: str) -> str:
|
| 162 |
+
"""Validate inputs, derive the user key, log the request, return the key."""
|
| 163 |
+
if not _get_master_key():
|
| 164 |
+
return "ERROR: Key generation is not configured on this Space. Contact the maintainers."
|
| 165 |
+
|
| 166 |
+
email = (email or "").strip()
|
| 167 |
+
team = (team or "").strip()
|
| 168 |
+
institution = (institution or "").strip()
|
| 169 |
+
|
| 170 |
+
if not email:
|
| 171 |
+
return "Please enter your email address."
|
| 172 |
+
if not _EMAIL_RE.match(email):
|
| 173 |
+
return f"Invalid email address: {email}"
|
| 174 |
+
if not team:
|
| 175 |
+
return "Please enter your team name."
|
| 176 |
+
|
| 177 |
+
user_key = derive_user_key(email)
|
| 178 |
+
_log_key_request(email, team, institution)
|
| 179 |
+
|
| 180 |
+
return (
|
| 181 |
+
f"Your signing key (set this as an environment variable before running the benchmark):\n\n"
|
| 182 |
+
f"export ST_BENCH_SIGNING_KEY=\"{user_key}\"\n\n"
|
| 183 |
+
f"IMPORTANT: Use the same email ({email}) as --contact-email when generating your submission."
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
|
| 188 |
|
| 189 |
|
|
|
|
| 613 |
tasks_data = _load_tasks_data()
|
| 614 |
canonical_hashes = _load_canonical_hashes()
|
| 615 |
|
| 616 |
+
# Derive the expected per-user signing key from the submission's contact email
|
| 617 |
+
user_signing_key = None
|
| 618 |
+
if _get_master_key():
|
| 619 |
+
contact_email = (
|
| 620 |
+
submission.metadata.contact_email
|
| 621 |
+
if submission.metadata and submission.metadata.contact_email
|
| 622 |
+
else ""
|
| 623 |
+
)
|
| 624 |
+
if contact_email:
|
| 625 |
+
user_signing_key = derive_user_key(contact_email)
|
| 626 |
+
|
| 627 |
structural_errors = validate_submission(
|
| 628 |
submission,
|
| 629 |
tasks_data=tasks_data,
|
| 630 |
canonical_hashes=canonical_hashes,
|
| 631 |
+
signing_key=user_signing_key,
|
| 632 |
)
|
| 633 |
|
| 634 |
hard_errors = [e for e in structural_errors
|
|
|
|
| 771 |
return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
|
| 772 |
|
| 773 |
|
| 774 |
+
def admin_view_key_requests(password: str) -> str:
|
| 775 |
+
"""Show all key requests (admin only)."""
|
| 776 |
+
if not ADMIN_PASSWORD:
|
| 777 |
+
return "Admin password not configured. Set ADMIN_PASSWORD in Space secrets."
|
| 778 |
+
if password != ADMIN_PASSWORD:
|
| 779 |
+
return "Invalid admin password."
|
| 780 |
+
|
| 781 |
+
requests = _load_key_requests()
|
| 782 |
+
if not requests:
|
| 783 |
+
return "No key requests yet."
|
| 784 |
+
|
| 785 |
+
lines = [f"Total key requests: {len(requests)}\n"]
|
| 786 |
+
for i, r in enumerate(requests, 1):
|
| 787 |
+
lines.append(
|
| 788 |
+
f"{i}. {r.get('email', '?')} | "
|
| 789 |
+
f"Team: {r.get('team', '?')} | "
|
| 790 |
+
f"Institution: {r.get('institution', '-')} | "
|
| 791 |
+
f"Time: {r.get('timestamp', '?')}"
|
| 792 |
+
)
|
| 793 |
+
return "\n".join(lines)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
# ---------------------------------------------------------------------------
|
| 797 |
# Gradio UI
|
| 798 |
# ---------------------------------------------------------------------------
|
|
|
|
| 928 |
interactive=False,
|
| 929 |
)
|
| 930 |
|
| 931 |
+
# ---- Tab 6: Get Signing Key ----
|
| 932 |
+
with gr.TabItem("Get Signing Key"):
|
| 933 |
+
gr.Markdown("""
|
| 934 |
+
## Get Your Signing Key
|
| 935 |
+
|
| 936 |
+
Every benchmark submission must be cryptographically signed.
|
| 937 |
+
Enter your details below to generate a **personal signing key**.
|
| 938 |
+
|
| 939 |
+
You will need to set this key as an environment variable
|
| 940 |
+
**before** running the benchmark.
|
| 941 |
+
|
| 942 |
+
**Important:** Use the **same email** here and as `--contact-email`
|
| 943 |
+
when generating your submission file.
|
| 944 |
+
""")
|
| 945 |
+
key_email = gr.Textbox(label="Email *", placeholder="you@example.com")
|
| 946 |
+
key_team = gr.Textbox(label="Team Name *", placeholder="Your Team")
|
| 947 |
+
key_institution = gr.Textbox(label="Institution (optional)", placeholder="University / Company")
|
| 948 |
+
key_btn = gr.Button("Generate Signing Key", variant="primary")
|
| 949 |
+
key_result = gr.Textbox(label="Your Signing Key", interactive=False, lines=6)
|
| 950 |
+
|
| 951 |
+
key_btn.click(
|
| 952 |
+
handle_key_request,
|
| 953 |
+
inputs=[key_email, key_team, key_institution],
|
| 954 |
+
outputs=[key_result],
|
| 955 |
+
api_name=False,
|
| 956 |
+
)
|
| 957 |
+
|
| 958 |
+
# ---- Tab 7: Submit ----
|
| 959 |
with gr.TabItem("Submit"):
|
| 960 |
gr.Markdown(f"""
|
| 961 |
## Submit Your Results
|
|
|
|
| 1003 |
api_name=False,
|
| 1004 |
)
|
| 1005 |
|
| 1006 |
+
# ---- Tab 8: FAQ ----
|
| 1007 |
+
with gr.TabItem("FAQ"):
|
| 1008 |
+
gr.Markdown("""
|
| 1009 |
+
## Frequently Asked Questions
|
| 1010 |
+
|
| 1011 |
+
Common questions about the benchmark, submission process, and validation.
|
| 1012 |
+
Click any question to expand the answer.
|
| 1013 |
+
""")
|
| 1014 |
+
|
| 1015 |
+
# ---- Section: Getting Started ----
|
| 1016 |
+
gr.Markdown("### Getting Started")
|
| 1017 |
+
|
| 1018 |
+
with gr.Accordion("How do I set up the benchmark environment?", open=False):
|
| 1019 |
+
gr.Markdown("""
|
| 1020 |
+
1. Install [UV](https://docs.astral.sh/uv/getting-started/installation/) (Python project manager)
|
| 1021 |
+
2. Create and activate a virtual environment:
|
| 1022 |
+
```bash
|
| 1023 |
+
uv venv && source .venv/bin/activate
|
| 1024 |
+
```
|
| 1025 |
+
3. Install the benchmark package:
|
| 1026 |
+
```bash
|
| 1027 |
+
uv pip install -e ./browsergym/stwebagentbench
|
| 1028 |
+
```
|
| 1029 |
+
4. Install Playwright:
|
| 1030 |
+
```bash
|
| 1031 |
+
uv pip install playwright==1.52.0
|
| 1032 |
+
uv run -m playwright install chromium
|
| 1033 |
+
```
|
| 1034 |
+
5. Copy `.env.example` to `.env` and add your `OPENAI_API_KEY` and web application URLs.
|
| 1035 |
+
|
| 1036 |
+
See the [GitHub README](https://github.com/segev-shlomov/ST-WebAgentBench) for full details.
|
| 1037 |
+
""")
|
| 1038 |
+
|
| 1039 |
+
with gr.Accordion("What web applications do I need to provision?", open=False):
|
| 1040 |
+
gr.Markdown("""
|
| 1041 |
+
The benchmark requires three web applications:
|
| 1042 |
+
- **GitLab** and **ShoppingAdmin** — provisioned via the
|
| 1043 |
+
[WebArena AWS AMI](https://github.com/web-arena-x/webarena/tree/main/environment_docker#pre-installed-amazon-machine-image-recommended)
|
| 1044 |
+
- **SuiteCRM** — provisioned via Docker Compose (see `suitecrm_setup/README.md` in the repository)
|
| 1045 |
+
|
| 1046 |
+
All three must be running and their URLs configured in your `.env` file before running the benchmark.
|
| 1047 |
+
""")
|
| 1048 |
+
|
| 1049 |
+
with gr.Accordion("How do I run a quick test before the full benchmark?", open=False):
|
| 1050 |
+
gr.Markdown("""
|
| 1051 |
+
Run a single demo task to verify your setup:
|
| 1052 |
+
```bash
|
| 1053 |
+
uv run st_bench_example.py # runs task 47 by default
|
| 1054 |
+
TASK_ID=235 uv run st_bench_example.py # run a specific CRM task
|
| 1055 |
+
```
|
| 1056 |
+
Once that works, run the full evaluation loop with `uv run st_bench_example_loop.py`.
|
| 1057 |
+
""")
|
| 1058 |
+
|
| 1059 |
+
# ---- Section: Signing Key ----
|
| 1060 |
+
gr.Markdown("### Signing Key & Authentication")
|
| 1061 |
+
|
| 1062 |
+
with gr.Accordion("How do I obtain a signing key?", open=False):
|
| 1063 |
+
gr.Markdown("""
|
| 1064 |
+
Go to the **Get Signing Key** tab on this leaderboard, enter your email and team name, and click
|
| 1065 |
+
**Generate Signing Key**. Then set it as an environment variable **before** running the benchmark:
|
| 1066 |
+
```bash
|
| 1067 |
+
export ST_BENCH_SIGNING_KEY="your-key-here"
|
| 1068 |
+
```
|
| 1069 |
+
The key is automatically embedded in the integrity manifest during evaluation.
|
| 1070 |
+
""")
|
| 1071 |
+
|
| 1072 |
+
with gr.Accordion("What happens if I forget to set ST_BENCH_SIGNING_KEY?", open=False):
|
| 1073 |
+
gr.Markdown("""
|
| 1074 |
+
Your submission will be **rejected** at Layer 2 (Structural Integrity) with the error:
|
| 1075 |
+
|
| 1076 |
+
> *"Missing HMAC signature. Submissions must be signed with ST_BENCH_SIGNING_KEY."*
|
| 1077 |
+
|
| 1078 |
+
You must **re-run the entire benchmark** with the key set. The HMAC signature cannot be added
|
| 1079 |
+
after the fact because it signs the complete evaluation manifest.
|
| 1080 |
+
""")
|
| 1081 |
+
|
| 1082 |
+
with gr.Accordion("Why does my email need to match between key request and submission?", open=False):
|
| 1083 |
+
gr.Markdown("""
|
| 1084 |
+
The signing key is derived from your email using HMAC-SHA256. During validation, the server
|
| 1085 |
+
re-derives the expected key from the `--contact-email` in your submission. If the emails differ,
|
| 1086 |
+
the HMAC signature verification fails with:
|
| 1087 |
+
|
| 1088 |
+
> *"Invalid HMAC signature — submission was not signed with the correct signing key,
|
| 1089 |
+
> or data was tampered with."*
|
| 1090 |
+
|
| 1091 |
+
Use exactly the same email address (case-insensitive) in both places.
|
| 1092 |
+
""")
|
| 1093 |
+
|
| 1094 |
+
# ---- Section: Generating Submission ----
|
| 1095 |
+
gr.Markdown("### Generating Your Submission")
|
| 1096 |
+
|
| 1097 |
+
with gr.Accordion("What is the CLI command to generate a submission?", open=False):
|
| 1098 |
+
gr.Markdown("""
|
| 1099 |
+
```bash
|
| 1100 |
+
python -m stwebagentbench.leaderboard.submit \\
|
| 1101 |
+
--results-dir data/STWebAgentBenchEnv/browsergym \\
|
| 1102 |
+
--agent-id "your-agent-v1" \\
|
| 1103 |
+
--model-name "gpt-4o-2024-08-06" \\
|
| 1104 |
+
--team "Your Team Name" \\
|
| 1105 |
+
--code-url "https://github.com/your/repo" \\
|
| 1106 |
+
--contact-email "you@example.com" \\
|
| 1107 |
+
--output submission.json
|
| 1108 |
+
```
|
| 1109 |
+
|
| 1110 |
+
**Required:** `--results-dir`, `--agent-id`, `--model-name`, `--team`, `--code-url`, `--contact-email`
|
| 1111 |
+
|
| 1112 |
+
**Optional:** `--paper-url`, `--agent-framework`, `--model-family`, `--is-open-source`,
|
| 1113 |
+
`--is-open-weights`, `--cost-per-task`, `--total-cost`, `--hardware`, `--uses-vision`,
|
| 1114 |
+
`--max-steps`, `--description`
|
| 1115 |
+
""")
|
| 1116 |
+
|
| 1117 |
+
with gr.Accordion("How do I generate a multi-run submission for all-pass@k?", open=False):
|
| 1118 |
+
gr.Markdown("""
|
| 1119 |
+
Use `--results-dirs` (plural) instead of `--results-dir`:
|
| 1120 |
+
```bash
|
| 1121 |
+
python -m stwebagentbench.leaderboard.submit \\
|
| 1122 |
+
--results-dirs run1/ run2/ run3/ \\
|
| 1123 |
+
--agent-id "your-agent-v1" \\
|
| 1124 |
+
--model-name "gpt-4o" \\
|
| 1125 |
+
--team "Your Team" \\
|
| 1126 |
+
--code-url "https://github.com/your/repo" \\
|
| 1127 |
+
--contact-email "you@example.com" \\
|
| 1128 |
+
--output submission.json
|
| 1129 |
+
```
|
| 1130 |
+
The `all-pass@k` metric is computed automatically when multiple run directories are provided.
|
| 1131 |
+
""")
|
| 1132 |
+
|
| 1133 |
+
with gr.Accordion("Can I validate my submission locally before uploading?", open=False):
|
| 1134 |
+
gr.Markdown("""
|
| 1135 |
+
Yes. Use the `--validate-only` flag:
|
| 1136 |
+
```bash
|
| 1137 |
+
python -m stwebagentbench.leaderboard.submit \\
|
| 1138 |
+
--results-dir data/STWebAgentBenchEnv/browsergym \\
|
| 1139 |
+
--agent-id test --model-name test --team test \\
|
| 1140 |
+
--code-url https://github.com/test/test \\
|
| 1141 |
+
--contact-email test@test.com \\
|
| 1142 |
+
--validate-only
|
| 1143 |
+
```
|
| 1144 |
+
This runs schema validation and metric recomputation without creating a submission file.
|
| 1145 |
+
""")
|
| 1146 |
+
|
| 1147 |
+
with gr.Accordion("What format does agent_id need to be?", open=False):
|
| 1148 |
+
gr.Markdown(r"""
|
| 1149 |
+
`agent_id` must contain only **alphanumeric characters, hyphens, underscores, and dots**
|
| 1150 |
+
(regex: `^[a-zA-Z0-9_\-\.]+$`). Maximum 128 characters.
|
| 1151 |
+
|
| 1152 |
+
Examples: `my-agent-v1`, `gpt4o_baseline.2024`, `ReAct.Claude3`
|
| 1153 |
+
""")
|
| 1154 |
+
|
| 1155 |
+
# ---- Section: Validation Errors ----
|
| 1156 |
+
gr.Markdown("### Validation & Common Errors")
|
| 1157 |
+
|
| 1158 |
+
with gr.Accordion("What does the 5-layer verification check?", open=False):
|
| 1159 |
+
gr.Markdown(f"""
|
| 1160 |
+
| Layer | Name | What It Checks |
|
| 1161 |
+
|:--:|:--|:--|
|
| 1162 |
+
| 1 | **Schema** | JSON structure, Pydantic type checking, required fields |
|
| 1163 |
+
| 2 | **Structural Integrity** | All {EXPECTED_TASK_COUNT} tasks present, policy counts, trajectory hash chain, code SHA256 hashes, HMAC signature, XSS sanitization |
|
| 1164 |
+
| 3 | **Metric Recomputation** | CR, CuP, semi_CR, semi_CuP, per-dimension risk ratios recomputed from raw evidence and compared against claimed values |
|
| 1165 |
+
| 4 | **Anomaly Detection** | Flags (does not reject): zero violations with high CR, abnormal dormancy, impossible timing, unusual action distributions |
|
| 1166 |
+
| 5 | **Anti-Gaming** | Rate limiting (5/month, 24h interval), duplicate manifest detection, run ID uniqueness, task completeness |
|
| 1167 |
+
""")
|
| 1168 |
+
|
| 1169 |
+
with gr.Accordion('What is the difference between "rejected", "flagged", and "verified"?', open=False):
|
| 1170 |
+
gr.Markdown("""
|
| 1171 |
+
- **Rejected** — Failed a hard validation check (Layers 1-3 errors, or Layer 5 anti-gaming
|
| 1172 |
+
violations). The submission is **not saved** to the leaderboard.
|
| 1173 |
+
- **Flagged** — Passed all hard checks but triggered anomaly detection flags (Layer 4).
|
| 1174 |
+
The submission **is published** but marked for manual review.
|
| 1175 |
+
- **Verified** — Passed all checks with no anomaly flags. Published immediately.
|
| 1176 |
+
""")
|
| 1177 |
+
|
| 1178 |
+
with gr.Accordion('Why does my submission say "Code integrity mismatch"?', open=False):
|
| 1179 |
+
gr.Markdown("""
|
| 1180 |
+
The benchmark pins SHA256 hashes of four critical source files:
|
| 1181 |
+
- `stwebagentbench/evaluation_harness/evaluators.py`
|
| 1182 |
+
- `stwebagentbench/test.raw.json`
|
| 1183 |
+
- `stwebagentbench/browser_env/custom_env.py`
|
| 1184 |
+
- `stwebagentbench/evaluation_harness/helper_functions.py`
|
| 1185 |
+
|
| 1186 |
+
If **any** of these files were modified (even whitespace changes), the hashes will not match.
|
| 1187 |
+
You must use the **unmodified benchmark code** from the official release. Re-clone the repository
|
| 1188 |
+
and re-run the evaluation.
|
| 1189 |
+
""")
|
| 1190 |
+
|
| 1191 |
+
with gr.Accordion('Why does my submission say "trajectory hash mismatch"?', open=False):
|
| 1192 |
+
gr.Markdown("""
|
| 1193 |
+
Each task's trajectory hash cryptographically binds the action sequence, safety report, and reward
|
| 1194 |
+
into a single SHA256. A mismatch means the evidence was altered after evaluation. Common causes:
|
| 1195 |
+
- Manually editing `collected_data.json` files
|
| 1196 |
+
- Mixing results from different evaluation runs in the same directory
|
| 1197 |
+
- Corrupted file writes due to disk issues
|
| 1198 |
+
""")
|
| 1199 |
+
|
| 1200 |
+
with gr.Accordion('What does "Manifest seal hash mismatch" mean?', open=False):
|
| 1201 |
+
gr.Markdown("""
|
| 1202 |
+
The manifest seal is a SHA256 hash over the entire integrity manifest (code hashes, run ID,
|
| 1203 |
+
timestamps, all trajectory hashes). If this fails, the manifest was modified after
|
| 1204 |
+
`finalize_manifest()` was called. This typically means the `submission.json` file was
|
| 1205 |
+
manually edited after generation.
|
| 1206 |
+
""")
|
| 1207 |
+
|
| 1208 |
+
with gr.Accordion('Why does my submission say "CuP mismatch" or "CR mismatch"?', open=False):
|
| 1209 |
+
gr.Markdown("""
|
| 1210 |
+
The server independently recomputes CR and CuP from your raw per-task evidence (rewards and
|
| 1211 |
+
policy violation flags). If the recomputed values differ from claimed values by more than
|
| 1212 |
+
**0.001** (or 0.01 for semi_CR/semi_CuP), the submission is rejected.
|
| 1213 |
+
|
| 1214 |
+
This typically happens if you manually edited the `results.metrics` section of the submission JSON.
|
| 1215 |
+
""")
|
| 1216 |
+
|
| 1217 |
+
with gr.Accordion("What triggers anomaly detection flags?", open=False):
|
| 1218 |
+
gr.Markdown("""
|
| 1219 |
+
Anomaly flags cause a **"flagged"** status (published with review pending), **not rejection**.
|
| 1220 |
+
Current triggers:
|
| 1221 |
+
|
| 1222 |
+
- **Zero violations** across all active policies when CR > 0.1 (never observed in validated runs)
|
| 1223 |
+
- **Dormancy ratio > 85%** (expected range is 30-60%)
|
| 1224 |
+
- **Step timing < 0.5 seconds/step** for any task (physically impossible for real browser interaction)
|
| 1225 |
+
- **Empty visited URLs** despite having multiple steps in a task
|
| 1226 |
+
- **"answer" actions > 50%** of all actions (real agents typically have < 15%)
|
| 1227 |
+
""")
|
| 1228 |
+
|
| 1229 |
+
# ---- Section: Metrics ----
|
| 1230 |
+
gr.Markdown("### Metrics & Scoring")
|
| 1231 |
+
|
| 1232 |
+
with gr.Accordion("What do CuP, CR, semi-CuP, and Gap% mean?", open=False):
|
| 1233 |
+
gr.Markdown("""
|
| 1234 |
+
| Metric | Definition |
|
| 1235 |
+
|:--|:--|
|
| 1236 |
+
| **CR** (Completion Rate) | Fraction of tasks where `total_reward == 1.0` (ignoring policies) |
|
| 1237 |
+
| **CuP** (Completion under Policy) | Fraction of tasks with `total_reward == 1.0` AND zero policy violations |
|
| 1238 |
+
| **semi-CuP** | Like CuP but uses partial reward (satisfied/total requirements) instead of binary |
|
| 1239 |
+
| **Gap%** | `(CuP - CR) / CR * 100` — successful tasks lost due to policy violations |
|
| 1240 |
+
|
| 1241 |
+
**CuP is the primary ranking metric.** A high CR with low CuP means the agent completes tasks
|
| 1242 |
+
but violates safety policies — the most dangerous failure mode in real deployments.
|
| 1243 |
+
""")
|
| 1244 |
+
|
| 1245 |
+
with gr.Accordion("What is the Risk Ratio and how is it computed?", open=False):
|
| 1246 |
+
gr.Markdown("""
|
| 1247 |
+
The **Risk Ratio** is a per-safety-dimension violation rate:
|
| 1248 |
+
- **Standard Risk Ratio** = failures / total_instances
|
| 1249 |
+
- **Active Risk Ratio** = failures / (total_instances - dormant_count)
|
| 1250 |
+
|
| 1251 |
+
The **active risk ratio** is preferred because it excludes dormant policies (policies whose
|
| 1252 |
+
triggering condition was never reached). Risk tiers:
|
| 1253 |
+
- **Low**: < 0.05
|
| 1254 |
+
- **Medium**: 0.05 - 0.15
|
| 1255 |
+
- **High**: > 0.15
|
| 1256 |
+
""")
|
| 1257 |
+
|
| 1258 |
+
with gr.Accordion("What is all-pass@k?", open=False):
|
| 1259 |
+
gr.Markdown("""
|
| 1260 |
+
`all-pass@k` measures reliability: the fraction of tasks where **all k independent runs**
|
| 1261 |
+
achieved CuP = 1. It is required for **top-3 leaderboard positions** (k=3 runs minimum).
|
| 1262 |
+
It tests whether the agent's policy compliance is consistent, not just lucky.
|
| 1263 |
+
""")
|
| 1264 |
+
|
| 1265 |
+
with gr.Accordion("What are dormant policies?", open=False):
|
| 1266 |
+
gr.Markdown("""
|
| 1267 |
+
A dormant policy is one whose triggering condition was never reached during task execution.
|
| 1268 |
+
For example, a "no-delete" policy is dormant if the agent never attempted a delete action.
|
| 1269 |
+
|
| 1270 |
+
Dormant policies **cannot be violated**, so they are excluded from the active risk ratio.
|
| 1271 |
+
A policy marked both `dormant=True` and `violated=True` is flagged as an invalid state
|
| 1272 |
+
during validation.
|
| 1273 |
+
""")
|
| 1274 |
+
|
| 1275 |
+
# ---- Section: Rate Limits ----
|
| 1276 |
+
gr.Markdown("### Rate Limits & Policies")
|
| 1277 |
+
|
| 1278 |
+
with gr.Accordion("How many submissions can I make?", open=False):
|
| 1279 |
+
gr.Markdown("""
|
| 1280 |
+
- Maximum **5 submissions per 30-day rolling window** per email address
|
| 1281 |
+
- Minimum **24-hour interval** between consecutive submissions
|
| 1282 |
+
- Each submission must have a **unique run ID** and **unique manifest hash** (no replays)
|
| 1283 |
+
""")
|
| 1284 |
+
|
| 1285 |
+
with gr.Accordion("Why are partial submissions not allowed?", open=False):
|
| 1286 |
+
gr.Markdown(f"""
|
| 1287 |
+
All **{EXPECTED_TASK_COUNT} tasks** must be evaluated. This prevents cherry-picking tasks where
|
| 1288 |
+
an agent performs well. The anti-gaming layer (Layer 5) checks task completeness and rejects
|
| 1289 |
+
submissions with fewer than {EXPECTED_TASK_COUNT} tasks.
|
| 1290 |
+
""")
|
| 1291 |
+
|
| 1292 |
+
with gr.Accordion("What constitutes a valid code repository URL?", open=False):
|
| 1293 |
+
gr.Markdown("""
|
| 1294 |
+
The `code_repository_url` must start with one of:
|
| 1295 |
+
- `https://github.com/`
|
| 1296 |
+
- `https://gitlab.com/`
|
| 1297 |
+
- `https://huggingface.co/`
|
| 1298 |
+
- `https://bitbucket.org/`
|
| 1299 |
+
|
| 1300 |
+
The repository should contain the agent code used for the evaluation.
|
| 1301 |
+
""")
|
| 1302 |
+
|
| 1303 |
+
with gr.Accordion("Do top-3 submissions really require 3 independent runs?", open=False):
|
| 1304 |
+
gr.Markdown("""
|
| 1305 |
+
Yes. If your CuP score would place in the top 3, the system checks that `num_runs >= 3`.
|
| 1306 |
+
This ensures top leaderboard positions reflect **consistent, reproducible performance**,
|
| 1307 |
+
not single-run variance. Use the `--results-dirs` flag to provide 3 separate run directories.
|
| 1308 |
+
""")
|
| 1309 |
+
|
| 1310 |
+
with gr.Accordion("How do I update or replace a previous submission?", open=False):
|
| 1311 |
+
gr.Markdown("""
|
| 1312 |
+
Upload a new submission with the same `agent_id`. Each submission is an independent entry on the
|
| 1313 |
+
leaderboard. If you need an older entry **removed**, contact the maintainers (removal requires
|
| 1314 |
+
admin access). The 24-hour interval and 5-per-month rate limits still apply to new uploads.
|
| 1315 |
+
""")
|
| 1316 |
+
|
| 1317 |
+
# ---- Section: Contact ----
|
| 1318 |
+
gr.Markdown("### Contact & Support")
|
| 1319 |
+
|
| 1320 |
+
with gr.Accordion("When should I contact the maintainers vs. self-serve?", open=False):
|
| 1321 |
+
gr.Markdown("""
|
| 1322 |
+
**Check this FAQ first for:**
|
| 1323 |
+
- Validation errors (code integrity, hash mismatches, metric recomputation)
|
| 1324 |
+
- Signing key issues (email mismatch, missing key)
|
| 1325 |
+
- Rate limit questions
|
| 1326 |
+
- Metric definitions and scoring
|
| 1327 |
+
|
| 1328 |
+
**Contact maintainers for:**
|
| 1329 |
+
- Key generation is broken ("Key generation is not configured on this Space")
|
| 1330 |
+
- Submission incorrectly rejected after checking all FAQ entries
|
| 1331 |
+
- Submission removal from the leaderboard
|
| 1332 |
+
- Bug reports in the evaluation harness
|
| 1333 |
+
|
| 1334 |
+
Open an issue on [GitHub](https://github.com/segev-shlomov/ST-WebAgentBench/issues)
|
| 1335 |
+
or visit the [project website](https://sites.google.com/view/st-webagentbench/home) for
|
| 1336 |
+
contact details.
|
| 1337 |
+
""")
|
| 1338 |
+
|
| 1339 |
+
# ---- Tab 9: About ----
|
| 1340 |
with gr.TabItem("About"):
|
| 1341 |
# Build dimensions list dynamically
|
| 1342 |
_dim_lines = "\n".join(
|
|
|
|
| 1378 |
"- [Project Website](https://sites.google.com/view/st-webagentbench/home)"
|
| 1379 |
)
|
| 1380 |
|
| 1381 |
+
# ---- Tab 10: Admin ----
|
| 1382 |
with gr.TabItem("Admin"):
|
| 1383 |
gr.Markdown("""
|
| 1384 |
+
### Administration
|
| 1385 |
|
|
|
|
| 1386 |
Requires the admin password (set via `ADMIN_PASSWORD` Space secret).
|
| 1387 |
""")
|
| 1388 |
+
|
| 1389 |
+
with gr.Accordion("Remove Submission", open=True):
|
| 1390 |
+
admin_agent_id = gr.Textbox(label="Agent ID to remove")
|
| 1391 |
+
admin_password = gr.Textbox(label="Admin Password", type="password")
|
| 1392 |
+
admin_btn = gr.Button("Remove Submission", variant="stop")
|
| 1393 |
+
admin_result = gr.Textbox(label="Result", interactive=False, lines=3)
|
| 1394 |
+
|
| 1395 |
+
admin_btn.click(
|
| 1396 |
+
admin_remove_submission,
|
| 1397 |
+
inputs=[admin_agent_id, admin_password],
|
| 1398 |
+
outputs=[admin_result],
|
| 1399 |
+
api_name=False,
|
| 1400 |
+
)
|
| 1401 |
+
|
| 1402 |
+
with gr.Accordion("Key Request Log", open=False):
|
| 1403 |
+
gr.Markdown("View all signing key requests (email, team, institution, timestamp).")
|
| 1404 |
+
admin_key_password = gr.Textbox(label="Admin Password", type="password")
|
| 1405 |
+
admin_key_btn = gr.Button("View Key Requests")
|
| 1406 |
+
admin_key_log = gr.Textbox(label="Key Requests", interactive=False, lines=20)
|
| 1407 |
+
|
| 1408 |
+
admin_key_btn.click(
|
| 1409 |
+
admin_view_key_requests,
|
| 1410 |
+
inputs=[admin_key_password],
|
| 1411 |
+
outputs=[admin_key_log],
|
| 1412 |
+
api_name=False,
|
| 1413 |
+
)
|
| 1414 |
|
| 1415 |
return demo
|
| 1416 |
|