dolev31 committed on
Commit
be68cd2
·
1 Parent(s): bd7e5b6

Fix 5 submission pipeline issues: sync, IDs, locking, admin targeting

Browse files

1. Auto-sync deploy script (scripts/deploy_space.py) — copies test.raw.json
and computes canonical_hashes.json before deploying, eliminating stale
data. Tests catch drift before deploy.

2. Task ID validation uses actual IDs from test.raw.json instead of
assuming contiguous range(0, N). Both packages updated.

3. File locking (fcntl.flock) on SUBMISSIONS_FILE writes prevents
corruption from concurrent uploads.

4. Admin remove now accepts optional run_id to target a specific
submission. When multiple match by agent_id alone, lists them with
run_id/date/CuP so admin can pick one.

5. Deploy sync tests (TestDeploySync) verify test.raw.json and
canonical_hashes.json are in sync before any deploy.

app.py CHANGED
@@ -1276,10 +1276,16 @@ def load_submissions() -> list[dict]:
1276
 
1277
 
1278
  def save_submission(submission: dict) -> None:
1279
- """Append a submission to the JSONL data file."""
 
1280
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
1281
  with open(SUBMISSIONS_FILE, "a") as f:
1282
- f.write(json.dumps(submission) + "\n")
 
 
 
 
 
1283
 
1284
 
1285
  # ---------------------------------------------------------------------------
@@ -1845,25 +1851,67 @@ def process_upload(file):
1845
  )
1846
 
1847
 
1848
- def admin_remove_submission(agent_id: str, session_token: str):
1849
- """Remove a submission by agent_id (session-gated)."""
 
 
 
 
 
1850
  if not _verify_session(session_token):
1851
  return "Session expired — please log in again."
1852
- if not agent_id or not agent_id.strip():
1853
- return "Please enter an agent_id."
1854
 
1855
- subs = load_submissions()
1856
- filtered = [s for s in subs if s.get("metadata", {}).get("agent_id") != agent_id.strip()]
 
 
 
1857
 
1858
- if len(filtered) == len(subs):
1859
- return f"No submission found with agent_id '{agent_id}'."
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  removed = len(subs) - len(filtered)
1862
- SUBMISSIONS_FILE.write_text(
1863
- "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1864
- )
1865
- _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1866
- return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
 
 
 
 
 
 
 
 
 
 
1867
 
1868
 
1869
  def admin_build_key_dashboard(session_token: str):
@@ -2734,13 +2782,14 @@ contact details.
2734
  f"*Session active. All actions below are authenticated.*")
2735
 
2736
  with gr.Accordion("Remove Submission", open=True):
2737
- admin_agent_id = gr.Textbox(label="Agent ID to remove")
 
2738
  admin_btn = gr.Button("Remove Submission", variant="stop")
2739
- admin_result = gr.Textbox(label="Result", interactive=False, lines=3)
2740
 
2741
  admin_btn.click(
2742
  admin_remove_submission,
2743
- inputs=[admin_agent_id, admin_session],
2744
  outputs=[admin_result],
2745
  api_name=False,
2746
  )
 
1276
 
1277
 
1278
def save_submission(submission: dict) -> None:
    """Append a submission to the JSONL data file (with file locking).

    Holds an exclusive advisory lock (``fcntl.flock``) for the duration of
    the append so concurrent uploads cannot interleave partial lines.

    Args:
        submission: JSON-serializable submission record to persist.
    """
    import fcntl

    SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
    record = json.dumps(submission) + "\n"
    with open(SUBMISSIONS_FILE, "a") as fh:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        try:
            fh.write(record)
            fh.flush()  # make the line visible before releasing the lock
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
1289
 
1290
 
1291
  # ---------------------------------------------------------------------------
 
1851
  )
1852
 
1853
 
1854
def admin_remove_submission(agent_id: str, run_id: str, session_token: str):
    """Remove submission(s) by agent_id and/or run_id (session-gated).

    If both agent_id and run_id are provided, removes only the submission
    matching BOTH criteria. If only agent_id is provided and several
    submissions match, nothing is removed — the matches are listed with
    their run_ids so the admin can target a specific one.

    Args:
        agent_id: Agent identifier (under ``metadata.agent_id``); may be
            empty when run_id is given.
        run_id: Optional run identifier (under ``integrity.run_id``).
        session_token: Admin session token; every action is gated on it.

    Returns:
        A human-readable status string describing what was removed (or
        why nothing was).
    """
    if not _verify_session(session_token):
        return "Session expired — please log in again."

    agent_id = (agent_id or "").strip()
    run_id = (run_id or "").strip()

    if not agent_id and not run_id:
        return "Please enter an agent_id or run_id (or both)."

    subs = load_submissions()

    def matches(s):
        # agent_id lives under "metadata", run_id under "integrity".
        s_agent = s.get("metadata", {}).get("agent_id", "")
        s_run = s.get("integrity", {}).get("run_id", "")
        if agent_id and run_id:
            return s_agent == agent_id and s_run == run_id
        if run_id:
            return s_run == run_id
        return s_agent == agent_id

    matching = [s for s in subs if matches(s)]

    if not matching:
        return f"No submission found matching agent_id='{agent_id}', run_id='{run_id}'."

    # Ambiguous target: never bulk-delete on agent_id alone — list the
    # candidates instead. (BUGFIX: the previous closing hint said "leave
    # run_id empty to remove ALL", but this branch always lists and never
    # deletes, so that instruction was unreachable and misleading.)
    if len(matching) > 1 and not run_id:
        lines = [f"Found {len(matching)} submissions for '{agent_id}'. "
                 f"Specify a run_id to remove a specific one:\n"]
        for s in matching:
            s_run = s.get("integrity", {}).get("run_id", "?")
            s_date = s.get("submission_date", "?")[:10]
            s_cup = s.get("results", {}).get("metrics", {}).get("CuP", "?")
            lines.append(f"  run_id={s_run[:12]}... date={s_date} CuP={s_cup}")
        lines.append("\nNothing was removed.")
        return "\n".join(lines)

    filtered = [s for s in subs if not matches(s)]
    removed = len(subs) - len(filtered)

    # Rewrite the JSONL file under an exclusive lock. BUGFIX: the file is
    # opened read-write and truncated only AFTER the lock is acquired —
    # open(..., "w") would truncate immediately, clobbering a concurrent
    # locked append before we ourselves hold the lock.
    import fcntl
    import os
    fd = os.open(SUBMISSIONS_FILE, os.O_RDWR | os.O_CREAT)
    with os.fdopen(fd, "r+") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        try:
            f.seek(0)
            f.write("\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else ""))
            f.truncate()
            f.flush()
        finally:
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)

    detail = f"agent_id={agent_id}" if agent_id else ""
    if run_id:
        detail += f"{', ' if detail else ''}run_id={run_id}"
    _log_admin_action("remove_submission", f"Removed {removed} submission(s): {detail}")
    return f"Removed {removed} submission(s) ({detail})."
1915
 
1916
 
1917
  def admin_build_key_dashboard(session_token: str):
 
2782
  f"*Session active. All actions below are authenticated.*")
2783
 
2784
  with gr.Accordion("Remove Submission", open=True):
2785
+ admin_agent_id = gr.Textbox(label="Agent ID (matches all if run_id empty)")
2786
+ admin_run_id = gr.Textbox(label="Run ID (optional — target a specific submission)")
2787
  admin_btn = gr.Button("Remove Submission", variant="stop")
2788
+ admin_result = gr.Textbox(label="Result", interactive=False, lines=8)
2789
 
2790
  admin_btn.click(
2791
  admin_remove_submission,
2792
+ inputs=[admin_agent_id, admin_run_id, admin_session],
2793
  outputs=[admin_result],
2794
  api_name=False,
2795
  )
data/canonical_hashes.json CHANGED
@@ -3,6 +3,6 @@
3
  "evaluators_sha256": "1ecb7e511d25fe0dc4aaf6fd887eb108d12e293d9b90629630745300f9733cf5",
4
  "task_config_sha256": "5119d99c758a46100cc678d8193659b43c3174e7e295a7887e0b07f877f131b5",
5
  "custom_env_sha256": "7e6ef6e3fb8e75cd46c8c00a038524e73ff37829584b1f47d34b237eb2181ca8",
6
- "helper_functions_sha256": "3ed7169b7c5bb734b13c669c06b5f977a448a66c7d9eb41cbb32d7f7d16cb845"
7
  }
8
  }
 
3
  "evaluators_sha256": "1ecb7e511d25fe0dc4aaf6fd887eb108d12e293d9b90629630745300f9733cf5",
4
  "task_config_sha256": "5119d99c758a46100cc678d8193659b43c3174e7e295a7887e0b07f877f131b5",
5
  "custom_env_sha256": "7e6ef6e3fb8e75cd46c8c00a038524e73ff37829584b1f47d34b237eb2181ca8",
6
+ "helper_functions_sha256": "5a4639ab99485241e38fb2652670cf555e2d373105a0b9e3052f06e16576ac07"
7
  }
8
  }
data/test.raw.json CHANGED
The diff for this file is too large to render. See raw diff
 
validation/schema.py CHANGED
@@ -79,13 +79,16 @@ def _load_benchmark_config() -> tuple:
79
  if tier and group:
80
  tier_config.setdefault(group, {}).setdefault(tier, []).append(t["task_id"])
81
 
 
 
 
82
  logger.info(
83
  "Loaded benchmark config: %d tasks, %d policies, %d dimensions, "
84
  "%d web apps, %d tier groups",
85
  task_count, policy_count, len(safety_dims),
86
  len(web_applications), len(tier_config),
87
  )
88
- return task_count, policy_count, safety_dims, dim_display, web_applications, tier_config
89
 
90
 
91
  (
@@ -95,6 +98,7 @@ def _load_benchmark_config() -> tuple:
95
  DIMENSION_DISPLAY,
96
  WEB_APPLICATIONS,
97
  TIER_CONFIG,
 
98
  ) = _load_benchmark_config()
99
 
100
 
 
79
  if tier and group:
80
  tier_config.setdefault(group, {}).setdefault(tier, []).append(t["task_id"])
81
 
82
+ # Extract actual task IDs (don't assume 0..N-1)
83
+ task_ids = sorted(t["task_id"] for t in tasks)
84
+
85
  logger.info(
86
  "Loaded benchmark config: %d tasks, %d policies, %d dimensions, "
87
  "%d web apps, %d tier groups",
88
  task_count, policy_count, len(safety_dims),
89
  len(web_applications), len(tier_config),
90
  )
91
+ return task_count, policy_count, safety_dims, dim_display, web_applications, tier_config, task_ids
92
 
93
 
94
  (
 
98
  DIMENSION_DISPLAY,
99
  WEB_APPLICATIONS,
100
  TIER_CONFIG,
101
+ EXPECTED_TASK_IDS,
102
  ) = _load_benchmark_config()
103
 
104
 
validation/validate.py CHANGED
@@ -20,6 +20,7 @@ from validation.integrity import (
20
  from validation.schema import (
21
  EXPECTED_POLICY_COUNT,
22
  EXPECTED_TASK_COUNT,
 
23
  Submission,
24
  )
25
 
@@ -109,7 +110,7 @@ def validate_submission(
109
 
110
  # ---- Task completeness ----
111
  submitted_ids = {te.task_id for te in submission.task_evidence}
112
- expected_ids = set(range(EXPECTED_TASK_COUNT))
113
 
114
  missing = expected_ids - submitted_ids
115
  if missing:
 
20
  from validation.schema import (
21
  EXPECTED_POLICY_COUNT,
22
  EXPECTED_TASK_COUNT,
23
+ EXPECTED_TASK_IDS,
24
  Submission,
25
  )
26
 
 
110
 
111
  # ---- Task completeness ----
112
  submitted_ids = {te.task_id for te in submission.task_evidence}
113
+ expected_ids = set(EXPECTED_TASK_IDS)
114
 
115
  missing = expected_ids - submitted_ids
116
  if missing: