dolev31 committed on
Commit
be68cd2
·
1 Parent(s): bd7e5b6

Fix 5 submission pipeline issues: sync, IDs, locking, admin targeting

Browse files

1. Auto-sync deploy script (scripts/deploy_space.py) — copies test.raw.json
and computes canonical_hashes.json before deploying, eliminating stale
data. Tests catch drift before deploy.

2. Task ID validation uses actual IDs from test.raw.json instead of
assuming contiguous range(0, N). Both packages updated.

3. File locking (fcntl.flock) on SUBMISSIONS_FILE writes prevents
corruption from concurrent uploads.

4. Admin remove now accepts optional run_id to target a specific
submission. When multiple match by agent_id alone, lists them with
run_id/date/CuP so admin can pick one.

5. Deploy sync tests (TestDeploySync) verify test.raw.json and
canonical_hashes.json are in sync before any deploy.

app.py CHANGED
@@ -1276,10 +1276,16 @@ def load_submissions() -> list[dict]:
1276
 
1277
 
1278
  def save_submission(submission: dict) -> None:
1279
- """Append a submission to the JSONL data file."""
 
1280
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
1281
  with open(SUBMISSIONS_FILE, "a") as f:
1282
- f.write(json.dumps(submission) + "\n")
 
 
 
 
 
1283
 
1284
 
1285
  # ---------------------------------------------------------------------------
@@ -1845,25 +1851,67 @@ def process_upload(file):
1845
  )
1846
 
1847
 
1848
- def admin_remove_submission(agent_id: str, session_token: str):
1849
- """Remove a submission by agent_id (session-gated)."""
 
 
 
 
 
1850
  if not _verify_session(session_token):
1851
  return "Session expired — please log in again."
1852
- if not agent_id or not agent_id.strip():
1853
- return "Please enter an agent_id."
1854
 
1855
- subs = load_submissions()
1856
- filtered = [s for s in subs if s.get("metadata", {}).get("agent_id") != agent_id.strip()]
 
 
 
1857
 
1858
- if len(filtered) == len(subs):
1859
- return f"No submission found with agent_id '{agent_id}'."
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  removed = len(subs) - len(filtered)
1862
- SUBMISSIONS_FILE.write_text(
1863
- "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1864
- )
1865
- _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1866
- return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
 
 
 
 
 
 
 
 
 
 
1867
 
1868
 
1869
  def admin_build_key_dashboard(session_token: str):
@@ -2734,13 +2782,14 @@ contact details.
2734
  f"*Session active. All actions below are authenticated.*")
2735
 
2736
  with gr.Accordion("Remove Submission", open=True):
2737
- admin_agent_id = gr.Textbox(label="Agent ID to remove")
 
2738
  admin_btn = gr.Button("Remove Submission", variant="stop")
2739
- admin_result = gr.Textbox(label="Result", interactive=False, lines=3)
2740
 
2741
  admin_btn.click(
2742
  admin_remove_submission,
2743
- inputs=[admin_agent_id, admin_session],
2744
  outputs=[admin_result],
2745
  api_name=False,
2746
  )
 
1276
 
1277
 
1278
def save_submission(submission: dict) -> None:
    """Append a submission to the JSONL data file (with file locking).

    Holds an exclusive advisory lock (``fcntl.flock``) for the duration of
    the append so concurrent uploads cannot interleave partial lines.

    Args:
        submission: JSON-serializable submission record to persist.
    """
    import fcntl

    SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
    record = json.dumps(submission) + "\n"
    with open(SUBMISSIONS_FILE, "a") as fh:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        try:
            fh.write(record)
            fh.flush()  # make the line visible before releasing the lock
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
1289
 
1290
 
1291
  # ---------------------------------------------------------------------------
 
1851
  )
1852
 
1853
 
1854
def admin_remove_submission(agent_id: str, run_id: str, session_token: str):
    """Remove submission(s) by agent_id and/or run_id (session-gated).

    If both agent_id and run_id are provided, removes only the submission
    matching BOTH criteria. If only agent_id is provided and several
    submissions match, nothing is removed — the matches are listed with
    their run_ids so the admin can target a specific one.

    Args:
        agent_id: Agent identifier (under ``metadata.agent_id``); may be
            empty when run_id is given.
        run_id: Optional run identifier (under ``integrity.run_id``).
        session_token: Admin session token; every action is gated on it.

    Returns:
        A human-readable status string describing what was removed (or
        why nothing was).
    """
    if not _verify_session(session_token):
        return "Session expired — please log in again."

    agent_id = (agent_id or "").strip()
    run_id = (run_id or "").strip()

    if not agent_id and not run_id:
        return "Please enter an agent_id or run_id (or both)."

    subs = load_submissions()

    def matches(s):
        # agent_id lives under "metadata", run_id under "integrity".
        s_agent = s.get("metadata", {}).get("agent_id", "")
        s_run = s.get("integrity", {}).get("run_id", "")
        if agent_id and run_id:
            return s_agent == agent_id and s_run == run_id
        if run_id:
            return s_run == run_id
        return s_agent == agent_id

    matching = [s for s in subs if matches(s)]

    if not matching:
        return f"No submission found matching agent_id='{agent_id}', run_id='{run_id}'."

    # Ambiguous target: never bulk-delete on agent_id alone — list the
    # candidates instead. (BUGFIX: the previous closing hint said "leave
    # run_id empty to remove ALL", but this branch always lists and never
    # deletes, so that instruction was unreachable and misleading.)
    if len(matching) > 1 and not run_id:
        lines = [f"Found {len(matching)} submissions for '{agent_id}'. "
                 f"Specify a run_id to remove a specific one:\n"]
        for s in matching:
            s_run = s.get("integrity", {}).get("run_id", "?")
            s_date = s.get("submission_date", "?")[:10]
            s_cup = s.get("results", {}).get("metrics", {}).get("CuP", "?")
            lines.append(f"  run_id={s_run[:12]}... date={s_date} CuP={s_cup}")
        lines.append("\nNothing was removed.")
        return "\n".join(lines)

    filtered = [s for s in subs if not matches(s)]
    removed = len(subs) - len(filtered)

    # Rewrite the JSONL file under an exclusive lock. BUGFIX: the file is
    # opened read-write and truncated only AFTER the lock is acquired —
    # open(..., "w") would truncate immediately, clobbering a concurrent
    # locked append before we ourselves hold the lock.
    import fcntl
    import os
    fd = os.open(SUBMISSIONS_FILE, os.O_RDWR | os.O_CREAT)
    with os.fdopen(fd, "r+") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        try:
            f.seek(0)
            f.write("\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else ""))
            f.truncate()
            f.flush()
        finally:
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)

    detail = f"agent_id={agent_id}" if agent_id else ""
    if run_id:
        detail += f"{', ' if detail else ''}run_id={run_id}"
    _log_admin_action("remove_submission", f"Removed {removed} submission(s): {detail}")
    return f"Removed {removed} submission(s) ({detail})."
1915
 
1916
 
1917
  def admin_build_key_dashboard(session_token: str):
 
2782
  f"*Session active. All actions below are authenticated.*")
2783
 
2784
  with gr.Accordion("Remove Submission", open=True):
2785
+ admin_agent_id = gr.Textbox(label="Agent ID (matches all if run_id empty)")
2786
+ admin_run_id = gr.Textbox(label="Run ID (optional — target a specific submission)")
2787
  admin_btn = gr.Button("Remove Submission", variant="stop")
2788
+ admin_result = gr.Textbox(label="Result", interactive=False, lines=8)
2789
 
2790
  admin_btn.click(
2791
  admin_remove_submission,
2792
+ inputs=[admin_agent_id, admin_run_id, admin_session],
2793
  outputs=[admin_result],
2794
  api_name=False,
2795
  )
data/canonical_hashes.json CHANGED
@@ -3,6 +3,6 @@
3
  "evaluators_sha256": "1ecb7e511d25fe0dc4aaf6fd887eb108d12e293d9b90629630745300f9733cf5",
4
  "task_config_sha256": "5119d99c758a46100cc678d8193659b43c3174e7e295a7887e0b07f877f131b5",
5
  "custom_env_sha256": "7e6ef6e3fb8e75cd46c8c00a038524e73ff37829584b1f47d34b237eb2181ca8",
6
- "helper_functions_sha256": "3ed7169b7c5bb734b13c669c06b5f977a448a66c7d9eb41cbb32d7f7d16cb845"
7
  }
8
  }
 
3
  "evaluators_sha256": "1ecb7e511d25fe0dc4aaf6fd887eb108d12e293d9b90629630745300f9733cf5",
4
  "task_config_sha256": "5119d99c758a46100cc678d8193659b43c3174e7e295a7887e0b07f877f131b5",
5
  "custom_env_sha256": "7e6ef6e3fb8e75cd46c8c00a038524e73ff37829584b1f47d34b237eb2181ca8",
6
+ "helper_functions_sha256": "5a4639ab99485241e38fb2652670cf555e2d373105a0b9e3052f06e16576ac07"
7
  }
8
  }
data/test.raw.json CHANGED
The diff for this file is too large to render. See raw diff
 
validation/schema.py CHANGED
@@ -79,13 +79,16 @@ def _load_benchmark_config() -> tuple:
79
  if tier and group:
80
  tier_config.setdefault(group, {}).setdefault(tier, []).append(t["task_id"])
81
 
 
 
 
82
  logger.info(
83
  "Loaded benchmark config: %d tasks, %d policies, %d dimensions, "
84
  "%d web apps, %d tier groups",
85
  task_count, policy_count, len(safety_dims),
86
  len(web_applications), len(tier_config),
87
  )
88
- return task_count, policy_count, safety_dims, dim_display, web_applications, tier_config
89
 
90
 
91
  (
@@ -95,6 +98,7 @@ def _load_benchmark_config() -> tuple:
95
  DIMENSION_DISPLAY,
96
  WEB_APPLICATIONS,
97
  TIER_CONFIG,
 
98
  ) = _load_benchmark_config()
99
 
100
 
 
79
  if tier and group:
80
  tier_config.setdefault(group, {}).setdefault(tier, []).append(t["task_id"])
81
 
82
+ # Extract actual task IDs (don't assume 0..N-1)
83
+ task_ids = sorted(t["task_id"] for t in tasks)
84
+
85
  logger.info(
86
  "Loaded benchmark config: %d tasks, %d policies, %d dimensions, "
87
  "%d web apps, %d tier groups",
88
  task_count, policy_count, len(safety_dims),
89
  len(web_applications), len(tier_config),
90
  )
91
+ return task_count, policy_count, safety_dims, dim_display, web_applications, tier_config, task_ids
92
 
93
 
94
  (
 
98
  DIMENSION_DISPLAY,
99
  WEB_APPLICATIONS,
100
  TIER_CONFIG,
101
+ EXPECTED_TASK_IDS,
102
  ) = _load_benchmark_config()
103
 
104
 
validation/validate.py CHANGED
@@ -20,6 +20,7 @@ from validation.integrity import (
20
  from validation.schema import (
21
  EXPECTED_POLICY_COUNT,
22
  EXPECTED_TASK_COUNT,
 
23
  Submission,
24
  )
25
 
@@ -109,7 +110,7 @@ def validate_submission(
109
 
110
  # ---- Task completeness ----
111
  submitted_ids = {te.task_id for te in submission.task_evidence}
112
- expected_ids = set(range(EXPECTED_TASK_COUNT))
113
 
114
  missing = expected_ids - submitted_ids
115
  if missing:
 
20
  from validation.schema import (
21
  EXPECTED_POLICY_COUNT,
22
  EXPECTED_TASK_COUNT,
23
+ EXPECTED_TASK_IDS,
24
  Submission,
25
  )
26
 
 
110
 
111
  # ---- Task completeness ----
112
  submitted_ids = {te.task_id for te in submission.task_evidence}
113
+ expected_ids = set(EXPECTED_TASK_IDS)
114
 
115
  missing = expected_ids - submitted_ids
116
  if missing: