behavior-in-the-wild committed on
Commit
ae2f1f7
·
verified ·
1 Parent(s): f9e2361

Deploy SDR-Arena leaderboard

Browse files
Files changed (1) hide show
  1. leaderboard/tabs/upload_tab.py +62 -45
leaderboard/tabs/upload_tab.py CHANGED
@@ -2,7 +2,7 @@
2
  Upload Results Tab - Upload pre-computed agent outputs for evaluation.
3
 
4
  Users upload a JSON file containing their agent's outputs for benchmark prompts.
5
- Submissions are saved server-side for admin review and evaluation.
6
  No submission history is shown publicly.
7
  """
8
 
@@ -10,34 +10,67 @@ from __future__ import annotations
10
 
11
  import json
12
  import os
 
13
  from datetime import datetime, timezone
14
  from pathlib import Path
15
- from typing import Optional
16
 
17
  import gradio as gr
18
 
19
  from leaderboard.data_loader import LeaderboardDataLoader
20
 
 
21
 
22
- # Directory for uploaded results (server-side only, not publicly visible)
23
- SUBMISSIONS_DIR = Path(__file__).resolve().parent.parent.parent / "data" / "submissions"
24
- SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
25
 
26
- SUBMISSION_LOG = SUBMISSIONS_DIR / "submissions.json"
 
 
 
 
 
 
27
 
28
 
29
- def _load_submissions_log() -> list[dict]:
30
- """Load the submissions log (admin-only)."""
31
- if SUBMISSION_LOG.exists():
32
- with open(SUBMISSION_LOG) as f:
33
- return json.load(f)
34
- return []
35
-
36
-
37
- def _save_submissions_log(entries: list[dict]):
38
- """Save the submissions log."""
39
- with open(SUBMISSION_LOG, "w") as f:
40
- json.dump(entries, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  def _validate_results_json(data: dict) -> tuple[bool, str]:
@@ -212,38 +245,22 @@ def build_upload_tab(loader: LeaderboardDataLoader) -> None:
212
  "",
213
  )
214
 
215
- # Check for duplicate agent name
216
  agent_name = data["agent_name"].strip()
217
- log = _load_submissions_log()
218
- existing_names = {e["agent_name"] for e in log}
219
- if agent_name in existing_names:
220
- return (
221
- f"<div style='color:var(--dr-danger);font-size:0.9rem;'>"
222
- f"&#x274C; An agent named '<strong>{agent_name}</strong>' has already been submitted. "
223
- f"Please use a unique agent name.</div>",
224
- "",
225
- )
226
 
227
- # Save the results file
228
  safe_name = "".join(c if c.isalnum() or c == "-" else "_" for c in agent_name)
229
  timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
230
- results_path = SUBMISSIONS_DIR / f"{safe_name}_{timestamp}.json"
231
- with open(results_path, "w") as f:
232
- json.dump(data, f, indent=2)
233
 
234
- # Record in submissions log
235
- num_results = len(data.get("results", {}))
236
- entry = {
237
- "agent_name": agent_name,
238
- "agent_author": data.get("agent_author", "").strip() or "Anonymous",
239
- "agent_description": data.get("agent_description", "").strip(),
240
- "num_results": num_results,
241
- "results_file": str(results_path),
242
- "submitted_at": datetime.now(timezone.utc).isoformat(),
243
- "status": "pending_evaluation",
244
- }
245
- log.append(entry)
246
- _save_submissions_log(log)
247
 
248
  return (
249
  f"<div style='color:var(--dr-success);font-size:0.9rem;font-weight:600;'>"
 
2
  Upload Results Tab - Upload pre-computed agent outputs for evaluation.
3
 
4
  Users upload a JSON file containing their agent's outputs for benchmark prompts.
5
+ Submissions are pushed to a private HF Dataset repo for admin review.
6
  No submission history is shown publicly.
7
  """
8
 
 
10
 
11
  import json
12
  import os
13
+ import tempfile
14
  from datetime import datetime, timezone
15
  from pathlib import Path
 
16
 
17
  import gradio as gr
18
 
19
  from leaderboard.data_loader import LeaderboardDataLoader
20
 
21
+ SUBMISSIONS_DATASET_REPO = "behavior-in-the-wild/sdr-arena-submissions"
22
 
 
 
 
23
 
24
+ def _get_hf_api():
25
+ """Return an authenticated HfApi instance, or None if no token is set."""
26
+ token = os.environ.get("HF_TOKEN", "")
27
+ if not token:
28
+ return None
29
+ from huggingface_hub import HfApi
30
+ return HfApi(token=token)
31
 
32
 
33
def _list_existing_submissions() -> set[str]:
    """Fetch agent names already submitted to the dataset repo.

    Submission files are expected at ``submissions/<name>_<date>_<time>.json``;
    the ``<name>`` part (everything before the last two underscores) is
    collected. Files under ``submissions/`` that do not follow this pattern
    are skipped rather than reported as agent names.

    Returns:
        The set of name stems found, or an empty set when no HF token is
        configured or the repo listing fails (best-effort lookup).
    """
    import logging

    api = _get_hf_api()
    if api is None:
        return set()

    # Keep the try body to the remote call only, and record why it failed
    # instead of silently reporting "no submissions".
    try:
        files = api.list_repo_files(
            repo_id=SUBMISSIONS_DATASET_REPO, repo_type="dataset"
        )
    except Exception:
        logging.getLogger(__name__).exception(
            "Could not list files in %s", SUBMISSIONS_DATASET_REPO
        )
        return set()

    prefix = "submissions/"
    suffix = ".json"
    names: set[str] = set()
    for path in files:
        if not (path.startswith(prefix) and path.endswith(suffix)):
            continue
        base = path[len(prefix):-len(suffix)]
        parts = base.rsplit("_", 2)
        # A well-formed name has a trailing "_<date>_<time>" timestamp.
        if len(parts) == 3 and parts[0]:
            names.add(parts[0])
    return names
50
+
51
+
52
def _push_submission(data: dict, filename: str) -> bool:
    """Push a submission JSON to the private dataset repo.

    Args:
        data: Submission payload; serialized as indented JSON.
        filename: File name to create under ``submissions/`` in the repo.

    Returns:
        True when the upload succeeded; False when no HF token is configured
        or when serialization or the upload failed.
    """
    import logging

    api = _get_hf_api()
    if api is None:
        return False
    try:
        # Upload the serialized bytes directly: no temporary file is created,
        # so nothing is left behind on disk if the upload raises.
        payload = json.dumps(data, indent=2).encode("utf-8")
        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo=f"submissions/{filename}",
            repo_id=SUBMISSIONS_DATASET_REPO,
            repo_type="dataset",
            commit_message=f"New submission: {data.get('agent_name', 'unknown')}",
        )
    except Exception:
        # Still best-effort for the caller, but leave a trace for admins.
        logging.getLogger(__name__).exception(
            "Failed to push submission %s to %s",
            filename,
            SUBMISSIONS_DATASET_REPO,
        )
        return False
    return True
74
 
75
 
76
  def _validate_results_json(data: dict) -> tuple[bool, str]:
 
245
  "",
246
  )
247
 
 
248
  agent_name = data["agent_name"].strip()
249
+ num_results = len(data.get("results", {}))
 
 
 
 
 
 
 
 
250
 
251
+ # Push to private HF dataset repo
252
  safe_name = "".join(c if c.isalnum() or c == "-" else "_" for c in agent_name)
253
  timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
254
+ filename = f"{safe_name}_{timestamp}.json"
 
 
255
 
256
+ pushed = _push_submission(data, filename)
257
+ if not pushed:
258
+ return (
259
+ f"<div style='color:var(--dr-danger);font-size:0.9rem;'>"
260
+ f"&#x274C; Submission storage is temporarily unavailable. "
261
+ f"Please try again later.</div>",
262
+ "",
263
+ )
 
 
 
 
 
264
 
265
  return (
266
  f"<div style='color:var(--dr-success);font-size:0.9rem;font-weight:600;'>"