debjitpaul committed on
Commit
95b8b77
·
1 Parent(s): 8b3abc6

Multi-channel submission storage (HF Dataset) + notifications (GitHub)

Browse files
Files changed (1) hide show
  1. app.py +288 -29
app.py CHANGED
@@ -11,6 +11,9 @@ import datetime
11
  import json
12
  import os
13
  import re
 
 
 
14
  from pathlib import Path
15
  from typing import Any
16
 
@@ -23,9 +26,27 @@ import pandas as pd
23
 
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
25
  DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
26
- QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
 
 
 
 
 
27
  QUEUE_DIR.mkdir(exist_ok=True, parents=True)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  TITLE = "πŸ™ DeepSynth Leaderboard"
30
  TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
31
  ABOUT_BLURB = (
@@ -214,6 +235,187 @@ def _safe_slug(text: str, maxlen: int = 40) -> str:
214
  return slug[:maxlen] or "unnamed"
215
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def submit_predictions(
218
  file_obj,
219
  agent_name: str,
@@ -246,19 +448,21 @@ def submit_predictions(
246
  except OSError as e:
247
  return f"❌ **Could not read uploaded file:** {e}"
248
 
249
- if not isinstance(predictions, dict) or not predictions:
250
- return "❌ **Predictions file must be a non-empty JSON object mapping task IDs to answers.**"
 
 
251
 
252
  bundle = {
253
  "received_at": datetime.datetime.utcnow().isoformat() + "Z",
254
  "metadata": {
255
- "agent_name": agent_name.strip(),
256
- "base_model": base_model.strip(),
257
- "scaffold": scaffold,
258
- "organization": organization.strip(),
259
- "contact_email": contact_email.strip(),
260
- "code_url": code_url.strip(),
261
- "split": split,
262
  "submission_date": datetime.date.today().isoformat(),
263
  },
264
  "predictions": predictions,
@@ -266,18 +470,49 @@ def submit_predictions(
266
 
267
  date = datetime.date.today().isoformat()
268
  fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
269
- dest = QUEUE_DIR / fname
270
- with dest.open("w", encoding="utf-8") as f:
 
 
 
 
271
  json.dump(bundle, f, indent=2, ensure_ascii=False)
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  return (
274
- f"βœ… **Submission received.** Your file has been queued for review as `{fname}`.\n\n"
275
- f"A maintainer will score it against the private test-set answers and merge it to the "
 
 
 
 
276
  f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
277
  f"reproducibility via your `code_url`.\n\n"
278
- f"**Note:** submissions in this Space's queue are held temporarily β€” for a permanent "
279
- f"record, please also open a PR to the [benchmark repo]({REPO_URL}) with your "
280
- f"predictions file under `submissions/`."
281
  )
282
 
283
 
@@ -341,11 +576,34 @@ def build_app() -> gr.Blocks:
341
  with gr.Tab("πŸ“€ Submit"):
342
  gr.Markdown("## Submit your agent's predictions")
343
  gr.Markdown(
344
- "Upload a JSON file of predictions on the DeepSynth **test set**. "
345
- "We'll score it against the private gold answers and add your row to the leaderboard.\n\n"
346
- f"**Format:** a JSON object mapping task IDs (`\"001\"` … `\"120\"`) to your agent's answer. "
347
- f"See [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json) "
348
- "for the full spec."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  )
350
 
351
  with gr.Row():
@@ -358,12 +616,12 @@ def build_app() -> gr.Blocks:
358
  value="ReAct",
359
  )
360
  split_in = gr.Dropdown(
361
- choices=["dev","test", "full"],
362
  label="Split evaluated",
363
- value="dev",
364
  )
365
  with gr.Column():
366
- organization_in = gr.Textbox(label="Organization", placeholder="e.g. Huawei, Anthropic, Microsoft, OpenAI, Stanford")
367
  contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
368
  code_url_in = gr.Textbox(
369
  label="Code URL (required)",
@@ -371,7 +629,7 @@ def build_app() -> gr.Blocks:
371
  )
372
 
373
  predictions_in = gr.File(
374
- label="Predictions JSON file",
375
  file_types=[".json"],
376
  )
377
  submit_btn = gr.Button("Submit for review", variant="primary")
@@ -388,9 +646,10 @@ def build_app() -> gr.Blocks:
388
 
389
  gr.Markdown(
390
  "---\n"
391
- "**What happens next?** Submissions are queued for maintainer review. "
392
- "We verify metadata honesty and spot-check reproducibility via your "
393
- "`code_url` before computing scores and merging to the leaderboard.\n\n"
 
394
  f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
395
  "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
396
  )
 
11
  import json
12
  import os
13
  import re
14
+ import tempfile
15
+ import urllib.error
16
+ import urllib.request
17
  from pathlib import Path
18
  from typing import Any
19
 
 
26
 
27
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
28
  DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
29
+
30
+ # Local fallback queue. Always written first, even when HF Dataset upload is configured —
31
+ # kept as a belt-and-suspenders safety net so a misconfigured Space never
32
+ # silently drops a submission.
33
+ _DEFAULT_QUEUE = "/data/submissions_queue" if Path("/data").is_dir() else "submissions_queue"
34
+ QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", _DEFAULT_QUEUE))
35
  QUEUE_DIR.mkdir(exist_ok=True, parents=True)
36
 
37
+ # Primary submission storage: a private HF Dataset. Each submission is its
38
+ # OWN file in the dataset (no shared CSV — that pattern races and loses
39
+ # submissions when two users submit simultaneously).
40
+ HF_TOKEN = os.environ.get("HF_TOKEN") # Space secret with write access to the dataset
41
+ HF_QUEUE_REPO = os.environ.get(
42
+ "DEEPSYNTH_QUEUE_REPO", "DeepSynthesisTeam/deepsynth-submission-queue"
43
+ )
44
+
45
+ # Notification channels — both are optional. Set either or both as Space secrets.
46
+ DISCORD_WEBHOOK_URL = os.environ.get("DISCORD_WEBHOOK_URL") # https://discord.com/api/webhooks/...
47
+ GH_NOTIFY_REPO = os.environ.get("DEEPSYNTH_NOTIFY_REPO", "agentdeepsynthesis/deepsynth-bench")
48
+ GH_TOKEN = os.environ.get("GH_TOKEN") # Fine-grained PAT, "Issues: write" on the repo
49
+
50
  TITLE = "πŸ™ DeepSynth Leaderboard"
51
  TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
52
  ABOUT_BLURB = (
 
235
  return slug[:maxlen] or "unnamed"
236
 
237
 
238
+ def validate_predictions_payload(predictions: Any, split: str) -> str | None:
239
+ """Validate that uploaded file is in the eval_static_score.py format.
240
+
241
+ Returns an error message string if invalid, or None if valid.
242
+ The evaluator expects a JSON list of {"Question Number": ..., "answer": ...}
243
+ objects β€” NOT a dict keyed by task ID.
244
+ """
245
+ if not isinstance(predictions, list):
246
+ return (
247
+ "❌ **Wrong format.** Predictions must be a JSON **array** (list), "
248
+ "not an object/dict. Each element should be `{\"Question Number\": \"001\", "
249
+ "\"answer\": ...}`. See the expected format in the Submit tab above."
250
+ )
251
+ if not predictions:
252
+ return "❌ **Empty predictions file.** Please include answers for the tasks you evaluated."
253
+
254
+ expected_count = 40 if split == "dev" else 80
255
+ missing_fields = []
256
+ for i, item in enumerate(predictions[:5]): # Sample-check the first 5
257
+ if not isinstance(item, dict):
258
+ return (
259
+ f"❌ **Entry {i} is not a JSON object.** Each element must be a "
260
+ f"dict with 'Question Number' and 'answer' keys."
261
+ )
262
+ if "Question Number" not in item:
263
+ missing_fields.append(f"entry {i}: missing 'Question Number'")
264
+ if "answer" not in item:
265
+ missing_fields.append(f"entry {i}: missing 'answer'")
266
+ if missing_fields:
267
+ return "❌ **Required fields missing:** " + "; ".join(missing_fields[:3])
268
+
269
+ if len(predictions) < expected_count:
270
+ return (
271
+ f"⚠️ **Partial submission warning:** the {split} split has {expected_count} "
272
+ f"tasks, but your file contains only {len(predictions)}. This will be "
273
+ f"accepted but scored as 0.0 for missing tasks. Continue anyway? Resubmit a "
274
+ f"complete file if this was unintentional."
275
+ )
276
+ return None
277
+
278
+
279
def upload_to_hf_dataset(bundle: dict, filename: str) -> tuple[bool, str | None]:
    """Commit one submission as its own file in the HF Dataset queue repo.

    The bundle is serialized to JSON and added at ``queue/<filename>`` in a
    single commit, so concurrent submissions never touch a shared file.

    Args:
        bundle: Submission bundle; ``bundle["metadata"]`` supplies the agent
            name and organization used in the commit message.
        filename: File name to create under ``queue/`` in the dataset repo.

    Returns:
        ``(True, url)`` with a link to the uploaded file on success, or
        ``(False, None)`` when no token is set, ``huggingface_hub`` is not
        installed, or the commit fails (the failure is printed as a warning).
    """
    if not HF_TOKEN:
        return False, None

    # Imported lazily so the Space still boots when huggingface_hub is missing.
    try:
        from huggingface_hub import HfApi, CommitOperationAdd
    except ImportError:
        print("WARN: huggingface_hub not installed; cannot upload to dataset")
        return False, None

    encoded = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
    repo_path = f"queue/{filename}"

    try:
        client = HfApi(token=HF_TOKEN)
        add_op = CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=encoded)
        client.create_commit(
            repo_id=HF_QUEUE_REPO,
            repo_type="dataset",
            operations=[add_op],
            commit_message=f"submission: {bundle['metadata']['agent_name']} ({bundle['metadata']['organization']})",
        )
    except Exception as exc:
        # Best-effort: any failure is reported to the caller as "not uploaded".
        print(f"WARN: HF Dataset upload failed: {exc}")
        return False, None

    return True, f"https://huggingface.co/datasets/{HF_QUEUE_REPO}/blob/main/{repo_path}"
315
+
316
+
317
+ def notify_discord(bundle: dict, filename: str, dataset_url: str | None) -> bool:
318
+ """Post a submission summary to a Discord channel via webhook."""
319
+ if not DISCORD_WEBHOOK_URL:
320
+ return False
321
+
322
+ meta = bundle["metadata"]
323
+ n_preds = len(bundle["predictions"])
324
+ desc_lines = [
325
+ f"**Agent:** `{meta['agent_name']}`",
326
+ f"**Base model:** `{meta['base_model']}`",
327
+ f"**Scaffold:** `{meta['scaffold']}` Β· **Split:** `{meta['split']}` Β· **Entries:** {n_preds}",
328
+ f"**Org:** {meta['organization']} Β· **Contact:** {meta['contact_email']}",
329
+ f"**Code:** {meta['code_url']}",
330
+ ]
331
+ if dataset_url:
332
+ desc_lines.append(f"**Submission file:** [view on HF]({dataset_url})")
333
+
334
+ payload = json.dumps({
335
+ "content": "πŸš€ **New DEEPSYNTH leaderboard submission**",
336
+ "embeds": [{
337
+ "title": f"{meta['agent_name']} β€” {meta['organization']}",
338
+ "description": "\n".join(desc_lines),
339
+ "color": 0xff9d00, # DEEPSYNTH amber
340
+ "timestamp": bundle["received_at"],
341
+ }],
342
+ }).encode("utf-8")
343
+
344
+ req = urllib.request.Request(
345
+ DISCORD_WEBHOOK_URL,
346
+ data=payload,
347
+ method="POST",
348
+ headers={"Content-Type": "application/json", "User-Agent": "deepsynth-leaderboard"},
349
+ )
350
+ try:
351
+ with urllib.request.urlopen(req, timeout=10) as resp:
352
+ return resp.status in (200, 204)
353
+ except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
354
+ print(f"WARN: Discord notification failed: {e}")
355
+ return False
356
+
357
+
358
+ def notify_github_issue(bundle: dict, filename: str, dataset_url: str | None) -> bool:
359
+ """Open a GitHub issue on the benchmark repo so maintainers get an email
360
+ AND a permanent searchable record they can check off as they review.
361
+ """
362
+ if not GH_TOKEN:
363
+ return False
364
+
365
+ meta = bundle["metadata"]
366
+ title = f"[Submission] {meta['agent_name']} Β· {meta['organization']}"
367
+ file_link = (
368
+ f"[`{filename}`]({dataset_url})" if dataset_url else f"`{filename}` (in Space queue)"
369
+ )
370
+ body = (
371
+ f"**New DEEPSYNTH leaderboard submission received via the HF Space form.**\n\n"
372
+ f"| Field | Value |\n"
373
+ f"|---|---|\n"
374
+ f"| Agent | `{meta['agent_name']}` |\n"
375
+ f"| Base model | `{meta['base_model']}` |\n"
376
+ f"| Scaffold | `{meta['scaffold']}` |\n"
377
+ f"| Split | `{meta['split']}` |\n"
378
+ f"| Organization | {meta['organization']} |\n"
379
+ f"| Contact | {meta['contact_email']} |\n"
380
+ f"| Code URL | {meta['code_url']} |\n"
381
+ f"| Received at | {bundle['received_at']} |\n"
382
+ f"| Predictions count | {len(bundle['predictions'])} |\n"
383
+ f"| Submission file | {file_link} |\n\n"
384
+ f"**Maintainer checklist:**\n"
385
+ f"- [ ] Verify `code_url` is public and reproducible\n"
386
+ f"- [ ] Pull the file from the queue dataset\n"
387
+ f"- [ ] Run `eval_static_score.py` against private gold answers\n"
388
+ f"- [ ] Commit scored JSON to the Space's `submissions/`\n"
389
+ f"- [ ] Reply to submitter at {meta['contact_email']}\n"
390
+ f"- [ ] Close this issue\n"
391
+ )
392
+
393
+ payload = json.dumps({
394
+ "title": title,
395
+ "body": body,
396
+ "labels": ["submission", "needs-review"],
397
+ }).encode("utf-8")
398
+
399
+ req = urllib.request.Request(
400
+ f"https://api.github.com/repos/{GH_NOTIFY_REPO}/issues",
401
+ data=payload,
402
+ method="POST",
403
+ headers={
404
+ "Accept": "application/vnd.github+json",
405
+ "Authorization": f"Bearer {GH_TOKEN}",
406
+ "X-GitHub-Api-Version": "2022-11-28",
407
+ "Content-Type": "application/json",
408
+ "User-Agent": "deepsynth-leaderboard-space",
409
+ },
410
+ )
411
+ try:
412
+ with urllib.request.urlopen(req, timeout=10) as resp:
413
+ return resp.status in (200, 201)
414
+ except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
415
+ print(f"WARN: GitHub notification failed: {e}")
416
+ return False
417
+
418
+
419
  def submit_predictions(
420
  file_obj,
421
  agent_name: str,
 
448
  except OSError as e:
449
  return f"❌ **Could not read uploaded file:** {e}"
450
 
451
+ error = validate_predictions_payload(predictions, split)
452
+ if error and error.startswith("❌"):
453
+ return error
454
+ warning_prefix = error if error else ""
455
 
456
  bundle = {
457
  "received_at": datetime.datetime.utcnow().isoformat() + "Z",
458
  "metadata": {
459
+ "agent_name": agent_name.strip(),
460
+ "base_model": base_model.strip(),
461
+ "scaffold": scaffold,
462
+ "organization": organization.strip(),
463
+ "contact_email": contact_email.strip(),
464
+ "code_url": code_url.strip(),
465
+ "split": split,
466
  "submission_date": datetime.date.today().isoformat(),
467
  },
468
  "predictions": predictions,
 
470
 
471
  date = datetime.date.today().isoformat()
472
  fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
473
+
474
+ # Always write the local fallback first β€” so even if every external
475
+ # service is misconfigured, the submission isn't lost while the Space
476
+ # is alive. Cheap insurance.
477
+ local_dest = QUEUE_DIR / fname
478
+ with local_dest.open("w", encoding="utf-8") as f:
479
  json.dump(bundle, f, indent=2, ensure_ascii=False)
480
 
481
+ # Persistent storage: upload to the HF Dataset queue.
482
+ hf_ok, dataset_url = upload_to_hf_dataset(bundle, fname)
483
+
484
+ # Notifications: fire both channels. Each is independent.
485
+ discord_ok = notify_discord(bundle, fname, dataset_url)
486
+ github_ok = notify_github_issue(bundle, fname, dataset_url)
487
+
488
+ # Build a status message that reflects what actually happened.
489
+ storage_line = (
490
+ f"πŸ’Ύ Saved permanently to [HF Dataset queue]({dataset_url}).\n\n"
491
+ if hf_ok
492
+ else "πŸ’Ύ Saved to Space-local queue (HF Dataset persistence not configured β€” "
493
+ "submission may not survive a Space restart; please also open a PR).\n\n"
494
+ )
495
+ notify_bits = []
496
+ if discord_ok: notify_bits.append("Discord")
497
+ if github_ok: notify_bits.append("GitHub Issues")
498
+ notify_line = (
499
+ f"πŸ“¬ Maintainers notified via {' + '.join(notify_bits)}.\n\n"
500
+ if notify_bits
501
+ else "πŸ“¬ No notification channels configured on this Space β€” "
502
+ "if you don't hear back in 10 days, please email the paper authors.\n\n"
503
+ )
504
+
505
  return (
506
+ (warning_prefix + "\n\n" if warning_prefix else "")
507
+ + f"βœ… **Submission received** as `{fname}` for the **{split}** split "
508
+ f"(**{len(predictions)}** entries).\n\n"
509
+ + storage_line
510
+ + notify_line
511
+ + f"A maintainer will score it against the private {split}-set answers and merge it to the "
512
  f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
513
  f"reproducibility via your `code_url`.\n\n"
514
+ f"**For a permanent public record,** please also open a PR to the "
515
+ f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
 
516
  )
517
 
518
 
 
576
  with gr.Tab("πŸ“€ Submit"):
577
  gr.Markdown("## Submit your agent's predictions")
578
  gr.Markdown(
579
+ "Upload a JSON file containing **your agent's output** on DEEPSYNTH. "
580
+ "The uploaded file must be the *predictions file* produced by running "
581
+ "your agent on the split's questions β€” not your agent's source code, "
582
+ "and not a raw transcript. We then score it against the private gold "
583
+ "answers and add your row to the leaderboard."
584
+ )
585
+
586
+ gr.Markdown(
587
+ "### πŸ“„ Expected file format\n"
588
+ "The file must be a **JSON array** where each element is an object "
589
+ "with a `Question Number` and an `answer`:\n"
590
+ "\n"
591
+ "```json\n"
592
+ "[\n"
593
+ " {\"Question Number\": \"001\", \"answer\": {\"Sweden\": 1.2, \"Finland\": 0.8}},\n"
594
+ " {\"Question Number\": \"002\", \"answer\": {\"Brunei\": -0.67}},\n"
595
+ " ...\n"
596
+ "]\n"
597
+ "```\n"
598
+ "\n"
599
+ "**Required per entry:**\n"
600
+ "- `Question Number` β€” the task ID matching the DEEPSYNTH questions file "
601
+ "(dev: 1-40, test: 1-80).\n"
602
+ "- `answer` β€” your agent's final structured answer (JSON object / array / number), "
603
+ "**NOT** the chain-of-thought or tool transcript.\n\n"
604
+ f"Full spec: [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json). "
605
+ f"Validate locally before uploading: "
606
+ f"`python scripts/evaluation/validate_submission.py my_predictions.json --strict`."
607
  )
608
 
609
  with gr.Row():
 
616
  value="ReAct",
617
  )
618
  split_in = gr.Dropdown(
619
+ choices=["dev", "test"],
620
  label="Split evaluated",
621
+ value="test",
622
  )
623
  with gr.Column():
624
+ organization_in = gr.Textbox(label="Organization", placeholder="e.g. MSR, Stanford, Google, etc.")
625
  contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
626
  code_url_in = gr.Textbox(
627
  label="Code URL (required)",
 
629
  )
630
 
631
  predictions_in = gr.File(
632
+ label="Predictions JSON (the output file produced by your agent)",
633
  file_types=[".json"],
634
  )
635
  submit_btn = gr.Button("Submit for review", variant="primary")
 
646
 
647
  gr.Markdown(
648
  "---\n"
649
+ "**What happens after you submit?** Your file is queued in the Space and a GitHub "
650
+ "issue is opened on the benchmark repo so maintainers get notified. We verify metadata "
651
+ "honesty and spot-check reproducibility via your `code_url` before computing scores and "
652
+ "merging to the leaderboard.\n\n"
653
  f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
654
  "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
655
  )