debjitpaul committed on
Commit ·
95b8b77
1
Parent(s): 8b3abc6
Multi-channel submission storage (HF Dataset) + notifications (GitHub)
Browse files
app.py
CHANGED
|
@@ -11,6 +11,9 @@ import datetime
|
|
| 11 |
import json
|
| 12 |
import os
|
| 13 |
import re
|
|
|
|
|
|
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
from typing import Any
|
| 16 |
|
|
@@ -23,9 +26,27 @@ import pandas as pd
|
|
| 23 |
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 25 |
DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
QUEUE_DIR.mkdir(exist_ok=True, parents=True)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
TITLE = "π DeepSynth Leaderboard"
|
| 30 |
TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
|
| 31 |
ABOUT_BLURB = (
|
|
@@ -214,6 +235,187 @@ def _safe_slug(text: str, maxlen: int = 40) -> str:
|
|
| 214 |
return slug[:maxlen] or "unnamed"
|
| 215 |
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
def submit_predictions(
|
| 218 |
file_obj,
|
| 219 |
agent_name: str,
|
|
@@ -246,19 +448,21 @@ def submit_predictions(
|
|
| 246 |
except OSError as e:
|
| 247 |
return f"β **Could not read uploaded file:** {e}"
|
| 248 |
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
| 251 |
|
| 252 |
bundle = {
|
| 253 |
"received_at": datetime.datetime.utcnow().isoformat() + "Z",
|
| 254 |
"metadata": {
|
| 255 |
-
"agent_name":
|
| 256 |
-
"base_model":
|
| 257 |
-
"scaffold":
|
| 258 |
-
"organization":
|
| 259 |
-
"contact_email":
|
| 260 |
-
"code_url":
|
| 261 |
-
"split":
|
| 262 |
"submission_date": datetime.date.today().isoformat(),
|
| 263 |
},
|
| 264 |
"predictions": predictions,
|
|
@@ -266,18 +470,49 @@ def submit_predictions(
|
|
| 266 |
|
| 267 |
date = datetime.date.today().isoformat()
|
| 268 |
fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
json.dump(bundle, f, indent=2, ensure_ascii=False)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
return (
|
| 274 |
-
|
| 275 |
-
f"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
|
| 277 |
f"reproducibility via your `code_url`.\n\n"
|
| 278 |
-
f"**
|
| 279 |
-
f"
|
| 280 |
-
f"predictions file under `submissions/`."
|
| 281 |
)
|
| 282 |
|
| 283 |
|
|
@@ -341,11 +576,34 @@ def build_app() -> gr.Blocks:
|
|
| 341 |
with gr.Tab("π€ Submit"):
|
| 342 |
gr.Markdown("## Submit your agent's predictions")
|
| 343 |
gr.Markdown(
|
| 344 |
-
"Upload a JSON file
|
| 345 |
-
"
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
)
|
| 350 |
|
| 351 |
with gr.Row():
|
|
@@ -358,12 +616,12 @@ def build_app() -> gr.Blocks:
|
|
| 358 |
value="ReAct",
|
| 359 |
)
|
| 360 |
split_in = gr.Dropdown(
|
| 361 |
-
choices=["dev","test"
|
| 362 |
label="Split evaluated",
|
| 363 |
-
value="
|
| 364 |
)
|
| 365 |
with gr.Column():
|
| 366 |
-
organization_in = gr.Textbox(label="Organization", placeholder="e.g.
|
| 367 |
contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
|
| 368 |
code_url_in = gr.Textbox(
|
| 369 |
label="Code URL (required)",
|
|
@@ -371,7 +629,7 @@ def build_app() -> gr.Blocks:
|
|
| 371 |
)
|
| 372 |
|
| 373 |
predictions_in = gr.File(
|
| 374 |
-
label="Predictions JSON file",
|
| 375 |
file_types=[".json"],
|
| 376 |
)
|
| 377 |
submit_btn = gr.Button("Submit for review", variant="primary")
|
|
@@ -388,9 +646,10 @@ def build_app() -> gr.Blocks:
|
|
| 388 |
|
| 389 |
gr.Markdown(
|
| 390 |
"---\n"
|
| 391 |
-
"**What happens
|
| 392 |
-
"
|
| 393 |
-
"`code_url` before computing scores and
|
|
|
|
| 394 |
f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
|
| 395 |
"adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
|
| 396 |
)
|
|
|
|
| 11 |
import json
|
| 12 |
import os
|
| 13 |
import re
|
| 14 |
+
import tempfile
|
| 15 |
+
import urllib.error
|
| 16 |
+
import urllib.request
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Any
|
| 19 |
|
|
|
|
| 26 |
|
| 27 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 28 |
DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
|
| 29 |
+
|
| 30 |
+
# Local fallback queue. Always written first, even when HF Dataset upload is configured —
|
| 31 |
+
# kept as a belt-and-suspenders safety net so a misconfigured Space never
|
| 32 |
+
# silently drops a submission.
|
| 33 |
+
_DEFAULT_QUEUE = "/data/submissions_queue" if Path("/data").is_dir() else "submissions_queue"
|
| 34 |
+
QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", _DEFAULT_QUEUE))
|
| 35 |
QUEUE_DIR.mkdir(exist_ok=True, parents=True)
|
| 36 |
|
| 37 |
+
# Primary submission storage: a private HF Dataset. Each submission is its
|
| 38 |
+
# OWN file in the dataset (no shared CSV — that pattern races and loses
|
| 39 |
+
# submissions when two users submit simultaneously).
|
| 40 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") # Space secret with write access to the dataset
|
| 41 |
+
HF_QUEUE_REPO = os.environ.get(
|
| 42 |
+
"DEEPSYNTH_QUEUE_REPO", "DeepSynthesisTeam/deepsynth-submission-queue"
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# Notification channels — both are optional. Set either or both as Space secrets.
|
| 46 |
+
DISCORD_WEBHOOK_URL = os.environ.get("DISCORD_WEBHOOK_URL") # https://discord.com/api/webhooks/...
|
| 47 |
+
GH_NOTIFY_REPO = os.environ.get("DEEPSYNTH_NOTIFY_REPO", "agentdeepsynthesis/deepsynth-bench")
|
| 48 |
+
GH_TOKEN = os.environ.get("GH_TOKEN") # Fine-grained PAT, "Issues: write" on the repo
|
| 49 |
+
|
| 50 |
TITLE = "π DeepSynth Leaderboard"
|
| 51 |
TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
|
| 52 |
ABOUT_BLURB = (
|
|
|
|
| 235 |
return slug[:maxlen] or "unnamed"
|
| 236 |
|
| 237 |
|
| 238 |
+
def validate_predictions_payload(predictions: Any, split: str) -> str | None:
|
| 239 |
+
"""Validate that uploaded file is in the eval_static_score.py format.
|
| 240 |
+
|
| 241 |
+
Returns an error message string if invalid, or None if valid.
|
| 242 |
+
The evaluator expects a JSON list of {"Question Number": ..., "answer": ...}
|
| 243 |
+
objects β NOT a dict keyed by task ID.
|
| 244 |
+
"""
|
| 245 |
+
if not isinstance(predictions, list):
|
| 246 |
+
return (
|
| 247 |
+
"β **Wrong format.** Predictions must be a JSON **array** (list), "
|
| 248 |
+
"not an object/dict. Each element should be `{\"Question Number\": \"001\", "
|
| 249 |
+
"\"answer\": ...}`. See the expected format in the Submit tab above."
|
| 250 |
+
)
|
| 251 |
+
if not predictions:
|
| 252 |
+
return "β **Empty predictions file.** Please include answers for the tasks you evaluated."
|
| 253 |
+
|
| 254 |
+
expected_count = 40 if split == "dev" else 80
|
| 255 |
+
missing_fields = []
|
| 256 |
+
for i, item in enumerate(predictions[:5]): # Sample-check the first 5
|
| 257 |
+
if not isinstance(item, dict):
|
| 258 |
+
return (
|
| 259 |
+
f"β **Entry {i} is not a JSON object.** Each element must be a "
|
| 260 |
+
f"dict with 'Question Number' and 'answer' keys."
|
| 261 |
+
)
|
| 262 |
+
if "Question Number" not in item:
|
| 263 |
+
missing_fields.append(f"entry {i}: missing 'Question Number'")
|
| 264 |
+
if "answer" not in item:
|
| 265 |
+
missing_fields.append(f"entry {i}: missing 'answer'")
|
| 266 |
+
if missing_fields:
|
| 267 |
+
return "β **Required fields missing:** " + "; ".join(missing_fields[:3])
|
| 268 |
+
|
| 269 |
+
if len(predictions) < expected_count:
|
| 270 |
+
return (
|
| 271 |
+
f"β οΈ **Partial submission warning:** the {split} split has {expected_count} "
|
| 272 |
+
f"tasks, but your file contains only {len(predictions)}. This will be "
|
| 273 |
+
f"accepted but scored as 0.0 for missing tasks. Continue anyway? Resubmit a "
|
| 274 |
+
f"complete file if this was unintentional."
|
| 275 |
+
)
|
| 276 |
+
return None
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def upload_to_hf_dataset(bundle: dict, filename: str) -> tuple[bool, str | None]:
|
| 280 |
+
"""Upload a single submission file to a private HF Dataset repo.
|
| 281 |
+
|
| 282 |
+
Returns (success, dataset_url). Each submission is its own file under
|
| 283 |
+
queue/<filename> β never appending to a shared CSV (which races and
|
| 284 |
+
silently drops simultaneous submissions).
|
| 285 |
+
"""
|
| 286 |
+
if not HF_TOKEN:
|
| 287 |
+
return False, None
|
| 288 |
+
|
| 289 |
+
# Lazy-import so the Space still boots when huggingface_hub is missing.
|
| 290 |
+
try:
|
| 291 |
+
from huggingface_hub import HfApi, CommitOperationAdd
|
| 292 |
+
except ImportError:
|
| 293 |
+
print("WARN: huggingface_hub not installed; cannot upload to dataset")
|
| 294 |
+
return False, None
|
| 295 |
+
|
| 296 |
+
payload = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
|
| 297 |
+
|
| 298 |
+
try:
|
| 299 |
+
api = HfApi(token=HF_TOKEN)
|
| 300 |
+
api.create_commit(
|
| 301 |
+
repo_id=HF_QUEUE_REPO,
|
| 302 |
+
repo_type="dataset",
|
| 303 |
+
operations=[
|
| 304 |
+
CommitOperationAdd(
|
| 305 |
+
path_in_repo=f"queue/{filename}",
|
| 306 |
+
path_or_fileobj=payload,
|
| 307 |
+
)
|
| 308 |
+
],
|
| 309 |
+
commit_message=f"submission: {bundle['metadata']['agent_name']} ({bundle['metadata']['organization']})",
|
| 310 |
+
)
|
| 311 |
+
return True, f"https://huggingface.co/datasets/{HF_QUEUE_REPO}/blob/main/queue/{filename}"
|
| 312 |
+
except Exception as e:
|
| 313 |
+
print(f"WARN: HF Dataset upload failed: {e}")
|
| 314 |
+
return False, None
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def notify_discord(bundle: dict, filename: str, dataset_url: str | None) -> bool:
|
| 318 |
+
"""Post a submission summary to a Discord channel via webhook."""
|
| 319 |
+
if not DISCORD_WEBHOOK_URL:
|
| 320 |
+
return False
|
| 321 |
+
|
| 322 |
+
meta = bundle["metadata"]
|
| 323 |
+
n_preds = len(bundle["predictions"])
|
| 324 |
+
desc_lines = [
|
| 325 |
+
f"**Agent:** `{meta['agent_name']}`",
|
| 326 |
+
f"**Base model:** `{meta['base_model']}`",
|
| 327 |
+
f"**Scaffold:** `{meta['scaffold']}` Β· **Split:** `{meta['split']}` Β· **Entries:** {n_preds}",
|
| 328 |
+
f"**Org:** {meta['organization']} Β· **Contact:** {meta['contact_email']}",
|
| 329 |
+
f"**Code:** {meta['code_url']}",
|
| 330 |
+
]
|
| 331 |
+
if dataset_url:
|
| 332 |
+
desc_lines.append(f"**Submission file:** [view on HF]({dataset_url})")
|
| 333 |
+
|
| 334 |
+
payload = json.dumps({
|
| 335 |
+
"content": "π **New DEEPSYNTH leaderboard submission**",
|
| 336 |
+
"embeds": [{
|
| 337 |
+
"title": f"{meta['agent_name']} β {meta['organization']}",
|
| 338 |
+
"description": "\n".join(desc_lines),
|
| 339 |
+
"color": 0xff9d00, # DEEPSYNTH amber
|
| 340 |
+
"timestamp": bundle["received_at"],
|
| 341 |
+
}],
|
| 342 |
+
}).encode("utf-8")
|
| 343 |
+
|
| 344 |
+
req = urllib.request.Request(
|
| 345 |
+
DISCORD_WEBHOOK_URL,
|
| 346 |
+
data=payload,
|
| 347 |
+
method="POST",
|
| 348 |
+
headers={"Content-Type": "application/json", "User-Agent": "deepsynth-leaderboard"},
|
| 349 |
+
)
|
| 350 |
+
try:
|
| 351 |
+
with urllib.request.urlopen(req, timeout=10) as resp:
|
| 352 |
+
return resp.status in (200, 204)
|
| 353 |
+
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
|
| 354 |
+
print(f"WARN: Discord notification failed: {e}")
|
| 355 |
+
return False
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def notify_github_issue(bundle: dict, filename: str, dataset_url: str | None) -> bool:
|
| 359 |
+
"""Open a GitHub issue on the benchmark repo so maintainers get an email
|
| 360 |
+
AND a permanent searchable record they can check off as they review.
|
| 361 |
+
"""
|
| 362 |
+
if not GH_TOKEN:
|
| 363 |
+
return False
|
| 364 |
+
|
| 365 |
+
meta = bundle["metadata"]
|
| 366 |
+
title = f"[Submission] {meta['agent_name']} Β· {meta['organization']}"
|
| 367 |
+
file_link = (
|
| 368 |
+
f"[`{filename}`]({dataset_url})" if dataset_url else f"`{filename}` (in Space queue)"
|
| 369 |
+
)
|
| 370 |
+
body = (
|
| 371 |
+
f"**New DEEPSYNTH leaderboard submission received via the HF Space form.**\n\n"
|
| 372 |
+
f"| Field | Value |\n"
|
| 373 |
+
f"|---|---|\n"
|
| 374 |
+
f"| Agent | `{meta['agent_name']}` |\n"
|
| 375 |
+
f"| Base model | `{meta['base_model']}` |\n"
|
| 376 |
+
f"| Scaffold | `{meta['scaffold']}` |\n"
|
| 377 |
+
f"| Split | `{meta['split']}` |\n"
|
| 378 |
+
f"| Organization | {meta['organization']} |\n"
|
| 379 |
+
f"| Contact | {meta['contact_email']} |\n"
|
| 380 |
+
f"| Code URL | {meta['code_url']} |\n"
|
| 381 |
+
f"| Received at | {bundle['received_at']} |\n"
|
| 382 |
+
f"| Predictions count | {len(bundle['predictions'])} |\n"
|
| 383 |
+
f"| Submission file | {file_link} |\n\n"
|
| 384 |
+
f"**Maintainer checklist:**\n"
|
| 385 |
+
f"- [ ] Verify `code_url` is public and reproducible\n"
|
| 386 |
+
f"- [ ] Pull the file from the queue dataset\n"
|
| 387 |
+
f"- [ ] Run `eval_static_score.py` against private gold answers\n"
|
| 388 |
+
f"- [ ] Commit scored JSON to the Space's `submissions/`\n"
|
| 389 |
+
f"- [ ] Reply to submitter at {meta['contact_email']}\n"
|
| 390 |
+
f"- [ ] Close this issue\n"
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
payload = json.dumps({
|
| 394 |
+
"title": title,
|
| 395 |
+
"body": body,
|
| 396 |
+
"labels": ["submission", "needs-review"],
|
| 397 |
+
}).encode("utf-8")
|
| 398 |
+
|
| 399 |
+
req = urllib.request.Request(
|
| 400 |
+
f"https://api.github.com/repos/{GH_NOTIFY_REPO}/issues",
|
| 401 |
+
data=payload,
|
| 402 |
+
method="POST",
|
| 403 |
+
headers={
|
| 404 |
+
"Accept": "application/vnd.github+json",
|
| 405 |
+
"Authorization": f"Bearer {GH_TOKEN}",
|
| 406 |
+
"X-GitHub-Api-Version": "2022-11-28",
|
| 407 |
+
"Content-Type": "application/json",
|
| 408 |
+
"User-Agent": "deepsynth-leaderboard-space",
|
| 409 |
+
},
|
| 410 |
+
)
|
| 411 |
+
try:
|
| 412 |
+
with urllib.request.urlopen(req, timeout=10) as resp:
|
| 413 |
+
return resp.status in (200, 201)
|
| 414 |
+
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
|
| 415 |
+
print(f"WARN: GitHub notification failed: {e}")
|
| 416 |
+
return False
|
| 417 |
+
|
| 418 |
+
|
| 419 |
def submit_predictions(
|
| 420 |
file_obj,
|
| 421 |
agent_name: str,
|
|
|
|
| 448 |
except OSError as e:
|
| 449 |
return f"β **Could not read uploaded file:** {e}"
|
| 450 |
|
| 451 |
+
error = validate_predictions_payload(predictions, split)
|
| 452 |
+
if error and error.startswith("β"):
|
| 453 |
+
return error
|
| 454 |
+
warning_prefix = error if error else ""
|
| 455 |
|
| 456 |
bundle = {
|
| 457 |
"received_at": datetime.datetime.utcnow().isoformat() + "Z",
|
| 458 |
"metadata": {
|
| 459 |
+
"agent_name": agent_name.strip(),
|
| 460 |
+
"base_model": base_model.strip(),
|
| 461 |
+
"scaffold": scaffold,
|
| 462 |
+
"organization": organization.strip(),
|
| 463 |
+
"contact_email": contact_email.strip(),
|
| 464 |
+
"code_url": code_url.strip(),
|
| 465 |
+
"split": split,
|
| 466 |
"submission_date": datetime.date.today().isoformat(),
|
| 467 |
},
|
| 468 |
"predictions": predictions,
|
|
|
|
| 470 |
|
| 471 |
date = datetime.date.today().isoformat()
|
| 472 |
fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
|
| 473 |
+
|
| 474 |
+
# Always write the local fallback first β so even if every external
|
| 475 |
+
# service is misconfigured, the submission isn't lost while the Space
|
| 476 |
+
# is alive. Cheap insurance.
|
| 477 |
+
local_dest = QUEUE_DIR / fname
|
| 478 |
+
with local_dest.open("w", encoding="utf-8") as f:
|
| 479 |
json.dump(bundle, f, indent=2, ensure_ascii=False)
|
| 480 |
|
| 481 |
+
# Persistent storage: upload to the HF Dataset queue.
|
| 482 |
+
hf_ok, dataset_url = upload_to_hf_dataset(bundle, fname)
|
| 483 |
+
|
| 484 |
+
# Notifications: fire both channels. Each is independent.
|
| 485 |
+
discord_ok = notify_discord(bundle, fname, dataset_url)
|
| 486 |
+
github_ok = notify_github_issue(bundle, fname, dataset_url)
|
| 487 |
+
|
| 488 |
+
# Build a status message that reflects what actually happened.
|
| 489 |
+
storage_line = (
|
| 490 |
+
f"πΎ Saved permanently to [HF Dataset queue]({dataset_url}).\n\n"
|
| 491 |
+
if hf_ok
|
| 492 |
+
else "πΎ Saved to Space-local queue (HF Dataset persistence not configured β "
|
| 493 |
+
"submission may not survive a Space restart; please also open a PR).\n\n"
|
| 494 |
+
)
|
| 495 |
+
notify_bits = []
|
| 496 |
+
if discord_ok: notify_bits.append("Discord")
|
| 497 |
+
if github_ok: notify_bits.append("GitHub Issues")
|
| 498 |
+
notify_line = (
|
| 499 |
+
f"π¬ Maintainers notified via {' + '.join(notify_bits)}.\n\n"
|
| 500 |
+
if notify_bits
|
| 501 |
+
else "π¬ No notification channels configured on this Space β "
|
| 502 |
+
"if you don't hear back in 10 days, please email the paper authors.\n\n"
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
return (
|
| 506 |
+
(warning_prefix + "\n\n" if warning_prefix else "")
|
| 507 |
+
+ f"β
**Submission received** as `{fname}` for the **{split}** split "
|
| 508 |
+
f"(**{len(predictions)}** entries).\n\n"
|
| 509 |
+
+ storage_line
|
| 510 |
+
+ notify_line
|
| 511 |
+
+ f"A maintainer will score it against the private {split}-set answers and merge it to the "
|
| 512 |
f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
|
| 513 |
f"reproducibility via your `code_url`.\n\n"
|
| 514 |
+
f"**For a permanent public record,** please also open a PR to the "
|
| 515 |
+
f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
|
|
|
|
| 516 |
)
|
| 517 |
|
| 518 |
|
|
|
|
| 576 |
with gr.Tab("π€ Submit"):
|
| 577 |
gr.Markdown("## Submit your agent's predictions")
|
| 578 |
gr.Markdown(
|
| 579 |
+
"Upload a JSON file containing **your agent's output** on DEEPSYNTH. "
|
| 580 |
+
"The uploaded file must be the *predictions file* produced by running "
|
| 581 |
+
"your agent on the split's questions β not your agent's source code, "
|
| 582 |
+
"and not a raw transcript. We then score it against the private gold "
|
| 583 |
+
"answers and add your row to the leaderboard."
|
| 584 |
+
)
|
| 585 |
+
|
| 586 |
+
gr.Markdown(
|
| 587 |
+
"### π Expected file format\n"
|
| 588 |
+
"The file must be a **JSON array** where each element is an object "
|
| 589 |
+
"with a `Question Number` and an `answer`:\n"
|
| 590 |
+
"\n"
|
| 591 |
+
"```json\n"
|
| 592 |
+
"[\n"
|
| 593 |
+
" {\"Question Number\": \"001\", \"answer\": {\"Sweden\": 1.2, \"Finland\": 0.8}},\n"
|
| 594 |
+
" {\"Question Number\": \"002\", \"answer\": {\"Brunei\": -0.67}},\n"
|
| 595 |
+
" ...\n"
|
| 596 |
+
"]\n"
|
| 597 |
+
"```\n"
|
| 598 |
+
"\n"
|
| 599 |
+
"**Required per entry:**\n"
|
| 600 |
+
"- `Question Number` β the task ID matching the DEEPSYNTH questions file "
|
| 601 |
+
"(dev: 1-40, test: 1-80).\n"
|
| 602 |
+
"- `answer` β your agent's final structured answer (JSON object / array / number), "
|
| 603 |
+
"**NOT** the chain-of-thought or tool transcript.\n\n"
|
| 604 |
+
f"Full spec: [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json). "
|
| 605 |
+
f"Validate locally before uploading: "
|
| 606 |
+
f"`python scripts/evaluation/validate_submission.py my_predictions.json --strict`."
|
| 607 |
)
|
| 608 |
|
| 609 |
with gr.Row():
|
|
|
|
| 616 |
value="ReAct",
|
| 617 |
)
|
| 618 |
split_in = gr.Dropdown(
|
| 619 |
+
choices=["dev", "test"],
|
| 620 |
label="Split evaluated",
|
| 621 |
+
value="test",
|
| 622 |
)
|
| 623 |
with gr.Column():
|
| 624 |
+
organization_in = gr.Textbox(label="Organization", placeholder="e.g. MSR, Stanford, Google, etc.")
|
| 625 |
contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
|
| 626 |
code_url_in = gr.Textbox(
|
| 627 |
label="Code URL (required)",
|
|
|
|
| 629 |
)
|
| 630 |
|
| 631 |
predictions_in = gr.File(
|
| 632 |
+
label="Predictions JSON (the output file produced by your agent)",
|
| 633 |
file_types=[".json"],
|
| 634 |
)
|
| 635 |
submit_btn = gr.Button("Submit for review", variant="primary")
|
|
|
|
| 646 |
|
| 647 |
gr.Markdown(
|
| 648 |
"---\n"
|
| 649 |
+
"**What happens after you submit?** Your file is queued in the Space and a GitHub "
|
| 650 |
+
"issue is opened on the benchmark repo so maintainers get notified. We verify metadata "
|
| 651 |
+
"honesty and spot-check reproducibility via your `code_url` before computing scores and "
|
| 652 |
+
"merging to the leaderboard.\n\n"
|
| 653 |
f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
|
| 654 |
"adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
|
| 655 |
)
|