Spaces:

DeepSynthesisTeam
/

deepsynth-leaderboard

Sleeping

App Files Files Community

debjitpaul committed on Apr 20

Commit

95b8b77

1 Parent(s): 8b3abc6

Multi-channel submission storage (HF Dataset) + notifications (GitHub)

Browse files

Files changed (1) hide show

app.py +288 -29

app.py CHANGED Viewed

@@ -11,6 +11,9 @@ import datetime
 import json
 import os
 import re
 from pathlib import Path
 from typing import Any
@@ -23,9 +26,27 @@ import pandas as pd
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
-QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
 QUEUE_DIR.mkdir(exist_ok=True, parents=True)
 TITLE = "🐙 DeepSynth Leaderboard"
 TAGLINE = "A Benchmark for Deep Information Synthesis · ICLR 2026"
 ABOUT_BLURB = (
@@ -214,6 +235,187 @@ def _safe_slug(text: str, maxlen: int = 40) -> str:
     return slug[:maxlen] or "unnamed"
 def submit_predictions(
     file_obj,
     agent_name: str,
@@ -246,19 +448,21 @@ def submit_predictions(
     except OSError as e:
         return f"❌ **Could not read uploaded file:** {e}"
-    if not isinstance(predictions, dict) or not predictions:
-        return "❌ **Predictions file must be a non-empty JSON object mapping task IDs to answers.**"
     bundle = {
         "received_at": datetime.datetime.utcnow().isoformat() + "Z",
         "metadata": {
-            "agent_name":     agent_name.strip(),
-            "base_model":     base_model.strip(),
-            "scaffold":       scaffold,
-            "organization":   organization.strip(),
-            "contact_email":  contact_email.strip(),
-            "code_url":       code_url.strip(),
-            "split":          split,
             "submission_date": datetime.date.today().isoformat(),
         },
         "predictions": predictions,
@@ -266,18 +470,49 @@ def submit_predictions(
     date = datetime.date.today().isoformat()
     fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
-    dest = QUEUE_DIR / fname
-    with dest.open("w", encoding="utf-8") as f:
         json.dump(bundle, f, indent=2, ensure_ascii=False)
     return (
-        f"✅ **Submission received.** Your file has been queued for review as `{fname}`.\n\n"
-        f"A maintainer will score it against the private test-set answers and merge it to the "
         f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
         f"reproducibility via your `code_url`.\n\n"
-        f"**Note:** submissions in this Space's queue are held temporarily — for a permanent "
-        f"record, please also open a PR to the [benchmark repo]({REPO_URL}) with your "
-        f"predictions file under `submissions/`."
     )
@@ -341,11 +576,34 @@ def build_app() -> gr.Blocks:
             with gr.Tab("📤 Submit"):
                 gr.Markdown("## Submit your agent's predictions")
                 gr.Markdown(
-                    "Upload a JSON file of predictions on the DeepSynth **test set**. "
-                    "We'll score it against the private gold answers and add your row to the leaderboard.\n\n"
-                    f"**Format:** a JSON object mapping task IDs (`\"001\"` … `\"120\"`) to your agent's answer. "
-                    f"See [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json) "
-                    "for the full spec."
                 )
                 with gr.Row():
@@ -358,12 +616,12 @@ def build_app() -> gr.Blocks:
                             value="ReAct",
                         )
                         split_in         = gr.Dropdown(
-                            choices=["dev","test", "full"],
                             label="Split evaluated",
-                            value="dev",
                         )
                     with gr.Column():
-                        organization_in  = gr.Textbox(label="Organization", placeholder="e.g. Huawei, Anthropic, Microsoft, OpenAI, Stanford")
                         contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
                         code_url_in      = gr.Textbox(
                             label="Code URL (required)",
@@ -371,7 +629,7 @@ def build_app() -> gr.Blocks:
                         )
                 predictions_in = gr.File(
-                    label="Predictions JSON file",
                     file_types=[".json"],
                 )
                 submit_btn = gr.Button("Submit for review", variant="primary")
@@ -388,9 +646,10 @@ def build_app() -> gr.Blocks:
                 gr.Markdown(
                     "---\n"
-                    "**What happens next?** Submissions are queued for maintainer review. "
-                    "We verify metadata honesty and spot-check reproducibility via your "
-                    "`code_url` before computing scores and merging to the leaderboard.\n\n"
                     f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
                     "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
                 )

 import json
 import os
 import re
+import tempfile
+import urllib.error
+import urllib.request
 from pathlib import Path
 from typing import Any
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
+# Local fallback queue. Used only if HF Dataset upload is not configured —
+# kept as a belt-and-suspenders safety net so a misconfigured Space never
+# silently drops a submission.
+_DEFAULT_QUEUE = "/data/submissions_queue" if Path("/data").is_dir() else "submissions_queue"
+QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", _DEFAULT_QUEUE))
 QUEUE_DIR.mkdir(exist_ok=True, parents=True)
+# Primary submission storage: a private HF Dataset. Each submission is its
+# OWN file in the dataset (no shared CSV — that pattern races and loses
+# submissions when two users submit simultaneously).
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Space secret with write access to the dataset
+HF_QUEUE_REPO = os.environ.get(
+    "DEEPSYNTH_QUEUE_REPO", "DeepSynthesisTeam/deepsynth-submission-queue"
+)
+# Notification channels — both are optional. Set either or both as Space secrets.
+DISCORD_WEBHOOK_URL = os.environ.get("DISCORD_WEBHOOK_URL")  # https://discord.com/api/webhooks/...
+GH_NOTIFY_REPO = os.environ.get("DEEPSYNTH_NOTIFY_REPO", "agentdeepsynthesis/deepsynth-bench")
+GH_TOKEN = os.environ.get("GH_TOKEN")  # Fine-grained PAT, "Issues: write" on the repo
 TITLE = "🐙 DeepSynth Leaderboard"
 TAGLINE = "A Benchmark for Deep Information Synthesis · ICLR 2026"
 ABOUT_BLURB = (
     return slug[:maxlen] or "unnamed"
+def validate_predictions_payload(predictions: Any, split: str) -> str | None:
+    """Validate that uploaded file is in the eval_static_score.py format.
+    Returns an error message string if invalid, or None if valid.
+    The evaluator expects a JSON list of {"Question Number": ..., "answer": ...}
+    objects — NOT a dict keyed by task ID.
+    """
+    if not isinstance(predictions, list):
+        return (
+            "❌ **Wrong format.** Predictions must be a JSON **array** (list), "
+            "not an object/dict. Each element should be `{\"Question Number\": \"001\", "
+            "\"answer\": ...}`. See the expected format in the Submit tab above."
+        )
+    if not predictions:
+        return "❌ **Empty predictions file.** Please include answers for the tasks you evaluated."
+    expected_count = 40 if split == "dev" else 80
+    missing_fields = []
+    for i, item in enumerate(predictions[:5]):  # Sample-check the first 5
+        if not isinstance(item, dict):
+            return (
+                f"❌ **Entry {i} is not a JSON object.** Each element must be a "
+                f"dict with 'Question Number' and 'answer' keys."
+            )
+        if "Question Number" not in item:
+            missing_fields.append(f"entry {i}: missing 'Question Number'")
+        if "answer" not in item:
+            missing_fields.append(f"entry {i}: missing 'answer'")
+    if missing_fields:
+        return "❌ **Required fields missing:** " + "; ".join(missing_fields[:3])
+    if len(predictions) < expected_count:
+        return (
+            f"⚠️ **Partial submission warning:** the {split} split has {expected_count} "
+            f"tasks, but your file contains only {len(predictions)}. This will be "
+            f"accepted but scored as 0.0 for missing tasks. Continue anyway? Resubmit a "
+            f"complete file if this was unintentional."
+        )
+    return None
def upload_to_hf_dataset(bundle: dict, filename: str) -> tuple[bool, str | None]:
    """Store one submission as its own file in the HF Dataset queue repo.

    The bundle is serialized to JSON and committed as ``queue/<filename>`` in
    ``HF_QUEUE_REPO``. One file per submission is used on purpose: appending
    to a shared CSV races and silently drops simultaneous submissions.

    Returns:
        ``(True, url_of_uploaded_file)`` on success, ``(False, None)`` when no
        ``HF_TOKEN`` is configured, ``huggingface_hub`` is not installed, or
        the commit fails for any reason (the failure is printed, not raised).
    """
    if not HF_TOKEN:
        return False, None

    # Imported lazily so the Space still boots when huggingface_hub is missing.
    try:
        from huggingface_hub import HfApi, CommitOperationAdd
    except ImportError:
        print("WARN: huggingface_hub not installed; cannot upload to dataset")
        return False, None

    serialized = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
    repo_path = f"queue/{filename}"

    try:
        client = HfApi(token=HF_TOKEN)
        add_file = CommitOperationAdd(
            path_in_repo=repo_path,
            path_or_fileobj=serialized,
        )
        message = f"submission: {bundle['metadata']['agent_name']} ({bundle['metadata']['organization']})"
        client.create_commit(
            repo_id=HF_QUEUE_REPO,
            repo_type="dataset",
            operations=[add_file],
            commit_message=message,
        )
    except Exception as exc:
        # Best-effort: the caller falls back to the local queue on failure.
        print(f"WARN: HF Dataset upload failed: {exc}")
        return False, None

    return True, f"https://huggingface.co/datasets/{HF_QUEUE_REPO}/blob/main/{repo_path}"
+def notify_discord(bundle: dict, filename: str, dataset_url: str | None) -> bool:
+    """Post a submission summary to a Discord channel via webhook."""
+    if not DISCORD_WEBHOOK_URL:
+        return False
+    meta = bundle["metadata"]
+    n_preds = len(bundle["predictions"])
+    desc_lines = [
+        f"**Agent:** `{meta['agent_name']}`",
+        f"**Base model:** `{meta['base_model']}`",
+        f"**Scaffold:** `{meta['scaffold']}` · **Split:** `{meta['split']}` · **Entries:** {n_preds}",
+        f"**Org:** {meta['organization']} · **Contact:** {meta['contact_email']}",
+        f"**Code:** {meta['code_url']}",
+    ]
+    if dataset_url:
+        desc_lines.append(f"**Submission file:** [view on HF]({dataset_url})")
+    payload = json.dumps({
+        "content": "🚀 **New DEEPSYNTH leaderboard submission**",
+        "embeds": [{
+            "title": f"{meta['agent_name']} — {meta['organization']}",
+            "description": "\n".join(desc_lines),
+            "color": 0xff9d00,  # DEEPSYNTH amber
+            "timestamp": bundle["received_at"],
+        }],
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        DISCORD_WEBHOOK_URL,
+        data=payload,
+        method="POST",
+        headers={"Content-Type": "application/json", "User-Agent": "deepsynth-leaderboard"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return resp.status in (200, 204)
+    except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
+        print(f"WARN: Discord notification failed: {e}")
+        return False
+def notify_github_issue(bundle: dict, filename: str, dataset_url: str | None) -> bool:
+    """Open a GitHub issue on the benchmark repo so maintainers get an email
+    AND a permanent searchable record they can check off as they review.
+    """
+    if not GH_TOKEN:
+        return False
+    meta = bundle["metadata"]
+    title = f"[Submission] {meta['agent_name']} · {meta['organization']}"
+    file_link = (
+        f"[`{filename}`]({dataset_url})" if dataset_url else f"`{filename}` (in Space queue)"
+    )
+    body = (
+        f"**New DEEPSYNTH leaderboard submission received via the HF Space form.**\n\n"
+        f"| Field | Value |\n"
+        f"|---|---|\n"
+        f"| Agent | `{meta['agent_name']}` |\n"
+        f"| Base model | `{meta['base_model']}` |\n"
+        f"| Scaffold | `{meta['scaffold']}` |\n"
+        f"| Split | `{meta['split']}` |\n"
+        f"| Organization | {meta['organization']} |\n"
+        f"| Contact | {meta['contact_email']} |\n"
+        f"| Code URL | {meta['code_url']} |\n"
+        f"| Received at | {bundle['received_at']} |\n"
+        f"| Predictions count | {len(bundle['predictions'])} |\n"
+        f"| Submission file | {file_link} |\n\n"
+        f"**Maintainer checklist:**\n"
+        f"- [ ] Verify `code_url` is public and reproducible\n"
+        f"- [ ] Pull the file from the queue dataset\n"
+        f"- [ ] Run `eval_static_score.py` against private gold answers\n"
+        f"- [ ] Commit scored JSON to the Space's `submissions/`\n"
+        f"- [ ] Reply to submitter at {meta['contact_email']}\n"
+        f"- [ ] Close this issue\n"
+    )
+    payload = json.dumps({
+        "title": title,
+        "body": body,
+        "labels": ["submission", "needs-review"],
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        f"https://api.github.com/repos/{GH_NOTIFY_REPO}/issues",
+        data=payload,
+        method="POST",
+        headers={
+            "Accept": "application/vnd.github+json",
+            "Authorization": f"Bearer {GH_TOKEN}",
+            "X-GitHub-Api-Version": "2022-11-28",
+            "Content-Type": "application/json",
+            "User-Agent": "deepsynth-leaderboard-space",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return resp.status in (200, 201)
+    except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
+        print(f"WARN: GitHub notification failed: {e}")
+        return False
 def submit_predictions(
     file_obj,
     agent_name: str,
     except OSError as e:
         return f"❌ **Could not read uploaded file:** {e}"
+    error = validate_predictions_payload(predictions, split)
+    if error and error.startswith("❌"):
+        return error
+    warning_prefix = error if error else ""
     bundle = {
         "received_at": datetime.datetime.utcnow().isoformat() + "Z",
         "metadata": {
+            "agent_name":      agent_name.strip(),
+            "base_model":      base_model.strip(),
+            "scaffold":        scaffold,
+            "organization":    organization.strip(),
+            "contact_email":   contact_email.strip(),
+            "code_url":        code_url.strip(),
+            "split":           split,
             "submission_date": datetime.date.today().isoformat(),
         },
         "predictions": predictions,
     date = datetime.date.today().isoformat()
     fname = f"{date}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"
+    # Always write the local fallback first — so even if every external
+    # service is misconfigured, the submission isn't lost while the Space
+    # is alive. Cheap insurance.
+    local_dest = QUEUE_DIR / fname
+    with local_dest.open("w", encoding="utf-8") as f:
         json.dump(bundle, f, indent=2, ensure_ascii=False)
+    # Persistent storage: upload to the HF Dataset queue.
+    hf_ok, dataset_url = upload_to_hf_dataset(bundle, fname)
+    # Notifications: fire both channels. Each is independent.
+    discord_ok = notify_discord(bundle, fname, dataset_url)
+    github_ok = notify_github_issue(bundle, fname, dataset_url)
+    # Build a status message that reflects what actually happened.
+    storage_line = (
+        f"💾 Saved permanently to [HF Dataset queue]({dataset_url}).\n\n"
+        if hf_ok
+        else "💾 Saved to Space-local queue (HF Dataset persistence not configured — "
+             "submission may not survive a Space restart; please also open a PR).\n\n"
+    )
+    notify_bits = []
+    if discord_ok: notify_bits.append("Discord")
+    if github_ok:  notify_bits.append("GitHub Issues")
+    notify_line = (
+        f"📬 Maintainers notified via {' + '.join(notify_bits)}.\n\n"
+        if notify_bits
+        else "📬 No notification channels configured on this Space — "
+             "if you don't hear back in 10 days, please email the paper authors.\n\n"
+    )
     return (
+        (warning_prefix + "\n\n" if warning_prefix else "")
+        + f"✅ **Submission received** as `{fname}` for the **{split}** split "
+        f"(**{len(predictions)}** entries).\n\n"
+        + storage_line
+        + notify_line
+        + f"A maintainer will score it against the private {split}-set answers and merge it to the "
         f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
         f"reproducibility via your `code_url`.\n\n"
+        f"**For a permanent public record,** please also open a PR to the "
+        f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
     )
             with gr.Tab("📤 Submit"):
                 gr.Markdown("## Submit your agent's predictions")
                 gr.Markdown(
+                    "Upload a JSON file containing **your agent's output** on DEEPSYNTH. "
+                    "The uploaded file must be the *predictions file* produced by running "
+                    "your agent on the split's questions — not your agent's source code, "
+                    "and not a raw transcript. We then score it against the private gold "
+                    "answers and add your row to the leaderboard."
+                )
+                gr.Markdown(
+                    "### 📄 Expected file format\n"
+                    "The file must be a **JSON array** where each element is an object "
+                    "with a `Question Number` and an `answer`:\n"
+                    "\n"
+                    "```json\n"
+                    "[\n"
+                    "  {\"Question Number\": \"001\", \"answer\": {\"Sweden\": 1.2, \"Finland\": 0.8}},\n"
+                    "  {\"Question Number\": \"002\", \"answer\": {\"Brunei\": -0.67}},\n"
+                    "  ...\n"
+                    "]\n"
+                    "```\n"
+                    "\n"
+                    "**Required per entry:**\n"
+                    "- `Question Number` — the task ID matching the DEEPSYNTH questions file "
+                    "(dev: 1-40, test: 1-80).\n"
+                    "- `answer` — your agent's final structured answer (JSON object / array / number), "
+                    "**NOT** the chain-of-thought or tool transcript.\n\n"
+                    f"Full spec: [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json). "
+                    f"Validate locally before uploading: "
+                    f"`python scripts/evaluation/validate_submission.py my_predictions.json --strict`."
                 )
                 with gr.Row():
                             value="ReAct",
                         )
                         split_in         = gr.Dropdown(
+                            choices=["dev", "test"],
                             label="Split evaluated",
+                            value="test",
                         )
                     with gr.Column():
+                        organization_in  = gr.Textbox(label="Organization", placeholder="e.g. MSR, Stanford, Google, etc.")
                         contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
                         code_url_in      = gr.Textbox(
                             label="Code URL (required)",
                         )
                 predictions_in = gr.File(
+                    label="Predictions JSON (the output file produced by your agent)",
                     file_types=[".json"],
                 )
                 submit_btn = gr.Button("Submit for review", variant="primary")
                 gr.Markdown(
                     "---\n"
+                    "**What happens after you submit?** Your file is queued in the Space and a GitHub "
+                    "issue is opened on the benchmark repo so maintainers get notified. We verify metadata "
+                    "honesty and spot-check reproducibility via your `code_url` before computing scores and "
+                    "merging to the leaderboard.\n\n"
                     f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
                     "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
                 )