| """ |
| DeepSynth Leaderboard — Hugging Face Space (Gradio) |
| |
| Unified leaderboard: paper baselines and community submissions both live as |
| JSON files in submissions/ and share a single table. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import datetime |
| import json |
| import os |
| import re |
| import tempfile |
| import urllib.error |
| import urllib.request |
| from pathlib import Path |
| from typing import Any |
|
|
| import gradio as gr |
| import pandas as pd |
|
|
| |
| |
| |
|
|
# --- Runtime configuration (every value can be overridden via environment) ---

# Directories holding the scored submission JSON files for each split.
RESULTS_DIR = Path(os.getenv("DEEPSYNTH_RESULTS_DIR", "submissions"))
DEV_RESULTS_DIR = Path(os.getenv("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))

# Incoming submissions are queued under /data when that directory exists
# (presumably the Space's persistent volume -- confirm), otherwise next to
# the app in the working directory.
if Path("/data").is_dir():
    _DEFAULT_QUEUE = "/data/submissions_queue"
else:
    _DEFAULT_QUEUE = "submissions_queue"
QUEUE_DIR = Path(os.getenv("DEEPSYNTH_QUEUE_DIR", _DEFAULT_QUEUE))
# Import-time side effect: make sure the queue directory is there.
QUEUE_DIR.mkdir(parents=True, exist_ok=True)

# Hugging Face dataset repo that receives a copy of each queued submission.
# Uploading is skipped when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_QUEUE_REPO = os.getenv(
    "DEEPSYNTH_QUEUE_REPO",
    "DeepSynthesisTeam/deepsynth-submission-queue",
)

# Maintainer notification channels; each one is skipped when its secret is unset.
DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL")
GH_NOTIFY_REPO = os.getenv("DEEPSYNTH_NOTIFY_REPO", "agentdeepsynthesis/deepsynth-bench")
GH_TOKEN = os.getenv("GH_TOKEN")
|
|
# --- Static display text and external links used by the UI ---

TITLE = "π DeepSynth Leaderboard"
TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"

_ABOUT_MOTIVATION = (
    "Large language model (LLM)-based agents are increasingly used to solve complex tasks "
    "involving tool use β web browsing, code execution, data analysis. Current benchmarks "
    "do not adequately assess their ability to solve real-world tasks requiring synthesis "
    "across multiple sources and inference beyond simple fact retrieval."
)
_ABOUT_SUMMARY = (
    "**DeepSynth** introduces 120 tasks across 7 domains and 67 countries, designed to evaluate "
    "agents on realistic, time-consuming problems that combine information gathering, synthesis, "
    "and structured reasoning."
)
# Two Markdown paragraphs separated by a blank line.
ABOUT_BLURB = "\n\n".join((_ABOUT_MOTIVATION, _ABOUT_SUMMARY))

REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
PAPER_URL = "https://arxiv.org/abs/2602.21143"
DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
|
|
| |
| |
| |
|
|
# Stylesheet handed to gr.Blocks(css=...): fonts, headings, the leaderboard
# table, and the .tagline / .section-header / .link-row helper classes that
# the layout code attaches to its HTML and Markdown components.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');

.gradio-container {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
max-width: 1400px !important;
margin: 0 auto !important;
}

.gradio-container h1, .gradio-container h2, .gradio-container h3 {
font-family: 'Inter', sans-serif !important;
font-weight: 700 !important;
letter-spacing: -0.02em !important;
}

.gradio-container h1 {
font-size: 2.2rem !important;
margin-bottom: 0.25rem !important;
text-align: center !important;
}
.gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }

.gradio-container code, .gradio-container pre {
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
font-size: 0.9em !important;
}

.tagline {
color: #6b7280;
font-size: 1rem;
margin-bottom: 1.5rem;
font-weight: 500;
text-align: center;
}

.gradio-container .table-wrap table {
font-family: 'Inter', sans-serif !important;
font-size: 0.92rem !important;
}
.gradio-container .table-wrap th {
font-weight: 600 !important;
background: #f9fafb !important;
border-bottom: 2px solid #e5e7eb !important;
text-align: left !important;
}
.gradio-container .table-wrap td {
padding: 0.55rem 0.75rem !important;
}

.section-header {
font-size: 1.1rem;
font-weight: 700;
margin: 1rem 0 0.5rem 0;
padding-bottom: 0.4rem;
border-bottom: 1px solid #e5e7eb;
}

.link-row {
text-align: center;
margin-bottom: 1.5rem;
}
.link-row a {
display: inline-block;
padding: 0.25rem 0.75rem;
margin-right: 0.5rem;
border-radius: 6px;
background: #f3f4f6;
color: #374151 !important;
text-decoration: none !important;
font-size: 0.9rem;
font-weight: 500;
}
.link-row a:hover { background: #e5e7eb; }

.gradio-container button.tab-nav {
font-weight: 600 !important;
font-size: 1rem !important;
}
"""
|
|
| |
| |
| |
|
|
def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
    """Load every ``*.json`` file in *results_dir* as one submission record.

    Files are visited in sorted filename order. A file is skipped with a
    printed warning (instead of aborting the whole load) when it cannot be
    read, is not valid UTF-8, is not valid JSON, or when its top-level value
    is not a JSON object.

    Args:
        results_dir: Directory to scan (non-recursively).

    Returns:
        The parsed records, one dict per accepted file; an empty list when
        *results_dir* does not exist.
    """
    if not results_dir.exists():
        return []

    rows: list[dict[str, Any]] = []
    for path in sorted(results_dir.glob("*.json")):
        try:
            with path.open("r", encoding="utf-8") as f:
                record = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError is the common base of json.JSONDecodeError and
            # UnicodeDecodeError, so badly encoded files are skipped too.
            print(f"WARN: failed to load {path}: {e}")
            continue
        if not isinstance(record, dict):
            # Consumers call .get() on every record; a list or scalar at the
            # top level would crash them.
            print(
                f"WARN: failed to load {path}: expected a JSON object, "
                f"got {type(record).__name__}"
            )
            continue
        rows.append(record)
    return rows
|
|
|
|
| def _access_label(access: str | None) -> str: |
| if access == "closed": |
| return "π closed" |
| if access == "open": |
| return "π open" |
| return "β" |
|
|
|
|
def leaderboard_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
    """Turn submission records into the ranked leaderboard table.

    Each record contributes one row read from its ``metadata``,
    ``scores.overall`` and ``efficiency`` sections. Rows are ordered by F1
    descending, then LLM Judge descending, with missing scores last. A
    ``Rank`` column is inserted first; the top three ranks carry a medal
    label. Optional metric columns are removed when no row has a value.

    With no submissions, an empty frame with the default column set is
    returned.
    """
    if not submissions:
        empty_columns = [
            "Rank", "Agent", "Model", "Access",
            "F1", "Precision", "Recall", "EM", "LLM Judge",
            "Org", "Date",
        ]
        return pd.DataFrame(columns=empty_columns)

    def _to_row(entry: dict[str, Any]) -> dict[str, Any]:
        # Flatten one submission record into a single table row.
        info = entry.get("metadata", {})
        scores = entry.get("scores", {}).get("overall", {})
        cost = entry.get("efficiency", {})
        return {
            "Agent": info.get("agent_name", "β"),
            "Model": info.get("base_model", "β"),
            "Access": _access_label(info.get("access")),
            "F1": scores.get("f1"),
            "Precision": scores.get("precision"),
            "Recall": scores.get("recall"),
            "EM": scores.get("exact_match"),
            "LLM Judge": scores.get("llm_judge"),
            "Avg Cost ($)": cost.get("avg_cost_usd"),
            "Avg Latency (s)": cost.get("avg_latency_s"),
            "Org": info.get("organization", "β"),
            "Date": info.get("submission_date", "β"),
        }

    table = pd.DataFrame([_to_row(entry) for entry in submissions])

    # Hide optional metrics that nobody reported.
    optional = ("Avg Cost ($)", "Avg Latency (s)", "Precision", "Recall", "EM")
    unreported = [
        name for name in optional
        if name in table.columns and table[name].isna().all()
    ]
    table = table.drop(columns=unreported)

    table = table.sort_values(
        by=["F1", "LLM Judge"],
        ascending=[False, False],
        na_position="last",
    )
    table = table.reset_index(drop=True)

    # Plain 1-based ranks, with the first three replaced by medal labels.
    ranks = [str(position) for position in range(1, len(table) + 1)]
    podium = ["π₯ 1", "π₯ 2", "π₯ 3"]
    ranks[:3] = podium[:len(ranks)]
    table.insert(0, "Rank", ranks)

    return table
|
|
|
|
| |
| |
| |
|
|
| SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._-]+") |
|
|
|
|
| def _safe_slug(text: str, maxlen: int = 40) -> str: |
| slug = SAFE_NAME_RE.sub("-", (text or "unnamed").strip()).strip("-").lower() |
| return slug[:maxlen] or "unnamed" |
|
|
|
|
def validate_predictions_payload(predictions: Any, split: str) -> str | None:
    """Validate that an uploaded payload is in the eval_static_score.py format.

    The evaluator expects a JSON list of
    ``{"Question Number": ..., "answer": ...}`` objects — NOT a dict keyed
    by task ID. Every entry is checked, not just the first few, so a
    malformed entry late in the file is still reported.

    Args:
        predictions: The parsed JSON payload.
        split: ``"dev"`` (40 tasks expected); any other value is treated as
            the 80-task split.

    Returns:
        ``None`` when the payload is complete and well formed. Otherwise a
        Markdown message: either a blocking format error, or the
        "Partial submission warning" notice when the payload is well formed
        but has fewer entries than the split has tasks.
    """
    if not isinstance(predictions, list):
        return (
            "β **Wrong format.** Predictions must be a JSON **array** (list), "
            "not an object/dict. Each element should be `{\"Question Number\": \"001\", "
            "\"answer\": ...}`. See the expected format in the Submit tab above."
        )
    if not predictions:
        return "β **Empty predictions file.** Please include answers for the tasks you evaluated."

    expected_count = 40 if split == "dev" else 80
    missing_fields = []
    for i, item in enumerate(predictions):
        if not isinstance(item, dict):
            return (
                f"β **Entry {i} is not a JSON object.** Each element must be a "
                f"dict with 'Question Number' and 'answer' keys."
            )
        for field in ("Question Number", "answer"):
            if field not in item:
                missing_fields.append(f"entry {i}: missing '{field}'")
    if missing_fields:
        # Only the first few problems are listed to keep the message short.
        return "β **Required fields missing:** " + "; ".join(missing_fields[:3])

    if len(predictions) < expected_count:
        return (
            f"β οΈ **Partial submission warning:** the {split} split has {expected_count} "
            f"tasks, but your file contains only {len(predictions)}. This will be "
            f"accepted but scored as 0.0 for missing tasks. Continue anyway? Resubmit a "
            f"complete file if this was unintentional."
        )
    return None
|
|
|
|
def upload_to_hf_dataset(bundle: dict, filename: str) -> tuple[bool, str | None]:
    """Commit one submission bundle to the HF dataset repo ``HF_QUEUE_REPO``.

    The bundle is serialised as indented JSON and added as its own file at
    ``queue/<filename>``, so submissions never share a file.

    Returns:
        ``(True, url_of_the_uploaded_file)`` on success. ``(False, None)``
        when ``HF_TOKEN`` is unset, ``huggingface_hub`` is not installed, or
        the commit fails for any reason (the failure is printed).
    """
    if not HF_TOKEN:
        return False, None

    # Imported lazily so the app still starts without huggingface_hub.
    try:
        from huggingface_hub import HfApi, CommitOperationAdd
    except ImportError:
        print("WARN: huggingface_hub not installed; cannot upload to dataset")
        return False, None

    encoded = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
    repo_path = f"queue/{filename}"

    try:
        client = HfApi(token=HF_TOKEN)
        add_file = CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=encoded)
        meta = bundle["metadata"]
        message = f"submission: {meta['agent_name']} ({meta['organization']})"
        client.create_commit(
            repo_id=HF_QUEUE_REPO,
            repo_type="dataset",
            operations=[add_file],
            commit_message=message,
        )
    except Exception as e:
        # Best effort: the caller treats a failed upload as "not persisted".
        print(f"WARN: HF Dataset upload failed: {e}")
        return False, None

    return True, f"https://huggingface.co/datasets/{HF_QUEUE_REPO}/blob/main/{repo_path}"
|
|
|
|
def notify_discord(bundle: dict, filename: str, dataset_url: str | None) -> bool:
    """Send a summary of a new submission to the configured Discord webhook.

    Args:
        bundle: Submission bundle; its ``metadata``, ``predictions`` and
            ``received_at`` entries are read.
        filename: Queue filename of the submission (not used in the message).
        dataset_url: Link to the uploaded file, appended when available.

    Returns:
        ``True`` when the webhook answers with HTTP 200 or 204; ``False``
        when ``DISCORD_WEBHOOK_URL`` is unset or the request fails.
    """
    if not DISCORD_WEBHOOK_URL:
        return False

    meta = bundle["metadata"]
    entry_count = len(bundle["predictions"])

    summary = [
        f"**Agent:** `{meta['agent_name']}`",
        f"**Base model:** `{meta['base_model']}`",
        f"**Scaffold:** `{meta['scaffold']}` Β· **Split:** `{meta['split']}` Β· **Entries:** {entry_count}",
        f"**Org:** {meta['organization']} Β· **Contact:** {meta['contact_email']}",
        f"**Code:** {meta['code_url']}",
    ]
    if dataset_url:
        summary.append(f"**Submission file:** [view on HF]({dataset_url})")

    embed = {
        "title": f"{meta['agent_name']} β {meta['organization']}",
        "description": "\n".join(summary),
        "color": 0xff9d00,
        "timestamp": bundle["received_at"],
    }
    message = {
        "content": "π **New DEEPSYNTH leaderboard submission**",
        "embeds": [embed],
    }

    request = urllib.request.Request(
        DISCORD_WEBHOOK_URL,
        data=json.dumps(message).encode("utf-8"),
        method="POST",
        headers={"Content-Type": "application/json", "User-Agent": "deepsynth-leaderboard"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status in (200, 204)
    except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
        print(f"WARN: Discord notification failed: {e}")
        return False
    return delivered
|
|
|
|
def notify_github_issue(bundle: dict, filename: str, dataset_url: str | None) -> bool:
    """Open a GitHub issue on ``GH_NOTIFY_REPO`` describing a new submission.

    The issue body is a Markdown table of the submission metadata followed
    by a maintainer review checklist, and the issue is labelled
    ``submission`` and ``needs-review``.

    Args:
        bundle: Submission bundle; its ``metadata``, ``predictions`` and
            ``received_at`` entries are read.
        filename: Queue filename, shown in the table.
        dataset_url: Link to the uploaded file; when missing, the filename
            is shown without a link.

    Returns:
        ``True`` when GitHub answers with HTTP 200 or 201; ``False`` when
        ``GH_TOKEN`` is unset or the request fails.
    """
    if not GH_TOKEN:
        return False

    meta = bundle["metadata"]
    title = f"[Submission] {meta['agent_name']} Β· {meta['organization']}"

    if dataset_url:
        file_link = f"[`{filename}`]({dataset_url})"
    else:
        file_link = f"`{filename}` (in Space queue)"

    table_rows = [
        ("Agent", f"`{meta['agent_name']}`"),
        ("Base model", f"`{meta['base_model']}`"),
        ("Scaffold", f"`{meta['scaffold']}`"),
        ("Split", f"`{meta['split']}`"),
        ("Organization", meta["organization"]),
        ("Contact", meta["contact_email"]),
        ("Code URL", meta["code_url"]),
        ("Received at", bundle["received_at"]),
        ("Predictions count", len(bundle["predictions"])),
        ("Submission file", file_link),
    ]
    checklist = [
        "Verify `code_url` is public and reproducible",
        "Pull the file from the queue dataset",
        "Run `eval_static_score.py` against private gold answers",
        "Commit scored JSON to the Space's `submissions/`",
        f"Reply to submitter at {meta['contact_email']}",
        "Close this issue",
    ]

    body = (
        "**New DEEPSYNTH leaderboard submission received via the HF Space form.**\n\n"
        + "| Field | Value |\n"
        + "|---|---|\n"
        + "".join(f"| {field} | {value} |\n" for field, value in table_rows)
        + "\n"
        + "**Maintainer checklist:**\n"
        + "".join(f"- [ ] {step}\n" for step in checklist)
    )

    issue = {
        "title": title,
        "body": body,
        "labels": ["submission", "needs-review"],
    }
    request = urllib.request.Request(
        f"https://api.github.com/repos/{GH_NOTIFY_REPO}/issues",
        data=json.dumps(issue).encode("utf-8"),
        method="POST",
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {GH_TOKEN}",
            "X-GitHub-Api-Version": "2022-11-28",
            "Content-Type": "application/json",
            "User-Agent": "deepsynth-leaderboard-space",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            created = response.status in (200, 201)
    except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
        print(f"WARN: GitHub notification failed: {e}")
        return False
    return created
|
|
|
|
def submit_predictions(
    file_obj,
    agent_name: str,
    base_model: str,
    scaffold: str,
    organization: str,
    contact_email: str,
    code_url: str,
    split: str,
) -> str:
    """Handle one submission from the Submit form.

    Reads and validates the uploaded predictions file, writes a bundle
    (metadata + predictions) to ``QUEUE_DIR``, tries to copy it to the HF
    dataset queue, and notifies maintainers via Discord and GitHub.

    Args:
        file_obj: The uploaded file: an object with a ``name`` attribute
            holding its path, or the path itself. ``None`` if nothing was
            attached.
        agent_name: Required display name.
        base_model: Optional; stored as ``"unspecified"`` when blank.
        scaffold: Optional; stored as ``"none"`` when empty.
        organization: Optional; stored as ``"Anonymous"`` when blank.
        contact_email: Optional; a value without ``@`` only adds a note.
        code_url: Optional; a value without an http(s) scheme only adds a note.
        split: Split the predictions belong to (``"dev"`` or ``"test"``).

    Returns:
        A Markdown status message: an error description when the submission
        is rejected, otherwise a receipt describing where the file was
        stored and who was notified.
    """
    if file_obj is None:
        return "β **Missing file.** Please attach a predictions JSON."
    if not agent_name or not agent_name.strip():
        return "β **Missing agent name.** Please give your submission a short display name."

    # Normalise the optional text fields once; a cleared form field may
    # arrive as None rather than "".
    contact_email = (contact_email or "").strip()
    code_url = (code_url or "").strip()
    base_model = (base_model or "").strip() or "unspecified"
    organization = (organization or "").strip() or "Anonymous"
    scaffold = scaffold or "none"

    # Suspicious-looking optional fields never block a submission.
    soft_warnings: list[str] = []
    if contact_email and "@" not in contact_email:
        soft_warnings.append("contact email looks malformed")
    if code_url and not code_url.startswith(("http://", "https://")):
        soft_warnings.append("code URL must start with http:// or https://")

    try:
        src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
        # utf-8-sig also accepts files that start with a UTF-8 byte-order mark.
        with src_path.open("r", encoding="utf-8-sig") as f:
            predictions = json.load(f)
    except json.JSONDecodeError as e:
        return f"β **Invalid JSON in uploaded file:** {e}"
    except (OSError, UnicodeDecodeError) as e:
        return f"β **Could not read uploaded file:** {e}"

    # The validator reports blocking errors and the advisory partial-file
    # notice through the same return value. Tell them apart by the notice
    # text rather than by a leading emoji, which is easy to get wrong.
    error = validate_predictions_payload(predictions, split)
    warning_prefix = ""
    if error:
        if "Partial submission warning" not in error:
            return error
        warning_prefix = error

    # One UTC clock reading feeds the timestamp, the recorded date and the
    # filename, so they can never disagree.
    now = datetime.datetime.now(datetime.timezone.utc)
    today = now.date().isoformat()

    bundle = {
        "received_at": now.replace(tzinfo=None).isoformat() + "Z",
        "metadata": {
            "agent_name": agent_name.strip(),
            "base_model": base_model,
            "scaffold": scaffold,
            "organization": organization,
            "contact_email": contact_email,
            "code_url": code_url,
            "split": split,
            "submission_date": today,
        },
        "predictions": predictions,
    }

    fname = f"{today}-{_safe_slug(organization)}-{_safe_slug(agent_name)}.json"

    # Always keep a copy in the local queue, whether or not the upload works.
    local_dest = QUEUE_DIR / fname
    with local_dest.open("w", encoding="utf-8") as f:
        json.dump(bundle, f, indent=2, ensure_ascii=False)

    hf_ok, dataset_url = upload_to_hf_dataset(bundle, fname)

    discord_ok = notify_discord(bundle, fname, dataset_url)
    github_ok = notify_github_issue(bundle, fname, dataset_url)

    # --- Build the receipt shown to the submitter ---
    storage_line = (
        f"πΎ Saved permanently to [HF Dataset queue]({dataset_url}).\n\n"
        if hf_ok
        else "πΎ Saved to Space-local queue (HF Dataset persistence not configured β "
        "submission may not survive a Space restart; please also open a PR).\n\n"
    )

    notify_bits = []
    if discord_ok:
        notify_bits.append("Discord")
    if github_ok:
        notify_bits.append("GitHub Issues")
    notify_line = (
        f"π¬ Maintainers notified via {' + '.join(notify_bits)}.\n\n"
        if notify_bits
        else "π¬ No notification channels configured on this Space β "
        "if you don't hear back in 10 days, please email the paper authors.\n\n"
    )

    soft_line = ""
    if soft_warnings:
        soft_line = "β οΈ **Note:** " + "; ".join(soft_warnings) + ".\n\n"

    email_line = (
        f"We may email `{contact_email}` if we need to verify "
        f"reproducibility via your `code_url`."
        if contact_email
        else "**Tip:** add a contact email next time so we can follow up about "
        "reproducibility or questions."
    )

    return (
        (warning_prefix + "\n\n" if warning_prefix else "")
        + soft_line
        + f"✅ **Submission received** as `{fname}` for the **{split}** split "
        f"(**{len(predictions)}** entries).\n\n"
        + storage_line
        + notify_line
        + f"A maintainer will score it against the {split}-set gold answers and merge it to the "
        f"leaderboard within ~1 week. " + email_line + "\n\n"
        f"**For a permanent public record,** please also open a PR to the "
        f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
    )
|
|
|
|
| |
| |
| |
|
|
def _build_leaderboard_tab(df_dev: pd.DataFrame, df_test: pd.DataFrame) -> None:
    """Add the Leaderboard tab: an intro line plus one sub-tab per split."""
    split_views = (
        (
            "Dev (40 tasks Β· public)",
            "Self-reported numbers on the **public dev set** (40 tasks, "
            "Pass@1). Useful for prototyping and comparing methods during "
            "development. Anyone can score themselves locally on this split.",
            df_dev,
        ),
        (
            "Test (80 tasks Β· held-out)",
            "Official numbers on the **held-out test set** (80 tasks, "
            "Pass@1). Gold answers are private; submissions are scored "
            "by the maintainers.",
            df_test,
        ),
    )
    with gr.Tab("π Leaderboard"):
        gr.Markdown(
            "Results ranked by **F1** score (LLM Judge used as tiebreaker). "
            "F1 / Precision / Recall measure prediction quality against gold "
            "answers; **LLM Judge** reports average precision under semantic "
            "matching. π = closed model, π = open-weights.",
            elem_classes=["section-header"],
        )
        with gr.Tabs():
            for tab_title, blurb, frame in split_views:
                with gr.Tab(tab_title):
                    gr.Markdown(blurb)
                    gr.Dataframe(value=frame, interactive=False, wrap=True)


def _build_submit_tab() -> None:
    """Add the Submit tab: instructions, the upload form and its click handler."""
    intro_md = (
        "Upload a JSON file containing **your agent's output** on either the "
        "**dev** (40 public tasks) or **test** (80 held-out tasks) split of DEEPSYNTH. "
        "For dev submissions, the evaluator scores against the publicly released gold "
        "answers. For test submissions, we score against our private gold answers and "
        "add your row to the held-out leaderboard. The uploaded file must be the "
        "*predictions file* produced by running your agent on the split's questions β "
        "not your agent's source code, and not a raw transcript."
    )
    format_md = (
        "### π Expected file format\n"
        "The file must be a **JSON array** where each element is an object "
        "with a `Question Number` and an `answer`:\n"
        "\n"
        "```json\n"
        "[\n"
        ' {"Question Number": "1", "answer": {"Sweden": 1.2, "Finland": 0.8}},\n'
        ' {"Question Number": "2", "answer": {"Brunei": -0.67}},\n'
        " ...\n"
        "]\n"
        "```\n"
        "\n"
        "**Required per entry:**\n"
        "- `Question Number` β the task ID matching the DEEPSYNTH questions file "
        "(dev, test).\n"
        "- `answer` β your agent's final structured answer (JSON object / array / number), "
        "**NOT** the chain-of-thought or tool transcript.\n\n"
        f"Full spec: [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json). "
        "Validate locally before uploading: "
        "`python scripts/evaluation/validate_submission.py my_predictions.json --strict`."
    )
    footer_md = (
        "---\n"
        "**What happens after you submit?** Your file is queued in the Space and a GitHub "
        "issue is opened on the benchmark repo so maintainers get notified. We verify metadata "
        "honesty and spot-check reproducibility via your `code_url` before computing scores and "
        "merging to the leaderboard.\n\n"
        f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
        "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
    )

    with gr.Tab("π€ Submit"):
        gr.Markdown("## Submit your agent's predictions")
        gr.Markdown(intro_md)
        gr.Markdown(format_md)

        with gr.Row():
            with gr.Column():
                agent_name_in = gr.Textbox(
                    label="Agent name (required)",
                    placeholder="e.g. ReAct-GPT5",
                    info="Short display name shown on the leaderboard.",
                )
                base_model_in = gr.Textbox(
                    label="Base model",
                    placeholder="e.g. gpt-5.2-pro (2026-02)",
                    info="Optional. Defaults to 'unspecified'.",
                )
                scaffold_in = gr.Dropdown(
                    choices=["none", "ReAct", "CodeAct", "Plan-and-Execute", "Reflexion", "MCTS", "Custom"],
                    label="Scaffold",
                    value="none",
                    info="Optional. Select the agent scaffold you used.",
                )
                split_in = gr.Dropdown(
                    choices=["dev", "test"],
                    label="Split evaluated",
                    value="test",
                )
            with gr.Column():
                organization_in = gr.Textbox(
                    label="Organization",
                    placeholder="e.g. DeepSeek, Huawei, Google, MSR",
                    info="Optional. Defaults to 'Anonymous'.",
                )
                contact_email_in = gr.Textbox(
                    label="Contact email",
                    placeholder="you@org.edu",
                    info="Optional but recommended β we may email you about reproducibility.",
                )
                code_url_in = gr.Textbox(
                    label="Code URL",
                    placeholder="https://github.com/you/your-agent",
                    info="Optional. A public URL helps us accept your submission faster.",
                )

        predictions_in = gr.File(
            label="Predictions JSON (required) β the output file produced by your agent",
            file_types=[".json"],
        )
        submit_btn = gr.Button("Submit for review", variant="primary")
        submit_status = gr.Markdown()

        # Input order must match the parameter order of submit_predictions.
        form_inputs = [
            predictions_in, agent_name_in, base_model_in, scaffold_in,
            organization_in, contact_email_in, code_url_in, split_in,
        ]
        submit_btn.click(fn=submit_predictions, inputs=form_inputs, outputs=submit_status)

        gr.Markdown(footer_md)


def _build_about_tab() -> None:
    """Add the About tab: the blurb followed by four descriptive sections."""
    task_md = (
        "## The task\n"
        "Each DeepSynth task presents a complex, real-world question that cannot "
        "be answered by a single web search or a single document lookup. Producing "
        "the correct answer requires an agent to **decompose** the question into "
        "sub-problems, **gather** evidence from multiple heterogeneous sources "
        "(news articles, government statistics, scientific publications, specialized "
        "databases), **synthesize** findings into a coherent intermediate state, and "
        "**return a structured answer** (typically a JSON object of key-value pairs, "
        "a ranked list, or a numerical aggregate).\n\n"
        "Tasks span **7 domains** β science, geography, economics, history, culture, "
        "politics, and technology β and reference entities across **67 countries**. "
        "Expert curators verified that every question has a well-defined answer "
        "recoverable from public sources at the time of release, and that answering "
        "it requires combining evidence from at least three distinct sources."
    )
    splits_md = (
        "## Splits\n"
        "DeepSynth ships as **120 expert-curated tasks** divided into two splits:\n\n"
        "- **Dev set β 40 tasks (public, with gold answers).** Each dev task includes "
        "the question, the gold answer, a full **decomposition** into sub-problems, "
        "and the **intermediate answers** expected at each step. Use this split for "
        "prototyping, debugging, and agent development β you can score yourself "
        "locally and inspect where your agent's reasoning diverges from the expected "
        "trajectory.\n"
        "- **Test set β 80 tasks (questions only).** Gold answers and decompositions "
        "are held private to prevent contamination and enable clean evaluation. "
        "Submit your predictions via the leaderboard and we score them against the "
        "held-out answers."
    )
    metrics_md = (
        "## Metrics\n"
        "- **F1 / Precision / Recall** β token-level overlap between predicted and "
        "gold answers, averaged over all tasks.\n"
        "- **Exact Match (EM)** β fraction of tasks where the predicted answer "
        "exactly equals the gold answer (strict structured-equality check).\n"
        "- **LLM Judge** β semantic-equivalence scoring with small numerical "
        "tolerance (1β5.5%), evaluated by a strong frozen judge model. Captures "
        "cases where the answer is substantively correct but phrased or formatted "
        "differently from the gold."
    )
    dataset_md = (
        "## Dataset\n"
        f"DeepSynth is hosted on π€ [`DeepSynthesisTeam/deepsynth-bench`]({DATASET_URL}). "
        "Dev-set gold answers, decompositions, and intermediate-answer JSON schemas "
        "are shipped alongside the questions. Test-set release is gated β downloading "
        "requires agreeing to the evaluation protocol."
    )

    with gr.Tab("π About"):
        for section in (ABOUT_BLURB, task_md, splits_md, metrics_md, dataset_md):
            gr.Markdown(section)


def _build_citation_tab() -> None:
    """Add the Citation tab with the BibTeX entry for the paper."""
    bibtex_md = (
        "```bibtex\n"
        "@inproceedings{deepsynth2026,\n"
        " title = {A Benchmark for Deep Information Synthesis},\n"
        " author = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald\n"
        " and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim\n"
        " and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex\n"
        " and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru\n"
        " and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},\n"
        " booktitle = {International Conference on Learning Representations (ICLR)},\n"
        " year = {2026},\n"
        " url = {" + PAPER_URL + "}\n"
        "}\n"
        "```"
    )
    with gr.Tab("π Citation"):
        gr.Markdown("### Please cite:")
        gr.Markdown(bibtex_md)


def build_app() -> gr.Blocks:
    """Assemble the Gradio app: header, then the four top-level tabs.

    Both leaderboard tables are loaded once, when the app is built, from
    ``RESULTS_DIR`` (test split) and ``DEV_RESULTS_DIR`` (dev split).
    """
    df_test = leaderboard_dataframe(load_submissions(RESULTS_DIR))
    df_dev = leaderboard_dataframe(load_submissions(DEV_RESULTS_DIR))

    header_links = (
        (PAPER_URL, "π Paper"),
        (REPO_URL, "π» Code"),
        (DATASET_URL, "π€ Dataset"),
        (f"{REPO_URL}#submitting-to-the-leaderboard", "π₯ How to submit"),
    )
    anchors = "".join(
        f"<a href='{href}' target='_blank'>{text}</a>" for href, text in header_links
    )

    with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
        gr.Markdown(f"# {TITLE}")
        gr.HTML(f"<div class='tagline'>{TAGLINE}</div>")
        gr.HTML("<div class='link-row'>" + anchors + "</div>")

        with gr.Tabs():
            _build_leaderboard_tab(df_dev, df_test)
            _build_submit_tab()
            _build_about_tab()
            _build_citation_tab()

    return app
|
|
|
|
def main() -> None:
    """Build the Gradio app and start serving it."""
    app = build_app()
    app.launch()


if __name__ == "__main__":
    main()