Spaces:

DeepSynthesisTeam
/

deepsynth-leaderboard

Sleeping

App Files Files Community

debjitpaul committed on Apr 19

Commit

63fb47d

1 Parent(s): 369761f

Overhaul: Inter/JetBrains Mono typography, paper baselines, submission form

Browse files

Files changed (1) hide show

app.py +370 -129

app.py CHANGED Viewed

@@ -1,16 +1,15 @@
 """
 DeepSynth Leaderboard — Hugging Face Space (Gradio)
-Reads validated submissions from a results repo and renders a sortable,
-filterable leaderboard with per-domain breakdowns and efficiency metrics.
-Deploy by pushing this file + requirements.txt to a HF Space with SDK=gradio.
 """
 from __future__ import annotations
 import json
 import os
 from pathlib import Path
 from typing import Any
@@ -22,27 +21,158 @@ import pandas as pd
 # ---------------------------------------------------------------------------
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 TITLE = "🐙 DeepSynth Leaderboard"
-SUBTITLE = (
-    "Large language model (LLM)-based agents are increasingly used to solve complex tasks involving tool use, such as web browsing, code execution, and data analysis. However, current evaluation benchmarks do not adequately assess their ability to solve real-world tasks that require synthesizing information from multiple sources and inferring insights beyond simple fact retrieval."
-    "We introduce DEEPSYNTH, a novel benchmark of 120 tasks across 7 domains and 67 countries, designed to evaluate agents on realistic, time-consuming problems that combine information gathering, synthesis, and structured reasoning. "
-    "across 7 domains and 67 countries. ICLR 2026."
 )
 REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
 PAPER_URL = "https://arxiv.org/abs/2602.21143"
 DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
-DOMAINS = [
-    "science", "geography", "economics", "history",
-    "culture", "politics", "technology",
 ]
 # ---------------------------------------------------------------------------
-# Data loading
 # ---------------------------------------------------------------------------
 def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
-    """Load every *.json file under results_dir as a submission result row."""
     if not results_dir.exists():
         return []
     rows = []
@@ -55,10 +185,12 @@ def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
     return rows
-def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
-    """Flatten submission result records into a leaderboard DataFrame."""
     if not submissions:
-        return pd.DataFrame()
     records = []
     for s in submissions:
@@ -66,153 +198,262 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
         scores = s.get("scores", {})
         efficiency = s.get("efficiency", {})
-        row = {
-            "Agent": meta.get("agent_name", "—"),
-            "Base Model": meta.get("base_model", "—"),
-            "Scaffold": meta.get("scaffold", "—"),
-            "Overall EM": scores.get("overall", {}).get("exact_match"),
-            "Overall F1": scores.get("overall", {}).get("f1"),
-            "LLM Judge": scores.get("overall", {}).get("llm_judge"),
-        }
-        #for domain in DOMAINS:
-        #    row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
-        row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
-        row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
-        row["Avg Tool Calls"] = efficiency.get("avg_num_tool_calls")
-        row["Split"] = meta.get("split", "—")
-        row["Org"] = meta.get("organization", "—")
-        row["Date"] = meta.get("submission_date", "—")
         paper = meta.get("paper_url")
         code = meta.get("code_url")
-        row["Paper"] = f"[link]({paper})" if paper else ""
-        row["Code"] = f"[link]({code})" if code else ""
-        records.append(row)
     df = pd.DataFrame(records)
-    if "LLM Judge" in df.columns:
         df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
     return df
 # ---------------------------------------------------------------------------
-# UI
 # ---------------------------------------------------------------------------
-def filter_df(
-    df: pd.DataFrame,
-    split_filter: str,
-    scaffolds: list[str],
-    min_seeds: int,
-) -> pd.DataFrame:
-    if df.empty:
-        return df
-    out = df.copy()
-    if split_filter != "all":
-        out = out[out["Split"] == split_filter]
-    if scaffolds:
-        out = out[out["Scaffold"].isin(scaffolds)]
-    # min_seeds filter omitted from display columns for clarity — expose in detail view later.
-    return out.reset_index(drop=True)
-def build_app() -> gr.Blocks:
-    submissions = load_submissions(RESULTS_DIR)
-    df_full = build_dataframe(submissions)
-    all_scaffolds = sorted(df_full["Scaffold"].unique().tolist()) if not df_full.empty else []
-    with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as app:
         gr.Markdown(f"# {TITLE}")
-        gr.Markdown(SUBTITLE)
-        gr.Markdown(
-            f"📄 [Paper]({PAPER_URL}) · "
-            f"💻 [Code]({REPO_URL}) · "
-            f"🤗 [Dataset]({DATASET_URL}) · "
-            f"📥 [How to submit]({REPO_URL}#submitting-to-the-leaderboard)"
         )
         with gr.Tabs():
             with gr.Tab("🏆 Leaderboard"):
-                with gr.Row():
-                    split_filter = gr.Radio(
-                        choices=["all", "test", "full", "dev"],
-                        value="all",
-                        label="Split",
-                    )
-                    scaffold_filter = gr.CheckboxGroup(
-                        choices=all_scaffolds,
-                        value=all_scaffolds,
-                        label="Scaffold",
-                    )
-                    min_seeds = gr.Slider(
-                        minimum=1, maximum=5, value=1, step=1, label="Min seeds",
-                    )
-                table = gr.Dataframe(
-                    value=df_full,
                     interactive=False,
                     wrap=True,
-                    datatype=["str"] * len(df_full.columns) if not df_full.empty else None,
                 )
-                for ctrl in (split_filter, scaffold_filter, min_seeds):
-                    ctrl.change(
-                        fn=lambda sp, sc, ms: filter_df(df_full, sp, sc, ms),
-                        inputs=[split_filter, scaffold_filter, min_seeds],
-                        outputs=table,
                     )
-            with gr.Tab("📖 About"):
-                gr.Markdown(f"""
-## About DeepSynth
-DeepSynth evaluates LLM agents on **multi-step web information synthesis** —
-tasks where the correct answer requires combining many pieces of evidence
-retrieved across the open web, not single-hop lookup.
-**Scoring:** submissions are evaluated with three metrics — Exact Match (EM),
-F1 on key-value pairs, and LLM-Judge with small numerical tolerance. The
-leaderboard sorts by LLM-Judge by default; click any column header to re-sort.
-**Efficiency columns** (cost, latency, tool calls) are optional and populated
-when submitters provide per-task instrumentation.
-                """)
-            with gr.Tab("📤 Submit"):
-                gr.Markdown(f"""
-## How to submit
-1. Run your agent on the DeepSynth **test set** (120 tasks, questions-only
-   file on the HF dataset).
-2. Produce a submission JSON conforming to [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json).
-3. Validate locally: `python scripts/evaluation/validate_submission.py my_submission.json`.
-4. Open a PR to the [leaderboard repo]({REPO_URL}) adding your file under
-   `submissions/YYYY-MM-DD-agent-name.json`.
-5. CI will run schema validation and score computation; on merge, your row
-   appears here automatically.
-**Required:** a public code URL that reproduces your results. We may ask for
-a run trace for spot-check verification.
-                """)
             with gr.Tab("📜 Citation"):
-                gr.Markdown("""
-```bibtex
-@inproceedings{deepsynth2026,
-  title     = {A Benchmark for Deep Information Synthesis},
-  author    = {{Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos}}},
-  booktitle = {International Conference on Learning Representations (ICLR)},
-  year      = {2026}
-}
-```
-""")
     return app
 if __name__ == "__main__":
-    build_app().launch()

 """
 DeepSynth Leaderboard — Hugging Face Space (Gradio)
+GAIA-style leaderboard with tabs, custom typography, and a submission upload form.
 """
 from __future__ import annotations
+import datetime
 import json
 import os
+import re
 from pathlib import Path
 from typing import Any
 # ---------------------------------------------------------------------------
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
+QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
+QUEUE_DIR.mkdir(exist_ok=True, parents=True)
 TITLE = "🐙 DeepSynth Leaderboard"
+TAGLINE = "A Benchmark for Deep Information Synthesis · ICLR 2026"
+ABOUT_BLURB = (
+    "Large language model (LLM)-based agents are increasingly used to solve complex tasks "
+    "involving tool use — web browsing, code execution, data analysis. Current benchmarks "
+    "do not adequately assess their ability to solve real-world tasks requiring synthesis "
+    "across multiple sources and inference beyond simple fact retrieval.\n\n"
+    "**DeepSynth** introduces 120 tasks across 7 domains and 67 countries, designed to evaluate "
+    "agents on realistic, time-consuming problems that combine information gathering, synthesis, "
+    "and structured reasoning."
 )
 REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
 PAPER_URL = "https://arxiv.org/abs/2602.21143"
 DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
+# ---------------------------------------------------------------------------
+# Styling — GAIA-inspired: mono/sans headers, compact table, subtle colors
+# ---------------------------------------------------------------------------
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
+.gradio-container {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+    max-width: 1400px !important;
+    margin: 0 auto !important;
+}
+.gradio-container h1, .gradio-container h2, .gradio-container h3 {
+    font-family: 'Inter', sans-serif !important;
+    font-weight: 700 !important;
+    letter-spacing: -0.02em !important;
+}
+.gradio-container h1 { font-size: 2.2rem !important; margin-bottom: 0.25rem !important; }
+.gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
+.gradio-container code, .gradio-container pre {
+    font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
+    font-size: 0.9em !important;
+}
+/* Tagline under title */
+.tagline {
+    color: #6b7280;
+    font-size: 1rem;
+    margin-bottom: 1.5rem;
+    font-weight: 500;
+}
+/* Leaderboard table — tighter, more readable */
+.gradio-container .table-wrap table {
+    font-family: 'Inter', sans-serif !important;
+    font-size: 0.92rem !important;
+}
+.gradio-container .table-wrap th {
+    font-weight: 600 !important;
+    background: #f9fafb !important;
+    border-bottom: 2px solid #e5e7eb !important;
+    text-align: left !important;
+}
+.gradio-container .table-wrap td {
+    padding: 0.55rem 0.75rem !important;
+}
+/* Section headers (like GAIA's "Test set" / "Validation set") */
+.section-header {
+    font-size: 1.1rem;
+    font-weight: 700;
+    margin: 1rem 0 0.5rem 0;
+    padding-bottom: 0.4rem;
+    border-bottom: 1px solid #e5e7eb;
+}
+/* Badge-style links under the title */
+.link-row a {
+    display: inline-block;
+    padding: 0.25rem 0.75rem;
+    margin-right: 0.5rem;
+    border-radius: 6px;
+    background: #f3f4f6;
+    color: #374151 !important;
+    text-decoration: none !important;
+    font-size: 0.9rem;
+    font-weight: 500;
+}
+.link-row a:hover { background: #e5e7eb; }
+/* Tabs — cleaner look */
+.gradio-container button.tab-nav {
+    font-weight: 600 !important;
+    font-size: 1rem !important;
+}
+"""
+# ---------------------------------------------------------------------------
+# Paper baselines — pre-populated from Table 1 of the DeepSynth paper
+# ---------------------------------------------------------------------------
+PAPER_BASELINES: list[dict[str, Any]] = [
+    # LLM Baselines (no agent scaffold, no tool use)
+    {"category": "LLM Baseline", "agent": "o4-mini",               "model": "o4-mini (2025-08)",         "access": "closed", "scaffold": "none", "f1": 3.05, "precision": 2.33, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
+    {"category": "LLM Baseline", "agent": "GPT-4.1",               "model": "gpt-4.1 (2025-08)",         "access": "closed", "scaffold": "none", "f1": 3.46, "precision": 2.86, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
+    {"category": "LLM Baseline", "agent": "o3",                    "model": "o3 (2025-08)",              "access": "closed", "scaffold": "none", "f1": 3.29, "precision": 2.85, "recall": 3.90, "em": 0.00, "llm_judge": 0.00},
+    {"category": "LLM Baseline", "agent": "GPT-5.1",               "model": "gpt-5.1 (2025-08)",         "access": "closed", "scaffold": "none", "f1": 3.83, "precision": 2.98, "recall": 5.37, "em": 0.00, "llm_judge": 0.00},
+    {"category": "LLM Baseline", "agent": "Gemini-Pro-2.5",        "model": "gemini-pro-2.5 (2025-08)",  "access": "closed", "scaffold": "none", "f1": 6.25, "precision": 4.71, "recall": 9.27, "em": 0.00, "llm_judge": 5.00},
+    {"category": "LLM Baseline", "agent": "GPT-5.2-Pro",           "model": "gpt-5.2-pro (2026-02)",     "access": "closed", "scaffold": "none", "f1": 8.70, "precision": 8.45, "recall": 8.96, "em": 6.25, "llm_judge": 6.67},
+    {"category": "LLM Baseline", "agent": "DeepSeek-R1-Chat",      "model": "deepseek-r1-chat (2025-08)","access": "open",   "scaffold": "none", "f1": 3.23, "precision": 2.75, "recall": 3.90, "em": 1.67, "llm_judge": 2.50},
+    {"category": "LLM Baseline", "agent": "DeepSeek-R1-Reasoner",  "model": "deepseek-r1 (2026-02)",     "access": "open",   "scaffold": "none", "f1": 2.80, "precision": 2.73, "recall": 2.87, "em": 2.50, "llm_judge": 6.67},
+    # Framework-based Agents
+    {"category": "Agent Framework", "agent": "o3-deep-research",   "model": "o3-deep-research (2025-08)","access": "closed", "scaffold": "Custom",    "f1": 8.97, "precision": 7.73, "recall": 10.69, "em": 2.50, "llm_judge": 17.50},
+    {"category": "Agent Framework", "agent": "Smolagent",          "model": "gpt-4.1",                    "access": "open",   "scaffold": "CodeAct",   "f1": 3.75, "precision": 3.27, "recall": 4.39,  "em": 2.50, "llm_judge": 7.50},
+    {"category": "Agent Framework", "agent": "Smolagent",          "model": "gpt-5",                      "access": "open",   "scaffold": "CodeAct",   "f1": 6.42, "precision": 6.34, "recall": 6.50,  "em": 1.67, "llm_judge": 2.50},
+    {"category": "Agent Framework", "agent": "OWL",                "model": "gpt-4.1",                    "access": "open",   "scaffold": "Custom",    "f1": 5.41, "precision": 4.62, "recall": 6.52,  "em": 1.67, "llm_judge": 12.50},
 ]
def baselines_dataframe(
    baselines: "list[dict[str, Any]] | None" = None,
) -> pd.DataFrame:
    """Build the paper-baselines DataFrame in display order.

    Args:
        baselines: Baseline records carrying the keys ``category``, ``agent``,
            ``model``, ``access``, ``scaffold``, ``f1``, ``precision``,
            ``recall``, ``em`` and ``llm_judge``. Defaults to the module-level
            ``PAPER_BASELINES`` table.

    Returns:
        One row per record, grouped by category ("LLM Baseline" first, then
        "Agent Framework", then any other category) and ordered by descending
        LLM Judge, then descending F1, inside each group. An empty input
        yields an empty frame that still carries the display columns.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    columns = [
        "Category", "Agent", "Model", "Access", "Scaffold",
        "F1", "Precision", "Recall", "EM", "LLM Judge",
    ]
    source = PAPER_BASELINES if baselines is None else baselines

    rows = [
        {
            "Category":  b["category"],
            "Agent":     b["agent"],
            "Model":     b["model"],
            # Closed-weights models get a locked padlock, everything else an open one.
            "Access":    f"{'🔒' if b['access'] == 'closed' else '🔓'} {b['access']}",
            "Scaffold":  b["scaffold"],
            "F1":        b["f1"],
            "Precision": b["precision"],
            "Recall":    b["recall"],
            "EM":        b["em"],
            "LLM Judge": b["llm_judge"],
        }
        for b in source
    ]
    if not rows:
        # Keep the schema so the table widget still gets its ten columns.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows, columns=columns)

    # Sort by LLM Judge within each category, preserving LLM Baseline first.
    # Categories missing from the rank table are placed after the known ones
    # explicitly instead of relying on how NaN sort keys happen to be ordered.
    category_rank = {"LLM Baseline": 0, "Agent Framework": 1}
    df["__cat_order"] = df["Category"].map(category_rank).fillna(len(category_rank))
    df = df.sort_values(
        by=["__cat_order", "LLM Judge", "F1"],
        ascending=[True, False, False],
    )
    return df.drop(columns=["__cat_order"]).reset_index(drop=True)
 # ---------------------------------------------------------------------------
+# Community submissions — loaded from the submissions/ directory
 # ---------------------------------------------------------------------------
 def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
     if not results_dir.exists():
         return []
     rows = []
     return rows
+def community_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
     if not submissions:
+        return pd.DataFrame(columns=[
+            "Agent", "Model", "Scaffold", "Split", "F1", "EM", "LLM Judge",
+            "Avg Cost ($)", "Avg Latency (s)", "Org", "Date", "Links"
+        ])
     records = []
     for s in submissions:
         scores = s.get("scores", {})
         efficiency = s.get("efficiency", {})
         paper = meta.get("paper_url")
         code = meta.get("code_url")
+        links = " · ".join(filter(None, [
+            f"[paper]({paper})" if paper else None,
+            f"[code]({code})" if code else None,
+        ]))
+        records.append({
+            "Agent":           meta.get("agent_name", "—"),
+            "Model":           meta.get("base_model", "—"),
+            "Scaffold":        meta.get("scaffold", "—"),
+            "Split":           meta.get("split", "—"),
+            "F1":              scores.get("overall", {}).get("f1"),
+            "EM":              scores.get("overall", {}).get("exact_match"),
+            "LLM Judge":       scores.get("overall", {}).get("llm_judge"),
+            "Avg Cost ($)":   efficiency.get("avg_cost_usd"),
+            "Avg Latency (s)": efficiency.get("avg_latency_s"),
+            "Org":             meta.get("organization", "—"),
+            "Date":            meta.get("submission_date", "—"),
+            "Links":           links,
+        })
     df = pd.DataFrame(records)
+    if "LLM Judge" in df.columns and not df["LLM Judge"].isna().all():
         df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
     return df
 # ---------------------------------------------------------------------------
+# Submission upload handler — saves file to queue for maintainer review
 # ---------------------------------------------------------------------------
+SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._-]+")
+def _safe_slug(text: str, maxlen: int = 40) -> str:
+    slug = SAFE_NAME_RE.sub("-", (text or "unnamed").strip()).strip("-").lower()
+    return slug[:maxlen] or "unnamed"
def _write_queued_bundle(stem: str, payload: str) -> str:
    """Write *payload* into ``QUEUE_DIR`` without overwriting; return the file name used."""
    QUEUE_DIR.mkdir(parents=True, exist_ok=True)
    attempt = 1
    while True:
        fname = f"{stem}.json" if attempt == 1 else f"{stem}-{attempt}.json"
        try:
            # Mode "x" fails if the file exists, so an earlier submission from
            # the same org/agent on the same day is never clobbered.
            with (QUEUE_DIR / fname).open("x", encoding="utf-8") as f:
                f.write(payload)
        except FileExistsError:
            attempt += 1
            continue
        return fname


def submit_predictions(
    file_obj,
    agent_name: str,
    base_model: str,
    scaffold: str,
    organization: str,
    contact_email: str,
    code_url: str,
    split: str,
) -> str:
    """Validate basic fields, save the uploaded predictions file to the review queue.

    Args:
        file_obj: The uploaded file: either an object exposing a ``name``
            attribute holding the path, or the path itself. ``None`` means no
            file was attached.
        agent_name: Display name of the agent (required).
        base_model: Underlying model identifier (required).
        scaffold: Scaffold choice, stored as given.
        organization: Submitting organization (required).
        contact_email: Contact address; must contain ``@``.
        code_url: Code URL; must start with ``http://`` or ``https://``.
        split: Evaluated split, stored as given.

    Returns:
        A Markdown status message: a line starting with "❌" describing the
        first problem found, or a "✅" confirmation naming the queued file.
    """
    # Text inputs may arrive as None; normalise once so every check and the
    # stored metadata see the same stripped value.
    agent_name = (agent_name or "").strip()
    base_model = (base_model or "").strip()
    organization = (organization or "").strip()
    contact_email = (contact_email or "").strip()
    code_url = (code_url or "").strip()

    if file_obj is None:
        return "❌ **Missing file.** Please attach a predictions JSON."
    if not agent_name:
        return "❌ **Missing agent name.**"
    if not base_model:
        return "❌ **Missing base model.**"
    if not organization:
        return "❌ **Missing organization.**"
    if not contact_email or "@" not in contact_email:
        return "❌ **Invalid contact email.**"
    # An empty string fails startswith() too, so this covers "missing" as well.
    if not code_url.startswith(("http://", "https://")):
        return "❌ **Missing or invalid code URL.** A public `code_url` that reproduces your results is required."

    # Try to parse the uploaded file as JSON to catch corruption early
    try:
        src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
        with src_path.open("r", encoding="utf-8") as f:
            predictions = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A non-UTF-8 upload raises UnicodeDecodeError, not JSONDecodeError.
        return f"❌ **Invalid JSON in uploaded file:** {e}"
    except OSError as e:
        return f"❌ **Could not read uploaded file:** {e}"
    if not isinstance(predictions, dict) or not predictions:
        return "❌ **Predictions file must be a non-empty JSON object mapping task IDs to answers.**"

    # One aware UTC clock reading feeds both the timestamp and the date, so
    # the two can never disagree around midnight or across time zones.
    now = datetime.datetime.now(datetime.timezone.utc)
    today = now.date().isoformat()

    # Build a bundle with metadata + raw predictions for the maintainer to review
    bundle = {
        "received_at": now.replace(tzinfo=None).isoformat() + "Z",
        "metadata": {
            "agent_name":     agent_name,
            "base_model":     base_model,
            "scaffold":       scaffold,
            "organization":   organization,
            "contact_email":  contact_email,
            "code_url":       code_url,
            "split":          split,
            "submission_date": today,
        },
        "predictions": predictions,
    }
    stem = f"{today}-{_safe_slug(organization)}-{_safe_slug(agent_name)}"
    try:
        fname = _write_queued_bundle(stem, json.dumps(bundle, indent=2, ensure_ascii=False))
    except OSError as e:
        return f"❌ **Could not save submission:** {e}"

    return (
        f"✅ **Submission received.** Your file has been queued for review as `{fname}`.\n\n"
        f"A maintainer will score it against the private test-set answers and merge it to the "
        f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
        f"reproducibility via your `code_url`.\n\n"
        f"**Note:** submissions in this Space's queue are held temporarily — for a permanent "
        f"record, please also open a PR to the "
        f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
    )
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
def build_app() -> gr.Blocks:
    """Assemble the leaderboard UI and return it without launching.

    The baseline and community tables are computed once, when this function
    runs, and handed to the table widgets as static values. The layout is a
    title, tagline and link row followed by four tabs: Leaderboard, Submit,
    About and Citation.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    df_baselines = baselines_dataframe()
    df_community = community_dataframe(load_submissions(RESULTS_DIR))

    with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
        gr.Markdown(f"# {TITLE}")
        gr.HTML(f"<div class='tagline'>{TAGLINE}</div>")
        gr.HTML(
            "<div class='link-row'>"
            f"<a href='{PAPER_URL}' target='_blank'>📄 Paper</a>"
            f"<a href='{REPO_URL}' target='_blank'>💻 Code</a>"
            f"<a href='{DATASET_URL}' target='_blank'>🤗 Dataset</a>"
            f"<a href='{REPO_URL}#submitting-to-the-leaderboard' target='_blank'>📥 How to submit</a>"
            "</div>"
        )

        with gr.Tabs():
            # -------------------------------------------------------------
            with gr.Tab("🏆 Leaderboard"):
                gr.Markdown(
                    "Results on the **full DeepSynth test set** (120 tasks, Pass@1). "
                    "F1 / Precision / Recall measure prediction quality against gold answers; "
                    "LLM Judge reports average precision under semantic matching.",
                    elem_classes=["section-header"],
                )

                gr.Markdown("### Paper baselines")
                gr.Dataframe(
                    value=df_baselines,
                    interactive=False,
                    wrap=True,
                    # Five text columns followed by the five score columns.
                    datatype=["str", "str", "str", "str", "str",
                              "number", "number", "number", "number", "number"],
                )

                # The community section is omitted entirely while there are no rows.
                if not df_community.empty:
                    gr.Markdown("### Community submissions")
                    gr.Dataframe(
                        value=df_community,
                        interactive=False,
                        wrap=True,
                        # "Links" holds Markdown such as "[paper](…) · [code](…)";
                        # without an explicit markdown datatype the cell shows the
                        # raw brackets instead of clickable links.
                        datatype=[
                            "markdown" if col == "Links" else "str"
                            for col in df_community.columns
                        ],
                    )

            # -------------------------------------------------------------
            with gr.Tab("📤 Submit"):
                gr.Markdown("## Submit your agent's predictions")
                gr.Markdown(
                    "Upload a JSON file of predictions on the DeepSynth **test set**. "
                    "We'll score it against the private gold answers and add your row to the "
                    "community leaderboard.\n\n"
                    f"**Format:** a JSON object mapping task IDs (`\"001\"` … `\"120\"`) to your "
                    "agent's answer. See "
                    f"[`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json) "
                    "for the full spec."
                )

                with gr.Row():
                    with gr.Column():
                        agent_name_in   = gr.Textbox(label="Agent name", placeholder="e.g. ReAct-GPT5")
                        base_model_in   = gr.Textbox(label="Base model", placeholder="e.g. gpt-5.2-pro (2026-02)")
                        scaffold_in     = gr.Dropdown(
                            choices=["none", "ReAct", "CodeAct", "Plan-and-Execute", "Reflexion", "MCTS", "Custom"],
                            label="Scaffold",
                            value="ReAct",
                        )
                        split_in        = gr.Dropdown(
                            choices=["test", "full"],
                            label="Split evaluated",
                            value="test",
                        )
                    with gr.Column():
                        organization_in = gr.Textbox(label="Organization", placeholder="e.g. MSR India")
                        contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
                        code_url_in     = gr.Textbox(
                            label="Code URL (required)",
                            placeholder="https://github.com/you/your-agent",
                        )

                predictions_in = gr.File(
                    label="Predictions JSON file",
                    file_types=[".json"],
                )
                submit_btn = gr.Button("Submit for review", variant="primary")
                submit_status = gr.Markdown()

                # The input order must match submit_predictions' positional parameters.
                submit_btn.click(
                    fn=submit_predictions,
                    inputs=[
                        predictions_in, agent_name_in, base_model_in, scaffold_in,
                        organization_in, contact_email_in, code_url_in, split_in,
                    ],
                    outputs=submit_status,
                )

                gr.Markdown(
                    "---\n"
                    "**What happens next?** Submissions are queued for maintainer review. "
                    "We verify metadata honesty and spot-check reproducibility via your "
                    "`code_url` before computing scores and merging to the leaderboard.\n\n"
                    f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
                    "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
                )

            # -------------------------------------------------------------
            with gr.Tab("📖 About"):
                gr.Markdown(ABOUT_BLURB)
                gr.Markdown(
                    "## Metrics\n"
                    "- **F1 / Precision / Recall** — token-level overlap between predicted "
                    "and gold answers, averaged over all tasks.\n"
                    "- **Exact Match (EM)** — fraction of tasks where the predicted answer "
                    "exactly equals the gold answer (strict).\n"
                    "- **LLM Judge** — semantic-equivalence scoring with small numerical "
                    "tolerance (1–5.5%), evaluated by a strong frozen judge model.\n\n"
                    "## Dataset\n"
                    f"DeepSynth is hosted on 🤗 [`DeepSynthesisTeam/deepsynth-bench`]({DATASET_URL}). "
                    "The dev set (40 tasks) ships with gold answers for prototyping; the test "
                    "set (120 tasks) is released questions-only to prevent contamination."
                )

            # -------------------------------------------------------------
            with gr.Tab("📜 Citation"):
                gr.Markdown("### Please cite:")
                gr.Code(
                    value=(
                        "@inproceedings{deepsynth2026,\n"
                        "  title     = {A Benchmark for Deep Information Synthesis},\n"
                        "  author    = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald\n"
                        "               and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim\n"
                        "               and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex\n"
                        "               and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru\n"
                        "               and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},\n"
                        "  booktitle = {International Conference on Learning Representations (ICLR)},\n"
                        "  year      = {2026},\n"
                        "  url       = {" + PAPER_URL + "}\n"
                        "}"
                    ),
                    language="latex",
                )

    return app
 if __name__ == "__main__":
+    build_app().launch()