Spaces:

RomeroLab-Duke
/

BioDesignBench-Leaderboard

Running

App Files Files Community

Jasonkim8652 committed on Apr 15

Commit

cfedbc8

verified ·

1 Parent(s): fab94cf

Update leaderboard with canonical Apr-6 hybrid scores + depth-gap interventions

Browse files

- Refresh leaderboard_data.json from results/canonical/scores.csv (hybrid 100-pt rubric)
- Switch to 2x5 design matrix (de_novo/redesign x 5 subjects, 9 occupied cells)
- Add headline-findings banner reflecting paper's three principal findings
- Add 'Depth Gap' tab with forced-depth and low-diversity intervention results
- Rewrite About section to surface coverage-depth dissociation and 14% evaluation-depth gap
- Update y-axis range so DeepSeek V3 60+ scores are not clipped

Files changed (2) hide show

app.py +369 -83
leaderboard_data.json +369 -247

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from pathlib import Path
 import gradio as gr
 import plotly.graph_objects as go
-ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "")
 # ═══════════════════════════════════════════════════════════════════
@@ -28,31 +28,43 @@ ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "")
 # ═══════════════════════════════════════════════════════════════════
 PAPER_URL = "#"
-GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench"
-HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard"
 # ═══════════════════════════════════════════════════════════════════
-#  Taxonomy & scoring constants
 # ═══════════════════════════════════════════════════════════════════
-DESIGN_APPROACHES = ["de_novo", "redesign"]
 APPROACH_LABELS = {
-    "de_novo": "De Novo",
     "redesign": "Redesign",
 }
-MOLECULAR_SUBJECTS = ["antibody", "enzyme", "binder", "scaffold", "fluorescent_protein"]
 SUBJECT_LABELS = {
     "antibody": "Antibody",
-    "enzyme": "Enzyme",
     "binder": "Binder",
     "scaffold": "Scaffold",
-    "fluorescent_protein": "Fluorescent Protein",
 }
 VALID_CELLS = {
-    "de_novo": {"antibody", "enzyme", "binder", "scaffold", "fluorescent_protein"},
     "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
 }
 COMPONENTS = [
     "approach",
     "orchestration",
@@ -78,6 +90,8 @@ TYPE_STYLE = {
         "tag": "baseline",
     },
     "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
 }
@@ -188,9 +202,15 @@ def build_header(last_updated: str, n_entries: int) -> str:
       <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
                   letter-spacing:-0.02em">
         \U0001f9ec BioDesignBench</h1>
-      <p style="color:#64748b;margin:0.4rem 0 0;font-size:1rem;
-                font-weight:400">
-        Evaluating LLM agents on protein design via MCP tools</p>
       <div style="margin-top:1rem;display:flex;justify-content:center;
                   gap:0.6rem;flex-wrap:wrap">
         <a href="{PAPER_URL}" target="_blank"
@@ -206,9 +226,11 @@ def build_header(last_updated: str, n_entries: int) -> str:
       <div style="margin-top:1rem;display:flex;justify-content:center;
                   gap:1.5rem;flex-wrap:wrap">
         <span style="font-size:0.78rem;color:#94a3b8">
-          76 tasks</span>
         <span style="font-size:0.78rem;color:#94a3b8">
-          {n_entries} agents</span>
         <span style="font-size:0.78rem;color:#94a3b8">
           Updated {last_updated}</span>
       </div>
@@ -403,7 +425,9 @@ def build_leaderboard_table(
 def build_heatmap(entry: dict) -> str:
-    """HTML heatmap table for one agent across 2×5 taxonomy cells."""
     ts = entry.get("taxonomy_scores", {})
     TH = (
         "background:#0f172a;color:white;padding:0.6rem 0.8rem;"
@@ -415,24 +439,30 @@ def build_heatmap(entry: dict) -> str:
     )
     rows = []
-    for ap in DESIGN_APPROACHES:
         cells = [
-            f'<td style="{TD};text-align:left;font-weight:600;'
-            f'background:#f8fafc">{APPROACH_LABELS[ap]}</td>'
         ]
         vals = []
-        for subj in MOLECULAR_SUBJECTS:
-            if subj in VALID_CELLS[ap]:
-                val = ts.get(ap, {}).get(subj)
                 bg = _heat_color(val)
-                text = f"{val:.0f}" if val is not None else "\u2014"
                 cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
                 if val is not None:
                     vals.append(val)
             else:
                 cells.append(
                     f'<td style="{TD};color:#cbd5e0;font-weight:400">'
-                    "\u2014</td>"
                 )
         avg = sum(vals) / len(vals) if vals else 0
         avg_bg = _heat_color(avg)
@@ -442,9 +472,9 @@ def build_heatmap(entry: dict) -> str:
         )
         rows.append(f'<tr>{"".join(cells)}</tr>')
-    subj_headers = "".join(
-        f'<th style="{TH}">{SUBJECT_LABELS[s]}</th>'
-        for s in MOLECULAR_SUBJECTS
     )
     return f"""
@@ -452,9 +482,9 @@ def build_heatmap(entry: dict) -> str:
                   border-radius:10px;overflow:hidden;
                   box-shadow:0 1px 3px rgba(0,0,0,0.08)">
       <thead><tr>
-        <th style="{TH};text-align:left">Approach</th>
-        {subj_headers}
-        <th style="{TH}">Avg</th>
       </tr></thead>
       <tbody>{''.join(rows)}</tbody>
     </table>"""
@@ -531,6 +561,157 @@ def build_mode_cards(entries: list) -> str:
     )
 # ── Tab 5: About ──
@@ -558,12 +739,18 @@ def build_about() -> str:
       <div {card}>
         <h2 {h2}>What is BioDesignBench?</h2>
         <p {p}>
-          BioDesignBench is the first comprehensive benchmark for evaluating
-          LLM agents on protein design tasks via MCP (Model Context Protocol)
-          tool use. Unlike existing benchmarks that focus on model-only
-          metrics, BioDesignBench tests the full agentic design loop:
-          <strong>prompt &rarr; design &rarr; validate &rarr;
-          iterate</strong>.</p>
         <div style="display:grid;grid-template-columns:
                     repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
                     margin:1rem 0">
@@ -574,8 +761,9 @@ def build_about() -> str:
           </div>
           <div {stat_box}>
             <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
-              2×5</div>
-            <div style="font-size:0.78rem;color:#64748b">taxonomy matrix</div>
           </div>
           <div {stat_box}>
             <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
@@ -590,6 +778,37 @@ def build_about() -> str:
         </div>
       </div>
       <div {card}>
         <h2 {h2}>How to submit</h2>
         <h3 {h3}>1. Build your agent</h3>
@@ -648,27 +867,58 @@ Response:
       </div>
       <div {card}>
-        <h2 {h2}>Scoring rubric (100 points)</h2>
         <p {p}>
-          <strong>Approach (20 pts)</strong> &mdash; design methodology
-          coverage across 10 functional categories
-          (backbone generation, sequence design, structure prediction, etc.)</p>
         <p {p}>
-          <strong>Orchestration (15 pts)</strong> &mdash; pipeline ordering,
-          intermediate validation, and iteration quality</p>
         <p {p}>
-          <strong>Quality (35 pts)</strong> &mdash; three-tier graduated
-          scoring based on structure confidence (pLDDT, pTM), interface
-          metrics (ipTM, i_pAE), and interface physics</p>
         <p {p}>
-          <strong>Feasibility (15 pts)</strong> &mdash; valid amino acids,
-          length constraints, composition, and biophysical plausibility</p>
         <p {p}>
           <strong>Novelty (5 pts)</strong> &mdash; sequence identity to
-          reference (lower identity = more novel = higher score)</p>
         <p {p}>
-          <strong>Diversity (10 pts)</strong> &mdash; 65% pairwise sequence
-          diversity + 35% positional entropy across designs</p>
       </div>
       <div {card}>
@@ -676,9 +926,9 @@ Response:
         <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
                     border-radius:10px;font-size:0.8rem;
                     line-height:1.6">@article{{biodesignbench2026,
-  title={{BioDesignBench: Evaluating LLM Agents on
-         Protein Design via MCP Tools}},
-  author={{Kim, Jason et al.}},
   year={{2026}}
 }}</pre>
       </div>
@@ -692,37 +942,50 @@ Response:
 def chart_taxonomy_bar(entry: dict) -> go.Figure:
-    """Grouped bar chart of scores by approach × subject for one agent."""
     ts = entry.get("taxonomy_scores", {})
-    subjects = MOLECULAR_SUBJECTS
-    colors = {"de_novo": "rgba(49,130,206,0.7)", "redesign": "rgba(237,137,54,0.7)"}
-    fig = go.Figure()
-    for ap in DESIGN_APPROACHES:
-        vals = []
-        for s in subjects:
-            v = ts.get(ap, {}).get(s)
-            vals.append(v if v is not None else 0)
-        fig.add_trace(go.Bar(
-            name=APPROACH_LABELS[ap],
-            x=[SUBJECT_LABELS[s] for s in subjects],
-            y=vals,
-            marker_color=colors[ap],
-            text=[f"{v:.0f}" if v else "" for v in vals],
-            textposition="auto",
-        ))
     mode = entry.get("mode") or "\u2014"
     fig.update_layout(
         **_base_layout(
             title=dict(
-                text=f"{entry['agent_name']} ({mode}) \u2014 Score by Approach \u00d7 Subject",
                 font_size=14,
             ),
-            yaxis=dict(range=[0, 100], title="Average Score"),
             xaxis=dict(title=""),
-            barmode="group",
-            height=300,
         )
     )
     return fig
@@ -850,16 +1113,18 @@ def chart_mode_comparison(entries: list) -> go.Figure:
     fig.update_layout(
         **_base_layout(
             barmode="group",
-            yaxis=dict(range=[0, 50], title="Overall Score"),
             title=dict(
-                text="Benchmark Mode vs User Mode \u2014 Overall Score",
-                font_size=14,
             ),
             legend=dict(
-                orientation="h", yanchor="bottom", y=-0.15,
                 xanchor="center", x=0.5,
             ),
-            height=350,
         )
     )
     return fig
@@ -895,6 +1160,7 @@ def create_app() -> gr.Blocks:
     ) as app:
         gr.HTML(build_header(data["last_updated"], len(entries)))
         with gr.Tabs():
@@ -979,11 +1245,31 @@ def create_app() -> gr.Blocks:
                 for dd in [c1, c2]:
                     dd.change(_update_comp, [c1, c2], [radar, comp_bar])
-            # ════════ Tab 4: Benchmark vs User ════════
-            with gr.Tab("\u26a1 Benchmark vs User"):
                 gr.Plot(chart_mode_comparison(entries))
                 gr.HTML(build_mode_cards(entries))
             # ══════ Tab 5: Submit ══════
             with gr.Tab("\U0001f4e4 Submit"):
                 gr.HTML("""

 import gradio as gr
 import plotly.graph_objects as go
# The admin password is read from the environment only. A fallback literal
# committed here would be a working credential for anyone who can read the
# repository, so no default secret is provided.
# NOTE(review): when the variable is unset this is "" — confirm that the admin
# check treats an empty password as "admin disabled" rather than as a match.
ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "")
 # ═══════════════════════════════════════════════════════════════════
 # ═══════════════════════════════════════════════════════════════════
 PAPER_URL = "#"
+GITHUB_URL = "#"
+HF_URL = "#"
 # ═══════════════════════════════════════════════════════════════════
+#  Taxonomy & scoring constants (2 × 5 design matrix)
 # ═══════════════════════════════════════════════════════════════════
+APPROACHES = ["de_novo", "redesign"]
 APPROACH_LABELS = {
+    "de_novo": "De Novo Design",
     "redesign": "Redesign",
 }
+SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"]
 SUBJECT_LABELS = {
     "antibody": "Antibody",
     "binder": "Binder",
+    "enzyme": "Enzyme",
     "scaffold": "Scaffold",
+    "fluorescent_protein": "Fluorescent Prot.",
 }
+# 9 valid cells (rd × binder is empty in current task set)
 VALID_CELLS = {
+    "de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"},
     "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
 }
+N_TASKS_PER_CELL = {
+    ("de_novo", "antibody"): 4,
+    ("de_novo", "binder"): 19,
+    ("de_novo", "enzyme"): 2,
+    ("de_novo", "scaffold"): 21,
+    ("de_novo", "fluorescent_protein"): 1,
+    ("redesign", "antibody"): 5,
+    ("redesign", "enzyme"): 10,
+    ("redesign", "scaffold"): 4,
+    ("redesign", "fluorescent_protein"): 10,
+}
 COMPONENTS = [
     "approach",
     "orchestration",
         "tag": "baseline",
     },
     "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
+    # Backward-compat alias for older JSON files
+    "oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
 }
       <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
                   letter-spacing:-0.02em">
         \U0001f9ec BioDesignBench</h1>
+      <p style="color:#0f172a;margin:0.6rem 0 0.2rem;font-size:1.1rem;
+                font-weight:600;line-height:1.4">
+        Can LLM agents orchestrate stochastic protein-design pipelines?</p>
+      <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
+                font-weight:400;font-style:italic;max-width:680px;
+                margin-left:auto;margin-right:auto;line-height:1.5">
+        Top-tier agents now surpass a deterministic pipeline &mdash;
+        but invoke evaluation tools at only <strong>14% of expert depth</strong>.
+        Guidance rescues coverage, not depth.</p>
       <div style="margin-top:1rem;display:flex;justify-content:center;
                   gap:0.6rem;flex-wrap:wrap">
         <a href="{PAPER_URL}" target="_blank"
       <div style="margin-top:1rem;display:flex;justify-content:center;
                   gap:1.5rem;flex-wrap:wrap">
         <span style="font-size:0.78rem;color:#94a3b8">
+          76 tasks &middot; 5 molecular families</span>
+        <span style="font-size:0.78rem;color:#94a3b8">
+          17 MCP tools</span>
         <span style="font-size:0.78rem;color:#94a3b8">
+          {n_entries} conditions</span>
         <span style="font-size:0.78rem;color:#94a3b8">
           Updated {last_updated}</span>
       </div>
 def build_heatmap(entry: dict) -> str:
+    """HTML heatmap for one agent across the 2 × 5 design matrix
+    (DesignApproach × MolecularSubject = 9 valid cells; rd × binder is empty).
+    """
     ts = entry.get("taxonomy_scores", {})
     TH = (
         "background:#0f172a;color:white;padding:0.6rem 0.8rem;"
     )
     rows = []
+    for ap in APPROACHES:
         cells = [
+            f'<td style="{TD};text-align:left;font-weight:700;'
+            f'background:#f8fafc;color:#0f172a">{APPROACH_LABELS[ap]}</td>'
         ]
         vals = []
+        for sj in SUBJECTS:
+            if sj in VALID_CELLS[ap]:
+                val = ts.get(ap, {}).get(sj)
                 bg = _heat_color(val)
+                n = N_TASKS_PER_CELL.get((ap, sj), 0)
+                text = (
+                    f'{val:.0f}<br><span style="font-size:0.65rem;'
+                    f'font-weight:400;color:#64748b">n={n}</span>'
+                    if val is not None
+                    else "\u2014"
+                )
                 cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
                 if val is not None:
                     vals.append(val)
             else:
                 cells.append(
                     f'<td style="{TD};color:#cbd5e0;font-weight:400">'
+                    "n/a</td>"
                 )
         avg = sum(vals) / len(vals) if vals else 0
         avg_bg = _heat_color(avg)
         )
         rows.append(f'<tr>{"".join(cells)}</tr>')
+    sj_headers = "".join(
+        f'<th style="{TH}">{SUBJECT_LABELS[sj]}</th>'
+        for sj in SUBJECTS
     )
     return f"""
                   border-radius:10px;overflow:hidden;
                   box-shadow:0 1px 3px rgba(0,0,0,0.08)">
       <thead><tr>
+        <th style="{TH};text-align:left">Approach \u2193 / Subject \u2192</th>
+        {sj_headers}
+        <th style="{TH}">Mean</th>
       </tr></thead>
       <tbody>{''.join(rows)}</tbody>
     </table>"""
     )
# ── Headline findings (paper banner) ──
def build_headline_findings(findings: list) -> str:
    """Render the top-of-page banner of headline findings as HTML.

    Args:
        findings: Finding texts in display order. Each entry is inserted
            into the markup verbatim (no escaping), so it may carry
            inline HTML.

    Returns:
        A flex container holding one numbered card per non-blank finding,
        or ``""`` when there is nothing to show.
    """
    # Skip None / blank entries so a sloppy data file does not produce
    # empty "Finding N" cards; numbering follows the entries actually shown.
    shown = [t for t in (findings or []) if t is not None and str(t).strip()]
    if not shown:
        return ""

    # Accent colours cycle when there are more findings than colours.
    accents = ("#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e")
    cards = []
    for i, text in enumerate(shown):
        c = accents[i % len(accents)]
        cards.append(
            f'<div style="background:#ffffff;border:1px solid #e2e8f0;'
            f"border-left:4px solid {c};border-radius:10px;"
            f'padding:0.85rem 1rem;flex:1 1 220px;min-width:220px;'
            f'box-shadow:0 1px 3px rgba(0,0,0,0.04)">'
            f'<div style="font-size:0.7rem;font-weight:700;'
            f'color:{c};letter-spacing:0.08em;text-transform:uppercase;'
            f'margin-bottom:0.35rem">Finding {i+1}</div>'
            f'<div style="font-size:0.82rem;color:#1a202c;'
            f'line-height:1.45">{text}</div></div>'
        )
    return (
        '<div style="display:flex;flex-wrap:wrap;gap:0.7rem;'
        'margin:0.4rem 0 1rem">'
        f"{''.join(cards)}</div>"
    )
# ── Tab: Depth Gap (intervention experiments) ──
def build_intervention_section(interventions: dict) -> str:
    """Render the forced-depth and low-diversity intervention results as HTML.

    The forced-depth condition mandates ≥3 evaluation passes per design
    candidate; the low-diversity control constrains the candidate pool
    without forcing depth. Together they isolate evaluation depth as the
    causal driver of the 'surface competence' gap reported in the paper.

    Args:
        interventions: Mapping with a ``rows`` list plus optional
            ``description`` and ``n_tasks`` entries. Each row is read for
            ``label``, ``condition``, ``score``, ``delta_vs_baseline``,
            ``approach``, ``orchestration``, ``quality`` and ``diversity``.

    Returns:
        An HTML fragment with an intro card, a headline call-out and one
        table row per entry in ``rows``; a short placeholder paragraph when
        ``interventions`` is empty or has no rows.
    """
    if not interventions or not interventions.get("rows"):
        return '<p style="color:#718096">No intervention data available.</p>'

    def _num(value) -> str:
        # This section is built while the UI is assembled, so a single row
        # with a missing or non-numeric field must not raise; show a dash.
        try:
            return f"{value:.1f}"
        except (TypeError, ValueError):
            return "\u2014"

    rows = interventions["rows"]
    # condition key -> (badge colour, human-readable label)
    cond_meta = {
        "baseline": ("#64748b", "Baseline"),
        "forced_depth": ("#38a169", "Forced Depth"),
        "low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
    }
    TH = (
        "background:#0f172a;color:white;padding:0.65rem 0.9rem;"
        "text-align:left;font-size:0.72rem;text-transform:uppercase;"
        "letter-spacing:0.05em;font-weight:600"
    )
    TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;"
          "font-size:0.86rem")
    body = []
    for r in rows:
        condition = r.get("condition", "")
        # Unknown conditions fall back to a grey badge showing the raw key.
        color, cond_label = cond_meta.get(condition, ("#64748b", condition))
        delta = r.get("delta_vs_baseline")
        # A baseline row has no meaningful delta against itself.
        if not isinstance(delta, (int, float)) or condition == "baseline":
            delta_html = '<span style="color:#cbd5e0">\u2014</span>'
        else:
            sign = "+" if delta >= 0 else ""
            dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b")
            delta_html = (
                f'<span style="color:{dcol};font-weight:700">'
                f"{sign}{delta:.1f}</span>"
            )
        body.append(
            f'<tr><td style="{TD};font-weight:600;color:#0f172a">'
            f'{r.get("label", "")}</td>'
            f'<td style="{TD}"><span style="background:{color}22;'
            f"color:{color};padding:0.15rem 0.55rem;border-radius:4px;"
            f'font-size:0.72rem;font-weight:700">{cond_label}</span></td>'
            f'<td style="{TD};font-weight:700;font-variant-numeric:'
            f'tabular-nums">{_num(r.get("score"))}</td>'
            f'<td style="{TD};font-variant-numeric:tabular-nums">{delta_html}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{_num(r.get("approach"))} / {_num(r.get("orchestration"))}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{_num(r.get("quality"))}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{_num(r.get("diversity"))}</td></tr>'
        )
    # Size of the task subset the reruns were scored on (18 when unspecified).
    n = interventions.get("n_tasks", 18)
    return f"""
    <div style="max-width:980px;margin:0 auto">
      <div style="background:#ffffff;border:1px solid #e2e8f0;
                  border-radius:12px;padding:1.4rem 1.6rem;
                  margin-bottom:1rem">
        <h2 style="color:#0f172a;margin:0 0 0.5rem;font-size:1.2rem;
                   font-weight:700">Causal interventions on the depth gap</h2>
        <p style="color:#475569;line-height:1.55;margin:0">
          {interventions.get('description', '')}
          Reruns are scored on a representative <strong>{n}-task</strong>
          subset that spans all 9 occupied taxonomy cells.
        </p>
      </div>
      <div style="background:#fefce8;border-left:4px solid #ca8a04;
                  border-radius:8px;padding:0.95rem 1.1rem;
                  margin-bottom:1.1rem">
        <strong style="color:#713f12">Headline:</strong>
        <span style="color:#52340d">
          Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
          <strong>GPT-5 by +15.9</strong> points without any change to
          the underlying model or tools, while the low-diversity control
          <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
          cleanest on the strongest agent, where it provides direct
          causal evidence that
          <strong>evaluation depth &mdash; not the mere act of process
          intervention &mdash; drives the gain</strong>. GPT-5's
          response is more uniform across both interventions; we
          report the raw deltas without smoothing.
        </span>
      </div>
      <table style="width:100%;border-collapse:collapse;background:white;
                    border-radius:10px;overflow:hidden;
                    box-shadow:0 1px 3px rgba(0,0,0,0.08)">
        <thead><tr>
          <th style="{TH}">Run</th>
          <th style="{TH}">Condition</th>
          <th style="{TH}">Score</th>
          <th style="{TH}">&Delta; vs baseline</th>
          <th style="{TH}">Approach / Orch.</th>
          <th style="{TH}">Quality</th>
          <th style="{TH}">Diversity</th>
        </tr></thead>
        <tbody>{''.join(body)}</tbody>
      </table>
      <p style="color:#64748b;font-size:0.78rem;margin-top:0.8rem;
                line-height:1.5">
        Scoring uses the same 100-point hybrid rubric as the main
        leaderboard but is restricted to {n} representative tasks;
        absolute values therefore differ from the full-benchmark mean.
        The <em>delta vs baseline</em> compares each agent against
        its own untreated baseline run, isolating the intervention effect.
      </p>
    </div>
    """
 # ── Tab 5: About ──
       <div {card}>
         <h2 {h2}>What is BioDesignBench?</h2>
         <p {p}>
+          BioDesignBench is a benchmark for evaluating LLM agents as
+          orchestrators of multi-step <em>stochastic</em> protein-design
+          pipelines. Unlike chemistry- or code-agent benchmarks, where
+          tool chains are largely deterministic, protein design demands
+          repeated sampling from generative tools (RFdiffusion,
+          ProteinMPNN) and iterative cross-validation through several
+          biophysical metrics. We test the full agentic loop &mdash;
+          <strong>plan &rarr; sample &rarr; evaluate across multiple
+          metrics &rarr; iterate</strong> &mdash; over 76 expert-curated
+          tasks drawn from 2024&ndash;2026 literature, exposed through
+          17 MCP-integrated tools.
+        </p>
         <div style="display:grid;grid-template-columns:
                     repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
                     margin:1rem 0">
           </div>
           <div {stat_box}>
             <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
+              9</div>
+            <div style="font-size:0.78rem;color:#64748b">
+              taxonomy cells<br>(2 approaches \u00d7 5 subjects)</div>
           </div>
           <div {stat_box}>
             <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
         </div>
       </div>
+      <div {card}>
+        <h2 {h2}>Three principal findings</h2>
+        <h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3>
+        <p {p}>
+          DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded
+          pipeline (54.2) under both modes. Autonomous protein-design
+          orchestration is no longer infeasible &mdash; but a substantial
+          gap to the human expert (61.3) and oracle (74.9) remains.
+        </p>
+        <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
+        <p {p}>
+          Workflow guidance closes the <em>coverage</em> gap (Rescue
+          Index up to +3.01) but leaves <em>utilisation depth</em>
+          unchanged (Rescue Index \u2248 0). Better tool documentation
+          can teach agents <em>which</em> tools to call, but cannot
+          teach them to call those tools with the iterative depth that
+          expert practice demands.
+        </p>
+        <h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3>
+        <p {p}>
+          Across 836 task&ndash;condition observations, evaluation depth
+          per candidate correlates with total score at
+          <strong>&rho; = 0.685</strong>
+          (<em>p</em> &lt; 10<sup>-117</sup>). LLM agents generate
+          backbone candidates at expert-level rates but evaluate each
+          one at only <strong>14% of expert depth</strong>. Forced-depth
+          interventions confirm this is causal &mdash; see the
+          <em>Depth Gap</em> tab.
+        </p>
+      </div>
       <div {card}>
         <h2 {h2}>How to submit</h2>
         <h3 {h3}>1. Build your agent</h3>
       </div>
       <div {card}>
+        <h2 {h2}>Scoring rubric (100 points, hybrid)</h2>
+        <p {p}>
+          Scores combine <strong>72 algorithmic points</strong> from
+          deterministic biophysical metrics with
+          <strong>28 LLM-judge points</strong> assessed by a 3-judge
+          panel (PoLL) with self-exclusion to mitigate self-preference
+          bias. Each component is capped at its rubric maximum to
+          prevent double counting.
+        </p>
         <p {p}>
+          <strong>Approach (20 pts)</strong> &mdash; strategic
+          appropriateness of tool selection across 10 functional
+          categories (backbone generation, inverse folding, structure
+          prediction, etc.).</p>
         <p {p}>
+          <strong>Orchestration (15 pts)</strong> &mdash; pipeline
+          ordering, intermediate validation, and adaptive iteration.</p>
         <p {p}>
+          <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
+          Continuous 4-band interpolation over Boltz-2 re-prediction
+          metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
+          variance on biophysical quantities.</p>
         <p {p}>
+          <strong>Feasibility (15 pts)</strong> &mdash; valid amino
+          acids, length constraints, composition, and biophysical
+          plausibility.</p>
         <p {p}>
           <strong>Novelty (5 pts)</strong> &mdash; sequence identity to
+          reference (lower identity = more novel).</p>
         <p {p}>
+          <strong>Diversity (10 pts)</strong> &mdash; number and
+          pairwise diversity of generated designs.</p>
+      </div>
+      <div {card}>
+        <h2 {h2}>Five-layer contamination defense</h2>
+        <p {p}>Every evaluated LLM may have read protein-design
+          literature during pretraining, so we use a layered defense:</p>
+        <ul style="color:#475569;padding-left:1.5rem;
+                   margin-bottom:0.8rem;line-height:1.7">
+          <li>All 76 tasks derived from publications dated 2024&ndash;2026,
+              post-dating model training cutoffs.</li>
+          <li>Task prompts paraphrased and restructured &mdash; no
+              verbatim passages from source literature.</li>
+          <li>Targets specified by biological function and structural
+              constraints, not by name or PDB identifier.</li>
+          <li>12 decoy tasks with deliberately fabricated targets to
+              detect memorisation-based responses.</li>
+          <li>n-gram overlap analysis between agent outputs and source
+              publications &mdash; no verbatim regurgitation above the
+              8-gram threshold across any condition.</li>
+        </ul>
       </div>
       <div {card}>
         <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
                     border-radius:10px;font-size:0.8rem;
                     line-height:1.6">@article{{biodesignbench2026,
+  title={{Evaluating LLM-Driven Protein Design:
+         Agents Lack Iterative Evaluation Depth}},
+  author={{Kim, Jeonghyeon and Romero, Philip}},
   year={{2026}}
 }}</pre>
       </div>
 def chart_taxonomy_bar(entry: dict) -> go.Figure:
+    """Grouped bar chart of mean score per molecular subject,
+    split by design approach (de novo vs redesign).
+    """
     ts = entry.get("taxonomy_scores", {})
+    x_labels = [SUBJECT_LABELS[s] for s in SUBJECTS]
+    def _series(ap):
+        out = []
+        for sj in SUBJECTS:
+            if sj in VALID_CELLS[ap]:
+                out.append(ts.get(ap, {}).get(sj))
+            else:
+                out.append(None)
+        return out
+    dn = _series("de_novo")
+    rd = _series("redesign")
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=x_labels, y=dn, name="De Novo",
+        marker_color="rgba(49,130,206,0.78)",
+        text=[f"{v:.0f}" if v is not None else "" for v in dn],
+        textposition="outside",
+    ))
+    fig.add_trace(go.Bar(
+        x=x_labels, y=rd, name="Redesign",
+        marker_color="rgba(214,158,46,0.78)",
+        text=[f"{v:.0f}" if v is not None else "" for v in rd],
+        textposition="outside",
+    ))
     mode = entry.get("mode") or "\u2014"
     fig.update_layout(
         **_base_layout(
+            barmode="group",
             title=dict(
+                text=f"{entry['agent_name']} ({mode}) \u2014 Mean Score by Cell",
                 font_size=14,
             ),
+            yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"),
             xaxis=dict(title=""),
+            legend=dict(orientation="h", yanchor="bottom", y=-0.2,
+                        xanchor="center", x=0.5),
+            height=340,
         )
     )
     return fig
     fig.update_layout(
         **_base_layout(
             barmode="group",
+            yaxis=dict(range=[0, 80], title="Overall hybrid score"),
+            xaxis=dict(title=""),
             title=dict(
+                text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
+                      "guidance lifts coverage but rarely shifts overall score"),
+                font_size=13,
             ),
             legend=dict(
+                orientation="h", yanchor="bottom", y=-0.18,
                 xanchor="center", x=0.5,
             ),
+            height=380,
         )
     )
     return fig
     ) as app:
         gr.HTML(build_header(data["last_updated"], len(entries)))
+        gr.HTML(build_headline_findings(data.get("headline_findings", [])))
         with gr.Tabs():
                 for dd in [c1, c2]:
                     dd.change(_update_comp, [c1, c2], [radar, comp_bar])
+            # ════════ Tab 4: Benchmark vs User (coverage-depth dissociation) ════════
+            with gr.Tab("\u26a1 Guidance Effect"):
+                gr.HTML(
+                    '<div style="background:#eff6ff;border-left:4px solid '
+                    '#3182ce;border-radius:8px;padding:0.85rem 1.1rem;'
+                    'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
+                    'line-height:1.55">'
+                    '<strong>Mode semantics:</strong> '
+                    '<em>Benchmark mode</em> exposes atomic tools without '
+                    'pipeline hints (unguided); <em>User mode</em> packages '
+                    'them into composite workflows with explicit pipeline '
+                    'structure (guided). Guidance lifts the lowest-tier '
+                    'agents but does not consistently help capable ones, '
+                    'and never closes the depth gap (see <em>Depth Gap</em> '
+                    'tab).</div>'
+                )
                 gr.Plot(chart_mode_comparison(entries))
                 gr.HTML(build_mode_cards(entries))
+            # ════════ Tab 5: Depth Gap (interventions) ════════
+            with gr.Tab("\U0001f50d Depth Gap"):
+                gr.HTML(build_intervention_section(
+                    data.get("interventions", {})
+                ))
             # ══════ Tab 6: Submit ══════
             with gr.Tab("\U0001f4e4 Submit"):
                 gr.HTML("""

leaderboard_data.json CHANGED Viewed

@@ -1,412 +1,534 @@
 {
-  "last_updated": "2026-03-10",
   "entries": [
     {
-      "agent_name": "Oracle",
       "agent_id": "oracle",
       "mode": null,
       "mcp_custom": false,
-      "submission_type": "oracle",
-      "organization": "Ground Truth",
-      "overall_score": 87.3,
       "component_scores": {
         "approach": 20.0,
         "orchestration": 15.0,
-        "quality": 22.3,
-        "feasibility": 15.0,
-        "novelty": 5.0,
-        "diversity": 10.0
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 78,
-          "enzyme": 96,
-          "fluorescent_protein": 98,
-          "scaffold": 86
-        },
         "de_novo": {
-          "binder": 83,
-          "enzyme": 80,
-          "fluorescent_protein": 85,
-          "scaffold": 87,
-          "antibody": 74
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 0,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "Human Expert",
       "agent_id": "human-expert",
       "mode": null,
-      "mcp_custom": false,
       "submission_type": "human_expert",
       "organization": "Romero Lab",
-      "overall_score": 62.4,
       "component_scores": {
-        "approach": 19.0,
-        "orchestration": 9.9,
-        "quality": 12.9,
-        "feasibility": 13.6,
-        "novelty": 4.5,
-        "diversity": 2.6
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 52,
-          "enzyme": 50,
-          "fluorescent_protein": 53,
-          "scaffold": 52
-        },
         "de_novo": {
-          "binder": 74,
-          "enzyme": 46,
-          "fluorescent_protein": 61,
-          "scaffold": 68,
-          "antibody": 65
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 0,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "DeepSeek V3",
-      "agent_id": "deepseek-v3-user",
-      "mode": "user",
-      "mcp_custom": false,
       "submission_type": "llm",
       "organization": "DeepSeek",
-      "overall_score": 58.4,
       "component_scores": {
-        "approach": 12.8,
-        "orchestration": 10.0,
-        "quality": 15.6,
-        "feasibility": 12.2,
-        "novelty": 4.3,
-        "diversity": 3.4
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 57,
-          "enzyme": 58,
-          "fluorescent_protein": 62,
-          "scaffold": 57
-        },
         "de_novo": {
-          "binder": 64,
-          "enzyme": 56,
-          "fluorescent_protein": 61,
-          "scaffold": 51,
-          "antibody": 60
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 1,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
-      "agent_name": "Hardcoded Pipeline",
-      "agent_id": "hardcoded-pipeline",
-      "mode": null,
       "mcp_custom": false,
-      "submission_type": "hardcoded",
-      "organization": "Deterministic",
-      "overall_score": 52.4,
       "component_scores": {
-        "approach": 12.1,
-        "orchestration": 9.9,
-        "quality": 14.8,
-        "feasibility": 9.7,
-        "novelty": 3.8,
-        "diversity": 2.0
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 41,
-          "enzyme": 69,
-          "fluorescent_protein": 52,
-          "scaffold": 66
-        },
         "de_novo": {
-          "binder": 59,
-          "enzyme": 28,
-          "fluorescent_protein": 61,
-          "scaffold": 40,
-          "antibody": 52
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 5,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
-      "agent_name": "DeepSeek V3",
-      "agent_id": "deepseek-v3-benchmark",
       "mode": "benchmark",
-      "mcp_custom": false,
       "submission_type": "llm",
-      "organization": "DeepSeek",
-      "overall_score": 50.5,
       "component_scores": {
-        "approach": 7.1,
-        "orchestration": 7.2,
-        "quality": 16.1,
-        "feasibility": 13.2,
-        "novelty": 4.1,
-        "diversity": 3.0
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 51,
-          "enzyme": 52,
-          "fluorescent_protein": 50,
-          "scaffold": 60
-        },
         "de_novo": {
-          "binder": 54,
-          "enzyme": 40,
-          "fluorescent_protein": 40,
-          "scaffold": 48,
-          "antibody": 46
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 2,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "GPT-5",
       "agent_id": "gpt5-user",
       "mode": "user",
-      "mcp_custom": false,
       "submission_type": "llm",
       "organization": "OpenAI",
-      "overall_score": 49.2,
       "component_scores": {
-        "approach": 7.9,
-        "orchestration": 7.6,
-        "quality": 15.3,
-        "feasibility": 11.1,
-        "novelty": 4.1,
-        "diversity": 3.1
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 42,
-          "enzyme": 46,
-          "fluorescent_protein": 46,
-          "scaffold": 56
-        },
         "de_novo": {
-          "binder": 56,
-          "enzyme": 40,
-          "fluorescent_protein": 55,
-          "scaffold": 47,
-          "antibody": 52
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 3,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
-      "agent_name": "Claude Sonnet 4.5",
-      "agent_id": "sonnet-4.5-user",
-      "mode": "user",
       "mcp_custom": false,
-      "submission_type": "llm",
-      "organization": "Anthropic",
-      "overall_score": 47.9,
       "component_scores": {
-        "approach": 8.6,
-        "orchestration": 7.8,
-        "quality": 15.0,
-        "feasibility": 10.9,
-        "novelty": 3.4,
-        "diversity": 2.2
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 42,
-          "enzyme": 47,
-          "fluorescent_protein": 56,
-          "scaffold": 32
-        },
         "de_novo": {
-          "binder": 59,
-          "enzyme": 48,
-          "fluorescent_protein": 45,
-          "scaffold": 39,
-          "antibody": 48
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 6,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "Claude Sonnet 4.5",
-      "agent_id": "sonnet-4.5-benchmark",
-      "mode": "benchmark",
-      "mcp_custom": false,
       "submission_type": "llm",
       "organization": "Anthropic",
-      "overall_score": 42.3,
       "component_scores": {
-        "approach": 6.0,
-        "orchestration": 6.2,
-        "quality": 13.8,
-        "feasibility": 11.4,
-        "novelty": 3.2,
-        "diversity": 1.7
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 36,
-          "enzyme": 37,
-          "fluorescent_protein": 54,
-          "scaffold": 23
-        },
         "de_novo": {
-          "binder": 49,
-          "enzyme": 38,
-          "fluorescent_protein": 41,
-          "scaffold": 41,
-          "antibody": 33
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 9,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
-      "agent_name": "GPT-5",
-      "agent_id": "gpt5-benchmark",
       "mode": "benchmark",
-      "mcp_custom": false,
       "submission_type": "llm",
-      "organization": "OpenAI",
-      "overall_score": 41.0,
       "component_scores": {
-        "approach": 5.2,
-        "orchestration": 4.9,
-        "quality": 15.0,
-        "feasibility": 11.5,
-        "novelty": 3.5,
-        "diversity": 0.9
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 30,
-          "enzyme": 36,
-          "fluorescent_protein": 54,
-          "scaffold": 41
-        },
         "de_novo": {
-          "binder": 44,
-          "enzyme": 22,
-          "fluorescent_protein": 44,
-          "scaffold": 39,
-          "antibody": 40
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 5,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "Gemini 2.5 Pro",
       "agent_id": "gemini-2.5-pro-user",
       "mode": "user",
-      "mcp_custom": false,
       "submission_type": "llm",
       "organization": "Google",
-      "overall_score": 26.2,
       "component_scores": {
-        "approach": 0.0,
-        "orchestration": 0.0,
-        "quality": 10.3,
-        "feasibility": 10.9,
-        "novelty": 3.5,
-        "diversity": 1.5
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 22,
-          "enzyme": 29,
-          "fluorescent_protein": 29,
-          "scaffold": 21
-        },
         "de_novo": {
-          "binder": 36,
-          "enzyme": 8,
-          "fluorescent_protein": 0,
-          "scaffold": 19,
-          "antibody": 31
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 15,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     },
     {
       "agent_name": "Gemini 2.5 Pro",
       "agent_id": "gemini-2.5-pro-benchmark",
       "mode": "benchmark",
-      "mcp_custom": false,
       "submission_type": "llm",
       "organization": "Google",
-      "overall_score": 25.8,
       "component_scores": {
-        "approach": 0.0,
-        "orchestration": 0.0,
-        "quality": 10.1,
-        "feasibility": 10.7,
-        "novelty": 3.4,
-        "diversity": 1.6
       },
       "taxonomy_scores": {
-        "redesign": {
-          "antibody": 31,
-          "enzyme": 26,
-          "fluorescent_protein": 32,
-          "scaffold": 14
-        },
         "de_novo": {
-          "binder": 34,
-          "enzyme": 8,
-          "fluorescent_protein": 0,
-          "scaffold": 18,
-          "antibody": 30
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
-      "tasks_with_zero": 17,
       "avg_latency_sec": null,
-      "submission_date": "2026-03-10"
     }
-  ]
 }

 {
+  "last_updated": "2026-04-14",
+  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
+  "headline_findings": [
+    "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
+    "All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
+    "Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
+    "Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond binary tool selection.",
+    "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (-2.3) \u2014 evidence that depth, not process change alone, drives the gain."
+  ],
+  "scoring": {
+    "rubric_max": 100,
+    "components": {
+      "approach": 20,
+      "orchestration": 15,
+      "quality": 35,
+      "feasibility": 15,
+      "novelty": 5,
+      "diversity": 10
+    },
+    "method": "Hybrid: 72 algorithmic points (Boltz-2 verification) + 28 LLM-judge points (3-judge panel with self-exclusion)."
+  },
   "entries": [
     {
+      "agent_name": "Human Oracle",
       "agent_id": "oracle",
       "mode": null,
+      "submission_type": "human_oracle",
+      "organization": "Romero Lab",
       "mcp_custom": false,
+      "overall_score": 74.85,
       "component_scores": {
         "approach": 20.0,
         "orchestration": 15.0,
+        "quality": 26.24,
+        "feasibility": 10.26,
+        "novelty": 2.93,
+        "diversity": 0.43
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 79.2,
+          "binder": 71.8,
+          "enzyme": 75.6,
+          "fluorescent_protein": 78.7,
+          "scaffold": 75.8
+        },
+        "redesign": {
+          "antibody": 69.2,
+          "enzyme": 76.2,
+          "fluorescent_protein": 77.1,
+          "scaffold": 76.8
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 0,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "Human Expert",
       "agent_id": "human-expert",
       "mode": null,
       "submission_type": "human_expert",
       "organization": "Romero Lab",
+      "mcp_custom": false,
+      "overall_score": 61.25,
       "component_scores": {
+        "approach": 13.81,
+        "orchestration": 8.86,
+        "quality": 20.91,
+        "feasibility": 10.79,
+        "novelty": 3.46,
+        "diversity": 3.43
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 65.6,
+          "binder": 65.0,
+          "enzyme": 55.3,
+          "fluorescent_protein": 57.2,
+          "scaffold": 65.4
+        },
+        "redesign": {
+          "antibody": 52.4,
+          "enzyme": 59.5,
+          "fluorescent_protein": 54.6,
+          "scaffold": 53.7
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 0,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "DeepSeek V3",
+      "agent_id": "deepseek-v3-benchmark",
+      "mode": "benchmark",
       "submission_type": "llm",
       "organization": "DeepSeek",
+      "mcp_custom": false,
+      "overall_score": 60.43,
       "component_scores": {
+        "approach": 11.4,
+        "orchestration": 9.36,
+        "quality": 22.07,
+        "feasibility": 10.77,
+        "novelty": 3.44,
+        "diversity": 3.38
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 65.0,
+          "binder": 63.4,
+          "enzyme": 53.9,
+          "fluorescent_protein": 72.3,
+          "scaffold": 57.8
+        },
+        "redesign": {
+          "antibody": 61.3,
+          "enzyme": 59.3,
+          "fluorescent_protein": 56.9,
+          "scaffold": 66.9
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 1,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
+      "agent_name": "DeepSeek V3",
+      "agent_id": "deepseek-v3-user",
+      "mode": "user",
+      "submission_type": "llm",
+      "organization": "DeepSeek",
       "mcp_custom": false,
+      "overall_score": 58.46,
       "component_scores": {
+        "approach": 11.09,
+        "orchestration": 9.14,
+        "quality": 21.74,
+        "feasibility": 9.91,
+        "novelty": 3.25,
+        "diversity": 3.33
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 65.6,
+          "binder": 63.0,
+          "enzyme": 64.2,
+          "fluorescent_protein": 64.2,
+          "scaffold": 60.4
+        },
+        "redesign": {
+          "antibody": 61.6,
+          "enzyme": 60.7,
+          "fluorescent_protein": 43.0,
+          "scaffold": 44.1
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 7,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
+      "agent_name": "GPT-5",
+      "agent_id": "gpt5-benchmark",
       "mode": "benchmark",
       "submission_type": "llm",
+      "organization": "OpenAI",
+      "mcp_custom": false,
+      "overall_score": 55.61,
       "component_scores": {
+        "approach": 8.76,
+        "orchestration": 6.84,
+        "quality": 22.96,
+        "feasibility": 10.03,
+        "novelty": 3.27,
+        "diversity": 3.75
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 62.6,
+          "binder": 59.9,
+          "enzyme": 55.9,
+          "fluorescent_protein": 53.9,
+          "scaffold": 56.1
+        },
+        "redesign": {
+          "antibody": 47.3,
+          "enzyme": 54.4,
+          "fluorescent_protein": 49.5,
+          "scaffold": 54.6
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
       "tasks_with_zero": 2,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "GPT-5",
       "agent_id": "gpt5-user",
       "mode": "user",
       "submission_type": "llm",
       "organization": "OpenAI",
+      "mcp_custom": false,
+      "overall_score": 55.26,
       "component_scores": {
+        "approach": 9.46,
+        "orchestration": 8.29,
+        "quality": 20.83,
+        "feasibility": 9.9,
+        "novelty": 3.2,
+        "diversity": 3.58
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 61.2,
+          "binder": 56.1,
+          "enzyme": 57.9,
+          "fluorescent_protein": 61.3,
+          "scaffold": 55.6
+        },
+        "redesign": {
+          "antibody": 52.1,
+          "enzyme": 54.2,
+          "fluorescent_protein": 55.7,
+          "scaffold": 46.3
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 4,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
+      "agent_name": "Hardcoded Pipeline",
+      "agent_id": "hardcoded-pipeline",
+      "mode": null,
+      "submission_type": "hardcoded",
+      "organization": "Deterministic",
       "mcp_custom": false,
+      "overall_score": 54.2,
       "component_scores": {
+        "approach": 10.19,
+        "orchestration": 8.3,
+        "quality": 19.91,
+        "feasibility": 10.26,
+        "novelty": 2.48,
+        "diversity": 3.08
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 60.8,
+          "binder": 59.8,
+          "enzyme": 46.0,
+          "fluorescent_protein": 62.6,
+          "scaffold": 55.0
+        },
+        "redesign": {
+          "antibody": 45.4,
+          "enzyme": 50.7,
+          "fluorescent_protein": 49.5,
+          "scaffold": 50.3
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 0,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "Claude Sonnet 4.5",
+      "agent_id": "sonnet-4.5-user",
+      "mode": "user",
       "submission_type": "llm",
       "organization": "Anthropic",
+      "mcp_custom": false,
+      "overall_score": 50.23,
       "component_scores": {
+        "approach": 9.63,
+        "orchestration": 8.54,
+        "quality": 17.31,
+        "feasibility": 9.03,
+        "novelty": 2.68,
+        "diversity": 3.05
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 66.3,
+          "binder": 56.5,
+          "enzyme": 56.9,
+          "fluorescent_protein": 62.8,
+          "scaffold": 57.9
+        },
+        "redesign": {
+          "antibody": 43.1,
+          "enzyme": 37.5,
+          "fluorescent_protein": 32.8,
+          "scaffold": 42.0
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 16,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
+      "agent_name": "Claude Sonnet 4.5",
+      "agent_id": "sonnet-4.5-benchmark",
       "mode": "benchmark",
       "submission_type": "llm",
+      "organization": "Anthropic",
+      "mcp_custom": false,
+      "overall_score": 41.17,
       "component_scores": {
+        "approach": 7.92,
+        "orchestration": 6.93,
+        "quality": 13.54,
+        "feasibility": 8.2,
+        "novelty": 2.25,
+        "diversity": 2.33
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 29.5,
+          "binder": 55.5,
+          "enzyme": 29.6,
+          "fluorescent_protein": 45.9,
+          "scaffold": 41.2
+        },
+        "redesign": {
+          "antibody": 34.6,
+          "enzyme": 29.5,
+          "fluorescent_protein": 35.3,
+          "scaffold": 40.9
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 23,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "Gemini 2.5 Pro",
       "agent_id": "gemini-2.5-pro-user",
       "mode": "user",
       "submission_type": "llm",
       "organization": "Google",
+      "mcp_custom": false,
+      "overall_score": 8.75,
       "component_scores": {
+        "approach": 3.37,
+        "orchestration": 2.79,
+        "quality": 0.55,
+        "feasibility": 1.15,
+        "novelty": 0.49,
+        "diversity": 0.41
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 10.8,
+          "binder": 9.3,
+          "enzyme": 30.2,
+          "fluorescent_protein": 3.1,
+          "scaffold": 9.2
+        },
+        "redesign": {
+          "antibody": 8.0,
+          "enzyme": 4.9,
+          "fluorescent_protein": 6.8,
+          "scaffold": 8.6
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 74,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     },
     {
       "agent_name": "Gemini 2.5 Pro",
       "agent_id": "gemini-2.5-pro-benchmark",
       "mode": "benchmark",
       "submission_type": "llm",
       "organization": "Google",
+      "mcp_custom": false,
+      "overall_score": 8.11,
       "component_scores": {
+        "approach": 3.58,
+        "orchestration": 2.47,
+        "quality": 0.34,
+        "feasibility": 0.93,
+        "novelty": 0.42,
+        "diversity": 0.37
       },
       "taxonomy_scores": {
         "de_novo": {
+          "antibody": 9.1,
+          "binder": 9.2,
+          "enzyme": 11.0,
+          "fluorescent_protein": 3.1,
+          "scaffold": 9.1
+        },
+        "redesign": {
+          "antibody": 7.3,
+          "enzyme": 4.4,
+          "fluorescent_protein": 6.2,
+          "scaffold": 11.4
         }
       },
       "tasks_completed": 76,
       "tasks_total": 76,
+      "tasks_with_zero": 75,
       "avg_latency_sec": null,
+      "submission_date": "2026-04-06"
     }
+  ],
+  "interventions": {
+    "description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
+    "n_tasks": 18,
+    "rows": [
+      {
+        "label": "DeepSeek V3 \u2014 baseline",
+        "condition": "baseline",
+        "agent": "deepseek-v3-tools-benchmark",
+        "n_tasks": 18,
+        "score": 58.72,
+        "delta_vs_baseline": 0.0,
+        "approach": 13.44,
+        "orchestration": 11.17,
+        "quality": 16.11,
+        "diversity": 3.56
+      },
+      {
+        "label": "GPT-5 \u2014 baseline",
+        "condition": "baseline",
+        "agent": "gpt5-tools-benchmark",
+        "n_tasks": 18,
+        "score": 46.78,
+        "delta_vs_baseline": 0.0,
+        "approach": 8.33,
+        "orchestration": 6.22,
+        "quality": 15.39,
+        "diversity": 3.94
+      },
+      {
+        "label": "Human Expert \u2014 baseline",
+        "condition": "baseline",
+        "agent": "human-expert-agent",
+        "n_tasks": 18,
+        "score": 56.67,
+        "delta_vs_baseline": 0.0,
+        "approach": 18.28,
+        "orchestration": 9.28,
+        "quality": 11.06,
+        "diversity": 2.28
+      },
+      {
+        "label": "DeepSeek V3 \u2014 forced depth",
+        "condition": "forced_depth",
+        "agent": "deepseek-v3-forced-depth",
+        "n_tasks": 18,
+        "score": 68.06,
+        "delta_vs_baseline": 9.34,
+        "approach": 18.39,
+        "orchestration": 12.28,
+        "quality": 16.11,
+        "diversity": 3.94
+      },
+      {
+        "label": "GPT-5 \u2014 forced depth",
+        "condition": "forced_depth",
+        "agent": "gpt5-tools-forced-depth",
+        "n_tasks": 18,
+        "score": 62.67,
+        "delta_vs_baseline": 15.89,
+        "approach": 18.28,
+        "orchestration": 11.67,
+        "quality": 15.0,
+        "diversity": 3.06
+      },
+      {
+        "label": "DeepSeek V3 \u2014 low diversity",
+        "condition": "low_diversity_control",
+        "agent": "deepseek-v3-low-diversity",
+        "n_tasks": 18,
+        "score": 56.39,
+        "delta_vs_baseline": -2.33,
+        "approach": 13.11,
+        "orchestration": 11.11,
+        "quality": 16.0,
+        "diversity": 3.22
+      },
+      {
+        "label": "GPT-5 \u2014 low diversity",
+        "condition": "low_diversity_control",
+        "agent": "gpt5-tools-low-diversity",
+        "n_tasks": 18,
+        "score": 61.5,
+        "delta_vs_baseline": 14.72,
+        "approach": 13.06,
+        "orchestration": 12.0,
+        "quality": 16.22,
+        "diversity": 3.22
+      },
+      {
+        "label": "Human Expert \u2014 shallow",
+        "condition": "low_diversity_control",
+        "agent": "human-expert-shallow",
+        "n_tasks": 18,
+        "score": 55.06,
+        "delta_vs_baseline": -1.61,
+        "approach": 18.22,
+        "orchestration": 9.28,
+        "quality": 11.17,
+        "diversity": 0.61
+      }
+    ]
+  }
 }