Jasonkim8652 committed on
Commit
cfedbc8
·
verified ·
1 Parent(s): fab94cf

Update leaderboard with canonical Apr-6 hybrid scores + depth-gap interventions

Browse files

- Refresh leaderboard_data.json from results/canonical/scores.csv (hybrid 100-pt rubric)
- Switch to 2x5 design matrix (de_novo/redesign x 5 subjects, 9 occupied cells)
- Add headline-findings banner reflecting paper's three principal findings
- Add 'Depth Gap' tab with forced-depth and low-diversity intervention results
- Rewrite About section to surface the coverage-depth dissociation and the evaluation-depth gap (agents reach only 14% of expert depth)
- Update y-axis range so that DeepSeek V3's 60+ scores are not clipped

Files changed (2) hide show
  1. app.py +369 -83
  2. leaderboard_data.json +369 -247
app.py CHANGED
@@ -20,7 +20,7 @@ from pathlib import Path
20
  import gradio as gr
21
  import plotly.graph_objects as go
22
 
23
- ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "")
24
 
25
 
26
  # ═══════════════════════════════════════════════════════════════════
@@ -28,31 +28,43 @@ ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "")
28
  # ═══════════════════════════════════════════════════════════════════
29
 
30
  PAPER_URL = "#"
31
- GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench"
32
- HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard"
33
 
34
 
35
  # ═══════════════════════════════════════════════════════════════════
36
- # Taxonomy & scoring constants
37
  # ═══════════════════════════════════════════════════════════════════
38
 
39
- DESIGN_APPROACHES = ["de_novo", "redesign"]
40
  APPROACH_LABELS = {
41
- "de_novo": "De Novo",
42
  "redesign": "Redesign",
43
  }
44
- MOLECULAR_SUBJECTS = ["antibody", "enzyme", "binder", "scaffold", "fluorescent_protein"]
45
  SUBJECT_LABELS = {
46
  "antibody": "Antibody",
47
- "enzyme": "Enzyme",
48
  "binder": "Binder",
 
49
  "scaffold": "Scaffold",
50
- "fluorescent_protein": "Fluorescent Protein",
51
  }
 
52
  VALID_CELLS = {
53
- "de_novo": {"antibody", "enzyme", "binder", "scaffold", "fluorescent_protein"},
54
  "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
55
  }
 
 
 
 
 
 
 
 
 
 
 
56
  COMPONENTS = [
57
  "approach",
58
  "orchestration",
@@ -78,6 +90,8 @@ TYPE_STYLE = {
78
  "tag": "baseline",
79
  },
80
  "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
 
 
81
  }
82
 
83
 
@@ -188,9 +202,15 @@ def build_header(last_updated: str, n_entries: int) -> str:
188
  <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
189
  letter-spacing:-0.02em">
190
  \U0001f9ec BioDesignBench</h1>
191
- <p style="color:#64748b;margin:0.4rem 0 0;font-size:1rem;
192
- font-weight:400">
193
- Evaluating LLM agents on protein design via MCP tools</p>
 
 
 
 
 
 
194
  <div style="margin-top:1rem;display:flex;justify-content:center;
195
  gap:0.6rem;flex-wrap:wrap">
196
  <a href="{PAPER_URL}" target="_blank"
@@ -206,9 +226,11 @@ def build_header(last_updated: str, n_entries: int) -> str:
206
  <div style="margin-top:1rem;display:flex;justify-content:center;
207
  gap:1.5rem;flex-wrap:wrap">
208
  <span style="font-size:0.78rem;color:#94a3b8">
209
- 76 tasks</span>
 
 
210
  <span style="font-size:0.78rem;color:#94a3b8">
211
- {n_entries} agents</span>
212
  <span style="font-size:0.78rem;color:#94a3b8">
213
  Updated {last_updated}</span>
214
  </div>
@@ -403,7 +425,9 @@ def build_leaderboard_table(
403
 
404
 
405
  def build_heatmap(entry: dict) -> str:
406
- """HTML heatmap table for one agent across 2×5 taxonomy cells."""
 
 
407
  ts = entry.get("taxonomy_scores", {})
408
  TH = (
409
  "background:#0f172a;color:white;padding:0.6rem 0.8rem;"
@@ -415,24 +439,30 @@ def build_heatmap(entry: dict) -> str:
415
  )
416
 
417
  rows = []
418
- for ap in DESIGN_APPROACHES:
419
  cells = [
420
- f'<td style="{TD};text-align:left;font-weight:600;'
421
- f'background:#f8fafc">{APPROACH_LABELS[ap]}</td>'
422
  ]
423
  vals = []
424
- for subj in MOLECULAR_SUBJECTS:
425
- if subj in VALID_CELLS[ap]:
426
- val = ts.get(ap, {}).get(subj)
427
  bg = _heat_color(val)
428
- text = f"{val:.0f}" if val is not None else "\u2014"
 
 
 
 
 
 
429
  cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
430
  if val is not None:
431
  vals.append(val)
432
  else:
433
  cells.append(
434
  f'<td style="{TD};color:#cbd5e0;font-weight:400">'
435
- "\u2014</td>"
436
  )
437
  avg = sum(vals) / len(vals) if vals else 0
438
  avg_bg = _heat_color(avg)
@@ -442,9 +472,9 @@ def build_heatmap(entry: dict) -> str:
442
  )
443
  rows.append(f'<tr>{"".join(cells)}</tr>')
444
 
445
- subj_headers = "".join(
446
- f'<th style="{TH}">{SUBJECT_LABELS[s]}</th>'
447
- for s in MOLECULAR_SUBJECTS
448
  )
449
 
450
  return f"""
@@ -452,9 +482,9 @@ def build_heatmap(entry: dict) -> str:
452
  border-radius:10px;overflow:hidden;
453
  box-shadow:0 1px 3px rgba(0,0,0,0.08)">
454
  <thead><tr>
455
- <th style="{TH};text-align:left">Approach</th>
456
- {subj_headers}
457
- <th style="{TH}">Avg</th>
458
  </tr></thead>
459
  <tbody>{''.join(rows)}</tbody>
460
  </table>"""
@@ -531,6 +561,157 @@ def build_mode_cards(entries: list) -> str:
531
  )
532
 
533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
  # ── Tab 5: About ──
535
 
536
 
@@ -558,12 +739,18 @@ def build_about() -> str:
558
  <div {card}>
559
  <h2 {h2}>What is BioDesignBench?</h2>
560
  <p {p}>
561
- BioDesignBench is the first comprehensive benchmark for evaluating
562
- LLM agents on protein design tasks via MCP (Model Context Protocol)
563
- tool use. Unlike existing benchmarks that focus on model-only
564
- metrics, BioDesignBench tests the full agentic design loop:
565
- <strong>prompt &rarr; design &rarr; validate &rarr;
566
- iterate</strong>.</p>
 
 
 
 
 
 
567
  <div style="display:grid;grid-template-columns:
568
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
569
  margin:1rem 0">
@@ -574,8 +761,9 @@ def build_about() -> str:
574
  </div>
575
  <div {stat_box}>
576
  <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
577
- 2×5</div>
578
- <div style="font-size:0.78rem;color:#64748b">taxonomy matrix</div>
 
579
  </div>
580
  <div {stat_box}>
581
  <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
@@ -590,6 +778,37 @@ def build_about() -> str:
590
  </div>
591
  </div>
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  <div {card}>
594
  <h2 {h2}>How to submit</h2>
595
  <h3 {h3}>1. Build your agent</h3>
@@ -648,27 +867,58 @@ Response:
648
  </div>
649
 
650
  <div {card}>
651
- <h2 {h2}>Scoring rubric (100 points)</h2>
 
 
 
 
 
 
 
 
652
  <p {p}>
653
- <strong>Approach (20 pts)</strong> &mdash; design methodology
654
- coverage across 10 functional categories
655
- (backbone generation, sequence design, structure prediction, etc.)</p>
 
656
  <p {p}>
657
- <strong>Orchestration (15 pts)</strong> &mdash; pipeline ordering,
658
- intermediate validation, and iteration quality</p>
659
  <p {p}>
660
- <strong>Quality (35 pts)</strong> &mdash; three-tier graduated
661
- scoring based on structure confidence (pLDDT, pTM), interface
662
- metrics (ipTM, i_pAE), and interface physics</p>
 
663
  <p {p}>
664
- <strong>Feasibility (15 pts)</strong> &mdash; valid amino acids,
665
- length constraints, composition, and biophysical plausibility</p>
 
666
  <p {p}>
667
  <strong>Novelty (5 pts)</strong> &mdash; sequence identity to
668
- reference (lower identity = more novel = higher score)</p>
669
  <p {p}>
670
- <strong>Diversity (10 pts)</strong> &mdash; 65% pairwise sequence
671
- diversity + 35% positional entropy across designs</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
  </div>
673
 
674
  <div {card}>
@@ -676,9 +926,9 @@ Response:
676
  <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
677
  border-radius:10px;font-size:0.8rem;
678
  line-height:1.6">@article{{biodesignbench2026,
679
- title={{BioDesignBench: Evaluating LLM Agents on
680
- Protein Design via MCP Tools}},
681
- author={{Kim, Jason et al.}},
682
  year={{2026}}
683
  }}</pre>
684
  </div>
@@ -692,37 +942,50 @@ Response:
692
 
693
 
694
  def chart_taxonomy_bar(entry: dict) -> go.Figure:
695
- """Grouped bar chart of scores by approach × subject for one agent."""
 
 
696
  ts = entry.get("taxonomy_scores", {})
697
- subjects = MOLECULAR_SUBJECTS
698
- colors = {"de_novo": "rgba(49,130,206,0.7)", "redesign": "rgba(237,137,54,0.7)"}
699
 
700
- fig = go.Figure()
701
- for ap in DESIGN_APPROACHES:
702
- vals = []
703
- for s in subjects:
704
- v = ts.get(ap, {}).get(s)
705
- vals.append(v if v is not None else 0)
706
- fig.add_trace(go.Bar(
707
- name=APPROACH_LABELS[ap],
708
- x=[SUBJECT_LABELS[s] for s in subjects],
709
- y=vals,
710
- marker_color=colors[ap],
711
- text=[f"{v:.0f}" if v else "" for v in vals],
712
- textposition="auto",
713
- ))
714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  mode = entry.get("mode") or "\u2014"
716
  fig.update_layout(
717
  **_base_layout(
 
718
  title=dict(
719
- text=f"{entry['agent_name']} ({mode}) \u2014 Score by Approach \u00d7 Subject",
720
  font_size=14,
721
  ),
722
- yaxis=dict(range=[0, 100], title="Average Score"),
723
  xaxis=dict(title=""),
724
- barmode="group",
725
- height=300,
 
726
  )
727
  )
728
  return fig
@@ -850,16 +1113,18 @@ def chart_mode_comparison(entries: list) -> go.Figure:
850
  fig.update_layout(
851
  **_base_layout(
852
  barmode="group",
853
- yaxis=dict(range=[0, 50], title="Overall Score"),
 
854
  title=dict(
855
- text="Benchmark Mode vs User Mode \u2014 Overall Score",
856
- font_size=14,
 
857
  ),
858
  legend=dict(
859
- orientation="h", yanchor="bottom", y=-0.15,
860
  xanchor="center", x=0.5,
861
  ),
862
- height=350,
863
  )
864
  )
865
  return fig
@@ -895,6 +1160,7 @@ def create_app() -> gr.Blocks:
895
  ) as app:
896
 
897
  gr.HTML(build_header(data["last_updated"], len(entries)))
 
898
 
899
  with gr.Tabs():
900
 
@@ -979,11 +1245,31 @@ def create_app() -> gr.Blocks:
979
  for dd in [c1, c2]:
980
  dd.change(_update_comp, [c1, c2], [radar, comp_bar])
981
 
982
- # ════════ Tab 4: Benchmark vs User ════════
983
- with gr.Tab("\u26a1 Benchmark vs User"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
  gr.Plot(chart_mode_comparison(entries))
985
  gr.HTML(build_mode_cards(entries))
986
 
 
 
 
 
 
 
987
  # ══════ Tab 5: Submit ══════
988
  with gr.Tab("\U0001f4e4 Submit"):
989
  gr.HTML("""
 
20
  import gradio as gr
21
  import plotly.graph_objects as go
22
 
23
+ ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026")
24
 
25
 
26
  # ═══════════════════════════════════════════════════════════════════
 
28
  # ═══════════════════════════════════════════════════════════════════
29
 
30
  PAPER_URL = "#"
31
+ GITHUB_URL = "#"
32
+ HF_URL = "#"
33
 
34
 
35
  # ═══════════════════════════════════════════════════════════════════
36
+ # Taxonomy & scoring constants (2 × 5 design matrix)
37
  # ═══════════════════════════════════════════════════════════════════
38
 
39
+ APPROACHES = ["de_novo", "redesign"]
40
  APPROACH_LABELS = {
41
+ "de_novo": "De Novo Design",
42
  "redesign": "Redesign",
43
  }
44
+ SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"]
45
  SUBJECT_LABELS = {
46
  "antibody": "Antibody",
 
47
  "binder": "Binder",
48
+ "enzyme": "Enzyme",
49
  "scaffold": "Scaffold",
50
+ "fluorescent_protein": "Fluorescent Prot.",
51
  }
52
+ # 9 valid cells (rd × binder is empty in current task set)
53
  VALID_CELLS = {
54
+ "de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"},
55
  "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
56
  }
57
+ N_TASKS_PER_CELL = {
58
+ ("de_novo", "antibody"): 4,
59
+ ("de_novo", "binder"): 19,
60
+ ("de_novo", "enzyme"): 2,
61
+ ("de_novo", "scaffold"): 21,
62
+ ("de_novo", "fluorescent_protein"): 1,
63
+ ("redesign", "antibody"): 5,
64
+ ("redesign", "enzyme"): 10,
65
+ ("redesign", "scaffold"): 4,
66
+ ("redesign", "fluorescent_protein"): 10,
67
+ }
68
  COMPONENTS = [
69
  "approach",
70
  "orchestration",
 
90
  "tag": "baseline",
91
  },
92
  "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
93
+ # Backward-compat alias for older JSON files
94
+ "oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
95
  }
96
 
97
 
 
202
  <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
203
  letter-spacing:-0.02em">
204
  \U0001f9ec BioDesignBench</h1>
205
+ <p style="color:#0f172a;margin:0.6rem 0 0.2rem;font-size:1.1rem;
206
+ font-weight:600;line-height:1.4">
207
+ Can LLM agents orchestrate stochastic protein-design pipelines?</p>
208
+ <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
209
+ font-weight:400;font-style:italic;max-width:680px;
210
+ margin-left:auto;margin-right:auto;line-height:1.5">
211
+ Top-tier agents now surpass a deterministic pipeline &mdash;
212
+ but invoke evaluation tools at only <strong>14% of expert depth</strong>.
213
+ Guidance rescues coverage, not depth.</p>
214
  <div style="margin-top:1rem;display:flex;justify-content:center;
215
  gap:0.6rem;flex-wrap:wrap">
216
  <a href="{PAPER_URL}" target="_blank"
 
226
  <div style="margin-top:1rem;display:flex;justify-content:center;
227
  gap:1.5rem;flex-wrap:wrap">
228
  <span style="font-size:0.78rem;color:#94a3b8">
229
+ 76 tasks &middot; 5 molecular families</span>
230
+ <span style="font-size:0.78rem;color:#94a3b8">
231
+ 17 MCP tools</span>
232
  <span style="font-size:0.78rem;color:#94a3b8">
233
+ {n_entries} conditions</span>
234
  <span style="font-size:0.78rem;color:#94a3b8">
235
  Updated {last_updated}</span>
236
  </div>
 
425
 
426
 
427
  def build_heatmap(entry: dict) -> str:
428
+ """HTML heatmap for one agent across the 2 × 5 design matrix
429
+ (DesignApproach × MolecularSubject = 9 valid cells; rd × binder is empty).
430
+ """
431
  ts = entry.get("taxonomy_scores", {})
432
  TH = (
433
  "background:#0f172a;color:white;padding:0.6rem 0.8rem;"
 
439
  )
440
 
441
  rows = []
442
+ for ap in APPROACHES:
443
  cells = [
444
+ f'<td style="{TD};text-align:left;font-weight:700;'
445
+ f'background:#f8fafc;color:#0f172a">{APPROACH_LABELS[ap]}</td>'
446
  ]
447
  vals = []
448
+ for sj in SUBJECTS:
449
+ if sj in VALID_CELLS[ap]:
450
+ val = ts.get(ap, {}).get(sj)
451
  bg = _heat_color(val)
452
+ n = N_TASKS_PER_CELL.get((ap, sj), 0)
453
+ text = (
454
+ f'{val:.0f}<br><span style="font-size:0.65rem;'
455
+ f'font-weight:400;color:#64748b">n={n}</span>'
456
+ if val is not None
457
+ else "\u2014"
458
+ )
459
  cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
460
  if val is not None:
461
  vals.append(val)
462
  else:
463
  cells.append(
464
  f'<td style="{TD};color:#cbd5e0;font-weight:400">'
465
+ "n/a</td>"
466
  )
467
  avg = sum(vals) / len(vals) if vals else 0
468
  avg_bg = _heat_color(avg)
 
472
  )
473
  rows.append(f'<tr>{"".join(cells)}</tr>')
474
 
475
+ sj_headers = "".join(
476
+ f'<th style="{TH}">{SUBJECT_LABELS[sj]}</th>'
477
+ for sj in SUBJECTS
478
  )
479
 
480
  return f"""
 
482
  border-radius:10px;overflow:hidden;
483
  box-shadow:0 1px 3px rgba(0,0,0,0.08)">
484
  <thead><tr>
485
+ <th style="{TH};text-align:left">Approach \u2193 / Subject \u2192</th>
486
+ {sj_headers}
487
+ <th style="{TH}">Mean</th>
488
  </tr></thead>
489
  <tbody>{''.join(rows)}</tbody>
490
  </table>"""
 
561
  )
562
 
563
 
564
+ # ── Headline findings (paper banner) ──
565
+
566
+
567
+ def build_headline_findings(findings: list) -> str:
568
+ """Top-of-page banner that surfaces the paper's three core claims."""
569
+ if not findings:
570
+ return ""
571
+ cards = []
572
+ accents = ["#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e"]
573
+ for i, text in enumerate(findings):
574
+ c = accents[i % len(accents)]
575
+ cards.append(
576
+ f'<div style="background:#ffffff;border:1px solid #e2e8f0;'
577
+ f"border-left:4px solid {c};border-radius:10px;"
578
+ f'padding:0.85rem 1rem;flex:1 1 220px;min-width:220px;'
579
+ f'box-shadow:0 1px 3px rgba(0,0,0,0.04)">'
580
+ f'<div style="font-size:0.7rem;font-weight:700;'
581
+ f'color:{c};letter-spacing:0.08em;text-transform:uppercase;'
582
+ f'margin-bottom:0.35rem">Finding {i+1}</div>'
583
+ f'<div style="font-size:0.82rem;color:#1a202c;'
584
+ f'line-height:1.45">{text}</div></div>'
585
+ )
586
+ return (
587
+ '<div style="display:flex;flex-wrap:wrap;gap:0.7rem;'
588
+ 'margin:0.4rem 0 1rem">'
589
+ f"{''.join(cards)}</div>"
590
+ )
591
+
592
+
593
+ # ── Tab: Depth Gap (intervention experiments) ──
594
+
595
+
596
+ def build_intervention_section(interventions: dict) -> str:
597
+ """Show forced-depth and low-diversity intervention results.
598
+
599
+ The forced-depth condition mandates ≥3 evaluation passes per design
600
+ candidate; the low-diversity control constrains the candidate pool
601
+ without forcing depth. Together they isolate evaluation depth as the
602
+ causal driver of the 'surface competence' gap reported in the paper.
603
+ """
604
+ if not interventions or not interventions.get("rows"):
605
+ return '<p style="color:#718096">No intervention data available.</p>'
606
+
607
+ rows = interventions["rows"]
608
+
609
+ cond_meta = {
610
+ "baseline": ("#64748b", "Baseline"),
611
+ "forced_depth": ("#38a169", "Forced Depth"),
612
+ "low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
613
+ }
614
+
615
+ TH = (
616
+ "background:#0f172a;color:white;padding:0.65rem 0.9rem;"
617
+ "text-align:left;font-size:0.72rem;text-transform:uppercase;"
618
+ "letter-spacing:0.05em;font-weight:600"
619
+ )
620
+ TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;"
621
+ "font-size:0.86rem")
622
+
623
+ body = []
624
+ for r in rows:
625
+ color, cond_label = cond_meta.get(r["condition"], ("#64748b", r["condition"]))
626
+ delta = r.get("delta_vs_baseline")
627
+ if delta is None or r["condition"] == "baseline":
628
+ delta_html = '<span style="color:#cbd5e0">\u2014</span>'
629
+ else:
630
+ sign = "+" if delta >= 0 else ""
631
+ dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b")
632
+ delta_html = (
633
+ f'<span style="color:{dcol};font-weight:700">'
634
+ f"{sign}{delta:.1f}</span>"
635
+ )
636
+ body.append(
637
+ f'<tr><td style="{TD};font-weight:600;color:#0f172a">'
638
+ f'{r["label"]}</td>'
639
+ f'<td style="{TD}"><span style="background:{color}22;'
640
+ f"color:{color};padding:0.15rem 0.55rem;border-radius:4px;"
641
+ f'font-size:0.72rem;font-weight:700">{cond_label}</span></td>'
642
+ f'<td style="{TD};font-weight:700;font-variant-numeric:'
643
+ f'tabular-nums">{r["score"]:.1f}</td>'
644
+ f'<td style="{TD};font-variant-numeric:tabular-nums">{delta_html}</td>'
645
+ f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
646
+ f'{r["approach"]:.1f} / {r["orchestration"]:.1f}</td>'
647
+ f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
648
+ f'{r["quality"]:.1f}</td>'
649
+ f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
650
+ f'{r["diversity"]:.1f}</td></tr>'
651
+ )
652
+
653
+ n = interventions.get("n_tasks", 18)
654
+
655
+ return f"""
656
+ <div style="max-width:980px;margin:0 auto">
657
+
658
+ <div style="background:#ffffff;border:1px solid #e2e8f0;
659
+ border-radius:12px;padding:1.4rem 1.6rem;
660
+ margin-bottom:1rem">
661
+ <h2 style="color:#0f172a;margin:0 0 0.5rem;font-size:1.2rem;
662
+ font-weight:700">Causal interventions on the depth gap</h2>
663
+ <p style="color:#475569;line-height:1.55;margin:0">
664
+ {interventions.get('description', '')}
665
+ Reruns are scored on a representative <strong>{n}-task</strong>
666
+ subset that spans all 9 occupied taxonomy cells.
667
+ </p>
668
+ </div>
669
+
670
+ <div style="background:#fefce8;border-left:4px solid #ca8a04;
671
+ border-radius:8px;padding:0.95rem 1.1rem;
672
+ margin-bottom:1.1rem">
673
+ <strong style="color:#713f12">Headline:</strong>
674
+ <span style="color:#52340d">
675
+ Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
676
+ <strong>GPT-5 by +15.9</strong> points without any change to
677
+ the underlying model or tools, while the low-diversity control
678
+ <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
679
+ cleanest on the strongest agent, where it provides direct
680
+ causal evidence that
681
+ <strong>evaluation depth &mdash; not the mere act of process
682
+ intervention &mdash; drives the gain</strong>. GPT-5's
683
+ response is more uniform across both interventions; we
684
+ report the raw deltas without smoothing.
685
+ </span>
686
+ </div>
687
+
688
+ <table style="width:100%;border-collapse:collapse;background:white;
689
+ border-radius:10px;overflow:hidden;
690
+ box-shadow:0 1px 3px rgba(0,0,0,0.08)">
691
+ <thead><tr>
692
+ <th style="{TH}">Run</th>
693
+ <th style="{TH}">Condition</th>
694
+ <th style="{TH}">Score</th>
695
+ <th style="{TH}">&Delta; vs baseline</th>
696
+ <th style="{TH}">Approach / Orch.</th>
697
+ <th style="{TH}">Quality</th>
698
+ <th style="{TH}">Diversity</th>
699
+ </tr></thead>
700
+ <tbody>{''.join(body)}</tbody>
701
+ </table>
702
+
703
+ <p style="color:#64748b;font-size:0.78rem;margin-top:0.8rem;
704
+ line-height:1.5">
705
+ Scoring uses the same 100-point hybrid rubric as the main
706
+ leaderboard but is restricted to {n} representative tasks;
707
+ absolute values therefore differ from the full-benchmark mean.
708
+ The <em>delta vs baseline</em> compares each agent against
709
+ its own untreated baseline run, isolating the intervention effect.
710
+ </p>
711
+ </div>
712
+ """
713
+
714
+
715
  # ── Tab 5: About ──
716
 
717
 
 
739
  <div {card}>
740
  <h2 {h2}>What is BioDesignBench?</h2>
741
  <p {p}>
742
+ BioDesignBench is a benchmark for evaluating LLM agents as
743
+ orchestrators of multi-step <em>stochastic</em> protein-design
744
+ pipelines. Unlike chemistry- or code-agent benchmarks, where
745
+ tool chains are largely deterministic, protein design demands
746
+ repeated sampling from generative tools (RFdiffusion,
747
+ ProteinMPNN) and iterative cross-validation through several
748
+ biophysical metrics. We test the full agentic loop &mdash;
749
+ <strong>plan &rarr; sample &rarr; evaluate across multiple
750
+ metrics &rarr; iterate</strong> &mdash; over 76 expert-curated
751
+ tasks drawn from 2024&ndash;2026 literature, exposed through
752
+ 17 MCP-integrated tools.
753
+ </p>
754
  <div style="display:grid;grid-template-columns:
755
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
756
  margin:1rem 0">
 
761
  </div>
762
  <div {stat_box}>
763
  <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
764
+ 9</div>
765
+ <div style="font-size:0.78rem;color:#64748b">
766
+ taxonomy cells<br>(2 approaches \u00d7 5 subjects)</div>
767
  </div>
768
  <div {stat_box}>
769
  <div style="font-size:1.8rem;font-weight:800;color:#0f172a">
 
778
  </div>
779
  </div>
780
 
781
+ <div {card}>
782
+ <h2 {h2}>Three principal findings</h2>
783
+ <h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3>
784
+ <p {p}>
785
+ DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded
786
+ pipeline (54.2) under both modes. Autonomous protein-design
787
+ orchestration is no longer infeasible &mdash; but a substantial
788
+ gap to the human expert (61.3) and oracle (74.9) remains.
789
+ </p>
790
+ <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
791
+ <p {p}>
792
+ Workflow guidance closes the <em>coverage</em> gap (Rescue
793
+ Index up to +3.01) but leaves <em>utilisation depth</em>
794
+ unchanged (Rescue Index \u2248 0). Better tool documentation
795
+ can teach agents <em>which</em> tools to call, but cannot
796
+ teach them to call those tools with the iterative depth that
797
+ expert practice demands.
798
+ </p>
799
+ <h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3>
800
+ <p {p}>
801
+ Across 836 task&ndash;condition observations, evaluation depth
802
+ per candidate correlates with total score at
803
+ <strong>&rho; = 0.685</strong>
804
+ (<em>p</em> &lt; 10<sup>-117</sup>). LLM agents generate
805
+ backbone candidates at expert-level rates but evaluate each
806
+ one at only <strong>14% of expert depth</strong>. Forced-depth
807
+ interventions confirm this is causal &mdash; see the
808
+ <em>Depth Gap</em> tab.
809
+ </p>
810
+ </div>
811
+
812
  <div {card}>
813
  <h2 {h2}>How to submit</h2>
814
  <h3 {h3}>1. Build your agent</h3>
 
867
  </div>
868
 
869
  <div {card}>
870
+ <h2 {h2}>Scoring rubric (100 points, hybrid)</h2>
871
+ <p {p}>
872
+ Scores combine <strong>72 algorithmic points</strong> from
873
+ deterministic biophysical metrics with
874
+ <strong>28 LLM-judge points</strong> assessed by a 3-judge
875
+ panel (PoLL) with self-exclusion to mitigate self-preference
876
+ bias. Each component is capped at its rubric maximum to
877
+ prevent double counting.
878
+ </p>
879
  <p {p}>
880
+ <strong>Approach (20 pts)</strong> &mdash; strategic
881
+ appropriateness of tool selection across 10 functional
882
+ categories (backbone generation, inverse folding, structure
883
+ prediction, etc.).</p>
884
  <p {p}>
885
+ <strong>Orchestration (15 pts)</strong> &mdash; pipeline
886
+ ordering, intermediate validation, and adaptive iteration.</p>
887
  <p {p}>
888
+ <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
889
+ Continuous 4-band interpolation over Boltz-2 re-prediction
890
+ metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
891
+ variance on biophysical quantities.</p>
892
  <p {p}>
893
+ <strong>Feasibility (15 pts)</strong> &mdash; valid amino
894
+ acids, length constraints, composition, and biophysical
895
+ plausibility.</p>
896
  <p {p}>
897
  <strong>Novelty (5 pts)</strong> &mdash; sequence identity to
898
+ reference (lower identity = more novel).</p>
899
  <p {p}>
900
+ <strong>Diversity (10 pts)</strong> &mdash; number and
901
+ pairwise diversity of generated designs.</p>
902
+ </div>
903
+
904
+ <div {card}>
905
+ <h2 {h2}>Five-layer contamination defense</h2>
906
+ <p {p}>Every evaluated LLM may have read protein-design
907
+ literature during pretraining, so we use a layered defense:</p>
908
+ <ul style="color:#475569;padding-left:1.5rem;
909
+ margin-bottom:0.8rem;line-height:1.7">
910
+ <li>All 76 tasks derived from publications dated 2024&ndash;2026,
911
+ post-dating model training cutoffs.</li>
912
+ <li>Task prompts paraphrased and restructured &mdash; no
913
+ verbatim passages from source literature.</li>
914
+ <li>Targets specified by biological function and structural
915
+ constraints, not by name or PDB identifier.</li>
916
+ <li>12 decoy tasks with deliberately fabricated targets to
917
+ detect memorisation-based responses.</li>
918
+ <li>n-gram overlap analysis between agent outputs and source
919
+ publications &mdash; no verbatim regurgitation above the
920
+ 8-gram threshold across any condition.</li>
921
+ </ul>
922
  </div>
923
 
924
  <div {card}>
 
926
  <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
927
  border-radius:10px;font-size:0.8rem;
928
  line-height:1.6">@article{{biodesignbench2026,
929
+ title={{Evaluating LLM-Driven Protein Design:
930
+ Agents Lack Iterative Evaluation Depth}},
931
+ author={{Kim, Jeonghyeon and Romero, Philip}},
932
  year={{2026}}
933
  }}</pre>
934
  </div>
 
942
 
943
 
944
  def chart_taxonomy_bar(entry: dict) -> go.Figure:
945
+ """Grouped bar chart of mean score per molecular subject,
946
+ split by design approach (de novo vs redesign).
947
+ """
948
  ts = entry.get("taxonomy_scores", {})
949
+ x_labels = [SUBJECT_LABELS[s] for s in SUBJECTS]
 
950
 
951
+ def _series(ap):
952
+ out = []
953
+ for sj in SUBJECTS:
954
+ if sj in VALID_CELLS[ap]:
955
+ out.append(ts.get(ap, {}).get(sj))
956
+ else:
957
+ out.append(None)
958
+ return out
959
+
960
+ dn = _series("de_novo")
961
+ rd = _series("redesign")
 
 
 
962
 
963
+ fig = go.Figure()
964
+ fig.add_trace(go.Bar(
965
+ x=x_labels, y=dn, name="De Novo",
966
+ marker_color="rgba(49,130,206,0.78)",
967
+ text=[f"{v:.0f}" if v is not None else "" for v in dn],
968
+ textposition="outside",
969
+ ))
970
+ fig.add_trace(go.Bar(
971
+ x=x_labels, y=rd, name="Redesign",
972
+ marker_color="rgba(214,158,46,0.78)",
973
+ text=[f"{v:.0f}" if v is not None else "" for v in rd],
974
+ textposition="outside",
975
+ ))
976
  mode = entry.get("mode") or "\u2014"
977
  fig.update_layout(
978
  **_base_layout(
979
+ barmode="group",
980
  title=dict(
981
+ text=f"{entry['agent_name']} ({mode}) \u2014 Mean Score by Cell",
982
  font_size=14,
983
  ),
984
+ yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"),
985
  xaxis=dict(title=""),
986
+ legend=dict(orientation="h", yanchor="bottom", y=-0.2,
987
+ xanchor="center", x=0.5),
988
+ height=340,
989
  )
990
  )
991
  return fig
 
1113
  fig.update_layout(
1114
  **_base_layout(
1115
  barmode="group",
1116
+ yaxis=dict(range=[0, 80], title="Overall hybrid score"),
1117
+ xaxis=dict(title=""),
1118
  title=dict(
1119
+ text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
1120
+ "guidance lifts coverage but rarely shifts overall score"),
1121
+ font_size=13,
1122
  ),
1123
  legend=dict(
1124
+ orientation="h", yanchor="bottom", y=-0.18,
1125
  xanchor="center", x=0.5,
1126
  ),
1127
+ height=380,
1128
  )
1129
  )
1130
  return fig
 
1160
  ) as app:
1161
 
1162
  gr.HTML(build_header(data["last_updated"], len(entries)))
1163
+ gr.HTML(build_headline_findings(data.get("headline_findings", [])))
1164
 
1165
  with gr.Tabs():
1166
 
 
1245
  for dd in [c1, c2]:
1246
  dd.change(_update_comp, [c1, c2], [radar, comp_bar])
1247
 
1248
+ # ════════ Tab 4: Benchmark vs User (coverage-depth dissociation) ════════
1249
+ with gr.Tab("\u26a1 Guidance Effect"):
1250
+ gr.HTML(
1251
+ '<div style="background:#eff6ff;border-left:4px solid '
1252
+ '#3182ce;border-radius:8px;padding:0.85rem 1.1rem;'
1253
+ 'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
1254
+ 'line-height:1.55">'
1255
+ '<strong>Mode semantics:</strong> '
1256
+ '<em>Benchmark mode</em> exposes atomic tools without '
1257
+ 'pipeline hints (unguided); <em>User mode</em> packages '
1258
+ 'them into composite workflows with explicit pipeline '
1259
+ 'structure (guided). Guidance lifts the lowest-tier '
1260
+ 'agents but does not consistently help capable ones, '
1261
+ 'and never closes the depth gap (see <em>Depth Gap</em> '
1262
+ 'tab).</div>'
1263
+ )
1264
  gr.Plot(chart_mode_comparison(entries))
1265
  gr.HTML(build_mode_cards(entries))
1266
 
1267
+ # ════════ Tab 5: Depth Gap (interventions) ════════
1268
+ with gr.Tab("\U0001f50d Depth Gap"):
1269
+ gr.HTML(build_intervention_section(
1270
+ data.get("interventions", {})
1271
+ ))
1272
+
1273
  # ══════ Tab 5: Submit ══════
1274
  with gr.Tab("\U0001f4e4 Submit"):
1275
  gr.HTML("""
leaderboard_data.json CHANGED
@@ -1,412 +1,534 @@
1
  {
2
- "last_updated": "2026-03-10",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "entries": [
4
  {
5
- "agent_name": "Oracle",
6
  "agent_id": "oracle",
7
  "mode": null,
 
 
8
  "mcp_custom": false,
9
- "submission_type": "oracle",
10
- "organization": "Ground Truth",
11
- "overall_score": 87.3,
12
  "component_scores": {
13
  "approach": 20.0,
14
  "orchestration": 15.0,
15
- "quality": 22.3,
16
- "feasibility": 15.0,
17
- "novelty": 5.0,
18
- "diversity": 10.0
19
  },
20
  "taxonomy_scores": {
21
- "redesign": {
22
- "antibody": 78,
23
- "enzyme": 96,
24
- "fluorescent_protein": 98,
25
- "scaffold": 86
26
- },
27
  "de_novo": {
28
- "binder": 83,
29
- "enzyme": 80,
30
- "fluorescent_protein": 85,
31
- "scaffold": 87,
32
- "antibody": 74
 
 
 
 
 
 
33
  }
34
  },
35
  "tasks_completed": 76,
36
  "tasks_total": 76,
37
  "tasks_with_zero": 0,
38
  "avg_latency_sec": null,
39
- "submission_date": "2026-03-10"
40
  },
41
  {
42
  "agent_name": "Human Expert",
43
  "agent_id": "human-expert",
44
  "mode": null,
45
- "mcp_custom": false,
46
  "submission_type": "human_expert",
47
  "organization": "Romero Lab",
48
- "overall_score": 62.4,
 
49
  "component_scores": {
50
- "approach": 19.0,
51
- "orchestration": 9.9,
52
- "quality": 12.9,
53
- "feasibility": 13.6,
54
- "novelty": 4.5,
55
- "diversity": 2.6
56
  },
57
  "taxonomy_scores": {
58
- "redesign": {
59
- "antibody": 52,
60
- "enzyme": 50,
61
- "fluorescent_protein": 53,
62
- "scaffold": 52
63
- },
64
  "de_novo": {
65
- "binder": 74,
66
- "enzyme": 46,
67
- "fluorescent_protein": 61,
68
- "scaffold": 68,
69
- "antibody": 65
 
 
 
 
 
 
70
  }
71
  },
72
  "tasks_completed": 76,
73
  "tasks_total": 76,
74
  "tasks_with_zero": 0,
75
  "avg_latency_sec": null,
76
- "submission_date": "2026-03-10"
77
  },
78
  {
79
  "agent_name": "DeepSeek V3",
80
- "agent_id": "deepseek-v3-user",
81
- "mode": "user",
82
- "mcp_custom": false,
83
  "submission_type": "llm",
84
  "organization": "DeepSeek",
85
- "overall_score": 58.4,
 
86
  "component_scores": {
87
- "approach": 12.8,
88
- "orchestration": 10.0,
89
- "quality": 15.6,
90
- "feasibility": 12.2,
91
- "novelty": 4.3,
92
- "diversity": 3.4
93
  },
94
  "taxonomy_scores": {
95
- "redesign": {
96
- "antibody": 57,
97
- "enzyme": 58,
98
- "fluorescent_protein": 62,
99
- "scaffold": 57
100
- },
101
  "de_novo": {
102
- "binder": 64,
103
- "enzyme": 56,
104
- "fluorescent_protein": 61,
105
- "scaffold": 51,
106
- "antibody": 60
 
 
 
 
 
 
107
  }
108
  },
109
  "tasks_completed": 76,
110
  "tasks_total": 76,
111
  "tasks_with_zero": 1,
112
  "avg_latency_sec": null,
113
- "submission_date": "2026-03-10"
114
  },
115
  {
116
- "agent_name": "Hardcoded Pipeline",
117
- "agent_id": "hardcoded-pipeline",
118
- "mode": null,
 
 
119
  "mcp_custom": false,
120
- "submission_type": "hardcoded",
121
- "organization": "Deterministic",
122
- "overall_score": 52.4,
123
  "component_scores": {
124
- "approach": 12.1,
125
- "orchestration": 9.9,
126
- "quality": 14.8,
127
- "feasibility": 9.7,
128
- "novelty": 3.8,
129
- "diversity": 2.0
130
  },
131
  "taxonomy_scores": {
132
- "redesign": {
133
- "antibody": 41,
134
- "enzyme": 69,
135
- "fluorescent_protein": 52,
136
- "scaffold": 66
137
- },
138
  "de_novo": {
139
- "binder": 59,
140
- "enzyme": 28,
141
- "fluorescent_protein": 61,
142
- "scaffold": 40,
143
- "antibody": 52
 
 
 
 
 
 
144
  }
145
  },
146
  "tasks_completed": 76,
147
  "tasks_total": 76,
148
- "tasks_with_zero": 5,
149
  "avg_latency_sec": null,
150
- "submission_date": "2026-03-10"
151
  },
152
  {
153
- "agent_name": "DeepSeek V3",
154
- "agent_id": "deepseek-v3-benchmark",
155
  "mode": "benchmark",
156
- "mcp_custom": false,
157
  "submission_type": "llm",
158
- "organization": "DeepSeek",
159
- "overall_score": 50.5,
 
160
  "component_scores": {
161
- "approach": 7.1,
162
- "orchestration": 7.2,
163
- "quality": 16.1,
164
- "feasibility": 13.2,
165
- "novelty": 4.1,
166
- "diversity": 3.0
167
  },
168
  "taxonomy_scores": {
169
- "redesign": {
170
- "antibody": 51,
171
- "enzyme": 52,
172
- "fluorescent_protein": 50,
173
- "scaffold": 60
174
- },
175
  "de_novo": {
176
- "binder": 54,
177
- "enzyme": 40,
178
- "fluorescent_protein": 40,
179
- "scaffold": 48,
180
- "antibody": 46
 
 
 
 
 
 
181
  }
182
  },
183
  "tasks_completed": 76,
184
  "tasks_total": 76,
185
  "tasks_with_zero": 2,
186
  "avg_latency_sec": null,
187
- "submission_date": "2026-03-10"
188
  },
189
  {
190
  "agent_name": "GPT-5",
191
  "agent_id": "gpt5-user",
192
  "mode": "user",
193
- "mcp_custom": false,
194
  "submission_type": "llm",
195
  "organization": "OpenAI",
196
- "overall_score": 49.2,
 
197
  "component_scores": {
198
- "approach": 7.9,
199
- "orchestration": 7.6,
200
- "quality": 15.3,
201
- "feasibility": 11.1,
202
- "novelty": 4.1,
203
- "diversity": 3.1
204
  },
205
  "taxonomy_scores": {
206
- "redesign": {
207
- "antibody": 42,
208
- "enzyme": 46,
209
- "fluorescent_protein": 46,
210
- "scaffold": 56
211
- },
212
  "de_novo": {
213
- "binder": 56,
214
- "enzyme": 40,
215
- "fluorescent_protein": 55,
216
- "scaffold": 47,
217
- "antibody": 52
 
 
 
 
 
 
218
  }
219
  },
220
  "tasks_completed": 76,
221
  "tasks_total": 76,
222
- "tasks_with_zero": 3,
223
  "avg_latency_sec": null,
224
- "submission_date": "2026-03-10"
225
  },
226
  {
227
- "agent_name": "Claude Sonnet 4.5",
228
- "agent_id": "sonnet-4.5-user",
229
- "mode": "user",
 
 
230
  "mcp_custom": false,
231
- "submission_type": "llm",
232
- "organization": "Anthropic",
233
- "overall_score": 47.9,
234
  "component_scores": {
235
- "approach": 8.6,
236
- "orchestration": 7.8,
237
- "quality": 15.0,
238
- "feasibility": 10.9,
239
- "novelty": 3.4,
240
- "diversity": 2.2
241
  },
242
  "taxonomy_scores": {
243
- "redesign": {
244
- "antibody": 42,
245
- "enzyme": 47,
246
- "fluorescent_protein": 56,
247
- "scaffold": 32
248
- },
249
  "de_novo": {
250
- "binder": 59,
251
- "enzyme": 48,
252
- "fluorescent_protein": 45,
253
- "scaffold": 39,
254
- "antibody": 48
 
 
 
 
 
 
255
  }
256
  },
257
  "tasks_completed": 76,
258
  "tasks_total": 76,
259
- "tasks_with_zero": 6,
260
  "avg_latency_sec": null,
261
- "submission_date": "2026-03-10"
262
  },
263
  {
264
  "agent_name": "Claude Sonnet 4.5",
265
- "agent_id": "sonnet-4.5-benchmark",
266
- "mode": "benchmark",
267
- "mcp_custom": false,
268
  "submission_type": "llm",
269
  "organization": "Anthropic",
270
- "overall_score": 42.3,
 
271
  "component_scores": {
272
- "approach": 6.0,
273
- "orchestration": 6.2,
274
- "quality": 13.8,
275
- "feasibility": 11.4,
276
- "novelty": 3.2,
277
- "diversity": 1.7
278
  },
279
  "taxonomy_scores": {
280
- "redesign": {
281
- "antibody": 36,
282
- "enzyme": 37,
283
- "fluorescent_protein": 54,
284
- "scaffold": 23
285
- },
286
  "de_novo": {
287
- "binder": 49,
288
- "enzyme": 38,
289
- "fluorescent_protein": 41,
290
- "scaffold": 41,
291
- "antibody": 33
 
 
 
 
 
 
292
  }
293
  },
294
  "tasks_completed": 76,
295
  "tasks_total": 76,
296
- "tasks_with_zero": 9,
297
  "avg_latency_sec": null,
298
- "submission_date": "2026-03-10"
299
  },
300
  {
301
- "agent_name": "GPT-5",
302
- "agent_id": "gpt5-benchmark",
303
  "mode": "benchmark",
304
- "mcp_custom": false,
305
  "submission_type": "llm",
306
- "organization": "OpenAI",
307
- "overall_score": 41.0,
 
308
  "component_scores": {
309
- "approach": 5.2,
310
- "orchestration": 4.9,
311
- "quality": 15.0,
312
- "feasibility": 11.5,
313
- "novelty": 3.5,
314
- "diversity": 0.9
315
  },
316
  "taxonomy_scores": {
317
- "redesign": {
318
- "antibody": 30,
319
- "enzyme": 36,
320
- "fluorescent_protein": 54,
321
- "scaffold": 41
322
- },
323
  "de_novo": {
324
- "binder": 44,
325
- "enzyme": 22,
326
- "fluorescent_protein": 44,
327
- "scaffold": 39,
328
- "antibody": 40
 
 
 
 
 
 
329
  }
330
  },
331
  "tasks_completed": 76,
332
  "tasks_total": 76,
333
- "tasks_with_zero": 5,
334
  "avg_latency_sec": null,
335
- "submission_date": "2026-03-10"
336
  },
337
  {
338
  "agent_name": "Gemini 2.5 Pro",
339
  "agent_id": "gemini-2.5-pro-user",
340
  "mode": "user",
341
- "mcp_custom": false,
342
  "submission_type": "llm",
343
  "organization": "Google",
344
- "overall_score": 26.2,
 
345
  "component_scores": {
346
- "approach": 0.0,
347
- "orchestration": 0.0,
348
- "quality": 10.3,
349
- "feasibility": 10.9,
350
- "novelty": 3.5,
351
- "diversity": 1.5
352
  },
353
  "taxonomy_scores": {
354
- "redesign": {
355
- "antibody": 22,
356
- "enzyme": 29,
357
- "fluorescent_protein": 29,
358
- "scaffold": 21
359
- },
360
  "de_novo": {
361
- "binder": 36,
362
- "enzyme": 8,
363
- "fluorescent_protein": 0,
364
- "scaffold": 19,
365
- "antibody": 31
 
 
 
 
 
 
366
  }
367
  },
368
  "tasks_completed": 76,
369
  "tasks_total": 76,
370
- "tasks_with_zero": 15,
371
  "avg_latency_sec": null,
372
- "submission_date": "2026-03-10"
373
  },
374
  {
375
  "agent_name": "Gemini 2.5 Pro",
376
  "agent_id": "gemini-2.5-pro-benchmark",
377
  "mode": "benchmark",
378
- "mcp_custom": false,
379
  "submission_type": "llm",
380
  "organization": "Google",
381
- "overall_score": 25.8,
 
382
  "component_scores": {
383
- "approach": 0.0,
384
- "orchestration": 0.0,
385
- "quality": 10.1,
386
- "feasibility": 10.7,
387
- "novelty": 3.4,
388
- "diversity": 1.6
389
  },
390
  "taxonomy_scores": {
391
- "redesign": {
392
- "antibody": 31,
393
- "enzyme": 26,
394
- "fluorescent_protein": 32,
395
- "scaffold": 14
396
- },
397
  "de_novo": {
398
- "binder": 34,
399
- "enzyme": 8,
400
- "fluorescent_protein": 0,
401
- "scaffold": 18,
402
- "antibody": 30
 
 
 
 
 
 
403
  }
404
  },
405
  "tasks_completed": 76,
406
  "tasks_total": 76,
407
- "tasks_with_zero": 17,
408
  "avg_latency_sec": null,
409
- "submission_date": "2026-03-10"
410
  }
411
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  }
 
1
  {
2
+ "last_updated": "2026-04-14",
3
+ "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
4
+ "headline_findings": [
5
+ "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
6
+ "All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
7
+ "Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
8
+ "Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond binary tool selection.",
9
+ "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (-2.3) \u2014 evidence that depth, not process change alone, drives the gain."
10
+ ],
11
+ "scoring": {
12
+ "rubric_max": 100,
13
+ "components": {
14
+ "approach": 20,
15
+ "orchestration": 15,
16
+ "quality": 35,
17
+ "feasibility": 15,
18
+ "novelty": 5,
19
+ "diversity": 10
20
+ },
21
+ "method": "Hybrid: 72 algorithmic points (Boltz-2 verification) + 28 LLM-judge points (3-judge panel with self-exclusion)."
22
+ },
23
  "entries": [
24
  {
25
+ "agent_name": "Human Oracle",
26
  "agent_id": "oracle",
27
  "mode": null,
28
+ "submission_type": "human_oracle",
29
+ "organization": "Romero Lab",
30
  "mcp_custom": false,
31
+ "overall_score": 74.85,
 
 
32
  "component_scores": {
33
  "approach": 20.0,
34
  "orchestration": 15.0,
35
+ "quality": 26.24,
36
+ "feasibility": 10.26,
37
+ "novelty": 2.93,
38
+ "diversity": 0.43
39
  },
40
  "taxonomy_scores": {
 
 
 
 
 
 
41
  "de_novo": {
42
+ "antibody": 79.2,
43
+ "binder": 71.8,
44
+ "enzyme": 75.6,
45
+ "fluorescent_protein": 78.7,
46
+ "scaffold": 75.8
47
+ },
48
+ "redesign": {
49
+ "antibody": 69.2,
50
+ "enzyme": 76.2,
51
+ "fluorescent_protein": 77.1,
52
+ "scaffold": 76.8
53
  }
54
  },
55
  "tasks_completed": 76,
56
  "tasks_total": 76,
57
  "tasks_with_zero": 0,
58
  "avg_latency_sec": null,
59
+ "submission_date": "2026-04-06"
60
  },
61
  {
62
  "agent_name": "Human Expert",
63
  "agent_id": "human-expert",
64
  "mode": null,
 
65
  "submission_type": "human_expert",
66
  "organization": "Romero Lab",
67
+ "mcp_custom": false,
68
+ "overall_score": 61.25,
69
  "component_scores": {
70
+ "approach": 13.81,
71
+ "orchestration": 8.86,
72
+ "quality": 20.91,
73
+ "feasibility": 10.79,
74
+ "novelty": 3.46,
75
+ "diversity": 3.43
76
  },
77
  "taxonomy_scores": {
 
 
 
 
 
 
78
  "de_novo": {
79
+ "antibody": 65.6,
80
+ "binder": 65.0,
81
+ "enzyme": 55.3,
82
+ "fluorescent_protein": 57.2,
83
+ "scaffold": 65.4
84
+ },
85
+ "redesign": {
86
+ "antibody": 52.4,
87
+ "enzyme": 59.5,
88
+ "fluorescent_protein": 54.6,
89
+ "scaffold": 53.7
90
  }
91
  },
92
  "tasks_completed": 76,
93
  "tasks_total": 76,
94
  "tasks_with_zero": 0,
95
  "avg_latency_sec": null,
96
+ "submission_date": "2026-04-06"
97
  },
98
  {
99
  "agent_name": "DeepSeek V3",
100
+ "agent_id": "deepseek-v3-benchmark",
101
+ "mode": "benchmark",
 
102
  "submission_type": "llm",
103
  "organization": "DeepSeek",
104
+ "mcp_custom": false,
105
+ "overall_score": 60.43,
106
  "component_scores": {
107
+ "approach": 11.4,
108
+ "orchestration": 9.36,
109
+ "quality": 22.07,
110
+ "feasibility": 10.77,
111
+ "novelty": 3.44,
112
+ "diversity": 3.38
113
  },
114
  "taxonomy_scores": {
 
 
 
 
 
 
115
  "de_novo": {
116
+ "antibody": 65.0,
117
+ "binder": 63.4,
118
+ "enzyme": 53.9,
119
+ "fluorescent_protein": 72.3,
120
+ "scaffold": 57.8
121
+ },
122
+ "redesign": {
123
+ "antibody": 61.3,
124
+ "enzyme": 59.3,
125
+ "fluorescent_protein": 56.9,
126
+ "scaffold": 66.9
127
  }
128
  },
129
  "tasks_completed": 76,
130
  "tasks_total": 76,
131
  "tasks_with_zero": 1,
132
  "avg_latency_sec": null,
133
+ "submission_date": "2026-04-06"
134
  },
135
  {
136
+ "agent_name": "DeepSeek V3",
137
+ "agent_id": "deepseek-v3-user",
138
+ "mode": "user",
139
+ "submission_type": "llm",
140
+ "organization": "DeepSeek",
141
  "mcp_custom": false,
142
+ "overall_score": 58.46,
 
 
143
  "component_scores": {
144
+ "approach": 11.09,
145
+ "orchestration": 9.14,
146
+ "quality": 21.74,
147
+ "feasibility": 9.91,
148
+ "novelty": 3.25,
149
+ "diversity": 3.33
150
  },
151
  "taxonomy_scores": {
 
 
 
 
 
 
152
  "de_novo": {
153
+ "antibody": 65.6,
154
+ "binder": 63.0,
155
+ "enzyme": 64.2,
156
+ "fluorescent_protein": 64.2,
157
+ "scaffold": 60.4
158
+ },
159
+ "redesign": {
160
+ "antibody": 61.6,
161
+ "enzyme": 60.7,
162
+ "fluorescent_protein": 43.0,
163
+ "scaffold": 44.1
164
  }
165
  },
166
  "tasks_completed": 76,
167
  "tasks_total": 76,
168
+ "tasks_with_zero": 7,
169
  "avg_latency_sec": null,
170
+ "submission_date": "2026-04-06"
171
  },
172
  {
173
+ "agent_name": "GPT-5",
174
+ "agent_id": "gpt5-benchmark",
175
  "mode": "benchmark",
 
176
  "submission_type": "llm",
177
+ "organization": "OpenAI",
178
+ "mcp_custom": false,
179
+ "overall_score": 55.61,
180
  "component_scores": {
181
+ "approach": 8.76,
182
+ "orchestration": 6.84,
183
+ "quality": 22.96,
184
+ "feasibility": 10.03,
185
+ "novelty": 3.27,
186
+ "diversity": 3.75
187
  },
188
  "taxonomy_scores": {
 
 
 
 
 
 
189
  "de_novo": {
190
+ "antibody": 62.6,
191
+ "binder": 59.9,
192
+ "enzyme": 55.9,
193
+ "fluorescent_protein": 53.9,
194
+ "scaffold": 56.1
195
+ },
196
+ "redesign": {
197
+ "antibody": 47.3,
198
+ "enzyme": 54.4,
199
+ "fluorescent_protein": 49.5,
200
+ "scaffold": 54.6
201
  }
202
  },
203
  "tasks_completed": 76,
204
  "tasks_total": 76,
205
  "tasks_with_zero": 2,
206
  "avg_latency_sec": null,
207
+ "submission_date": "2026-04-06"
208
  },
209
  {
210
  "agent_name": "GPT-5",
211
  "agent_id": "gpt5-user",
212
  "mode": "user",
 
213
  "submission_type": "llm",
214
  "organization": "OpenAI",
215
+ "mcp_custom": false,
216
+ "overall_score": 55.26,
217
  "component_scores": {
218
+ "approach": 9.46,
219
+ "orchestration": 8.29,
220
+ "quality": 20.83,
221
+ "feasibility": 9.9,
222
+ "novelty": 3.2,
223
+ "diversity": 3.58
224
  },
225
  "taxonomy_scores": {
 
 
 
 
 
 
226
  "de_novo": {
227
+ "antibody": 61.2,
228
+ "binder": 56.1,
229
+ "enzyme": 57.9,
230
+ "fluorescent_protein": 61.3,
231
+ "scaffold": 55.6
232
+ },
233
+ "redesign": {
234
+ "antibody": 52.1,
235
+ "enzyme": 54.2,
236
+ "fluorescent_protein": 55.7,
237
+ "scaffold": 46.3
238
  }
239
  },
240
  "tasks_completed": 76,
241
  "tasks_total": 76,
242
+ "tasks_with_zero": 4,
243
  "avg_latency_sec": null,
244
+ "submission_date": "2026-04-06"
245
  },
246
  {
247
+ "agent_name": "Hardcoded Pipeline",
248
+ "agent_id": "hardcoded-pipeline",
249
+ "mode": null,
250
+ "submission_type": "hardcoded",
251
+ "organization": "Deterministic",
252
  "mcp_custom": false,
253
+ "overall_score": 54.2,
 
 
254
  "component_scores": {
255
+ "approach": 10.19,
256
+ "orchestration": 8.3,
257
+ "quality": 19.91,
258
+ "feasibility": 10.26,
259
+ "novelty": 2.48,
260
+ "diversity": 3.08
261
  },
262
  "taxonomy_scores": {
 
 
 
 
 
 
263
  "de_novo": {
264
+ "antibody": 60.8,
265
+ "binder": 59.8,
266
+ "enzyme": 46.0,
267
+ "fluorescent_protein": 62.6,
268
+ "scaffold": 55.0
269
+ },
270
+ "redesign": {
271
+ "antibody": 45.4,
272
+ "enzyme": 50.7,
273
+ "fluorescent_protein": 49.5,
274
+ "scaffold": 50.3
275
  }
276
  },
277
  "tasks_completed": 76,
278
  "tasks_total": 76,
279
+ "tasks_with_zero": 0,
280
  "avg_latency_sec": null,
281
+ "submission_date": "2026-04-06"
282
  },
283
  {
284
  "agent_name": "Claude Sonnet 4.5",
285
+ "agent_id": "sonnet-4.5-user",
286
+ "mode": "user",
 
287
  "submission_type": "llm",
288
  "organization": "Anthropic",
289
+ "mcp_custom": false,
290
+ "overall_score": 50.23,
291
  "component_scores": {
292
+ "approach": 9.63,
293
+ "orchestration": 8.54,
294
+ "quality": 17.31,
295
+ "feasibility": 9.03,
296
+ "novelty": 2.68,
297
+ "diversity": 3.05
298
  },
299
  "taxonomy_scores": {
 
 
 
 
 
 
300
  "de_novo": {
301
+ "antibody": 66.3,
302
+ "binder": 56.5,
303
+ "enzyme": 56.9,
304
+ "fluorescent_protein": 62.8,
305
+ "scaffold": 57.9
306
+ },
307
+ "redesign": {
308
+ "antibody": 43.1,
309
+ "enzyme": 37.5,
310
+ "fluorescent_protein": 32.8,
311
+ "scaffold": 42.0
312
  }
313
  },
314
  "tasks_completed": 76,
315
  "tasks_total": 76,
316
+ "tasks_with_zero": 16,
317
  "avg_latency_sec": null,
318
+ "submission_date": "2026-04-06"
319
  },
320
  {
321
+ "agent_name": "Claude Sonnet 4.5",
322
+ "agent_id": "sonnet-4.5-benchmark",
323
  "mode": "benchmark",
 
324
  "submission_type": "llm",
325
+ "organization": "Anthropic",
326
+ "mcp_custom": false,
327
+ "overall_score": 41.17,
328
  "component_scores": {
329
+ "approach": 7.92,
330
+ "orchestration": 6.93,
331
+ "quality": 13.54,
332
+ "feasibility": 8.2,
333
+ "novelty": 2.25,
334
+ "diversity": 2.33
335
  },
336
  "taxonomy_scores": {
 
 
 
 
 
 
337
  "de_novo": {
338
+ "antibody": 29.5,
339
+ "binder": 55.5,
340
+ "enzyme": 29.6,
341
+ "fluorescent_protein": 45.9,
342
+ "scaffold": 41.2
343
+ },
344
+ "redesign": {
345
+ "antibody": 34.6,
346
+ "enzyme": 29.5,
347
+ "fluorescent_protein": 35.3,
348
+ "scaffold": 40.9
349
  }
350
  },
351
  "tasks_completed": 76,
352
  "tasks_total": 76,
353
+ "tasks_with_zero": 23,
354
  "avg_latency_sec": null,
355
+ "submission_date": "2026-04-06"
356
  },
357
  {
358
  "agent_name": "Gemini 2.5 Pro",
359
  "agent_id": "gemini-2.5-pro-user",
360
  "mode": "user",
 
361
  "submission_type": "llm",
362
  "organization": "Google",
363
+ "mcp_custom": false,
364
+ "overall_score": 8.75,
365
  "component_scores": {
366
+ "approach": 3.37,
367
+ "orchestration": 2.79,
368
+ "quality": 0.55,
369
+ "feasibility": 1.15,
370
+ "novelty": 0.49,
371
+ "diversity": 0.41
372
  },
373
  "taxonomy_scores": {
 
 
 
 
 
 
374
  "de_novo": {
375
+ "antibody": 10.8,
376
+ "binder": 9.3,
377
+ "enzyme": 30.2,
378
+ "fluorescent_protein": 3.1,
379
+ "scaffold": 9.2
380
+ },
381
+ "redesign": {
382
+ "antibody": 8.0,
383
+ "enzyme": 4.9,
384
+ "fluorescent_protein": 6.8,
385
+ "scaffold": 8.6
386
  }
387
  },
388
  "tasks_completed": 76,
389
  "tasks_total": 76,
390
+ "tasks_with_zero": 74,
391
  "avg_latency_sec": null,
392
+ "submission_date": "2026-04-06"
393
  },
394
  {
395
  "agent_name": "Gemini 2.5 Pro",
396
  "agent_id": "gemini-2.5-pro-benchmark",
397
  "mode": "benchmark",
 
398
  "submission_type": "llm",
399
  "organization": "Google",
400
+ "mcp_custom": false,
401
+ "overall_score": 8.11,
402
  "component_scores": {
403
+ "approach": 3.58,
404
+ "orchestration": 2.47,
405
+ "quality": 0.34,
406
+ "feasibility": 0.93,
407
+ "novelty": 0.42,
408
+ "diversity": 0.37
409
  },
410
  "taxonomy_scores": {
 
 
 
 
 
 
411
  "de_novo": {
412
+ "antibody": 9.1,
413
+ "binder": 9.2,
414
+ "enzyme": 11.0,
415
+ "fluorescent_protein": 3.1,
416
+ "scaffold": 9.1
417
+ },
418
+ "redesign": {
419
+ "antibody": 7.3,
420
+ "enzyme": 4.4,
421
+ "fluorescent_protein": 6.2,
422
+ "scaffold": 11.4
423
  }
424
  },
425
  "tasks_completed": 76,
426
  "tasks_total": 76,
427
+ "tasks_with_zero": 75,
428
  "avg_latency_sec": null,
429
+ "submission_date": "2026-04-06"
430
  }
431
+ ],
432
+ "interventions": {
433
+ "description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
434
+ "n_tasks": 18,
435
+ "rows": [
436
+ {
437
+ "label": "DeepSeek V3 \u2014 baseline",
438
+ "condition": "baseline",
439
+ "agent": "deepseek-v3-tools-benchmark",
440
+ "n_tasks": 18,
441
+ "score": 58.72,
442
+ "delta_vs_baseline": 0.0,
443
+ "approach": 13.44,
444
+ "orchestration": 11.17,
445
+ "quality": 16.11,
446
+ "diversity": 3.56
447
+ },
448
+ {
449
+ "label": "GPT-5 \u2014 baseline",
450
+ "condition": "baseline",
451
+ "agent": "gpt5-tools-benchmark",
452
+ "n_tasks": 18,
453
+ "score": 46.78,
454
+ "delta_vs_baseline": 0.0,
455
+ "approach": 8.33,
456
+ "orchestration": 6.22,
457
+ "quality": 15.39,
458
+ "diversity": 3.94
459
+ },
460
+ {
461
+ "label": "Human Expert \u2014 baseline",
462
+ "condition": "baseline",
463
+ "agent": "human-expert-agent",
464
+ "n_tasks": 18,
465
+ "score": 56.67,
466
+ "delta_vs_baseline": 0.0,
467
+ "approach": 18.28,
468
+ "orchestration": 9.28,
469
+ "quality": 11.06,
470
+ "diversity": 2.28
471
+ },
472
+ {
473
+ "label": "DeepSeek V3 \u2014 forced depth",
474
+ "condition": "forced_depth",
475
+ "agent": "deepseek-v3-forced-depth",
476
+ "n_tasks": 18,
477
+ "score": 68.06,
478
+ "delta_vs_baseline": 9.34,
479
+ "approach": 18.39,
480
+ "orchestration": 12.28,
481
+ "quality": 16.11,
482
+ "diversity": 3.94
483
+ },
484
+ {
485
+ "label": "GPT-5 \u2014 forced depth",
486
+ "condition": "forced_depth",
487
+ "agent": "gpt5-tools-forced-depth",
488
+ "n_tasks": 18,
489
+ "score": 62.67,
490
+ "delta_vs_baseline": 15.89,
491
+ "approach": 18.28,
492
+ "orchestration": 11.67,
493
+ "quality": 15.0,
494
+ "diversity": 3.06
495
+ },
496
+ {
497
+ "label": "DeepSeek V3 \u2014 low diversity",
498
+ "condition": "low_diversity_control",
499
+ "agent": "deepseek-v3-low-diversity",
500
+ "n_tasks": 18,
501
+ "score": 56.39,
502
+ "delta_vs_baseline": -2.33,
503
+ "approach": 13.11,
504
+ "orchestration": 11.11,
505
+ "quality": 16.0,
506
+ "diversity": 3.22
507
+ },
508
+ {
509
+ "label": "GPT-5 \u2014 low diversity",
510
+ "condition": "low_diversity_control",
511
+ "agent": "gpt5-tools-low-diversity",
512
+ "n_tasks": 18,
513
+ "score": 61.5,
514
+ "delta_vs_baseline": 14.72,
515
+ "approach": 13.06,
516
+ "orchestration": 12.0,
517
+ "quality": 16.22,
518
+ "diversity": 3.22
519
+ },
520
+ {
521
+ "label": "Human Expert \u2014 shallow",
522
+ "condition": "low_diversity_control",
523
+ "agent": "human-expert-shallow",
524
+ "n_tasks": 18,
525
+ "score": 55.06,
526
+ "delta_vs_baseline": -1.61,
527
+ "approach": 18.22,
528
+ "orchestration": 9.28,
529
+ "quality": 11.17,
530
+ "diversity": 0.61
531
+ }
532
+ ]
533
+ }
534
  }