"""BioDesignBench Leaderboard — Gradio App for HuggingFace Spaces Evaluating LLM Agents on Protein Design via MCP Tools Romero Lab, Duke University Tabs: 1. Overall Leaderboard 2. Taxonomy Breakdown 3. Component Analysis 4. Benchmark vs User 5. Submit (new submission form) 6. Status & Admin (password-protected pipeline control) 7. About """ import json import os from pathlib import Path import gradio as gr import plotly.graph_objects as go ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026") # ═══════════════════════════════════════════════════════════════════ # Configuration — change these when deploying # ═══════════════════════════════════════════════════════════════════ PAPER_URL = "https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1" GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench" HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard" PYPI_URL = "https://pypi.org/project/protein-design-mcp/" # ═══════════════════════════════════════════════════════════════════ # Taxonomy & scoring constants (2 × 5 design matrix) # ═══════════════════════════════════════════════════════════════════ APPROACHES = ["de_novo", "redesign"] APPROACH_LABELS = { "de_novo": "De Novo Design", "redesign": "Redesign", } SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"] SUBJECT_LABELS = { "antibody": "Antibody", "binder": "Binder", "enzyme": "Enzyme", "scaffold": "Scaffold", "fluorescent_protein": "Fluorescent Prot.", } # 9 valid cells (rd × binder is empty in current task set) VALID_CELLS = { "de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"}, "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"}, } N_TASKS_PER_CELL = { ("de_novo", "antibody"): 4, ("de_novo", "binder"): 19, ("de_novo", "enzyme"): 2, ("de_novo", "scaffold"): 21, ("de_novo", "fluorescent_protein"): 1, ("redesign", "antibody"): 5, ("redesign", "enzyme"): 10, ("redesign", "scaffold"): 4, ("redesign", "fluorescent_protein"): 10, } COMPONENTS = [ "approach", "orchestration", "quality", "feasibility", "novelty", "diversity", ] COMP_MAX = { "approach": 20, "orchestration": 15, "quality": 35, "feasibility": 15, "novelty": 5, "diversity": 10, } TYPE_STYLE = { "llm": {"icon": "", "bg": "#ffffff", "tag": ""}, "hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"}, "human_expert": { "icon": "\U0001f468\u200d\U0001f52c", "bg": "#ebf4ff", "tag": "baseline", }, "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"}, # Backward-compat alias for older JSON files "oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"}, } # ═══════════════════════════════════════════════════════════════════ # Data loading # ═══════════════════════════════════════════════════════════════════ def load_data() -> dict: path = Path(__file__).parent / "leaderboard_data.json" with open(path) as f: return json.load(f) # ═══════════════════════════════════════════════════════════════════ # Custom CSS # ═══════════════════════════════════════════════════════════════════ CUSTOM_CSS = """ .gradio-container { max-width: 1200px !important; } .gr-padded { padding: 0 !important; } /* Force light appearance for all inline-styled HTML content */ .dark .gradio-container { --body-background-fill: #f7fafc !important; --block-background-fill: #ffffff !important; --body-text-color: #1a202c !important; --block-label-text-color: #1a202c !important; --input-background-fill: #ffffff !important; --border-color-primary: 
#e2e8f0 !important; --color-accent-soft: rgba(49,130,206,0.15) !important; --neutral-50: #f7fafc !important; --neutral-100: #edf2f7 !important; --neutral-200: #e2e8f0 !important; --neutral-700: #4a5568 !important; --neutral-800: #2d3748 !important; color: #1a202c !important; background: #f7fafc !important; } .dark .tabs { background: #ffffff !important; } .dark .tab-nav button { color: #2d3748 !important; } .dark .tab-nav button.selected { color: #0f172a !important; border-color: #3182ce !important; } .dark .block { background: #ffffff !important; } .dark label, .dark .label-wrap { color: #2d3748 !important; } .dark input, .dark textarea, .dark select { background: #ffffff !important; color: #1a202c !important; border-color: #e2e8f0 !important; } .dark .accordion { background: #ffffff !important; } .dark .accordion > .label-wrap { color: #2d3748 !important; } """ # Force light mode on page load FORCE_LIGHT_JS = """ () => { document.querySelector('body').classList.remove('dark'); const obs = new MutationObserver(() => { document.querySelector('body').classList.remove('dark'); }); obs.observe(document.body, {attributes: true, attributeFilter: ['class']}); setTimeout(() => obs.disconnect(), 5000); } """ # ═══════════════════════════════════════════════════════════════════ # Plotly layout helper # ═══════════════════════════════════════════════════════════════════ def _base_layout(**overrides) -> dict: """Shared Plotly layout defaults, with per-chart overrides.""" base = dict( plot_bgcolor="white", paper_bgcolor="white", font=dict( family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748" ), margin=dict(l=40, r=20, t=50, b=40), ) base.update(overrides) return base # ═══════════════════════════════════════════════════════════════════ # HTML builders # ═══════════════════════════════════════════════════════════════════ def build_header(last_updated: str, n_entries: int) -> str: btn = ( "display:inline-block;padding:0.45rem 1.1rem;border-radius:8px;" "text-decoration:none;font-size:0.82rem;font-weight:600;" "transition:opacity 0.15s" ) return f"""

Romero Lab · Duke University

\U0001f9ec BioDesignBench

Can LLM agents orchestrate stochastic protein-design pipelines?

Top-tier agents now surpass a deterministic pipeline — but invoke evaluation tools at only 14% of expert depth. Guidance rescues coverage, not depth.

\U0001f4c4 Paper \U0001f4bb GitHub \U0001f917 HuggingFace \U0001f4e6 PyPI
76 tasks · 5 molecular families 17 MCP tools {n_entries} conditions Updated {last_updated}
""" # ── Score styling helpers ── def _score_color(s: float) -> str: if s >= 50: return "#38a169" if s >= 25: return "#d69e2e" return "#e53e3e" def _bar_bg(s: float) -> str: if s >= 50: return "rgba(56,161,105,0.15)" if s >= 25: return "rgba(214,158,46,0.15)" return "rgba(229,62,62,0.12)" def _heat_color(val, max_val=95) -> str: if val is None: return "#f7fafc" r = val / max_val if r >= 0.7: return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})" if r >= 0.4: return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})" return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})" # ── Tab 1: Overall leaderboard table ── def build_leaderboard_table( entries: list, mode_f: str, mcp_f: str, type_f: str ) -> str: """Generate the mixed-ranking HTML table with inline styles.""" # Filter filtered = [] for e in entries: st = e["submission_type"] if mode_f != "All" and st == "llm": if (e.get("mode") or "").lower() != mode_f.lower(): continue if mcp_f == "Reference" and e.get("mcp_custom"): continue if mcp_f == "Custom" and not e.get("mcp_custom"): continue if type_f == "LLM Only" and st != "llm": continue if type_f == "Baselines Only" and st == "llm": continue filtered.append(e) filtered.sort(key=lambda x: x["overall_score"], reverse=True) # Shared cell styles TD = ( "padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;" "font-size:0.9rem" ) TH = ( "background:#0f172a;color:white;padding:0.75rem 1rem;" "text-align:left;font-size:0.75rem;text-transform:uppercase;" "letter-spacing:0.05em;font-weight:600" ) rows = [] llm_rank = 0 for e in filtered: st = e["submission_type"] sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"]) is_bl = st != "llm" sc = e["overall_score"] # ── Rank cell ── if is_bl: rank = ( f'{sty["icon"]}' ) else: llm_rank += 1 rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get( llm_rank, "#0f172a" ) rsize = ( "1.1rem" if llm_rank == 1 else ("1.05rem" if llm_rank <= 3 else "0.9rem") ) rank = ( f'" f"{llm_rank}" ) # ── Name cell ── tag_html = "" if sty["tag"]: tag_html = ( ' ' f'{sty["tag"]}' ) icon_pfx = f'{sty["icon"]} ' if sty["icon"] else "" fw = "600" if is_bl else "500" name = ( f'' f'{icon_pfx}{e["agent_name"]}{tag_html}' ) # ── Organization ── org = f'{e["organization"]}' # ── Mode badge ── if is_bl: mode = f'\u2014' elif e.get("mode") == "benchmark": mode = ( f'benchmark' ) else: mode = ( f'user' ) # ── MCP ── if is_bl: mcp = f'\u2014' elif e.get("mcp_custom"): mcp = ( f'custom' ) else: mcp = ( f'reference' ) # ── Score with proportional bar ── scol = _score_color(sc) bbg = _bar_bg(sc) score_cell = ( f'' f'
' f'{sc:.1f}' ) # ── Tasks & zeros ── tc = e.get("tasks_completed", 0) tt = e.get("tasks_total", 76) tasks = f'{tc}/{tt}' zeros = f'{e.get("tasks_with_zero", 0)}' rows.append( f'' f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}" ) return f""" {''.join(rows)}
# Agent Organization Mode MCP Score Tasks Zero-Score
""" # ── Tab 2: Taxonomy heatmap ── def build_heatmap(entry: dict) -> str: """HTML heatmap for one agent across the 2 × 5 design matrix (DesignApproach × MolecularSubject = 9 valid cells; rd × binder is empty). """ ts = entry.get("taxonomy_scores", {}) TH = ( "background:#0f172a;color:white;padding:0.6rem 0.8rem;" "text-align:center;font-size:0.75rem;font-weight:600" ) TD = ( "text-align:center;padding:0.5rem;font-size:0.85rem;" "font-weight:600;border-bottom:1px solid #e2e8f0" ) rows = [] for ap in APPROACHES: cells = [ f'{APPROACH_LABELS[ap]}' ] vals = [] for sj in SUBJECTS: if sj in VALID_CELLS[ap]: val = ts.get(ap, {}).get(sj) bg = _heat_color(val) n = N_TASKS_PER_CELL.get((ap, sj), 0) text = ( f'{val:.0f}
n={n}' if val is not None else "\u2014" ) cells.append(f'{text}') if val is not None: vals.append(val) else: cells.append( f'' "n/a" ) avg = sum(vals) / len(vals) if vals else 0 avg_bg = _heat_color(avg) cells.append( f'' f"{avg:.1f}" ) rows.append(f'{"".join(cells)}') sj_headers = "".join( f'{SUBJECT_LABELS[sj]}' for sj in SUBJECTS ) return f""" {sj_headers} {''.join(rows)}
Approach \u2193 / Subject \u2192Mean
""" # ── Tab 4: Mode comparison cards ── def build_mode_cards(entries: list) -> str: """Per-LLM cards showing benchmark vs user delta.""" by_name: dict[str, dict] = {} for e in entries: if e["submission_type"] != "llm": continue by_name.setdefault(e["agent_name"], {})[e["mode"]] = e ordered = sorted( by_name.items(), key=lambda x: x[1].get("user", {}).get("overall_score", 0), reverse=True, ) cards = [] for name, modes in ordered: bench = modes.get("benchmark") user = modes.get("user") if not bench or not user: continue delta = user["overall_score"] - bench["overall_score"] pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0 lines = [ '
' "Benchmark" f'' f'{bench["overall_score"]:.1f}
', '
' "User" f'' f'{user["overall_score"]:.1f}
', '
' "Delta" f'' f"+{delta:.1f} (+{pct:.0f}%)
", ] for c in COMPONENTS: d = user["component_scores"][c] - bench["component_scores"][c] color = "#38a169" if d >= 0 else "#e53e3e" sign = "+" if d >= 0 else "" lines.append( '
' f'{c}' f'' f"{sign}{d:.1f}
" ) cards.append( '
' f'

{name}

' f'{"".join(lines)}
' ) return ( '
' f'{"".join(cards)}
' ) # ── Headline findings (paper banner) ── def build_headline_findings(findings: list) -> str: """Top-of-page banner that surfaces the paper's three core claims.""" if not findings: return "" cards = [] accents = ["#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e"] for i, text in enumerate(findings): c = accents[i % len(accents)] cards.append( f'
' f'
Finding {i+1}
' f'
{text}
' ) return ( '
' f"{''.join(cards)}
" ) # ── Tab: Depth Gap (intervention experiments) ── def build_intervention_section(interventions: dict) -> str: """Show forced-depth and low-diversity intervention results. The forced-depth condition mandates ≥3 evaluation passes per design candidate; the low-diversity control constrains the candidate pool without forcing depth. Together they isolate evaluation depth as the causal driver of the 'surface competence' gap reported in the paper. """ if not interventions or not interventions.get("rows"): return '

No intervention data available.

' rows = interventions["rows"] cond_meta = { "baseline": ("#64748b", "Baseline"), "forced_depth": ("#38a169", "Forced Depth"), "low_diversity_control": ("#d69e2e", "Low-Diversity Control"), } TH = ( "background:#0f172a;color:white;padding:0.65rem 0.9rem;" "text-align:left;font-size:0.72rem;text-transform:uppercase;" "letter-spacing:0.05em;font-weight:600" ) TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;" "font-size:0.86rem") body = [] for r in rows: color, cond_label = cond_meta.get(r["condition"], ("#64748b", r["condition"])) delta = r.get("delta_vs_baseline") if delta is None or r["condition"] == "baseline": delta_html = '\u2014' else: sign = "+" if delta >= 0 else "" dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b") delta_html = ( f'' f"{sign}{delta:.1f}" ) body.append( f'' f'{r["label"]}' f'{cond_label}' f'{r["score"]:.1f}' f'{delta_html}' f'' f'{r["approach"]:.1f} / {r["orchestration"]:.1f}' f'' f'{r["quality"]:.1f}' f'' f'{r["diversity"]:.1f}' ) n = interventions.get("n_tasks", 18) return f"""

Causal interventions on the depth gap

{interventions.get('description', '')} Reruns are scored on a representative {n}-task subset that spans all 9 occupied taxonomy cells.

Headline: The forced-depth intervention lifts DeepSeek V3 by +9.3 points and GPT-5 by +15.9 without any change to the underlying model or tools, while the low-diversity control hurts DeepSeek V3 (−2.3). The dissociation is cleanest on the strongest agent, where it provides direct causal evidence that evaluation depth — not the mere act of process intervention — drives the gain. GPT-5's response is more uniform across both interventions; we report the raw deltas without smoothing.
{''.join(body)}
Run Condition Score Δ vs baseline Approach / Orch. Quality Diversity

Scoring uses the same 100-point hybrid rubric as the main leaderboard but is restricted to {n} representative tasks; absolute values therefore differ from the full-benchmark mean. The delta vs baseline compares each agent against its own untreated baseline run, isolating the intervention effect.

""" # ── Tab 5: About ── def build_about() -> str: h2 = ( 'style="color:#0f172a;margin:0 0 0.8rem;font-size:1.25rem;' 'font-weight:700"' ) h3 = ( 'style="color:#334155;margin:1.2rem 0 0.5rem;font-size:1rem;' 'font-weight:600"' ) p = 'style="margin-bottom:0.8rem;color:#475569;line-height:1.6"' card = ( 'style="background:#ffffff;border:1px solid #e2e8f0;' 'border-radius:12px;padding:2rem;margin-bottom:1.2rem"' ) stat_box = ( 'style="background:#f8fafc;border:1px solid #e2e8f0;' 'border-radius:10px;padding:1rem;text-align:center"' ) return f"""

What is BioDesignBench?

BioDesignBench is a benchmark for evaluating LLM agents as orchestrators of multi-step stochastic protein-design pipelines. Unlike chemistry- or code-agent benchmarks, where tool chains are largely deterministic, protein design demands repeated sampling from generative tools (RFdiffusion, ProteinMPNN) and iterative validation against several biophysical metrics. We test the full agentic loop — plan → sample → evaluate across multiple metrics → iterate — over 76 expert-curated tasks drawn from the 2024–2026 literature, exposed through 17 MCP-integrated tools.
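A schematic of this loop, in heavily simplified form (every function and tool name below is an illustrative placeholder, not the benchmark backend's actual interface):

def agent_loop(task, tools, max_iters=5):
    # Illustrative pseudocode only; the real loop runs inside the benchmark backend
    # and calls the 17 MCP tools, not these placeholder methods.
    plan = tools.plan(task)                         # choose tools and strategy
    for _ in range(max_iters):
        backbones = tools.sample_backbones(plan)    # stochastic generation (e.g. RFdiffusion)
        sequences = tools.inverse_fold(backbones)   # e.g. ProteinMPNN
        metrics = tools.evaluate(sequences)         # structure re-prediction + biophysical checks
        if tools.meets_criteria(metrics):           # the evaluation step agents tend to under-use
            break
        plan = tools.revise(plan, metrics)          # iterate on the weak candidates
    return sequences, metrics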

76
design tasks
9
taxonomy cells
(2 approaches \u00d7 5 subjects)
17
MCP tools
100
point rubric

Three principal findings

1. Top-tier agents now beat a deterministic pipeline

DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded pipeline (54.2) under both modes. Autonomous protein-design orchestration is no longer infeasible — but a substantial gap to the human expert (61.3) and oracle (74.9) remains.

2. Coverage–depth dissociation

Workflow guidance closes the coverage gap (Rescue Index up to +3.01) but leaves utilization depth unchanged (Rescue Index \u2248 0). Better tool documentation can teach agents which tools to call, but cannot teach them to call those tools with the iterative depth that expert practice demands.

3. Evaluation depth, not tool knowledge, is the bottleneck

Across 836 task–condition observations, evaluation depth per candidate correlates with total score at ρ = 0.685 (p < 10⁻¹¹⁷). LLM agents generate backbone candidates at expert-level rates but evaluate each one at only 14% of expert depth. Forced-depth interventions confirm this is causal — see the Depth Gap tab.

How to submit

Unlike most agent benchmarks, you do not host an endpoint for your agent. The 76 task descriptions never leave Romero Lab infrastructure. Instead you provide:

  1. an LLM provider + API key (Anthropic / OpenAI / Google / DeepSeek). We run the BioDesignBench agent loop against your chosen model inside the leaderboard backend. Your key is scrubbed from our records immediately after the dispatch phase completes.
  2. optionally, a custom MCP URL if you want to evaluate your own tool implementations. Otherwise, the agent calls our reference protein-design-mcp endpoint (in progress).

Data flow

Each task prompt is sent to your chosen LLM provider via their standard API (Anthropic, OpenAI, Google, DeepSeek) — that single channel is the only path by which task data leaves Romero Lab. The MCP server (reference or custom) only ever sees operational tool arguments (sequences, PDB paths, hotspot residues); it never sees the raw task prompt or evaluation criteria. Every task prompt also carries a unique 16-character canary token as an HTML comment, for retrospective leakage detection.
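A minimal sketch of how such a token might be generated and attached (illustrative only; the real dispatcher lives in the leaderboard backend):

import secrets

def attach_canary(task_prompt):
    canary = secrets.token_hex(8)   # 8 random bytes -> 16 hex characters, unique per dispatch
    tagged = task_prompt + "\n<!-- canary:" + canary + " -->"   # hidden HTML comment
    return tagged, canary           # stored so later model outputs can be screened for leakage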

Bring your own tools (Custom MCP)

If you want to benchmark a new tool implementation (a faster structure predictor, a different diffusion backbone, your own stability model) against the same 76 tasks and rubric, stand up an HTTPS endpoint that satisfies the MCP contract and paste the URL into the submission form's Advanced: Custom MCP section:
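A minimal sketch of such an endpoint, assuming the standard MCP Python SDK (FastMCP); the tool name and signature below are placeholders, not the required contract, which is documented in the GitHub repo:

# Illustrative sketch assuming the MCP Python SDK; the tool names and schemas
# required by BioDesignBench are defined in the GitHub repo, not here.
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("my-protein-tools")

@mcp.tool()
def predict_structure(sequence: str) -> str:
    # call your own structure predictor and return a path or identifier
    ...

if __name__ == "__main__":
    mcp.run(transport="streamable-http")   # serve over HTTPS behind your reverse proxy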

Limits

Scoring rubric (100 points, hybrid)

Scores combine 72 algorithmic points from deterministic biophysical metrics with 28 LLM-judge points assessed by a 3-judge panel (PoLL) that excludes the evaluated model to mitigate self-preference bias. Each component is capped at its rubric maximum to prevent double counting; a short capping sketch follows the component list below.

Approach (20 pts) — strategic appropriateness of tool selection across 10 functional categories (backbone generation, inverse folding, structure prediction, etc.).

Orchestration (15 pts) — pipeline ordering, intermediate validation, and adaptive iteration.

Quality (35 pts) — 100% algorithmic. Continuous 4-band interpolation over Boltz-2 re-prediction metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM-judge variance on biophysical quantities.

Feasibility (15 pts) — valid amino acids, length constraints, composition, and biophysical plausibility.

Novelty (5 pts) — sequence identity to reference (lower identity = more novel).

Diversity (10 pts) — number and pairwise diversity of generated designs.
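A minimal sketch of the capped aggregation described above (illustrative only; the official scorer is eval_scorer.aggregate_scores):

RUBRIC_MAX = dict(approach=20, orchestration=15, quality=35,
                  feasibility=15, novelty=5, diversity=10)   # totals 100 points

def hybrid_total(component_scores):
    # cap each component at its rubric maximum before summing,
    # so no single component can contribute more than its share
    return sum(min(component_scores.get(c, 0.0), RUBRIC_MAX[c]) for c in RUBRIC_MAX)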

Five-layer contamination defense

Every evaluated LLM may have read protein-design literature during pretraining, so we use a layered defense:

Citation

@article{{biodesignbench2026,
  title={{Evaluating LLM-Driven Protein Design:
         Agents Lack Iterative Evaluation Depth}},
  author={{Kim, Jeonghyeon and Romero, Philip}},
  journal={{bioRxiv}},
  year={{2026}},
  doi={{10.64898/2026.05.06.723381}},
  url={{https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1}}
}}
""" # ═══════════════════════════════════════════════════════════════════ # Chart builders (Plotly) # ═══════════════════════════════════════════════════════════════════ def chart_taxonomy_bar(entry: dict) -> go.Figure: """Grouped bar chart of mean score per molecular subject, split by design approach (de novo vs redesign). """ ts = entry.get("taxonomy_scores", {}) x_labels = [SUBJECT_LABELS[s] for s in SUBJECTS] def _series(ap): out = [] for sj in SUBJECTS: if sj in VALID_CELLS[ap]: out.append(ts.get(ap, {}).get(sj)) else: out.append(None) return out dn = _series("de_novo") rd = _series("redesign") fig = go.Figure() fig.add_trace(go.Bar( x=x_labels, y=dn, name="De Novo", marker_color="rgba(49,130,206,0.78)", text=[f"{v:.0f}" if v is not None else "" for v in dn], textposition="outside", )) fig.add_trace(go.Bar( x=x_labels, y=rd, name="Redesign", marker_color="rgba(214,158,46,0.78)", text=[f"{v:.0f}" if v is not None else "" for v in rd], textposition="outside", )) mode = entry.get("mode") or "\u2014" fig.update_layout( **_base_layout( barmode="group", title=dict( text=f"{entry['agent_name']} ({mode}) \u2014 Mean Score by Cell", font_size=14, ), yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"), xaxis=dict(title=""), legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5), height=340, ) ) return fig def chart_radar(e1: dict, e2: dict) -> go.Figure: """Radar chart comparing two agents' component scores (% of max).""" labels = [c.capitalize() for c in COMPONENTS] def norm(e): return [e["component_scores"][c] / COMP_MAX[c] * 100 for c in COMPONENTS] v1, v2 = norm(e1), norm(e2) m1 = e1.get("mode") or "\u2014" m2 = e2.get("mode") or "\u2014" fig = go.Figure() fig.add_trace( go.Scatterpolar( r=v1 + [v1[0]], theta=labels + [labels[0]], fill="toself", name=f'{e1["agent_name"]} ({m1})', line=dict(color="rgba(49,130,206,0.8)"), fillcolor="rgba(49,130,206,0.15)", ) ) fig.add_trace( go.Scatterpolar( r=v2 + [v2[0]], theta=labels + [labels[0]], fill="toself", name=f'{e2["agent_name"]} ({m2})', line=dict(color="rgba(229,62,62,0.8)"), fillcolor="rgba(229,62,62,0.15)", ) ) fig.update_layout( **_base_layout( polar=dict( radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%") ), showlegend=True, legend=dict( orientation="h", yanchor="bottom", y=-0.25, xanchor="center", x=0.5, ), title=dict(text="Component Radar (% of max)", font_size=14), height=420, ) ) return fig def chart_component_bar(e1: dict, e2: dict) -> go.Figure: """Horizontal bar chart of raw component scores for two agents.""" labels = [f"{c.capitalize()} (/{COMP_MAX[c]})" for c in COMPONENTS] m1 = e1.get("mode") or "\u2014" m2 = e2.get("mode") or "\u2014" fig = go.Figure() fig.add_trace( go.Bar( y=labels, x=[e1["component_scores"][c] for c in COMPONENTS], name=f'{e1["agent_name"]} ({m1})', orientation="h", marker_color="rgba(49,130,206,0.7)", ) ) fig.add_trace( go.Bar( y=labels, x=[e2["component_scores"][c] for c in COMPONENTS], name=f'{e2["agent_name"]} ({m2})', orientation="h", marker_color="rgba(229,62,62,0.7)", ) ) fig.update_layout( **_base_layout( barmode="group", xaxis=dict(title="Score"), title=dict(text="Component Breakdown", font_size=14), legend=dict( orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5, ), height=420, ) ) return fig def chart_mode_comparison(entries: list) -> go.Figure: """Grouped bar chart: benchmark vs user mode for each LLM.""" by_name: dict[str, dict[str, float]] = {} for e in entries: if e["submission_type"] != "llm": continue 
by_name.setdefault(e["agent_name"], {})[e["mode"]] = e["overall_score"] ordered = sorted( by_name.items(), key=lambda x: x[1].get("user", 0), reverse=True, ) names = [n for n, _ in ordered] bench = [m.get("benchmark", 0) for _, m in ordered] user = [m.get("user", 0) for _, m in ordered] fig = go.Figure() fig.add_trace( go.Bar( x=names, y=bench, name="Benchmark Mode", marker_color="rgba(229,62,62,0.6)", ) ) fig.add_trace( go.Bar( x=names, y=user, name="User Mode", marker_color="rgba(56,161,105,0.6)", ) ) fig.update_layout( **_base_layout( barmode="group", yaxis=dict(range=[0, 80], title="Overall hybrid score"), xaxis=dict(title=""), title=dict( text=("Unguided (Benchmark) vs Guided (User) modes \u2014 " "guidance lifts coverage but rarely shifts overall score"), font_size=13, ), legend=dict( orientation="h", yanchor="bottom", y=-0.18, xanchor="center", x=0.5, ), height=380, ) ) return fig # ═══════════════════════════════════════════════════════════════════ # Gradio application # ═══════════════════════════════════════════════════════════════════ def create_app() -> gr.Blocks: data = load_data() entries = data["entries"] by_id = {e["agent_id"]: e for e in entries} # Build dropdown choices: (display_label, agent_id) agent_choices = [] for e in entries: sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"]) icon = sty["icon"] mode = e.get("mode") or "\u2014" label = f"{icon} {e['agent_name']} ({mode})".strip() agent_choices.append((label, e["agent_id"])) # Safe index helper def _choice_val(idx: int) -> str: return agent_choices[min(idx, len(agent_choices) - 1)][1] with gr.Blocks( theme=gr.themes.Soft(primary_hue="blue"), css=CUSTOM_CSS, js=FORCE_LIGHT_JS, ) as app: gr.HTML(build_header(data["last_updated"], len(entries))) gr.HTML(build_headline_findings(data.get("headline_findings", []))) with gr.Tabs(): # ════════ Tab 1: Overall Leaderboard ════════ with gr.Tab("\U0001f4ca Overall"): with gr.Row(): f_mode = gr.Dropdown( ["All", "Benchmark", "User"], value="All", label="Mode", scale=1, ) f_mcp = gr.Dropdown( ["All", "Reference", "Custom"], value="All", label="MCP Tools", scale=1, ) f_type = gr.Dropdown( ["All Entries", "LLM Only", "Baselines Only"], value="All Entries", label="Show", scale=1, ) tbl = gr.HTML( build_leaderboard_table( entries, "All", "All", "All Entries" ) ) def _update_table(m, mc, t): return build_leaderboard_table(entries, m, mc, t) for dd in [f_mode, f_mcp, f_type]: dd.change( _update_table, [f_mode, f_mcp, f_type], tbl ) # ════════ Tab 2: Taxonomy Breakdown ════════ with gr.Tab("\U0001f9ec Taxonomy"): tax_dd = gr.Dropdown( agent_choices, value=_choice_val(0), label="Select Agent", ) hm_html = gr.HTML(build_heatmap(entries[0])) tax_plot = gr.Plot(chart_taxonomy_bar(entries[0])) def _update_taxonomy(aid): e = by_id.get(aid, entries[0]) return build_heatmap(e), chart_taxonomy_bar(e) tax_dd.change( _update_taxonomy, [tax_dd], [hm_html, tax_plot] ) # ════════ Tab 3: Component Analysis ════════ with gr.Tab("\U0001f3af Components"): with gr.Row(): c1 = gr.Dropdown( agent_choices, value=_choice_val(0), label="Agent 1", scale=1, ) c2 = gr.Dropdown( agent_choices, value=_choice_val(4), label="Agent 2", scale=1, ) with gr.Row(): radar = gr.Plot( chart_radar( entries[0], entries[min(4, len(entries) - 1)], ) ) comp_bar = gr.Plot( chart_component_bar( entries[0], entries[min(4, len(entries) - 1)], ) ) def _update_comp(a1, a2): e1 = by_id.get(a1, entries[0]) e2 = by_id.get(a2, entries[-1]) return chart_radar(e1, e2), chart_component_bar(e1, e2) for dd in [c1, c2]: 
dd.change(_update_comp, [c1, c2], [radar, comp_bar]) # ════════ Tab 4: Benchmark vs User (coverage-depth dissociation) ════════ with gr.Tab("\u26a1 Guidance Effect"): gr.HTML( '
' 'Mode semantics: ' 'Benchmark mode exposes atomic tools without ' 'pipeline hints (unguided); User mode packages ' 'them into composite workflows with explicit pipeline ' 'structure (guided). Guidance lifts the lowest-tier ' 'agents but does not consistently help capable ones, ' 'and never closes the depth gap (see Depth Gap ' 'tab).
' ) gr.Plot(chart_mode_comparison(entries)) gr.HTML(build_mode_cards(entries)) # ════════ Tab 5: Depth Gap (interventions) ════════ with gr.Tab("\U0001f50d Depth Gap"): gr.HTML(build_intervention_section( data.get("interventions", {}) )) # ══════ Tab: Submit ══════ with gr.Tab("\U0001f4e4 Submit"): gr.HTML("""

Submit your agent

BioDesignBench evaluates models inside Romero Lab infrastructure to keep the 76 task specifications contamination-clean. You provide an LLM API key and a model name, and we run the BioDesignBench agent loop against your model with the reference 17-tool MCP server. Task content never leaves Romero Lab except through your chosen LLM provider's API call.

How your credentials are handled:
Reference vs Custom MCP
Rate limit: 1 submission per calendar month per organization. LLM API costs are billed to your own account; GPU costs for the reference MCP server are covered by Romero Lab. Please be considerate.
""") with gr.Column(scale=1): sub_agent = gr.Textbox( label="Agent Name", placeholder="e.g., GPT-5 with reference MCP", ) sub_org = gr.Textbox( label="Organization", placeholder="e.g., OpenAI", ) with gr.Row(): sub_provider = gr.Dropdown( choices=[ ("Anthropic Claude", "anthropic"), ("OpenAI GPT", "openai"), ("Google Gemini", "google"), ("DeepSeek", "deepseek"), ], value="anthropic", label="LLM Provider", ) sub_model = gr.Textbox( label="Model name", placeholder="e.g., claude-sonnet-4-20250514", ) sub_api_key = gr.Textbox( label="API key (transient -- scrubbed after dispatch)", placeholder="sk-...", type="password", ) sub_desc = gr.Textbox( label="Description (optional)", placeholder="Brief description of your agent...", lines=2, ) with gr.Accordion("Advanced: Custom MCP", open=False): sub_custom_mcp_url = gr.Textbox( label="Custom MCP URL (optional)", placeholder="https://your-mcp.example.com/predict", ) sub_custom_mcp_token = gr.Textbox( label="Custom MCP bearer token (optional)", placeholder="(empty if your MCP needs no auth)", type="password", ) sub_btn = gr.Button( "Submit for Review", variant="primary", ) sub_result = gr.HTML() def _handle_submit( name, org, provider, model, api_key, desc, custom_mcp_url, custom_mcp_token, ): if not name or not org or not model or not api_key: return ('
' "agent name, organization, model name, and " "API key are required.
") try: from eval_queue import submit result = submit( agent_name=name, organization=org, provider=provider, model_name=model, api_key=api_key, description=desc, custom_mcp_url=custom_mcp_url or "", custom_mcp_token=custom_mcp_token or "", ) if "error" in result: return (f'
' f'{result["error"]}
') mcp_mode = "custom" if custom_mcp_url else "reference" return ( f'
' f'Submitted! ' f'ID: {result["submission_id"]}
' f'Status: {result["status"]}
' f'Provider: {provider} ' f'/ Model: {model}
' f'MCP mode: {mcp_mode}
' f'Canary: {result.get("canary_token","")}
' f'{result.get("message", "")}
' ) except Exception as e: return (f'
' f"Error: {str(e)[:200]}
") sub_btn.click( _handle_submit, [sub_agent, sub_org, sub_provider, sub_model, sub_api_key, sub_desc, sub_custom_mcp_url, sub_custom_mcp_token], sub_result, ) # ══════ Tab 6: Status & Admin ══════ with gr.Tab("\U0001f6e0 Status"): gr.HTML("""

Submission status

Check your submission status or manage the pipeline (admin only).

""") # --- Public status check --- with gr.Accordion("Check Submission Status", open=True): status_id = gr.Textbox( label="Submission ID", placeholder="Enter your submission ID...", ) status_btn = gr.Button("Check Status") status_out = gr.HTML() def _check_status(sid): if not sid: return '
Enter an ID above.
' try: from eval_queue import get_submission sub = get_submission(sid.strip()) if sub is None: return ('
' "Submission not found.
") status_color = { "pending": "#d69e2e", "approved": "#38a169", "dispatching": "#3182ce", "boltz": "#805ad5", "scoring": "#805ad5", "complete": "#38a169", "failed": "#e53e3e", "rejected": "#e53e3e", }.get(sub["status"], "#718096") score_html = "" if sub.get("overall_score") is not None: score_html = ( f'
' f'Score: {sub["overall_score"]:.1f}/100' f'
' ) return ( f'
' f'{sub["agent_name"]} ' f'({sub["organization"]})
' f'Status: {sub["status"]}
' f'Tasks: {sub.get("tasks_dispatched", 0)}' f'/{sub.get("tasks_total", 76)}
' f'Created: {sub.get("created_at", "")[:10]}' f'{score_html}
' ) except Exception as e: return f'
{e}
' status_btn.click(_check_status, [status_id], status_out) # --- Admin panel (password-protected) --- with gr.Accordion("Admin Panel", open=False): admin_pw = gr.Textbox( label="Admin Password", type="password", ) admin_auth_btn = gr.Button("Authenticate") admin_panel = gr.Column(visible=False) admin_msg = gr.HTML() with admin_panel: gr.HTML('

' 'Pending Submissions

') pending_html = gr.HTML() refresh_btn = gr.Button("Refresh List") with gr.Row(): approve_id = gr.Textbox( label="Submission ID to Approve/Reject", scale=2, ) approve_btn = gr.Button( "Approve", variant="primary", scale=1, ) reject_btn = gr.Button( "Reject", variant="stop", scale=1, ) approve_msg = gr.HTML() gr.HTML('

' 'Pipeline Control

') with gr.Row(): dispatch_id = gr.Textbox( label="Submission ID", scale=2, ) dispatch_btn = gr.Button( "Phase A: Dispatch Tasks", scale=1, ) with gr.Row(): boltz_id = gr.Textbox( label="Submission ID", scale=2, ) boltz_btn = gr.Button( "Phase B: Run Boltz (GPU)", scale=1, ) with gr.Row(): judge_id = gr.Textbox( label="Submission ID", scale=2, ) judge_btn = gr.Button( "Phase C: Run LLM Judge", scale=1, ) with gr.Row(): final_id = gr.Textbox( label="Submission ID", scale=2, ) final_btn = gr.Button( "Phase D: Finalize & Publish", scale=1, ) pipeline_out = gr.HTML() def _admin_auth(pw): if pw == ADMIN_PASSWORD: return ( gr.Column(visible=True), '
' 'Authenticated.
', ) return ( gr.Column(visible=False), '
' 'Wrong password.
', ) admin_auth_btn.click( _admin_auth, [admin_pw], [admin_panel, admin_msg], ) def _refresh_pending(): try: from eval_queue import get_pending_submissions pending = get_pending_submissions() if not pending: return "

No pending submissions.

" rows = [] for s in pending: mcp = "custom" if s.get("custom_mcp_url") else "reference" key_state = "set" if s.get("api_key") else "scrubbed" rows.append( f'{s["submission_id"]}' f'{s["agent_name"]}' f'{s["organization"]}' f'{s.get("provider","?")}/{s.get("model_name","?")}' f'{mcp}' f'{key_state}' f'{s.get("created_at","")[:10]}' ) return ( '' "" "" "" + "".join(rows) + "
IDAgentOrgProvider/ModelMCPKeyDate
" ) except Exception as e: return f"

Error: {e}

" refresh_btn.click( _refresh_pending, [], pending_html, ) def _approve_sub(sid): try: from eval_queue import update_status ok = update_status(sid.strip(), "approved") if ok: return ( f'
' f'Approved: {sid}
' ) return ( f'
' f'Failed to approve {sid}
' ) except Exception as e: return f'
{e}
' def _reject_sub(sid): try: from eval_queue import update_status ok = update_status(sid.strip(), "rejected") if ok: return ( f'
' f'Rejected: {sid}
' ) return ( f'
' f'Failed to reject {sid}
' ) except Exception as e: return f'
{e}
' approve_btn.click( _approve_sub, [approve_id], approve_msg, ) reject_btn.click( _reject_sub, [approve_id], approve_msg, ) def _run_dispatch(sid): try: from eval_queue import get_submission from eval_dispatcher import dispatch_all_tasks sub = get_submission(sid.strip()) if sub is None: return ('
' 'Not found
') if sub["status"] not in ("approved", "dispatching"): return ( f'
' f'Cannot dispatch: status=' f'{sub["status"]}
' ) if not sub.get("api_key"): return ( '
' 'API key already scrubbed -- this ' 'submission has already been dispatched. ' 'Resubmit if you need to re-run.
' ) results = dispatch_all_tasks(sid.strip()) ok = sum(1 for r in results if r.get("success")) return ( f'
' f'Dispatched: {ok}/{len(results)} tasks ' f'succeeded. API key scrubbed.
' ) except Exception as e: import traceback return ( f'
' f'Dispatch error: {e}
' f'
'
                                f'{traceback.format_exc()[:600]}
' ) def _run_boltz(sid): try: from eval_queue import get_submission from eval_boltz import run_boltz_posteval sub = get_submission(sid.strip()) if sub is None: return ( '
' 'Not found
' ) per_task = json.loads( sub.get("per_task_results", "{}") ) if not per_task: return ( '
' "No task results to process.
" ) run_boltz_posteval(per_task) from eval_queue import save_task_result for tid, tres in per_task.items(): save_task_result(sid.strip(), tid, tres) return ( '
' "Boltz post-assessment complete.
" ) except Exception as e: return f'
{e}
' def _run_judge(sid): try: import eval_judge as ej from eval_queue import ( get_submission, save_task_result, update_status, ) sub = get_submission(sid.strip()) if sub is None: return ('
' 'Not found
') per_task = json.loads( sub.get("per_task_results", "{}") ) if not per_task: return ('
' "No task results to process.
") update_status(sid.strip(), "scoring") ej.run_judge_panel( per_task, agent_id=sub.get("agent_name", "unknown"), dry_run=False, ) for tid, tres in per_task.items(): save_task_result(sid.strip(), tid, tres) n_done = sum( 1 for r in per_task.values() if r.get("hybrid_total") is not None ) return ( f'
' f"LLM judge complete on {n_done} tasks." "
" ) except Exception as e: import traceback return ( f'
' f'Judge error: {e}
' f'
'
                                f'{traceback.format_exc()[:600]}
' ) def _run_finalize(sid): try: from eval_queue import ( finalize_submission, get_submission, ) from eval_scorer import aggregate_scores sub = get_submission(sid.strip()) if sub is None: return ( '
' 'Not found
' ) per_task = json.loads( sub.get("per_task_results", "{}") ) agg = aggregate_scores(per_task) finalize_submission( sid.strip(), overall_score=agg["overall_score"], component_scores=agg["component_scores"], taxonomy_scores=agg["taxonomy_scores"], ) mode_label = agg.get("scoring_mode", "algo") return ( f'
' f'Finalized! Score: ' f'{agg["overall_score"]:.1f} ' f'(scoring={mode_label})
' ) except Exception as e: return f'
{e}
' dispatch_btn.click( _run_dispatch, [dispatch_id], pipeline_out, ) boltz_btn.click( _run_boltz, [boltz_id], pipeline_out, ) judge_btn.click( _run_judge, [judge_id], pipeline_out, ) final_btn.click( _run_finalize, [final_id], pipeline_out, ) # ══════ Tab 7: About ══════ with gr.Tab("\u2139\ufe0f About"): gr.HTML(build_about()) return app # ═══════════════════════════════════════════════════════════════════ # Entry point # ═══════════════════════════════════════════════════════════════════ if __name__ == "__main__": create_app().launch()