"""BioDesignBench Leaderboard β€” Gradio App for HuggingFace Spaces
Evaluating LLM Agents on Protein Design via MCP Tools
Romero Lab, Duke University
Tabs:
1. Overall Leaderboard
2. Taxonomy Breakdown
3. Component Analysis
4. Benchmark vs User
5. Submit (new submission form)
6. Status & Admin (password-protected pipeline control)
7. About
"""
import json
import os
from pathlib import Path
import gradio as gr
import plotly.graph_objects as go
ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026")
# ═══════════════════════════════════════════════════════════════════
# Configuration β€” change these when deploying
# ═══════════════════════════════════════════════════════════════════
PAPER_URL = "https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1"
GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench"
HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard"
PYPI_URL = "https://pypi.org/project/protein-design-mcp/"
# ═══════════════════════════════════════════════════════════════════
# Taxonomy & scoring constants (2 Γ— 5 design matrix)
# ═══════════════════════════════════════════════════════════════════
APPROACHES = ["de_novo", "redesign"]
APPROACH_LABELS = {
"de_novo": "De Novo Design",
"redesign": "Redesign",
}
SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"]
SUBJECT_LABELS = {
"antibody": "Antibody",
"binder": "Binder",
"enzyme": "Enzyme",
"scaffold": "Scaffold",
"fluorescent_protein": "Fluorescent Prot.",
}
# 9 valid cells (redesign × binder is empty in the current task set)
VALID_CELLS = {
"de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"},
"redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
}
N_TASKS_PER_CELL = {
("de_novo", "antibody"): 4,
("de_novo", "binder"): 19,
("de_novo", "enzyme"): 2,
("de_novo", "scaffold"): 21,
("de_novo", "fluorescent_protein"): 1,
("redesign", "antibody"): 5,
("redesign", "enzyme"): 10,
("redesign", "scaffold"): 4,
("redesign", "fluorescent_protein"): 10,
}
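# Illustrative sanity checks (added as guards, not part of the scoring logic):
# the per-cell counts should cover exactly the occupied cells in VALID_CELLS
# and total the 76-task benchmark advertised elsewhere in this app.
assert sum(N_TASKS_PER_CELL.values()) == 76
assert set(N_TASKS_PER_CELL) == {
    (ap, sj) for ap in APPROACHES for sj in VALID_CELLS[ap]
}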
COMPONENTS = [
"approach",
"orchestration",
"quality",
"feasibility",
"novelty",
"diversity",
]
COMP_MAX = {
"approach": 20,
"orchestration": 15,
"quality": 35,
"feasibility": 15,
"novelty": 5,
"diversity": 10,
}
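# The component maxima are meant to sum to the 100-point hybrid rubric used
# throughout the leaderboard; a small guard (illustrative) protects against
# accidental edits to the constants above.
assert sum(COMP_MAX.values()) == 100, "COMP_MAX must total 100 points"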
TYPE_STYLE = {
"llm": {"icon": "", "bg": "#ffffff", "tag": ""},
"hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"},
"human_expert": {
"icon": "\U0001f468\u200d\U0001f52c",
"bg": "#ebf4ff",
"tag": "baseline",
},
"human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
# Backward-compat alias for older JSON files
"oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
}
# ═══════════════════════════════════════════════════════════════════
# Data loading
# ═══════════════════════════════════════════════════════════════════
def load_data() -> dict:
path = Path(__file__).parent / "leaderboard_data.json"
with open(path) as f:
return json.load(f)
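# Expected shape of leaderboard_data.json, inferred from how the fields are
# consumed in this module (the data file itself remains authoritative):
#
#   {
#     "last_updated": "YYYY-MM-DD",
#     "headline_findings": ["finding 1", ...],
#     "interventions": {"description": "...", "n_tasks": 18, "rows": [...]},
#     "entries": [
#       {
#         "agent_id": "...", "agent_name": "...", "organization": "...",
#         "submission_type": "llm" | "hardcoded" | "human_expert" | "human_oracle",
#         "mode": "benchmark" | "user",          # LLM entries only
#         "mcp_custom": false,
#         "overall_score": 0.0,
#         "component_scores": {"approach": ..., "diversity": ...},
#         "taxonomy_scores": {"de_novo": {"antibody": ...}, "redesign": {...}},
#         "tasks_completed": 0, "tasks_total": 76, "tasks_with_zero": 0
#       }
#     ]
#   }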
# ═══════════════════════════════════════════════════════════════════
# Custom CSS
# ═══════════════════════════════════════════════════════════════════
CUSTOM_CSS = """
.gradio-container { max-width: 1200px !important; }
.gr-padded { padding: 0 !important; }
/* Force light appearance for all inline-styled HTML content */
.dark .gradio-container {
--body-background-fill: #f7fafc !important;
--block-background-fill: #ffffff !important;
--body-text-color: #1a202c !important;
--block-label-text-color: #1a202c !important;
--input-background-fill: #ffffff !important;
--border-color-primary: #e2e8f0 !important;
--color-accent-soft: rgba(49,130,206,0.15) !important;
--neutral-50: #f7fafc !important;
--neutral-100: #edf2f7 !important;
--neutral-200: #e2e8f0 !important;
--neutral-700: #4a5568 !important;
--neutral-800: #2d3748 !important;
color: #1a202c !important;
background: #f7fafc !important;
}
.dark .tabs { background: #ffffff !important; }
.dark .tab-nav button { color: #2d3748 !important; }
.dark .tab-nav button.selected {
color: #0f172a !important;
border-color: #3182ce !important;
}
.dark .block { background: #ffffff !important; }
.dark label, .dark .label-wrap { color: #2d3748 !important; }
.dark input, .dark textarea, .dark select {
background: #ffffff !important;
color: #1a202c !important;
border-color: #e2e8f0 !important;
}
.dark .accordion { background: #ffffff !important; }
.dark .accordion > .label-wrap { color: #2d3748 !important; }
"""
# Force light mode on page load
FORCE_LIGHT_JS = """
() => {
document.querySelector('body').classList.remove('dark');
const obs = new MutationObserver(() => {
document.querySelector('body').classList.remove('dark');
});
obs.observe(document.body, {attributes: true, attributeFilter: ['class']});
setTimeout(() => obs.disconnect(), 5000);
}
"""
# ═══════════════════════════════════════════════════════════════════
# Plotly layout helper
# ═══════════════════════════════════════════════════════════════════
def _base_layout(**overrides) -> dict:
"""Shared Plotly layout defaults, with per-chart overrides."""
base = dict(
plot_bgcolor="white",
paper_bgcolor="white",
font=dict(
family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748"
),
margin=dict(l=40, r=20, t=50, b=40),
)
base.update(overrides)
return base
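# Typical usage (illustrative): per-chart settings are passed as keyword
# overrides and merged on top of the shared defaults, e.g.
#
#     fig.update_layout(**_base_layout(barmode="group", height=340))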
# ═══════════════════════════════════════════════════════════════════
# HTML builders
# ═══════════════════════════════════════════════════════════════════
def build_header(last_updated: str, n_entries: int) -> str:
btn = (
"display:inline-block;padding:0.45rem 1.1rem;border-radius:8px;"
"text-decoration:none;font-size:0.82rem;font-weight:600;"
"transition:opacity 0.15s"
)
return f"""
<div style="background:#ffffff;border:1px solid #e2e8f0;
padding:2.2rem 2rem 1.8rem;text-align:center;
border-radius:16px;margin-bottom:0.8rem;
box-shadow:0 1px 4px rgba(0,0,0,0.04)">
<p style="margin:0 0 0.3rem;font-size:0.75rem;font-weight:700;
letter-spacing:0.12em;text-transform:uppercase;
color:#3182ce">Romero Lab &middot; Duke University</p>
<h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
letter-spacing:-0.02em">
\U0001f9ec BioDesignBench</h1>
<p style="color:#0f172a;margin:0.6rem 0 0.2rem;font-size:1.1rem;
font-weight:600;line-height:1.4">
Can LLM agents orchestrate stochastic protein-design pipelines?</p>
<p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
font-weight:400;font-style:italic;max-width:680px;
margin-left:auto;margin-right:auto;line-height:1.5">
Top-tier agents now surpass a deterministic pipeline &mdash;
but invoke evaluation tools at only <strong>14% of expert depth</strong>.
Guidance rescues coverage, not depth.</p>
<div style="margin-top:1rem;display:flex;justify-content:center;
gap:0.6rem;flex-wrap:wrap">
<a href="{PAPER_URL}" target="_blank"
style="{btn};background:#0f172a;color:#ffffff">
\U0001f4c4 Paper</a>
<a href="{GITHUB_URL}" target="_blank"
style="{btn};background:#f1f5f9;color:#334155">
\U0001f4bb GitHub</a>
<a href="{HF_URL}" target="_blank"
style="{btn};background:#f1f5f9;color:#334155">
\U0001f917 HuggingFace</a>
<a href="{PYPI_URL}" target="_blank"
style="{btn};background:#f1f5f9;color:#334155">
\U0001f4e6 PyPI</a>
</div>
<div style="margin-top:1rem;display:flex;justify-content:center;
gap:1.5rem;flex-wrap:wrap">
<span style="font-size:0.78rem;color:#94a3b8">
76 tasks &middot; 5 molecular families</span>
<span style="font-size:0.78rem;color:#94a3b8">
17 MCP tools</span>
<span style="font-size:0.78rem;color:#94a3b8">
{n_entries} conditions</span>
<span style="font-size:0.78rem;color:#94a3b8">
Updated {last_updated}</span>
</div>
</div>"""
# ── Score styling helpers ──
def _score_color(s: float) -> str:
if s >= 50:
return "#38a169"
if s >= 25:
return "#d69e2e"
return "#e53e3e"
def _bar_bg(s: float) -> str:
if s >= 50:
return "rgba(56,161,105,0.15)"
if s >= 25:
return "rgba(214,158,46,0.15)"
return "rgba(229,62,62,0.12)"
def _heat_color(val, max_val=95) -> str:
if val is None:
return "#f7fafc"
r = val / max_val
if r >= 0.7:
return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})"
if r >= 0.4:
return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})"
return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})"
# ── Tab 1: Overall leaderboard table ──
def build_leaderboard_table(
entries: list, mode_f: str, mcp_f: str, type_f: str
) -> str:
"""Generate the mixed-ranking HTML table with inline styles."""
# Filter
filtered = []
for e in entries:
st = e["submission_type"]
if mode_f != "All" and st == "llm":
if (e.get("mode") or "").lower() != mode_f.lower():
continue
if mcp_f == "Reference" and e.get("mcp_custom"):
continue
if mcp_f == "Custom" and not e.get("mcp_custom"):
continue
if type_f == "LLM Only" and st != "llm":
continue
if type_f == "Baselines Only" and st == "llm":
continue
filtered.append(e)
filtered.sort(key=lambda x: x["overall_score"], reverse=True)
# Shared cell styles
TD = (
"padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;"
"font-size:0.9rem"
)
TH = (
"background:#0f172a;color:white;padding:0.75rem 1rem;"
"text-align:left;font-size:0.75rem;text-transform:uppercase;"
"letter-spacing:0.05em;font-weight:600"
)
rows = []
llm_rank = 0
for e in filtered:
st = e["submission_type"]
sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"])
is_bl = st != "llm"
sc = e["overall_score"]
# ── Rank cell ──
if is_bl:
rank = (
f'<td style="{TD};text-align:center;font-size:1.1rem;'
f'width:50px">{sty["icon"]}</td>'
)
else:
llm_rank += 1
rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get(
llm_rank, "#0f172a"
)
rsize = (
"1.1rem"
if llm_rank == 1
else ("1.05rem" if llm_rank <= 3 else "0.9rem")
)
rank = (
f'<td style="{TD};text-align:center;font-weight:700;'
f"color:{rcolor};font-size:{rsize};width:50px\">"
f"{llm_rank}</td>"
)
# ── Name cell ──
tag_html = ""
if sty["tag"]:
tag_html = (
' <span style="font-size:0.7rem;background:#e2e8f0;'
"padding:0.1rem 0.4rem;border-radius:3px;color:#4a5568;"
f'margin-left:0.3rem;vertical-align:middle">'
f'{sty["tag"]}</span>'
)
icon_pfx = f'{sty["icon"]} ' if sty["icon"] else ""
fw = "600" if is_bl else "500"
name = (
f'<td style="{TD};font-weight:{fw}">'
f'{icon_pfx}{e["agent_name"]}{tag_html}</td>'
)
# ── Organization ──
org = f'<td style="{TD}">{e["organization"]}</td>'
# ── Mode badge ──
if is_bl:
mode = f'<td style="{TD};color:#718096">\u2014</td>'
elif e.get("mode") == "benchmark":
mode = (
f'<td style="{TD}"><span style="background:#fed7d7;'
"color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
'font-size:0.75rem;font-weight:600">benchmark</span></td>'
)
else:
mode = (
f'<td style="{TD}"><span style="background:#c6f6d5;'
"color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
'font-size:0.75rem;font-weight:600">user</span></td>'
)
# ── MCP ──
if is_bl:
mcp = f'<td style="{TD};color:#718096">\u2014</td>'
elif e.get("mcp_custom"):
mcp = (
f'<td style="{TD}"><span style="background:#fef3c7;'
"color:#92400e;padding:0.15rem 0.55rem;border-radius:4px;"
'font-size:0.72rem;font-weight:700">custom</span></td>'
)
else:
mcp = (
f'<td style="{TD}"><span style="background:#dbeafe;'
"color:#1e40af;padding:0.15rem 0.55rem;border-radius:4px;"
'font-size:0.72rem;font-weight:700">reference</span></td>'
)
# ── Score with proportional bar ──
scol = _score_color(sc)
bbg = _bar_bg(sc)
score_cell = (
f'<td style="{TD};font-weight:700;font-size:1rem;color:{scol};'
f'position:relative;font-variant-numeric:tabular-nums">'
f'<div style="position:absolute;left:0;top:0;bottom:0;'
f"width:{sc}%;background:{bbg};"
f'border-radius:3px"></div>'
f'<span style="position:relative">{sc:.1f}</span></td>'
)
# ── Tasks & zeros ──
tc = e.get("tasks_completed", 0)
tt = e.get("tasks_total", 76)
tasks = f'<td style="{TD}">{tc}/{tt}</td>'
zeros = f'<td style="{TD}">{e.get("tasks_with_zero", 0)}</td>'
rows.append(
f'<tr style="background:{sty["bg"]}">'
f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}</tr>"
)
return f"""
<table style="width:100%;border-collapse:collapse;background:white;
border-radius:10px;overflow:hidden;
box-shadow:0 1px 3px rgba(0,0,0,0.08)">
<thead><tr>
<th style="{TH};width:50px">#</th>
<th style="{TH}">Agent</th>
<th style="{TH}">Organization</th>
<th style="{TH}">Mode</th>
<th style="{TH}">MCP</th>
<th style="{TH}">Score</th>
<th style="{TH}">Tasks</th>
<th style="{TH}">Zero-Score</th>
</tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>"""
# ── Tab 2: Taxonomy heatmap ──
def build_heatmap(entry: dict) -> str:
"""HTML heatmap for one agent across the 2 Γ— 5 design matrix
(DesignApproach Γ— MolecularSubject = 9 valid cells; rd Γ— binder is empty).
"""
ts = entry.get("taxonomy_scores", {})
TH = (
"background:#0f172a;color:white;padding:0.6rem 0.8rem;"
"text-align:center;font-size:0.75rem;font-weight:600"
)
TD = (
"text-align:center;padding:0.5rem;font-size:0.85rem;"
"font-weight:600;border-bottom:1px solid #e2e8f0"
)
rows = []
for ap in APPROACHES:
cells = [
f'<td style="{TD};text-align:left;font-weight:700;'
f'background:#f8fafc;color:#0f172a">{APPROACH_LABELS[ap]}</td>'
]
vals = []
for sj in SUBJECTS:
if sj in VALID_CELLS[ap]:
val = ts.get(ap, {}).get(sj)
bg = _heat_color(val)
n = N_TASKS_PER_CELL.get((ap, sj), 0)
text = (
f'{val:.0f}<br><span style="font-size:0.65rem;'
f'font-weight:400;color:#64748b">n={n}</span>'
if val is not None
else "\u2014"
)
cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
if val is not None:
vals.append(val)
else:
cells.append(
f'<td style="{TD};color:#cbd5e0;font-weight:400">'
"n/a</td>"
)
avg = sum(vals) / len(vals) if vals else 0
avg_bg = _heat_color(avg)
cells.append(
f'<td style="{TD};font-weight:700;background:{avg_bg}">'
f"{avg:.1f}</td>"
)
rows.append(f'<tr>{"".join(cells)}</tr>')
sj_headers = "".join(
f'<th style="{TH}">{SUBJECT_LABELS[sj]}</th>'
for sj in SUBJECTS
)
return f"""
<table style="width:100%;border-collapse:collapse;background:white;
border-radius:10px;overflow:hidden;
box-shadow:0 1px 3px rgba(0,0,0,0.08)">
<thead><tr>
<th style="{TH};text-align:left">Approach \u2193 / Subject \u2192</th>
{sj_headers}
<th style="{TH}">Mean</th>
</tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>"""
# ── Tab 4: Mode comparison cards ──
def build_mode_cards(entries: list) -> str:
"""Per-LLM cards showing benchmark vs user delta."""
by_name: dict[str, dict] = {}
for e in entries:
if e["submission_type"] != "llm":
continue
by_name.setdefault(e["agent_name"], {})[e["mode"]] = e
ordered = sorted(
by_name.items(),
key=lambda x: x[1].get("user", {}).get("overall_score", 0),
reverse=True,
)
cards = []
for name, modes in ordered:
bench = modes.get("benchmark")
user = modes.get("user")
if not bench or not user:
continue
delta = user["overall_score"] - bench["overall_score"]
pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0
lines = [
'<div style="display:flex;justify-content:space-between;'
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
"<span>Benchmark</span>"
f'<span style="font-weight:700;color:#e53e3e">'
f'{bench["overall_score"]:.1f}</span></div>',
'<div style="display:flex;justify-content:space-between;'
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
"<span>User</span>"
f'<span style="font-weight:700;color:#d69e2e">'
f'{user["overall_score"]:.1f}</span></div>',
'<div style="display:flex;justify-content:space-between;'
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
"<span>Delta</span>"
f'<span style="font-weight:700;color:#38a169">'
f"+{delta:.1f} (+{pct:.0f}%)</span></div>",
]
for c in COMPONENTS:
d = user["component_scores"][c] - bench["component_scores"][c]
color = "#38a169" if d >= 0 else "#e53e3e"
sign = "+" if d >= 0 else ""
lines.append(
'<div style="display:flex;justify-content:space-between;'
'padding:0.3rem 0;border-bottom:1px solid #e2e8f0;'
'font-size:0.85rem">'
f'<span style="color:#718096">{c}</span>'
f'<span style="font-weight:700;color:{color}">'
f"{sign}{d:.1f}</span></div>"
)
cards.append(
'<div style="background:white;border-radius:10px;padding:1.2rem;'
'box-shadow:0 1px 3px rgba(0,0,0,0.08)">'
f'<h4 style="font-size:0.95rem;color:#0f172a;'
f'margin:0 0 0.8rem">{name}</h4>'
f'{"".join(lines)}</div>'
)
return (
'<div style="display:grid;grid-template-columns:'
'repeat(auto-fit,minmax(250px,1fr));gap:1rem;margin-top:1rem">'
f'{"".join(cards)}</div>'
)
# ── Headline findings (paper banner) ──
def build_headline_findings(findings: list) -> str:
"""Top-of-page banner that surfaces the paper's three core claims."""
if not findings:
return ""
cards = []
accents = ["#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e"]
for i, text in enumerate(findings):
c = accents[i % len(accents)]
cards.append(
f'<div style="background:#ffffff;border:1px solid #e2e8f0;'
f"border-left:4px solid {c};border-radius:10px;"
f'padding:0.85rem 1rem;flex:1 1 220px;min-width:220px;'
f'box-shadow:0 1px 3px rgba(0,0,0,0.04)">'
f'<div style="font-size:0.7rem;font-weight:700;'
f'color:{c};letter-spacing:0.08em;text-transform:uppercase;'
f'margin-bottom:0.35rem">Finding {i+1}</div>'
f'<div style="font-size:0.82rem;color:#1a202c;'
f'line-height:1.45">{text}</div></div>'
)
return (
'<div style="display:flex;flex-wrap:wrap;gap:0.7rem;'
'margin:0.4rem 0 1rem">'
f"{''.join(cards)}</div>"
)
# ── Tab: Depth Gap (intervention experiments) ──
def build_intervention_section(interventions: dict) -> str:
"""Show forced-depth and low-diversity intervention results.
The forced-depth condition mandates ≥3 evaluation passes per design
candidate; the low-diversity control constrains the candidate pool
without forcing depth. Together they isolate evaluation depth as the
causal driver of the 'surface competence' gap reported in the paper.
"""
if not interventions or not interventions.get("rows"):
return '<p style="color:#718096">No intervention data available.</p>'
rows = interventions["rows"]
cond_meta = {
"baseline": ("#64748b", "Baseline"),
"forced_depth": ("#38a169", "Forced Depth"),
"low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
}
TH = (
"background:#0f172a;color:white;padding:0.65rem 0.9rem;"
"text-align:left;font-size:0.72rem;text-transform:uppercase;"
"letter-spacing:0.05em;font-weight:600"
)
TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;"
"font-size:0.86rem")
body = []
for r in rows:
color, cond_label = cond_meta.get(r["condition"], ("#64748b", r["condition"]))
delta = r.get("delta_vs_baseline")
if delta is None or r["condition"] == "baseline":
delta_html = '<span style="color:#cbd5e0">\u2014</span>'
else:
sign = "+" if delta >= 0 else ""
dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b")
delta_html = (
f'<span style="color:{dcol};font-weight:700">'
f"{sign}{delta:.1f}</span>"
)
body.append(
f'<tr><td style="{TD};font-weight:600;color:#0f172a">'
f'{r["label"]}</td>'
f'<td style="{TD}"><span style="background:{color}22;'
f"color:{color};padding:0.15rem 0.55rem;border-radius:4px;"
f'font-size:0.72rem;font-weight:700">{cond_label}</span></td>'
f'<td style="{TD};font-weight:700;font-variant-numeric:'
f'tabular-nums">{r["score"]:.1f}</td>'
f'<td style="{TD};font-variant-numeric:tabular-nums">{delta_html}</td>'
f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
f'{r["approach"]:.1f} / {r["orchestration"]:.1f}</td>'
f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
f'{r["quality"]:.1f}</td>'
f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
f'{r["diversity"]:.1f}</td></tr>'
)
n = interventions.get("n_tasks", 18)
return f"""
<div style="max-width:980px;margin:0 auto">
<div style="background:#ffffff;border:1px solid #e2e8f0;
border-radius:12px;padding:1.4rem 1.6rem;
margin-bottom:1rem">
<h2 style="color:#0f172a;margin:0 0 0.5rem;font-size:1.2rem;
font-weight:700">Causal interventions on the depth gap</h2>
<p style="color:#475569;line-height:1.55;margin:0">
{interventions.get('description', '')}
Reruns are scored on a representative <strong>{n}-task</strong>
subset that spans all 9 occupied taxonomy cells.
</p>
</div>
<div style="background:#fefce8;border-left:4px solid #ca8a04;
border-radius:8px;padding:0.95rem 1.1rem;
margin-bottom:1.1rem">
<strong style="color:#713f12">Headline:</strong>
<span style="color:#52340d">
Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
<strong>GPT-5 by +15.9</strong> points without any change to
the underlying model or tools, while the low-diversity control
<em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
cleanest on the strongest agent, where it provides direct
causal evidence that
<strong>evaluation depth &mdash; not the mere act of process
intervention &mdash; drives the gain</strong>. GPT-5's
response is more uniform across both interventions; we
report the raw deltas without smoothing.
</span>
</div>
<table style="width:100%;border-collapse:collapse;background:white;
border-radius:10px;overflow:hidden;
box-shadow:0 1px 3px rgba(0,0,0,0.08)">
<thead><tr>
<th style="{TH}">Run</th>
<th style="{TH}">Condition</th>
<th style="{TH}">Score</th>
<th style="{TH}">&Delta; vs baseline</th>
<th style="{TH}">Approach / Orch.</th>
<th style="{TH}">Quality</th>
<th style="{TH}">Diversity</th>
</tr></thead>
<tbody>{''.join(body)}</tbody>
</table>
<p style="color:#64748b;font-size:0.78rem;margin-top:0.8rem;
line-height:1.5">
Scoring uses the same 100-point hybrid rubric as the main
leaderboard but is restricted to {n} representative tasks;
absolute values therefore differ from the full-benchmark mean.
The <em>delta vs baseline</em> compares each agent against
its own untreated baseline run, isolating the intervention effect.
</p>
</div>
"""
# ── Tab 8: About ──
def build_about() -> str:
h2 = (
'style="color:#0f172a;margin:0 0 0.8rem;font-size:1.25rem;'
'font-weight:700"'
)
h3 = (
'style="color:#334155;margin:1.2rem 0 0.5rem;font-size:1rem;'
'font-weight:600"'
)
p = 'style="margin-bottom:0.8rem;color:#475569;line-height:1.6"'
card = (
'style="background:#ffffff;border:1px solid #e2e8f0;'
'border-radius:12px;padding:2rem;margin-bottom:1.2rem"'
)
stat_box = (
'style="background:#f8fafc;border:1px solid #e2e8f0;'
'border-radius:10px;padding:1rem;text-align:center"'
)
return f"""
<div style="max-width:900px;margin:0 auto">
<div {card}>
<h2 {h2}>What is BioDesignBench?</h2>
<p {p}>
BioDesignBench is a benchmark for evaluating LLM agents as
orchestrators of multi-step <em>stochastic</em> protein-design
pipelines. Unlike chemistry- or code-agent benchmarks, where
tool chains are largely deterministic, protein design demands
repeated sampling from generative tools (RFdiffusion,
ProteinMPNN) and iterative cross-validation through several
biophysical metrics. We test the full agentic loop &mdash;
<strong>plan &rarr; sample &rarr; evaluate across multiple
metrics &rarr; iterate</strong> &mdash; over 76 expert-curated
tasks drawn from 2024&ndash;2026 literature, exposed through
17 MCP-integrated tools.
</p>
<div style="display:grid;grid-template-columns:
repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
margin:1rem 0">
<div {stat_box}>
<div style="font-size:1.8rem;font-weight:800;color:#0f172a">
76</div>
<div style="font-size:0.78rem;color:#64748b">design tasks</div>
</div>
<div {stat_box}>
<div style="font-size:1.8rem;font-weight:800;color:#0f172a">
9</div>
<div style="font-size:0.78rem;color:#64748b">
taxonomy cells<br>(2 approaches \u00d7 5 subjects)</div>
</div>
<div {stat_box}>
<div style="font-size:1.8rem;font-weight:800;color:#0f172a">
17</div>
<div style="font-size:0.78rem;color:#64748b">MCP tools</div>
</div>
<div {stat_box}>
<div style="font-size:1.8rem;font-weight:800;color:#0f172a">
100</div>
<div style="font-size:0.78rem;color:#64748b">point rubric</div>
</div>
</div>
</div>
<div {card}>
<h2 {h2}>Three principal findings</h2>
<h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3>
<p {p}>
DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded
pipeline (54.2) under both modes. Autonomous protein-design
orchestration is no longer infeasible &mdash; but a substantial
gap to the human expert (61.3) and oracle (74.9) remains.
</p>
<h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
<p {p}>
Workflow guidance closes the <em>coverage</em> gap (Rescue
Index up to +3.01) but leaves <em>utilisation depth</em>
unchanged (Rescue Index \u2248 0). Better tool documentation
can teach agents <em>which</em> tools to call, but cannot
teach them to call those tools with the iterative depth that
expert practice demands.
</p>
<h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3>
<p {p}>
Across 836 task&ndash;condition observations, evaluation depth
per candidate correlates with total score at
<strong>&rho; = 0.685</strong>
(<em>p</em> &lt; 10<sup>-117</sup>). LLM agents generate
backbone candidates at expert-level rates but evaluate each
one at only <strong>14% of expert depth</strong>. Forced-depth
interventions confirm this is causal &mdash; see the
<em>Depth Gap</em> tab.
</p>
</div>
<div {card}>
<h2 {h2}>How to submit</h2>
<p {p}>
Unlike most agent benchmarks, <strong>you do not host an HTTP
endpoint</strong>. The 76 task descriptions never leave Romero
Lab infrastructure. Instead you provide:</p>
<ol style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
line-height:1.7">
<li>an <strong>LLM provider + API key</strong>
(Anthropic / OpenAI / Google / DeepSeek).
We run the BioDesignBench agent loop against your chosen
model inside the leaderboard backend. Your key is
<em>scrubbed</em> from our records immediately after the
dispatch phase completes.</li>
<li>optionally, a <strong>custom MCP URL</strong> if you want
to evaluate your own tool implementations. Otherwise, the
agent calls our reference
<a href="https://github.com/jasonkim8652/protein-design-mcp"
style="color:#2563eb;font-weight:600">protein-design-mcp</a>
endpoint (in progress).</li>
</ol>
<h3 {h3}>Data flow</h3>
<p {p}>
Each task prompt is sent to your chosen LLM provider via
their standard API (Anthropic, OpenAI, Google, DeepSeek) &mdash;
that single channel is the only path by which task data leaves
Romero Lab. The MCP server (reference or custom) only ever
sees operational tool arguments (sequences, PDB paths, hotspot
residues); it never sees the raw task prompt or evaluation
criteria. Every task prompt also carries a unique 16-character
canary token as an HTML comment, for retrospective leakage
detection.</p>
<h3 {h3}>Bring your own tools (Custom MCP)</h3>
<p {p}>
If you want to benchmark a new tool implementation (a faster
structure predictor, a different diffusion backbone, your own
stability model) against the same 76 tasks and rubric, stand
up an HTTPS endpoint that satisfies the MCP contract and paste
the URL into the submission form's
<em>Advanced: Custom MCP</em> section:</p>
<ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
line-height:1.7">
<li><strong>Contract + hosting options</strong>:
<a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/README.md#bringing-your-own-mcp-tools"
style="color:#2563eb;font-weight:600">leaderboard README</a></li>
<li><strong>Minimal FastAPI stub (~150 lines)</strong>:
<a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/example_mcp_server.py"
style="color:#2563eb;font-weight:600"><code>example_mcp_server.py</code></a></li>
<li><strong>Reference implementation to fork</strong>:
<a href="https://github.com/jasonkim8652/protein-design-mcp"
style="color:#2563eb;font-weight:600">jasonkim8652/protein-design-mcp</a></li>
</ul>
<h3 {h3}>Limits</h3>
<ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
line-height:1.7">
<li>Maximum 1 submission per calendar month per organization</li>
<li>73 hidden tasks are used for ranking; 3 public example
tasks are available for development</li>
<li>LLM-judge API costs are paid by Romero Lab; your
agent's LLM calls are billed to your provider account</li>
</ul>
</div>
<div {card}>
<h2 {h2}>Scoring rubric (100 points, hybrid)</h2>
<p {p}>
Scores combine <strong>72 algorithmic points</strong> from
deterministic biophysical metrics with
<strong>28 LLM-judge points</strong> assessed by a 3-judge
panel (PoLL) with self-exclusion to mitigate self-preference
bias. Each component is capped at its rubric maximum to
prevent double counting.
</p>
<p {p}>
<strong>Approach (20 pts)</strong> &mdash; strategic
appropriateness of tool selection across 10 functional
categories (backbone generation, inverse folding, structure
prediction, etc.).</p>
<p {p}>
<strong>Orchestration (15 pts)</strong> &mdash; pipeline
ordering, intermediate validation, and adaptive iteration.</p>
<p {p}>
<strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
Continuous 4-band interpolation over Boltz-2 re-prediction
metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
variance on biophysical quantities.</p>
<p {p}>
<strong>Feasibility (15 pts)</strong> &mdash; valid amino
acids, length constraints, composition, and biophysical
plausibility.</p>
<p {p}>
<strong>Novelty (5 pts)</strong> &mdash; sequence identity to
reference (lower identity = more novel).</p>
<p {p}>
<strong>Diversity (10 pts)</strong> &mdash; number and
pairwise diversity of generated designs.</p>
</div>
<div {card}>
<h2 {h2}>Five-layer contamination defense</h2>
<p {p}>Every evaluated LLM may have read protein-design
literature during pretraining, so we use a layered defense:</p>
<ul style="color:#475569;padding-left:1.5rem;
margin-bottom:0.8rem;line-height:1.7">
<li>All 76 tasks derived from publications dated 2024&ndash;2026,
post-dating model training cutoffs.</li>
<li>Task prompts paraphrased and restructured &mdash; no
verbatim passages from source literature.</li>
<li>Targets specified by biological function and structural
constraints, not by name or PDB identifier.</li>
<li>12 decoy tasks with deliberately fabricated targets to
detect memorisation-based responses.</li>
<li>n-gram overlap analysis between agent outputs and source
publications &mdash; no verbatim regurgitation above the
8-gram threshold across any condition.</li>
</ul>
</div>
<div {card}>
<h2 {h2}>Citation</h2>
<pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
border-radius:10px;font-size:0.8rem;
line-height:1.6">@article{{biodesignbench2026,
title={{Evaluating LLM-Driven Protein Design:
Agents Lack Iterative Evaluation Depth}},
author={{Kim, Jeonghyeon and Romero, Philip}},
journal={{bioRxiv}},
year={{2026}},
doi={{10.64898/2026.05.06.723381}},
url={{https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1}}
}}</pre>
</div>
</div>"""
# ═══════════════════════════════════════════════════════════════════
# Chart builders (Plotly)
# ═══════════════════════════════════════════════════════════════════
def chart_taxonomy_bar(entry: dict) -> go.Figure:
"""Grouped bar chart of mean score per molecular subject,
split by design approach (de novo vs redesign).
"""
ts = entry.get("taxonomy_scores", {})
x_labels = [SUBJECT_LABELS[s] for s in SUBJECTS]
def _series(ap):
out = []
for sj in SUBJECTS:
if sj in VALID_CELLS[ap]:
out.append(ts.get(ap, {}).get(sj))
else:
out.append(None)
return out
dn = _series("de_novo")
rd = _series("redesign")
fig = go.Figure()
fig.add_trace(go.Bar(
x=x_labels, y=dn, name="De Novo",
marker_color="rgba(49,130,206,0.78)",
text=[f"{v:.0f}" if v is not None else "" for v in dn],
textposition="outside",
))
fig.add_trace(go.Bar(
x=x_labels, y=rd, name="Redesign",
marker_color="rgba(214,158,46,0.78)",
text=[f"{v:.0f}" if v is not None else "" for v in rd],
textposition="outside",
))
mode = entry.get("mode") or "\u2014"
fig.update_layout(
**_base_layout(
barmode="group",
title=dict(
text=f"{entry['agent_name']} ({mode}) \u2014 Mean Score by Cell",
font_size=14,
),
yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"),
xaxis=dict(title=""),
legend=dict(orientation="h", yanchor="bottom", y=-0.2,
xanchor="center", x=0.5),
height=340,
)
)
return fig
def chart_radar(e1: dict, e2: dict) -> go.Figure:
"""Radar chart comparing two agents' component scores (% of max)."""
labels = [c.capitalize() for c in COMPONENTS]
def norm(e):
return [e["component_scores"][c] / COMP_MAX[c] * 100 for c in COMPONENTS]
v1, v2 = norm(e1), norm(e2)
m1 = e1.get("mode") or "\u2014"
m2 = e2.get("mode") or "\u2014"
fig = go.Figure()
fig.add_trace(
go.Scatterpolar(
r=v1 + [v1[0]],
theta=labels + [labels[0]],
fill="toself",
name=f'{e1["agent_name"]} ({m1})',
line=dict(color="rgba(49,130,206,0.8)"),
fillcolor="rgba(49,130,206,0.15)",
)
)
fig.add_trace(
go.Scatterpolar(
r=v2 + [v2[0]],
theta=labels + [labels[0]],
fill="toself",
name=f'{e2["agent_name"]} ({m2})',
line=dict(color="rgba(229,62,62,0.8)"),
fillcolor="rgba(229,62,62,0.15)",
)
)
fig.update_layout(
**_base_layout(
polar=dict(
radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%")
),
showlegend=True,
legend=dict(
orientation="h", yanchor="bottom", y=-0.25,
xanchor="center", x=0.5,
),
title=dict(text="Component Radar (% of max)", font_size=14),
height=420,
)
)
return fig
def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
"""Horizontal bar chart of raw component scores for two agents."""
labels = [f"{c.capitalize()} (/{COMP_MAX[c]})" for c in COMPONENTS]
m1 = e1.get("mode") or "\u2014"
m2 = e2.get("mode") or "\u2014"
fig = go.Figure()
fig.add_trace(
go.Bar(
y=labels,
x=[e1["component_scores"][c] for c in COMPONENTS],
name=f'{e1["agent_name"]} ({m1})',
orientation="h",
marker_color="rgba(49,130,206,0.7)",
)
)
fig.add_trace(
go.Bar(
y=labels,
x=[e2["component_scores"][c] for c in COMPONENTS],
name=f'{e2["agent_name"]} ({m2})',
orientation="h",
marker_color="rgba(229,62,62,0.7)",
)
)
fig.update_layout(
**_base_layout(
barmode="group",
xaxis=dict(title="Score"),
title=dict(text="Component Breakdown", font_size=14),
legend=dict(
orientation="h", yanchor="bottom", y=-0.3,
xanchor="center", x=0.5,
),
height=420,
)
)
return fig
def chart_mode_comparison(entries: list) -> go.Figure:
"""Grouped bar chart: benchmark vs user mode for each LLM."""
by_name: dict[str, dict[str, float]] = {}
for e in entries:
if e["submission_type"] != "llm":
continue
by_name.setdefault(e["agent_name"], {})[e["mode"]] = e["overall_score"]
ordered = sorted(
by_name.items(),
key=lambda x: x[1].get("user", 0),
reverse=True,
)
names = [n for n, _ in ordered]
bench = [m.get("benchmark", 0) for _, m in ordered]
user = [m.get("user", 0) for _, m in ordered]
fig = go.Figure()
fig.add_trace(
go.Bar(
x=names, y=bench, name="Benchmark Mode",
marker_color="rgba(229,62,62,0.6)",
)
)
fig.add_trace(
go.Bar(
x=names, y=user, name="User Mode",
marker_color="rgba(56,161,105,0.6)",
)
)
fig.update_layout(
**_base_layout(
barmode="group",
yaxis=dict(range=[0, 80], title="Overall hybrid score"),
xaxis=dict(title=""),
title=dict(
text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
"guidance lifts coverage but rarely shifts overall score"),
font_size=13,
),
legend=dict(
orientation="h", yanchor="bottom", y=-0.18,
xanchor="center", x=0.5,
),
height=380,
)
)
return fig
# ═══════════════════════════════════════════════════════════════════
# Gradio application
# ═══════════════════════════════════════════════════════════════════
def create_app() -> gr.Blocks:
data = load_data()
entries = data["entries"]
by_id = {e["agent_id"]: e for e in entries}
# Build dropdown choices: (display_label, agent_id)
agent_choices = []
for e in entries:
sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"])
icon = sty["icon"]
mode = e.get("mode") or "\u2014"
label = f"{icon} {e['agent_name']} ({mode})".strip()
agent_choices.append((label, e["agent_id"]))
# Safe index helper
def _choice_val(idx: int) -> str:
return agent_choices[min(idx, len(agent_choices) - 1)][1]
with gr.Blocks(
theme=gr.themes.Soft(primary_hue="blue"),
css=CUSTOM_CSS,
js=FORCE_LIGHT_JS,
) as app:
gr.HTML(build_header(data["last_updated"], len(entries)))
gr.HTML(build_headline_findings(data.get("headline_findings", [])))
with gr.Tabs():
# ════════ Tab 1: Overall Leaderboard ════════
with gr.Tab("\U0001f4ca Overall"):
with gr.Row():
f_mode = gr.Dropdown(
["All", "Benchmark", "User"],
value="All", label="Mode", scale=1,
)
f_mcp = gr.Dropdown(
["All", "Reference", "Custom"],
value="All", label="MCP Tools", scale=1,
)
f_type = gr.Dropdown(
["All Entries", "LLM Only", "Baselines Only"],
value="All Entries", label="Show", scale=1,
)
tbl = gr.HTML(
build_leaderboard_table(
entries, "All", "All", "All Entries"
)
)
def _update_table(m, mc, t):
return build_leaderboard_table(entries, m, mc, t)
for dd in [f_mode, f_mcp, f_type]:
dd.change(
_update_table, [f_mode, f_mcp, f_type], tbl
)
# ════════ Tab 2: Taxonomy Breakdown ════════
with gr.Tab("\U0001f9ec Taxonomy"):
tax_dd = gr.Dropdown(
agent_choices,
value=_choice_val(0),
label="Select Agent",
)
hm_html = gr.HTML(build_heatmap(entries[0]))
tax_plot = gr.Plot(chart_taxonomy_bar(entries[0]))
def _update_taxonomy(aid):
e = by_id.get(aid, entries[0])
return build_heatmap(e), chart_taxonomy_bar(e)
tax_dd.change(
_update_taxonomy, [tax_dd], [hm_html, tax_plot]
)
# ════════ Tab 3: Component Analysis ════════
with gr.Tab("\U0001f3af Components"):
with gr.Row():
c1 = gr.Dropdown(
agent_choices, value=_choice_val(0),
label="Agent 1", scale=1,
)
c2 = gr.Dropdown(
agent_choices, value=_choice_val(4),
label="Agent 2", scale=1,
)
with gr.Row():
radar = gr.Plot(
chart_radar(
entries[0],
entries[min(4, len(entries) - 1)],
)
)
comp_bar = gr.Plot(
chart_component_bar(
entries[0],
entries[min(4, len(entries) - 1)],
)
)
def _update_comp(a1, a2):
e1 = by_id.get(a1, entries[0])
e2 = by_id.get(a2, entries[-1])
return chart_radar(e1, e2), chart_component_bar(e1, e2)
for dd in [c1, c2]:
dd.change(_update_comp, [c1, c2], [radar, comp_bar])
# ════════ Tab 4: Benchmark vs User (coverage-depth dissociation) ════════
with gr.Tab("\u26a1 Guidance Effect"):
gr.HTML(
'<div style="background:#eff6ff;border-left:4px solid '
'#3182ce;border-radius:8px;padding:0.85rem 1.1rem;'
'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
'line-height:1.55">'
'<strong>Mode semantics:</strong> '
'<em>Benchmark mode</em> exposes atomic tools without '
'pipeline hints (unguided); <em>User mode</em> packages '
'them into composite workflows with explicit pipeline '
'structure (guided). Guidance lifts the lowest-tier '
'agents but does not consistently help capable ones, '
'and never closes the depth gap (see <em>Depth Gap</em> '
'tab).</div>'
)
gr.Plot(chart_mode_comparison(entries))
gr.HTML(build_mode_cards(entries))
# ════════ Tab 5: Depth Gap (interventions) ════════
with gr.Tab("\U0001f50d Depth Gap"):
gr.HTML(build_intervention_section(
data.get("interventions", {})
))
# ══════ Tab 6: Submit ══════
with gr.Tab("\U0001f4e4 Submit"):
gr.HTML("""
<div style="max-width:820px;margin:0 auto;padding:1rem">
<h2 style="color:#0f172a;margin:0 0 0.5rem;
font-weight:700;font-size:1.25rem">
Submit your agent</h2>
<p style="color:#475569;margin-bottom:1rem;line-height:1.6">
BioDesignBench evaluates models inside Romero Lab
infrastructure to keep the 76 task specifications
contamination-clean. You provide an LLM API key and
a model name, and we run the BioDesignBench agent
loop against your model with the reference 17-tool
MCP server. Task content never leaves Romero Lab
except through your chosen LLM provider's API call.
</p>
<div style="background:#dcfce7;border-left:4px solid #15803d;
padding:0.95rem 1.1rem;border-radius:8px;
margin-bottom:1rem;font-size:0.86rem;
color:#14532d;line-height:1.55">
<strong>How your credentials are handled:</strong>
<ul style="margin:0.5rem 0 0 1.1rem;padding:0">
<li>Your API key is stored on the submission row
only between submission and dispatch, then
<strong>scrubbed automatically</strong> regardless
of whether the run succeeded.</li>
<li>Each task carries a unique 16-character canary
token (invisible HTML comment) so we can
retrospectively detect leakage in published
models.</li>
<li>The MCP server (reference or custom) sees
only operational tool arguments, never the raw
task description or evaluation criteria.</li>
</ul>
</div>
<div style="background:#eff6ff;border-left:4px solid #3182ce;
padding:0.95rem 1.1rem;border-radius:8px;
margin-bottom:1rem;font-size:0.86rem;
color:#1e3a8a;line-height:1.55">
<strong>Reference vs Custom MCP</strong>
<ul style="margin:0.5rem 0 0 1.1rem;padding:0">
<li><strong>Reference</strong> (default): your
agent uses our hosted
<a href="https://github.com/jasonkim8652/protein-design-mcp"
style="color:#1d4ed8;font-weight:600">protein-design-mcp</a>
endpoint. Eligible for the reference ranking.</li>
<li><strong>Custom</strong>: provide your own
public MCP URL implementing the same 17-tool
schema. Useful for benchmarking new tool
implementations against an identical model
under identical task prompts. Tagged with a
<code>custom</code> badge.</li>
</ul>
</div>
<div style="background:#fefce8;border-left:3px solid #ca8a04;
padding:0.8rem 1rem;border-radius:6px;
margin-bottom:1rem;font-size:0.85rem;color:#713f12">
<strong>Rate limit:</strong> 1 submission per calendar
month per organization. Your LLM API calls are billed to
your own account; GPU costs for the reference MCP server
are paid by Romero Lab. Please be considerate.
</div>
</div>""")
with gr.Column(scale=1):
sub_agent = gr.Textbox(
label="Agent Name",
placeholder="e.g., GPT-5 with reference MCP",
)
sub_org = gr.Textbox(
label="Organization",
placeholder="e.g., OpenAI",
)
with gr.Row():
sub_provider = gr.Dropdown(
choices=[
("Anthropic Claude", "anthropic"),
("OpenAI GPT", "openai"),
("Google Gemini", "google"),
("DeepSeek", "deepseek"),
],
value="anthropic",
label="LLM Provider",
)
sub_model = gr.Textbox(
label="Model name",
placeholder="e.g., claude-sonnet-4-20250514",
)
sub_api_key = gr.Textbox(
label="API key (transient -- scrubbed after dispatch)",
placeholder="sk-...",
type="password",
)
sub_desc = gr.Textbox(
label="Description (optional)",
placeholder="Brief description of your agent...",
lines=2,
)
with gr.Accordion("Advanced: Custom MCP", open=False):
sub_custom_mcp_url = gr.Textbox(
label="Custom MCP URL (optional)",
placeholder="https://your-mcp.example.com/predict",
)
sub_custom_mcp_token = gr.Textbox(
label="Custom MCP bearer token (optional)",
placeholder="(empty if your MCP needs no auth)",
type="password",
)
sub_btn = gr.Button(
"Submit for Review",
variant="primary",
)
sub_result = gr.HTML()
def _handle_submit(
name, org, provider, model, api_key, desc,
custom_mcp_url, custom_mcp_token,
):
if not name or not org or not model or not api_key:
return ('<div style="color:#e53e3e;padding:0.5rem">'
"agent name, organization, model name, and "
"API key are required.</div>")
try:
from eval_queue import submit
result = submit(
agent_name=name,
organization=org,
provider=provider,
model_name=model,
api_key=api_key,
description=desc,
custom_mcp_url=custom_mcp_url or "",
custom_mcp_token=custom_mcp_token or "",
)
if "error" in result:
return (f'<div style="color:#e53e3e;padding:0.5rem">'
f'{result["error"]}</div>')
mcp_mode = "custom" if custom_mcp_url else "reference"
return (
f'<div style="background:#c6f6d5;padding:1rem;'
f'border-radius:8px;margin-top:0.5rem">'
f'<strong>Submitted!</strong> '
f'ID: <code>{result["submission_id"]}</code><br>'
f'Status: {result["status"]}<br>'
f'Provider: <strong>{provider}</strong> '
f'/ Model: <strong>{model}</strong><br>'
f'MCP mode: <strong>{mcp_mode}</strong><br>'
f'Canary: <code>{result.get("canary_token","")}</code><br>'
f'{result.get("message", "")}</div>'
)
except Exception as e:
return (f'<div style="color:#e53e3e;padding:0.5rem">'
f"Error: {str(e)[:200]}</div>")
sub_btn.click(
_handle_submit,
[sub_agent, sub_org, sub_provider, sub_model,
sub_api_key, sub_desc, sub_custom_mcp_url,
sub_custom_mcp_token],
sub_result,
)
# ══════ Tab 6: Status & Admin ══════
with gr.Tab("\U0001f6e0 Status"):
gr.HTML("""
<div style="max-width:800px;margin:0 auto;padding:1rem">
<h2 style="color:#0f172a;margin:0 0 0.5rem;
font-weight:700;font-size:1.25rem">
Submission status</h2>
<p style="color:#475569;margin-bottom:0.5rem;line-height:1.6">
Check your submission status or manage the pipeline
(admin only).</p>
</div>""")
# --- Public status check ---
with gr.Accordion("Check Submission Status", open=True):
status_id = gr.Textbox(
label="Submission ID",
placeholder="Enter your submission ID...",
)
status_btn = gr.Button("Check Status")
status_out = gr.HTML()
def _check_status(sid):
if not sid:
return '<div style="color:#718096">Enter an ID above.</div>'
try:
from eval_queue import get_submission
sub = get_submission(sid.strip())
if sub is None:
return ('<div style="color:#e53e3e">'
"Submission not found.</div>")
status_color = {
"pending": "#d69e2e", "approved": "#38a169",
"dispatching": "#3182ce", "boltz": "#805ad5",
"scoring": "#805ad5", "complete": "#38a169",
"failed": "#e53e3e", "rejected": "#e53e3e",
}.get(sub["status"], "#718096")
score_html = ""
if sub.get("overall_score") is not None:
score_html = (
f'<div style="font-size:1.2rem;'
f'font-weight:700;color:#0f172a;'
f'margin-top:0.5rem">'
f'Score: {sub["overall_score"]:.1f}/100'
f'</div>'
)
return (
f'<div style="background:white;padding:1rem;'
f'border-radius:8px;border:1px solid #e2e8f0">'
f'<strong>{sub["agent_name"]}</strong> '
f'({sub["organization"]})<br>'
f'Status: <span style="color:{status_color};'
f'font-weight:700">{sub["status"]}</span><br>'
f'Tasks: {sub.get("tasks_dispatched", 0)}'
f'/{sub.get("tasks_total", 76)}<br>'
f'Created: {sub.get("created_at", "")[:10]}'
f'{score_html}</div>'
)
except Exception as e:
return f'<div style="color:#e53e3e">{e}</div>'
status_btn.click(_check_status, [status_id], status_out)
# --- Admin panel (password-protected) ---
with gr.Accordion("Admin Panel", open=False):
admin_pw = gr.Textbox(
label="Admin Password", type="password",
)
admin_auth_btn = gr.Button("Authenticate")
admin_panel = gr.Column(visible=False)
admin_msg = gr.HTML()
with admin_panel:
gr.HTML('<h3 style="color:#0f172a">'
'Pending Submissions</h3>')
pending_html = gr.HTML()
refresh_btn = gr.Button("Refresh List")
with gr.Row():
approve_id = gr.Textbox(
label="Submission ID to Approve/Reject",
scale=2,
)
approve_btn = gr.Button(
"Approve", variant="primary", scale=1,
)
reject_btn = gr.Button(
"Reject", variant="stop", scale=1,
)
approve_msg = gr.HTML()
gr.HTML('<h3 style="color:#0f172a;margin-top:1rem">'
'Pipeline Control</h3>')
with gr.Row():
dispatch_id = gr.Textbox(
label="Submission ID", scale=2,
)
dispatch_btn = gr.Button(
"Phase A: Dispatch Tasks", scale=1,
)
with gr.Row():
boltz_id = gr.Textbox(
label="Submission ID", scale=2,
)
boltz_btn = gr.Button(
"Phase B: Run Boltz (GPU)", scale=1,
)
with gr.Row():
judge_id = gr.Textbox(
label="Submission ID", scale=2,
)
judge_btn = gr.Button(
"Phase C: Run LLM Judge", scale=1,
)
with gr.Row():
final_id = gr.Textbox(
label="Submission ID", scale=2,
)
final_btn = gr.Button(
"Phase D: Finalize & Publish", scale=1,
)
pipeline_out = gr.HTML()
def _admin_auth(pw):
if pw == ADMIN_PASSWORD:
return (
gr.Column(visible=True),
'<div style="color:#38a169">'
'Authenticated.</div>',
)
return (
gr.Column(visible=False),
'<div style="color:#e53e3e">'
'Wrong password.</div>',
)
admin_auth_btn.click(
_admin_auth, [admin_pw],
[admin_panel, admin_msg],
)
def _refresh_pending():
try:
from eval_queue import get_pending_submissions
pending = get_pending_submissions()
if not pending:
return "<p>No pending submissions.</p>"
rows = []
for s in pending:
mcp = "custom" if s.get("custom_mcp_url") else "reference"
key_state = "set" if s.get("api_key") else "scrubbed"
rows.append(
f'<tr><td><code>{s["submission_id"]}</code></td>'
f'<td>{s["agent_name"]}</td>'
f'<td>{s["organization"]}</td>'
f'<td>{s.get("provider","?")}/{s.get("model_name","?")}</td>'
f'<td>{mcp}</td>'
f'<td>{key_state}</td>'
f'<td>{s.get("created_at","")[:10]}</td></tr>'
)
return (
'<table style="width:100%;font-size:0.85rem;'
'border-collapse:collapse">'
"<tr><th>ID</th><th>Agent</th><th>Org</th>"
"<th>Provider/Model</th><th>MCP</th>"
"<th>Key</th><th>Date</th></tr>"
+ "".join(rows) + "</table>"
)
except Exception as e:
return f"<p>Error: {e}</p>"
refresh_btn.click(
_refresh_pending, [], pending_html,
)
def _approve_sub(sid):
try:
from eval_queue import update_status
ok = update_status(sid.strip(), "approved")
if ok:
return (
f'<div style="color:#38a169">'
f'Approved: {sid}</div>'
)
return (
f'<div style="color:#e53e3e">'
f'Failed to approve {sid}</div>'
)
except Exception as e:
return f'<div style="color:#e53e3e">{e}</div>'
def _reject_sub(sid):
try:
from eval_queue import update_status
ok = update_status(sid.strip(), "rejected")
if ok:
return (
f'<div style="color:#d69e2e">'
f'Rejected: {sid}</div>'
)
return (
f'<div style="color:#e53e3e">'
f'Failed to reject {sid}</div>'
)
except Exception as e:
return f'<div style="color:#e53e3e">{e}</div>'
approve_btn.click(
_approve_sub, [approve_id], approve_msg,
)
reject_btn.click(
_reject_sub, [approve_id], approve_msg,
)
def _run_dispatch(sid):
try:
from eval_queue import get_submission
from eval_dispatcher import dispatch_all_tasks
sub = get_submission(sid.strip())
if sub is None:
return ('<div style="color:#e53e3e">'
'Not found</div>')
if sub["status"] not in ("approved", "dispatching"):
return (
f'<div style="color:#e53e3e">'
f'Cannot dispatch: status='
f'{sub["status"]}</div>'
)
if not sub.get("api_key"):
return (
'<div style="color:#e53e3e">'
'API key already scrubbed -- this '
'submission has already been dispatched. '
'Resubmit if you need to re-run.</div>'
)
results = dispatch_all_tasks(sid.strip())
ok = sum(1 for r in results if r.get("success"))
return (
f'<div style="color:#38a169">'
f'Dispatched: {ok}/{len(results)} tasks '
f'succeeded. API key scrubbed.</div>'
)
except Exception as e:
import traceback
return (
f'<div style="color:#e53e3e">'
f'<strong>Dispatch error:</strong> {e}<br>'
f'<pre style="font-size:0.7rem">'
f'{traceback.format_exc()[:600]}</pre></div>'
)
def _run_boltz(sid):
try:
from eval_queue import get_submission
from eval_boltz import run_boltz_posteval
sub = get_submission(sid.strip())
if sub is None:
return (
'<div style="color:#e53e3e">'
'Not found</div>'
)
per_task = json.loads(
sub.get("per_task_results", "{}")
)
if not per_task:
return (
'<div style="color:#e53e3e">'
"No task results to process.</div>"
)
run_boltz_posteval(per_task)
from eval_queue import save_task_result
for tid, tres in per_task.items():
save_task_result(sid.strip(), tid, tres)
return (
'<div style="color:#38a169">'
"Boltz post-assessment complete.</div>"
)
except Exception as e:
return f'<div style="color:#e53e3e">{e}</div>'
def _run_judge(sid):
try:
import eval_judge as ej
from eval_queue import (
get_submission, save_task_result, update_status,
)
sub = get_submission(sid.strip())
if sub is None:
return ('<div style="color:#e53e3e">'
'Not found</div>')
per_task = json.loads(
sub.get("per_task_results", "{}")
)
if not per_task:
return ('<div style="color:#e53e3e">'
"No task results to process.</div>")
update_status(sid.strip(), "scoring")
ej.run_judge_panel(
per_task,
agent_id=sub.get("agent_name", "unknown"),
dry_run=False,
)
for tid, tres in per_task.items():
save_task_result(sid.strip(), tid, tres)
n_done = sum(
1 for r in per_task.values()
if r.get("hybrid_total") is not None
)
return (
f'<div style="color:#38a169">'
f"LLM judge complete on {n_done} tasks."
"</div>"
)
except Exception as e:
import traceback
return (
f'<div style="color:#e53e3e">'
f'<strong>Judge error:</strong> {e}<br>'
f'<pre style="font-size:0.7rem">'
f'{traceback.format_exc()[:600]}</pre></div>'
)
def _run_finalize(sid):
try:
from eval_queue import (
finalize_submission,
get_submission,
)
from eval_scorer import aggregate_scores
sub = get_submission(sid.strip())
if sub is None:
return (
'<div style="color:#e53e3e">'
'Not found</div>'
)
per_task = json.loads(
sub.get("per_task_results", "{}")
)
agg = aggregate_scores(per_task)
finalize_submission(
sid.strip(),
overall_score=agg["overall_score"],
component_scores=agg["component_scores"],
taxonomy_scores=agg["taxonomy_scores"],
)
mode_label = agg.get("scoring_mode", "algo")
return (
f'<div style="color:#38a169">'
f'Finalized! Score: '
f'{agg["overall_score"]:.1f} '
f'(scoring={mode_label})</div>'
)
except Exception as e:
return f'<div style="color:#e53e3e">{e}</div>'
dispatch_btn.click(
_run_dispatch, [dispatch_id], pipeline_out,
)
boltz_btn.click(
_run_boltz, [boltz_id], pipeline_out,
)
judge_btn.click(
_run_judge, [judge_id], pipeline_out,
)
final_btn.click(
_run_finalize, [final_id], pipeline_out,
)
# ══════ Tab 8: About ══════
with gr.Tab("\u2139\ufe0f About"):
gr.HTML(build_about())
return app
# ═══════════════════════════════════════════════════════════════════
# Entry point
# ═══════════════════════════════════════════════════════════════════
if __name__ == "__main__":
create_app().launch()