fix: update Paper/GitHub/HF/PyPI header links + BibTeX DOI + protein-design-mcp refs
c4982fe verified | """BioDesignBench Leaderboard β Gradio App for HuggingFace Spaces | |
| Evaluating LLM Agents on Protein Design via MCP Tools | |
| Romero Lab, Duke University | |
| Tabs: | |
| 1. Overall Leaderboard | |
| 2. Taxonomy Breakdown | |
| 3. Component Analysis | |
| 4. Benchmark vs User | |
| 5. Submit (new submission form) | |
| 6. Status & Admin (password-protected pipeline control) | |
| 7. About | |
| """ | |
| import json | |
| import os | |
| from pathlib import Path | |
| import gradio as gr | |
| import plotly.graph_objects as go | |
# Password gating the Status & Admin tab. The literal is only a development
# fallback — set BDB_ADMIN_PASSWORD in the Space secrets so the shipped
# default is never the live admin password.
# NOTE(review): consider failing closed (disable admin tab) when unset.
ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026")
# ───────────────────────────────────────────────────────────────────
# Configuration — change these when deploying
# ───────────────────────────────────────────────────────────────────
PAPER_URL = "https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1"
GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench"
HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard"
PYPI_URL = "https://pypi.org/project/protein-design-mcp/"
# ───────────────────────────────────────────────────────────────────
# Taxonomy & scoring constants (2 × 5 design matrix)
# ───────────────────────────────────────────────────────────────────
# Design-approach axis (matrix rows) plus display labels.
APPROACHES = ["de_novo", "redesign"]
APPROACH_LABELS = {
    "de_novo": "De Novo Design",
    "redesign": "Redesign",
}
# Molecular-subject axis (matrix columns) plus display labels.
SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"]
SUBJECT_LABELS = {
    "antibody": "Antibody",
    "binder": "Binder",
    "enzyme": "Enzyme",
    "scaffold": "Scaffold",
    "fluorescent_protein": "Fluorescent Prot.",
}
# 9 valid cells (redesign × binder is empty in the current task set)
VALID_CELLS = {
    "de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"},
    "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"},
}
# Task count for each occupied (approach, subject) cell; values sum to 76,
# matching the benchmark's advertised task total.
N_TASKS_PER_CELL = {
    ("de_novo", "antibody"): 4,
    ("de_novo", "binder"): 19,
    ("de_novo", "enzyme"): 2,
    ("de_novo", "scaffold"): 21,
    ("de_novo", "fluorescent_protein"): 1,
    ("redesign", "antibody"): 5,
    ("redesign", "enzyme"): 10,
    ("redesign", "scaffold"): 4,
    ("redesign", "fluorescent_protein"): 10,
}
# Rubric components, in display order.
COMPONENTS = [
    "approach",
    "orchestration",
    "quality",
    "feasibility",
    "novelty",
    "diversity",
]
# Per-component rubric maxima; they sum to the 100-point total.
COMP_MAX = {
    "approach": 20,
    "orchestration": 15,
    "quality": 35,
    "feasibility": 15,
    "novelty": 5,
    "diversity": 10,
}
# Row styling per submission type: leading icon, row background color, and an
# optional tag badge ("baseline") rendered next to the agent name.
TYPE_STYLE = {
    "llm": {"icon": "", "bg": "#ffffff", "tag": ""},
    "hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"},
    "human_expert": {
        "icon": "\U0001f468\u200d\U0001f52c",
        "bg": "#ebf4ff",
        "tag": "baseline",
    },
    "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
    # Backward-compat alias for older JSON files
    "oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
}
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Data loading | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def load_data() -> dict:
    """Load the leaderboard payload shipped alongside this app.

    Returns:
        The parsed contents of ``leaderboard_data.json`` (entries, findings,
        interventions, ...) as a plain dict.

    Raises:
        FileNotFoundError: if the JSON file is missing from the Space.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = Path(__file__).parent / "leaderboard_data.json"
    # Read explicitly as UTF-8 so non-ASCII labels load identically
    # regardless of the container's locale default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Custom CSS | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# App-wide CSS. All leaderboard HTML is built with hard-coded light-theme
# inline styles, so when Gradio applies its dark theme we pin the relevant
# theme variables and widget colors back to light values to keep the
# tables/cards readable.
CUSTOM_CSS = """
.gradio-container { max-width: 1200px !important; }
.gr-padded { padding: 0 !important; }
/* Force light appearance for all inline-styled HTML content */
.dark .gradio-container {
    --body-background-fill: #f7fafc !important;
    --block-background-fill: #ffffff !important;
    --body-text-color: #1a202c !important;
    --block-label-text-color: #1a202c !important;
    --input-background-fill: #ffffff !important;
    --border-color-primary: #e2e8f0 !important;
    --color-accent-soft: rgba(49,130,206,0.15) !important;
    --neutral-50: #f7fafc !important;
    --neutral-100: #edf2f7 !important;
    --neutral-200: #e2e8f0 !important;
    --neutral-700: #4a5568 !important;
    --neutral-800: #2d3748 !important;
    color: #1a202c !important;
    background: #f7fafc !important;
}
.dark .tabs { background: #ffffff !important; }
.dark .tab-nav button { color: #2d3748 !important; }
.dark .tab-nav button.selected {
    color: #0f172a !important;
    border-color: #3182ce !important;
}
.dark .block { background: #ffffff !important; }
.dark label, .dark .label-wrap { color: #2d3748 !important; }
.dark input, .dark textarea, .dark select {
    background: #ffffff !important;
    color: #1a202c !important;
    border-color: #e2e8f0 !important;
}
.dark .accordion { background: #ffffff !important; }
.dark .accordion > .label-wrap { color: #2d3748 !important; }
"""
# Force light mode on page load: strip the `dark` class from <body>, then
# keep stripping it via a MutationObserver for the first 5 seconds in case
# the framework re-applies it asynchronously after initial render.
FORCE_LIGHT_JS = """
() => {
    document.querySelector('body').classList.remove('dark');
    const obs = new MutationObserver(() => {
        document.querySelector('body').classList.remove('dark');
    });
    obs.observe(document.body, {attributes: true, attributeFilter: ['class']});
    setTimeout(() => obs.disconnect(), 5000);
}
"""
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Plotly layout helper | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _base_layout(**overrides) -> dict: | |
| """Shared Plotly layout defaults, with per-chart overrides.""" | |
| base = dict( | |
| plot_bgcolor="white", | |
| paper_bgcolor="white", | |
| font=dict( | |
| family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748" | |
| ), | |
| margin=dict(l=40, r=20, t=50, b=40), | |
| ) | |
| base.update(overrides) | |
| return base | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HTML builders | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def build_header(last_updated: str, n_entries: int) -> str:
    """Render the hero header card: title, tagline, link buttons, stat strip.

    Args:
        last_updated: Human-readable refresh date shown in the stats strip.
        n_entries: Number of leaderboard conditions shown in the stats strip.

    Returns:
        An HTML fragment styled entirely inline (no external CSS needed).
    """
    # Shared inline style for the four external-link buttons below.
    btn = (
        "display:inline-block;padding:0.45rem 1.1rem;border-radius:8px;"
        "text-decoration:none;font-size:0.82rem;font-weight:600;"
        "transition:opacity 0.15s"
    )
    return f"""
<div style="background:#ffffff;border:1px solid #e2e8f0;
            padding:2.2rem 2rem 1.8rem;text-align:center;
            border-radius:16px;margin-bottom:0.8rem;
            box-shadow:0 1px 4px rgba(0,0,0,0.04)">
  <p style="margin:0 0 0.3rem;font-size:0.75rem;font-weight:700;
            letter-spacing:0.12em;text-transform:uppercase;
            color:#3182ce">Romero Lab · Duke University</p>
  <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a;
             letter-spacing:-0.02em">
    \U0001f9ec BioDesignBench</h1>
  <p style="color:#0f172a;margin:0.6rem 0 0.2rem;font-size:1.1rem;
            font-weight:600;line-height:1.4">
    Can LLM agents orchestrate stochastic protein-design pipelines?</p>
  <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
            font-weight:400;font-style:italic;max-width:680px;
            margin-left:auto;margin-right:auto;line-height:1.5">
    Top-tier agents now surpass a deterministic pipeline —
    but invoke evaluation tools at only <strong>14% of expert depth</strong>.
    Guidance rescues coverage, not depth.</p>
  <div style="margin-top:1rem;display:flex;justify-content:center;
              gap:0.6rem;flex-wrap:wrap">
    <a href="{PAPER_URL}" target="_blank"
       style="{btn};background:#0f172a;color:#ffffff">
      \U0001f4c4 Paper</a>
    <a href="{GITHUB_URL}" target="_blank"
       style="{btn};background:#f1f5f9;color:#334155">
      \U0001f4bb GitHub</a>
    <a href="{HF_URL}" target="_blank"
       style="{btn};background:#f1f5f9;color:#334155">
      \U0001f917 HuggingFace</a>
    <a href="{PYPI_URL}" target="_blank"
       style="{btn};background:#f1f5f9;color:#334155">
      \U0001f4e6 PyPI</a>
  </div>
  <div style="margin-top:1rem;display:flex;justify-content:center;
              gap:1.5rem;flex-wrap:wrap">
    <span style="font-size:0.78rem;color:#94a3b8">
      76 tasks · 5 molecular families</span>
    <span style="font-size:0.78rem;color:#94a3b8">
      17 MCP tools</span>
    <span style="font-size:0.78rem;color:#94a3b8">
      {n_entries} conditions</span>
    <span style="font-size:0.78rem;color:#94a3b8">
      Updated {last_updated}</span>
  </div>
</div>"""
| # ββ Score styling helpers ββ | |
| def _score_color(s: float) -> str: | |
| if s >= 50: | |
| return "#38a169" | |
| if s >= 25: | |
| return "#d69e2e" | |
| return "#e53e3e" | |
| def _bar_bg(s: float) -> str: | |
| if s >= 50: | |
| return "rgba(56,161,105,0.15)" | |
| if s >= 25: | |
| return "rgba(214,158,46,0.15)" | |
| return "rgba(229,62,62,0.12)" | |
| def _heat_color(val, max_val=95) -> str: | |
| if val is None: | |
| return "#f7fafc" | |
| r = val / max_val | |
| if r >= 0.7: | |
| return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})" | |
| if r >= 0.4: | |
| return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})" | |
| return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})" | |
| # ββ Tab 1: Overall leaderboard table ββ | |
def build_leaderboard_table(
    entries: list, mode_f: str, mcp_f: str, type_f: str
) -> str:
    """Generate the mixed-ranking HTML table with inline styles.

    Baseline rows (non-"llm" submission types) are interleaved with LLM
    rows by score but are not numbered; only LLM rows receive a rank.

    Args:
        entries: Leaderboard entry dicts (LLM runs and baselines).
        mode_f: "All", or a run-mode name compared case-insensitively;
            applied to LLM rows only.
        mcp_f: "All", "Reference", or "Custom" (MCP endpoint filter).
        type_f: "All", "LLM Only", or "Baselines Only".
    """
    # Filter
    filtered = []
    for e in entries:
        st = e["submission_type"]
        # The mode filter never excludes baselines — they have no run mode.
        if mode_f != "All" and st == "llm":
            if (e.get("mode") or "").lower() != mode_f.lower():
                continue
        if mcp_f == "Reference" and e.get("mcp_custom"):
            continue
        if mcp_f == "Custom" and not e.get("mcp_custom"):
            continue
        if type_f == "LLM Only" and st != "llm":
            continue
        if type_f == "Baselines Only" and st == "llm":
            continue
        filtered.append(e)
    filtered.sort(key=lambda x: x["overall_score"], reverse=True)
    # Shared cell styles
    TD = (
        "padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;"
        "font-size:0.9rem"
    )
    TH = (
        "background:#0f172a;color:white;padding:0.75rem 1rem;"
        "text-align:left;font-size:0.75rem;text-transform:uppercase;"
        "letter-spacing:0.05em;font-weight:600"
    )
    rows = []
    llm_rank = 0  # baselines are skipped when assigning rank numbers
    for e in filtered:
        st = e["submission_type"]
        sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"])
        is_bl = st != "llm"
        sc = e["overall_score"]
        # ── Rank cell ──
        if is_bl:
            # Baselines show their type icon instead of a rank number.
            rank = (
                f'<td style="{TD};text-align:center;font-size:1.1rem;'
                f'width:50px">{sty["icon"]}</td>'
            )
        else:
            llm_rank += 1
            # Gold / silver / bronze accent colors for the top three ranks.
            rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get(
                llm_rank, "#0f172a"
            )
            rsize = (
                "1.1rem"
                if llm_rank == 1
                else ("1.05rem" if llm_rank <= 3 else "0.9rem")
            )
            rank = (
                f'<td style="{TD};text-align:center;font-weight:700;'
                f"color:{rcolor};font-size:{rsize};width:50px\">"
                f"{llm_rank}</td>"
            )
        # ── Name cell ──
        tag_html = ""
        if sty["tag"]:
            tag_html = (
                ' <span style="font-size:0.7rem;background:#e2e8f0;'
                "padding:0.1rem 0.4rem;border-radius:3px;color:#4a5568;"
                f'margin-left:0.3rem;vertical-align:middle">'
                f'{sty["tag"]}</span>'
            )
        icon_pfx = f'{sty["icon"]} ' if sty["icon"] else ""
        fw = "600" if is_bl else "500"
        name = (
            f'<td style="{TD};font-weight:{fw}">'
            f'{icon_pfx}{e["agent_name"]}{tag_html}</td>'
        )
        # ── Organization ──
        org = f'<td style="{TD}">{e["organization"]}</td>'
        # ── Mode badge ── (em dash for baselines; red=benchmark, green=user)
        if is_bl:
            mode = f'<td style="{TD};color:#718096">\u2014</td>'
        elif e.get("mode") == "benchmark":
            mode = (
                f'<td style="{TD}"><span style="background:#fed7d7;'
                "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
                'font-size:0.75rem;font-weight:600">benchmark</span></td>'
            )
        else:
            mode = (
                f'<td style="{TD}"><span style="background:#c6f6d5;'
                "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
                'font-size:0.75rem;font-weight:600">user</span></td>'
            )
        # ── MCP ── (custom vs reference endpoint badge)
        if is_bl:
            mcp = f'<td style="{TD};color:#718096">\u2014</td>'
        elif e.get("mcp_custom"):
            mcp = (
                f'<td style="{TD}"><span style="background:#fef3c7;'
                "color:#92400e;padding:0.15rem 0.55rem;border-radius:4px;"
                'font-size:0.72rem;font-weight:700">custom</span></td>'
            )
        else:
            mcp = (
                f'<td style="{TD}"><span style="background:#dbeafe;'
                "color:#1e40af;padding:0.15rem 0.55rem;border-radius:4px;"
                'font-size:0.72rem;font-weight:700">reference</span></td>'
            )
        # ── Score with proportional bar ──
        # An absolute-positioned div tints sc% of the cell width behind the
        # number (assumes scores are on a 0-100 scale — TODO confirm).
        scol = _score_color(sc)
        bbg = _bar_bg(sc)
        score_cell = (
            f'<td style="{TD};font-weight:700;font-size:1rem;color:{scol};'
            f'position:relative;font-variant-numeric:tabular-nums">'
            f'<div style="position:absolute;left:0;top:0;bottom:0;'
            f"width:{sc}%;background:{bbg};"
            f'border-radius:3px"></div>'
            f'<span style="position:relative">{sc:.1f}</span></td>'
        )
        # ── Tasks & zeros ──
        tc = e.get("tasks_completed", 0)
        tt = e.get("tasks_total", 76)
        tasks = f'<td style="{TD}">{tc}/{tt}</td>'
        zeros = f'<td style="{TD}">{e.get("tasks_with_zero", 0)}</td>'
        rows.append(
            f'<tr style="background:{sty["bg"]}">'
            f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}</tr>"
        )
    return f"""
<table style="width:100%;border-collapse:collapse;background:white;
              border-radius:10px;overflow:hidden;
              box-shadow:0 1px 3px rgba(0,0,0,0.08)">
  <thead><tr>
    <th style="{TH};width:50px">#</th>
    <th style="{TH}">Agent</th>
    <th style="{TH}">Organization</th>
    <th style="{TH}">Mode</th>
    <th style="{TH}">MCP</th>
    <th style="{TH}">Score</th>
    <th style="{TH}">Tasks</th>
    <th style="{TH}">Zero-Score</th>
  </tr></thead>
  <tbody>{''.join(rows)}</tbody>
</table>"""
| # ββ Tab 2: Taxonomy heatmap ββ | |
def build_heatmap(entry: dict) -> str:
    """HTML heatmap for one agent across the 2 × 5 design matrix
    (DesignApproach × MolecularSubject = 9 valid cells; redesign × binder
    is empty).
    """
    # Per-approach score dicts keyed by subject; may be missing entirely.
    ts = entry.get("taxonomy_scores", {})
    TH = (
        "background:#0f172a;color:white;padding:0.6rem 0.8rem;"
        "text-align:center;font-size:0.75rem;font-weight:600"
    )
    TD = (
        "text-align:center;padding:0.5rem;font-size:0.85rem;"
        "font-weight:600;border-bottom:1px solid #e2e8f0"
    )
    rows = []
    for ap in APPROACHES:
        # First column: the approach label on a light background.
        cells = [
            f'<td style="{TD};text-align:left;font-weight:700;'
            f'background:#f8fafc;color:#0f172a">{APPROACH_LABELS[ap]}</td>'
        ]
        vals = []  # scores present in this row, feeding the trailing mean
        for sj in SUBJECTS:
            if sj in VALID_CELLS[ap]:
                val = ts.get(ap, {}).get(sj)
                bg = _heat_color(val)
                n = N_TASKS_PER_CELL.get((ap, sj), 0)
                # Score plus cell task count; em dash when no score yet.
                text = (
                    f'{val:.0f}<br><span style="font-size:0.65rem;'
                    f'font-weight:400;color:#64748b">n={n}</span>'
                    if val is not None
                    else "\u2014"
                )
                cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
                if val is not None:
                    vals.append(val)
            else:
                # Structurally empty taxonomy cell (redesign × binder).
                cells.append(
                    f'<td style="{TD};color:#cbd5e0;font-weight:400">'
                    "n/a</td>"
                )
        # Trailing per-approach mean over the populated cells only.
        avg = sum(vals) / len(vals) if vals else 0
        avg_bg = _heat_color(avg)
        cells.append(
            f'<td style="{TD};font-weight:700;background:{avg_bg}">'
            f"{avg:.1f}</td>"
        )
        rows.append(f'<tr>{"".join(cells)}</tr>')
    sj_headers = "".join(
        f'<th style="{TH}">{SUBJECT_LABELS[sj]}</th>'
        for sj in SUBJECTS
    )
    return f"""
<table style="width:100%;border-collapse:collapse;background:white;
              border-radius:10px;overflow:hidden;
              box-shadow:0 1px 3px rgba(0,0,0,0.08)">
  <thead><tr>
    <th style="{TH};text-align:left">Approach \u2193 / Subject \u2192</th>
    {sj_headers}
    <th style="{TH}">Mean</th>
  </tr></thead>
  <tbody>{''.join(rows)}</tbody>
</table>"""
| # ββ Tab 4: Mode comparison cards ββ | |
def build_mode_cards(entries: list) -> str:
    """Per-LLM cards showing benchmark vs user delta.

    Groups LLM entries by agent name, pairs each agent's "benchmark" and
    "user" runs (agents missing either mode are skipped), and renders one
    card per agent: overall scores, overall delta, and per-component deltas.
    Cards are ordered by descending user-mode overall score.

    Args:
        entries: All leaderboard entry dicts; non-LLM rows are ignored.

    Returns:
        An HTML grid of cards (the grid markup is emitted even when empty).
    """
    by_name: dict[str, dict] = {}
    for e in entries:
        if e["submission_type"] != "llm":
            continue
        # One bucket per agent, keyed by run mode ("benchmark" / "user").
        # .get() keeps entries without a mode from raising KeyError, matching
        # how build_leaderboard_table reads the same field.
        by_name.setdefault(e["agent_name"], {})[e.get("mode")] = e
    ordered = sorted(
        by_name.items(),
        key=lambda x: x[1].get("user", {}).get("overall_score", 0),
        reverse=True,
    )
    cards = []
    for name, modes in ordered:
        bench = modes.get("benchmark")
        user = modes.get("user")
        if not bench or not user:
            continue
        delta = user["overall_score"] - bench["overall_score"]
        pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0
        # Fix: the delta row previously hard-coded a green "+" prefix, which
        # rendered as "+-3.2 (+-9%)" (and stayed green) whenever the user
        # mode scored below benchmark. Sign and color now follow the delta,
        # matching the per-component rows below.
        d_sign = "+" if delta >= 0 else ""
        d_color = "#38a169" if delta >= 0 else "#e53e3e"
        lines = [
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>Benchmark</span>"
            f'<span style="font-weight:700;color:#e53e3e">'
            f'{bench["overall_score"]:.1f}</span></div>',
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>User</span>"
            f'<span style="font-weight:700;color:#d69e2e">'
            f'{user["overall_score"]:.1f}</span></div>',
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>Delta</span>"
            f'<span style="font-weight:700;color:{d_color}">'
            f"{d_sign}{delta:.1f} ({d_sign}{pct:.0f}%)</span></div>",
        ]
        # Per-component deltas, signed and colored green/red.
        for c in COMPONENTS:
            d = user["component_scores"][c] - bench["component_scores"][c]
            color = "#38a169" if d >= 0 else "#e53e3e"
            sign = "+" if d >= 0 else ""
            lines.append(
                '<div style="display:flex;justify-content:space-between;'
                'padding:0.3rem 0;border-bottom:1px solid #e2e8f0;'
                'font-size:0.85rem">'
                f'<span style="color:#718096">{c}</span>'
                f'<span style="font-weight:700;color:{color}">'
                f"{sign}{d:.1f}</span></div>"
            )
        cards.append(
            '<div style="background:white;border-radius:10px;padding:1.2rem;'
            'box-shadow:0 1px 3px rgba(0,0,0,0.08)">'
            f'<h4 style="font-size:0.95rem;color:#0f172a;'
            f'margin:0 0 0.8rem">{name}</h4>'
            f'{"".join(lines)}</div>'
        )
    return (
        '<div style="display:grid;grid-template-columns:'
        'repeat(auto-fit,minmax(250px,1fr));gap:1rem;margin-top:1rem">'
        f'{"".join(cards)}</div>'
    )
| # ββ Headline findings (paper banner) ββ | |
def build_headline_findings(findings: list) -> str:
    """Top-of-page banner that surfaces the paper's three core claims.

    Each finding becomes one card; accent colors cycle through a fixed
    five-color palette. Returns "" when there are no findings.
    """
    if not findings:
        return ""
    palette = ["#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e"]

    def _card(idx: int, text: str) -> str:
        # One accent-bordered card for finding number idx + 1.
        accent = palette[idx % len(palette)]
        return (
            f'<div style="background:#ffffff;border:1px solid #e2e8f0;'
            f"border-left:4px solid {accent};border-radius:10px;"
            f'padding:0.85rem 1rem;flex:1 1 220px;min-width:220px;'
            f'box-shadow:0 1px 3px rgba(0,0,0,0.04)">'
            f'<div style="font-size:0.7rem;font-weight:700;'
            f'color:{accent};letter-spacing:0.08em;text-transform:uppercase;'
            f'margin-bottom:0.35rem">Finding {idx + 1}</div>'
            f'<div style="font-size:0.82rem;color:#1a202c;'
            f'line-height:1.45">{text}</div></div>'
        )

    cards = "".join(_card(i, t) for i, t in enumerate(findings))
    return (
        '<div style="display:flex;flex-wrap:wrap;gap:0.7rem;'
        'margin:0.4rem 0 1rem">'
        f"{cards}</div>"
    )
| # ββ Tab: Depth Gap (intervention experiments) ββ | |
def build_intervention_section(interventions: dict) -> str:
    """Show forced-depth and low-diversity intervention results.

    The forced-depth condition mandates ≥3 evaluation passes per design
    candidate; the low-diversity control constrains the candidate pool
    without forcing depth. Together they isolate evaluation depth as the
    causal driver of the 'surface competence' gap reported in the paper.

    Args:
        interventions: Dict with "rows" (per-run result dicts), and
            optionally "description" and "n_tasks" (defaults to 18).
    """
    if not interventions or not interventions.get("rows"):
        return '<p style="color:#718096">No intervention data available.</p>'
    rows = interventions["rows"]
    # Badge color + display label for each experimental condition.
    cond_meta = {
        "baseline": ("#64748b", "Baseline"),
        "forced_depth": ("#38a169", "Forced Depth"),
        "low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
    }
    TH = (
        "background:#0f172a;color:white;padding:0.65rem 0.9rem;"
        "text-align:left;font-size:0.72rem;text-transform:uppercase;"
        "letter-spacing:0.05em;font-weight:600"
    )
    TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;"
          "font-size:0.86rem")
    body = []
    for r in rows:
        # Unknown conditions fall back to a neutral grey badge with the
        # raw condition string as its label.
        color, cond_label = cond_meta.get(r["condition"], ("#64748b", r["condition"]))
        delta = r.get("delta_vs_baseline")
        # Baseline rows (and rows without a delta) show an em dash.
        if delta is None or r["condition"] == "baseline":
            delta_html = '<span style="color:#cbd5e0">\u2014</span>'
        else:
            sign = "+" if delta >= 0 else ""
            dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b")
            delta_html = (
                f'<span style="color:{dcol};font-weight:700">'
                f"{sign}{delta:.1f}</span>"
            )
        body.append(
            f'<tr><td style="{TD};font-weight:600;color:#0f172a">'
            f'{r["label"]}</td>'
            # "{color}22" appends a hex alpha nibble pair for a translucent
            # badge background in the condition's accent color.
            f'<td style="{TD}"><span style="background:{color}22;'
            f"color:{color};padding:0.15rem 0.55rem;border-radius:4px;"
            f'font-size:0.72rem;font-weight:700">{cond_label}</span></td>'
            f'<td style="{TD};font-weight:700;font-variant-numeric:'
            f'tabular-nums">{r["score"]:.1f}</td>'
            f'<td style="{TD};font-variant-numeric:tabular-nums">{delta_html}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{r["approach"]:.1f} / {r["orchestration"]:.1f}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{r["quality"]:.1f}</td>'
            f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">'
            f'{r["diversity"]:.1f}</td></tr>'
        )
    n = interventions.get("n_tasks", 18)
    return f"""
<div style="max-width:980px;margin:0 auto">
  <div style="background:#ffffff;border:1px solid #e2e8f0;
              border-radius:12px;padding:1.4rem 1.6rem;
              margin-bottom:1rem">
    <h2 style="color:#0f172a;margin:0 0 0.5rem;font-size:1.2rem;
               font-weight:700">Causal interventions on the depth gap</h2>
    <p style="color:#475569;line-height:1.55;margin:0">
      {interventions.get('description', '')}
      Reruns are scored on a representative <strong>{n}-task</strong>
      subset that spans all 9 occupied taxonomy cells.
    </p>
  </div>
  <div style="background:#fefce8;border-left:4px solid #ca8a04;
              border-radius:8px;padding:0.95rem 1.1rem;
              margin-bottom:1.1rem">
    <strong style="color:#713f12">Headline:</strong>
    <span style="color:#52340d">
      Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
      <strong>GPT-5 by +15.9</strong> points without any change to
      the underlying model or tools, while the low-diversity control
      <em>hurts</em> DeepSeek V3 (−2.3). The dissociation is
      cleanest on the strongest agent, where it provides direct
      causal evidence that
      <strong>evaluation depth — not the mere act of process
      intervention — drives the gain</strong>. GPT-5's
      response is more uniform across both interventions; we
      report the raw deltas without smoothing.
    </span>
  </div>
  <table style="width:100%;border-collapse:collapse;background:white;
                border-radius:10px;overflow:hidden;
                box-shadow:0 1px 3px rgba(0,0,0,0.08)">
    <thead><tr>
      <th style="{TH}">Run</th>
      <th style="{TH}">Condition</th>
      <th style="{TH}">Score</th>
      <th style="{TH}">Δ vs baseline</th>
      <th style="{TH}">Approach / Orch.</th>
      <th style="{TH}">Quality</th>
      <th style="{TH}">Diversity</th>
    </tr></thead>
    <tbody>{''.join(body)}</tbody>
  </table>
  <p style="color:#64748b;font-size:0.78rem;margin-top:0.8rem;
            line-height:1.5">
    Scoring uses the same 100-point hybrid rubric as the main
    leaderboard but is restricted to {n} representative tasks;
    absolute values therefore differ from the full-benchmark mean.
    The <em>delta vs baseline</em> compares each agent against
    its own untreated baseline run, isolating the intervention effect.
  </p>
</div>
"""
| # ββ Tab 5: About ββ | |
| def build_about() -> str: | |
| h2 = ( | |
| 'style="color:#0f172a;margin:0 0 0.8rem;font-size:1.25rem;' | |
| 'font-weight:700"' | |
| ) | |
| h3 = ( | |
| 'style="color:#334155;margin:1.2rem 0 0.5rem;font-size:1rem;' | |
| 'font-weight:600"' | |
| ) | |
| p = 'style="margin-bottom:0.8rem;color:#475569;line-height:1.6"' | |
| card = ( | |
| 'style="background:#ffffff;border:1px solid #e2e8f0;' | |
| 'border-radius:12px;padding:2rem;margin-bottom:1.2rem"' | |
| ) | |
| stat_box = ( | |
| 'style="background:#f8fafc;border:1px solid #e2e8f0;' | |
| 'border-radius:10px;padding:1rem;text-align:center"' | |
| ) | |
| return f""" | |
| <div style="max-width:900px;margin:0 auto"> | |
| <div {card}> | |
| <h2 {h2}>What is BioDesignBench?</h2> | |
| <p {p}> | |
| BioDesignBench is a benchmark for evaluating LLM agents as | |
| orchestrators of multi-step <em>stochastic</em> protein-design | |
| pipelines. Unlike chemistry- or code-agent benchmarks, where | |
| tool chains are largely deterministic, protein design demands | |
| repeated sampling from generative tools (RFdiffusion, | |
| ProteinMPNN) and iterative cross-validation through several | |
| biophysical metrics. We test the full agentic loop — | |
| <strong>plan → sample → evaluate across multiple | |
| metrics → iterate</strong> — over 76 expert-curated | |
| tasks drawn from 2024–2026 literature, exposed through | |
| 17 MCP-integrated tools. | |
| </p> | |
| <div style="display:grid;grid-template-columns: | |
| repeat(auto-fit,minmax(140px,1fr));gap:0.8rem; | |
| margin:1rem 0"> | |
| <div {stat_box}> | |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> | |
| 76</div> | |
| <div style="font-size:0.78rem;color:#64748b">design tasks</div> | |
| </div> | |
| <div {stat_box}> | |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> | |
| 9</div> | |
| <div style="font-size:0.78rem;color:#64748b"> | |
| taxonomy cells<br>(2 approaches \u00d7 5 subjects)</div> | |
| </div> | |
| <div {stat_box}> | |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> | |
| 17</div> | |
| <div style="font-size:0.78rem;color:#64748b">MCP tools</div> | |
| </div> | |
| <div {stat_box}> | |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> | |
| 100</div> | |
| <div style="font-size:0.78rem;color:#64748b">point rubric</div> | |
| </div> | |
| </div> | |
| </div> | |
| <div {card}> | |
| <h2 {h2}>Three principal findings</h2> | |
| <h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3> | |
| <p {p}> | |
| DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded | |
| pipeline (54.2) under both modes. Autonomous protein-design | |
| orchestration is no longer infeasible — but a substantial | |
| gap to the human expert (61.3) and oracle (74.9) remains. | |
| </p> | |
| <h3 {h3}>2. Coverage–depth dissociation</h3> | |
| <p {p}> | |
| Workflow guidance closes the <em>coverage</em> gap (Rescue | |
| Index up to +3.01) but leaves <em>utilisation depth</em> | |
| unchanged (Rescue Index \u2248 0). Better tool documentation | |
| can teach agents <em>which</em> tools to call, but cannot | |
| teach them to call those tools with the iterative depth that | |
| expert practice demands. | |
| </p> | |
| <h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3> | |
| <p {p}> | |
| Across 836 task–condition observations, evaluation depth | |
| per candidate correlates with total score at | |
| <strong>ρ = 0.685</strong> | |
| (<em>p</em> < 10<sup>-117</sup>). LLM agents generate | |
| backbone candidates at expert-level rates but evaluate each | |
| one at only <strong>14% of expert depth</strong>. Forced-depth | |
| interventions confirm this is causal — see the | |
| <em>Depth Gap</em> tab. | |
| </p> | |
| </div> | |
| <div {card}> | |
| <h2 {h2}>How to submit</h2> | |
| <p {p}> | |
| Unlike most agent benchmarks, <strong>you do not host an HTTP | |
| endpoint</strong>. The 76 task descriptions never leave Romero | |
| Lab infrastructure. Instead you provide:</p> | |
| <ol style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; | |
| line-height:1.7"> | |
| <li>an <strong>LLM provider + API key</strong> | |
| (Anthropic / OpenAI / Google / DeepSeek). | |
| We run the BioDesignBench agent loop against your chosen | |
| model inside the leaderboard backend. Your key is | |
| <em>scrubbed</em> from our records immediately after the | |
| dispatch phase completes.</li> | |
| <li>optionally, a <strong>custom MCP URL</strong> if you want | |
| to evaluate your own tool implementations. Otherwise, the | |
| agent calls our reference | |
| <a href="https://github.com/jasonkim8652/protein-design-mcp" | |
| style="color:#2563eb;font-weight:600">protein-design-mcp</a> | |
| endpoint (in progress).</li> | |
| </ol> | |
| <h3 {h3}>Data flow</h3> | |
| <p {p}> | |
| Each task prompt is sent to your chosen LLM provider via | |
| their standard API (Anthropic, OpenAI, Google, DeepSeek) — | |
| that single channel is the only path by which task data leaves | |
| Romero Lab. The MCP server (reference or custom) only ever | |
| sees operational tool arguments (sequences, PDB paths, hotspot | |
| residues); it never sees the raw task prompt or evaluation | |
| criteria. Every task prompt also carries a unique 16-character | |
| canary token as an HTML comment, for retrospective leakage | |
| detection.</p> | |
| <h3 {h3}>Bring your own tools (Custom MCP)</h3> | |
| <p {p}> | |
| If you want to benchmark a new tool implementation (a faster | |
| structure predictor, a different diffusion backbone, your own | |
| stability model) against the same 76 tasks and rubric, stand | |
| up an HTTPS endpoint that satisfies the MCP contract and paste | |
| the URL into the submission form's | |
| <em>Advanced: Custom MCP</em> section:</p> | |
| <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; | |
| line-height:1.7"> | |
| <li><strong>Contract + hosting options</strong>: | |
| <a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/README.md#bringing-your-own-mcp-tools" | |
| style="color:#2563eb;font-weight:600">leaderboard README</a></li> | |
| <li><strong>Minimal FastAPI stub (~150 lines)</strong>: | |
| <a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/example_mcp_server.py" | |
| style="color:#2563eb;font-weight:600"><code>example_mcp_server.py</code></a></li> | |
| <li><strong>Reference implementation to fork</strong>: | |
| <a href="https://github.com/jasonkim8652/protein-design-mcp" | |
| style="color:#2563eb;font-weight:600">jasonkim8652/protein-design-mcp</a></li> | |
| </ul> | |
| <h3 {h3}>Limits</h3> | |
| <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; | |
| line-height:1.7"> | |
| <li>Maximum 1 submission per calendar month per organization</li> | |
| <li>73 hidden tasks are used for ranking; 3 public example | |
| tasks are available for development</li> | |
| <li>LLM-judge API costs are paid by Romero Lab; your own | |
| agent LLM calls are billed to your provider</li> | |
| </ul> | |
| </div> | |
| <div {card}> | |
| <h2 {h2}>Scoring rubric (100 points, hybrid)</h2> | |
| <p {p}> | |
| Scores combine <strong>72 algorithmic points</strong> from | |
| deterministic biophysical metrics with | |
| <strong>28 LLM-judge points</strong> assessed by a 3-judge | |
| panel (PoLL) with self-exclusion to mitigate self-preference | |
| bias. Each component is capped at its rubric maximum to | |
| prevent double counting. | |
| </p> | |
| <p {p}> | |
| <strong>Approach (20 pts)</strong> — strategic | |
| appropriateness of tool selection across 10 functional | |
| categories (backbone generation, inverse folding, structure | |
| prediction, etc.).</p> | |
| <p {p}> | |
| <strong>Orchestration (15 pts)</strong> — pipeline | |
| ordering, intermediate validation, and adaptive iteration.</p> | |
| <p {p}> | |
| <strong>Quality (35 pts)</strong> — 100% algorithmic. | |
| Continuous 4-band interpolation over Boltz-2 re-prediction | |
| metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement | |
| variance on biophysical quantities.</p> | |
| <p {p}> | |
| <strong>Feasibility (15 pts)</strong> — valid amino | |
| acids, length constraints, composition, and biophysical | |
| plausibility.</p> | |
| <p {p}> | |
| <strong>Novelty (5 pts)</strong> — sequence identity to | |
| reference (lower identity = more novel).</p> | |
| <p {p}> | |
| <strong>Diversity (10 pts)</strong> — number and | |
| pairwise diversity of generated designs.</p> | |
| </div> | |
| <div {card}> | |
| <h2 {h2}>Five-layer contamination defense</h2> | |
| <p {p}>Every evaluated LLM may have read protein-design | |
| literature during pretraining, so we use a layered defense:</p> | |
| <ul style="color:#475569;padding-left:1.5rem; | |
| margin-bottom:0.8rem;line-height:1.7"> | |
| <li>All 76 tasks derived from publications dated 2024–2026, | |
| post-dating model training cutoffs.</li> | |
| <li>Task prompts paraphrased and restructured — no | |
| verbatim passages from source literature.</li> | |
| <li>Targets specified by biological function and structural | |
| constraints, not by name or PDB identifier.</li> | |
| <li>12 decoy tasks with deliberately fabricated targets to | |
| detect memorisation-based responses.</li> | |
| <li>n-gram overlap analysis between agent outputs and source | |
| publications — no verbatim regurgitation above the | |
| 8-gram threshold across any condition.</li> | |
| </ul> | |
| </div> | |
| <div {card}> | |
| <h2 {h2}>Citation</h2> | |
| <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem; | |
| border-radius:10px;font-size:0.8rem; | |
| line-height:1.6">@article{{biodesignbench2026, | |
| title={{Evaluating LLM-Driven Protein Design: | |
| Agents Lack Iterative Evaluation Depth}}, | |
| author={{Kim, Jeonghyeon and Romero, Philip}}, | |
| journal={{bioRxiv}}, | |
| year={{2026}}, | |
| doi={{10.64898/2026.05.06.723381}}, | |
| url={{https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1}} | |
| }}</pre> | |
| </div> | |
| </div>""" | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Chart builders (Plotly) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def chart_taxonomy_bar(entry: dict) -> go.Figure:
    """Build a grouped bar chart of per-subject mean scores.

    One bar group per molecular subject; within each group, one bar
    for the de-novo approach and one for redesign.  Taxonomy cells
    outside the valid 2 x 5 matrix are rendered as gaps (None).
    """
    taxonomy = entry.get("taxonomy_scores", {})
    subject_axis = [SUBJECT_LABELS[subj] for subj in SUBJECTS]

    def cell_scores(approach):
        # None marks both a missing score and an invalid cell; Plotly
        # simply leaves a gap for None values.
        valid = VALID_CELLS[approach]
        scores = taxonomy.get(approach, {})
        return [scores.get(subj) if subj in valid else None
                for subj in SUBJECTS]

    def bar_text(values):
        return ["" if v is None else f"{v:.0f}" for v in values]

    de_novo_vals = cell_scores("de_novo")
    redesign_vals = cell_scores("redesign")

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=subject_axis, y=de_novo_vals, name="De Novo",
        marker_color="rgba(49,130,206,0.78)",
        text=bar_text(de_novo_vals),
        textposition="outside",
    ))
    fig.add_trace(go.Bar(
        x=subject_axis, y=redesign_vals, name="Redesign",
        marker_color="rgba(214,158,46,0.78)",
        text=bar_text(redesign_vals),
        textposition="outside",
    ))

    mode_label = entry.get("mode") or "\u2014"
    fig.update_layout(
        **_base_layout(
            barmode="group",
            title=dict(
                text=f"{entry['agent_name']} ({mode_label}) \u2014 Mean Score by Cell",
                font_size=14,
            ),
            yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"),
            xaxis=dict(title=""),
            legend=dict(orientation="h", yanchor="bottom", y=-0.2,
                        xanchor="center", x=0.5),
            height=340,
        )
    )
    return fig
def chart_radar(e1: dict, e2: dict) -> go.Figure:
    """Radar (spider) chart of two agents' component scores.

    Each axis is one scoring component, normalised to percent of its
    rubric maximum so differently-weighted components are comparable.
    """
    axis_labels = [comp.capitalize() for comp in COMPONENTS]

    def as_percent(entry):
        # Normalise each raw component score to % of its rubric max.
        return [entry["component_scores"][comp] / COMP_MAX[comp] * 100
                for comp in COMPONENTS]

    series_a = as_percent(e1)
    series_b = as_percent(e2)
    mode_a = e1.get("mode") or "\u2014"
    mode_b = e2.get("mode") or "\u2014"

    fig = go.Figure()
    # The first point is repeated at the end so each polygon closes.
    fig.add_trace(
        go.Scatterpolar(
            r=series_a + [series_a[0]],
            theta=axis_labels + [axis_labels[0]],
            fill="toself",
            name=f'{e1["agent_name"]} ({mode_a})',
            line=dict(color="rgba(49,130,206,0.8)"),
            fillcolor="rgba(49,130,206,0.15)",
        )
    )
    fig.add_trace(
        go.Scatterpolar(
            r=series_b + [series_b[0]],
            theta=axis_labels + [axis_labels[0]],
            fill="toself",
            name=f'{e2["agent_name"]} ({mode_b})',
            line=dict(color="rgba(229,62,62,0.8)"),
            fillcolor="rgba(229,62,62,0.15)",
        )
    )
    fig.update_layout(
        **_base_layout(
            polar=dict(
                radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%")
            ),
            showlegend=True,
            legend=dict(
                orientation="h", yanchor="bottom", y=-0.25,
                xanchor="center", x=0.5,
            ),
            title=dict(text="Component Radar (% of max)", font_size=14),
            height=420,
        )
    )
    return fig
def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
    """Horizontal grouped bar chart of raw component scores.

    Both agents appear side by side on every component axis; each
    axis label carries the rubric maximum for that component.
    """
    axis_labels = [f"{comp.capitalize()} (/{COMP_MAX[comp]})"
                   for comp in COMPONENTS]

    def raw_scores(entry):
        return [entry["component_scores"][comp] for comp in COMPONENTS]

    def trace_name(entry):
        mode = entry.get("mode") or "\u2014"
        return f'{entry["agent_name"]} ({mode})'

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            y=axis_labels,
            x=raw_scores(e1),
            name=trace_name(e1),
            orientation="h",
            marker_color="rgba(49,130,206,0.7)",
        )
    )
    fig.add_trace(
        go.Bar(
            y=axis_labels,
            x=raw_scores(e2),
            name=trace_name(e2),
            orientation="h",
            marker_color="rgba(229,62,62,0.7)",
        )
    )
    fig.update_layout(
        **_base_layout(
            barmode="group",
            xaxis=dict(title="Score"),
            title=dict(text="Component Breakdown", font_size=14),
            legend=dict(
                orientation="h", yanchor="bottom", y=-0.3,
                xanchor="center", x=0.5,
            ),
            height=420,
        )
    )
    return fig
def chart_mode_comparison(entries: list) -> go.Figure:
    """Benchmark-vs-user mode comparison, one bar pair per LLM.

    Only entries with submission_type == "llm" participate (baselines
    carry no mode split).  Agents are ordered by descending user-mode
    score; a missing mode contributes a zero-height bar.
    """
    scores_by_agent: dict[str, dict[str, float]] = {}
    for entry in entries:
        if entry["submission_type"] != "llm":
            continue
        per_mode = scores_by_agent.setdefault(entry["agent_name"], {})
        per_mode[entry["mode"]] = entry["overall_score"]

    ranked = sorted(
        scores_by_agent.items(),
        key=lambda item: item[1].get("user", 0),
        reverse=True,
    )
    names = [agent for agent, _ in ranked]
    bench_scores = [modes.get("benchmark", 0) for _, modes in ranked]
    user_scores = [modes.get("user", 0) for _, modes in ranked]

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=names, y=bench_scores, name="Benchmark Mode",
            marker_color="rgba(229,62,62,0.6)",
        )
    )
    fig.add_trace(
        go.Bar(
            x=names, y=user_scores, name="User Mode",
            marker_color="rgba(56,161,105,0.6)",
        )
    )
    fig.update_layout(
        **_base_layout(
            barmode="group",
            yaxis=dict(range=[0, 80], title="Overall hybrid score"),
            xaxis=dict(title=""),
            title=dict(
                text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
                      "guidance lifts coverage but rarely shifts overall score"),
                font_size=13,
            ),
            legend=dict(
                orientation="h", yanchor="bottom", y=-0.18,
                xanchor="center", x=0.5,
            ),
            height=380,
        )
    )
    return fig
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Gradio application | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def create_app() -> gr.Blocks:
    """Assemble and return the complete Gradio Blocks application.

    The leaderboard snapshot is loaded once, here, at construction
    time; every tab and event handler closes over that snapshot, so
    newly finalized submissions appear only after the app is rebuilt
    (NOTE(review): presumably on Space restart -- confirm).

    Tabs: Overall, Taxonomy, Components, Guidance Effect, Depth Gap,
    Submit, Status (including the password-gated admin pipeline),
    and About.
    """
    data = load_data()
    entries = data["entries"]
    # Fast agent_id -> entry lookup for the dropdown callbacks below.
    by_id = {e["agent_id"]: e for e in entries}
    # Build dropdown choices: (display_label, agent_id).  Labels carry
    # a per-submission-type icon and the run mode (em dash if absent).
    agent_choices = []
    for e in entries:
        sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"])
        icon = sty["icon"]
        mode = e.get("mode") or "\u2014"
        label = f"{icon} {e['agent_name']} ({mode})".strip()
        agent_choices.append((label, e["agent_id"]))

    # Safe index helper: clamps idx to the last available choice and
    # returns that agent_id.  Assumes at least one entry exists
    # (NOTE(review): raises IndexError on an empty leaderboard).
    def _choice_val(idx: int) -> str:
        return agent_choices[min(idx, len(agent_choices) - 1)][1]

    with gr.Blocks(
        theme=gr.themes.Soft(primary_hue="blue"),
        css=CUSTOM_CSS,
        js=FORCE_LIGHT_JS,
    ) as app:
        gr.HTML(build_header(data["last_updated"], len(entries)))
        gr.HTML(build_headline_findings(data.get("headline_findings", [])))
        with gr.Tabs():
            # ---- Tab 1: Overall Leaderboard ----
            with gr.Tab("\U0001f4ca Overall"):
                with gr.Row():
                    f_mode = gr.Dropdown(
                        ["All", "Benchmark", "User"],
                        value="All", label="Mode", scale=1,
                    )
                    f_mcp = gr.Dropdown(
                        ["All", "Reference", "Custom"],
                        value="All", label="MCP Tools", scale=1,
                    )
                    f_type = gr.Dropdown(
                        ["All Entries", "LLM Only", "Baselines Only"],
                        value="All Entries", label="Show", scale=1,
                    )
                tbl = gr.HTML(
                    build_leaderboard_table(
                        entries, "All", "All", "All Entries"
                    )
                )

                # Re-render the table whenever any of the three
                # filter dropdowns changes.
                def _update_table(m, mc, t):
                    return build_leaderboard_table(entries, m, mc, t)
                for dd in [f_mode, f_mcp, f_type]:
                    dd.change(
                        _update_table, [f_mode, f_mcp, f_type], tbl
                    )
            # ---- Tab 2: Taxonomy Breakdown ----
            with gr.Tab("\U0001f9ec Taxonomy"):
                tax_dd = gr.Dropdown(
                    agent_choices,
                    value=_choice_val(0),
                    label="Select Agent",
                )
                hm_html = gr.HTML(build_heatmap(entries[0]))
                tax_plot = gr.Plot(chart_taxonomy_bar(entries[0]))

                # Swap heatmap + bar chart to the selected agent;
                # unknown ids fall back to the first entry.
                def _update_taxonomy(aid):
                    e = by_id.get(aid, entries[0])
                    return build_heatmap(e), chart_taxonomy_bar(e)
                tax_dd.change(
                    _update_taxonomy, [tax_dd], [hm_html, tax_plot]
                )
            # ---- Tab 3: Component Analysis ----
            with gr.Tab("\U0001f3af Components"):
                with gr.Row():
                    c1 = gr.Dropdown(
                        agent_choices, value=_choice_val(0),
                        label="Agent 1", scale=1,
                    )
                    c2 = gr.Dropdown(
                        agent_choices, value=_choice_val(4),
                        label="Agent 2", scale=1,
                    )
                with gr.Row():
                    radar = gr.Plot(
                        chart_radar(
                            entries[0],
                            entries[min(4, len(entries) - 1)],
                        )
                    )
                    comp_bar = gr.Plot(
                        chart_component_bar(
                            entries[0],
                            entries[min(4, len(entries) - 1)],
                        )
                    )

                # NOTE(review): the two fallbacks differ (entries[0]
                # vs entries[-1]); only reachable if an id is missing.
                def _update_comp(a1, a2):
                    e1 = by_id.get(a1, entries[0])
                    e2 = by_id.get(a2, entries[-1])
                    return chart_radar(e1, e2), chart_component_bar(e1, e2)
                for dd in [c1, c2]:
                    dd.change(_update_comp, [c1, c2], [radar, comp_bar])
            # ---- Tab 4: Benchmark vs User (coverage-depth dissociation) ----
            with gr.Tab("\u26a1 Guidance Effect"):
                gr.HTML(
                    '<div style="background:#eff6ff;border-left:4px solid '
                    '#3182ce;border-radius:8px;padding:0.85rem 1.1rem;'
                    'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
                    'line-height:1.55">'
                    '<strong>Mode semantics:</strong> '
                    '<em>Benchmark mode</em> exposes atomic tools without '
                    'pipeline hints (unguided); <em>User mode</em> packages '
                    'them into composite workflows with explicit pipeline '
                    'structure (guided). Guidance lifts the lowest-tier '
                    'agents but does not consistently help capable ones, '
                    'and never closes the depth gap (see <em>Depth Gap</em> '
                    'tab).</div>'
                )
                gr.Plot(chart_mode_comparison(entries))
                gr.HTML(build_mode_cards(entries))
            # ---- Tab 5: Depth Gap (interventions) ----
            with gr.Tab("\U0001f50d Depth Gap"):
                gr.HTML(build_intervention_section(
                    data.get("interventions", {})
                ))
            # ---- Tab 6: Submit (new submission form) ----
            with gr.Tab("\U0001f4e4 Submit"):
                gr.HTML("""
                <div style="max-width:820px;margin:0 auto;padding:1rem">
                    <h2 style="color:#0f172a;margin:0 0 0.5rem;
                               font-weight:700;font-size:1.25rem">
                        Submit your agent</h2>
                    <p style="color:#475569;margin-bottom:1rem;line-height:1.6">
                        BioDesignBench evaluates models inside Romero Lab
                        infrastructure to keep the 76 task specifications
                        contamination-clean. You provide an LLM API key and
                        a model name, and we run the BioDesignBench agent
                        loop against your model with the reference 17-tool
                        MCP server. Task content never leaves Romero Lab
                        except through your chosen LLM provider's API call.
                    </p>
                    <div style="background:#dcfce7;border-left:4px solid #15803d;
                                padding:0.95rem 1.1rem;border-radius:8px;
                                margin-bottom:1rem;font-size:0.86rem;
                                color:#14532d;line-height:1.55">
                        <strong>How your credentials are handled:</strong>
                        <ul style="margin:0.5rem 0 0 1.1rem;padding:0">
                            <li>Your API key is stored on the submission row
                                only between submission and dispatch, then
                                <strong>scrubbed automatically</strong> regardless
                                of whether the run succeeded.</li>
                            <li>Each task carries a unique 16-character canary
                                token (invisible HTML comment) so we can
                                retrospectively detect leakage in published
                                models.</li>
                            <li>The MCP server (reference or custom) sees
                                only operational tool arguments, never the raw
                                task description or evaluation criteria.</li>
                        </ul>
                    </div>
                    <div style="background:#eff6ff;border-left:4px solid #3182ce;
                                padding:0.95rem 1.1rem;border-radius:8px;
                                margin-bottom:1rem;font-size:0.86rem;
                                color:#1e3a8a;line-height:1.55">
                        <strong>Reference vs Custom MCP</strong>
                        <ul style="margin:0.5rem 0 0 1.1rem;padding:0">
                            <li><strong>Reference</strong> (default): your
                                agent uses our hosted
                                <a href="https://github.com/jasonkim8652/protein-design-mcp"
                                   style="color:#1d4ed8;font-weight:600">protein-design-mcp</a>
                                endpoint. Eligible for the reference ranking.</li>
                            <li><strong>Custom</strong>: provide your own
                                public MCP URL implementing the same 17-tool
                                schema. Useful for benchmarking new tool
                                implementations against an identical model
                                under identical task prompts. Tagged with a
                                <code>custom</code> badge.</li>
                        </ul>
                    </div>
                    <div style="background:#fefce8;border-left:3px solid #ca8a04;
                                padding:0.8rem 1rem;border-radius:6px;
                                margin-bottom:1rem;font-size:0.85rem;color:#713f12">
                        <strong>Rate limit:</strong> 1 submission per calendar
                        month per organization. Your LLM-API and (if reference)
                        MCP-GPU costs are billed to your account / paid by
                        Romero Lab respectively; please be considerate.
                    </div>
                </div>""")
                with gr.Column(scale=1):
                    sub_agent = gr.Textbox(
                        label="Agent Name",
                        placeholder="e.g., GPT-5 with reference MCP",
                    )
                    sub_org = gr.Textbox(
                        label="Organization",
                        placeholder="e.g., OpenAI",
                    )
                    with gr.Row():
                        # Dropdown values are the short provider keys
                        # ("anthropic", "openai", ...) the backend expects.
                        sub_provider = gr.Dropdown(
                            choices=[
                                ("Anthropic Claude", "anthropic"),
                                ("OpenAI GPT", "openai"),
                                ("Google Gemini", "google"),
                                ("DeepSeek", "deepseek"),
                            ],
                            value="anthropic",
                            label="LLM Provider",
                        )
                        sub_model = gr.Textbox(
                            label="Model name",
                            placeholder="e.g., claude-sonnet-4-20250514",
                        )
                    sub_api_key = gr.Textbox(
                        label="API key (transient -- scrubbed after dispatch)",
                        placeholder="sk-...",
                        type="password",
                    )
                    sub_desc = gr.Textbox(
                        label="Description (optional)",
                        placeholder="Brief description of your agent...",
                        lines=2,
                    )
                    with gr.Accordion("Advanced: Custom MCP", open=False):
                        sub_custom_mcp_url = gr.Textbox(
                            label="Custom MCP URL (optional)",
                            placeholder="https://your-mcp.example.com/predict",
                        )
                        sub_custom_mcp_token = gr.Textbox(
                            label="Custom MCP bearer token (optional)",
                            placeholder="(empty if your MCP needs no auth)",
                            type="password",
                        )
                    sub_btn = gr.Button(
                        "Submit for Review",
                        variant="primary",
                    )
                    sub_result = gr.HTML()

                # Validate required fields, forward to the eval queue,
                # and render the outcome as an HTML snippet.  eval_queue
                # is imported lazily -- presumably so the UI still loads
                # when queue dependencies are unavailable (confirm).
                def _handle_submit(
                    name, org, provider, model, api_key, desc,
                    custom_mcp_url, custom_mcp_token,
                ):
                    if not name or not org or not model or not api_key:
                        return ('<div style="color:#e53e3e;padding:0.5rem">'
                                "agent name, organization, model name, and "
                                "API key are required.</div>")
                    try:
                        from eval_queue import submit
                        result = submit(
                            agent_name=name,
                            organization=org,
                            provider=provider,
                            model_name=model,
                            api_key=api_key,
                            description=desc,
                            custom_mcp_url=custom_mcp_url or "",
                            custom_mcp_token=custom_mcp_token or "",
                        )
                        if "error" in result:
                            return (f'<div style="color:#e53e3e;padding:0.5rem">'
                                    f'{result["error"]}</div>')
                        mcp_mode = "custom" if custom_mcp_url else "reference"
                        return (
                            f'<div style="background:#c6f6d5;padding:1rem;'
                            f'border-radius:8px;margin-top:0.5rem">'
                            f'<strong>Submitted!</strong> '
                            f'ID: <code>{result["submission_id"]}</code><br>'
                            f'Status: {result["status"]}<br>'
                            f'Provider: <strong>{provider}</strong> '
                            f'/ Model: <strong>{model}</strong><br>'
                            f'MCP mode: <strong>{mcp_mode}</strong><br>'
                            f'Canary: <code>{result.get("canary_token","")}</code><br>'
                            f'{result.get("message", "")}</div>'
                        )
                    except Exception as e:
                        # Truncate to keep the error box readable.
                        return (f'<div style="color:#e53e3e;padding:0.5rem">'
                                f"Error: {str(e)[:200]}</div>")
                sub_btn.click(
                    _handle_submit,
                    [sub_agent, sub_org, sub_provider, sub_model,
                     sub_api_key, sub_desc, sub_custom_mcp_url,
                     sub_custom_mcp_token],
                    sub_result,
                )
            # ---- Tab 7: Status & Admin (pipeline control) ----
            with gr.Tab("\U0001f6e0 Status"):
                gr.HTML("""
                <div style="max-width:800px;margin:0 auto;padding:1rem">
                    <h2 style="color:#0f172a;margin:0 0 0.5rem;
                               font-weight:700;font-size:1.25rem">
                        Submission status</h2>
                    <p style="color:#475569;margin-bottom:0.5rem;line-height:1.6">
                        Check your submission status or manage the pipeline
                        (admin only).</p>
                </div>""")
                # --- Public status check ---
                with gr.Accordion("Check Submission Status", open=True):
                    status_id = gr.Textbox(
                        label="Submission ID",
                        placeholder="Enter your submission ID...",
                    )
                    status_btn = gr.Button("Check Status")
                    status_out = gr.HTML()

                    # Look up one submission and render a status card
                    # (colour-coded by pipeline state).
                    def _check_status(sid):
                        if not sid:
                            return '<div style="color:#718096">Enter an ID above.</div>'
                        try:
                            from eval_queue import get_submission
                            sub = get_submission(sid.strip())
                            if sub is None:
                                return ('<div style="color:#e53e3e">'
                                        "Submission not found.</div>")
                            status_color = {
                                "pending": "#d69e2e", "approved": "#38a169",
                                "dispatching": "#3182ce", "boltz": "#805ad5",
                                "scoring": "#805ad5", "complete": "#38a169",
                                "failed": "#e53e3e", "rejected": "#e53e3e",
                            }.get(sub["status"], "#718096")
                            score_html = ""
                            if sub.get("overall_score") is not None:
                                score_html = (
                                    f'<div style="font-size:1.2rem;'
                                    f'font-weight:700;color:#0f172a;'
                                    f'margin-top:0.5rem">'
                                    f'Score: {sub["overall_score"]:.1f}/100'
                                    f'</div>'
                                )
                            return (
                                f'<div style="background:white;padding:1rem;'
                                f'border-radius:8px;border:1px solid #e2e8f0">'
                                f'<strong>{sub["agent_name"]}</strong> '
                                f'({sub["organization"]})<br>'
                                f'Status: <span style="color:{status_color};'
                                f'font-weight:700">{sub["status"]}</span><br>'
                                f'Tasks: {sub.get("tasks_dispatched", 0)}'
                                f'/{sub.get("tasks_total", 76)}<br>'
                                f'Created: {sub.get("created_at", "")[:10]}'
                                f'{score_html}</div>'
                            )
                        except Exception as e:
                            return f'<div style="color:#e53e3e">{e}</div>'
                    status_btn.click(_check_status, [status_id], status_out)
                # --- Admin panel (password-protected) ---
                with gr.Accordion("Admin Panel", open=False):
                    admin_pw = gr.Textbox(
                        label="Admin Password", type="password",
                    )
                    admin_auth_btn = gr.Button("Authenticate")
                    # Hidden until the password check succeeds.
                    admin_panel = gr.Column(visible=False)
                    admin_msg = gr.HTML()
                    with admin_panel:
                        gr.HTML('<h3 style="color:#0f172a">'
                                'Pending Submissions</h3>')
                        pending_html = gr.HTML()
                        refresh_btn = gr.Button("Refresh List")
                        with gr.Row():
                            approve_id = gr.Textbox(
                                label="Submission ID to Approve/Reject",
                                scale=2,
                            )
                            approve_btn = gr.Button(
                                "Approve", variant="primary", scale=1,
                            )
                            reject_btn = gr.Button(
                                "Reject", variant="stop", scale=1,
                            )
                        approve_msg = gr.HTML()
                        gr.HTML('<h3 style="color:#0f172a;margin-top:1rem">'
                                'Pipeline Control</h3>')
                        # Four manual pipeline phases, A-D; each takes a
                        # submission ID and reports into pipeline_out.
                        with gr.Row():
                            dispatch_id = gr.Textbox(
                                label="Submission ID", scale=2,
                            )
                            dispatch_btn = gr.Button(
                                "Phase A: Dispatch Tasks", scale=1,
                            )
                        with gr.Row():
                            boltz_id = gr.Textbox(
                                label="Submission ID", scale=2,
                            )
                            boltz_btn = gr.Button(
                                "Phase B: Run Boltz (GPU)", scale=1,
                            )
                        with gr.Row():
                            judge_id = gr.Textbox(
                                label="Submission ID", scale=2,
                            )
                            judge_btn = gr.Button(
                                "Phase C: Run LLM Judge", scale=1,
                            )
                        with gr.Row():
                            final_id = gr.Textbox(
                                label="Submission ID", scale=2,
                            )
                            final_btn = gr.Button(
                                "Phase D: Finalize & Publish", scale=1,
                            )
                        pipeline_out = gr.HTML()

                    # Reveal the admin panel on a correct password.
                    # NOTE(review): plain == comparison; a constant-time
                    # compare (secrets.compare_digest) would be safer.
                    def _admin_auth(pw):
                        if pw == ADMIN_PASSWORD:
                            return (
                                gr.Column(visible=True),
                                '<div style="color:#38a169">'
                                'Authenticated.</div>',
                            )
                        return (
                            gr.Column(visible=False),
                            '<div style="color:#e53e3e">'
                            'Wrong password.</div>',
                        )
                    admin_auth_btn.click(
                        _admin_auth, [admin_pw],
                        [admin_panel, admin_msg],
                    )

                    # Render the pending-submission queue as an HTML
                    # table (key column shows whether the API key is
                    # still held or already scrubbed).
                    def _refresh_pending():
                        try:
                            from eval_queue import get_pending_submissions
                            pending = get_pending_submissions()
                            if not pending:
                                return "<p>No pending submissions.</p>"
                            rows = []
                            for s in pending:
                                mcp = "custom" if s.get("custom_mcp_url") else "reference"
                                key_state = "set" if s.get("api_key") else "scrubbed"
                                rows.append(
                                    f'<tr><td><code>{s["submission_id"]}</code></td>'
                                    f'<td>{s["agent_name"]}</td>'
                                    f'<td>{s["organization"]}</td>'
                                    f'<td>{s.get("provider","?")}/{s.get("model_name","?")}</td>'
                                    f'<td>{mcp}</td>'
                                    f'<td>{key_state}</td>'
                                    f'<td>{s.get("created_at","")[:10]}</td></tr>'
                                )
                            return (
                                '<table style="width:100%;font-size:0.85rem;'
                                'border-collapse:collapse">'
                                "<tr><th>ID</th><th>Agent</th><th>Org</th>"
                                "<th>Provider/Model</th><th>MCP</th>"
                                "<th>Key</th><th>Date</th></tr>"
                                + "".join(rows) + "</table>"
                            )
                        except Exception as e:
                            return f"<p>Error: {e}</p>"
                    refresh_btn.click(
                        _refresh_pending, [], pending_html,
                    )

                    # Flip a submission to "approved".
                    def _approve_sub(sid):
                        try:
                            from eval_queue import update_status
                            ok = update_status(sid.strip(), "approved")
                            if ok:
                                return (
                                    f'<div style="color:#38a169">'
                                    f'Approved: {sid}</div>'
                                )
                            return (
                                f'<div style="color:#e53e3e">'
                                f'Failed to approve {sid}</div>'
                            )
                        except Exception as e:
                            return f'<div style="color:#e53e3e">{e}</div>'

                    # Flip a submission to "rejected".
                    def _reject_sub(sid):
                        try:
                            from eval_queue import update_status
                            ok = update_status(sid.strip(), "rejected")
                            if ok:
                                return (
                                    f'<div style="color:#d69e2e">'
                                    f'Rejected: {sid}</div>'
                                )
                            return (
                                f'<div style="color:#e53e3e">'
                                f'Failed to reject {sid}</div>'
                            )
                        except Exception as e:
                            return f'<div style="color:#e53e3e">{e}</div>'
                    approve_btn.click(
                        _approve_sub, [approve_id], approve_msg,
                    )
                    reject_btn.click(
                        _reject_sub, [approve_id], approve_msg,
                    )

                    # Phase A: send every task to the submitter's LLM.
                    # Refuses to run twice -- once the API key has been
                    # scrubbed the submission cannot be re-dispatched.
                    def _run_dispatch(sid):
                        try:
                            from eval_queue import get_submission
                            from eval_dispatcher import dispatch_all_tasks
                            sub = get_submission(sid.strip())
                            if sub is None:
                                return ('<div style="color:#e53e3e">'
                                        'Not found</div>')
                            if sub["status"] not in ("approved", "dispatching"):
                                return (
                                    f'<div style="color:#e53e3e">'
                                    f'Cannot dispatch: status='
                                    f'{sub["status"]}</div>'
                                )
                            if not sub.get("api_key"):
                                return (
                                    '<div style="color:#e53e3e">'
                                    'API key already scrubbed -- this '
                                    'submission has already been dispatched. '
                                    'Resubmit if you need to re-run.</div>'
                                )
                            results = dispatch_all_tasks(sid.strip())
                            ok = sum(1 for r in results if r.get("success"))
                            return (
                                f'<div style="color:#38a169">'
                                f'Dispatched: {ok}/{len(results)} tasks '
                                f'succeeded. API key scrubbed.</div>'
                            )
                        except Exception as e:
                            import traceback
                            return (
                                f'<div style="color:#e53e3e">'
                                f'<strong>Dispatch error:</strong> {e}<br>'
                                f'<pre style="font-size:0.7rem">'
                                f'{traceback.format_exc()[:600]}</pre></div>'
                            )

                    # Phase B: Boltz structure re-prediction over the
                    # stored per-task results (mutated in place, then
                    # persisted task by task).
                    def _run_boltz(sid):
                        try:
                            from eval_queue import get_submission
                            from eval_boltz import run_boltz_posteval
                            sub = get_submission(sid.strip())
                            if sub is None:
                                return (
                                    '<div style="color:#e53e3e">'
                                    'Not found</div>'
                                )
                            per_task = json.loads(
                                sub.get("per_task_results", "{}")
                            )
                            if not per_task:
                                return (
                                    '<div style="color:#e53e3e">'
                                    "No task results to process.</div>"
                                )
                            run_boltz_posteval(per_task)
                            from eval_queue import save_task_result
                            for tid, tres in per_task.items():
                                save_task_result(sid.strip(), tid, tres)
                            return (
                                '<div style="color:#38a169">'
                                "Boltz post-assessment complete.</div>"
                            )
                        except Exception as e:
                            return f'<div style="color:#e53e3e">{e}</div>'

                    # Phase C: LLM judge panel; marks the submission
                    # "scoring" first, then persists judged results.
                    def _run_judge(sid):
                        try:
                            import eval_judge as ej
                            from eval_queue import (
                                get_submission, save_task_result, update_status,
                            )
                            sub = get_submission(sid.strip())
                            if sub is None:
                                return ('<div style="color:#e53e3e">'
                                        'Not found</div>')
                            per_task = json.loads(
                                sub.get("per_task_results", "{}")
                            )
                            if not per_task:
                                return ('<div style="color:#e53e3e">'
                                        "No task results to process.</div>")
                            update_status(sid.strip(), "scoring")
                            ej.run_judge_panel(
                                per_task,
                                agent_id=sub.get("agent_name", "unknown"),
                                dry_run=False,
                            )
                            for tid, tres in per_task.items():
                                save_task_result(sid.strip(), tid, tres)
                            # A task counts as judged once it has a
                            # hybrid_total score.
                            n_done = sum(
                                1 for r in per_task.values()
                                if r.get("hybrid_total") is not None
                            )
                            return (
                                f'<div style="color:#38a169">'
                                f"LLM judge complete on {n_done} tasks."
                                "</div>"
                            )
                        except Exception as e:
                            import traceback
                            return (
                                f'<div style="color:#e53e3e">'
                                f'<strong>Judge error:</strong> {e}<br>'
                                f'<pre style="font-size:0.7rem">'
                                f'{traceback.format_exc()[:600]}</pre></div>'
                            )

                    # Phase D: aggregate per-task scores and publish
                    # the final leaderboard numbers.
                    def _run_finalize(sid):
                        try:
                            from eval_queue import (
                                finalize_submission,
                                get_submission,
                            )
                            from eval_scorer import aggregate_scores
                            sub = get_submission(sid.strip())
                            if sub is None:
                                return (
                                    '<div style="color:#e53e3e">'
                                    'Not found</div>'
                                )
                            per_task = json.loads(
                                sub.get("per_task_results", "{}")
                            )
                            agg = aggregate_scores(per_task)
                            finalize_submission(
                                sid.strip(),
                                overall_score=agg["overall_score"],
                                component_scores=agg["component_scores"],
                                taxonomy_scores=agg["taxonomy_scores"],
                            )
                            mode_label = agg.get("scoring_mode", "algo")
                            return (
                                f'<div style="color:#38a169">'
                                f'Finalized! Score: '
                                f'{agg["overall_score"]:.1f} '
                                f'(scoring={mode_label})</div>'
                            )
                        except Exception as e:
                            return f'<div style="color:#e53e3e">{e}</div>'
                    dispatch_btn.click(
                        _run_dispatch, [dispatch_id], pipeline_out,
                    )
                    boltz_btn.click(
                        _run_boltz, [boltz_id], pipeline_out,
                    )
                    judge_btn.click(
                        _run_judge, [judge_id], pipeline_out,
                    )
                    final_btn.click(
                        _run_finalize, [final_id], pipeline_out,
                    )
            # ---- Tab 8: About ----
            with gr.Tab("\u2139\ufe0f About"):
                gr.HTML(build_about())
    return app
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Entry point | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| if __name__ == "__main__": | |
| create_app().launch() | |