Spaces:

RomeroLab-Duke
/

BioDesignBench-Leaderboard

Running

App Files Files Community

Jasonkim8652 committed on Mar 3

Commit

e239859

verified ·

1 Parent(s): 25af141

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +934 -0

app.py ADDED Viewed

	@@ -0,0 +1,934 @@

+"""BioDesignBench Leaderboard — Gradio App for HuggingFace Spaces
+Evaluating LLM Agents on Protein Design via MCP Tools
+Romero Lab, Duke University
+"""
+import json
+from pathlib import Path
+import gradio as gr
+import plotly.graph_objects as go
+# ═══════════════════════════════════════════════════════════════════
+#  Configuration — change these when deploying
+# ═══════════════════════════════════════════════════════════════════
+PAPER_URL = "#"
+GITHUB_URL = "#"
+HF_URL = "#"
+# ═══════════════════════════════════════════════════════════════════
+#  Taxonomy & scoring constants
+# ═══════════════════════════════════════════════════════════════════
+TASK_TYPES = [
+    "de_novo_binder",
+    "sequence_optimization",
+    "de_novo_backbone",
+    "complex_engineering",
+    "conformational_design",
+]
+TASK_TYPE_LABELS = {
+    "de_novo_binder": "De Novo Binder",
+    "sequence_optimization": "Seq Optimization",
+    "de_novo_backbone": "De Novo Backbone",
+    "complex_engineering": "Complex Eng.",
+    "conformational_design": "Conformational",
+}
+BIO_CONTEXTS = ["ab", "enz", "sig", "str", "flu"]
+BIO_CONTEXT_LABELS = {
+    "ab": "Antibody",
+    "enz": "Enzyme",
+    "sig": "Signaling",
+    "str": "Structural",
+    "flu": "Fluorescent",
+}
+VALID_CELLS = {
+    "de_novo_binder": {"ab", "enz", "sig"},
+    "sequence_optimization": {"ab", "enz", "sig", "str", "flu"},
+    "de_novo_backbone": {"str"},
+    "complex_engineering": {"enz", "sig", "str"},
+    "conformational_design": {"enz", "sig", "str", "flu"},
+}
+COMPONENTS = [
+    "approach",
+    "orchestration",
+    "quality",
+    "feasibility",
+    "novelty",
+    "diversity",
+]
+COMP_MAX = {
+    "approach": 20,
+    "orchestration": 15,
+    "quality": 35,
+    "feasibility": 15,
+    "novelty": 5,
+    "diversity": 10,
+}
+TYPE_STYLE = {
+    "llm": {"icon": "", "bg": "#ffffff", "tag": ""},
+    "hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"},
+    "human_expert": {
+        "icon": "\U0001f468\u200d\U0001f52c",
+        "bg": "#ebf4ff",
+        "tag": "baseline",
+    },
+    "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
+}
+# ═══════════════════════════════════════════════════════════════════
+#  Data loading
+# ═══════════════════════════════════════════════════════════════════
+def load_data() -> dict:
+    path = Path(__file__).parent / "leaderboard_data.json"
+    with open(path) as f:
+        return json.load(f)
+# ═══════════════════════════════════════════════════════════════════
+#  Custom CSS
+# ═══════════════════════════════════════════════════════════════════
+CUSTOM_CSS = """
+.gradio-container { max-width: 1200px !important; }
+.gr-padded { padding: 0 !important; }
+"""
+# ═══════════════════════════════════════════════════════════════════
+#  Plotly layout helper
+# ═══════════════════════════════════════════════════════════════════
+def _base_layout(**overrides) -> dict:
+    """Shared Plotly layout defaults, with per-chart overrides."""
+    base = dict(
+        plot_bgcolor="white",
+        paper_bgcolor="white",
+        font=dict(
+            family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748"
+        ),
+        margin=dict(l=40, r=20, t=50, b=40),
+    )
+    base.update(overrides)
+    return base
+# ═══════════════════════════════════════════════════════════════════
+#  HTML builders
+# ═══════════════════════════════════════════════════════════════════
+def build_header(last_updated: str, n_entries: int) -> str:
+    """Return the HTML banner: title, subtitle, link buttons and a status line.
+
+    Args:
+        last_updated: text inserted verbatim after "Last updated:".
+        n_entries: number shown in front of "conditions".
+
+    The three buttons link to the module-level PAPER_URL, GITHUB_URL and
+    HF_URL constants. The "76 tasks" figure is hard-coded in the template.
+    All values are interpolated as-is (no HTML escaping is applied).
+    """
+    # Non-raw f-string: the \U escapes below are emoji code points.
+    return f"""
+    <div style="background:linear-gradient(135deg,#1a365d 0%,#2b6cb0 100%);
+                color:white;padding:2rem;text-align:center;border-radius:12px;
+                margin-bottom:0.5rem">
+      <h1 style="font-size:2rem;margin:0;font-weight:700">
+        \U0001f9ec BioDesignBench Leaderboard</h1>
+      <p style="opacity:0.85;margin:0.3rem 0 0;font-size:1rem">
+        Evaluating LLM Agents on Protein Design via MCP Tools</p>
+      <div style="margin-top:0.6rem;display:flex;justify-content:center;
+                  gap:0.8rem;flex-wrap:wrap">
+        <a href="{PAPER_URL}" target="_blank"
+           style="background:rgba(255,255,255,0.2);color:white;
+                  padding:0.3rem 0.8rem;border-radius:5px;
+                  text-decoration:none;font-size:0.85rem;
+                  font-weight:600">\U0001f4c4 Paper</a>
+        <a href="{GITHUB_URL}" target="_blank"
+           style="background:rgba(255,255,255,0.2);color:white;
+                  padding:0.3rem 0.8rem;border-radius:5px;
+                  text-decoration:none;font-size:0.85rem;
+                  font-weight:600">\U0001f4bb GitHub</a>
+        <a href="{HF_URL}" target="_blank"
+           style="background:rgba(255,255,255,0.2);color:white;
+                  padding:0.3rem 0.8rem;border-radius:5px;
+                  text-decoration:none;font-size:0.85rem;
+                  font-weight:600">\U0001f917 HuggingFace</a>
+      </div>
+      <div style="font-size:0.8rem;opacity:0.6;margin-top:0.5rem">
+        Romero Lab, Duke University &middot; Last updated: {last_updated}
+        &middot; 76 tasks &middot; {n_entries} conditions</div>
+    </div>"""
+# ── Score styling helpers ──
+def _score_color(s: float) -> str:
+    if s >= 50:
+        return "#38a169"
+    if s >= 25:
+        return "#d69e2e"
+    return "#e53e3e"
+def _bar_bg(s: float) -> str:
+    if s >= 50:
+        return "rgba(56,161,105,0.15)"
+    if s >= 25:
+        return "rgba(214,158,46,0.15)"
+    return "rgba(229,62,62,0.12)"
+def _heat_color(val, max_val=95) -> str:
+    if val is None:
+        return "#f7fafc"
+    r = val / max_val
+    if r >= 0.7:
+        return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})"
+    if r >= 0.4:
+        return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})"
+    return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})"
+# ── Tab 1: Overall leaderboard table ──
+def build_leaderboard_table(
+    entries: list, mode_f: str, mcp_f: str, type_f: str
+) -> str:
+    """Generate the mixed-ranking HTML table with inline styles."""
+    # Filter
+    filtered = []
+    for e in entries:
+        st = e["submission_type"]
+        if mode_f != "All" and st == "llm":
+            if (e.get("mode") or "").lower() != mode_f.lower():
+                continue
+        if mcp_f == "Reference" and e.get("mcp_custom"):
+            continue
+        if mcp_f == "Custom" and not e.get("mcp_custom"):
+            continue
+        if type_f == "LLM Only" and st != "llm":
+            continue
+        if type_f == "Baselines Only" and st == "llm":
+            continue
+        filtered.append(e)
+    filtered.sort(key=lambda x: x["overall_score"], reverse=True)
+    # Shared cell styles
+    TD = (
+        "padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;"
+        "font-size:0.9rem"
+    )
+    TH = (
+        "background:#1a365d;color:white;padding:0.75rem 1rem;"
+        "text-align:left;font-size:0.8rem;text-transform:uppercase;"
+        "letter-spacing:0.5px"
+    )
+    rows = []
+    llm_rank = 0
+    for e in filtered:
+        st = e["submission_type"]
+        sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"])
+        is_bl = st != "llm"
+        sc = e["overall_score"]
+        # ── Rank cell ──
+        if is_bl:
+            rank = (
+                f'<td style="{TD};text-align:center;font-size:1.1rem;'
+                f'width:50px">{sty["icon"]}</td>'
+            )
+        else:
+            llm_rank += 1
+            rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get(
+                llm_rank, "#1a365d"
+            )
+            rsize = (
+                "1.1rem"
+                if llm_rank == 1
+                else ("1.05rem" if llm_rank <= 3 else "0.9rem")
+            )
+            rank = (
+                f'<td style="{TD};text-align:center;font-weight:700;'
+                f"color:{rcolor};font-size:{rsize};width:50px\">"
+                f"{llm_rank}</td>"
+            )
+        # ── Name cell ──
+        tag_html = ""
+        if sty["tag"]:
+            tag_html = (
+                ' <span style="font-size:0.7rem;background:#e2e8f0;'
+                "padding:0.1rem 0.4rem;border-radius:3px;color:#4a5568;"
+                f'margin-left:0.3rem;vertical-align:middle">'
+                f'{sty["tag"]}</span>'
+            )
+        icon_pfx = f'{sty["icon"]} ' if sty["icon"] else ""
+        fw = "600" if is_bl else "500"
+        name = (
+            f'<td style="{TD};font-weight:{fw}">'
+            f'{icon_pfx}{e["agent_name"]}{tag_html}</td>'
+        )
+        # ── Organization ──
+        org = f'<td style="{TD}">{e["organization"]}</td>'
+        # ── Mode badge ──
+        if is_bl:
+            mode = f'<td style="{TD};color:#718096">\u2014</td>'
+        elif e.get("mode") == "benchmark":
+            mode = (
+                f'<td style="{TD}"><span style="background:#fed7d7;'
+                "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
+                'font-size:0.75rem;font-weight:600">benchmark</span></td>'
+            )
+        else:
+            mode = (
+                f'<td style="{TD}"><span style="background:#c6f6d5;'
+                "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
+                'font-size:0.75rem;font-weight:600">user</span></td>'
+            )
+        # ── MCP ──
+        if is_bl:
+            mcp = f'<td style="{TD};color:#718096">\u2014</td>'
+        elif e.get("mcp_custom"):
+            mcp = (
+                f'<td style="{TD};color:#38a169;font-weight:700">'
+                "\u2713 custom</td>"
+            )
+        else:
+            mcp = f'<td style="{TD};color:#718096">reference</td>'
+        # ── Score with proportional bar ──
+        scol = _score_color(sc)
+        bbg = _bar_bg(sc)
+        score_cell = (
+            f'<td style="{TD};font-weight:700;font-size:1rem;color:{scol};'
+            f'position:relative;font-variant-numeric:tabular-nums">'
+            f'<div style="position:absolute;left:0;top:0;bottom:0;'
+            f"width:{sc}%;background:{bbg};"
+            f'border-radius:3px"></div>'
+            f'<span style="position:relative">{sc:.1f}</span></td>'
+        )
+        # ── Tasks & zeros ──
+        tc = e.get("tasks_completed", 0)
+        tt = e.get("tasks_total", 76)
+        tasks = f'<td style="{TD}">{tc}/{tt}</td>'
+        zeros = f'<td style="{TD}">{e.get("tasks_with_zero", 0)}</td>'
+        rows.append(
+            f'<tr style="background:{sty["bg"]}">'
+            f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}</tr>"
+        )
+    return f"""
+    <table style="width:100%;border-collapse:collapse;background:white;
+                  border-radius:10px;overflow:hidden;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08)">
+      <thead><tr>
+        <th style="{TH};width:50px">#</th>
+        <th style="{TH}">Agent</th>
+        <th style="{TH}">Organization</th>
+        <th style="{TH}">Mode</th>
+        <th style="{TH}">MCP</th>
+        <th style="{TH}">Score</th>
+        <th style="{TH}">Tasks</th>
+        <th style="{TH}">Zero-Score</th>
+      </tr></thead>
+      <tbody>{''.join(rows)}</tbody>
+    </table>"""
+# ── Tab 2: Taxonomy heatmap ──
+def build_heatmap(entry: dict) -> str:
+    """HTML heatmap table for one agent across 17 taxonomy cells."""
+    ts = entry.get("taxonomy_scores", {})
+    TH = (
+        "background:#1a365d;color:white;padding:0.6rem 0.8rem;"
+        "text-align:center;font-size:0.75rem"
+    )
+    TD = (
+        "text-align:center;padding:0.5rem;font-size:0.85rem;"
+        "font-weight:600;border-bottom:1px solid #e2e8f0"
+    )
+    rows = []
+    for tt in TASK_TYPES:
+        cells = [
+            f'<td style="{TD};text-align:left;font-weight:600;'
+            f'background:#f8fafc">{TASK_TYPE_LABELS[tt]}</td>'
+        ]
+        vals = []
+        for bc in BIO_CONTEXTS:
+            if bc in VALID_CELLS[tt]:
+                val = ts.get(tt, {}).get(bc)
+                bg = _heat_color(val)
+                text = f"{val:.0f}" if val is not None else "\u2014"
+                cells.append(f'<td style="{TD};background:{bg}">{text}</td>')
+                if val is not None:
+                    vals.append(val)
+            else:
+                cells.append(
+                    f'<td style="{TD};color:#cbd5e0;font-weight:400">'
+                    "\u2014</td>"
+                )
+        avg = sum(vals) / len(vals) if vals else 0
+        avg_bg = _heat_color(avg)
+        cells.append(
+            f'<td style="{TD};font-weight:700;background:{avg_bg}">'
+            f"{avg:.1f}</td>"
+        )
+        rows.append(f'<tr>{"".join(cells)}</tr>')
+    bc_headers = "".join(
+        f'<th style="{TH}">{BIO_CONTEXT_LABELS[bc]}</th>'
+        for bc in BIO_CONTEXTS
+    )
+    return f"""
+    <table style="width:100%;border-collapse:collapse;background:white;
+                  border-radius:10px;overflow:hidden;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08)">
+      <thead><tr>
+        <th style="{TH};text-align:left">Task Type</th>
+        {bc_headers}
+        <th style="{TH}">Avg</th>
+      </tr></thead>
+      <tbody>{''.join(rows)}</tbody>
+    </table>"""
+# ── Tab 4: Mode comparison cards ──
+def build_mode_cards(entries: list) -> str:
+    """Per-LLM cards showing benchmark vs user delta."""
+    by_name: dict[str, dict] = {}
+    for e in entries:
+        if e["submission_type"] != "llm":
+            continue
+        by_name.setdefault(e["agent_name"], {})[e["mode"]] = e
+    ordered = sorted(
+        by_name.items(),
+        key=lambda x: x[1].get("user", {}).get("overall_score", 0),
+        reverse=True,
+    )
+    cards = []
+    for name, modes in ordered:
+        bench = modes.get("benchmark")
+        user = modes.get("user")
+        if not bench or not user:
+            continue
+        delta = user["overall_score"] - bench["overall_score"]
+        pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0
+        lines = [
+            '<div style="display:flex;justify-content:space-between;'
+            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
+            "<span>Benchmark</span>"
+            f'<span style="font-weight:700;color:#e53e3e">'
+            f'{bench["overall_score"]:.1f}</span></div>',
+            '<div style="display:flex;justify-content:space-between;'
+            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
+            "<span>User</span>"
+            f'<span style="font-weight:700;color:#d69e2e">'
+            f'{user["overall_score"]:.1f}</span></div>',
+            '<div style="display:flex;justify-content:space-between;'
+            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
+            "<span>Delta</span>"
+            f'<span style="font-weight:700;color:#38a169">'
+            f"+{delta:.1f} (+{pct:.0f}%)</span></div>",
+        ]
+        for c in COMPONENTS:
+            d = user["component_scores"][c] - bench["component_scores"][c]
+            color = "#38a169" if d >= 0 else "#e53e3e"
+            sign = "+" if d >= 0 else ""
+            lines.append(
+                '<div style="display:flex;justify-content:space-between;'
+                'padding:0.3rem 0;border-bottom:1px solid #e2e8f0;'
+                'font-size:0.85rem">'
+                f'<span style="color:#718096">{c}</span>'
+                f'<span style="font-weight:700;color:{color}">'
+                f"{sign}{d:.1f}</span></div>"
+            )
+        cards.append(
+            '<div style="background:white;border-radius:10px;padding:1.2rem;'
+            'box-shadow:0 1px 3px rgba(0,0,0,0.08)">'
+            f'<h4 style="font-size:0.95rem;color:#1a365d;'
+            f'margin:0 0 0.8rem">{name}</h4>'
+            f'{"".join(lines)}</div>'
+        )
+    return (
+        '<div style="display:grid;grid-template-columns:'
+        'repeat(auto-fit,minmax(250px,1fr));gap:1rem;margin-top:1rem">'
+        f'{"".join(cards)}</div>'
+    )
+# ── Tab 5: About ──
+def build_about() -> str:
+    return """
+    <div style="max-width:900px;margin:0 auto">
+      <div style="background:white;border-radius:10px;padding:2rem;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
+        <h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
+          What is BioDesignBench?</h2>
+        <p style="margin-bottom:0.8rem;color:#2d3748;line-height:1.6">
+          BioDesignBench is the first comprehensive benchmark for evaluating
+          LLM agents on protein design tasks via MCP (Model Context Protocol)
+          tool use. Unlike existing benchmarks that focus on model-only
+          evaluation, BioDesignBench tests the full design loop:
+          <strong>Natural language &rarr; Design &rarr; Evaluate &rarr;
+          Iterate</strong>.</p>
+        <div style="display:grid;grid-template-columns:
+                    repeat(auto-fit,minmax(140px,1fr));gap:1rem;margin:1rem 0">
+          <div style="background:#f7fafc;border-radius:8px;padding:1rem;
+                      text-align:center">
+            <div style="font-size:1.8rem;font-weight:700;color:#3182ce">
+              76</div>
+            <div style="font-size:0.8rem;color:#718096">Design Tasks</div>
+          </div>
+          <div style="background:#f7fafc;border-radius:8px;padding:1rem;
+                      text-align:center">
+            <div style="font-size:1.8rem;font-weight:700;color:#3182ce">
+              17</div>
+            <div style="font-size:0.8rem;color:#718096">Taxonomy Cells</div>
+          </div>
+          <div style="background:#f7fafc;border-radius:8px;padding:1rem;
+                      text-align:center">
+            <div style="font-size:1.8rem;font-weight:700;color:#3182ce">
+              17</div>
+            <div style="font-size:0.8rem;color:#718096">MCP Tools</div>
+          </div>
+          <div style="background:#f7fafc;border-radius:8px;padding:1rem;
+                      text-align:center">
+            <div style="font-size:1.8rem;font-weight:700;color:#3182ce">
+              100</div>
+            <div style="font-size:0.8rem;color:#718096">Point Rubric</div>
+          </div>
+        </div>
+      </div>
+      <div style="background:white;border-radius:10px;padding:2rem;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
+        <h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
+          How to Submit</h2>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          1. Build Your Agent</h3>
+        <p style="margin-bottom:0.8rem;color:#2d3748">
+          Create a protein design agent that accepts tasks via our API spec.
+          You may use our 17 reference MCP tools as-is, modify them, or build
+          entirely custom tools.</p>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          2. Host as API Endpoint</h3>
+        <p style="margin-bottom:0.8rem;color:#2d3748">
+          Your agent must be accessible as a POST endpoint that accepts task
+          descriptions and returns designed sequences.</p>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          API Specification</h3>
+        <pre style="background:#1a202c;color:#e2e8f0;padding:1rem;
+                    border-radius:8px;font-size:0.8rem;overflow-x:auto;
+                    line-height:1.5">POST /evaluate
+Input:
+{
+  "task_id": "dnb_sig_001",
+  "task_description": "Design a de novo binder for...",
+  "available_tools": [...],
+  "max_steps": 50,
+  "timeout_sec": 300
+}
+Output:
+{
+  "sequences": ["MKKL..."],
+  "run_log": [...],
+  "total_steps": 12,
+  "total_time_sec": 142.5
+}</pre>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          3. Submit &amp; Evaluate</h3>
+        <p style="margin-bottom:0.8rem;color:#2d3748">
+          We run 73 hidden tasks against your endpoint. Results are
+          independently verified with AlphaFold2.
+          Maximum <strong>2 submissions per month</strong>.</p>
+        <p style="color:#2d3748">
+          3 example tasks are publicly available for development and
+          testing.</p>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          MCP Reference Tools</h3>
+        <p style="margin-bottom:0.8rem;color:#2d3748">
+          We provide 17 reference MCP tools for protein design. You may use
+          them as-is, modify them, or build entirely custom tools.
+          <a href="#" style="color:#3182ce">GitHub repository &rarr;</a></p>
+        <h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
+          Submission Limits</h3>
+        <ul style="color:#2d3748;padding-left:1.5rem;margin-bottom:0.8rem">
+          <li>Maximum 2 submissions per month</li>
+          <li>Hidden test set (73 tasks) is used for ranking</li>
+          <li>3 example tasks are publicly available for development</li>
+        </ul>
+      </div>
+      <div style="background:white;border-radius:10px;padding:2rem;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
+        <h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
+          Scoring Rubric (100 points)</h2>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Approach (20 pts)</strong> &mdash; Function-based design
+          methodology evaluation across 10 DesignFunctions</p>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Orchestration (15 pts)</strong> &mdash; Pipeline ordering
+          and intermediate validation</p>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Quality (35 pts)</strong> &mdash; 3-tier graduated scoring:
+          structure confidence, interface confidence, interface physics</p>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Feasibility (15 pts)</strong> &mdash; Valid amino acids,
+          length, composition, biophysical checks</p>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Novelty (5 pts)</strong> &mdash; Sequence identity to
+          reference (lower = more novel = better)</p>
+        <p style="margin-bottom:0.5rem;color:#2d3748">
+          <strong>Diversity (10 pts)</strong> &mdash; Number and diversity
+          of generated designs</p>
+      </div>
+      <div style="background:white;border-radius:10px;padding:2rem;
+                  box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
+        <h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
+          Citation</h2>
+        <pre style="background:#1a202c;color:#e2e8f0;padding:1rem;
+                    border-radius:8px;font-size:0.8rem;
+                    line-height:1.5">@article{biodesignbench2026,
+  title={BioDesignBench: Evaluating LLM Agents on
+         Protein Design via MCP Tools},
+  author={Kim, Jason et al.},
+  year={2026}
+}</pre>
+      </div>
+    </div>"""
+# ═══════════════════════════════════════════════════════════════════
+#  Chart builders (Plotly)
+# ═══════════════════════════════════════════════════════════════════
+def chart_taxonomy_bar(entry: dict) -> go.Figure:
+    """Bar chart of average score per task type for one agent."""
+    ts = entry.get("taxonomy_scores", {})
+    avgs = []
+    for tt in TASK_TYPES:
+        vals = [v for v in ts.get(tt, {}).values() if v is not None]
+        avgs.append(sum(vals) / len(vals) if vals else 0)
+    fig = go.Figure(
+        go.Bar(
+            x=[TASK_TYPE_LABELS[t] for t in TASK_TYPES],
+            y=avgs,
+            marker_color="rgba(49,130,206,0.7)",
+            marker_line_width=0,
+            text=[f"{v:.1f}" for v in avgs],
+            textposition="auto",
+        )
+    )
+    mode = entry.get("mode") or "\u2014"
+    fig.update_layout(
+        **_base_layout(
+            title=dict(
+                text=f"{entry['agent_name']} ({mode}) \u2014 Score by Task Type",
+                font_size=14,
+            ),
+            yaxis=dict(range=[0, 100], title="Average Score"),
+            xaxis=dict(title=""),
+            height=300,
+        )
+    )
+    return fig
+def chart_radar(e1: dict, e2: dict) -> go.Figure:
+    """Radar chart comparing two agents' component scores (% of max)."""
+    labels = [c.capitalize() for c in COMPONENTS]
+    def norm(e):
+        return [e["component_scores"][c] / COMP_MAX[c] * 100 for c in COMPONENTS]
+    v1, v2 = norm(e1), norm(e2)
+    m1 = e1.get("mode") or "\u2014"
+    m2 = e2.get("mode") or "\u2014"
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatterpolar(
+            r=v1 + [v1[0]],
+            theta=labels + [labels[0]],
+            fill="toself",
+            name=f'{e1["agent_name"]} ({m1})',
+            line=dict(color="rgba(49,130,206,0.8)"),
+            fillcolor="rgba(49,130,206,0.15)",
+        )
+    )
+    fig.add_trace(
+        go.Scatterpolar(
+            r=v2 + [v2[0]],
+            theta=labels + [labels[0]],
+            fill="toself",
+            name=f'{e2["agent_name"]} ({m2})',
+            line=dict(color="rgba(229,62,62,0.8)"),
+            fillcolor="rgba(229,62,62,0.15)",
+        )
+    )
+    fig.update_layout(
+        **_base_layout(
+            polar=dict(
+                radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%")
+            ),
+            showlegend=True,
+            legend=dict(
+                orientation="h", yanchor="bottom", y=-0.25,
+                xanchor="center", x=0.5,
+            ),
+            title=dict(text="Component Radar (% of max)", font_size=14),
+            height=420,
+        )
+    )
+    return fig
+def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
+    """Horizontal bar chart of raw component scores for two agents."""
+    labels = [f"{c.capitalize()} (/{COMP_MAX[c]})" for c in COMPONENTS]
+    m1 = e1.get("mode") or "\u2014"
+    m2 = e2.get("mode") or "\u2014"
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            y=labels,
+            x=[e1["component_scores"][c] for c in COMPONENTS],
+            name=f'{e1["agent_name"]} ({m1})',
+            orientation="h",
+            marker_color="rgba(49,130,206,0.7)",
+        )
+    )
+    fig.add_trace(
+        go.Bar(
+            y=labels,
+            x=[e2["component_scores"][c] for c in COMPONENTS],
+            name=f'{e2["agent_name"]} ({m2})',
+            orientation="h",
+            marker_color="rgba(229,62,62,0.7)",
+        )
+    )
+    fig.update_layout(
+        **_base_layout(
+            barmode="group",
+            xaxis=dict(title="Score"),
+            title=dict(text="Component Breakdown", font_size=14),
+            legend=dict(
+                orientation="h", yanchor="bottom", y=-0.3,
+                xanchor="center", x=0.5,
+            ),
+            height=420,
+        )
+    )
+    return fig
+def chart_mode_comparison(entries: list) -> go.Figure:
+    """Grouped bar chart: benchmark vs user mode for each LLM."""
+    by_name: dict[str, dict[str, float]] = {}
+    for e in entries:
+        if e["submission_type"] != "llm":
+            continue
+        by_name.setdefault(e["agent_name"], {})[e["mode"]] = e["overall_score"]
+    ordered = sorted(
+        by_name.items(),
+        key=lambda x: x[1].get("user", 0),
+        reverse=True,
+    )
+    names = [n for n, _ in ordered]
+    bench = [m.get("benchmark", 0) for _, m in ordered]
+    user = [m.get("user", 0) for _, m in ordered]
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            x=names, y=bench, name="Benchmark Mode",
+            marker_color="rgba(229,62,62,0.6)",
+        )
+    )
+    fig.add_trace(
+        go.Bar(
+            x=names, y=user, name="User Mode",
+            marker_color="rgba(56,161,105,0.6)",
+        )
+    )
+    fig.update_layout(
+        **_base_layout(
+            barmode="group",
+            yaxis=dict(range=[0, 50], title="Overall Score"),
+            title=dict(
+                text="Benchmark Mode vs User Mode \u2014 Overall Score",
+                font_size=14,
+            ),
+            legend=dict(
+                orientation="h", yanchor="bottom", y=-0.15,
+                xanchor="center", x=0.5,
+            ),
+            height=350,
+        )
+    )
+    return fig
+# ═══════════════════════════════════════════════════════════════════
+#  Gradio application
+# ═══════════════════════════════════════════════════════════════════
+def create_app() -> gr.Blocks:
+    data = load_data()
+    entries = data["entries"]
+    by_id = {e["agent_id"]: e for e in entries}
+    # Build dropdown choices: (display_label, agent_id)
+    agent_choices = []
+    for e in entries:
+        sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"])
+        icon = sty["icon"]
+        mode = e.get("mode") or "\u2014"
+        label = f"{icon} {e['agent_name']} ({mode})".strip()
+        agent_choices.append((label, e["agent_id"]))
+    # Safe index helper
+    def _choice_val(idx: int) -> str:
+        return agent_choices[min(idx, len(agent_choices) - 1)][1]
+    with gr.Blocks() as app:
+        gr.HTML(build_header(data["last_updated"], len(entries)))
+        with gr.Tabs():
+            # ════════ Tab 1: Overall Leaderboard ════════
+            with gr.Tab("\U0001f4ca Overall"):
+                with gr.Row():
+                    f_mode = gr.Dropdown(
+                        ["All", "Benchmark", "User"],
+                        value="All", label="Mode", scale=1,
+                    )
+                    f_mcp = gr.Dropdown(
+                        ["All", "Reference", "Custom"],
+                        value="All", label="MCP Tools", scale=1,
+                    )
+                    f_type = gr.Dropdown(
+                        ["All Entries", "LLM Only", "Baselines Only"],
+                        value="All Entries", label="Show", scale=1,
+                    )
+                tbl = gr.HTML(
+                    build_leaderboard_table(
+                        entries, "All", "All", "All Entries"
+                    )
+                )
+                def _update_table(m, mc, t):
+                    return build_leaderboard_table(entries, m, mc, t)
+                for dd in [f_mode, f_mcp, f_type]:
+                    dd.change(
+                        _update_table, [f_mode, f_mcp, f_type], tbl
+                    )
+            # ════════ Tab 2: Taxonomy Breakdown ════════
+            with gr.Tab("\U0001f9ec Taxonomy"):
+                tax_dd = gr.Dropdown(
+                    agent_choices,
+                    value=_choice_val(0),
+                    label="Select Agent",
+                )
+                hm_html = gr.HTML(build_heatmap(entries[0]))
+                tax_plot = gr.Plot(chart_taxonomy_bar(entries[0]))
+                def _update_taxonomy(aid):
+                    e = by_id.get(aid, entries[0])
+                    return build_heatmap(e), chart_taxonomy_bar(e)
+                tax_dd.change(
+                    _update_taxonomy, [tax_dd], [hm_html, tax_plot]
+                )
+            # ════════ Tab 3: Component Analysis ════════
+            with gr.Tab("\U0001f3af Components"):
+                with gr.Row():
+                    c1 = gr.Dropdown(
+                        agent_choices, value=_choice_val(0),
+                        label="Agent 1", scale=1,
+                    )
+                    c2 = gr.Dropdown(
+                        agent_choices, value=_choice_val(4),
+                        label="Agent 2", scale=1,
+                    )
+                with gr.Row():
+                    radar = gr.Plot(
+                        chart_radar(
+                            entries[0],
+                            entries[min(4, len(entries) - 1)],
+                        )
+                    )
+                    comp_bar = gr.Plot(
+                        chart_component_bar(
+                            entries[0],
+                            entries[min(4, len(entries) - 1)],
+                        )
+                    )
+                def _update_comp(a1, a2):
+                    e1 = by_id.get(a1, entries[0])
+                    e2 = by_id.get(a2, entries[-1])
+                    return chart_radar(e1, e2), chart_component_bar(e1, e2)
+                for dd in [c1, c2]:
+                    dd.change(_update_comp, [c1, c2], [radar, comp_bar])
+            # ════════ Tab 4: Benchmark vs User ════════
+            with gr.Tab("\u26a1 Benchmark vs User"):
+                gr.Plot(chart_mode_comparison(entries))
+                gr.HTML(build_mode_cards(entries))
+            # ════════ Tab 5: About ════════
+            with gr.Tab("\u2139\ufe0f About"):
+                gr.HTML(build_about())
+    return app
+# ═══════════════════════════════════════════════════════════════════
+#  Entry point
+# ═══════════════════════════════════════════════════════════════════
+if __name__ == "__main__":
+    create_app().launch(
+        theme=gr.themes.Soft(primary_hue="blue"),
+        css=CUSTOM_CSS,
+    )