Jasonkim8652 committed on
Commit
e239859
·
verified ·
1 Parent(s): 25af141

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +934 -0
app.py ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """BioDesignBench Leaderboard — Gradio App for HuggingFace Spaces
2
+
3
+ Evaluating LLM Agents on Protein Design via MCP Tools
4
+ Romero Lab, Duke University
5
+ """
6
+
7
+ import json
8
+ from pathlib import Path
9
+
10
+ import gradio as gr
11
+ import plotly.graph_objects as go
12
+
13
+
14
+ # ═══════════════════════════════════════════════════════════════════
15
+ # Configuration — change these when deploying
16
+ # ═══════════════════════════════════════════════════════════════════
17
+
18
# Link targets interpolated into the header buttons by build_header().
# "#" is a placeholder value; replace each with the real URL when deploying.
PAPER_URL = "#"
GITHUB_URL = "#"
HF_URL = "#"
21
+
22
+
23
+ # ═══════════════════════════════════════════════════════════════════
24
+ # Taxonomy & scoring constants
25
+ # ═══════════════════════════════════════════════════════════════════
26
+
27
# Task-type keys, in the order rows and bars are rendered.
TASK_TYPES = [
    "de_novo_binder",
    "sequence_optimization",
    "de_novo_backbone",
    "complex_engineering",
    "conformational_design",
]
# Display label for each task-type key.
TASK_TYPE_LABELS = {
    "de_novo_binder": "De Novo Binder",
    "sequence_optimization": "Seq Optimization",
    "de_novo_backbone": "De Novo Backbone",
    "complex_engineering": "Complex Eng.",
    "conformational_design": "Conformational",
}
# Biological-context keys, in the order heatmap columns are rendered.
BIO_CONTEXTS = ["ab", "enz", "sig", "str", "flu"]
# Display label for each biological-context key.
BIO_CONTEXT_LABELS = {
    "ab": "Antibody",
    "enz": "Enzyme",
    "sig": "Signaling",
    "str": "Structural",
    "flu": "Fluorescent",
}
# Which biological contexts exist for each task type; the heatmap renders
# every other (task type, context) pair as a greyed-out dash.
# NOTE(review): these sets enumerate 16 pairs, while the About text and the
# build_heatmap docstring speak of 17 taxonomy cells — confirm which is right.
VALID_CELLS = {
    "de_novo_binder": {"ab", "enz", "sig"},
    "sequence_optimization": {"ab", "enz", "sig", "str", "flu"},
    "de_novo_backbone": {"str"},
    "complex_engineering": {"enz", "sig", "str"},
    "conformational_design": {"enz", "sig", "str", "flu"},
}
# Scoring-rubric component keys, in display order.
COMPONENTS = [
    "approach",
    "orchestration",
    "quality",
    "feasibility",
    "novelty",
    "diversity",
]
# Maximum points per component (the values sum to 100); used to normalise
# component scores to a percentage and to label the component bar chart.
COMP_MAX = {
    "approach": 20,
    "orchestration": 15,
    "quality": 35,
    "feasibility": 15,
    "novelty": 5,
    "diversity": 10,
}
# Per-submission-type presentation: emoji icon, table-row background colour
# and the small tag shown after the agent name (empty string = no tag).
TYPE_STYLE = {
    "llm": {"icon": "", "bg": "#ffffff", "tag": ""},
    "hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"},
    "human_expert": {
        "icon": "\U0001f468\u200d\U0001f52c",
        "bg": "#ebf4ff",
        "tag": "baseline",
    },
    "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
}
82
+
83
+
84
+ # ═══════════════════════════════════════════════════════════════════
85
+ # Data loading
86
+ # ═══════════════════════════════════════════════════════════════════
87
+
88
+
89
def load_data() -> dict:
    """Load the leaderboard payload stored next to this module.

    Returns:
        The parsed contents of ``leaderboard_data.json``.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = Path(__file__).parent / "leaderboard_data.json"
    # Pin the encoding: JSON text is UTF-8, and relying on the locale default
    # can mis-decode non-ASCII agent or organization names on some hosts.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
93
+
94
+
95
+ # ═══════════════════════════════════════════════════════════════════
96
+ # Custom CSS
97
+ # ═══════════════════════════════════════════════════════════════════
98
+
99
# Extra CSS passed to launch() in the entry point: caps the Gradio container
# at 1200px wide and removes padding from elements with the gr-padded class.
CUSTOM_CSS = """
.gradio-container { max-width: 1200px !important; }
.gr-padded { padding: 0 !important; }
"""
103
+
104
+
105
+ # ═══════════════════════════════════════════════════════════════════
106
+ # Plotly layout helper
107
+ # ═══════════════════════════════════════════════════════════════════
108
+
109
+
110
+ def _base_layout(**overrides) -> dict:
111
+ """Shared Plotly layout defaults, with per-chart overrides."""
112
+ base = dict(
113
+ plot_bgcolor="white",
114
+ paper_bgcolor="white",
115
+ font=dict(
116
+ family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748"
117
+ ),
118
+ margin=dict(l=40, r=20, t=50, b=40),
119
+ )
120
+ base.update(overrides)
121
+ return base
122
+
123
+
124
+ # ═══════════════════════════════════════════════════════════════════
125
+ # HTML builders
126
+ # ═══════════════════════════════════════════════════════════════════
127
+
128
+
129
def build_header(last_updated: str, n_entries: int) -> str:
    """Return the HTML banner shown at the top of the app.

    Args:
        last_updated: Text shown after "Last updated:".
        n_entries: Number shown before "conditions".

    Returns:
        An HTML fragment with the title, subtitle, three link buttons
        (targets taken from PAPER_URL, GITHUB_URL and HF_URL) and a footer
        line.
    """
    # NOTE: the "76 tasks" figure in the footer line is hard-coded in the
    # template; it is not derived from the data.
    return f"""
<div style="background:linear-gradient(135deg,#1a365d 0%,#2b6cb0 100%);
color:white;padding:2rem;text-align:center;border-radius:12px;
margin-bottom:0.5rem">
<h1 style="font-size:2rem;margin:0;font-weight:700">
\U0001f9ec BioDesignBench Leaderboard</h1>
<p style="opacity:0.85;margin:0.3rem 0 0;font-size:1rem">
Evaluating LLM Agents on Protein Design via MCP Tools</p>
<div style="margin-top:0.6rem;display:flex;justify-content:center;
gap:0.8rem;flex-wrap:wrap">
<a href="{PAPER_URL}" target="_blank"
style="background:rgba(255,255,255,0.2);color:white;
padding:0.3rem 0.8rem;border-radius:5px;
text-decoration:none;font-size:0.85rem;
font-weight:600">\U0001f4c4 Paper</a>
<a href="{GITHUB_URL}" target="_blank"
style="background:rgba(255,255,255,0.2);color:white;
padding:0.3rem 0.8rem;border-radius:5px;
text-decoration:none;font-size:0.85rem;
font-weight:600">\U0001f4bb GitHub</a>
<a href="{HF_URL}" target="_blank"
style="background:rgba(255,255,255,0.2);color:white;
padding:0.3rem 0.8rem;border-radius:5px;
text-decoration:none;font-size:0.85rem;
font-weight:600">\U0001f917 HuggingFace</a>
</div>
<div style="font-size:0.8rem;opacity:0.6;margin-top:0.5rem">
Romero Lab, Duke University &middot; Last updated: {last_updated}
&middot; 76 tasks &middot; {n_entries} conditions</div>
</div>"""
160
+
161
+
162
+ # ── Score styling helpers ──
163
+
164
+
165
+ def _score_color(s: float) -> str:
166
+ if s >= 50:
167
+ return "#38a169"
168
+ if s >= 25:
169
+ return "#d69e2e"
170
+ return "#e53e3e"
171
+
172
+
173
+ def _bar_bg(s: float) -> str:
174
+ if s >= 50:
175
+ return "rgba(56,161,105,0.15)"
176
+ if s >= 25:
177
+ return "rgba(214,158,46,0.15)"
178
+ return "rgba(229,62,62,0.12)"
179
+
180
+
181
+ def _heat_color(val, max_val=95) -> str:
182
+ if val is None:
183
+ return "#f7fafc"
184
+ r = val / max_val
185
+ if r >= 0.7:
186
+ return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})"
187
+ if r >= 0.4:
188
+ return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})"
189
+ return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})"
190
+
191
+
192
+ # ── Tab 1: Overall leaderboard table ──
193
+
194
+
195
def build_leaderboard_table(
    entries: list, mode_f: str, mcp_f: str, type_f: str
) -> str:
    """Generate the mixed-ranking HTML table with inline styles.

    Entries are filtered, sorted by ``overall_score`` (highest first) and
    rendered one row each. Only LLM submissions consume a rank number;
    every other submission type is interleaved by score and shows its icon
    in the rank column instead.

    Args:
        entries: Entry dicts. Each must provide ``submission_type``,
            ``overall_score``, ``agent_name`` and ``organization``;
            ``mode``, ``mcp_custom``, ``tasks_completed``, ``tasks_total``
            and ``tasks_with_zero`` are read with defaults.
        mode_f: ``"All"`` or a mode name. Compared case-insensitively and
            applied to LLM entries only, so baselines always pass.
        mcp_f: ``"Reference"`` keeps entries without ``mcp_custom``,
            ``"Custom"`` keeps entries with it; any other value keeps all.
        type_f: ``"LLM Only"`` or ``"Baselines Only"``; any other value
            keeps all submission types.

    Returns:
        The HTML ``<table>`` markup as a string.
    """
    # Function-scope import so the block needs no change to the file header.
    from html import escape

    # Filter
    filtered = []
    for e in entries:
        st = e["submission_type"]
        if mode_f != "All" and st == "llm":
            if (e.get("mode") or "").lower() != mode_f.lower():
                continue
        if mcp_f == "Reference" and e.get("mcp_custom"):
            continue
        if mcp_f == "Custom" and not e.get("mcp_custom"):
            continue
        if type_f == "LLM Only" and st != "llm":
            continue
        if type_f == "Baselines Only" and st == "llm":
            continue
        filtered.append(e)

    filtered.sort(key=lambda x: x["overall_score"], reverse=True)

    # Shared cell styles
    TD = (
        "padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;"
        "font-size:0.9rem"
    )
    TH = (
        "background:#1a365d;color:white;padding:0.75rem 1rem;"
        "text-align:left;font-size:0.8rem;text-transform:uppercase;"
        "letter-spacing:0.5px"
    )

    rows = []
    llm_rank = 0
    for e in filtered:
        st = e["submission_type"]
        # Unknown submission types fall back to the plain LLM styling.
        sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"])
        is_bl = st != "llm"
        sc = e["overall_score"]

        # ── Rank cell ──
        if is_bl:
            rank = (
                f'<td style="{TD};text-align:center;font-size:1.1rem;'
                f'width:50px">{sty["icon"]}</td>'
            )
        else:
            llm_rank += 1
            # Gold / silver / bronze for the top three, navy afterwards.
            rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get(
                llm_rank, "#1a365d"
            )
            rsize = (
                "1.1rem"
                if llm_rank == 1
                else ("1.05rem" if llm_rank <= 3 else "0.9rem")
            )
            rank = (
                f'<td style="{TD};text-align:center;font-weight:700;'
                f"color:{rcolor};font-size:{rsize};width:50px\">"
                f"{llm_rank}</td>"
            )

        # ── Name cell ──
        tag_html = ""
        if sty["tag"]:
            tag_html = (
                ' <span style="font-size:0.7rem;background:#e2e8f0;'
                "padding:0.1rem 0.4rem;border-radius:3px;color:#4a5568;"
                f'margin-left:0.3rem;vertical-align:middle">'
                f'{sty["tag"]}</span>'
            )
        icon_pfx = f'{sty["icon"]} ' if sty["icon"] else ""
        fw = "600" if is_bl else "500"
        # Names and organizations come from the data file; escape them so
        # markup characters render as text instead of being parsed as HTML.
        agent_name = escape(str(e["agent_name"]))
        name = (
            f'<td style="{TD};font-weight:{fw}">'
            f"{icon_pfx}{agent_name}{tag_html}</td>"
        )

        # ── Organization ──
        organization = escape(str(e["organization"]))
        org = f'<td style="{TD}">{organization}</td>'

        # ── Mode badge ──
        if is_bl:
            mode = f'<td style="{TD};color:#718096">\u2014</td>'
        elif e.get("mode") == "benchmark":
            mode = (
                f'<td style="{TD}"><span style="background:#fed7d7;'
                "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
                'font-size:0.75rem;font-weight:600">benchmark</span></td>'
            )
        else:
            mode = (
                f'<td style="{TD}"><span style="background:#c6f6d5;'
                "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
                'font-size:0.75rem;font-weight:600">user</span></td>'
            )

        # ── MCP ──
        if is_bl:
            mcp = f'<td style="{TD};color:#718096">\u2014</td>'
        elif e.get("mcp_custom"):
            mcp = (
                f'<td style="{TD};color:#38a169;font-weight:700">'
                "\u2713 custom</td>"
            )
        else:
            mcp = f'<td style="{TD};color:#718096">reference</td>'

        # ── Score with proportional bar ──
        scol = _score_color(sc)
        bbg = _bar_bg(sc)
        # Keep the bar inside its cell: a CSS width above 100% overflows the
        # column and a negative width is invalid. In-range scores pass
        # through untouched.
        bar_pct = max(min(sc, 100), 0)
        score_cell = (
            f'<td style="{TD};font-weight:700;font-size:1rem;color:{scol};'
            f'position:relative;font-variant-numeric:tabular-nums">'
            f'<div style="position:absolute;left:0;top:0;bottom:0;'
            f"width:{bar_pct}%;background:{bbg};"
            f'border-radius:3px"></div>'
            f'<span style="position:relative">{sc:.1f}</span></td>'
        )

        # ── Tasks & zeros ──
        tc = e.get("tasks_completed", 0)
        tt = e.get("tasks_total", 76)
        tasks = f'<td style="{TD}">{tc}/{tt}</td>'
        zeros = f'<td style="{TD}">{e.get("tasks_with_zero", 0)}</td>'

        rows.append(
            f'<tr style="background:{sty["bg"]}">'
            f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}</tr>"
        )

    return f"""
<table style="width:100%;border-collapse:collapse;background:white;
border-radius:10px;overflow:hidden;
box-shadow:0 1px 3px rgba(0,0,0,0.08)">
<thead><tr>
<th style="{TH};width:50px">#</th>
<th style="{TH}">Agent</th>
<th style="{TH}">Organization</th>
<th style="{TH}">Mode</th>
<th style="{TH}">MCP</th>
<th style="{TH}">Score</th>
<th style="{TH}">Tasks</th>
<th style="{TH}">Zero-Score</th>
</tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>"""
344
+
345
+
346
+ # ── Tab 2: Taxonomy heatmap ──
347
+
348
+
349
def build_heatmap(entry: dict) -> str:
    """Render one agent's taxonomy scores as an HTML heatmap table.

    The table has one row per task type and one column per biological
    context, plus a trailing per-row average. Pairs not listed in
    VALID_CELLS are shown as a greyed-out dash; valid pairs without a score
    show a dash on the neutral heat colour and are left out of the average.

    Args:
        entry: Entry dict; ``taxonomy_scores`` is read as
            ``{task_type: {context: score}}`` and may be absent.

    Returns:
        The HTML ``<table>`` markup as a string.
    """
    scores = entry.get("taxonomy_scores", {})
    head_css = (
        "background:#1a365d;color:white;padding:0.6rem 0.8rem;"
        "text-align:center;font-size:0.75rem"
    )
    cell_css = (
        "text-align:center;padding:0.5rem;font-size:0.85rem;"
        "font-weight:600;border-bottom:1px solid #e2e8f0"
    )

    def _render_row(task_type: str) -> str:
        """Build the <tr> for a single task type."""
        tds = [
            f'<td style="{cell_css};text-align:left;font-weight:600;'
            f'background:#f8fafc">{TASK_TYPE_LABELS[task_type]}</td>'
        ]
        present = []
        for ctx in BIO_CONTEXTS:
            if ctx not in VALID_CELLS[task_type]:
                tds.append(
                    f'<td style="{cell_css};color:#cbd5e0;font-weight:400">'
                    "\u2014</td>"
                )
                continue
            score = scores.get(task_type, {}).get(ctx)
            shade = _heat_color(score)
            if score is None:
                shown = "\u2014"
            else:
                shown = f"{score:.0f}"
            tds.append(f'<td style="{cell_css};background:{shade}">{shown}</td>')
            if score is not None:
                present.append(score)

        # A row without any score averages to 0.
        mean = sum(present) / len(present) if present else 0
        mean_shade = _heat_color(mean)
        tds.append(
            f'<td style="{cell_css};font-weight:700;background:{mean_shade}">'
            f"{mean:.1f}</td>"
        )
        return f'<tr>{"".join(tds)}</tr>'

    body_rows = [_render_row(task_type) for task_type in TASK_TYPES]
    context_heads = "".join(
        [
            f'<th style="{head_css}">{BIO_CONTEXT_LABELS[ctx]}</th>'
            for ctx in BIO_CONTEXTS
        ]
    )

    return f"""
<table style="width:100%;border-collapse:collapse;background:white;
border-radius:10px;overflow:hidden;
box-shadow:0 1px 3px rgba(0,0,0,0.08)">
<thead><tr>
<th style="{head_css};text-align:left">Task Type</th>
{context_heads}
<th style="{head_css}">Avg</th>
</tr></thead>
<tbody>{''.join(body_rows)}</tbody>
</table>"""
405
+
406
+
407
+ # ── Tab 4: Mode comparison cards ──
408
+
409
+
410
def build_mode_cards(entries: list) -> str:
    """Build per-LLM cards comparing benchmark-mode and user-mode results.

    LLM entries are grouped by ``agent_name`` and keyed by ``mode``. An
    agent gets a card only when both a ``"benchmark"`` and a ``"user"``
    entry exist. Cards are ordered by user-mode overall score, highest
    first.

    Args:
        entries: Entry dicts providing ``submission_type`` and, for LLM
            entries, ``agent_name``, ``mode``, ``overall_score`` and
            ``component_scores`` (one value per name in COMPONENTS).

    Returns:
        An HTML grid ``<div>`` containing one card per qualifying agent.
    """
    by_name: dict[str, dict] = {}
    for e in entries:
        if e["submission_type"] != "llm":
            continue
        by_name.setdefault(e["agent_name"], {})[e["mode"]] = e

    ordered = sorted(
        by_name.items(),
        key=lambda x: x[1].get("user", {}).get("overall_score", 0),
        reverse=True,
    )

    cards = []
    for name, modes in ordered:
        bench = modes.get("benchmark")
        user = modes.get("user")
        if not bench or not user:
            continue
        delta = user["overall_score"] - bench["overall_score"]
        # Guard the relative change against a zero benchmark score.
        pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0
        # Colour and sign follow the direction of the change, the same way
        # the per-component rows below do. The previous hard-coded "+" and
        # green rendered a regression as e.g. "+-3.0 (+-10%)" in green.
        delta_color = "#38a169" if delta >= 0 else "#e53e3e"

        lines = [
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>Benchmark</span>"
            f'<span style="font-weight:700;color:#e53e3e">'
            f'{bench["overall_score"]:.1f}</span></div>',
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>User</span>"
            f'<span style="font-weight:700;color:#d69e2e">'
            f'{user["overall_score"]:.1f}</span></div>',
            '<div style="display:flex;justify-content:space-between;'
            'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
            "<span>Delta</span>"
            f'<span style="font-weight:700;color:{delta_color}">'
            f"{delta:+.1f} ({pct:+.0f}%)</span></div>",
        ]
        for c in COMPONENTS:
            d = user["component_scores"][c] - bench["component_scores"][c]
            color = "#38a169" if d >= 0 else "#e53e3e"
            sign = "+" if d >= 0 else ""
            lines.append(
                '<div style="display:flex;justify-content:space-between;'
                'padding:0.3rem 0;border-bottom:1px solid #e2e8f0;'
                'font-size:0.85rem">'
                f'<span style="color:#718096">{c}</span>'
                f'<span style="font-weight:700;color:{color}">'
                f"{sign}{d:.1f}</span></div>"
            )

        cards.append(
            '<div style="background:white;border-radius:10px;padding:1.2rem;'
            'box-shadow:0 1px 3px rgba(0,0,0,0.08)">'
            f'<h4 style="font-size:0.95rem;color:#1a365d;'
            f'margin:0 0 0.8rem">{name}</h4>'
            f'{"".join(lines)}</div>'
        )

    return (
        '<div style="display:grid;grid-template-columns:'
        'repeat(auto-fit,minmax(250px,1fr));gap:1rem;margin-top:1rem">'
        f'{"".join(cards)}</div>'
    )
476
+
477
+
478
+ # ── Tab 5: About ──
479
+
480
+
481
def build_about() -> str:
    """Return the static HTML shown in the About tab.

    The markup is a fixed literal with four cards: a benchmark overview with
    headline numbers, submission instructions including the API
    specification, the 100-point scoring rubric, and a BibTeX citation.
    """
    # NOTE(review): the "GitHub repository" link below targets "#" rather
    # than GITHUB_URL, and the task/tool counts are literals — confirm they
    # stay in sync with the data when deploying.
    return """
<div style="max-width:900px;margin:0 auto">

<div style="background:white;border-radius:10px;padding:2rem;
box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
<h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
What is BioDesignBench?</h2>
<p style="margin-bottom:0.8rem;color:#2d3748;line-height:1.6">
BioDesignBench is the first comprehensive benchmark for evaluating
LLM agents on protein design tasks via MCP (Model Context Protocol)
tool use. Unlike existing benchmarks that focus on model-only
evaluation, BioDesignBench tests the full design loop:
<strong>Natural language &rarr; Design &rarr; Evaluate &rarr;
Iterate</strong>.</p>
<div style="display:grid;grid-template-columns:
repeat(auto-fit,minmax(140px,1fr));gap:1rem;margin:1rem 0">
<div style="background:#f7fafc;border-radius:8px;padding:1rem;
text-align:center">
<div style="font-size:1.8rem;font-weight:700;color:#3182ce">
76</div>
<div style="font-size:0.8rem;color:#718096">Design Tasks</div>
</div>
<div style="background:#f7fafc;border-radius:8px;padding:1rem;
text-align:center">
<div style="font-size:1.8rem;font-weight:700;color:#3182ce">
17</div>
<div style="font-size:0.8rem;color:#718096">Taxonomy Cells</div>
</div>
<div style="background:#f7fafc;border-radius:8px;padding:1rem;
text-align:center">
<div style="font-size:1.8rem;font-weight:700;color:#3182ce">
17</div>
<div style="font-size:0.8rem;color:#718096">MCP Tools</div>
</div>
<div style="background:#f7fafc;border-radius:8px;padding:1rem;
text-align:center">
<div style="font-size:1.8rem;font-weight:700;color:#3182ce">
100</div>
<div style="font-size:0.8rem;color:#718096">Point Rubric</div>
</div>
</div>
</div>

<div style="background:white;border-radius:10px;padding:2rem;
box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
<h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
How to Submit</h2>
<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
1. Build Your Agent</h3>
<p style="margin-bottom:0.8rem;color:#2d3748">
Create a protein design agent that accepts tasks via our API spec.
You may use our 17 reference MCP tools as-is, modify them, or build
entirely custom tools.</p>
<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
2. Host as API Endpoint</h3>
<p style="margin-bottom:0.8rem;color:#2d3748">
Your agent must be accessible as a POST endpoint that accepts task
descriptions and returns designed sequences.</p>
<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
API Specification</h3>
<pre style="background:#1a202c;color:#e2e8f0;padding:1rem;
border-radius:8px;font-size:0.8rem;overflow-x:auto;
line-height:1.5">POST /evaluate

Input:
{
"task_id": "dnb_sig_001",
"task_description": "Design a de novo binder for...",
"available_tools": [...],
"max_steps": 50,
"timeout_sec": 300
}

Output:
{
"sequences": ["MKKL..."],
"run_log": [...],
"total_steps": 12,
"total_time_sec": 142.5
}</pre>
<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
3. Submit &amp; Evaluate</h3>
<p style="margin-bottom:0.8rem;color:#2d3748">
We run 73 hidden tasks against your endpoint. Results are
independently verified with AlphaFold2.
Maximum <strong>2 submissions per month</strong>.</p>
<p style="color:#2d3748">
3 example tasks are publicly available for development and
testing.</p>

<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
MCP Reference Tools</h3>
<p style="margin-bottom:0.8rem;color:#2d3748">
We provide 17 reference MCP tools for protein design. You may use
them as-is, modify them, or build entirely custom tools.
<a href="#" style="color:#3182ce">GitHub repository &rarr;</a></p>

<h3 style="color:#2b6cb0;margin:1.2rem 0 0.5rem;font-size:1.05rem">
Submission Limits</h3>
<ul style="color:#2d3748;padding-left:1.5rem;margin-bottom:0.8rem">
<li>Maximum 2 submissions per month</li>
<li>Hidden test set (73 tasks) is used for ranking</li>
<li>3 example tasks are publicly available for development</li>
</ul>
</div>

<div style="background:white;border-radius:10px;padding:2rem;
box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
<h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
Scoring Rubric (100 points)</h2>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Approach (20 pts)</strong> &mdash; Function-based design
methodology evaluation across 10 DesignFunctions</p>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Orchestration (15 pts)</strong> &mdash; Pipeline ordering
and intermediate validation</p>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Quality (35 pts)</strong> &mdash; 3-tier graduated scoring:
structure confidence, interface confidence, interface physics</p>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Feasibility (15 pts)</strong> &mdash; Valid amino acids,
length, composition, biophysical checks</p>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Novelty (5 pts)</strong> &mdash; Sequence identity to
reference (lower = more novel = better)</p>
<p style="margin-bottom:0.5rem;color:#2d3748">
<strong>Diversity (10 pts)</strong> &mdash; Number and diversity
of generated designs</p>
</div>

<div style="background:white;border-radius:10px;padding:2rem;
box-shadow:0 1px 3px rgba(0,0,0,0.08);margin-bottom:1.5rem">
<h2 style="color:#1a365d;margin:0 0 0.8rem;font-size:1.3rem">
Citation</h2>
<pre style="background:#1a202c;color:#e2e8f0;padding:1rem;
border-radius:8px;font-size:0.8rem;
line-height:1.5">@article{biodesignbench2026,
title={BioDesignBench: Evaluating LLM Agents on
Protein Design via MCP Tools},
author={Kim, Jason et al.},
year={2026}
}</pre>
</div>

</div>"""
627
+
628
+
629
+ # ═══════════════════════════════════════════════════════════════════
630
+ # Chart builders (Plotly)
631
+ # ═══════════════════════════════════════════════════════════════════
632
+
633
+
634
def chart_taxonomy_bar(entry: dict) -> go.Figure:
    """Plot one agent's mean score per task type as a bar chart.

    For each task type the mean is taken over every non-``None`` value
    stored under that type in ``entry["taxonomy_scores"]``; a type with no
    values plots as 0.
    """
    scores = entry.get("taxonomy_scores", {})

    def _mean_for(task_type: str):
        """Average the recorded (non-None) scores of one task type."""
        recorded = [v for v in scores.get(task_type, {}).values() if v is not None]
        if recorded:
            return sum(recorded) / len(recorded)
        return 0

    means = [_mean_for(task_type) for task_type in TASK_TYPES]

    bars = go.Bar(
        x=[TASK_TYPE_LABELS[task_type] for task_type in TASK_TYPES],
        y=means,
        marker_color="rgba(49,130,206,0.7)",
        marker_line_width=0,
        text=[f"{m:.1f}" for m in means],
        textposition="auto",
    )
    fig = go.Figure(bars)

    # Entries without a mode show an em dash in the title.
    mode_label = entry.get("mode") or "\u2014"
    heading = f"{entry['agent_name']} ({mode_label}) \u2014 Score by Task Type"
    layout = _base_layout(
        title=dict(text=heading, font_size=14),
        yaxis=dict(range=[0, 100], title="Average Score"),
        xaxis=dict(title=""),
        height=300,
    )
    fig.update_layout(**layout)
    return fig
665
+
666
+
667
def chart_radar(e1: dict, e2: dict) -> go.Figure:
    """Overlay two agents' component scores on a radar chart.

    Each component score is divided by its COMP_MAX entry and expressed as a
    percentage. Both polygons repeat their first point at the end so the
    outline closes.
    """
    axis_names = [component.capitalize() for component in COMPONENTS]

    def _as_percent(agent: dict) -> list:
        """Scale an agent's component scores to percent of the maximum."""
        return [
            agent["component_scores"][component] / COMP_MAX[component] * 100
            for component in COMPONENTS
        ]

    pct_first = _as_percent(e1)
    pct_second = _as_percent(e2)
    mode_first = e1.get("mode") or "\u2014"
    mode_second = e2.get("mode") or "\u2014"

    # (values, legend label, outline colour, fill colour) per agent.
    series = [
        (
            pct_first,
            f'{e1["agent_name"]} ({mode_first})',
            "rgba(49,130,206,0.8)",
            "rgba(49,130,206,0.15)",
        ),
        (
            pct_second,
            f'{e2["agent_name"]} ({mode_second})',
            "rgba(229,62,62,0.8)",
            "rgba(229,62,62,0.15)",
        ),
    ]

    fig = go.Figure()
    for values, legend_name, outline, fill in series:
        fig.add_trace(
            go.Scatterpolar(
                r=[*values, values[0]],
                theta=[*axis_names, axis_names[0]],
                fill="toself",
                name=legend_name,
                line=dict(color=outline),
                fillcolor=fill,
            )
        )

    layout = _base_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%")
        ),
        showlegend=True,
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.25,
            xanchor="center", x=0.5,
        ),
        title=dict(text="Component Radar (% of max)", font_size=14),
        height=420,
    )
    fig.update_layout(**layout)
    return fig
714
+
715
+
716
def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
    """Compare two agents' raw component scores as grouped horizontal bars.

    Axis labels carry each component's maximum from COMP_MAX, e.g.
    ``Quality (/35)``.
    """
    row_labels = [
        f"{component.capitalize()} (/{COMP_MAX[component]})"
        for component in COMPONENTS
    ]
    mode_first = e1.get("mode") or "\u2014"
    mode_second = e2.get("mode") or "\u2014"

    # (agent, legend label, bar colour) for each side of the comparison.
    series = [
        (e1, f'{e1["agent_name"]} ({mode_first})', "rgba(49,130,206,0.7)"),
        (e2, f'{e2["agent_name"]} ({mode_second})', "rgba(229,62,62,0.7)"),
    ]

    fig = go.Figure()
    for agent, legend_name, colour in series:
        fig.add_trace(
            go.Bar(
                y=row_labels,
                x=[agent["component_scores"][component] for component in COMPONENTS],
                name=legend_name,
                orientation="h",
                marker_color=colour,
            )
        )

    layout = _base_layout(
        barmode="group",
        xaxis=dict(title="Score"),
        title=dict(text="Component Breakdown", font_size=14),
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.3,
            xanchor="center", x=0.5,
        ),
        height=420,
    )
    fig.update_layout(**layout)
    return fig
754
+
755
+
756
def chart_mode_comparison(entries: list) -> go.Figure:
    """Plot benchmark-mode vs user-mode overall score for each LLM.

    LLM entries are grouped by ``agent_name``; agents are ordered by their
    user-mode score (highest first). A missing mode plots as 0.

    Args:
        entries: Entry dicts providing ``submission_type`` and, for LLM
            entries, ``agent_name``, ``mode`` and ``overall_score``.

    Returns:
        A grouped bar chart with one benchmark bar and one user bar per
        agent.
    """
    by_name: dict[str, dict[str, float]] = {}
    for e in entries:
        if e["submission_type"] != "llm":
            continue
        by_name.setdefault(e["agent_name"], {})[e["mode"]] = e["overall_score"]

    ordered = sorted(
        by_name.items(),
        key=lambda x: x[1].get("user", 0),
        reverse=True,
    )
    names = [n for n, _ in ordered]
    bench = [m.get("benchmark", 0) for _, m in ordered]
    user = [m.get("user", 0) for _, m in ordered]

    # The axis was pinned to 0-50, which silently clips any bar above 50.
    # Keep that zoomed-in view while every score fits, and fall back to the
    # full 0-100 rubric scale as soon as one does not.
    peak = max(bench + user, default=0)
    y_top = 50 if peak <= 50 else 100

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=names, y=bench, name="Benchmark Mode",
            marker_color="rgba(229,62,62,0.6)",
        )
    )
    fig.add_trace(
        go.Bar(
            x=names, y=user, name="User Mode",
            marker_color="rgba(56,161,105,0.6)",
        )
    )
    fig.update_layout(
        **_base_layout(
            barmode="group",
            yaxis=dict(range=[0, y_top], title="Overall Score"),
            title=dict(
                text="Benchmark Mode vs User Mode \u2014 Overall Score",
                font_size=14,
            ),
            legend=dict(
                orientation="h", yanchor="bottom", y=-0.15,
                xanchor="center", x=0.5,
            ),
            height=350,
        )
    )
    return fig
802
+
803
+
804
+ # ═══════════════════════════════════════════════════════════════════
805
+ # Gradio application
806
+ # ═══════════════════════════════════════════════════════════════════
807
+
808
+
809
def create_app() -> gr.Blocks:
    """Build the Gradio Blocks app with its five tabs.

    Loads the leaderboard data once, then wires up: the filterable overall
    table, the per-agent taxonomy heatmap and bar chart, the two-agent
    component comparison, the benchmark-vs-user view, and the About page.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    data = load_data()
    entries = data["entries"]
    # Lookup used by the dropdown callbacks, which receive an agent_id.
    by_id = {e["agent_id"]: e for e in entries}

    # Build dropdown choices: (display_label, agent_id)
    agent_choices = []
    for e in entries:
        sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"])
        icon = sty["icon"]
        mode = e.get("mode") or "\u2014"
        # strip() removes the leading space left when the icon is empty.
        label = f"{icon} {e['agent_name']} ({mode})".strip()
        agent_choices.append((label, e["agent_id"]))

    # Safe index helper
    # Clamps idx to the last choice so a short entry list still yields a
    # valid default.
    # NOTE(review): an empty entries list still raises IndexError here and
    # at the entries[0] uses below.
    def _choice_val(idx: int) -> str:
        return agent_choices[min(idx, len(agent_choices) - 1)][1]

    with gr.Blocks() as app:

        gr.HTML(build_header(data["last_updated"], len(entries)))

        with gr.Tabs():

            # ════════ Tab 1: Overall Leaderboard ════════
            with gr.Tab("\U0001f4ca Overall"):
                with gr.Row():
                    f_mode = gr.Dropdown(
                        ["All", "Benchmark", "User"],
                        value="All", label="Mode", scale=1,
                    )
                    f_mcp = gr.Dropdown(
                        ["All", "Reference", "Custom"],
                        value="All", label="MCP Tools", scale=1,
                    )
                    f_type = gr.Dropdown(
                        ["All Entries", "LLM Only", "Baselines Only"],
                        value="All Entries", label="Show", scale=1,
                    )

                # Initial render uses the same values as the dropdown
                # defaults above.
                tbl = gr.HTML(
                    build_leaderboard_table(
                        entries, "All", "All", "All Entries"
                    )
                )

                def _update_table(m, mc, t):
                    return build_leaderboard_table(entries, m, mc, t)

                # Any filter change re-renders the table from all three
                # current filter values.
                for dd in [f_mode, f_mcp, f_type]:
                    dd.change(
                        _update_table, [f_mode, f_mcp, f_type], tbl
                    )

            # ════════ Tab 2: Taxonomy Breakdown ════════
            with gr.Tab("\U0001f9ec Taxonomy"):
                tax_dd = gr.Dropdown(
                    agent_choices,
                    value=_choice_val(0),
                    label="Select Agent",
                )
                hm_html = gr.HTML(build_heatmap(entries[0]))
                tax_plot = gr.Plot(chart_taxonomy_bar(entries[0]))

                def _update_taxonomy(aid):
                    # Unknown ids fall back to the first entry.
                    e = by_id.get(aid, entries[0])
                    return build_heatmap(e), chart_taxonomy_bar(e)

                tax_dd.change(
                    _update_taxonomy, [tax_dd], [hm_html, tax_plot]
                )

            # ════════ Tab 3: Component Analysis ════════
            with gr.Tab("\U0001f3af Components"):
                with gr.Row():
                    c1 = gr.Dropdown(
                        agent_choices, value=_choice_val(0),
                        label="Agent 1", scale=1,
                    )
                    # Default comparison partner is the fifth entry,
                    # clamped to the last one when fewer exist.
                    c2 = gr.Dropdown(
                        agent_choices, value=_choice_val(4),
                        label="Agent 2", scale=1,
                    )
                with gr.Row():
                    radar = gr.Plot(
                        chart_radar(
                            entries[0],
                            entries[min(4, len(entries) - 1)],
                        )
                    )
                    comp_bar = gr.Plot(
                        chart_component_bar(
                            entries[0],
                            entries[min(4, len(entries) - 1)],
                        )
                    )

                def _update_comp(a1, a2):
                    # Unknown ids fall back to the first / last entry.
                    e1 = by_id.get(a1, entries[0])
                    e2 = by_id.get(a2, entries[-1])
                    return chart_radar(e1, e2), chart_component_bar(e1, e2)

                for dd in [c1, c2]:
                    dd.change(_update_comp, [c1, c2], [radar, comp_bar])

            # ════════ Tab 4: Benchmark vs User ════════
            with gr.Tab("\u26a1 Benchmark vs User"):
                gr.Plot(chart_mode_comparison(entries))
                gr.HTML(build_mode_cards(entries))

            # ════════ Tab 5: About ════════
            with gr.Tab("\u2139\ufe0f About"):
                gr.HTML(build_about())

    return app
924
+
925
+
926
+ # ═══════════════════════════════════════════════════════════════════
927
+ # Entry point
928
+ # ═══════════════════════════════════════════════════════════════════
929
+
930
if __name__ == "__main__":
    # Build the UI first, then start the server with the blue Soft theme and
    # the custom stylesheet.
    # NOTE(review): passing theme/css to launch() assumes a Gradio release
    # whose launch() accepts them — confirm against the pinned version.
    demo = create_app()
    demo.launch(theme=gr.themes.Soft(primary_hue="blue"), css=CUSTOM_CSS)