Phase C: MCP toggle (reference vs custom) + tighten rate limit to 1/month
Browse files
- Submit form: replace checkbox with explicit 2-option radio
- Add inline pointer to RomeroLab/protein-design-mcp
- About 'How to submit' rewritten with both modes
- Leaderboard MCP column: badges for both reference and custom
- eval_queue.MAX_SUBMISSIONS_PER_MONTH: 2 -> 1 (judge cost guard)
- app.py +100 -36
- eval_queue.py +3 -2
app.py
CHANGED
|
@@ -374,11 +374,16 @@ def build_leaderboard_table(
|
|
| 374 |
mcp = f'<td style="{TD};color:#718096">\u2014</td>'
|
| 375 |
elif e.get("mcp_custom"):
|
| 376 |
mcp = (
|
| 377 |
-
f'<td style="{TD}
|
| 378 |
-
"
|
|
|
|
| 379 |
)
|
| 380 |
else:
|
| 381 |
-
mcp =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
# ── Score with proportional bar ──
|
| 384 |
scol = _score_color(sc)
|
|
@@ -813,13 +818,26 @@ def build_about() -> str:
|
|
| 813 |
<h2 {h2}>How to submit</h2>
|
| 814 |
<h3 {h3}>1. Build your agent</h3>
|
| 815 |
<p {p}>
|
| 816 |
-
Create a protein design agent that
|
| 817 |
-
|
| 818 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
<h3 {h3}>2. Host an API endpoint</h3>
|
| 820 |
<p {p}>
|
| 821 |
Your agent must be accessible as a POST endpoint that accepts
|
| 822 |
-
task payloads and returns designed
|
|
|
|
|
|
|
| 823 |
<h3 {h3}>API specification</h3>
|
| 824 |
<pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
|
| 825 |
border-radius:10px;font-size:0.8rem;overflow-x:auto;
|
|
@@ -830,6 +848,8 @@ Request:
|
|
| 830 |
"task_id": "dnb_ab_001",
|
| 831 |
"task_description": "Design a de novo binder for...",
|
| 832 |
"available_tools": [...],
|
|
|
|
|
|
|
| 833 |
"max_steps": 50,
|
| 834 |
"timeout_sec": 300
|
| 835 |
}}
|
|
@@ -837,30 +857,27 @@ Request:
|
|
| 837 |
Response:
|
| 838 |
{{
|
| 839 |
"sequences": ["MKKL..."],
|
| 840 |
-
"run_log": [...],
|
| 841 |
"total_steps": 12,
|
| 842 |
-
"total_time_sec": 142.5
|
|
|
|
| 843 |
}}</pre>
|
| 844 |
<h3 {h3}>3. Submit and wait</h3>
|
| 845 |
<p {p}>
|
| 846 |
-
We dispatch 73 hidden tasks to your endpoint
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
|
|
|
| 849 |
<p {p}>
|
| 850 |
3 example tasks are publicly available for development and
|
| 851 |
-
testing.</p>
|
| 852 |
-
|
| 853 |
-
<h3 {h3}>Reference MCP tools</h3>
|
| 854 |
-
<p {p}>
|
| 855 |
-
We provide 17 reference MCP tools for protein design. You can use
|
| 856 |
-
them as-is, extend them, or build entirely custom tools.
|
| 857 |
-
<a href="#" style="color:#2563eb;font-weight:500">
|
| 858 |
-
GitHub repository →</a></p>
|
| 859 |
|
| 860 |
<h3 {h3}>Limits</h3>
|
| 861 |
<ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
|
| 862 |
line-height:1.7">
|
| 863 |
-
<li>Maximum
|
| 864 |
<li>73 hidden tasks are used for ranking</li>
|
| 865 |
<li>3 public example tasks are available for development</li>
|
| 866 |
</ul>
|
|
@@ -1273,27 +1290,64 @@ def create_app() -> gr.Blocks:
|
|
| 1273 |
# ────── Tab 5: Submit ──────
|
| 1274 |
with gr.Tab("\U0001f4e4 Submit"):
|
| 1275 |
gr.HTML("""
|
| 1276 |
-
<div style="max-width:
|
| 1277 |
<h2 style="color:#0f172a;margin:0 0 0.5rem;
|
| 1278 |
font-weight:700;font-size:1.25rem">
|
| 1279 |
Submit your agent</h2>
|
| 1280 |
<p style="color:#475569;margin-bottom:1rem;line-height:1.6">
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1286 |
<div style="background:#fefce8;border-left:3px solid #ca8a04;
|
| 1287 |
padding:0.8rem 1rem;border-radius:6px;
|
| 1288 |
margin-bottom:1rem;font-size:0.85rem;color:#713f12">
|
| 1289 |
-
<strong>Rate limit:</strong>
|
| 1290 |
-
month per organization.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1291 |
</div>""")
|
| 1292 |
|
| 1293 |
with gr.Column(scale=1):
|
| 1294 |
sub_agent = gr.Textbox(
|
| 1295 |
label="Agent Name",
|
| 1296 |
-
placeholder="e.g., GPT-5 +
|
| 1297 |
)
|
| 1298 |
sub_org = gr.Textbox(
|
| 1299 |
label="Organization",
|
|
@@ -1308,9 +1362,18 @@ def create_app() -> gr.Blocks:
|
|
| 1308 |
placeholder="Brief description of your agent...",
|
| 1309 |
lines=3,
|
| 1310 |
)
|
| 1311 |
-
|
| 1312 |
-
|
| 1313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1314 |
)
|
| 1315 |
sub_btn = gr.Button(
|
| 1316 |
"Submit for Review",
|
|
@@ -1318,7 +1381,7 @@ def create_app() -> gr.Blocks:
|
|
| 1318 |
)
|
| 1319 |
sub_result = gr.HTML()
|
| 1320 |
|
| 1321 |
-
def _handle_submit(name, org, url, desc,
|
| 1322 |
if not name or not org or not url:
|
| 1323 |
return ('<div style="color:#e53e3e;padding:0.5rem">'
|
| 1324 |
"Please fill in all required fields.</div>")
|
|
@@ -1332,7 +1395,7 @@ def create_app() -> gr.Blocks:
|
|
| 1332 |
organization=org,
|
| 1333 |
endpoint_url=url,
|
| 1334 |
description=desc,
|
| 1335 |
-
mcp_custom=
|
| 1336 |
)
|
| 1337 |
if "error" in result:
|
| 1338 |
return (f'<div style="color:#e53e3e;padding:0.5rem">'
|
|
@@ -1343,6 +1406,7 @@ def create_app() -> gr.Blocks:
|
|
| 1343 |
f'<strong>Submitted!</strong> '
|
| 1344 |
f'ID: <code>{result["submission_id"]}</code><br>'
|
| 1345 |
f'Status: {result["status"]}<br>'
|
|
|
|
| 1346 |
f'{result.get("message", "")}</div>'
|
| 1347 |
)
|
| 1348 |
except Exception as e:
|
|
@@ -1351,7 +1415,7 @@ def create_app() -> gr.Blocks:
|
|
| 1351 |
|
| 1352 |
sub_btn.click(
|
| 1353 |
_handle_submit,
|
| 1354 |
-
[sub_agent, sub_org, sub_url, sub_desc,
|
| 1355 |
sub_result,
|
| 1356 |
)
|
| 1357 |
|
|
|
|
| 374 |
mcp = f'<td style="{TD};color:#718096">\u2014</td>'
|
| 375 |
elif e.get("mcp_custom"):
|
| 376 |
mcp = (
|
| 377 |
+
f'<td style="{TD}"><span style="background:#fef3c7;'
|
| 378 |
+
"color:#92400e;padding:0.15rem 0.55rem;border-radius:4px;"
|
| 379 |
+
'font-size:0.72rem;font-weight:700">custom</span></td>'
|
| 380 |
)
|
| 381 |
else:
|
| 382 |
+
mcp = (
|
| 383 |
+
f'<td style="{TD}"><span style="background:#dbeafe;'
|
| 384 |
+
"color:#1e40af;padding:0.15rem 0.55rem;border-radius:4px;"
|
| 385 |
+
'font-size:0.72rem;font-weight:700">reference</span></td>'
|
| 386 |
+
)
|
| 387 |
|
| 388 |
# ── Score with proportional bar ──
|
| 389 |
scol = _score_color(sc)
|
|
|
|
| 818 |
<h2 {h2}>How to submit</h2>
|
| 819 |
<h3 {h3}>1. Build your agent</h3>
|
| 820 |
<p {p}>
|
| 821 |
+
Create a protein design agent that runs the full plan →
|
| 822 |
+
sample → evaluate → iterate loop on each task. Pick one
|
| 823 |
+
of two MCP options:</p>
|
| 824 |
+
<ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
|
| 825 |
+
line-height:1.7">
|
| 826 |
+
<li><strong>Reference MCP</strong> — connect to our published
|
| 827 |
+
<a href="https://github.com/RomeroLab/protein-design-mcp"
|
| 828 |
+
style="color:#2563eb;font-weight:600">protein-design-mcp</a>
|
| 829 |
+
server (Docker image / Modal endpoint, in progress). Eligible for
|
| 830 |
+
the reference ranking.</li>
|
| 831 |
+
<li><strong>Custom MCP</strong> — bring your own tool
|
| 832 |
+
implementations. Tagged with a <code>custom</code> badge on the
|
| 833 |
+
leaderboard, excluded from the reference ranking.</li>
|
| 834 |
+
</ul>
|
| 835 |
<h3 {h3}>2. Host an API endpoint</h3>
|
| 836 |
<p {p}>
|
| 837 |
Your agent must be accessible as a POST endpoint that accepts
|
| 838 |
+
task payloads and returns designed sequences plus a tool-call
|
| 839 |
+
trace. See <code>biodesignbench-leaderboard/example_server.py</code>
|
| 840 |
+
for a 200-line reference.</p>
|
| 841 |
<h3 {h3}>API specification</h3>
|
| 842 |
<pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
|
| 843 |
border-radius:10px;font-size:0.8rem;overflow-x:auto;
|
|
|
|
| 848 |
"task_id": "dnb_ab_001",
|
| 849 |
"task_description": "Design a de novo binder for...",
|
| 850 |
"available_tools": [...],
|
| 851 |
+
"input_files": {{ "<pdb-name>": "<base64>" }},
|
| 852 |
+
"design_constraints": {{ ... }},
|
| 853 |
"max_steps": 50,
|
| 854 |
"timeout_sec": 300
|
| 855 |
}}
|
|
|
|
| 857 |
Response:
|
| 858 |
{{
|
| 859 |
"sequences": ["MKKL..."],
|
| 860 |
+
"run_log": [{{ "step": 1, "tool": "...", "success": true }}],
|
| 861 |
"total_steps": 12,
|
| 862 |
+
"total_time_sec": 142.5,
|
| 863 |
+
"metrics": {{}}
|
| 864 |
}}</pre>
|
| 865 |
<h3 {h3}>3. Submit and wait</h3>
|
| 866 |
<p {p}>
|
| 867 |
+
We dispatch 73 hidden tasks to your endpoint, run Boltz-2
|
| 868 |
+
structure verification on each design, and score against the
|
| 869 |
+
100-point hybrid rubric (algorithmic + 3-judge LLM panel).
|
| 870 |
+
Maximum <strong>1 submission per month</strong> per
|
| 871 |
+
organization — LLM-judge API costs are paid by Romero
|
| 872 |
+
Lab.</p>
|
| 873 |
<p {p}>
|
| 874 |
3 example tasks are publicly available for development and
|
| 875 |
+
testing your endpoint before submission.</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
|
| 877 |
<h3 {h3}>Limits</h3>
|
| 878 |
<ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
|
| 879 |
line-height:1.7">
|
| 880 |
+
<li>Maximum 1 submission per calendar month per organization</li>
|
| 881 |
<li>73 hidden tasks are used for ranking</li>
|
| 882 |
<li>3 public example tasks are available for development</li>
|
| 883 |
</ul>
|
|
|
|
| 1290 |
# ────── Tab 5: Submit ──────
|
| 1291 |
with gr.Tab("\U0001f4e4 Submit"):
|
| 1292 |
gr.HTML("""
|
| 1293 |
+
<div style="max-width:780px;margin:0 auto;padding:1rem">
|
| 1294 |
<h2 style="color:#0f172a;margin:0 0 0.5rem;
|
| 1295 |
font-weight:700;font-size:1.25rem">
|
| 1296 |
Submit your agent</h2>
|
| 1297 |
<p style="color:#475569;margin-bottom:1rem;line-height:1.6">
|
| 1298 |
+
Host your protein-design agent as an HTTPS endpoint that
|
| 1299 |
+
accepts task payloads and returns designed sequences plus
|
| 1300 |
+
a tool-call trace. The leaderboard will POST each of the
|
| 1301 |
+
73 hidden tasks to your endpoint, run Boltz-2 structure
|
| 1302 |
+
verification, score the rubric, and publish the result.
|
| 1303 |
+
</p>
|
| 1304 |
+
|
| 1305 |
+
<div style="background:#eff6ff;border-left:4px solid #3182ce;
|
| 1306 |
+
padding:0.95rem 1.1rem;border-radius:8px;
|
| 1307 |
+
margin-bottom:1rem;font-size:0.86rem;
|
| 1308 |
+
color:#1e3a8a;line-height:1.55">
|
| 1309 |
+
<strong>Two MCP options — pick one below:</strong>
|
| 1310 |
+
<ul style="margin:0.5rem 0 0 1.1rem;padding:0">
|
| 1311 |
+
<li><strong>Reference MCP</strong> (recommended):
|
| 1312 |
+
connect your agent to our published
|
| 1313 |
+
<a href="https://github.com/RomeroLab/protein-design-mcp"
|
| 1314 |
+
style="color:#1d4ed8;font-weight:600">protein-design-mcp</a>
|
| 1315 |
+
Docker image / Modal endpoint so every submission uses
|
| 1316 |
+
the identical 17-tool reference implementation.
|
| 1317 |
+
Eligible for the <em>reference</em> ranking.
|
| 1318 |
+
</li>
|
| 1319 |
+
<li><strong>Custom MCP</strong>: bring your own tool
|
| 1320 |
+
implementations. Tagged with a <code>custom</code>
|
| 1321 |
+
badge and excluded from the reference ranking. Useful
|
| 1322 |
+
for measuring tool-implementation contributions.
|
| 1323 |
+
</li>
|
| 1324 |
+
</ul>
|
| 1325 |
+
</div>
|
| 1326 |
+
|
| 1327 |
<div style="background:#fefce8;border-left:3px solid #ca8a04;
|
| 1328 |
padding:0.8rem 1rem;border-radius:6px;
|
| 1329 |
margin-bottom:1rem;font-size:0.85rem;color:#713f12">
|
| 1330 |
+
<strong>Rate limit:</strong> 1 submission per calendar
|
| 1331 |
+
month per organization. LLM-judge API costs (~$10/run)
|
| 1332 |
+
are paid by Romero Lab, so please be considerate.
|
| 1333 |
+
You bear your own agent / tool compute costs.
|
| 1334 |
+
</div>
|
| 1335 |
+
|
| 1336 |
+
<p style="color:#475569;font-size:0.85rem;line-height:1.55;
|
| 1337 |
+
margin:0">
|
| 1338 |
+
See
|
| 1339 |
+
<code>biodesignbench-leaderboard/example_server.py</code>
|
| 1340 |
+
in the
|
| 1341 |
+
<a href="https://github.com/RomeroLab/BioDesignBench"
|
| 1342 |
+
style="color:#2563eb;font-weight:500">GitHub repo</a>
|
| 1343 |
+
for a 200-line reference implementation of the endpoint.
|
| 1344 |
+
</p>
|
| 1345 |
</div>""")
|
| 1346 |
|
| 1347 |
with gr.Column(scale=1):
|
| 1348 |
sub_agent = gr.Textbox(
|
| 1349 |
label="Agent Name",
|
| 1350 |
+
placeholder="e.g., GPT-5 + protein-design-mcp",
|
| 1351 |
)
|
| 1352 |
sub_org = gr.Textbox(
|
| 1353 |
label="Organization",
|
|
|
|
| 1362 |
placeholder="Brief description of your agent...",
|
| 1363 |
lines=3,
|
| 1364 |
)
|
| 1365 |
+
sub_mcp_mode = gr.Radio(
|
| 1366 |
+
choices=[
|
| 1367 |
+
("Reference MCP (eligible for ranking)", "reference"),
|
| 1368 |
+
("Custom MCP (own tool implementations)", "custom"),
|
| 1369 |
+
],
|
| 1370 |
+
value="reference",
|
| 1371 |
+
label="MCP tool implementation",
|
| 1372 |
+
info=(
|
| 1373 |
+
"Reference = your agent calls our published "
|
| 1374 |
+
"protein-design-mcp server. Custom = your agent "
|
| 1375 |
+
"uses its own tool implementations."
|
| 1376 |
+
),
|
| 1377 |
)
|
| 1378 |
sub_btn = gr.Button(
|
| 1379 |
"Submit for Review",
|
|
|
|
| 1381 |
)
|
| 1382 |
sub_result = gr.HTML()
|
| 1383 |
|
| 1384 |
+
def _handle_submit(name, org, url, desc, mcp_mode):
|
| 1385 |
if not name or not org or not url:
|
| 1386 |
return ('<div style="color:#e53e3e;padding:0.5rem">'
|
| 1387 |
"Please fill in all required fields.</div>")
|
|
|
|
| 1395 |
organization=org,
|
| 1396 |
endpoint_url=url,
|
| 1397 |
description=desc,
|
| 1398 |
+
mcp_custom=(mcp_mode == "custom"),
|
| 1399 |
)
|
| 1400 |
if "error" in result:
|
| 1401 |
return (f'<div style="color:#e53e3e;padding:0.5rem">'
|
|
|
|
| 1406 |
f'<strong>Submitted!</strong> '
|
| 1407 |
f'ID: <code>{result["submission_id"]}</code><br>'
|
| 1408 |
f'Status: {result["status"]}<br>'
|
| 1409 |
+
f'MCP mode: <strong>{mcp_mode}</strong><br>'
|
| 1410 |
f'{result.get("message", "")}</div>'
|
| 1411 |
)
|
| 1412 |
except Exception as e:
|
|
|
|
| 1415 |
|
| 1416 |
sub_btn.click(
|
| 1417 |
_handle_submit,
|
| 1418 |
+
[sub_agent, sub_org, sub_url, sub_desc, sub_mcp_mode],
|
| 1419 |
sub_result,
|
| 1420 |
)
|
| 1421 |
|
eval_queue.py
CHANGED
|
@@ -3,7 +3,8 @@
|
|
| 3 |
Manages the lifecycle of benchmark submissions:
|
| 4 |
pending → approved → dispatching → boltz → scoring → complete / failed
|
| 5 |
|
| 6 |
-
Rate limiting:
|
|
|
|
| 7 |
|
| 8 |
HF Dataset: RomeroLab-Duke/biodesignbench-submissions (private)
|
| 9 |
Schema: Each row is a submission with per-task results stored as JSON.
|
|
@@ -29,7 +30,7 @@ SUBMISSIONS_DATASET = os.environ.get(
|
|
| 29 |
"RomeroLab-Duke/biodesignbench-submissions",
|
| 30 |
)
|
| 31 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 32 |
-
MAX_SUBMISSIONS_PER_MONTH =
|
| 33 |
|
| 34 |
# Submission status progression
|
| 35 |
VALID_STATUSES = {
|
|
|
|
| 3 |
Manages the lifecycle of benchmark submissions:
|
| 4 |
pending → approved → dispatching → boltz → scoring → complete / failed
|
| 5 |
|
| 6 |
+
Rate limiting: 1 submission per calendar month per organization.
|
| 7 |
+
LLM-judge API costs are paid by Romero Lab, so the limit is intentionally low.
|
| 8 |
|
| 9 |
HF Dataset: RomeroLab-Duke/biodesignbench-submissions (private)
|
| 10 |
Schema: Each row is a submission with per-task results stored as JSON.
|
|
|
|
| 30 |
"RomeroLab-Duke/biodesignbench-submissions",
|
| 31 |
)
|
| 32 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 33 |
+
MAX_SUBMISSIONS_PER_MONTH = 1
|
| 34 |
|
| 35 |
# Submission status progression
|
| 36 |
VALID_STATUSES = {
|