Jasonkim8652 committed on
Commit
af5defe
·
verified ·
1 Parent(s): cfedbc8

Phase C: MCP toggle (reference vs custom) + tighten rate limit to 1/month

Browse files

- Submit form: replace checkbox with explicit 2-option radio
- Add inline pointer to RomeroLab/protein-design-mcp
- About 'How to submit' rewritten with both modes
- Leaderboard MCP column: badges for both reference and custom
- eval_queue.MAX_SUBMISSIONS_PER_MONTH: 2 -> 1 (judge cost guard)

Files changed (2) hide show
  1. app.py +100 -36
  2. eval_queue.py +3 -2
app.py CHANGED
@@ -374,11 +374,16 @@ def build_leaderboard_table(
374
  mcp = f'<td style="{TD};color:#718096">\u2014</td>'
375
  elif e.get("mcp_custom"):
376
  mcp = (
377
- f'<td style="{TD};color:#38a169;font-weight:700">'
378
- "\u2713 custom</td>"
 
379
  )
380
  else:
381
- mcp = f'<td style="{TD};color:#718096">reference</td>'
 
 
 
 
382
 
383
  # ── Score with proportional bar ──
384
  scol = _score_color(sc)
@@ -813,13 +818,26 @@ def build_about() -> str:
813
  <h2 {h2}>How to submit</h2>
814
  <h3 {h3}>1. Build your agent</h3>
815
  <p {p}>
816
- Create a protein design agent that accepts tasks via our API.
817
- You can use our 17 reference MCP tools as-is, extend them, or
818
- build entirely custom tools.</p>
 
 
 
 
 
 
 
 
 
 
 
819
  <h3 {h3}>2. Host an API endpoint</h3>
820
  <p {p}>
821
  Your agent must be accessible as a POST endpoint that accepts
822
- task payloads and returns designed protein sequences.</p>
 
 
823
  <h3 {h3}>API specification</h3>
824
  <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
825
  border-radius:10px;font-size:0.8rem;overflow-x:auto;
@@ -830,6 +848,8 @@ Request:
830
  "task_id": "dnb_ab_001",
831
  "task_description": "Design a de novo binder for...",
832
  "available_tools": [...],
 
 
833
  "max_steps": 50,
834
  "timeout_sec": 300
835
  }}
@@ -837,30 +857,27 @@ Request:
837
  Response:
838
  {{
839
  "sequences": ["MKKL..."],
840
- "run_log": [...],
841
  "total_steps": 12,
842
- "total_time_sec": 142.5
 
843
  }}</pre>
844
  <h3 {h3}>3. Submit and wait</h3>
845
  <p {p}>
846
- We dispatch 73 hidden tasks to your endpoint and verify results
847
- with Boltz structure prediction.
848
- Maximum <strong>2 submissions per month</strong> per organization.</p>
 
 
 
849
  <p {p}>
850
  3 example tasks are publicly available for development and
851
- testing.</p>
852
-
853
- <h3 {h3}>Reference MCP tools</h3>
854
- <p {p}>
855
- We provide 17 reference MCP tools for protein design. You can use
856
- them as-is, extend them, or build entirely custom tools.
857
- <a href="#" style="color:#2563eb;font-weight:500">
858
- GitHub repository &rarr;</a></p>
859
 
860
  <h3 {h3}>Limits</h3>
861
  <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
862
  line-height:1.7">
863
- <li>Maximum 2 submissions per calendar month per organization</li>
864
  <li>73 hidden tasks are used for ranking</li>
865
  <li>3 public example tasks are available for development</li>
866
  </ul>
@@ -1273,27 +1290,64 @@ def create_app() -> gr.Blocks:
1273
  # ══════ Tab 5: Submit ══════
1274
  with gr.Tab("\U0001f4e4 Submit"):
1275
  gr.HTML("""
1276
- <div style="max-width:700px;margin:0 auto;padding:1rem">
1277
  <h2 style="color:#0f172a;margin:0 0 0.5rem;
1278
  font-weight:700;font-size:1.25rem">
1279
  Submit your agent</h2>
1280
  <p style="color:#475569;margin-bottom:1rem;line-height:1.6">
1281
- Submit your protein design agent for benchmarking.
1282
- Your agent must be hosted as a POST endpoint that accepts
1283
- task payloads and returns designed sequences.
1284
- <strong>You bear all LLM and tool costs</strong> &mdash;
1285
- we only run Boltz structure prediction on our side.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1286
  <div style="background:#fefce8;border-left:3px solid #ca8a04;
1287
  padding:0.8rem 1rem;border-radius:6px;
1288
  margin-bottom:1rem;font-size:0.85rem;color:#713f12">
1289
- <strong>Rate limit:</strong> 2 submissions per calendar
1290
- month per organization.</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
1291
  </div>""")
1292
 
1293
  with gr.Column(scale=1):
1294
  sub_agent = gr.Textbox(
1295
  label="Agent Name",
1296
- placeholder="e.g., GPT-5 + Custom MCP Tools",
1297
  )
1298
  sub_org = gr.Textbox(
1299
  label="Organization",
@@ -1308,9 +1362,18 @@ def create_app() -> gr.Blocks:
1308
  placeholder="Brief description of your agent...",
1309
  lines=3,
1310
  )
1311
- sub_mcp = gr.Checkbox(
1312
- label="Uses custom MCP tools (not reference)",
1313
- value=False,
 
 
 
 
 
 
 
 
 
1314
  )
1315
  sub_btn = gr.Button(
1316
  "Submit for Review",
@@ -1318,7 +1381,7 @@ def create_app() -> gr.Blocks:
1318
  )
1319
  sub_result = gr.HTML()
1320
 
1321
- def _handle_submit(name, org, url, desc, mcp):
1322
  if not name or not org or not url:
1323
  return ('<div style="color:#e53e3e;padding:0.5rem">'
1324
  "Please fill in all required fields.</div>")
@@ -1332,7 +1395,7 @@ def create_app() -> gr.Blocks:
1332
  organization=org,
1333
  endpoint_url=url,
1334
  description=desc,
1335
- mcp_custom=mcp,
1336
  )
1337
  if "error" in result:
1338
  return (f'<div style="color:#e53e3e;padding:0.5rem">'
@@ -1343,6 +1406,7 @@ def create_app() -> gr.Blocks:
1343
  f'<strong>Submitted!</strong> '
1344
  f'ID: <code>{result["submission_id"]}</code><br>'
1345
  f'Status: {result["status"]}<br>'
 
1346
  f'{result.get("message", "")}</div>'
1347
  )
1348
  except Exception as e:
@@ -1351,7 +1415,7 @@ def create_app() -> gr.Blocks:
1351
 
1352
  sub_btn.click(
1353
  _handle_submit,
1354
- [sub_agent, sub_org, sub_url, sub_desc, sub_mcp],
1355
  sub_result,
1356
  )
1357
 
 
374
  mcp = f'<td style="{TD};color:#718096">\u2014</td>'
375
  elif e.get("mcp_custom"):
376
  mcp = (
377
+ f'<td style="{TD}"><span style="background:#fef3c7;'
378
+ "color:#92400e;padding:0.15rem 0.55rem;border-radius:4px;"
379
+ 'font-size:0.72rem;font-weight:700">custom</span></td>'
380
  )
381
  else:
382
+ mcp = (
383
+ f'<td style="{TD}"><span style="background:#dbeafe;'
384
+ "color:#1e40af;padding:0.15rem 0.55rem;border-radius:4px;"
385
+ 'font-size:0.72rem;font-weight:700">reference</span></td>'
386
+ )
387
 
388
  # ── Score with proportional bar ──
389
  scol = _score_color(sc)
 
818
  <h2 {h2}>How to submit</h2>
819
  <h3 {h3}>1. Build your agent</h3>
820
  <p {p}>
821
+ Create a protein design agent that runs the full plan &rarr;
822
+ sample &rarr; evaluate &rarr; iterate loop on each task. Pick one
823
+ of two MCP options:</p>
824
+ <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
825
+ line-height:1.7">
826
+ <li><strong>Reference MCP</strong> &mdash; connect to our published
827
+ <a href="https://github.com/RomeroLab/protein-design-mcp"
828
+ style="color:#2563eb;font-weight:600">protein-design-mcp</a>
829
+ server (Docker image / Modal endpoint, in progress). Eligible for
830
+ the reference ranking.</li>
831
+ <li><strong>Custom MCP</strong> &mdash; bring your own tool
832
+ implementations. Tagged with a <code>custom</code> badge on the
833
+ leaderboard, excluded from the reference ranking.</li>
834
+ </ul>
835
  <h3 {h3}>2. Host an API endpoint</h3>
836
  <p {p}>
837
  Your agent must be accessible as a POST endpoint that accepts
838
+ task payloads and returns designed sequences plus a tool-call
839
+ trace. See <code>biodesignbench-leaderboard/example_server.py</code>
840
+ for a 200-line reference.</p>
841
  <h3 {h3}>API specification</h3>
842
  <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem;
843
  border-radius:10px;font-size:0.8rem;overflow-x:auto;
 
848
  "task_id": "dnb_ab_001",
849
  "task_description": "Design a de novo binder for...",
850
  "available_tools": [...],
851
+ "input_files": {{ "<pdb-name>": "<base64>" }},
852
+ "design_constraints": {{ ... }},
853
  "max_steps": 50,
854
  "timeout_sec": 300
855
  }}
 
857
  Response:
858
  {{
859
  "sequences": ["MKKL..."],
860
+ "run_log": [{{ "step": 1, "tool": "...", "success": true }}],
861
  "total_steps": 12,
862
+ "total_time_sec": 142.5,
863
+ "metrics": {{}}
864
  }}</pre>
865
  <h3 {h3}>3. Submit and wait</h3>
866
  <p {p}>
867
+ We dispatch 73 hidden tasks to your endpoint, run Boltz-2
868
+ structure verification on each design, and score against the
869
+ 100-point hybrid rubric (algorithmic + 3-judge LLM panel).
870
+ Maximum <strong>1 submission per month</strong> per
871
+ organization &mdash; LLM-judge API costs are paid by Romero
872
+ Lab.</p>
873
  <p {p}>
874
  3 example tasks are publicly available for development and
875
+ testing your endpoint before submission.</p>
 
 
 
 
 
 
 
876
 
877
  <h3 {h3}>Limits</h3>
878
  <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem;
879
  line-height:1.7">
880
+ <li>Maximum 1 submission per calendar month per organization</li>
881
  <li>73 hidden tasks are used for ranking</li>
882
  <li>3 public example tasks are available for development</li>
883
  </ul>
 
1290
  # ══════ Tab 5: Submit ══════
1291
  with gr.Tab("\U0001f4e4 Submit"):
1292
  gr.HTML("""
1293
+ <div style="max-width:780px;margin:0 auto;padding:1rem">
1294
  <h2 style="color:#0f172a;margin:0 0 0.5rem;
1295
  font-weight:700;font-size:1.25rem">
1296
  Submit your agent</h2>
1297
  <p style="color:#475569;margin-bottom:1rem;line-height:1.6">
1298
+ Host your protein-design agent as an HTTPS endpoint that
1299
+ accepts task payloads and returns designed sequences plus
1300
+ a tool-call trace. The leaderboard will POST each of the
1301
+ 73 hidden tasks to your endpoint, run Boltz-2
1302
+ verification, score the rubric, and publish the result.
1303
+ </p>
1304
+
1305
+ <div style="background:#eff6ff;border-left:4px solid #3182ce;
1306
+ padding:0.95rem 1.1rem;border-radius:8px;
1307
+ margin-bottom:1rem;font-size:0.86rem;
1308
+ color:#1e3a8a;line-height:1.55">
1309
+ <strong>Two MCP options &mdash; pick one below:</strong>
1310
+ <ul style="margin:0.5rem 0 0 1.1rem;padding:0">
1311
+ <li><strong>Reference MCP</strong> (recommended):
1312
+ connect your agent to our published
1313
+ <a href="https://github.com/RomeroLab/protein-design-mcp"
1314
+ style="color:#1d4ed8;font-weight:600">protein-design-mcp</a>
1315
+ Docker image / Modal endpoint so every submission uses
1316
+ the identical 17-tool reference implementation.
1317
+ Eligible for the <em>reference</em> ranking.
1318
+ </li>
1319
+ <li><strong>Custom MCP</strong>: bring your own tool
1320
+ implementations. Tagged with a <code>custom</code>
1321
+ badge and excluded from the reference ranking. Useful
1322
+ for measuring tool-implementation contributions.
1323
+ </li>
1324
+ </ul>
1325
+ </div>
1326
+
1327
  <div style="background:#fefce8;border-left:3px solid #ca8a04;
1328
  padding:0.8rem 1rem;border-radius:6px;
1329
  margin-bottom:1rem;font-size:0.85rem;color:#713f12">
1330
+ <strong>Rate limit:</strong> 1 submission per calendar
1331
+ month per organization. LLM-judge API costs (~$10/run)
1332
+ are paid by Romero Lab, so please be considerate.
1333
+ You bear your own agent / tool compute costs.
1334
+ </div>
1335
+
1336
+ <p style="color:#475569;font-size:0.85rem;line-height:1.55;
1337
+ margin:0">
1338
+ See
1339
+ <code>biodesignbench-leaderboard/example_server.py</code>
1340
+ in the
1341
+ <a href="https://github.com/RomeroLab/BioDesignBench"
1342
+ style="color:#2563eb;font-weight:500">GitHub repo</a>
1343
+ for a 200-line reference implementation of the endpoint.
1344
+ </p>
1345
  </div>""")
1346
 
1347
  with gr.Column(scale=1):
1348
  sub_agent = gr.Textbox(
1349
  label="Agent Name",
1350
+ placeholder="e.g., GPT-5 + protein-design-mcp",
1351
  )
1352
  sub_org = gr.Textbox(
1353
  label="Organization",
 
1362
  placeholder="Brief description of your agent...",
1363
  lines=3,
1364
  )
1365
+ sub_mcp_mode = gr.Radio(
1366
+ choices=[
1367
+ ("Reference MCP (eligible for ranking)", "reference"),
1368
+ ("Custom MCP (own tool implementations)", "custom"),
1369
+ ],
1370
+ value="reference",
1371
+ label="MCP tool implementation",
1372
+ info=(
1373
+ "Reference = your agent calls our published "
1374
+ "protein-design-mcp server. Custom = your agent "
1375
+ "uses its own tool implementations."
1376
+ ),
1377
  )
1378
  sub_btn = gr.Button(
1379
  "Submit for Review",
 
1381
  )
1382
  sub_result = gr.HTML()
1383
 
1384
+ def _handle_submit(name, org, url, desc, mcp_mode):
1385
  if not name or not org or not url:
1386
  return ('<div style="color:#e53e3e;padding:0.5rem">'
1387
  "Please fill in all required fields.</div>")
 
1395
  organization=org,
1396
  endpoint_url=url,
1397
  description=desc,
1398
+ mcp_custom=(mcp_mode == "custom"),
1399
  )
1400
  if "error" in result:
1401
  return (f'<div style="color:#e53e3e;padding:0.5rem">'
 
1406
  f'<strong>Submitted!</strong> '
1407
  f'ID: <code>{result["submission_id"]}</code><br>'
1408
  f'Status: {result["status"]}<br>'
1409
+ f'MCP mode: <strong>{mcp_mode}</strong><br>'
1410
  f'{result.get("message", "")}</div>'
1411
  )
1412
  except Exception as e:
 
1415
 
1416
  sub_btn.click(
1417
  _handle_submit,
1418
+ [sub_agent, sub_org, sub_url, sub_desc, sub_mcp_mode],
1419
  sub_result,
1420
  )
1421
 
eval_queue.py CHANGED
@@ -3,7 +3,8 @@
3
  Manages the lifecycle of benchmark submissions:
4
  pending → approved → dispatching → boltz → scoring → complete / failed
5
 
6
- Rate limiting: 2 submissions per calendar month per organization.
 
7
 
8
  HF Dataset: RomeroLab-Duke/biodesignbench-submissions (private)
9
  Schema: Each row is a submission with per-task results stored as JSON.
@@ -29,7 +30,7 @@ SUBMISSIONS_DATASET = os.environ.get(
29
  "RomeroLab-Duke/biodesignbench-submissions",
30
  )
31
  HF_TOKEN = os.environ.get("HF_TOKEN")
32
- MAX_SUBMISSIONS_PER_MONTH = 2
33
 
34
  # Submission status progression
35
  VALID_STATUSES = {
 
3
  Manages the lifecycle of benchmark submissions:
4
  pending → approved → dispatching → boltz → scoring → complete / failed
5
 
6
+ Rate limiting: 1 submission per calendar month per organization.
7
+ LLM-judge API costs are paid by Romero Lab, so the limit is intentionally low.
8
 
9
  HF Dataset: RomeroLab-Duke/biodesignbench-submissions (private)
10
  Schema: Each row is a submission with per-task results stored as JSON.
 
30
  "RomeroLab-Duke/biodesignbench-submissions",
31
  )
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
33
+ MAX_SUBMISSIONS_PER_MONTH = 1
34
 
35
  # Submission status progression
36
  VALID_STATUSES = {