Spaces:

RomeroLab-Duke
/

BioDesignBench-Leaderboard

Running

App Files Files Community

Jasonkim8652 committed on Mar 3

Commit

6205b94

verified ·

1 Parent(s): eecaec9

feat: add submission & scoring infrastructure (eval_scorer, dispatcher, boltz, queue, tasks) + fix gradio 5.x for Python 3.13

Browse files

Files changed (9) hide show

app.py +407 -1
eval_boltz.py +272 -0
eval_dispatcher.py +361 -0
eval_queue.py +312 -0
eval_scorer.py +1643 -0
eval_tasks.py +236 -0
example_server.py +205 -0
mcp_tool_schemas.json +468 -0
requirements.txt +6 -1

app.py CHANGED Viewed

@@ -2,14 +2,26 @@
 Evaluating LLM Agents on Protein Design via MCP Tools
 Romero Lab, Duke University
 """
 import json
 from pathlib import Path
 import gradio as gr
 import plotly.graph_objects as go
 # ═══════════════════════════════════════════════════════════════════
 #  Configuration — change these when deploying
@@ -916,7 +928,401 @@ def create_app() -> gr.Blocks:
                 gr.Plot(chart_mode_comparison(entries))
                 gr.HTML(build_mode_cards(entries))
-            # ════════ Tab 5: About ════════
             with gr.Tab("\u2139\ufe0f About"):
                 gr.HTML(build_about())

 Evaluating LLM Agents on Protein Design via MCP Tools
 Romero Lab, Duke University
+Tabs:
+  1. Overall Leaderboard
+  2. Taxonomy Breakdown
+  3. Component Analysis
+  4. Benchmark vs User
+  5. Submit (new submission form)
+  6. Status & Admin (password-protected pipeline control)
+  7. About
 """
 import json
+import os
 from pathlib import Path
 import gradio as gr
 import plotly.graph_objects as go
+# Password checked by the admin panel; read from the BDB_ADMIN_PASSWORD
+# environment variable.
+# NOTE(review): the fallback value below is a hard-coded credential that is
+# visible to anyone who can read this file -- set BDB_ADMIN_PASSWORD as a
+# deployment secret and consider removing the default.
+ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026")
 # ═══════════════════════════════════════════════════════════════════
 #  Configuration — change these when deploying
                 gr.Plot(chart_mode_comparison(entries))
                 gr.HTML(build_mode_cards(entries))
+            # ══════ Tab 5: Submit ══════
+            with gr.Tab("\U0001f4e4 Submit"):
+                gr.HTML("""
+                <div style="max-width:700px;margin:0 auto;padding:1rem">
+                  <h2 style="color:#1a365d;margin:0 0 0.5rem">
+                    Submit Your Agent</h2>
+                  <p style="color:#4a5568;margin-bottom:1rem;line-height:1.5">
+                    Submit your protein design agent for benchmarking.
+                    Your agent must be hosted as a POST endpoint that accepts
+                    task descriptions and returns designed sequences.
+                    <strong>You bear all LLM and MCP tool costs</strong>;
+                    we only run Boltz structure prediction on our end.</p>
+                  <div style="background:#fefcbf;border-left:4px solid #d69e2e;
+                              padding:0.8rem;border-radius:4px;margin-bottom:1rem;
+                              font-size:0.85rem;color:#744210">
+                    <strong>Rate limit:</strong> 2 submissions per calendar
+                    month per organization.</div>
+                </div>""")
+                with gr.Column(scale=1):
+                    sub_agent = gr.Textbox(
+                        label="Agent Name",
+                        placeholder="e.g., GPT-5 + Custom MCP Tools",
+                    )
+                    sub_org = gr.Textbox(
+                        label="Organization",
+                        placeholder="e.g., OpenAI",
+                    )
+                    sub_url = gr.Textbox(
+                        label="Endpoint URL",
+                        placeholder="https://your-server.com/api/run",
+                    )
+                    sub_desc = gr.Textbox(
+                        label="Description (optional)",
+                        placeholder="Brief description of your agent...",
+                        lines=3,
+                    )
+                    sub_mcp = gr.Checkbox(
+                        label="Uses custom MCP tools (not reference)",
+                        value=False,
+                    )
+                    sub_btn = gr.Button(
+                        "Submit for Review",
+                        variant="primary",
+                    )
+                    sub_result = gr.HTML()
+                def _handle_submit(name, org, url, desc, mcp):
+                    if not name or not org or not url:
+                        return ('<div style="color:#e53e3e;padding:0.5rem">'
+                                "Please fill in all required fields.</div>")
+                    if not url.startswith(("http://", "https://")):
+                        return ('<div style="color:#e53e3e;padding:0.5rem">'
+                                "URL must start with http:// or https://</div>")
+                    try:
+                        from eval_queue import submit
+                        result = submit(
+                            agent_name=name,
+                            organization=org,
+                            endpoint_url=url,
+                            description=desc,
+                            mcp_custom=mcp,
+                        )
+                        if "error" in result:
+                            return (f'<div style="color:#e53e3e;padding:0.5rem">'
+                                    f'{result["error"]}</div>')
+                        return (
+                            f'<div style="background:#c6f6d5;padding:1rem;'
+                            f'border-radius:8px;margin-top:0.5rem">'
+                            f'<strong>Submitted!</strong> '
+                            f'ID: <code>{result["submission_id"]}</code><br>'
+                            f'Status: {result["status"]}<br>'
+                            f'{result.get("message", "")}</div>'
+                        )
+                    except Exception as e:
+                        return (f'<div style="color:#e53e3e;padding:0.5rem">'
+                                f"Error: {str(e)[:200]}</div>")
+                sub_btn.click(
+                    _handle_submit,
+                    [sub_agent, sub_org, sub_url, sub_desc, sub_mcp],
+                    sub_result,
+                )
+            # ══════ Tab 6: Status & Admin ══════
+            with gr.Tab("\U0001f6e0 Status"):
+                gr.HTML("""
+                <div style="max-width:800px;margin:0 auto;padding:1rem">
+                  <h2 style="color:#1a365d;margin:0 0 0.5rem">
+                    Submission Status & Admin</h2>
+                  <p style="color:#4a5568;margin-bottom:0.5rem">
+                    Check your submission status or manage the pipeline
+                    (admin only).</p>
+                </div>""")
+                # --- Public status check ---
+                with gr.Accordion("Check Submission Status", open=True):
+                    status_id = gr.Textbox(
+                        label="Submission ID",
+                        placeholder="Enter your submission ID...",
+                    )
+                    status_btn = gr.Button("Check Status")
+                    status_out = gr.HTML()
+                    def _check_status(sid):
+                        if not sid:
+                            return '<div style="color:#718096">Enter an ID above.</div>'
+                        try:
+                            from eval_queue import get_submission
+                            sub = get_submission(sid.strip())
+                            if sub is None:
+                                return ('<div style="color:#e53e3e">'
+                                        "Submission not found.</div>")
+                            status_color = {
+                                "pending": "#d69e2e", "approved": "#38a169",
+                                "dispatching": "#3182ce", "boltz": "#805ad5",
+                                "scoring": "#805ad5", "complete": "#38a169",
+                                "failed": "#e53e3e", "rejected": "#e53e3e",
+                            }.get(sub["status"], "#718096")
+                            score_html = ""
+                            if sub.get("overall_score") is not None:
+                                score_html = (
+                                    f'<div style="font-size:1.2rem;'
+                                    f'font-weight:700;color:#1a365d;'
+                                    f'margin-top:0.5rem">'
+                                    f'Score: {sub["overall_score"]:.1f}/100'
+                                    f'</div>'
+                                )
+                            return (
+                                f'<div style="background:white;padding:1rem;'
+                                f'border-radius:8px;border:1px solid #e2e8f0">'
+                                f'<strong>{sub["agent_name"]}</strong> '
+                                f'({sub["organization"]})<br>'
+                                f'Status: <span style="color:{status_color};'
+                                f'font-weight:700">{sub["status"]}</span><br>'
+                                f'Tasks: {sub.get("tasks_dispatched", 0)}'
+                                f'/{sub.get("tasks_total", 76)}<br>'
+                                f'Created: {sub.get("created_at", "")[:10]}'
+                                f'{score_html}</div>'
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    status_btn.click(_check_status, [status_id], status_out)
+                # --- Admin panel (password-protected) ---
+                with gr.Accordion("Admin Panel", open=False):
+                    admin_pw = gr.Textbox(
+                        label="Admin Password", type="password",
+                    )
+                    admin_auth_btn = gr.Button("Authenticate")
+                    admin_panel = gr.Column(visible=False)
+                    admin_msg = gr.HTML()
+                    with admin_panel:
+                        gr.HTML('<h3 style="color:#1a365d">'
+                                'Pending Submissions</h3>')
+                        pending_html = gr.HTML()
+                        refresh_btn = gr.Button("Refresh List")
+                        with gr.Row():
+                            approve_id = gr.Textbox(
+                                label="Submission ID to Approve/Reject",
+                                scale=2,
+                            )
+                            approve_btn = gr.Button(
+                                "Approve", variant="primary", scale=1,
+                            )
+                            reject_btn = gr.Button(
+                                "Reject", variant="stop", scale=1,
+                            )
+                        approve_msg = gr.HTML()
+                        gr.HTML('<h3 style="color:#1a365d;margin-top:1rem">'
+                                'Pipeline Control</h3>')
+                        with gr.Row():
+                            dispatch_id = gr.Textbox(
+                                label="Submission ID", scale=2,
+                            )
+                            dispatch_btn = gr.Button(
+                                "Phase A: Dispatch Tasks", scale=1,
+                            )
+                        with gr.Row():
+                            boltz_id = gr.Textbox(
+                                label="Submission ID", scale=2,
+                            )
+                            boltz_btn = gr.Button(
+                                "Phase B: Run Boltz", scale=1,
+                            )
+                        with gr.Row():
+                            final_id = gr.Textbox(
+                                label="Submission ID", scale=2,
+                            )
+                            final_btn = gr.Button(
+                                "Phase C: Finalize & Publish", scale=1,
+                            )
+                        pipeline_out = gr.HTML()
+                    def _admin_auth(pw):
+                        if pw == ADMIN_PASSWORD:
+                            return (
+                                gr.update(visible=True),
+                                '<div style="color:#38a169">'
+                                'Authenticated.</div>',
+                            )
+                        return (
+                            gr.update(visible=False),
+                            '<div style="color:#e53e3e">'
+                            'Wrong password.</div>',
+                        )
+                    admin_auth_btn.click(
+                        _admin_auth, [admin_pw],
+                        [admin_panel, admin_msg],
+                    )
+                    def _refresh_pending():
+                        try:
+                            from eval_queue import get_pending_submissions
+                            pending = get_pending_submissions()
+                            if not pending:
+                                return "<p>No pending submissions.</p>"
+                            rows = []
+                            for s in pending:
+                                rows.append(
+                                    f'<tr><td>{s["submission_id"]}</td>'
+                                    f'<td>{s["agent_name"]}</td>'
+                                    f'<td>{s["organization"]}</td>'
+                                    f'<td>{s.get("endpoint_url","")[:40]}'
+                                    f'...</td>'
+                                    f'<td>{s.get("created_at","")[:10]}'
+                                    f'</td></tr>'
+                                )
+                            return (
+                                '<table style="width:100%;font-size:0.85rem;'
+                                'border-collapse:collapse">'
+                                "<tr><th>ID</th><th>Agent</th><th>Org</th>"
+                                "<th>URL</th><th>Date</th></tr>"
+                                + "".join(rows) + "</table>"
+                            )
+                        except Exception as e:
+                            return f"<p>Error: {e}</p>"
+                    refresh_btn.click(
+                        _refresh_pending, [], pending_html,
+                    )
+                    def _approve_sub(sid):
+                        try:
+                            from eval_queue import update_status
+                            ok = update_status(sid.strip(), "approved")
+                            if ok:
+                                return (
+                                    f'<div style="color:#38a169">'
+                                    f'Approved: {sid}</div>'
+                                )
+                            return (
+                                f'<div style="color:#e53e3e">'
+                                f'Failed to approve {sid}</div>'
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    def _reject_sub(sid):
+                        try:
+                            from eval_queue import update_status
+                            ok = update_status(sid.strip(), "rejected")
+                            if ok:
+                                return (
+                                    f'<div style="color:#d69e2e">'
+                                    f'Rejected: {sid}</div>'
+                                )
+                            return (
+                                f'<div style="color:#e53e3e">'
+                                f'Failed to reject {sid}</div>'
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    approve_btn.click(
+                        _approve_sub, [approve_id], approve_msg,
+                    )
+                    reject_btn.click(
+                        _reject_sub, [approve_id], approve_msg,
+                    )
+                    def _run_dispatch(sid):
+                        try:
+                            import asyncio as _aio
+                            from eval_queue import get_submission
+                            from eval_dispatcher import dispatch_all_tasks
+                            sub = get_submission(sid.strip())
+                            if sub is None:
+                                return (
+                                    '<div style="color:#e53e3e">'
+                                    'Not found</div>'
+                                )
+                            if sub["status"] not in (
+                                "approved", "dispatching"
+                            ):
+                                return (
+                                    f'<div style="color:#e53e3e">'
+                                    f'Cannot dispatch: status='
+                                    f'{sub["status"]}</div>'
+                                )
+                            loop = _aio.new_event_loop()
+                            results = loop.run_until_complete(
+                                dispatch_all_tasks(
+                                    sid.strip(),
+                                    sub["endpoint_url"],
+                                )
+                            )
+                            loop.close()
+                            ok = sum(
+                                1 for r in results if r.get("success")
+                            )
+                            return (
+                                f'<div style="color:#38a169">'
+                                f'Dispatched: {ok}/{len(results)} '
+                                f'tasks succeeded.</div>'
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    def _run_boltz(sid):
+                        try:
+                            from eval_queue import get_submission
+                            from eval_boltz import run_boltz_posteval
+                            sub = get_submission(sid.strip())
+                            if sub is None:
+                                return (
+                                    '<div style="color:#e53e3e">'
+                                    'Not found</div>'
+                                )
+                            per_task = json.loads(
+                                sub.get("per_task_results", "{}")
+                            )
+                            if not per_task:
+                                return (
+                                    '<div style="color:#e53e3e">'
+                                    "No task results to process.</div>"
+                                )
+                            run_boltz_posteval(per_task)
+                            return (
+                                '<div style="color:#38a169">'
+                                "Boltz post-assessment complete.</div>"
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    def _run_finalize(sid):
+                        try:
+                            from eval_queue import (
+                                finalize_submission,
+                                get_submission,
+                            )
+                            from eval_scorer import aggregate_scores
+                            sub = get_submission(sid.strip())
+                            if sub is None:
+                                return (
+                                    '<div style="color:#e53e3e">'
+                                    'Not found</div>'
+                                )
+                            per_task = json.loads(
+                                sub.get("per_task_results", "{}")
+                            )
+                            agg = aggregate_scores(per_task)
+                            finalize_submission(
+                                sid.strip(),
+                                overall_score=agg["overall_score"],
+                                component_scores=agg["component_scores"],
+                                taxonomy_scores=agg["taxonomy_scores"],
+                            )
+                            return (
+                                f'<div style="color:#38a169">'
+                                f'Finalized! Score: '
+                                f'{agg["overall_score"]:.1f}</div>'
+                            )
+                        except Exception as e:
+                            return f'<div style="color:#e53e3e">{e}</div>'
+                    dispatch_btn.click(
+                        _run_dispatch, [dispatch_id], pipeline_out,
+                    )
+                    boltz_btn.click(
+                        _run_boltz, [boltz_id], pipeline_out,
+                    )
+                    final_btn.click(
+                        _run_finalize, [final_id], pipeline_out,
+                    )
+            # ══════ Tab 7: About ══════
             with gr.Tab("\u2139\ufe0f About"):
                 gr.HTML(build_about())

eval_boltz.py ADDED Viewed

	@@ -0,0 +1,272 @@

+"""Boltz structure prediction for post-assessment scoring.
+Uses @spaces.GPU decorator for ZeroGPU on HuggingFace Spaces.
+Two prediction modes:
+  - Monomer: Non-binding tasks -> pLDDT, pTM
+  - Complex: Binding tasks (binder + target) -> ipTM, i_pAE
+Batch chunking keeps each GPU call within MAX_GPU_TIME (240s), a safety margin under the ~300s ZeroGPU burst limit.
+"""
+from __future__ import annotations
+import logging
+import time
+from typing import Any
+logger = logging.getLogger(__name__)
+# Chunking limits for ZeroGPU (free tier: ~300s max per burst)
+MONOMER_CHUNK_SIZE = 5    # ~30-60s per monomer
+COMPLEX_CHUNK_SIZE = 2    # ~60-120s per complex
+MAX_GPU_TIME = 240         # safety margin under 300s ZeroGPU limit
+# ---------------------------------------------------------------------------
+#  Boltz prediction (GPU-accelerated)
+# ---------------------------------------------------------------------------
+def _predict_monomer(sequence: str) -> dict[str, float]:
+    """Predict structure of a single protein sequence using Boltz.
+    Returns:
+        Dict with: pLDDT, pTM (or error).
+    """
+    try:
+        import torch
+        from boltz import Boltz
+        model = Boltz.from_pretrained("boltz2")
+        result = model.predict(sequence)
+        plddt = float(result.confidence.plddt.mean())
+        ptm = float(result.confidence.ptm)
+        return {
+            "pLDDT": round(plddt, 2),
+            "pTM": round(ptm, 4),
+            "success": True,
+        }
+    except Exception as e:
+        logger.error(f"Boltz monomer prediction failed: {e}")
+        return {"pLDDT": 0.0, "pTM": 0.0, "success": False, "error": str(e)}
+def _predict_complex(
+    binder_seq: str,
+    target_seq: str,
+) -> dict[str, float]:
+    """Predict complex structure and binding metrics using Boltz.
+    Returns:
+        Dict with: ipTM, i_pAE, pLDDT, pTM (or error).
+    """
+    try:
+        import torch
+        from boltz import Boltz
+        model = Boltz.from_pretrained("boltz2")
+        result = model.predict([binder_seq, target_seq])
+        plddt = float(result.confidence.plddt.mean())
+        ptm = float(result.confidence.ptm)
+        iptm = float(result.confidence.iptm) if hasattr(result.confidence, "iptm") else 0.0
+        ipae = float(result.confidence.ipae) if hasattr(result.confidence, "ipae") else 0.0
+        return {
+            "pLDDT": round(plddt, 2),
+            "pTM": round(ptm, 4),
+            "ipTM": round(iptm, 4),
+            "i_pAE": round(ipae, 2),
+            "success": True,
+        }
+    except Exception as e:
+        logger.error(f"Boltz complex prediction failed: {e}")
+        return {
+            "pLDDT": 0.0, "pTM": 0.0, "ipTM": 0.0, "i_pAE": 0.0,
+            "success": False, "error": str(e),
+        }
+# ---------------------------------------------------------------------------
+#  GPU-decorated entry points (for HF Spaces with ZeroGPU)
+# ---------------------------------------------------------------------------
+try:
+    import spaces
+    @spaces.GPU(duration=MAX_GPU_TIME)
+    def predict_monomer_batch(sequences: list[str]) -> list[dict[str, float]]:
+        """Predict structures for a batch of monomer sequences.
+        Decorated with @spaces.GPU for ZeroGPU allocation.
+        Args:
+            sequences: List of amino acid sequences (max MONOMER_CHUNK_SIZE).
+        Returns:
+            List of prediction result dicts with pLDDT, pTM.
+        """
+        results = []
+        for seq in sequences[:MONOMER_CHUNK_SIZE]:
+            results.append(_predict_monomer(seq))
+        return results
+    @spaces.GPU(duration=MAX_GPU_TIME)
+    def predict_complex_batch(
+        pairs: list[tuple[str, str]],
+    ) -> list[dict[str, float]]:
+        """Predict structures for a batch of binder-target pairs.
+        Args:
+            pairs: List of (binder_seq, target_seq) tuples.
+        Returns:
+            List of prediction result dicts with ipTM, i_pAE, pLDDT, pTM.
+        """
+        results = []
+        for binder, target in pairs[:COMPLEX_CHUNK_SIZE]:
+            results.append(_predict_complex(binder, target))
+        return results
+except ImportError:
+    # Not running on HF Spaces -- provide un-decorated versions
+    def predict_monomer_batch(sequences: list[str]) -> list[dict[str, float]]:
+        return [_predict_monomer(seq) for seq in sequences[:MONOMER_CHUNK_SIZE]]
+    def predict_complex_batch(
+        pairs: list[tuple[str, str]],
+    ) -> list[dict[str, float]]:
+        return [_predict_complex(b, t) for b, t in pairs[:COMPLEX_CHUNK_SIZE]]
+# ---------------------------------------------------------------------------
+#  High-level assessment API
+# ---------------------------------------------------------------------------
+def run_boltz_posteval(
+    per_task_results: dict[str, dict[str, Any]],
+    progress_callback=None,
+) -> dict[str, dict[str, Any]]:
+    """Run Boltz post-assessment on all tasks that need it.
+    For each task:
+      - Non-binding: pick best design -> monomer prediction
+      - Binding: pick best design + target sequence -> complex prediction
+      - Merge Boltz metrics into existing results
+      - Re-score quality component
+    Args:
+        per_task_results: Dict of task_id -> dispatch result (from dispatcher).
+        progress_callback: Optional callback(task_id, i, total, metrics).
+    Returns:
+        Updated per_task_results with Boltz metrics and final quality scores.
+    """
+    from eval_scorer import _is_binding_task, score_quality
+    # Separate tasks into monomer and complex batches
+    monomer_tasks = []
+    complex_tasks = []
+    for task_id, result in per_task_results.items():
+        if not result.get("success") or not result.get("quality_pending"):
+            continue
+        sequences = result.get("sequences", [])
+        if not sequences:
+            continue
+        best_seq = sequences[0]  # Use first design for Boltz
+        if _is_binding_task(task_id):
+            # Need target sequence from ground truth
+            target_seq = result.get("ground_truth_thresholds", {}).get("target_sequence")
+            if target_seq:
+                complex_tasks.append((task_id, best_seq, target_seq))
+            else:
+                # Fall back to monomer if no target
+                monomer_tasks.append((task_id, best_seq))
+        else:
+            monomer_tasks.append((task_id, best_seq))
+    total = len(monomer_tasks) + len(complex_tasks)
+    done = 0
+    # Process monomer tasks in chunks
+    for chunk_start in range(0, len(monomer_tasks), MONOMER_CHUNK_SIZE):
+        chunk = monomer_tasks[chunk_start:chunk_start + MONOMER_CHUNK_SIZE]
+        seqs = [seq for _, seq in chunk]
+        boltz_results = predict_monomer_batch(seqs)
+        for (task_id, _), metrics in zip(chunk, boltz_results):
+            if metrics.get("success"):
+                _merge_boltz_metrics(per_task_results[task_id], metrics)
+            done += 1
+            if progress_callback:
+                progress_callback(task_id, done, total, metrics)
+    # Process complex tasks in chunks
+    for chunk_start in range(0, len(complex_tasks), COMPLEX_CHUNK_SIZE):
+        chunk = complex_tasks[chunk_start:chunk_start + COMPLEX_CHUNK_SIZE]
+        pairs = [(binder, target) for _, binder, target in chunk]
+        boltz_results = predict_complex_batch(pairs)
+        for (task_id, _, _), metrics in zip(chunk, boltz_results):
+            if metrics.get("success"):
+                _merge_boltz_metrics(per_task_results[task_id], metrics)
+            done += 1
+            if progress_callback:
+                progress_callback(task_id, done, total, metrics)
+    return per_task_results
+def _merge_boltz_metrics(
+    task_result: dict[str, Any],
+    boltz_metrics: dict[str, float],
+) -> None:
+    """Merge Boltz prediction metrics into a task result and re-score quality.
+    Modifies task_result in-place.
+    """
+    from eval_scorer import apply_design_gate, score_quality
+    # Merge Boltz metrics with any agent-reported metrics
+    merged_metrics = task_result.get("agent_metrics", {}).copy()
+    for key in ("pLDDT", "pTM", "ipTM", "i_pAE"):
+        if key in boltz_metrics and boltz_metrics[key] > 0:
+            merged_metrics[key] = boltz_metrics[key]
+    # Re-score quality with Boltz metrics
+    quality_result = score_quality(
+        agent_metrics=merged_metrics,
+        thresholds=task_result.get("ground_truth_thresholds", {}),
+        task_id=task_result.get("task_id", ""),
+        designs=task_result.get("sequences"),
+        oracle_sequences=task_result.get("oracle_sequences"),
+    )
+    # Update scores
+    task_result["boltz_metrics"] = boltz_metrics
+    task_result["quality_pending"] = False
+    if "cpu_scores" in task_result:
+        task_result["cpu_scores"]["quality"] = quality_result["score"]
+    # Compute final gated score
+    if "cpu_scores" in task_result:
+        component_scores = dict(task_result["cpu_scores"])
+        gated = apply_design_gate(component_scores, task_result.get("num_designs", 0))
+        task_result["final_scores"] = gated
+        task_result["total_score"] = sum(gated.values())
+    if "cpu_details" in task_result:
+        task_result["cpu_details"]["quality"] = quality_result

eval_dispatcher.py ADDED Viewed

	@@ -0,0 +1,361 @@

+"""HTTP task dispatcher — sends benchmark tasks to submitter endpoints.
+For each of 76 tasks:
+  1. Build task payload (prompt + tools + PDB data)
+  2. POST to submitter's endpoint with timeout
+  3. Validate response format
+  4. Run CPU-only scoring (approach, orchestration, feasibility, novelty, diversity)
+  5. Save results to submission queue
+CPU scoring runs immediately; quality scoring waits for Boltz post-eval.
+"""
+from __future__ import annotations
+import logging
+import time
+from typing import Any, Generator
+logger = logging.getLogger(__name__)
# Response validation limits
MAX_SEQUENCES = 50
MAX_SEQUENCE_LENGTH = 2000
MAX_LOG_ENTRIES = 200
DISPATCH_TIMEOUT = 300  # seconds per task


# ---------------------------------------------------------------------------
#  Response validation
# ---------------------------------------------------------------------------
def _is_number(value: Any) -> bool:
    """Return True for ``int``/``float`` values that are not ``bool``."""
    # bool is a subclass of int, so a bare isinstance check would accept
    # JSON ``true``/``false`` as a step count or a duration.
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def validate_response(response: dict[str, Any]) -> tuple[bool, str]:
    """Validate the submitter's response format.

    Expected format:
    {
        "sequences": ["MKKL...", ...],
        "run_log": [{"step": 1, "tool": "...", "success": true, ...}, ...],
        "total_steps": 12,
        "total_time_sec": 142.5,
        "metrics": {}
    }

    ``sequences`` and ``run_log`` are required. ``total_steps``,
    ``total_time_sec`` and ``metrics`` are optional but type-checked when
    present: the first two must be real numbers (booleans are rejected) and
    ``metrics`` must be a JSON object.

    Args:
        response: Decoded JSON body returned by the submitter's endpoint.

    Returns:
        (is_valid, error_message); ``error_message`` is "" when valid.
    """
    if not isinstance(response, dict):
        return False, "Response must be a JSON object"

    # sequences (required)
    sequences = response.get("sequences")
    if not isinstance(sequences, list):
        return False, "Missing or invalid 'sequences' field (must be a list)"
    if len(sequences) > MAX_SEQUENCES:
        return False, f"Too many sequences: {len(sequences)} > {MAX_SEQUENCES}"
    for i, seq in enumerate(sequences):
        if not isinstance(seq, str):
            return False, f"sequences[{i}] must be a string"
        if len(seq) > MAX_SEQUENCE_LENGTH:
            return False, f"sequences[{i}] too long: {len(seq)} > {MAX_SEQUENCE_LENGTH}"
        if not seq:
            return False, f"sequences[{i}] is empty"

    # run_log (required)
    run_log = response.get("run_log")
    if not isinstance(run_log, list):
        return False, "Missing or invalid 'run_log' field (must be a list)"
    if len(run_log) > MAX_LOG_ENTRIES:
        return False, f"Too many log entries: {len(run_log)} > {MAX_LOG_ENTRIES}"
    for i, entry in enumerate(run_log):
        if not isinstance(entry, dict):
            return False, f"run_log[{i}] must be a dict"
        if "tool" not in entry:
            return False, f"run_log[{i}] missing 'tool' field"

    # Optional fields — validate types if present
    for key in ("total_steps", "total_time_sec"):
        if key in response and not _is_number(response[key]):
            return False, f"'{key}' must be a number"
    # The dispatcher stores ``metrics`` verbatim for later scoring, so reject
    # anything that is not an object here, where the submitter sees the error.
    if "metrics" in response and not isinstance(response["metrics"], dict):
        return False, "'metrics' must be a JSON object"

    return True, ""
+# ---------------------------------------------------------------------------
+#  Single task dispatch
+# ---------------------------------------------------------------------------
async def dispatch_single_task(
    endpoint_url: str,
    task_payload: dict[str, Any],
    timeout: int = DISPATCH_TIMEOUT,
) -> dict[str, Any]:
    """POST one task payload to the submitter and classify the outcome.

    Args:
        endpoint_url: Submitter's POST endpoint URL.
        task_payload: Task payload; must contain a "task_id" key.
        timeout: Request timeout in seconds, passed to the HTTP client.

    Returns:
        Dict with "success", "task_id", "latency_sec" (rounded to 0.1 s) and
        either "response" (the validated JSON body) on success or "error"
        (a short message) on failure. HTTP and connection failures are
        reported through the dict rather than raised.
    """
    import httpx

    task_id = task_payload["task_id"]
    started = time.monotonic()

    def _outcome(elapsed: float, success: bool, **fields: Any) -> dict[str, Any]:
        # One place to assemble the result envelope for every exit path.
        return {
            "success": success,
            "task_id": task_id,
            **fields,
            "latency_sec": round(elapsed, 1),
        }

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(
                endpoint_url,
                json=task_payload,
                headers={"Content-Type": "application/json"},
            )
            # Latency covers the request only, not the validation below.
            elapsed = time.monotonic() - started

            if resp.status_code != 200:
                return _outcome(
                    elapsed,
                    False,
                    error=f"HTTP {resp.status_code}: {resp.text[:200]}",
                )

            try:
                body = resp.json()
            except Exception:
                return _outcome(elapsed, False, error="Response is not valid JSON")

            valid, reason = validate_response(body)
            if not valid:
                return _outcome(elapsed, False, error=f"Invalid response: {reason}")

            return _outcome(elapsed, True, response=body)
    except httpx.TimeoutException:
        return _outcome(
            time.monotonic() - started,
            False,
            error=f"Timeout after {timeout}s",
        )
    except Exception as e:
        return _outcome(
            time.monotonic() - started,
            False,
            error=f"Connection error: {str(e)[:200]}",
        )
+# ---------------------------------------------------------------------------
+#  CPU scoring (runs immediately, no GPU needed)
+# ---------------------------------------------------------------------------
def score_cpu_components(
    task_id: str,
    sequences: list[str],
    run_log: list[dict[str, Any]],
    ground_truth: dict[str, Any],
    oracle_sequences: list[str] | None = None,
) -> dict[str, Any]:
    """Compute the five scoring components that need no GPU.

    Runs approach, orchestration, feasibility, novelty and diversity scoring
    and marks quality as pending so it can be filled in after Boltz post-eval.

    Args:
        task_id: Task identifier.
        sequences: Designed sequences from submitter.
        run_log: Tool call log from submitter.
        ground_truth: Ground truth data for this task; the keys read here are
            "thresholds", "reference_sequence", "design_constraints",
            "tools_expected" and "max_designs" (default 10).
        oracle_sequences: Oracle sequences, stored in the result unchanged
            (``None`` becomes an empty list).

    Returns:
        Dict with per-component scores ("cpu_scores"), the full per-component
        results ("cpu_details") and the metadata needed to finish scoring.
    """
    from eval_scorer import (
        get_category,
        score_approach,
        score_diversity,
        score_feasibility,
        score_novelty,
        score_orchestration,
    )

    thresholds = ground_truth.get("thresholds", {})
    category = get_category(task_id)
    task_type = None if not category else category.task_type
    # Only log entries with a non-empty tool name count as tool usage.
    tools_used = [entry.get("tool", "") for entry in run_log if entry.get("tool")]

    # Insertion order fixes both the evaluation order and the output order.
    details = {
        "approach": score_approach(
            tools_used=tools_used,
            tools_expected=ground_truth.get("tools_expected", []),
            task_type=task_type,
        ),
        "orchestration": score_orchestration(
            tool_call_log=run_log,
            task_id=task_id,
        ),
        "feasibility": score_feasibility(
            designs=sequences,
            constraints=ground_truth.get("design_constraints", {}),
        ),
        "novelty": score_novelty(
            designs=sequences,
            reference_seq=ground_truth.get("reference_sequence"),
            thresholds=thresholds,
        ),
        "diversity": score_diversity(
            designs=sequences,
            max_designs=ground_truth.get("max_designs", 10),
        ),
    }

    return {
        "task_id": task_id,
        "num_designs": len(sequences),
        "sequences": sequences,
        "cpu_scores": {name: outcome["score"] for name, outcome in details.items()},
        "cpu_details": details,
        "quality_pending": True,  # Needs Boltz post-eval
        "oracle_sequences": oracle_sequences or [],
        "ground_truth_thresholds": thresholds,
    }
+# ---------------------------------------------------------------------------
+#  Full dispatch pipeline
+# ---------------------------------------------------------------------------
async def dispatch_all_tasks(
    submission_id: str,
    endpoint_url: str,
    progress_callback=None,
) -> list[dict[str, Any]]:
    """Dispatch all hidden tasks to a submitter endpoint.

    Tasks are sent one at a time. Each result is CPU-scored when the dispatch
    succeeded and saved to the submission queue before the next task starts.
    This coroutine does not yield; progress is reported only through
    ``progress_callback`` and the module logger.

    Args:
        submission_id: Submission ID for queue tracking.
        endpoint_url: Submitter's POST endpoint.
        progress_callback: Optional callback(task_id, i, total, result),
            called once per task (1-based ``i``), including tasks whose
            payload could not be built.

    Returns:
        List of per-task results, in dispatch order.
    """
    from eval_queue import save_task_result, update_status
    from eval_tasks import build_task_payload, get_hidden_task_ids, get_task

    task_ids = get_hidden_task_ids()
    total = len(task_ids)
    results: list[dict[str, Any]] = []
    update_status(submission_id, "dispatching", tasks_total=total)

    for position, task_id in enumerate(task_ids, start=1):
        payload = build_task_payload(task_id)
        if payload is None:
            result: dict[str, Any] = {
                "task_id": task_id,
                "success": False,
                "error": "Task not found",
            }
        else:
            dispatch_result = await dispatch_single_task(endpoint_url, payload)
            if dispatch_result["success"]:
                # Run CPU scoring; tolerate a task record with missing fields.
                task_data = get_task(task_id) or {}
                response = dispatch_result["response"]
                result = score_cpu_components(
                    task_id=task_id,
                    sequences=response["sequences"],
                    run_log=response["run_log"],
                    ground_truth=task_data.get("ground_truth") or {},
                    oracle_sequences=task_data.get("oracle_sequences") or [],
                )
                result["latency_sec"] = dispatch_result["latency_sec"]
                result["success"] = True
                result["agent_metrics"] = response.get("metrics", {})
            else:
                result = {
                    "task_id": task_id,
                    "success": False,
                    "error": dispatch_result["error"],
                    "latency_sec": dispatch_result.get("latency_sec"),
                }

        results.append(result)
        save_task_result(submission_id, task_id, result)

        # Shared tail, so a missing task also advances progress and is logged.
        if progress_callback:
            progress_callback(task_id, position, total, result)
        logger.info(
            "[%d/%d] %s: %s (%.1fs)",
            position,
            total,
            task_id,
            "OK" if result.get("success") else "FAIL",
            # latency_sec may be absent or None on failure paths.
            result.get("latency_sec") or 0.0,
        )

    return results

eval_queue.py ADDED Viewed

	@@ -0,0 +1,312 @@

+"""Submission queue management using HuggingFace Datasets.
+Manages the lifecycle of benchmark submissions:
+  pending → approved → dispatching → boltz → scoring → complete / failed
+Rate limiting: 2 submissions per calendar month per organization.
+HF Dataset: RomeroLab-Duke/biodesignbench-submissions (private)
+Schema: Each row is a submission with per-task results stored as JSON.
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import uuid
+from datetime import datetime, timezone
+from typing import Any
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+#  Constants
+# ---------------------------------------------------------------------------
# Hub dataset that backs the queue; overridable through the environment.
SUBMISSIONS_DATASET = os.getenv(
    "BDB_SUBMISSIONS_DATASET", "RomeroLab-Duke/biodesignbench-submissions"
)
# Access token handed to the Hub calls; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
MAX_SUBMISSIONS_PER_MONTH = 2

# Every status a submission row may carry: the lifecycle stages listed in the
# module docstring plus "rejected".
VALID_STATUSES = {
    "pending", "approved",
    "dispatching", "boltz", "scoring",
    "complete", "failed", "rejected",
}
+# ---------------------------------------------------------------------------
+#  Data model
+# ---------------------------------------------------------------------------
def _make_submission_row(
    agent_name: str,
    organization: str,
    endpoint_url: str,
    description: str = "",
    mcp_custom: bool = False,
) -> dict[str, Any]:
    """Build the initial queue row for a freshly created submission.

    The row starts in the "pending" state with zeroed progress counters, no
    scores, and an empty JSON object for the per-task results. The ID is the
    first 12 characters of a random UUID4 string.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    short_id = str(uuid.uuid4())[:12]
    return dict(
        # identity
        submission_id=short_id,
        agent_name=agent_name,
        organization=organization,
        endpoint_url=endpoint_url,
        description=description,
        mcp_custom=mcp_custom,
        # lifecycle
        status="pending",
        created_at=timestamp,
        updated_at=timestamp,
        # progress counters
        tasks_dispatched=0,
        tasks_total=76,
        tasks_boltz_done=0,
        # aggregated results, filled in on finalization
        overall_score=None,
        component_scores=None,
        taxonomy_scores=None,
        per_task_results="{}",  # JSON string of task_id → result
        error_message=None,
    )
+# ---------------------------------------------------------------------------
+#  Queue operations (HF Datasets API)
+# ---------------------------------------------------------------------------
def _get_dataset():
    """Load the "train" split of the submissions dataset from the HF Hub.

    Returns:
        The loaded dataset, or None if importing ``datasets`` or loading the
        dataset raised; the failure is logged as a warning.
    """
    try:
        from datasets import load_dataset

        dataset = load_dataset(SUBMISSIONS_DATASET, split="train", token=HF_TOKEN)
    except Exception as e:
        # Best effort: callers treat None as "no dataset available".
        logger.warning(f"Could not load submissions dataset: {e}")
        return None
    else:
        return dataset
def _save_rows(rows: list[dict[str, Any]]) -> bool:
    """Push ``rows`` to the private submissions dataset on the HF Hub.

    The dataset is rebuilt from the full list on every call, so callers must
    pass every row, not just the one that changed.

    Args:
        rows: All submission rows.

    Returns:
        True if the push succeeded; False if building or pushing the dataset
        raised (the failure is logged).
    """
    try:
        from datasets import Dataset

        Dataset.from_list(rows).push_to_hub(
            SUBMISSIONS_DATASET,
            token=HF_TOKEN,
            private=True,
        )
        return True
    except Exception as e:
        logger.error(f"Failed to save submissions: {e}")
        return False
def _load_all_rows() -> list[dict[str, Any]]:
    """Return every submission row as a plain dict.

    An empty list is returned when the dataset could not be loaded, so a
    failed load looks the same to callers as a dataset with no rows.
    """
    dataset = _get_dataset()
    return [] if dataset is None else [dict(record) for record in dataset]
def submit(
    agent_name: str,
    organization: str,
    endpoint_url: str,
    description: str = "",
    mcp_custom: bool = False,
) -> dict[str, Any]:
    """Create a new submission in the "pending" state.

    Args:
        agent_name: Display name of the submitted agent.
        organization: Submitting organization; used for rate limiting.
        endpoint_url: Submitter's endpoint; must be an http(s) URL.
        description: Optional free-text description.
        mcp_custom: Flag stored on the row unchanged.

    Returns:
        Dict with "submission_id", "status" and "message" on success, or a
        dict with a single "error" key on failure.
    """
    # Check the endpoint locally before the rate limit, which needs a remote
    # dataset load: a malformed request should fail without network traffic.
    if not endpoint_url.startswith(("http://", "https://")):
        return {"error": "Endpoint URL must start with http:// or https://"}

    # Rate limit check
    error = check_rate_limit(organization)
    if error:
        return {"error": error}

    row = _make_submission_row(
        agent_name=agent_name,
        organization=organization,
        endpoint_url=endpoint_url,
        description=description,
        mcp_custom=mcp_custom,
    )
    # NOTE(review): _load_all_rows() returns [] both for an empty dataset and
    # for a failed load. After a failed load the push below would contain only
    # this row — confirm whether that can replace existing submissions.
    rows = _load_all_rows()
    rows.append(row)
    if _save_rows(rows):
        return {
            "submission_id": row["submission_id"],
            "status": "pending",
            "message": "Submission created. Awaiting admin approval.",
        }
    return {"error": "Failed to save submission. Please try again."}
def check_rate_limit(organization: str) -> str | None:
    """Check if an organization has exceeded the monthly submission limit.

    Counts this organization's rows created in the current UTC calendar
    month, ignoring rows whose status is "rejected" or "failed". Names are
    compared case-insensitively with surrounding whitespace removed.

    Args:
        organization: Organization name as entered by the submitter.

    Returns:
        Error message string if rate limited, None if OK.
    """
    current_month = datetime.now(timezone.utc).strftime("%Y-%m")
    wanted = (organization or "").strip().lower()

    # ``row.get(key) or ""`` rather than ``row.get(key, "")``: a column that
    # is present but null yields None, which has no .lower()/.startswith().
    monthly_count = sum(
        1
        for row in _load_all_rows()
        if (row.get("organization") or "").strip().lower() == wanted
        and row.get("status") not in ("rejected", "failed")
        and (row.get("created_at") or "").startswith(current_month)
    )

    if monthly_count >= MAX_SUBMISSIONS_PER_MONTH:
        return (
            f"Organization '{organization}' has reached the limit of "
            f"{MAX_SUBMISSIONS_PER_MONTH} submissions for {current_month}."
        )
    return None
def update_status(
    submission_id: str,
    status: str,
    **extra_fields: Any,
) -> bool:
    """Set a submission's status, refresh its timestamp and patch extra fields.

    Args:
        submission_id: The submission to update.
        status: New status (must be in VALID_STATUSES).
        **extra_fields: Additional columns to overwrite, e.g.
            ``tasks_dispatched=10``. Names that are not already keys of the
            row are ignored.

    Returns:
        True if the row was found and the save succeeded; False for an
        invalid status, an unknown submission, or a failed save.
    """
    if status not in VALID_STATUSES:
        logger.error(f"Invalid status: {status}")
        return False

    rows = _load_all_rows()
    target = next(
        (candidate for candidate in rows if candidate.get("submission_id") == submission_id),
        None,
    )
    if target is None:
        logger.error(f"Submission {submission_id} not found")
        return False

    target["status"] = status
    target["updated_at"] = datetime.now(timezone.utc).isoformat()
    # Only existing columns are patched, so the row's schema never grows.
    target.update(
        {name: value for name, value in extra_fields.items() if name in target}
    )
    return _save_rows(rows)
def save_task_result(
    submission_id: str,
    task_id: str,
    result: dict[str, Any],
) -> bool:
    """Save a per-task result to the submission.

    The result is merged into the row's JSON-encoded "per_task_results" map,
    replacing any earlier result for the same task. "tasks_dispatched" is set
    to the number of tasks stored and "updated_at" is refreshed.

    Args:
        submission_id: The submission to update.
        task_id: Task identifier.
        result: JSON-serializable per-task result dict.

    Returns:
        True if saved successfully; False if the submission was not found or
        the save failed.
    """
    rows = _load_all_rows()
    for row in rows:
        if row.get("submission_id") != submission_id:
            continue
        # ``or "{}"`` covers a null or empty column as well as a missing key;
        # json.loads(None) would raise TypeError.
        per_task = json.loads(row.get("per_task_results") or "{}")
        per_task[task_id] = result
        row["per_task_results"] = json.dumps(per_task)
        row["tasks_dispatched"] = len(per_task)
        row["updated_at"] = datetime.now(timezone.utc).isoformat()
        return _save_rows(rows)

    logger.error(f"Submission {submission_id} not found")
    return False
def get_submission(submission_id: str) -> dict[str, Any] | None:
    """Return the first row whose "submission_id" matches, or None."""
    matches = (
        record
        for record in _load_all_rows()
        if record.get("submission_id") == submission_id
    )
    return next(matches, None)
def get_pending_submissions() -> list[dict[str, Any]]:
    """Return the rows whose status is "pending" (awaiting admin approval)."""
    pending = []
    for record in _load_all_rows():
        if record.get("status") == "pending":
            pending.append(record)
    return pending
def get_approved_submissions() -> list[dict[str, Any]]:
    """Return the rows whose status is "approved" (ready for dispatch)."""
    approved = []
    for record in _load_all_rows():
        if record.get("status") == "approved":
            approved.append(record)
    return approved
def get_all_submissions() -> list[dict[str, Any]]:
    """Return every submission row, regardless of status (admin panel)."""
    every_row = _load_all_rows()
    return every_row
def finalize_submission(
    submission_id: str,
    overall_score: float,
    component_scores: dict[str, float],
    taxonomy_scores: dict[str, dict[str, float]],
) -> bool:
    """Mark a submission "complete" and store its aggregated scores.

    The two score dicts are stored as JSON strings.

    Args:
        submission_id: The submission to finalize.
        overall_score: Overall score (0-100).
        component_scores: Dict of component → averaged score.
        taxonomy_scores: Nested dict of task_type → context → avg score.

    Returns:
        True if finalized successfully.
    """
    aggregated = {
        "overall_score": overall_score,
        "component_scores": json.dumps(component_scores),
        "taxonomy_scores": json.dumps(taxonomy_scores),
    }
    return update_status(submission_id, status="complete", **aggregated)

eval_scorer.py ADDED Viewed

	@@ -0,0 +1,1643 @@

+"""Standalone 100-point scoring rubric for BioDesignBench Tier 2 design tasks.
+This file is a **self-contained extraction** of the scoring logic from the
+``biodesignbench`` package.  It has **zero external dependencies** (stdlib only)
+so it can run on HuggingFace Spaces without installing the full package.
+Modules consolidated:
+  - biodesignbench/taxonomy.py
+  - biodesignbench/eval/metrics/sequence.py
+  - biodesignbench/eval/metrics/approach.py
+  - biodesignbench/eval/metrics/orchestration.py
+  - biodesignbench/eval/tier2/scoring.py
+  - biodesignbench/eval/tier2/oracle.py  (oracle loading stub)
+Six scoring components (sum = 100):
+  approach      (20 pts)  — Tool/methodology selection
+  orchestration (15 pts)  — Pipeline ordering + intermediate validation
+  quality       (35 pts)  — 3-tier continuous scoring (structure/interface/physics)
+  feasibility   (15 pts)  — Valid AAs, length, composition + biophysical checks
+  novelty       ( 5 pts)  — Sequence identity to known sequences
+  diversity     (10 pts)  — Number + diversity of designs
+"""
+from __future__ import annotations
+import json
+import math
+import re
+from collections import Counter
+from dataclasses import dataclass, field
+from enum import Enum
+from functools import lru_cache
+from itertools import combinations
+from typing import Any, Optional
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 1 — Taxonomy  (from biodesignbench/taxonomy.py)
+# ═══════════════════════════════════════════════════════════════════════════════
class DesignTaskType(str, Enum):
    """Kind of design work a task asks the agent to perform.

    Members are also ``str`` instances, so they compare equal to their value.
    """

    DE_NOVO_BINDER = "de_novo_binder"
    SEQUENCE_OPTIMIZATION = "sequence_optimization"
    DE_NOVO_BACKBONE = "de_novo_backbone"
    COMPLEX_ENGINEERING = "complex_engineering"
    CONFORMATIONAL_DESIGN = "conformational_design"

    @property
    def short(self) -> str:
        """Short code for this member, looked up in ``_TASK_TYPE_SHORT``."""
        code = _TASK_TYPE_SHORT[self]
        return code
class BiologicalContext(str, Enum):
    """Biological domain whose knowledge a task requires.

    Members are also ``str`` instances, so they compare equal to their value.
    """

    ANTIBODY = "antibody"
    ENZYME = "enzyme"
    SIGNALING = "signaling"
    STRUCTURAL = "structural"
    FLUORESCENT = "fluorescent"
    THERAPEUTIC = "therapeutic"

    @property
    def short(self) -> str:
        """Short code for this member, looked up in ``_CONTEXT_SHORT``."""
        code = _CONTEXT_SHORT[self]
        return code
# Short codes for task types; they form the first segment of a category ID.
_TASK_TYPE_SHORT: dict[DesignTaskType, str] = {
    DesignTaskType.DE_NOVO_BINDER: "dnb",
    DesignTaskType.SEQUENCE_OPTIMIZATION: "sqo",
    DesignTaskType.DE_NOVO_BACKBONE: "dnk",
    DesignTaskType.COMPLEX_ENGINEERING: "cpx",
    DesignTaskType.CONFORMATIONAL_DESIGN: "cfd",
}

# Short codes for biological contexts; the second segment of a category ID.
_CONTEXT_SHORT: dict[BiologicalContext, str] = {
    BiologicalContext.ANTIBODY: "ab",
    BiologicalContext.ENZYME: "enz",
    BiologicalContext.SIGNALING: "sig",
    BiologicalContext.STRUCTURAL: "str",
    BiologicalContext.FLUORESCENT: "flu",
    BiologicalContext.THERAPEUTIC: "thr",
}

# Reverse lookups: short code -> enum member.
_SHORT_TO_TASK_TYPE: dict[str, DesignTaskType] = {
    code: member for member, code in _TASK_TYPE_SHORT.items()
}
_SHORT_TO_CONTEXT: dict[str, BiologicalContext] = {
    code: member for member, code in _CONTEXT_SHORT.items()
}

# Core tools expected per task type
_CORE_TOOLS: dict[DesignTaskType, list[str]] = {
    DesignTaskType.DE_NOVO_BINDER: [
        "rfdiffusion", "proteinmpnn", "alphafold2",
    ],
    DesignTaskType.SEQUENCE_OPTIMIZATION: [
        "proteinmpnn", "esmfold", "alphafold2",
    ],
    DesignTaskType.DE_NOVO_BACKBONE: [
        "rfdiffusion", "proteinmpnn", "alphafold2",
    ],
    DesignTaskType.COMPLEX_ENGINEERING: [
        "rfdiffusion", "proteinmpnn", "alphafold2",
    ],
    DesignTaskType.CONFORMATIONAL_DESIGN: [
        "esmfold", "proteinmpnn", "alphafold2",
    ],
}

# Name of the primary quality metric for each task type.
_PRIMARY_METRIC: dict[DesignTaskType, str] = {
    DesignTaskType.DE_NOVO_BINDER: "ipTM",
    DesignTaskType.SEQUENCE_OPTIMIZATION: "pLDDT",
    DesignTaskType.DE_NOVO_BACKBONE: "pLDDT",
    DesignTaskType.COMPLEX_ENGINEERING: "ipTM",
    DesignTaskType.CONFORMATIONAL_DESIGN: "pLDDT",
}
@dataclass(frozen=True)
class TaskCategory:
    """One valid (task type, biological context) pairing.

    Instances are immutable and hashable; the derived properties read the
    module-level lookup tables keyed by ``task_type``.
    """

    task_type: DesignTaskType
    context: BiologicalContext

    @property
    def category_id(self) -> str:
        """Category ID of the form ``<task short>_<context short>``."""
        task_code = self.task_type.short
        context_code = self.context.short
        return f"{task_code}_{context_code}"

    @property
    def expected_core_tools(self) -> list[str]:
        """Fresh copy of the core tool list for this task type."""
        return [*_CORE_TOOLS[self.task_type]]

    @property
    def primary_quality_metric(self) -> str:
        """Name of the primary quality metric for this task type."""
        metric = _PRIMARY_METRIC[self.task_type]
        return metric
# Valid cells of the task-type × context matrix, one row per task type.
# Row order and context order define the order of VALID_CATEGORIES.
_CATEGORY_LAYOUT: tuple[tuple[DesignTaskType, tuple[BiologicalContext, ...]], ...] = (
    # de_novo_binder (4)
    (
        DesignTaskType.DE_NOVO_BINDER,
        (
            BiologicalContext.ANTIBODY,
            BiologicalContext.ENZYME,
            BiologicalContext.SIGNALING,
            BiologicalContext.THERAPEUTIC,
        ),
    ),
    # sequence_optimization (5)
    (
        DesignTaskType.SEQUENCE_OPTIMIZATION,
        (
            BiologicalContext.ANTIBODY,
            BiologicalContext.ENZYME,
            BiologicalContext.SIGNALING,
            BiologicalContext.STRUCTURAL,
            BiologicalContext.FLUORESCENT,
        ),
    ),
    # de_novo_backbone (1)
    (
        DesignTaskType.DE_NOVO_BACKBONE,
        (BiologicalContext.STRUCTURAL,),
    ),
    # complex_engineering (3)
    (
        DesignTaskType.COMPLEX_ENGINEERING,
        (
            BiologicalContext.ENZYME,
            BiologicalContext.SIGNALING,
            BiologicalContext.STRUCTURAL,
        ),
    ),
    # conformational_design (4)
    (
        DesignTaskType.CONFORMATIONAL_DESIGN,
        (
            BiologicalContext.ENZYME,
            BiologicalContext.SIGNALING,
            BiologicalContext.STRUCTURAL,
            BiologicalContext.FLUORESCENT,
        ),
    ),
)

VALID_CATEGORIES: list[TaskCategory] = [
    TaskCategory(kind, domain)
    for kind, domains in _CATEGORY_LAYOUT
    for domain in domains
]

# Lookup by category ID, e.g. "dnb_sig".
_CATEGORY_BY_ID: dict[str, TaskCategory] = {
    category.category_id: category for category in VALID_CATEGORIES
}
# OLD → NEW task ID mapping (30 tasks)
OLD_TO_NEW_MAPPING: dict[str, str] = {
    # → dnb_sig
    "binder_001": "dnb_sig_001",
    "binder_003": "dnb_sig_002",
    "binder_005": "dnb_sig_003",
    "binder_007": "dnb_sig_004",
    "ppi_004": "dnb_sig_005",
    # → dnb_thr
    "binder_002": "dnb_thr_001",
    "binder_006": "dnb_thr_002",
    "binder_008": "dnb_thr_003",
    "peptide_001": "dnb_thr_004",
    "peptide_002": "dnb_thr_005",
    "peptide_003": "dnb_thr_006",
    # → sqo_ab
    "antibody_001": "sqo_ab_001",
    "antibody_002": "sqo_ab_002",
    "antibody_003": "sqo_ab_003",
    "antibody_004": "sqo_ab_004",
    "antibody_005": "sqo_ab_005",
    # → sqo_enz
    "stability_002": "sqo_enz_001",
    "enzyme_001": "sqo_enz_002",
    "enzyme_002": "sqo_enz_003",
    "enzyme_003": "sqo_enz_004",
    # → sqo_str
    "stability_003": "sqo_str_001",
    "stability_004": "sqo_str_002",
    # → sqo_flu
    "stability_001": "sqo_flu_001",
    # → dnk_str
    "scaffold_001": "dnk_str_001",
    "scaffold_002": "dnk_str_002",
    "scaffold_003": "dnk_str_003",
    # → cpx_str
    "ppi_001": "cpx_str_001",
    "ppi_002": "cpx_str_002",
    # → cfd_sig
    "ppi_003": "cfd_sig_001",
    # → cfd_flu
    "fluorescence_001": "cfd_flu_001",
}

# Reverse of the table above: new-style ID -> legacy ID.
_NEW_TO_OLD_MAPPING: dict[str, str] = {
    new_id: old_id for old_id, new_id in OLD_TO_NEW_MAPPING.items()
}

# New-style IDs: <2-3 letters>_<2-3 letters>_<3 digits>, e.g. "dnb_sig_001".
_NEW_ID_RE = re.compile(r"^([a-z]{2,3})_([a-z]{2,3})_(\d{3})$")

# Legacy task-type names -> canonical DesignTaskType values.
# NOTE(review): some entries disagree with OLD_TO_NEW_MAPPING (e.g. "antibody"
# maps to de_novo_binder here, but antibody_00x IDs map to sqo_ab_*) —
# confirm which is intended.
_OLD_TYPE_TO_CANONICAL: dict[str, str] = {
    "binder": "de_novo_binder",
    "antibody": "de_novo_binder",
    "peptide": "de_novo_binder",
    "stability": "sequence_optimization",
    "enzyme": "sequence_optimization",
    "fluorescence": "sequence_optimization",
    "scaffold": "de_novo_backbone",
    "ppi": "complex_engineering",
}

# All canonical task-type strings, for fast membership tests.
_CANONICAL_VALUES = {member.value for member in DesignTaskType}
def get_category(task_id: str) -> Optional[TaskCategory]:
    """Resolve a task ID (legacy or new format) to its TaskCategory.

    Legacy IDs are translated through ``OLD_TO_NEW_MAPPING`` first. Returns
    None when the ID matches neither format or its category is not valid.
    """
    translated = OLD_TO_NEW_MAPPING.get(task_id)
    if translated is not None:
        # Drop the trailing "_NNN" to obtain the category ID.
        prefix, _, _ = translated.rpartition("_")
        return _CATEGORY_BY_ID.get(prefix)

    parsed = _NEW_ID_RE.match(task_id)
    if parsed is None:
        return None
    task_code, context_code = parsed.group(1, 2)
    return _CATEGORY_BY_ID.get(f"{task_code}_{context_code}")
def get_new_task_id(old_task_id: str) -> Optional[str]:
    """Return the new-style ID for a legacy task ID, or None if unmapped."""
    try:
        return OLD_TO_NEW_MAPPING[old_task_id]
    except KeyError:
        return None
def get_old_task_id(new_task_id: str) -> Optional[str]:
    """Return the legacy ID for a new-style task ID, or None if unmapped."""
    try:
        return _NEW_TO_OLD_MAPPING[new_task_id]
    except KeyError:
        return None
def is_valid_category(task_type: DesignTaskType, context: BiologicalContext) -> bool:
    """Return True if (task_type, context) is one of the valid categories."""
    return f"{task_type.short}_{context.short}" in _CATEGORY_BY_ID
def parse_new_task_id(
    task_id: str,
) -> Optional[tuple[DesignTaskType, BiologicalContext, int]]:
    """Split a new-style task ID into (task type, context, number).

    Returns None when the ID does not match the expected pattern, uses an
    unknown short code, or names a pairing that is not a valid category.
    """
    parsed = _NEW_ID_RE.match(task_id)
    if parsed is None:
        return None

    task_code, context_code, digits = parsed.groups()
    task_type = _SHORT_TO_TASK_TYPE.get(task_code)
    context = _SHORT_TO_CONTEXT.get(context_code)
    if task_type is None or context is None:
        return None

    if is_valid_category(task_type, context):
        return task_type, context, int(digits)
    return None
def normalize_task_type(task_type: str) -> str:
    """Map a task-type string to its canonical value.

    Matching ignores case and surrounding whitespace. Canonical values are
    returned lower-cased, legacy names are translated, and anything
    unrecognized is returned exactly as given.
    """
    key = task_type.lower().strip()
    return key if key in _CANONICAL_VALUES else _OLD_TYPE_TO_CANONICAL.get(key, task_type)
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 2 — Sequence Metrics  (from biodesignbench/eval/metrics/sequence.py)
+# ═══════════════════════════════════════════════════════════════════════════════
# Per-residue hydrophobicity values keyed by one-letter amino-acid code
# (the name suggests the Kyte-Doolittle scale — TODO confirm).
_KD_SCALE: dict[str, float] = {
    "A": 1.8,
    "C": 2.5,
    "D": -3.5,
    "E": -3.5,
    "F": 2.8,
    "G": -0.4,
    "H": -3.2,
    "I": 4.5,
    "K": -3.9,
    "L": 3.8,
    "M": 1.9,
    "N": -3.5,
    "P": -1.6,
    "Q": -3.5,
    "R": -4.5,
    "S": -0.8,
    "T": -0.7,
    "V": 4.2,
    "W": -0.9,
    "Y": -1.3,
}

# One-letter codes of the 20 standard amino acids.
STANDARD_AAS = {
    "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
    "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y",
}
def sequence_identity(seq1: str, seq2: str) -> float:
    """Return the fraction of matching positions between two sequences.

    Comparison is case-insensitive. Equal-length sequences are compared
    position by position. Otherwise the shorter sequence is slid along the
    longer one without gaps and the best window is reported as a fraction of
    the shorter length. Returns 0.0 if either sequence is empty.
    """
    if not seq1 or not seq2:
        return 0.0

    first, second = seq1.upper(), seq2.upper()
    if len(first) == len(second):
        return sum(x == y for x, y in zip(first, second)) / len(first)

    if len(first) <= len(second):
        probe, target = first, second
    else:
        probe, target = second, first
    width = len(probe)

    def _window_identity(start: int) -> float:
        window = target[start:start + width]
        return sum(x == y for x, y in zip(probe, window)) / width

    return max(_window_identity(start) for start in range(len(target) - width + 1))
def max_identity_to_reference(designs: list[str], reference: str) -> float:
    """Return the highest identity of any design to ``reference``.

    Returns 0.0 when there are no designs or the reference is empty.
    """
    if designs and reference:
        return max(sequence_identity(design, reference) for design in designs)
    return 0.0
def mean_pairwise_diversity(sequences: list[str]) -> float:
    """Return the mean of (1 - identity) over all unordered sequence pairs.

    Returns 0.0 when fewer than two sequences are given.
    """
    if len(sequences) < 2:
        return 0.0

    distance_sum = 0.0
    n_pairs = 0
    for left, right in combinations(sequences, 2):
        n_pairs += 1
        distance_sum += 1.0 - sequence_identity(left, right)

    if n_pairs > 0:
        return distance_sum / n_pairs
    return 0.0
def sequence_entropy(sequences: list[str], truncate: bool = False) -> float:
    """Return the mean per-position Shannon entropy of a set of sequences.

    Each position's entropy is computed from upper-cased residue frequencies
    and divided by ln(20). Returns 0.0 for fewer than two sequences, for
    zero-length sequences, and for sequences of unequal length unless
    ``truncate`` is True, in which case all are cut to the shortest length.
    """
    if len(sequences) < 2:
        return 0.0

    distinct_lengths = {len(s) for s in sequences}
    if len(distinct_lengths) == 1:
        width = distinct_lengths.pop()
    elif truncate:
        width = min(distinct_lengths)
        sequences = [s[:width] for s in sequences]
    else:
        return 0.0
    if width == 0:
        return 0.0

    depth = len(sequences)
    accumulated = 0.0
    # All sequences share one length here, so zip walks every column.
    for column in zip(*sequences):
        tally = Counter(residue.upper() for residue in column)
        column_entropy = 0.0
        for occurrences in tally.values():
            freq = occurrences / depth
            column_entropy -= freq * math.log(freq)
        accumulated += column_entropy / math.log(20)
    return accumulated / width
def validate_amino_acids(sequence: str) -> dict:
    """Check a sequence against the standard amino-acid alphabet.

    Returns:
        A dict with ``valid`` (no character outside STANDARD_AAS),
        ``invalid_chars`` (set of offending upper-cased characters) and
        ``fraction_valid`` (share of characters that are standard). An empty
        or whitespace-only sequence is reported as invalid with fraction 0.0.
    """
    if not (sequence and sequence.strip()):
        return {"valid": False, "invalid_chars": set(), "fraction_valid": 0.0}
    residues = sequence.upper()
    offenders = set(residues) - STANDARD_AAS
    n_standard = len([r for r in residues if r in STANDARD_AAS])
    return {
        "valid": not offenders,
        "invalid_chars": offenders,
        "fraction_valid": n_standard / len(residues),
    }
+def check_length_constraints(
+    sequence: str,
+    length_range: tuple[int, int] | None,
+) -> dict:
+    length = len(sequence)
+    if length_range is None:
+        return {"length": length, "within_range": True, "range": None}
+    min_len, max_len = length_range
+    return {
+        "length": length,
+        "within_range": min_len <= length <= max_len,
+        "range": length_range,
+    }
def hydrophobicity_profile(sequence: str) -> dict:
    """Summarize per-residue hydrophobicity values of a sequence.

    Each residue is looked up (upper-cased) in the module's ``_KD_SCALE``
    table; residues missing from the table count as 0.0.
    NOTE(review): the table name suggests the Kyte-Doolittle scale — confirm.

    Returns:
        A dict with ``mean``, ``std`` (population standard deviation),
        ``fraction_hydrophobic`` (share of residues with a value above 0),
        ``min`` and ``max``, each rounded to 3 decimals. All values are 0.0
        for an empty sequence.
    """
    if not sequence:
        return {"mean": 0.0, "std": 0.0, "fraction_hydrophobic": 0.0, "min": 0.0, "max": 0.0}
    scores = [_KD_SCALE.get(residue.upper(), 0.0) for residue in sequence]
    size = len(scores)
    average = sum(scores) / size
    spread = math.sqrt(sum((s - average) ** 2 for s in scores) / size)
    n_positive = sum(1 for s in scores if s > 0)
    summary = {
        "mean": average,
        "std": spread,
        "fraction_hydrophobic": n_positive / size,
        "min": min(scores),
        "max": max(scores),
    }
    return {key: round(val, 3) for key, val in summary.items()}
def count_mutations(wt: str, designed: str) -> int:
    """Count positions where two equal-length sequences differ.

    The comparison is case-insensitive. Returns -1 when the lengths differ,
    because positions cannot be paired up without an alignment.
    """
    if len(wt) != len(designed):
        return -1
    differences = 0
    for original, mutated in zip(wt.upper(), designed.upper()):
        if original != mutated:
            differences += 1
    return differences
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 3 — Approach Scoring  (from biodesignbench/eval/metrics/approach.py)
+# ═══════════════════════════════════════════════════════════════════════════════
class DesignFunction(str, Enum):
    """Functional capabilities that tools provide.

    Members are also ``str`` instances. Tools are mapped to these
    capabilities in TOOL_TO_FUNCTION, and each task type lists the ones it
    needs in REQUIRED_FUNCTIONS and BONUS_FUNCTIONS.
    """
    BACKBONE_GENERATION = "backbone_generation"
    SEQUENCE_DESIGN = "sequence_design"
    STRUCTURE_PREDICTION = "structure_prediction"
    COMPLEX_PREDICTION = "complex_prediction"
    INTERFACE_ANALYSIS = "interface_analysis"
    STABILITY_SCORING = "stability_scoring"
    ENERGY_MINIMIZATION = "energy_minimization"
    HOTSPOT_IDENTIFICATION = "hotspot_identification"
    SEQUENCE_SCORING = "sequence_scoring"
    PHYSICS_VALIDATION = "physics_validation"
# Coarse category label for each known bio tool and its aliases. Read by
# get_tool_category(), which the legacy approach scorer uses to award partial
# credit when a different tool of the same category was used.
# NOTE(review): the labels are plain strings, not DesignFunction values
# ("energy_optimization" and "structure_search" have no enum member).
# NOTE(review): "esm2" is filed under "energy_optimization" here but maps to
# stability/sequence scoring in TOOL_TO_FUNCTION — confirm this is intended.
TOOL_CATEGORIES: dict[str, str] = {
    "alphafold2": "structure_prediction", "alphafold": "structure_prediction",
    "af2": "structure_prediction", "esmfold": "structure_prediction",
    "openfold": "structure_prediction", "boltz": "structure_prediction",
    "colabfold": "structure_prediction", "omegafold": "structure_prediction",
    "rosettafold": "structure_prediction",
    "proteinmpnn": "sequence_design", "mpnn": "sequence_design",
    "esm_if": "sequence_design", "ligandmpnn": "sequence_design",
    "rfdiffusion": "backbone_generation", "rfdiff": "backbone_generation",
    "chroma": "backbone_generation", "framediff": "backbone_generation",
    "foldingdiff": "backbone_generation",
    "rosetta": "energy_optimization", "pyrosetta": "energy_optimization",
    "foldx": "energy_optimization", "openmm": "energy_optimization",
    "amber": "energy_optimization", "esm2": "energy_optimization",
    "foldseek": "structure_search", "dali": "structure_search",
    "tmalign": "structure_search",
}
# Maps each MCP wrapper tool name to the underlying bio tools it stands for.
# An empty list marks a wrapper with no underlying tool; expand_mcp_tools()
# and _expand_tool_name() keep such a wrapper under its own name.
MCP_TOOL_EXPANSION: dict[str, list[str]] = {
    "design_binder": ["rfdiffusion", "proteinmpnn", "esmfold"],
    "validate_design": ["esmfold", "alphafold2"],
    "optimize_sequence": ["proteinmpnn"],
    "predict_complex": ["alphafold2"],
    "analyze_interface": ["pyrosetta"],
    "predict_structure": ["esmfold", "alphafold2"],
    "score_stability": ["esm2"],
    "energy_minimize": ["openmm"],
    "suggest_hotspots": [],
    "get_design_status": [],
    "generate_backbone": ["rfdiffusion"],
    "rosetta_score": ["pyrosetta"],
    "rosetta_relax": ["pyrosetta"],
    "rosetta_interface_score": ["pyrosetta"],
    "rosetta_design": ["pyrosetta"],
    "predict_structure_boltz": ["boltz"],
    "predict_affinity_boltz": ["boltz"],
}
# Design capabilities credited to each tool name, covering both the MCP
# wrapper tools and the underlying bio tools. Read by
# _extract_functions_from_tools(); an empty set means the tool earns no
# capability credit.
TOOL_TO_FUNCTION: dict[str, set[DesignFunction]] = {
    # MCP wrappers
    "design_binder": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION},
    "validate_design": {DesignFunction.STRUCTURE_PREDICTION},
    "optimize_sequence": {DesignFunction.SEQUENCE_DESIGN},
    "predict_complex": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.STRUCTURE_PREDICTION},
    "analyze_interface": {DesignFunction.INTERFACE_ANALYSIS},
    "predict_structure": {DesignFunction.STRUCTURE_PREDICTION},
    "score_stability": {DesignFunction.STABILITY_SCORING},
    "energy_minimize": {DesignFunction.ENERGY_MINIMIZATION},
    "suggest_hotspots": {DesignFunction.HOTSPOT_IDENTIFICATION},
    "get_design_status": set(),
    "generate_backbone": {DesignFunction.BACKBONE_GENERATION},
    "rosetta_score": {DesignFunction.PHYSICS_VALIDATION},
    "rosetta_relax": {DesignFunction.ENERGY_MINIMIZATION},
    "rosetta_interface_score": {DesignFunction.INTERFACE_ANALYSIS},
    "rosetta_design": {DesignFunction.SEQUENCE_DESIGN},
    "predict_structure_boltz": {DesignFunction.STRUCTURE_PREDICTION},
    "predict_affinity_boltz": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.INTERFACE_ANALYSIS},
    # Bio-level tools
    "rfdiffusion": {DesignFunction.BACKBONE_GENERATION},
    "proteinmpnn": {DesignFunction.SEQUENCE_DESIGN},
    "alphafold2": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION},
    "alphafold": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION},
    "esmfold": {DesignFunction.STRUCTURE_PREDICTION},
    "esm2": {DesignFunction.STABILITY_SCORING, DesignFunction.SEQUENCE_SCORING},
    "pyrosetta": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION, DesignFunction.INTERFACE_ANALYSIS},
    "rosetta": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION, DesignFunction.INTERFACE_ANALYSIS},
    "openmm": {DesignFunction.ENERGY_MINIMIZATION},
    "boltz": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION},
    "foldx": {DesignFunction.STABILITY_SCORING, DesignFunction.PHYSICS_VALIDATION},
    "colabfold": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION},
    "foldseek": {DesignFunction.STRUCTURE_PREDICTION},
    "chroma": {DesignFunction.BACKBONE_GENERATION},
    "ligandmpnn": {DesignFunction.SEQUENCE_DESIGN},
    "esm_if": {DesignFunction.SEQUENCE_DESIGN},
    "mpnn": {DesignFunction.SEQUENCE_DESIGN},
}
class _TaskTypeDict(dict):
    """Dict keyed by task-type strings that also accepts enum-like keys.

    Lookups through ``in``, ``[]`` and ``get()`` replace any key that has a
    ``value`` attribute with that value before consulting the dict.
    """

    def __init__(self, raw: dict[str, set[DesignFunction]]):
        super().__init__()
        self._raw = raw
        for name, functions in raw.items():
            super().__setitem__(name, functions)

    @staticmethod
    def _as_key(key):
        """Return ``key.value`` for enum-like keys, otherwise ``key`` itself."""
        return getattr(key, "value", key)

    def __contains__(self, key):
        return super().__contains__(self._as_key(key))

    def __getitem__(self, key):
        return super().__getitem__(self._as_key(key))

    def get(self, key, default=None):
        return super().get(self._as_key(key), default)
# Capabilities a workflow must cover for each task type. score_approach()
# awards up to 9 of its 20 base points in proportion to the share covered.
REQUIRED_FUNCTIONS = _TaskTypeDict({
    "de_novo_binder": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION},
    "sequence_optimization": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION},
    "de_novo_backbone": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION},
    "complex_engineering": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.COMPLEX_PREDICTION},
    "conformational_design": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION},
})
# Optional capabilities per task type. score_approach() adds one base point
# for each one covered, counting at most three.
BONUS_FUNCTIONS = _TaskTypeDict({
    "de_novo_binder": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.INTERFACE_ANALYSIS, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.HOTSPOT_IDENTIFICATION},
    "sequence_optimization": {DesignFunction.STABILITY_SCORING, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION},
    "de_novo_backbone": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION},
    "complex_engineering": {DesignFunction.BACKBONE_GENERATION, DesignFunction.INTERFACE_ANALYSIS, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.STRUCTURE_PREDICTION},
    "conformational_design": {DesignFunction.STABILITY_SCORING, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.COMPLEX_PREDICTION},
})
# Tool names treated as design generation steps by _check_validation() and
# _check_refinement(). Membership is tested on the raw tool name as logged.
_GENERATION_TOOLS: set[str] = {
    "rfdiffusion", "proteinmpnn", "design_binder", "optimize_sequence",
    "generate_backbone", "rosetta_design", "chroma", "ligandmpnn",
    "esm_if", "mpnn",
}
# Tool names treated as validation steps.
# NOTE(review): some predictors listed in TOOL_TO_FUNCTION ("alphafold",
# "boltz", "colabfold") are absent here — confirm this is intentional.
_VALIDATION_TOOLS: set[str] = {
    "esmfold", "alphafold2", "validate_design", "predict_structure",
    "predict_complex", "score_stability", "rosetta_score",
    "rosetta_interface_score", "predict_structure_boltz",
    "predict_affinity_boltz", "analyze_interface",
}
# Tool names whose mere presence earns the refinement credit in
# _check_refinement().
_REFINEMENT_TOOLS: set[str] = {
    "energy_minimize", "rosetta_relax", "openmm", "pyrosetta", "rosetta",
}
def expand_mcp_tools(tools: list[str]) -> list[str]:
    """Expand MCP wrapper tool names to their underlying bio tools.

    Wrappers with no underlying tools, and names that are not wrappers at
    all, are kept under their own name. The result preserves first-seen
    order and contains no duplicates.
    """
    # A dict doubles as an insertion-ordered set.
    expanded: dict[str, None] = {}
    for tool in tools:
        for name in MCP_TOOL_EXPANSION.get(tool) or [tool]:
            expanded.setdefault(name, None)
    return list(expanded)
def normalize_tool_name(tool: str) -> str:
    """Return a canonical form of a tool name for fuzzy comparison.

    The name is lower-cased, stripped of surrounding whitespace, and has all
    spaces, hyphens and underscores removed.
    """
    trimmed = tool.lower().strip()
    return "".join(ch for ch in trimmed if ch not in (" ", "-", "_"))
def get_tool_category(tool: str) -> str | None:
    """Return the TOOL_CATEGORIES label for a tool, or None if unknown.

    Names are compared after normalize_tool_name(), so case, spaces, hyphens
    and underscores are ignored. The first matching table entry wins.
    """
    wanted = normalize_tool_name(tool)
    return next(
        (
            label
            for known, label in TOOL_CATEGORIES.items()
            if normalize_tool_name(known) == wanted
        ),
        None,
    )
def _extract_functions_from_tools(tools: list[str]) -> set[DesignFunction]:
    """Return the union of design capabilities provided by the given tools.

    A tool is first looked up by exact name in TOOL_TO_FUNCTION; if that
    fails, the first table entry with the same normalized name is used.
    Unknown tools contribute nothing.
    """
    found: set[DesignFunction] = set()
    for tool in tools:
        if tool in TOOL_TO_FUNCTION:
            found |= TOOL_TO_FUNCTION[tool]
            continue
        wanted = normalize_tool_name(tool)
        fuzzy = next(
            (
                funcs
                for known, funcs in TOOL_TO_FUNCTION.items()
                if normalize_tool_name(known) == wanted
            ),
            None,
        )
        if fuzzy is not None:
            found |= fuzzy
    return found
def _check_validation(tools_used: list[str]) -> float:
    """Score whether the workflow validated its designs (0, 2 or 4 points).

    Returns 4.0 when a validation tool appears at or after a generation
    tool, 2.0 when a validation tool appears only before any generation
    tool, and 0.0 when no validation tool was used.
    """
    if not tools_used:
        return 0.0
    seen_generation = False
    seen_validation = False
    for tool in tools_used:
        seen_generation = seen_generation or tool in _GENERATION_TOOLS
        if tool in _VALIDATION_TOOLS:
            if seen_generation:
                return 4.0
            seen_validation = True
    return 2.0 if seen_validation else 0.0
def _check_refinement(tools_used: list[str]) -> float:
    """Score whether the workflow shows iterative refinement (0 or 4 points).

    Returns 4.0 when any refinement tool was used, or when any generation or
    validation tool was called at least twice; otherwise 0.0.
    """
    if not tools_used:
        return 0.0
    if any(tool in _REFINEMENT_TOOLS for tool in tools_used):
        return 4.0
    repeatable = _GENERATION_TOOLS | _VALIDATION_TOOLS
    repeated = any(
        calls >= 2 and tool in repeatable
        for tool, calls in Counter(tools_used).items()
    )
    return 4.0 if repeated else 0.0
def _score_approach_legacy(
    tools_used: list[str],
    tools_expected: list[str],
    max_points: int = 20,
) -> dict:
    """Score tool selection by matching used tools against an expected list.

    Each expected tool is worth an equal share of ``max_points``. An exact
    (normalized) name match earns the full share, a tool of the same
    category earns 70% of it, and anything else earns nothing. Used tools
    are expanded from MCP wrappers first. With no expected tools the full
    ``max_points`` is awarded.

    Returns:
        A dict with ``score`` (rounded to int), ``max``, ``breakdown``
        (one entry per expected tool), ``tools_matched``, ``tools_missing``
        and ``mode`` set to ``"legacy"``.
    """
    if not tools_expected:
        return {
            "score": max_points, "max": max_points,
            "breakdown": [], "tools_matched": [], "tools_missing": [],
            "mode": "legacy",
        }
    bio_tools = expand_mcp_tools(tools_used)
    share = max_points / len(tools_expected)
    used_names = {normalize_tool_name(t) for t in bio_tools}
    used_categories = {get_tool_category(t) for t in bio_tools}
    earned = 0.0
    breakdown = []
    matched = []
    missing = []
    for expected in tools_expected:
        name = normalize_tool_name(expected)
        category = get_tool_category(expected)
        if name in used_names:
            kind, points = "exact", share
        elif category and category in used_categories:
            kind, points = "category", share * 0.7
        else:
            kind, points = "none", 0
        earned += points
        breakdown.append({"tool": expected, "match": kind, "points": points})
        (missing if kind == "none" else matched).append(expected)
    return {
        "score": int(round(earned)), "max": max_points,
        "breakdown": breakdown, "tools_matched": matched,
        "tools_missing": missing, "mode": "legacy",
    }
def score_approach(
    tools_used: list[str],
    tools_expected: list[str],
    max_points: int = 20,
    task_type: DesignTaskType | str | None = None,
) -> dict:
    """Score the agent's tool/methodology selection.

    Without a ``task_type`` the legacy expected-tool matching is used.
    Otherwise the score is built from three parts, defined on a 20-point
    base and scaled by ``max_points / 20``:

    * function coverage (up to 12): 9 points times the share of required
      functions covered, plus 1 point per covered bonus function (max 3);
    * validation inclusion (0, 2 or 4), from _check_validation();
    * iterative refinement (0 or 4), from _check_refinement().

    For a task type with no required functions, the coverage share is 1.0
    if the agent's tools provide any function at all and 0.0 otherwise.

    Returns:
        A dict with the rounded integer ``score``, ``max``, ``mode`` set to
        ``"function"``, the three component scores, and sorted lists of the
        required, covered and agent-provided function names.
    """
    if task_type is None:
        return _score_approach_legacy(tools_used, tools_expected, max_points)
    if hasattr(task_type, "value"):
        task_key = task_type.value
    else:
        task_key = str(task_type)
    scale = max_points / 20.0
    agent_functions = _extract_functions_from_tools(tools_used)
    required = REQUIRED_FUNCTIONS.get(task_key, set())
    bonus = BONUS_FUNCTIONS.get(task_key, set())
    covered_required = agent_functions & required
    if required:
        required_ratio = len(covered_required) / len(required)
    else:
        required_ratio = 1.0 if agent_functions else 0.0
    covered_bonus = agent_functions & bonus
    bonus_count = min(len(covered_bonus), 3)
    func_score = min(
        (required_ratio * 9.0 + bonus_count * 1.0) * scale,
        12.0 * scale,
    )
    val_score = _check_validation(tools_used) * scale
    ref_score = _check_refinement(tools_used) * scale
    total = min(func_score + val_score + ref_score, float(max_points))

    def names(functions):
        return sorted(f.value for f in functions)

    return {
        "score": int(round(total)), "max": max_points, "mode": "function",
        "function_coverage": round(func_score, 1),
        "validation_inclusion": round(val_score, 1),
        "iterative_refinement": round(ref_score, 1),
        "required_functions": names(required),
        "covered_required": names(covered_required),
        "covered_bonus": names(covered_bonus),
        "agent_functions": names(agent_functions),
    }
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 4 — Orchestration Scoring  (from biodesignbench/eval/metrics/orchestration.py)
+# ═══════════════════════════════════════════════════════════════════════════════
# Expected order of bio tools for each task category. score_orchestration()
# measures how much of this sequence appears, in order, in the agent's
# tool-call log.
EXPECTED_PIPELINES: dict[str, list[str]] = {
    "de_novo_binder": ["rfdiffusion", "proteinmpnn", "esmfold"],
    "sequence_optimization": ["proteinmpnn", "esmfold"],
    "de_novo_backbone": ["rfdiffusion", "proteinmpnn", "esmfold"],
    "complex_engineering": ["rfdiffusion", "proteinmpnn", "esmfold"],
    "conformational_design": ["proteinmpnn", "esmfold"],
    # Old category names (backward compat)
    "binder": ["rfdiffusion", "proteinmpnn", "esmfold"],
    "antibody": ["proteinmpnn", "esmfold"],
    "stability": ["proteinmpnn", "esmfold"],
    "enzyme": ["rfdiffusion", "proteinmpnn", "esmfold"],
}
# Tool names counted as validation steps by _count_validation_steps().
# This is a separate list from _VALIDATION_TOOLS used by the approach scorer.
ORCHESTRATION_VALIDATION_TOOLS: set[str] = {
    "validate_design", "predict_complex", "analyze_interface",
    "esmfold", "score_stability", "rosetta_score",
    "rosetta_interface_score", "predict_structure_boltz",
    "predict_affinity_boltz",
}
def _expand_tool_name(tool: str) -> list[str]:
    """Return the underlying bio tools for one tool name.

    MCP wrappers with a non-empty expansion return that expansion list;
    wrappers with an empty expansion and all other names return ``[tool]``.
    """
    return MCP_TOOL_EXPANSION.get(tool) or [tool]
def _extract_ordered_bio_tools(tool_call_log: list[dict[str, Any]]) -> list[str]:
    """Return the normalized bio tools of a call log, in call order.

    Entries for the utility tools ``execute_python``, ``read_file`` and
    ``write_file`` are skipped. Every other entry's ``"tool"`` name is
    expanded from its MCP wrapper and each resulting name is normalized.
    Duplicates are kept.
    """
    skipped = {"execute_python", "read_file", "write_file"}
    called = (entry.get("tool", "") for entry in tool_call_log)
    return [
        normalize_tool_name(bio_tool)
        for name in called
        if name not in skipped
        for bio_tool in _expand_tool_name(name)
    ]
def _longest_ordered_subsequence_length(
    actual: list[str], expected: list[str]
) -> int:
    """Return the length of the longest common subsequence of two tool lists.

    This is the largest number of ``expected`` pipeline steps that appear in
    ``actual`` in the same relative order, with other calls allowed in
    between.

    The previous greedy scan matched each actual tool to the first remaining
    expected step it equalled, so an early call to a late-stage tool used up
    the earlier steps. For example, predicting a structure before running
    the full pipeline scored 1 instead of 3. A dynamic program gives the
    true maximum.

    Args:
        actual: Tool names in call order, already normalized.
        expected: Expected pipeline steps; normalized here before comparing.

    Returns:
        The subsequence length, or 0 when either list is empty.
    """
    if not expected or not actual:
        return 0
    targets = [normalize_tool_name(step) for step in expected]
    # Rolling-row LCS: row[j] is the LCS length of the actual calls seen so
    # far and the first j expected steps.
    row = [0] * (len(targets) + 1)
    for tool in actual:
        diagonal = 0  # value of row[j - 1] before this tool was processed
        for j, step in enumerate(targets, start=1):
            above = row[j]
            if tool == step:
                row[j] = diagonal + 1
            else:
                row[j] = max(above, row[j - 1])
            diagonal = above
    return row[-1]
def _count_validation_steps(tool_call_log: list[dict[str, Any]]) -> int:
    """Count validation steps in a tool-call log.

    A call whose tool name is itself in ORCHESTRATION_VALIDATION_TOOLS
    counts once. Any other call counts once for each of its expanded
    underlying tools that is in that set.
    """
    steps = 0
    for entry in tool_call_log:
        tool = entry.get("tool", "")
        if tool in ORCHESTRATION_VALIDATION_TOOLS:
            steps += 1
        else:
            steps += sum(
                1
                for bio_tool in _expand_tool_name(tool)
                if bio_tool in ORCHESTRATION_VALIDATION_TOOLS
            )
    return steps
def _has_adaptive_behavior(tool_call_log: list[dict[str, Any]]) -> bool:
    """Report whether any tool was re-invoked with different arguments.

    For each tool name, successive calls are compared by their
    ``"args_summary"`` value (``{}`` when absent). Returns True as soon as
    one call differs from the previous call to the same tool.
    """
    last_args: dict[str, Any] = {}
    for entry in tool_call_log:
        tool = entry.get("tool", "")
        args = entry.get("args_summary", {})
        if tool in last_args and last_args[tool] != args:
            return True
        last_args[tool] = args
    return False
def _get_task_category_for_orchestration(task_id: str) -> str | None:
    """Return the pipeline category key for a task id, or None if unknown.

    The taxonomy lookup (``get_category``) is tried first and yields the
    task type's string value. Otherwise the task id is matched against the
    legacy prefixes ``binder``, ``antibody``, ``stability`` and ``enzyme``,
    in that order.
    """
    category = get_category(task_id)
    if category is not None:
        return category.task_type.value
    legacy_prefixes = ("binder", "antibody", "stability", "enzyme")
    return next(
        (prefix for prefix in legacy_prefixes if task_id.startswith(prefix)),
        None,
    )
def score_orchestration(
    tool_call_log: list[dict[str, Any]],
    task_id: str,
    max_points: int = 15,
) -> dict[str, Any]:
    """Score the agent's multi-step pipeline orchestration.

    The score has three parts, as fractions of ``max_points``:

    * 50% pipeline order: the share of the category's expected pipeline
      found in order in the call log. With no expected pipeline, full
      credit is given if any bio tool was called.
    * 30% validation: full credit for two or more validation steps, 60%
      for exactly one.
    * 20% adaptive behavior: awarded when a tool was re-run with different
      arguments.

    Returns:
        A dict with the integer ``score`` (capped at ``max_points``),
        ``max``, the three component scores, and the supporting details.
        An empty log yields a zero score with a ``details`` message.
    """
    if not tool_call_log:
        return {
            "score": 0, "max": max_points,
            "pipeline_order_score": 0.0, "validation_score": 0.0,
            "adaptive_score": 0.0, "details": "No tool calls recorded",
        }
    category = _get_task_category_for_orchestration(task_id)
    expected_pipeline = EXPECTED_PIPELINES.get(category, [])
    ordered_tools = _extract_ordered_bio_tools(tool_call_log)

    # Pipeline order: half of the points.
    if expected_pipeline:
        in_order = _longest_ordered_subsequence_length(ordered_tools, expected_pipeline)
        order_ratio = in_order / len(expected_pipeline)
    else:
        order_ratio = 1.0 if ordered_tools else 0.0
    pipeline_points = order_ratio * max_points * 0.5

    # Validation: 30% of the points.
    validation_count = _count_validation_steps(tool_call_log)
    validation_ratio = (
        1.0 if validation_count >= 2
        else 0.6 if validation_count == 1
        else 0.0
    )
    validation_points = validation_ratio * max_points * 0.3

    # Adaptive behavior: the remaining 20%.
    adaptive = _has_adaptive_behavior(tool_call_log)
    adaptive_points = max_points * 0.2 if adaptive else 0.0

    total = int(round(pipeline_points + validation_points + adaptive_points))
    return {
        "score": min(total, max_points), "max": max_points,
        "pipeline_order_score": round(pipeline_points, 1),
        "validation_score": round(validation_points, 1),
        "adaptive_score": round(adaptive_points, 1),
        "expected_pipeline": expected_pipeline,
        "actual_tool_order": ordered_tools,
        "validation_steps": validation_count,
        "adaptive_behavior": adaptive,
    }
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 5 — Quality + Scoring  (from biodesignbench/eval/tier2/scoring.py)
+# ═══════════════════════════════════════════════════════════════════════════════
# Maximum points per scoring component. The values sum to 100, which
# DesignScoringRubric.validate() enforces.
DEFAULT_DESIGN_RUBRIC = {
    "approach": 20, "orchestration": 15, "quality": 35,
    "feasibility": 15, "novelty": 5, "diversity": 10,
}
# Inclusive (low, high) bounds per metric, checked by
# validate_metric_range(). Metrics not listed here are always accepted.
METRIC_RANGES: dict[str, tuple[float, float]] = {
    "pLDDT": (0, 100), "pTM": (0, 1), "ipTM": (0, 1),
    "i_pAE": (0, 50), "predicted_kd": (0, 1e6),
    "predicted_ddG": (-100, 100), "active_site_rmsd": (0, 50),
    "max_sequence_identity": (0, 1), "TM_score": (0, 1),
}
# Maps a threshold key to (metric name, direction), where direction is
# "higher_is_better" or "lower_is_better".
THRESHOLD_TO_METRIC: dict[str, tuple[str, str]] = {
    "pLDDT_good": ("pLDDT", "higher_is_better"),
    "ipTM_good": ("ipTM", "higher_is_better"),
    "kd_nM_good": ("predicted_kd", "lower_is_better"),
    "predicted_ddG_good": ("predicted_ddG", "lower_is_better"),
    "active_site_rmsd_good": ("active_site_rmsd", "lower_is_better"),
}
# Tier A: Structure Confidence
# Each metric has pass/good/excellent cut-offs in the shape consumed by
# _continuous_score().
_TIER_A_THRESHOLDS: dict[str, dict[str, float]] = {
    "pLDDT": {"pass": 65, "good": 80, "excellent": 90},
    "pTM": {"pass": 0.45, "good": 0.65, "excellent": 0.80},
}
# Tier B: Interface Confidence (binding only)
_TIER_B_THRESHOLDS: dict[str, dict[str, float]] = {
    "ipTM": {"pass": 0.15, "good": 0.40, "excellent": 0.70},
    "i_pAE": {"pass": 25.0, "good": 15.0, "excellent": 8.0},
}
# Direction overrides for Tier B metrics; only i_pAE is lower-is-better.
# NOTE(review): metrics absent here presumably default to higher-is-better
# at the use site — confirm.
_TIER_B_DIRECTIONS: dict[str, str] = {"i_pAE": "lower_is_better"}
# Tier C: Interface Physics
# Threshold key -> (metric name, direction); these three entries repeat the
# lower-is-better rows of THRESHOLD_TO_METRIC.
_TIER_C_METRICS: dict[str, tuple[str, str]] = {
    "kd_nM_good": ("predicted_kd", "lower_is_better"),
    "predicted_ddG_good": ("predicted_ddG", "lower_is_better"),
    "active_site_rmsd_good": ("active_site_rmsd", "lower_is_better"),
}
# pass/good/excellent cut-offs for physics metrics (higher values are better
# given the threshold ordering).
_TIER_C_PHYSICS: dict[str, dict[str, float]] = {
    "buried_surface_area": {"pass": 800, "good": 1500, "excellent": 2500},
    "hydrogen_bonds": {"pass": 5, "good": 15, "excellent": 30},
}
# Default point split between the three tiers; _get_tier_weights() scales
# these when a task's type cannot be determined.
_TIER_A_BASE = 15
_TIER_B_BASE = 10
_TIER_C_BASE = 10
_QUALITY_BASE = _TIER_A_BASE + _TIER_B_BASE + _TIER_C_BASE  # 35
# Task types treated as binding tasks by _is_binding_task().
_BINDING_TASK_TYPES: set[DesignTaskType] = {
    DesignTaskType.DE_NOVO_BINDER,
    DesignTaskType.COMPLEX_ENGINEERING,
}
# Legacy task-id prefixes (text before the first "_") that mark a binding
# task when the taxonomy lookup finds nothing.
_BINDING_OLD_PREFIXES: set[str] = {"binder", "antibody", "ppi", "peptide"}
+def _is_binding_task(task_id: str | None) -> bool:
+    if not task_id:
+        return False
+    cat = get_category(task_id)
+    if cat is not None:
+        return cat.task_type in _BINDING_TASK_TYPES
+    prefix = task_id.split("_")[0]
+    return prefix in _BINDING_OLD_PREFIXES
def _get_tier_weights(
    task_id: str | None = None,
    max_points: int = 35,
) -> tuple[int, int, int]:
    """Return the (tier A, tier B, tier C) point budgets for a task.

    * No task id, or an id that is neither a binding task nor known to the
      taxonomy: the 15/10/10 base split scaled to ``max_points``, each part
      rounded independently.
    * Binding task: 12/35 to tier A, 18/35 to tier B, remainder to tier C.
    * Any other recognized task: 25/35 to tier A, 10/35 to tier B,
      remainder to tier C.
    """
    is_binding = False
    recognised = False
    if task_id:
        is_binding = _is_binding_task(task_id)
        category = get_category(task_id)
        recognised = is_binding or category is not None
    if not recognised:
        scale = max_points / _QUALITY_BASE if _QUALITY_BASE > 0 else 0
        return (
            int(round(_TIER_A_BASE * scale)),
            int(round(_TIER_B_BASE * scale)),
            int(round(_TIER_C_BASE * scale)),
        )
    share_a, share_b = (12, 18) if is_binding else (25, 10)
    tier_a = int(round(max_points * (share_a / 35)))
    tier_b = int(round(max_points * (share_b / 35)))
    # Tier C takes whatever is left so the three parts sum to max_points.
    return (tier_a, tier_b, max_points - tier_a - tier_b)
def _continuous_score(
    value: float,
    thresholds: dict[str, float],
    direction: str = "higher_is_better",
) -> float:
    """Return continuous fraction [0.0, 1.0] via linear interpolation.

    The score is piecewise linear through four anchor points: 0.0 at the
    floor, 0.33 at ``pass``, 0.66 at ``good`` and 1.0 at ``excellent``. The
    floor lies 30% of ``|pass|`` beyond ``pass`` on the failing side, or 0.3
    beyond it when ``pass`` is 0.

    Args:
        value: The measured metric value.
        thresholds: Mapping with ``"pass"``, ``"good"`` and ``"excellent"``.
        direction: ``"lower_is_better"`` reverses the comparison; any other
            value is treated as higher-is-better.

    Returns:
        A fraction between 0.0 and 1.0.

    Raises:
        KeyError: If a threshold key is missing.
    """
    p, g, e = thresholds["pass"], thresholds["good"], thresholds["excellent"]
    if direction == "lower_is_better":
        floor = p + abs(p) * 0.3 if p != 0 else 0.3
        if value <= e:
            return 1.0
        if value >= floor:
            return 0.0
        if value <= g:
            span = g - e
            if span == 0:
                return 1.0
            return 0.66 + (g - value) / span * 0.34
        if value <= p:
            span = p - g
            if span == 0:
                return 0.66
            return 0.33 + (p - value) / span * 0.33
        span = floor - p
        if span == 0:
            return 0.0
        return 0.33 * (floor - value) / span
    # higher_is_better
    # The floor must sit below `pass`. `p * 0.7` only does so for positive
    # thresholds, so zero and negative ones mirror the lower-is-better rule.
    # The positive case keeps the original expression so existing scores are
    # bit-identical.
    if p > 0:
        floor = p * 0.7
    elif p < 0:
        floor = p - abs(p) * 0.3
    else:
        floor = -0.3
    if value >= e:
        return 1.0
    if value <= floor:
        return 0.0
    if value >= g:
        span = e - g
        if span == 0:
            return 1.0
        return 0.66 + (value - g) / span * 0.34
    if value >= p:
        span = g - p
        if span == 0:
            return 0.66
        return 0.33 + (value - p) / span * 0.33
    span = p - floor
    if span == 0:
        return 0.0
    return 0.33 * (value - floor) / span
# Category-specific quality metrics (17 valid taxonomy cells)
# Keyed by (task type, biological context). Each entry names the primary
# metric, its pass/good/excellent thresholds, and the secondary metrics for
# that cell. get_quality_config() looks entries up by task id; combinations
# not listed here have no config.
QUALITY_METRICS: dict[tuple[DesignTaskType, BiologicalContext], dict[str, Any]] = {
    # de_novo_binder (4 cells)
    (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ANTIBODY): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.75, "good": 0.50, "pass": 0.20},
        "secondary_metrics": ["pLDDT", "predicted_kd"],
    },
    (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.SIGNALING): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18},
        "secondary_metrics": ["pLDDT", "predicted_kd"],
    },
    (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.THERAPEUTIC): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18},
        "secondary_metrics": ["pLDDT", "predicted_kd"],
    },
    (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ENZYME): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18},
        "secondary_metrics": ["pLDDT", "predicted_kd", "active_site_rmsd"],
    },
    # sequence_optimization (5 cells)
    (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ANTIBODY): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 90, "good": 80, "pass": 65},
        "secondary_metrics": ["ipTM", "max_sequence_identity"],
    },
    (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ENZYME): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 90, "good": 80, "pass": 65},
        "secondary_metrics": ["predicted_ddG", "active_site_rmsd"],
    },
    (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.STRUCTURAL): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 92, "good": 82, "pass": 68},
        "secondary_metrics": ["TM_score", "predicted_ddG"],
    },
    (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.FLUORESCENT): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 88, "good": 78, "pass": 62},
        "secondary_metrics": ["predicted_ddG", "max_sequence_identity"],
    },
    (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.SIGNALING): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 90, "good": 80, "pass": 65},
        "secondary_metrics": ["ipTM", "predicted_ddG"],
    },
    # de_novo_backbone (1 cell)
    (DesignTaskType.DE_NOVO_BACKBONE, BiologicalContext.STRUCTURAL): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 88, "good": 78, "pass": 60},
        "secondary_metrics": ["TM_score", "predicted_ddG"],
    },
    # complex_engineering (3 cells)
    (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.SIGNALING): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.72, "good": 0.48, "pass": 0.20},
        "secondary_metrics": ["pLDDT", "predicted_kd"],
    },
    (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.STRUCTURAL): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.72, "good": 0.48, "pass": 0.20},
        "secondary_metrics": ["pLDDT", "TM_score"],
    },
    (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.ENZYME): {
        "primary_metric": "ipTM",
        "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18},
        "secondary_metrics": ["pLDDT", "predicted_kd", "active_site_rmsd"],
    },
    # conformational_design (4 cells)
    (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.ENZYME): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 88, "good": 78, "pass": 62},
        "secondary_metrics": ["predicted_ddG", "active_site_rmsd"],
    },
    (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.SIGNALING): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 85, "good": 75, "pass": 60},
        "secondary_metrics": ["ipTM", "predicted_kd"],
    },
    (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.FLUORESCENT): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 85, "good": 75, "pass": 60},
        "secondary_metrics": ["predicted_ddG", "max_sequence_identity"],
    },
    (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.STRUCTURAL): {
        "primary_metric": "pLDDT",
        "thresholds": {"excellent": 88, "good": 78, "pass": 62},
        "secondary_metrics": ["TM_score", "predicted_ddG"],
    },
}
def get_quality_config(task_id: str) -> dict[str, Any] | None:
    """Return the QUALITY_METRICS entry for a task id, or None.

    None is returned when the taxonomy lookup does not recognize the id or
    when its (task type, context) pair has no entry in the table.
    """
    category = get_category(task_id)
    if category is not None:
        return QUALITY_METRICS.get((category.task_type, category.context))
    return None
@dataclass
class DesignScoringRubric:
    """Point budget per scoring component.

    ``components`` maps a component name to its maximum points and defaults
    to a fresh copy of DEFAULT_DESIGN_RUBRIC for every instance.
    """

    components: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_DESIGN_RUBRIC))

    @property
    def max_score(self) -> int:
        """Sum of the points of all components."""
        return sum(self.components.values())

    def validate(self) -> None:
        """Raise ValueError unless the component points add up to 100."""
        points = self.max_score
        if points != 100:
            raise ValueError(f"Rubric total must be 100, got {points}")
def _has_reasonable_composition(seq: str, min_length: int = 20) -> bool:
    """Heuristically reject degenerate or implausible sequences.

    A sequence (compared upper-cased) is rejected when any of these holds:
    it is shorter than ``min_length``; it uses fewer than 5 distinct
    characters; one character makes up more than 50% of it; alanine makes up
    more than 30% of it; or its mean hydrophobicity exceeds 2.0.
    """
    residues = seq.upper()
    total = len(residues)
    if total < min_length or len(set(residues)) < 5:
        return False
    tally = Counter(residues)
    if max(tally.values()) / total > 0.5:
        return False
    if tally.get("A", 0) / total > 0.3:
        return False
    return not (hydrophobicity_profile(residues)["mean"] > 2.0)
+def validate_metric_range(name: str, value: float) -> bool:
+    if name not in METRIC_RANGES:
+        return True
+    low, high = METRIC_RANGES[name]
+    return low <= value <= high
+# Functional Similarity thresholds for non-binding Tier B.
+# Per-task-type pass/good/excellent cut-offs. _get_functional_sim_thresholds
+# falls back to these when the ground-truth thresholds carry no
+# "max_seq_identity_good" value; the chosen dict is then handed to
+# _continuous_score together with the best design-vs-oracle identity.
+_FUNCTIONAL_SIM_DEFAULTS: dict[DesignTaskType, dict[str, float]] = {
+    DesignTaskType.SEQUENCE_OPTIMIZATION: {"pass": 0.40, "good": 0.60, "excellent": 0.85},
+    DesignTaskType.CONFORMATIONAL_DESIGN: {"pass": 0.15, "good": 0.30, "excellent": 0.50},
+    DesignTaskType.DE_NOVO_BACKBONE: {"pass": 0.10, "good": 0.20, "excellent": 0.40},
+}
+def _derive_functional_sim_thresholds(value: float) -> dict[str, float]:
+    return {
+        "pass": value * 0.5,
+        "good": value,
+        "excellent": min(value * 2, 1.0),
+    }
+def _get_functional_sim_thresholds(
+    thresholds: dict[str, float],
+    task_id: str,
+) -> dict[str, float] | None:
+    if _is_binding_task(task_id):
+        return None
+    gt_value = thresholds.get("max_seq_identity_good")
+    if gt_value is not None:
+        return _derive_functional_sim_thresholds(gt_value)
+    cat = get_category(task_id)
+    if cat is None:
+        return None
+    return _FUNCTIONAL_SIM_DEFAULTS.get(cat.task_type)
+def _score_functional_similarity(
+    designs: list[str],
+    oracle_sequences: list[str],
+    thresholds: dict[str, float],
+) -> float | None:
+    if not designs or not oracle_sequences:
+        return None
+    best_identity = 0.0
+    for design in designs:
+        for oracle in oracle_sequences:
+            ident = sequence_identity(design, oracle)
+            if ident > best_identity:
+                best_identity = ident
+    return _continuous_score(best_identity, thresholds, "higher_is_better")
+def score_quality(
+    agent_metrics: dict[str, float],
+    thresholds: dict[str, float],
+    max_points: int = 35,
+    task_id: str | None = None,
+    designs: list[str] | None = None,
+    oracle_sequences: list[str] | None = None,
+) -> dict[str, Any]:
+    """Score quality using 3-tier continuous system.
+    Tier A averages structure-confidence metrics, Tier B scores either
+    functional similarity to oracle sequences or interface metrics, and
+    Tier C averages physics / ground-truth threshold metrics. Each tier is
+    a 0..1 mean scaled by the weight returned from _get_tier_weights.
+    Args:
+        agent_metrics: Metric name -> value; entries rejected by
+            validate_metric_range are ignored (with one exception below).
+        thresholds: Ground-truth thresholds used by Tier B/Tier C.
+        max_points: Upper bound for the returned score.
+        task_id: Optional task identifier used for per-task overrides.
+        designs: Designed sequences (only used for functional similarity).
+        oracle_sequences: Oracle sequences (only used for functional similarity).
+    Returns:
+        Dict with the integer "score", "max", per-tier points rounded to one
+        decimal, "metrics_evaluated" and a per-tier "breakdown".
+    """
+    valid_metrics = {
+        k: v for k, v in agent_metrics.items() if validate_metric_range(k, v)
+    }
+    # These two metrics are re-admitted whenever they are non-negative
+    # numbers, even if the range check above rejected them.
+    for extra_key in ("buried_surface_area", "hydrogen_bonds"):
+        if extra_key in agent_metrics and extra_key not in valid_metrics:
+            val = agent_metrics[extra_key]
+            if isinstance(val, (int, float)) and val >= 0:
+                valid_metrics[extra_key] = float(val)
+    tier_a_max, tier_b_max, tier_c_max = _get_tier_weights(task_id, max_points)
+    is_binding = _is_binding_task(task_id)
+    # Task-specific thresholds replace the defaults for the task's primary
+    # metric only.
+    overrides: dict[str, dict[str, float]] = {}
+    if task_id:
+        config = get_quality_config(task_id)
+        if config and "thresholds" in config:
+            primary = config["primary_metric"]
+            overrides[primary] = config["thresholds"]
+    # Tier A: Structure Confidence (mean over the metrics that were reported)
+    tier_a_scores: dict[str, float] = {}
+    for metric, default_thresh in _TIER_A_THRESHOLDS.items():
+        if metric in valid_metrics:
+            thresh = overrides.get(metric, default_thresh)
+            tier_a_scores[metric] = _continuous_score(
+                valid_metrics[metric], thresh, "higher_is_better"
+            )
+    tier_a_pts = (sum(tier_a_scores.values()) / len(tier_a_scores)) * tier_a_max if tier_a_scores else 0.0
+    # Tier B: Interface or Functional Similarity
+    tier_b_scores: dict[str, float] = {}
+    tier_b_pts = 0.0
+    # Functional similarity applies to non-binding tasks with a known
+    # category; every other case falls back to the interface metrics.
+    _use_functional_sim = (
+        tier_b_max > 0
+        and task_id is not None
+        and not is_binding
+        and get_category(task_id) is not None
+    )
+    if tier_b_max > 0:
+        if _use_functional_sim:
+            # Without designs or oracle sequences Tier B stays at 0 points.
+            if designs and oracle_sequences:
+                func_thresh = _get_functional_sim_thresholds(thresholds, task_id)
+                if func_thresh is not None:
+                    frac = _score_functional_similarity(designs, oracle_sequences, func_thresh)
+                    if frac is not None:
+                        tier_b_pts = frac * tier_b_max
+                        tier_b_scores["oracle_identity"] = frac
+        else:
+            for metric, default_thresh in _TIER_B_THRESHOLDS.items():
+                if metric in valid_metrics:
+                    thresh = overrides.get(metric, default_thresh)
+                    direction = _TIER_B_DIRECTIONS.get(metric, "higher_is_better")
+                    tier_b_scores[metric] = _continuous_score(
+                        valid_metrics[metric], thresh, direction
+                    )
+            if tier_b_scores:
+                tier_b_pts = (sum(tier_b_scores.values()) / len(tier_b_scores)) * tier_b_max
+    # Tier C: Interface Physics
+    tier_c_fractions: list[float] = []
+    tier_c_breakdown: list[dict] = []
+    if tier_c_max > 0:
+        # Fixed physics thresholds are only applied to binding tasks ...
+        if is_binding:
+            for metric_key, phys_thresh in _TIER_C_PHYSICS.items():
+                if metric_key in valid_metrics:
+                    frac = _continuous_score(valid_metrics[metric_key], phys_thresh, "higher_is_better")
+                    tier_c_fractions.append(frac)
+                    tier_c_breakdown.append({
+                        "threshold": metric_key, "metric": metric_key,
+                        "value": valid_metrics[metric_key],
+                        "threshold_value": phys_thresh, "fraction": round(frac, 3),
+                    })
+        # ... while ground-truth thresholds are evaluated for every task.
+        for thresh_key, (metric_key, direction) in _TIER_C_METRICS.items():
+            if thresh_key in thresholds and metric_key in valid_metrics:
+                threshold_val = thresholds[thresh_key]
+                agent_val = valid_metrics[metric_key]
+                # pass/excellent sit 50% of |threshold| either side of the
+                # ground-truth value (a fixed 1.0 when the threshold is 0).
+                margin = abs(threshold_val) * 0.5 if threshold_val != 0 else 1.0
+                if direction == "lower_is_better":
+                    gt_thresh = {
+                        "pass": threshold_val + margin,
+                        "good": threshold_val,
+                        "excellent": threshold_val - margin,
+                    }
+                else:
+                    gt_thresh = {
+                        "pass": threshold_val - margin,
+                        "good": threshold_val,
+                        "excellent": threshold_val + margin,
+                    }
+                frac = _continuous_score(agent_val, gt_thresh, direction)
+                tier_c_fractions.append(frac)
+                tier_c_breakdown.append({
+                    "threshold": thresh_key, "metric": metric_key,
+                    "value": agent_val, "threshold_value": threshold_val,
+                    "fraction": round(frac, 3),
+                })
+    tier_c_pts = (sum(tier_c_fractions) / len(tier_c_fractions)) * tier_c_max if tier_c_fractions else 0.0
+    # The summed tiers are clamped to max_points before rounding.
+    total = min(tier_a_pts + tier_b_pts + tier_c_pts, max_points)
+    metrics_evaluated = len(tier_a_scores) + len(tier_b_scores) + len(tier_c_fractions)
+    return {
+        "score": int(round(total)), "max": max_points,
+        "tier_a": round(tier_a_pts, 1), "tier_b": round(tier_b_pts, 1),
+        "tier_c": round(tier_c_pts, 1),
+        "metrics_evaluated": metrics_evaluated,
+        "breakdown": {
+            "structure": tier_a_scores, "interface": tier_b_scores,
+            "physics": tier_c_breakdown,
+        },
+    }
+def score_novelty(
+    designs: list[str],
+    reference_seq: str | None,
+    thresholds: dict[str, float],
+    max_points: int = 5,
+) -> dict[str, Any]:
+    """Score novelty by computing sequence identity to reference."""
+    if not designs:
+        return {"score": 0, "max": max_points, "max_identity": 0.0, "identity_threshold": None}
+    identity_threshold = thresholds.get("max_seq_identity_good")
+    max_id = max_identity_to_reference(designs, reference_seq) if reference_seq else 0.0
+    if identity_threshold is None:
+        if reference_seq:
+            novelty_ratio = 1.0 - max_id
+            score = int(round(max_points * min(novelty_ratio * 2, 1.0)))
+        else:
+            score = max_points
+    elif identity_threshold >= 0.9:
+        if max_id >= identity_threshold:
+            score = max_points
+        elif max_id >= identity_threshold * 0.9:
+            score = int(round(max_points * 0.7))
+        else:
+            score = int(round(max_points * 0.3))
+    else:
+        if max_id <= identity_threshold:
+            score = max_points
+        elif max_id <= identity_threshold * 1.5:
+            score = int(round(max_points * 0.5))
+        else:
+            score = int(round(max_points * 0.2))
+    return {
+        "score": min(score, max_points), "max": max_points,
+        "max_identity": round(max_id, 3), "identity_threshold": identity_threshold,
+    }
+def score_diversity(
+    designs: list[str],
+    max_designs: int = 10,
+    max_points: int = 5,
+) -> dict[str, Any]:
+    """Score diversity of designs."""
+    if not designs:
+        return {"score": 0, "max": max_points, "num_designs": 0, "pairwise_diversity": 0.0, "entropy": 0.0}
+    num = len(designs)
+    count_fraction = min(num / max_designs, 1.0) if max_designs > 0 else 1.0
+    diversity = mean_pairwise_diversity(designs)
+    entropy = sequence_entropy(designs)
+    count_score = count_fraction * max_points * 0.4
+    diversity_score = diversity * max_points * 0.4
+    entropy_score = entropy * max_points * 0.2
+    total = int(round(count_score + diversity_score + entropy_score))
+    return {
+        "score": min(total, max_points), "max": max_points,
+        "num_designs": num, "pairwise_diversity": round(diversity, 3),
+        "entropy": round(entropy, 3),
+    }
+def score_feasibility(
+    designs: list[str],
+    constraints: dict[str, Any],
+    max_points: int = 25,
+) -> dict[str, Any]:
+    """Score feasibility of designed sequences."""
+    if not designs:
+        return {"score": 0, "max": max_points, "aa_validity": 0.0, "length_validity": 0.0, "composition_check": 0.0}
+    per_check = max_points / 3
+    length_range = constraints.get("length_range")
+    if isinstance(length_range, list):
+        length_range = tuple(length_range)
+    comp_min_length = 20
+    if length_range and length_range[1] < 20:
+        comp_min_length = max(length_range[0], 5)
+    aa_valid_count = sum(1 for seq in designs if validate_amino_acids(seq)["valid"])
+    aa_fraction = aa_valid_count / len(designs)
+    length_valid_count = sum(1 for seq in designs if check_length_constraints(seq, length_range)["within_range"])
+    length_fraction = length_valid_count / len(designs)
+    composition_ok = sum(1 for seq in designs if _has_reasonable_composition(seq, min_length=comp_min_length))
+    composition_fraction = composition_ok / len(designs)
+    aa_score = aa_fraction * per_check
+    length_score = length_fraction * per_check
+    comp_score = composition_fraction * per_check
+    total = int(round(aa_score + length_score + comp_score))
+    return {
+        "score": min(total, max_points), "max": max_points,
+        "aa_validity": round(aa_fraction, 3),
+        "length_validity": round(length_fraction, 3),
+        "composition_check": round(composition_fraction, 3),
+    }
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 6 — Design Gate + Final Score
+# ═══════════════════════════════════════════════════════════════════════════════
+_DESIGN_GATE_ZEROED = {"quality", "novelty", "diversity", "feasibility"}
+_DESIGN_GATE_CAP = 30
+def apply_design_gate(
+    component_scores: dict[str, int],
+    num_designs: int,
+) -> dict[str, int]:
+    """If no designs produced, cap total at 30."""
+    if num_designs >= 1:
+        return dict(component_scores)
+    gated = dict(component_scores)
+    for key in _DESIGN_GATE_ZEROED:
+        gated[key] = 0
+    remaining_sum = sum(v for k, v in gated.items() if k not in _DESIGN_GATE_ZEROED)
+    if remaining_sum > _DESIGN_GATE_CAP:
+        scale = _DESIGN_GATE_CAP / remaining_sum
+        for key in gated:
+            if key not in _DESIGN_GATE_ZEROED:
+                gated[key] = int(round(gated[key] * scale))
+    return gated
+def calculate_design_score(
+    rubric: DesignScoringRubric,
+    results: dict[str, int],
+) -> dict[str, Any]:
+    """Calculate final design task score from component results."""
+    breakdown = {}
+    for component, max_pts in rubric.components.items():
+        actual = min(results.get(component, 0), max_pts)
+        breakdown[component] = {"score": actual, "max": max_pts}
+    total = sum(v["score"] for v in breakdown.values())
+    max_possible = rubric.max_score
+    return {
+        "breakdown": breakdown,
+        "total": total,
+        "max_possible": max_possible,
+        "percentage": round(total / max_possible * 100, 1) if max_possible > 0 else 0,
+    }
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SECTION 7 — Full Task Scorer (high-level API for eval pipeline)
+# ═══════════════════════════════════════════════════════════════════════════════
+def score_submission_task(
+    task_id: str,
+    sequences: list[str],
+    run_log: list[dict[str, Any]],
+    ground_truth: dict[str, Any],
+    agent_metrics: dict[str, float] | None = None,
+    oracle_sequences: list[str] | None = None,
+) -> dict[str, Any]:
+    """Score a single task submission end-to-end.
+    This is the main entry point for the evaluation pipeline.
+    Args:
+        task_id: Task identifier (e.g., "dnb_sig_001").
+        sequences: Designed amino acid sequences from the agent.
+        run_log: Tool call log from the agent.
+        ground_truth: Ground truth dict with thresholds, reference_sequence,
+            design_constraints, tools_expected, max_designs.
+        agent_metrics: Optional metrics reported by the agent or from Boltz
+            (e.g., {"pLDDT": 85.0, "ipTM": 0.35}).
+        oracle_sequences: Optional oracle sequences for functional similarity.
+    Returns:
+        Dict with: total_score, component_scores, details, num_designs.
+    """
+    if agent_metrics is None:
+        agent_metrics = {}
+    # Extract fields from ground truth
+    thresholds = ground_truth.get("thresholds", {})
+    reference_seq = ground_truth.get("reference_sequence")
+    constraints = ground_truth.get("design_constraints", {})
+    tools_expected = ground_truth.get("tools_expected", [])
+    max_designs = ground_truth.get("max_designs", 10)
+    # Get task category for function-based scoring
+    cat = get_category(task_id)
+    task_type = cat.task_type if cat else None
+    # Extract tools used from run_log
+    tools_used = [entry.get("tool", "") for entry in run_log if entry.get("tool")]
+    # Score all 6 components
+    approach_result = score_approach(
+        tools_used=tools_used,
+        tools_expected=tools_expected,
+        task_type=task_type,
+    )
+    orchestration_result = score_orchestration(
+        tool_call_log=run_log,
+        task_id=task_id,
+    )
+    quality_result = score_quality(
+        agent_metrics=agent_metrics,
+        thresholds=thresholds,
+        task_id=task_id,
+        designs=sequences,
+        oracle_sequences=oracle_sequences,
+    )
+    feasibility_result = score_feasibility(
+        designs=sequences,
+        constraints=constraints,
+    )
+    novelty_result = score_novelty(
+        designs=sequences,
+        reference_seq=reference_seq,
+        thresholds=thresholds,
+    )
+    diversity_result = score_diversity(
+        designs=sequences,
+        max_designs=max_designs,
+    )
+    # Build component scores dict
+    component_scores = {
+        "approach": approach_result["score"],
+        "orchestration": orchestration_result["score"],
+        "quality": quality_result["score"],
+        "feasibility": feasibility_result["score"],
+        "novelty": novelty_result["score"],
+        "diversity": diversity_result["score"],
+    }
+    # Apply design gate
+    num_designs = len(sequences)
+    gated = apply_design_gate(component_scores, num_designs)
+    total = sum(gated.values())
+    return {
+        "total_score": total,
+        "component_scores": gated,
+        "num_designs": num_designs,
+        "details": {
+            "approach": approach_result,
+            "orchestration": orchestration_result,
+            "quality": quality_result,
+            "feasibility": feasibility_result,
+            "novelty": novelty_result,
+            "diversity": diversity_result,
+        },
+    }
+def aggregate_scores(
+    per_task_scores: dict[str, dict[str, Any]],
+) -> dict[str, Any]:
+    """Aggregate per-task scores into an overall submission result.
+    Args:
+        per_task_scores: Dict mapping task_id → score_submission_task() result.
+    Returns:
+        Dict with: overall_score, component_scores (averaged), taxonomy_scores,
+        tasks_completed, tasks_with_zero.
+    """
+    if not per_task_scores:
+        return {
+            "overall_score": 0.0,
+            "component_scores": {c: 0.0 for c in DEFAULT_DESIGN_RUBRIC},
+            "taxonomy_scores": {},
+            "tasks_completed": 0,
+            "tasks_total": 0,
+            "tasks_with_zero": 0,
+        }
+    totals = {c: 0.0 for c in DEFAULT_DESIGN_RUBRIC}
+    n = len(per_task_scores)
+    tasks_with_zero = 0
+    # Taxonomy breakdown
+    taxonomy_scores: dict[str, dict[str, list[float]]] = {}
+    for task_id, result in per_task_scores.items():
+        total_score = result["total_score"]
+        if total_score == 0:
+            tasks_with_zero += 1
+        for comp, val in result["component_scores"].items():
+            totals[comp] += val
+        # Taxonomy mapping
+        cat = get_category(task_id)
+        if cat:
+            tt = cat.task_type.value
+            ctx = cat.context.short
+            taxonomy_scores.setdefault(tt, {}).setdefault(ctx, []).append(total_score)
+    # Average components
+    avg_components = {c: round(v / n, 1) for c, v in totals.items()}
+    overall = round(sum(avg_components.values()), 1)
+    # Average taxonomy scores
+    taxonomy_avg: dict[str, dict[str, float]] = {}
+    for tt, contexts in taxonomy_scores.items():
+        taxonomy_avg[tt] = {}
+        for ctx, scores in contexts.items():
+            taxonomy_avg[tt][ctx] = round(sum(scores) / len(scores), 1)
+    return {
+        "overall_score": overall,
+        "component_scores": avg_components,
+        "taxonomy_scores": taxonomy_avg,
+        "tasks_completed": n,
+        "tasks_total": n,
+        "tasks_with_zero": tasks_with_zero,
+    }

eval_tasks.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""Load hidden benchmark tasks from a private HuggingFace Dataset.
+Each task row contains:
+  - task_id:          e.g., "dnb_sig_001"
+  - task_json:        Full task definition (JSON string)
+  - ground_truth:     Ground truth thresholds + reference (JSON string)
+  - prompt_md:        Task prompt in Markdown
+  - pdb_data:         Base64-encoded PDB file (if needed)
+  - pdb_filename:     Original PDB filename (e.g., "7n1j.pdb")
+  - oracle_sequences: JSON list of oracle sequences (for non-binding tasks)
+Falls back to local files in development (when BDB_USE_LOCAL=1).
+HF Dataset: RomeroLab-Duke/biodesignbench-hidden-tasks (private)
+"""
+from __future__ import annotations
+import base64
+import json
+import logging
+import os
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+#  Configuration
+# ---------------------------------------------------------------------------
+# Dataset repo id; overridable through the BDB_TASKS_DATASET env variable.
+TASKS_DATASET = os.environ.get(
+    "BDB_TASKS_DATASET",
+    "RomeroLab-Duke/biodesignbench-hidden-tasks",
+)
+# Access token passed to load_dataset(); None when HF_TOKEN is unset.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# Local-file mode is enabled only by the exact value BDB_USE_LOCAL=1.
+USE_LOCAL = os.environ.get("BDB_USE_LOCAL", "0") == "1"
+# Local paths (for development)
+# The project root is taken to be the parent of this file's directory.
+_PROJECT_ROOT = Path(__file__).resolve().parents[1]
+_TASKS_DIR = _PROJECT_ROOT / "tasks" / "tier2"
+_GT_DIR = _PROJECT_ROOT / "data" / "tier2" / "ground_truth"
+_PROMPTS_DIR = _PROJECT_ROOT / "data" / "tier2" / "prompts"
+_INPUT_DIR = _PROJECT_ROOT / "data" / "tier2" / "input"
+_ORACLE_PATH = _PROJECT_ROOT / "data" / "oracle" / "sequences.json"
+# Tool schemas live next to this module.
+_TOOL_SCHEMAS_PATH = Path(__file__).parent / "mcp_tool_schemas.json"
+# Public task IDs (for development/testing — not hidden)
+PUBLIC_TASK_IDS = {"dnb_sig_001", "sqo_enz_005", "cpx_sig_001"}
+# ---------------------------------------------------------------------------
+#  HF Dataset loading
+# ---------------------------------------------------------------------------
+@lru_cache(maxsize=1)
+def _load_from_hf() -> dict[str, dict[str, Any]]:
+    """Load all tasks from the private HF Dataset."""
+    try:
+        from datasets import load_dataset
+        ds = load_dataset(
+            TASKS_DATASET,
+            split="train",
+            token=HF_TOKEN,
+        )
+        tasks = {}
+        for row in ds:
+            task_id = row["task_id"]
+            tasks[task_id] = {
+                "task_id": task_id,
+                "task_json": json.loads(row["task_json"]),
+                "ground_truth": json.loads(row["ground_truth"]),
+                "prompt_md": row["prompt_md"],
+                "pdb_data": row.get("pdb_data"),
+                "pdb_filename": row.get("pdb_filename"),
+                "oracle_sequences": json.loads(row.get("oracle_sequences", "[]")),
+            }
+        logger.info(f"Loaded {len(tasks)} tasks from HF Dataset")
+        return tasks
+    except Exception as e:
+        logger.error(f"Failed to load tasks from HF: {e}")
+        return {}
+@lru_cache(maxsize=1)
+def _load_from_local() -> dict[str, dict[str, Any]]:
+    """Load tasks from local project files (development mode)."""
+    tasks = {}
+    # Load oracle data
+    oracle_data = {}
+    if _ORACLE_PATH.exists():
+        with open(_ORACLE_PATH) as f:
+            oracle_data = json.load(f)
+    # Enumerate task files
+    if not _TASKS_DIR.exists():
+        logger.warning(f"Tasks directory not found: {_TASKS_DIR}")
+        return tasks
+    for task_path in sorted(_TASKS_DIR.glob("*.json")):
+        task_id = task_path.stem
+        try:
+            with open(task_path) as f:
+                task_json = json.load(f)
+            # Ground truth
+            gt_path = _GT_DIR / f"{task_id}.json"
+            ground_truth = {}
+            if gt_path.exists():
+                with open(gt_path) as f:
+                    ground_truth = json.load(f)
+            # Prompt
+            prompt_path = _PROMPTS_DIR / f"{task_id}.md"
+            prompt_md = ""
+            if prompt_path.exists():
+                prompt_md = prompt_path.read_text()
+            # PDB data
+            pdb_data = None
+            pdb_filename = None
+            input_pdb = task_json.get("input_pdb") or task_json.get("pdb_file")
+            if input_pdb:
+                pdb_path = _INPUT_DIR / input_pdb
+                if pdb_path.exists():
+                    pdb_data = base64.b64encode(pdb_path.read_bytes()).decode()
+                    pdb_filename = input_pdb
+            # Oracle sequences
+            oracle_entry = oracle_data.get(task_id, {})
+            oracle_seqs = oracle_entry.get("sequences", []) if isinstance(oracle_entry, dict) else []
+            tasks[task_id] = {
+                "task_id": task_id,
+                "task_json": task_json,
+                "ground_truth": ground_truth,
+                "prompt_md": prompt_md,
+                "pdb_data": pdb_data,
+                "pdb_filename": pdb_filename,
+                "oracle_sequences": oracle_seqs,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to load task {task_id}: {e}")
+    logger.info(f"Loaded {len(tasks)} tasks from local files")
+    return tasks
+# ---------------------------------------------------------------------------
+#  Public API
+# ---------------------------------------------------------------------------
+def load_all_tasks() -> dict[str, dict[str, Any]]:
+    """Load all benchmark tasks.
+    Returns:
+        Dict mapping task_id → task data dict.
+    """
+    if USE_LOCAL:
+        return _load_from_local()
+    return _load_from_hf()
+def get_task(task_id: str) -> dict[str, Any] | None:
+    """Load a single task by ID."""
+    tasks = load_all_tasks()
+    return tasks.get(task_id)
+def get_hidden_task_ids() -> list[str]:
+    """Get the list of hidden (non-public) task IDs."""
+    tasks = load_all_tasks()
+    return sorted(tid for tid in tasks if tid not in PUBLIC_TASK_IDS)
+def get_all_task_ids() -> list[str]:
+    """Get all task IDs (public + hidden)."""
+    return sorted(load_all_tasks().keys())
+def get_public_task_ids() -> list[str]:
+    """Get the 3 public task IDs for development."""
+    tasks = load_all_tasks()
+    return sorted(tid for tid in tasks if tid in PUBLIC_TASK_IDS)
+@lru_cache(maxsize=1)
+def load_tool_schemas() -> list[dict[str, Any]]:
+    """Load the 17 MCP tool schemas for task payloads."""
+    if _TOOL_SCHEMAS_PATH.exists():
+        with open(_TOOL_SCHEMAS_PATH) as f:
+            return json.load(f)
+    return []
+def build_task_payload(task_id: str) -> dict[str, Any] | None:
+    """Build the payload to send to a submitter's endpoint.
+    Returns:
+        Dict with: task_id, task_description, available_tools,
+        input_files, design_constraints, max_steps, timeout_sec.
+        Returns None if task not found.
+    """
+    task = get_task(task_id)
+    if task is None:
+        return None
+    task_json = task["task_json"]
+    prompt = task["prompt_md"]
+    # Build input files (base64-encoded PDBs)
+    input_files = {}
+    if task.get("pdb_data") and task.get("pdb_filename"):
+        input_files[task["pdb_filename"]] = task["pdb_data"]
+    # Extract constraints from task JSON
+    constraints = task_json.get("design_constraints", {})
+    max_designs = task_json.get("max_designs", 10)
+    return {
+        "task_id": task_id,
+        "task_description": prompt,
+        "available_tools": load_tool_schemas(),
+        "input_files": input_files,
+        "design_constraints": {
+            **constraints,
+            "max_designs": max_designs,
+        },
+        "max_steps": 50,
+        "timeout_sec": 300,
+    }

example_server.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""Reference FastAPI server for BioDesignBench submitters.
+This example shows how to implement the API endpoint that BioDesignBench
+will call during benchmarking. Replace the mock agent logic with your
+actual LLM agent + MCP tool pipeline.
+Usage:
+    pip install fastapi uvicorn
+    python example_server.py
+    # Or with uvicorn directly:
+    uvicorn example_server:app --host 0.0.0.0 --port 8000
+Your endpoint will receive POST requests at /api/run with the task payload.
+Task Payload Format:
+    {
+        "task_id": "dnb_sig_001",
+        "task_description": "Design a de novo binder for...",
+        "available_tools": [... 17 tool schemas ...],
+        "input_files": {"7n1j.pdb": "<base64>"},
+        "design_constraints": {"length_range": [80, 150], "max_designs": 10},
+        "max_steps": 50,
+        "timeout_sec": 300
+    }
+Expected Response Format:
+    {
+        "sequences": ["MKKL...", "MFQR..."],
+        "run_log": [{"step": 1, "tool": "suggest_hotspots", "success": true}, ...],
+        "total_steps": 12,
+        "total_time_sec": 142.5,
+        "metrics": {}
+    }
+"""
+from __future__ import annotations
+import base64
+import random
+import time
+from pathlib import Path
+from typing import Any
+from fastapi import FastAPI
+from pydantic import BaseModel
+# ASGI application object; the route decorators below register on it and
+# the __main__ block hands it to uvicorn.
+app = FastAPI(
+    title="BioDesignBench Example Agent",
+    description="Reference implementation for benchmark submission",
+    version="0.1.0",
+)
+# ---------------------------------------------------------------------------
+#  Request/Response models
+# ---------------------------------------------------------------------------
+# Request body of POST /api/run (see "Task Payload Format" in the module
+# docstring). Only task_id and task_description are required; every other
+# field has a default.
+class TaskPayload(BaseModel):
+    task_id: str
+    task_description: str
+    available_tools: list[dict[str, Any]] = []
+    input_files: dict[str, str] = {}  # filename -> base64 data
+    design_constraints: dict[str, Any] = {}
+    max_steps: int = 50
+    timeout_sec: int = 300
+# Response body of POST /api/run (see "Expected Response Format" in the
+# module docstring). metrics is optional and defaults to an empty dict.
+class AgentResponse(BaseModel):
+    sequences: list[str]
+    run_log: list[dict[str, Any]]
+    total_steps: int
+    total_time_sec: float
+    metrics: dict[str, Any] = {}
+# ---------------------------------------------------------------------------
+#  Mock agent (replace with your real agent)
+# ---------------------------------------------------------------------------
+# Standard amino acids for mock sequence generation
+_AAS = "ACDEFGHIKLMNPQRSTVWY"
+def _generate_mock_sequence(length: int) -> str:
+    """Generate a random protein sequence with reasonable composition."""
+    # Weight towards common amino acids
+    weights = [
+        7, 2, 5, 6, 4, 7, 2, 5, 6, 9,  # A C D E F G H I K L
+        2, 4, 5, 4, 5, 7, 6, 7, 1, 3,   # M N P Q R S T V W Y
+    ]
+    return "".join(random.choices(_AAS, weights=weights, k=length))
+def mock_agent(payload: TaskPayload) -> AgentResponse:
+    """Mock agent that generates random but valid designs.
+    Replace this with your actual LLM agent + MCP tool pipeline.
+    This mock demonstrates the expected response format.
+    """
+    start = time.monotonic()
+    # Determine design parameters
+    constraints = payload.design_constraints
+    length_range = constraints.get("length_range", [80, 150])
+    max_designs = constraints.get("max_designs", 10)
+    num_designs = min(max_designs, 5)  # Generate 5 for this mock
+    # "Decode" input PDB files (in a real agent, you'd use these)
+    for filename, b64_data in payload.input_files.items():
+        pdb_bytes = base64.b64decode(b64_data)
+        # In a real agent: save to temp file and pass to MCP tools
+    # Simulate a multi-step design pipeline
+    run_log = [
+        {
+            "step": 1,
+            "tool": "suggest_hotspots",
+            "success": True,
+            "args_summary": {"target": "from_pdb"},
+            "output_summary": "Found 5 hotspot residues",
+        },
+        {
+            "step": 2,
+            "tool": "generate_backbone",
+            "success": True,
+            "args_summary": {"length": length_range[0]},
+            "output_summary": f"Generated {num_designs} backbones",
+        },
+        {
+            "step": 3,
+            "tool": "optimize_sequence",
+            "success": True,
+            "args_summary": {"optimization_target": "both"},
+            "output_summary": f"Optimized {num_designs} sequences",
+        },
+        {
+            "step": 4,
+            "tool": "predict_structure",
+            "success": True,
+            "args_summary": {"predictor": "esmfold"},
+            "output_summary": "Predicted structures for all designs",
+        },
+        {
+            "step": 5,
+            "tool": "validate_design",
+            "success": True,
+            "args_summary": {},
+            "output_summary": "Validated all designs",
+        },
+    ]
+    # Generate mock sequences
+    min_len, max_len = length_range
+    sequences = [
+        _generate_mock_sequence(random.randint(min_len, max_len))
+        for _ in range(num_designs)
+    ]
+    elapsed = time.monotonic() - start
+    return AgentResponse(
+        sequences=sequences,
+        run_log=run_log,
+        total_steps=len(run_log),
+        total_time_sec=round(elapsed, 2),
+        metrics={},  # Agent-reported metrics (optional)
+    )
+# ---------------------------------------------------------------------------
+#  API endpoint
+# ---------------------------------------------------------------------------
+# The request body is validated against TaskPayload and the return value
+# against AgentResponse (response_model) by the framework.
+@app.post("/api/run", response_model=AgentResponse)
+async def run_task(payload: TaskPayload) -> AgentResponse:
+    """Run a single benchmark task.
+    This is the endpoint that BioDesignBench will POST to during benchmarking.
+    Replace mock_agent() with your actual agent logic.
+    """
+    # mock_agent is synchronous and runs inline in this coroutine.
+    return mock_agent(payload)
+@app.get("/health")
+async def health():
+    """Health check endpoint."""
+    return {"status": "ok", "agent": "example-mock"}
+# ---------------------------------------------------------------------------
+#  Entry point
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import uvicorn
+    print("Starting BioDesignBench example server...")
+    print("POST endpoint: http://localhost:8000/api/run")
+    print("Health check:  http://localhost:8000/health")
+    print()
+    print("Replace mock_agent() with your real agent logic.")
+    uvicorn.run(app, host="0.0.0.0", port=8000)

mcp_tool_schemas.json ADDED Viewed

	@@ -0,0 +1,468 @@

+[
+  {
+    "name": "design_binder",
+    "description": "Design protein binders for a target protein. Runs RFdiffusion -> ProteinMPNN -> ESMFold pipeline.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "target_pdb": {
+          "type": "string",
+          "description": "Path to target protein PDB file"
+        },
+        "hotspot_residues": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Target residues for binder interface, e.g. ['A45', 'A46']"
+        },
+        "num_designs": {
+          "type": "integer",
+          "description": "Number of designs to generate (default: 10)",
+          "default": 10
+        },
+        "binder_length": {
+          "type": "integer",
+          "description": "Binder length in residues (default: 80)",
+          "default": 80
+        }
+      },
+      "required": [
+        "target_pdb",
+        "hotspot_residues"
+      ]
+    }
+  },
+  {
+    "name": "analyze_interface",
+    "description": "Analyze protein-protein interface: buried surface area, H-bonds, salt bridges, hydrophobic contacts.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "complex_pdb": {
+          "type": "string",
+          "description": "Path to complex PDB file"
+        },
+        "chain_a": {
+          "type": "string",
+          "description": "Chain ID of first protein"
+        },
+        "chain_b": {
+          "type": "string",
+          "description": "Chain ID of second protein"
+        }
+      },
+      "required": [
+        "complex_pdb",
+        "chain_a",
+        "chain_b"
+      ]
+    }
+  },
+  {
+    "name": "validate_design",
+    "description": "Validate a designed sequence by predicting its structure (ESMFold/AlphaFold2) and computing pLDDT, pTM.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequence": {
+          "type": "string",
+          "description": "Amino acid sequence to validate"
+        },
+        "expected_structure": {
+          "type": "string",
+          "description": "Optional PDB path for RMSD comparison"
+        },
+        "predictor": {
+          "type": "string",
+          "enum": [
+            "esmfold",
+            "alphafold2"
+          ],
+          "default": "esmfold",
+          "description": "Structure predictor to use"
+        }
+      },
+      "required": [
+        "sequence"
+      ]
+    }
+  },
+  {
+    "name": "optimize_sequence",
+    "description": "Optimize binder sequence for improved stability and/or binding affinity.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "current_sequence": {
+          "type": "string",
+          "description": "Starting amino acid sequence"
+        },
+        "target_pdb": {
+          "type": "string",
+          "description": "Path to target protein PDB"
+        },
+        "optimization_target": {
+          "type": "string",
+          "enum": [
+            "stability",
+            "affinity",
+            "both"
+          ],
+          "default": "both"
+        },
+        "fixed_positions": {
+          "type": "array",
+          "items": {
+            "type": "integer"
+          },
+          "description": "Positions to keep fixed (1-indexed)"
+        }
+      },
+      "required": [
+        "current_sequence",
+        "target_pdb"
+      ]
+    }
+  },
+  {
+    "name": "suggest_hotspots",
+    "description": "Analyze target protein and suggest binding hotspots using structure, conservation, and literature.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "target": {
+          "type": "string",
+          "description": "Protein name, UniProt ID, PDB ID, or local PDB path"
+        },
+        "chain_id": {
+          "type": "string",
+          "description": "Chain to analyze (default: first)"
+        },
+        "criteria": {
+          "type": "string",
+          "enum": [
+            "druggable",
+            "exposed",
+            "conserved"
+          ],
+          "default": "exposed"
+        },
+        "include_literature": {
+          "type": "boolean",
+          "default": false,
+          "description": "Search PubMed for known binders"
+        }
+      },
+      "required": [
+        "target"
+      ]
+    }
+  },
+  {
+    "name": "get_design_status",
+    "description": "Check status of running design jobs.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "job_id": {
+          "type": "string",
+          "description": "Job ID from design_binder call"
+        }
+      },
+      "required": [
+        "job_id"
+      ]
+    }
+  },
+  {
+    "name": "predict_complex",
+    "description": "Predict protein complex structure using AlphaFold2-Multimer.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequences": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "List of sequences, one per chain"
+        },
+        "chain_names": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Optional chain identifiers"
+        }
+      },
+      "required": [
+        "sequences"
+      ]
+    }
+  },
+  {
+    "name": "predict_structure",
+    "description": "Predict the 3D structure of a single protein chain using ESMFold or AlphaFold2. Returns predicted PDB, pLDDT, and pTM scores.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequence": {
+          "type": "string",
+          "description": "Amino acid sequence to predict structure for"
+        },
+        "predictor": {
+          "type": "string",
+          "enum": [
+            "esmfold",
+            "alphafold2"
+          ],
+          "default": "esmfold",
+          "description": "Structure predictor to use"
+        }
+      },
+      "required": [
+        "sequence"
+      ]
+    }
+  },
+  {
+    "name": "score_stability",
+    "description": "Score protein stability using ESM2 pseudo-log-likelihood. Optionally compute per-mutation effects (delta log-likelihood).",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequence": {
+          "type": "string",
+          "description": "Amino acid sequence to score"
+        },
+        "mutations": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Optional mutations in 'X42Y' format for delta scoring"
+        },
+        "reference_sequence": {
+          "type": "string",
+          "description": "Optional wild-type sequence for mutation scoring"
+        }
+      },
+      "required": [
+        "sequence"
+      ]
+    }
+  },
+  {
+    "name": "energy_minimize",
+    "description": "Energy-minimize a protein structure using OpenMM with AMBER14 force field. Returns minimized PDB, energy change, and RMSD from initial structure.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "pdb_path": {
+          "type": "string",
+          "description": "Path to input PDB file"
+        },
+        "force_field": {
+          "type": "string",
+          "default": "amber14-all.xml",
+          "description": "OpenMM force field XML"
+        },
+        "num_steps": {
+          "type": "integer",
+          "default": 500,
+          "description": "Maximum minimization iterations"
+        },
+        "solvent": {
+          "type": "string",
+          "enum": [
+            "implicit",
+            "none"
+          ],
+          "default": "implicit",
+          "description": "Solvent model: implicit (GBn2) or none (vacuum)"
+        }
+      },
+      "required": [
+        "pdb_path"
+      ]
+    }
+  },
+  {
+    "name": "generate_backbone",
+    "description": "Generate de novo protein backbones using RFdiffusion unconditional generation. No target protein required.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "length": {
+          "type": "integer",
+          "description": "Backbone length in residues"
+        },
+        "num_designs": {
+          "type": "integer",
+          "default": 10,
+          "description": "Number of designs to generate"
+        }
+      },
+      "required": [
+        "length"
+      ]
+    }
+  },
+  {
+    "name": "rosetta_score",
+    "description": "Score a protein structure using Rosetta energy function (ref2015). Returns total score, per-residue energies, and energy breakdown.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "pdb_path": {
+          "type": "string",
+          "description": "Path to input PDB file"
+        },
+        "score_function": {
+          "type": "string",
+          "default": "ref2015",
+          "description": "Rosetta score function name"
+        }
+      },
+      "required": [
+        "pdb_path"
+      ]
+    }
+  },
+  {
+    "name": "rosetta_relax",
+    "description": "Relax a protein structure using Rosetta FastRelax. Returns relaxed PDB, energy change, and CA-RMSD.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "pdb_path": {
+          "type": "string",
+          "description": "Path to input PDB file"
+        },
+        "nstruct": {
+          "type": "integer",
+          "default": 1,
+          "description": "Number of relaxation trajectories"
+        },
+        "score_function": {
+          "type": "string",
+          "default": "ref2015",
+          "description": "Rosetta score function name"
+        }
+      },
+      "required": [
+        "pdb_path"
+      ]
+    }
+  },
+  {
+    "name": "rosetta_interface_score",
+    "description": "Compute interface energy metrics for a protein complex using Rosetta. Returns dG_separated, dSASA, interface hbonds, and packing stats.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "pdb_path": {
+          "type": "string",
+          "description": "Path to complex PDB file"
+        },
+        "chains": {
+          "type": "string",
+          "default": "A_B",
+          "description": "Chain grouping, e.g. 'A_B' or 'AB_C'"
+        },
+        "score_function": {
+          "type": "string",
+          "default": "ref2015",
+          "description": "Rosetta score function name"
+        }
+      },
+      "required": [
+        "pdb_path"
+      ]
+    }
+  },
+  {
+    "name": "rosetta_design",
+    "description": "Fixed-backbone sequence design using Rosetta PackRotamers + MinMover. Composite convenience tool (hidden in benchmark mode).",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "pdb_path": {
+          "type": "string",
+          "description": "Path to input PDB file"
+        },
+        "chains": {
+          "type": "string",
+          "default": "A_B",
+          "description": "Chain grouping for interface detection"
+        },
+        "fixed_positions": {
+          "type": "array",
+          "items": {
+            "type": "integer"
+          },
+          "description": "1-indexed positions to keep fixed"
+        },
+        "score_function": {
+          "type": "string",
+          "default": "ref2015",
+          "description": "Rosetta score function name"
+        }
+      },
+      "required": [
+        "pdb_path"
+      ]
+    }
+  },
+  {
+    "name": "predict_structure_boltz",
+    "description": "Predict protein structure using Boltz (fast alternative to AF2/ESMFold). Returns predicted PDB, pLDDT, and pTM scores.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequence": {
+          "type": "string",
+          "description": "Amino acid sequence to predict structure for"
+        },
+        "model": {
+          "type": "string",
+          "default": "boltz2",
+          "description": "Model name (default: boltz2)"
+        },
+        "num_samples": {
+          "type": "integer",
+          "default": 1,
+          "description": "Number of structure samples"
+        }
+      },
+      "required": [
+        "sequence"
+      ]
+    }
+  },
+  {
+    "name": "predict_affinity_boltz",
+    "description": "Predict binding affinity for a protein complex using Boltz. Returns affinity score, predicted structure, and confidence metrics.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "sequences": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "List of amino acid sequences, one per chain"
+        },
+        "model": {
+          "type": "string",
+          "default": "boltz2",
+          "description": "Model name (default: boltz2)"
+        }
+      },
+      "required": [
+        "sequences"
+      ]
+    }
+  }
+]

requirements.txt CHANGED Viewed

@@ -1,3 +1,8 @@
-gradio>=4.0
 pandas
 plotly

+gradio>=5.6
 pandas
 plotly
+httpx>=0.25
+huggingface_hub>=0.20
+datasets>=2.16
+boltz>=0.4
+pyaudioop