BioDesignBench-Leaderboard / leaderboard_data.json
Jasonkim8652's picture
Update leaderboard with canonical Apr-6 hybrid scores + depth-gap interventions
cfedbc8 verified
{
"last_updated": "2026-04-14",
"paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
"headline_findings": [
"Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
"All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
"Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
"Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond binary tool selection.",
"Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (-2.3) \u2014 evidence that depth, not process change alone, drives the gain."
],
"scoring": {
"rubric_max": 100,
"components": {
"approach": 20,
"orchestration": 15,
"quality": 35,
"feasibility": 15,
"novelty": 5,
"diversity": 10
},
"method": "Hybrid: 72 algorithmic points (Boltz-2 verification) + 28 LLM-judge points (3-judge panel with self-exclusion)."
},
"entries": [
{
"agent_name": "Human Oracle",
"agent_id": "oracle",
"mode": null,
"submission_type": "human_oracle",
"organization": "Romero Lab",
"mcp_custom": false,
"overall_score": 74.85,
"component_scores": {
"approach": 20.0,
"orchestration": 15.0,
"quality": 26.24,
"feasibility": 10.26,
"novelty": 2.93,
"diversity": 0.43
},
"taxonomy_scores": {
"de_novo": {
"antibody": 79.2,
"binder": 71.8,
"enzyme": 75.6,
"fluorescent_protein": 78.7,
"scaffold": 75.8
},
"redesign": {
"antibody": 69.2,
"enzyme": 76.2,
"fluorescent_protein": 77.1,
"scaffold": 76.8
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 0,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Human Expert",
"agent_id": "human-expert",
"mode": null,
"submission_type": "human_expert",
"organization": "Romero Lab",
"mcp_custom": false,
"overall_score": 61.25,
"component_scores": {
"approach": 13.81,
"orchestration": 8.86,
"quality": 20.91,
"feasibility": 10.79,
"novelty": 3.46,
"diversity": 3.43
},
"taxonomy_scores": {
"de_novo": {
"antibody": 65.6,
"binder": 65.0,
"enzyme": 55.3,
"fluorescent_protein": 57.2,
"scaffold": 65.4
},
"redesign": {
"antibody": 52.4,
"enzyme": 59.5,
"fluorescent_protein": 54.6,
"scaffold": 53.7
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 0,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "DeepSeek V3",
"agent_id": "deepseek-v3-benchmark",
"mode": "benchmark",
"submission_type": "llm",
"organization": "DeepSeek",
"mcp_custom": false,
"overall_score": 60.43,
"component_scores": {
"approach": 11.4,
"orchestration": 9.36,
"quality": 22.07,
"feasibility": 10.77,
"novelty": 3.44,
"diversity": 3.38
},
"taxonomy_scores": {
"de_novo": {
"antibody": 65.0,
"binder": 63.4,
"enzyme": 53.9,
"fluorescent_protein": 72.3,
"scaffold": 57.8
},
"redesign": {
"antibody": 61.3,
"enzyme": 59.3,
"fluorescent_protein": 56.9,
"scaffold": 66.9
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 1,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "DeepSeek V3",
"agent_id": "deepseek-v3-user",
"mode": "user",
"submission_type": "llm",
"organization": "DeepSeek",
"mcp_custom": false,
"overall_score": 58.46,
"component_scores": {
"approach": 11.09,
"orchestration": 9.14,
"quality": 21.74,
"feasibility": 9.91,
"novelty": 3.25,
"diversity": 3.33
},
"taxonomy_scores": {
"de_novo": {
"antibody": 65.6,
"binder": 63.0,
"enzyme": 64.2,
"fluorescent_protein": 64.2,
"scaffold": 60.4
},
"redesign": {
"antibody": 61.6,
"enzyme": 60.7,
"fluorescent_protein": 43.0,
"scaffold": 44.1
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 7,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "GPT-5",
"agent_id": "gpt5-benchmark",
"mode": "benchmark",
"submission_type": "llm",
"organization": "OpenAI",
"mcp_custom": false,
"overall_score": 55.61,
"component_scores": {
"approach": 8.76,
"orchestration": 6.84,
"quality": 22.96,
"feasibility": 10.03,
"novelty": 3.27,
"diversity": 3.75
},
"taxonomy_scores": {
"de_novo": {
"antibody": 62.6,
"binder": 59.9,
"enzyme": 55.9,
"fluorescent_protein": 53.9,
"scaffold": 56.1
},
"redesign": {
"antibody": 47.3,
"enzyme": 54.4,
"fluorescent_protein": 49.5,
"scaffold": 54.6
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 2,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "GPT-5",
"agent_id": "gpt5-user",
"mode": "user",
"submission_type": "llm",
"organization": "OpenAI",
"mcp_custom": false,
"overall_score": 55.26,
"component_scores": {
"approach": 9.46,
"orchestration": 8.29,
"quality": 20.83,
"feasibility": 9.9,
"novelty": 3.2,
"diversity": 3.58
},
"taxonomy_scores": {
"de_novo": {
"antibody": 61.2,
"binder": 56.1,
"enzyme": 57.9,
"fluorescent_protein": 61.3,
"scaffold": 55.6
},
"redesign": {
"antibody": 52.1,
"enzyme": 54.2,
"fluorescent_protein": 55.7,
"scaffold": 46.3
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 4,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Hardcoded Pipeline",
"agent_id": "hardcoded-pipeline",
"mode": null,
"submission_type": "hardcoded",
"organization": "Deterministic",
"mcp_custom": false,
"overall_score": 54.2,
"component_scores": {
"approach": 10.19,
"orchestration": 8.3,
"quality": 19.91,
"feasibility": 10.26,
"novelty": 2.48,
"diversity": 3.08
},
"taxonomy_scores": {
"de_novo": {
"antibody": 60.8,
"binder": 59.8,
"enzyme": 46.0,
"fluorescent_protein": 62.6,
"scaffold": 55.0
},
"redesign": {
"antibody": 45.4,
"enzyme": 50.7,
"fluorescent_protein": 49.5,
"scaffold": 50.3
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 0,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Claude Sonnet 4.5",
"agent_id": "sonnet-4.5-user",
"mode": "user",
"submission_type": "llm",
"organization": "Anthropic",
"mcp_custom": false,
"overall_score": 50.23,
"component_scores": {
"approach": 9.63,
"orchestration": 8.54,
"quality": 17.31,
"feasibility": 9.03,
"novelty": 2.68,
"diversity": 3.05
},
"taxonomy_scores": {
"de_novo": {
"antibody": 66.3,
"binder": 56.5,
"enzyme": 56.9,
"fluorescent_protein": 62.8,
"scaffold": 57.9
},
"redesign": {
"antibody": 43.1,
"enzyme": 37.5,
"fluorescent_protein": 32.8,
"scaffold": 42.0
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 16,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Claude Sonnet 4.5",
"agent_id": "sonnet-4.5-benchmark",
"mode": "benchmark",
"submission_type": "llm",
"organization": "Anthropic",
"mcp_custom": false,
"overall_score": 41.17,
"component_scores": {
"approach": 7.92,
"orchestration": 6.93,
"quality": 13.54,
"feasibility": 8.2,
"novelty": 2.25,
"diversity": 2.33
},
"taxonomy_scores": {
"de_novo": {
"antibody": 29.5,
"binder": 55.5,
"enzyme": 29.6,
"fluorescent_protein": 45.9,
"scaffold": 41.2
},
"redesign": {
"antibody": 34.6,
"enzyme": 29.5,
"fluorescent_protein": 35.3,
"scaffold": 40.9
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 23,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Gemini 2.5 Pro",
"agent_id": "gemini-2.5-pro-user",
"mode": "user",
"submission_type": "llm",
"organization": "Google",
"mcp_custom": false,
"overall_score": 8.75,
"component_scores": {
"approach": 3.37,
"orchestration": 2.79,
"quality": 0.55,
"feasibility": 1.15,
"novelty": 0.49,
"diversity": 0.41
},
"taxonomy_scores": {
"de_novo": {
"antibody": 10.8,
"binder": 9.3,
"enzyme": 30.2,
"fluorescent_protein": 3.1,
"scaffold": 9.2
},
"redesign": {
"antibody": 8.0,
"enzyme": 4.9,
"fluorescent_protein": 6.8,
"scaffold": 8.6
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 74,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
},
{
"agent_name": "Gemini 2.5 Pro",
"agent_id": "gemini-2.5-pro-benchmark",
"mode": "benchmark",
"submission_type": "llm",
"organization": "Google",
"mcp_custom": false,
"overall_score": 8.11,
"component_scores": {
"approach": 3.58,
"orchestration": 2.47,
"quality": 0.34,
"feasibility": 0.93,
"novelty": 0.42,
"diversity": 0.37
},
"taxonomy_scores": {
"de_novo": {
"antibody": 9.1,
"binder": 9.2,
"enzyme": 11.0,
"fluorescent_protein": 3.1,
"scaffold": 9.1
},
"redesign": {
"antibody": 7.3,
"enzyme": 4.4,
"fluorescent_protein": 6.2,
"scaffold": 11.4
}
},
"tasks_completed": 76,
"tasks_total": 76,
"tasks_with_zero": 75,
"avg_latency_sec": null,
"submission_date": "2026-04-06"
}
],
"interventions": {
"description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
"n_tasks": 18,
"rows": [
{
"label": "DeepSeek V3 \u2014 baseline",
"condition": "baseline",
"agent": "deepseek-v3-tools-benchmark",
"n_tasks": 18,
"score": 58.72,
"delta_vs_baseline": 0.0,
"approach": 13.44,
"orchestration": 11.17,
"quality": 16.11,
"diversity": 3.56
},
{
"label": "GPT-5 \u2014 baseline",
"condition": "baseline",
"agent": "gpt5-tools-benchmark",
"n_tasks": 18,
"score": 46.78,
"delta_vs_baseline": 0.0,
"approach": 8.33,
"orchestration": 6.22,
"quality": 15.39,
"diversity": 3.94
},
{
"label": "Human Expert \u2014 baseline",
"condition": "baseline",
"agent": "human-expert-agent",
"n_tasks": 18,
"score": 56.67,
"delta_vs_baseline": 0.0,
"approach": 18.28,
"orchestration": 9.28,
"quality": 11.06,
"diversity": 2.28
},
{
"label": "DeepSeek V3 \u2014 forced depth",
"condition": "forced_depth",
"agent": "deepseek-v3-forced-depth",
"n_tasks": 18,
"score": 68.06,
"delta_vs_baseline": 9.34,
"approach": 18.39,
"orchestration": 12.28,
"quality": 16.11,
"diversity": 3.94
},
{
"label": "GPT-5 \u2014 forced depth",
"condition": "forced_depth",
"agent": "gpt5-tools-forced-depth",
"n_tasks": 18,
"score": 62.67,
"delta_vs_baseline": 15.89,
"approach": 18.28,
"orchestration": 11.67,
"quality": 15.0,
"diversity": 3.06
},
{
"label": "DeepSeek V3 \u2014 low diversity",
"condition": "low_diversity_control",
"agent": "deepseek-v3-low-diversity",
"n_tasks": 18,
"score": 56.39,
"delta_vs_baseline": -2.33,
"approach": 13.11,
"orchestration": 11.11,
"quality": 16.0,
"diversity": 3.22
},
{
"label": "GPT-5 \u2014 low diversity",
"condition": "low_diversity_control",
"agent": "gpt5-tools-low-diversity",
"n_tasks": 18,
"score": 61.5,
"delta_vs_baseline": 14.72,
"approach": 13.06,
"orchestration": 12.0,
"quality": 16.22,
"diversity": 3.22
},
{
"label": "Human Expert \u2014 shallow",
"condition": "low_diversity_control",
"agent": "human-expert-shallow",
"n_tasks": 18,
"score": 55.06,
"delta_vs_baseline": -1.61,
"approach": 18.22,
"orchestration": 9.28,
"quality": 11.17,
"diversity": 0.61
}
]
}
}