{
  "last_updated": "2026-04-14",
  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
  "headline_findings": [
    "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
    "All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
    "Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
    "Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond what binary tool selection alone explains.",
    "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (\u22122.3) \u2014 evidence that depth, not process change alone, drives the gain."
  ],
  "scoring": {
    "rubric_max": 100,
    "components": {
      "approach": 20,
      "orchestration": 15,
      "quality": 35,
      "feasibility": 15,
      "novelty": 5,
      "diversity": 10
    },
    "method": "Hybrid: 72 algorithmic points (Boltz-2 verification) + 28 LLM-judge points (3-judge panel with self-exclusion)."
  },
  "entries": [
    {
      "agent_name": "Human Oracle",
      "agent_id": "oracle",
      "mode": null,
      "submission_type": "human_oracle",
      "organization": "Romero Lab",
      "mcp_custom": false,
      "overall_score": 74.85,
      "component_scores": {
        "approach": 20.0,
        "orchestration": 15.0,
        "quality": 26.24,
        "feasibility": 10.26,
        "novelty": 2.93,
        "diversity": 0.43
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 79.2,
          "binder": 71.8,
          "enzyme": 75.6,
          "fluorescent_protein": 78.7,
          "scaffold": 75.8
        },
        "redesign": {
          "antibody": 69.2,
          "enzyme": 76.2,
          "fluorescent_protein": 77.1,
          "scaffold": 76.8
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 0,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Human Expert",
      "agent_id": "human-expert",
      "mode": null,
      "submission_type": "human_expert",
      "organization": "Romero Lab",
      "mcp_custom": false,
      "overall_score": 61.25,
      "component_scores": {
        "approach": 13.81,
        "orchestration": 8.86,
        "quality": 20.91,
        "feasibility": 10.79,
        "novelty": 3.46,
        "diversity": 3.43
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 65.6,
          "binder": 65.0,
          "enzyme": 55.3,
          "fluorescent_protein": 57.2,
          "scaffold": 65.4
        },
        "redesign": {
          "antibody": 52.4,
          "enzyme": 59.5,
          "fluorescent_protein": 54.6,
          "scaffold": 53.7
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 0,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "DeepSeek V3",
      "agent_id": "deepseek-v3-benchmark",
      "mode": "benchmark",
      "submission_type": "llm",
      "organization": "DeepSeek",
      "mcp_custom": false,
      "overall_score": 60.43,
      "component_scores": {
        "approach": 11.4,
        "orchestration": 9.36,
        "quality": 22.07,
        "feasibility": 10.77,
        "novelty": 3.44,
        "diversity": 3.38
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 65.0,
          "binder": 63.4,
          "enzyme": 53.9,
          "fluorescent_protein": 72.3,
          "scaffold": 57.8
        },
        "redesign": {
          "antibody": 61.3,
          "enzyme": 59.3,
          "fluorescent_protein": 56.9,
          "scaffold": 66.9
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 1,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "DeepSeek V3",
      "agent_id": "deepseek-v3-user",
      "mode": "user",
      "submission_type": "llm",
      "organization": "DeepSeek",
      "mcp_custom": false,
      "overall_score": 58.46,
      "component_scores": {
        "approach": 11.09,
        "orchestration": 9.14,
        "quality": 21.74,
        "feasibility": 9.91,
        "novelty": 3.25,
        "diversity": 3.33
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 65.6,
          "binder": 63.0,
          "enzyme": 64.2,
          "fluorescent_protein": 64.2,
          "scaffold": 60.4
        },
        "redesign": {
          "antibody": 61.6,
          "enzyme": 60.7,
          "fluorescent_protein": 43.0,
          "scaffold": 44.1
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 7,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "GPT-5",
      "agent_id": "gpt5-benchmark",
      "mode": "benchmark",
      "submission_type": "llm",
      "organization": "OpenAI",
      "mcp_custom": false,
      "overall_score": 55.61,
      "component_scores": {
        "approach": 8.76,
        "orchestration": 6.84,
        "quality": 22.96,
        "feasibility": 10.03,
        "novelty": 3.27,
        "diversity": 3.75
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 62.6,
          "binder": 59.9,
          "enzyme": 55.9,
          "fluorescent_protein": 53.9,
          "scaffold": 56.1
        },
        "redesign": {
          "antibody": 47.3,
          "enzyme": 54.4,
          "fluorescent_protein": 49.5,
          "scaffold": 54.6
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 2,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "GPT-5",
      "agent_id": "gpt5-user",
      "mode": "user",
      "submission_type": "llm",
      "organization": "OpenAI",
      "mcp_custom": false,
      "overall_score": 55.26,
      "component_scores": {
        "approach": 9.46,
        "orchestration": 8.29,
        "quality": 20.83,
        "feasibility": 9.9,
        "novelty": 3.2,
        "diversity": 3.58
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 61.2,
          "binder": 56.1,
          "enzyme": 57.9,
          "fluorescent_protein": 61.3,
          "scaffold": 55.6
        },
        "redesign": {
          "antibody": 52.1,
          "enzyme": 54.2,
          "fluorescent_protein": 55.7,
          "scaffold": 46.3
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 4,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Hardcoded Pipeline",
      "agent_id": "hardcoded-pipeline",
      "mode": null,
      "submission_type": "hardcoded",
      "organization": "Deterministic",
      "mcp_custom": false,
      "overall_score": 54.2,
      "component_scores": {
        "approach": 10.19,
        "orchestration": 8.3,
        "quality": 19.91,
        "feasibility": 10.26,
        "novelty": 2.48,
        "diversity": 3.08
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 60.8,
          "binder": 59.8,
          "enzyme": 46.0,
          "fluorescent_protein": 62.6,
          "scaffold": 55.0
        },
        "redesign": {
          "antibody": 45.4,
          "enzyme": 50.7,
          "fluorescent_protein": 49.5,
          "scaffold": 50.3
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 0,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Claude Sonnet 4.5",
      "agent_id": "sonnet-4.5-user",
      "mode": "user",
      "submission_type": "llm",
      "organization": "Anthropic",
      "mcp_custom": false,
      "overall_score": 50.23,
      "component_scores": {
        "approach": 9.63,
        "orchestration": 8.54,
        "quality": 17.31,
        "feasibility": 9.03,
        "novelty": 2.68,
        "diversity": 3.05
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 66.3,
          "binder": 56.5,
          "enzyme": 56.9,
          "fluorescent_protein": 62.8,
          "scaffold": 57.9
        },
        "redesign": {
          "antibody": 43.1,
          "enzyme": 37.5,
          "fluorescent_protein": 32.8,
          "scaffold": 42.0
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 16,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Claude Sonnet 4.5",
      "agent_id": "sonnet-4.5-benchmark",
      "mode": "benchmark",
      "submission_type": "llm",
      "organization": "Anthropic",
      "mcp_custom": false,
      "overall_score": 41.17,
      "component_scores": {
        "approach": 7.92,
        "orchestration": 6.93,
        "quality": 13.54,
        "feasibility": 8.2,
        "novelty": 2.25,
        "diversity": 2.33
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 29.5,
          "binder": 55.5,
          "enzyme": 29.6,
          "fluorescent_protein": 45.9,
          "scaffold": 41.2
        },
        "redesign": {
          "antibody": 34.6,
          "enzyme": 29.5,
          "fluorescent_protein": 35.3,
          "scaffold": 40.9
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 23,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Gemini 2.5 Pro",
      "agent_id": "gemini-2.5-pro-user",
      "mode": "user",
      "submission_type": "llm",
      "organization": "Google",
      "mcp_custom": false,
      "overall_score": 8.75,
      "component_scores": {
        "approach": 3.37,
        "orchestration": 2.79,
        "quality": 0.55,
        "feasibility": 1.15,
        "novelty": 0.49,
        "diversity": 0.41
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 10.8,
          "binder": 9.3,
          "enzyme": 30.2,
          "fluorescent_protein": 3.1,
          "scaffold": 9.2
        },
        "redesign": {
          "antibody": 8.0,
          "enzyme": 4.9,
          "fluorescent_protein": 6.8,
          "scaffold": 8.6
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 74,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    },
    {
      "agent_name": "Gemini 2.5 Pro",
      "agent_id": "gemini-2.5-pro-benchmark",
      "mode": "benchmark",
      "submission_type": "llm",
      "organization": "Google",
      "mcp_custom": false,
      "overall_score": 8.11,
      "component_scores": {
        "approach": 3.58,
        "orchestration": 2.47,
        "quality": 0.34,
        "feasibility": 0.93,
        "novelty": 0.42,
        "diversity": 0.37
      },
      "taxonomy_scores": {
        "de_novo": {
          "antibody": 9.1,
          "binder": 9.2,
          "enzyme": 11.0,
          "fluorescent_protein": 3.1,
          "scaffold": 9.1
        },
        "redesign": {
          "antibody": 7.3,
          "enzyme": 4.4,
          "fluorescent_protein": 6.2,
          "scaffold": 11.4
        }
      },
      "tasks_completed": 76,
      "tasks_total": 76,
      "tasks_with_zero": 75,
      "avg_latency_sec": null,
      "submission_date": "2026-04-06"
    }
  ],
  "interventions": {
    "description": "Causal intervention experiments on the depth gap. Eighteen representative tasks were rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
    "n_tasks": 18,
    "rows": [
      {
        "label": "DeepSeek V3 \u2014 baseline",
        "condition": "baseline",
        "agent": "deepseek-v3-tools-benchmark",
        "n_tasks": 18,
        "score": 58.72,
        "delta_vs_baseline": 0.0,
        "approach": 13.44,
        "orchestration": 11.17,
        "quality": 16.11,
        "diversity": 3.56
      },
      {
        "label": "GPT-5 \u2014 baseline",
        "condition": "baseline",
        "agent": "gpt5-tools-benchmark",
        "n_tasks": 18,
        "score": 46.78,
        "delta_vs_baseline": 0.0,
        "approach": 8.33,
        "orchestration": 6.22,
        "quality": 15.39,
        "diversity": 3.94
      },
      {
        "label": "Human Expert \u2014 baseline",
        "condition": "baseline",
        "agent": "human-expert-agent",
        "n_tasks": 18,
        "score": 56.67,
        "delta_vs_baseline": 0.0,
        "approach": 18.28,
        "orchestration": 9.28,
        "quality": 11.06,
        "diversity": 2.28
      },
      {
        "label": "DeepSeek V3 \u2014 forced depth",
        "condition": "forced_depth",
        "agent": "deepseek-v3-forced-depth",
        "n_tasks": 18,
        "score": 68.06,
        "delta_vs_baseline": 9.34,
        "approach": 18.39,
        "orchestration": 12.28,
        "quality": 16.11,
        "diversity": 3.94
      },
      {
        "label": "GPT-5 \u2014 forced depth",
        "condition": "forced_depth",
        "agent": "gpt5-tools-forced-depth",
        "n_tasks": 18,
        "score": 62.67,
        "delta_vs_baseline": 15.89,
        "approach": 18.28,
        "orchestration": 11.67,
        "quality": 15.0,
        "diversity": 3.06
      },
      {
        "label": "DeepSeek V3 \u2014 low diversity",
        "condition": "low_diversity_control",
        "agent": "deepseek-v3-low-diversity",
        "n_tasks": 18,
        "score": 56.39,
        "delta_vs_baseline": -2.33,
        "approach": 13.11,
        "orchestration": 11.11,
        "quality": 16.0,
        "diversity": 3.22
      },
      {
        "label": "GPT-5 \u2014 low diversity",
        "condition": "low_diversity_control",
        "agent": "gpt5-tools-low-diversity",
        "n_tasks": 18,
        "score": 61.5,
        "delta_vs_baseline": 14.72,
        "approach": 13.06,
        "orchestration": 12.0,
        "quality": 16.22,
        "diversity": 3.22
      },
      {
        "label": "Human Expert \u2014 shallow",
        "condition": "low_diversity_control",
        "agent": "human-expert-shallow",
        "n_tasks": 18,
        "score": 55.06,
        "delta_vs_baseline": -1.61,
        "approach": 18.22,
        "orchestration": 9.28,
        "quality": 11.17,
        "diversity": 0.61
      }
    ]
  }
}