Spaces:

RomeroLab-Duke
/

BioDesignBench-Leaderboard

Running

App Files Files Community

Jasonkim8652 commited on Mar 3

Commit

d647ac9

verified ·

1 Parent(s): e239859

Upload leaderboard_data.json with huggingface_hub

Browse files

Files changed (1) hide show

leaderboard_data.json +382 -0

leaderboard_data.json ADDED Viewed

	@@ -0,0 +1,382 @@

+{
+  "last_updated": "2026-03-03",
+  "entries": [
+    {
+      "agent_name": "Human Oracle",
+      "agent_id": "human-oracle",
+      "mode": null,
+      "mcp_custom": false,
+      "submission_type": "human_oracle",
+      "organization": "Ground Truth",
+      "overall_score": 85.0,
+      "component_scores": {
+        "approach": 17.5,
+        "orchestration": 13.5,
+        "quality": 30.0,
+        "feasibility": 13.8,
+        "novelty": 3.5,
+        "diversity": 6.7
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 88, "enz": 82, "sig": 86},
+        "sequence_optimization": {"ab": 90, "enz": 85, "sig": 80, "str": 87, "flu": 92},
+        "de_novo_backbone": {"str": 75},
+        "complex_engineering": {"enz": 80, "sig": 85, "str": 88},
+        "conformational_design": {"enz": 78, "sig": 82, "str": 80, "flu": 85}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 0,
+      "avg_latency_sec": null,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "Human Expert",
+      "agent_id": "human-expert",
+      "mode": null,
+      "mcp_custom": false,
+      "submission_type": "human_expert",
+      "organization": "Manual (Jason)",
+      "overall_score": 62.0,
+      "component_scores": {
+        "approach": 14.0,
+        "orchestration": 11.0,
+        "quality": 20.5,
+        "feasibility": 10.5,
+        "novelty": 2.5,
+        "diversity": 3.5
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 65, "enz": 58, "sig": 63},
+        "sequence_optimization": {"ab": 70, "enz": 62, "sig": 55, "str": 64, "flu": 72},
+        "de_novo_backbone": {"str": 50},
+        "complex_engineering": {"enz": 58, "sig": 62, "str": 66},
+        "conformational_design": {"enz": 55, "sig": 60, "str": 58, "flu": 62}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 2,
+      "avg_latency_sec": null,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "Hardcoded Pipeline",
+      "agent_id": "hardcoded-pipeline",
+      "mode": null,
+      "mcp_custom": false,
+      "submission_type": "hardcoded",
+      "organization": "Deterministic",
+      "overall_score": 41.5,
+      "component_scores": {
+        "approach": 10.0,
+        "orchestration": 9.5,
+        "quality": 12.0,
+        "feasibility": 6.5,
+        "novelty": 1.5,
+        "diversity": 2.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 42, "enz": 38, "sig": 44},
+        "sequence_optimization": {"ab": 48, "enz": 40, "sig": 35, "str": 42, "flu": 50},
+        "de_novo_backbone": {"str": 30},
+        "complex_engineering": {"enz": 38, "sig": 42, "str": 45},
+        "conformational_design": {"enz": 35, "sig": 40, "str": 38, "flu": 42}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 5,
+      "avg_latency_sec": null,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "Claude-4.5",
+      "agent_id": "claude45-user",
+      "mode": "user",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Anthropic",
+      "overall_score": 35.0,
+      "component_scores": {
+        "approach": 8.5,
+        "orchestration": 7.0,
+        "quality": 10.5,
+        "feasibility": 5.5,
+        "novelty": 1.5,
+        "diversity": 2.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 38, "enz": 32, "sig": 36},
+        "sequence_optimization": {"ab": 42, "enz": 35, "sig": 30, "str": 36, "flu": 44},
+        "de_novo_backbone": {"str": 22},
+        "complex_engineering": {"enz": 32, "sig": 36, "str": 38},
+        "conformational_design": {"enz": 30, "sig": 34, "str": 32, "flu": 36}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 6,
+      "avg_latency_sec": 52.3,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "GPT-5",
+      "agent_id": "gpt5-user",
+      "mode": "user",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "OpenAI",
+      "overall_score": 33.0,
+      "component_scores": {
+        "approach": 8.0,
+        "orchestration": 6.5,
+        "quality": 10.0,
+        "feasibility": 5.0,
+        "novelty": 1.5,
+        "diversity": 2.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 35, "enz": 30, "sig": 34},
+        "sequence_optimization": {"ab": 40, "enz": 33, "sig": 28, "str": 34, "flu": 42},
+        "de_novo_backbone": {"str": 20},
+        "complex_engineering": {"enz": 30, "sig": 34, "str": 36},
+        "conformational_design": {"enz": 28, "sig": 32, "str": 30, "flu": 34}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 8,
+      "avg_latency_sec": 45.2,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "Deepseek-v3.2",
+      "agent_id": "deepseek32-user",
+      "mode": "user",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Deepseek",
+      "overall_score": 30.0,
+      "component_scores": {
+        "approach": 7.2,
+        "orchestration": 6.0,
+        "quality": 9.0,
+        "feasibility": 4.5,
+        "novelty": 1.3,
+        "diversity": 2.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 32, "enz": 28, "sig": 31},
+        "sequence_optimization": {"ab": 36, "enz": 30, "sig": 25, "str": 31, "flu": 38},
+        "de_novo_backbone": {"str": 18},
+        "complex_engineering": {"enz": 28, "sig": 31, "str": 33},
+        "conformational_design": {"enz": 25, "sig": 29, "str": 28, "flu": 31}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 10,
+      "avg_latency_sec": 38.7,
+      "submission_date": "2026-03-02"
+    },
+    {
+      "agent_name": "Gemini-2.5-Pro",
+      "agent_id": "gemini25-user",
+      "mode": "user",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Google",
+      "overall_score": 28.0,
+      "component_scores": {
+        "approach": 6.5,
+        "orchestration": 5.5,
+        "quality": 8.5,
+        "feasibility": 4.5,
+        "novelty": 1.2,
+        "diversity": 1.8
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 30, "enz": 25, "sig": 29},
+        "sequence_optimization": {"ab": 34, "enz": 28, "sig": 22, "str": 29, "flu": 36},
+        "de_novo_backbone": {"str": 16},
+        "complex_engineering": {"enz": 25, "sig": 28, "str": 30},
+        "conformational_design": {"enz": 22, "sig": 27, "str": 25, "flu": 29}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 12,
+      "avg_latency_sec": 55.1,
+      "submission_date": "2026-03-02"
+    },
+    {
+      "agent_name": "QWEN-3.5",
+      "agent_id": "qwen35-user",
+      "mode": "user",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Alibaba",
+      "overall_score": 26.0,
+      "component_scores": {
+        "approach": 6.0,
+        "orchestration": 5.0,
+        "quality": 8.0,
+        "feasibility": 4.0,
+        "novelty": 1.2,
+        "diversity": 1.8
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 28, "enz": 23, "sig": 27},
+        "sequence_optimization": {"ab": 32, "enz": 26, "sig": 20, "str": 27, "flu": 34},
+        "de_novo_backbone": {"str": 14},
+        "complex_engineering": {"enz": 23, "sig": 26, "str": 28},
+        "conformational_design": {"enz": 20, "sig": 25, "str": 23, "flu": 27}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 14,
+      "avg_latency_sec": 41.8,
+      "submission_date": "2026-03-02"
+    },
+    {
+      "agent_name": "Claude-4.5",
+      "agent_id": "claude45-benchmark",
+      "mode": "benchmark",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Anthropic",
+      "overall_score": 20.0,
+      "component_scores": {
+        "approach": 5.5,
+        "orchestration": 3.5,
+        "quality": 6.0,
+        "feasibility": 3.0,
+        "novelty": 1.0,
+        "diversity": 1.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 22, "enz": 18, "sig": 21},
+        "sequence_optimization": {"ab": 25, "enz": 20, "sig": 16, "str": 21, "flu": 28},
+        "de_novo_backbone": {"str": 12},
+        "complex_engineering": {"enz": 18, "sig": 20, "str": 22},
+        "conformational_design": {"enz": 16, "sig": 19, "str": 18, "flu": 20}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 14,
+      "avg_latency_sec": 48.5,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "GPT-5",
+      "agent_id": "gpt5-benchmark",
+      "mode": "benchmark",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "OpenAI",
+      "overall_score": 18.5,
+      "component_scores": {
+        "approach": 5.2,
+        "orchestration": 3.1,
+        "quality": 5.8,
+        "feasibility": 2.5,
+        "novelty": 0.9,
+        "diversity": 1.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 20, "enz": 16, "sig": 19},
+        "sequence_optimization": {"ab": 23, "enz": 18, "sig": 14, "str": 19, "flu": 26},
+        "de_novo_backbone": {"str": 10},
+        "complex_engineering": {"enz": 16, "sig": 18, "str": 20},
+        "conformational_design": {"enz": 14, "sig": 17, "str": 16, "flu": 18}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 16,
+      "avg_latency_sec": 42.0,
+      "submission_date": "2026-03-01"
+    },
+    {
+      "agent_name": "Deepseek-v3.2",
+      "agent_id": "deepseek32-benchmark",
+      "mode": "benchmark",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Deepseek",
+      "overall_score": 16.0,
+      "component_scores": {
+        "approach": 4.5,
+        "orchestration": 2.8,
+        "quality": 5.0,
+        "feasibility": 2.2,
+        "novelty": 0.7,
+        "diversity": 0.8
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 18, "enz": 14, "sig": 17},
+        "sequence_optimization": {"ab": 20, "enz": 16, "sig": 12, "str": 17, "flu": 22},
+        "de_novo_backbone": {"str": 8},
+        "complex_engineering": {"enz": 14, "sig": 16, "str": 18},
+        "conformational_design": {"enz": 12, "sig": 15, "str": 14, "flu": 16}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 18,
+      "avg_latency_sec": 35.2,
+      "submission_date": "2026-03-02"
+    },
+    {
+      "agent_name": "Gemini-2.5-Pro",
+      "agent_id": "gemini25-benchmark",
+      "mode": "benchmark",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Google",
+      "overall_score": 15.0,
+      "component_scores": {
+        "approach": 4.2,
+        "orchestration": 2.5,
+        "quality": 4.5,
+        "feasibility": 2.0,
+        "novelty": 0.8,
+        "diversity": 1.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 16, "enz": 12, "sig": 16},
+        "sequence_optimization": {"ab": 18, "enz": 15, "sig": 10, "str": 16, "flu": 20},
+        "de_novo_backbone": {"str": 8},
+        "complex_engineering": {"enz": 12, "sig": 15, "str": 16},
+        "conformational_design": {"enz": 10, "sig": 14, "str": 12, "flu": 15}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 20,
+      "avg_latency_sec": 50.3,
+      "submission_date": "2026-03-02"
+    },
+    {
+      "agent_name": "QWEN-3.5",
+      "agent_id": "qwen35-benchmark",
+      "mode": "benchmark",
+      "mcp_custom": false,
+      "submission_type": "llm",
+      "organization": "Alibaba",
+      "overall_score": 14.0,
+      "component_scores": {
+        "approach": 3.8,
+        "orchestration": 2.2,
+        "quality": 4.2,
+        "feasibility": 2.0,
+        "novelty": 0.8,
+        "diversity": 1.0
+      },
+      "taxonomy_scores": {
+        "de_novo_binder": {"ab": 15, "enz": 11, "sig": 14},
+        "sequence_optimization": {"ab": 17, "enz": 14, "sig": 10, "str": 15, "flu": 18},
+        "de_novo_backbone": {"str": 7},
+        "complex_engineering": {"enz": 11, "sig": 14, "str": 15},
+        "conformational_design": {"enz": 10, "sig": 13, "str": 11, "flu": 14}
+      },
+      "tasks_completed": 76,
+      "tasks_total": 76,
+      "tasks_with_zero": 22,
+      "avg_latency_sec": 39.5,
+      "submission_date": "2026-03-02"
+    }
+  ]
+}