Jasonkim8652 committed on
Commit
d647ac9
·
verified ·
1 Parent(s): e239859

Upload leaderboard_data.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. leaderboard_data.json +382 -0
leaderboard_data.json ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_updated": "2026-03-03",
3
+ "entries": [
4
+ {
5
+ "agent_name": "Human Oracle",
6
+ "agent_id": "human-oracle",
7
+ "mode": null,
8
+ "mcp_custom": false,
9
+ "submission_type": "human_oracle",
10
+ "organization": "Ground Truth",
11
+ "overall_score": 85.0,
12
+ "component_scores": {
13
+ "approach": 17.5,
14
+ "orchestration": 13.5,
15
+ "quality": 30.0,
16
+ "feasibility": 13.8,
17
+ "novelty": 3.5,
18
+ "diversity": 6.7
19
+ },
20
+ "taxonomy_scores": {
21
+ "de_novo_binder": {"ab": 88, "enz": 82, "sig": 86},
22
+ "sequence_optimization": {"ab": 90, "enz": 85, "sig": 80, "str": 87, "flu": 92},
23
+ "de_novo_backbone": {"str": 75},
24
+ "complex_engineering": {"enz": 80, "sig": 85, "str": 88},
25
+ "conformational_design": {"enz": 78, "sig": 82, "str": 80, "flu": 85}
26
+ },
27
+ "tasks_completed": 76,
28
+ "tasks_total": 76,
29
+ "tasks_with_zero": 0,
30
+ "avg_latency_sec": null,
31
+ "submission_date": "2026-03-01"
32
+ },
33
+ {
34
+ "agent_name": "Human Expert",
35
+ "agent_id": "human-expert",
36
+ "mode": null,
37
+ "mcp_custom": false,
38
+ "submission_type": "human_expert",
39
+ "organization": "Manual (Jason)",
40
+ "overall_score": 62.0,
41
+ "component_scores": {
42
+ "approach": 14.0,
43
+ "orchestration": 11.0,
44
+ "quality": 20.5,
45
+ "feasibility": 10.5,
46
+ "novelty": 2.5,
47
+ "diversity": 3.5
48
+ },
49
+ "taxonomy_scores": {
50
+ "de_novo_binder": {"ab": 65, "enz": 58, "sig": 63},
51
+ "sequence_optimization": {"ab": 70, "enz": 62, "sig": 55, "str": 64, "flu": 72},
52
+ "de_novo_backbone": {"str": 50},
53
+ "complex_engineering": {"enz": 58, "sig": 62, "str": 66},
54
+ "conformational_design": {"enz": 55, "sig": 60, "str": 58, "flu": 62}
55
+ },
56
+ "tasks_completed": 76,
57
+ "tasks_total": 76,
58
+ "tasks_with_zero": 2,
59
+ "avg_latency_sec": null,
60
+ "submission_date": "2026-03-01"
61
+ },
62
+ {
63
+ "agent_name": "Hardcoded Pipeline",
64
+ "agent_id": "hardcoded-pipeline",
65
+ "mode": null,
66
+ "mcp_custom": false,
67
+ "submission_type": "hardcoded",
68
+ "organization": "Deterministic",
69
+ "overall_score": 41.5,
70
+ "component_scores": {
71
+ "approach": 10.0,
72
+ "orchestration": 9.5,
73
+ "quality": 12.0,
74
+ "feasibility": 6.5,
75
+ "novelty": 1.5,
76
+ "diversity": 2.0
77
+ },
78
+ "taxonomy_scores": {
79
+ "de_novo_binder": {"ab": 42, "enz": 38, "sig": 44},
80
+ "sequence_optimization": {"ab": 48, "enz": 40, "sig": 35, "str": 42, "flu": 50},
81
+ "de_novo_backbone": {"str": 30},
82
+ "complex_engineering": {"enz": 38, "sig": 42, "str": 45},
83
+ "conformational_design": {"enz": 35, "sig": 40, "str": 38, "flu": 42}
84
+ },
85
+ "tasks_completed": 76,
86
+ "tasks_total": 76,
87
+ "tasks_with_zero": 5,
88
+ "avg_latency_sec": null,
89
+ "submission_date": "2026-03-01"
90
+ },
91
+ {
92
+ "agent_name": "Claude-4.5",
93
+ "agent_id": "claude45-user",
94
+ "mode": "user",
95
+ "mcp_custom": false,
96
+ "submission_type": "llm",
97
+ "organization": "Anthropic",
98
+ "overall_score": 35.0,
99
+ "component_scores": {
100
+ "approach": 8.5,
101
+ "orchestration": 7.0,
102
+ "quality": 10.5,
103
+ "feasibility": 5.5,
104
+ "novelty": 1.5,
105
+ "diversity": 2.0
106
+ },
107
+ "taxonomy_scores": {
108
+ "de_novo_binder": {"ab": 38, "enz": 32, "sig": 36},
109
+ "sequence_optimization": {"ab": 42, "enz": 35, "sig": 30, "str": 36, "flu": 44},
110
+ "de_novo_backbone": {"str": 22},
111
+ "complex_engineering": {"enz": 32, "sig": 36, "str": 38},
112
+ "conformational_design": {"enz": 30, "sig": 34, "str": 32, "flu": 36}
113
+ },
114
+ "tasks_completed": 76,
115
+ "tasks_total": 76,
116
+ "tasks_with_zero": 6,
117
+ "avg_latency_sec": 52.3,
118
+ "submission_date": "2026-03-01"
119
+ },
120
+ {
121
+ "agent_name": "GPT-5",
122
+ "agent_id": "gpt5-user",
123
+ "mode": "user",
124
+ "mcp_custom": false,
125
+ "submission_type": "llm",
126
+ "organization": "OpenAI",
127
+ "overall_score": 33.0,
128
+ "component_scores": {
129
+ "approach": 8.0,
130
+ "orchestration": 6.5,
131
+ "quality": 10.0,
132
+ "feasibility": 5.0,
133
+ "novelty": 1.5,
134
+ "diversity": 2.0
135
+ },
136
+ "taxonomy_scores": {
137
+ "de_novo_binder": {"ab": 35, "enz": 30, "sig": 34},
138
+ "sequence_optimization": {"ab": 40, "enz": 33, "sig": 28, "str": 34, "flu": 42},
139
+ "de_novo_backbone": {"str": 20},
140
+ "complex_engineering": {"enz": 30, "sig": 34, "str": 36},
141
+ "conformational_design": {"enz": 28, "sig": 32, "str": 30, "flu": 34}
142
+ },
143
+ "tasks_completed": 76,
144
+ "tasks_total": 76,
145
+ "tasks_with_zero": 8,
146
+ "avg_latency_sec": 45.2,
147
+ "submission_date": "2026-03-01"
148
+ },
149
+ {
150
+ "agent_name": "Deepseek-v3.2",
151
+ "agent_id": "deepseek32-user",
152
+ "mode": "user",
153
+ "mcp_custom": false,
154
+ "submission_type": "llm",
155
+ "organization": "Deepseek",
156
+ "overall_score": 30.0,
157
+ "component_scores": {
158
+ "approach": 7.2,
159
+ "orchestration": 6.0,
160
+ "quality": 9.0,
161
+ "feasibility": 4.5,
162
+ "novelty": 1.3,
163
+ "diversity": 2.0
164
+ },
165
+ "taxonomy_scores": {
166
+ "de_novo_binder": {"ab": 32, "enz": 28, "sig": 31},
167
+ "sequence_optimization": {"ab": 36, "enz": 30, "sig": 25, "str": 31, "flu": 38},
168
+ "de_novo_backbone": {"str": 18},
169
+ "complex_engineering": {"enz": 28, "sig": 31, "str": 33},
170
+ "conformational_design": {"enz": 25, "sig": 29, "str": 28, "flu": 31}
171
+ },
172
+ "tasks_completed": 76,
173
+ "tasks_total": 76,
174
+ "tasks_with_zero": 10,
175
+ "avg_latency_sec": 38.7,
176
+ "submission_date": "2026-03-02"
177
+ },
178
+ {
179
+ "agent_name": "Gemini-2.5-Pro",
180
+ "agent_id": "gemini25-user",
181
+ "mode": "user",
182
+ "mcp_custom": false,
183
+ "submission_type": "llm",
184
+ "organization": "Google",
185
+ "overall_score": 28.0,
186
+ "component_scores": {
187
+ "approach": 6.5,
188
+ "orchestration": 5.5,
189
+ "quality": 8.5,
190
+ "feasibility": 4.5,
191
+ "novelty": 1.2,
192
+ "diversity": 1.8
193
+ },
194
+ "taxonomy_scores": {
195
+ "de_novo_binder": {"ab": 30, "enz": 25, "sig": 29},
196
+ "sequence_optimization": {"ab": 34, "enz": 28, "sig": 22, "str": 29, "flu": 36},
197
+ "de_novo_backbone": {"str": 16},
198
+ "complex_engineering": {"enz": 25, "sig": 28, "str": 30},
199
+ "conformational_design": {"enz": 22, "sig": 27, "str": 25, "flu": 29}
200
+ },
201
+ "tasks_completed": 76,
202
+ "tasks_total": 76,
203
+ "tasks_with_zero": 12,
204
+ "avg_latency_sec": 55.1,
205
+ "submission_date": "2026-03-02"
206
+ },
207
+ {
208
+ "agent_name": "QWEN-3.5",
209
+ "agent_id": "qwen35-user",
210
+ "mode": "user",
211
+ "mcp_custom": false,
212
+ "submission_type": "llm",
213
+ "organization": "Alibaba",
214
+ "overall_score": 26.0,
215
+ "component_scores": {
216
+ "approach": 6.0,
217
+ "orchestration": 5.0,
218
+ "quality": 8.0,
219
+ "feasibility": 4.0,
220
+ "novelty": 1.2,
221
+ "diversity": 1.8
222
+ },
223
+ "taxonomy_scores": {
224
+ "de_novo_binder": {"ab": 28, "enz": 23, "sig": 27},
225
+ "sequence_optimization": {"ab": 32, "enz": 26, "sig": 20, "str": 27, "flu": 34},
226
+ "de_novo_backbone": {"str": 14},
227
+ "complex_engineering": {"enz": 23, "sig": 26, "str": 28},
228
+ "conformational_design": {"enz": 20, "sig": 25, "str": 23, "flu": 27}
229
+ },
230
+ "tasks_completed": 76,
231
+ "tasks_total": 76,
232
+ "tasks_with_zero": 14,
233
+ "avg_latency_sec": 41.8,
234
+ "submission_date": "2026-03-02"
235
+ },
236
+ {
237
+ "agent_name": "Claude-4.5",
238
+ "agent_id": "claude45-benchmark",
239
+ "mode": "benchmark",
240
+ "mcp_custom": false,
241
+ "submission_type": "llm",
242
+ "organization": "Anthropic",
243
+ "overall_score": 20.0,
244
+ "component_scores": {
245
+ "approach": 5.5,
246
+ "orchestration": 3.5,
247
+ "quality": 6.0,
248
+ "feasibility": 3.0,
249
+ "novelty": 1.0,
250
+ "diversity": 1.0
251
+ },
252
+ "taxonomy_scores": {
253
+ "de_novo_binder": {"ab": 22, "enz": 18, "sig": 21},
254
+ "sequence_optimization": {"ab": 25, "enz": 20, "sig": 16, "str": 21, "flu": 28},
255
+ "de_novo_backbone": {"str": 12},
256
+ "complex_engineering": {"enz": 18, "sig": 20, "str": 22},
257
+ "conformational_design": {"enz": 16, "sig": 19, "str": 18, "flu": 20}
258
+ },
259
+ "tasks_completed": 76,
260
+ "tasks_total": 76,
261
+ "tasks_with_zero": 14,
262
+ "avg_latency_sec": 48.5,
263
+ "submission_date": "2026-03-01"
264
+ },
265
+ {
266
+ "agent_name": "GPT-5",
267
+ "agent_id": "gpt5-benchmark",
268
+ "mode": "benchmark",
269
+ "mcp_custom": false,
270
+ "submission_type": "llm",
271
+ "organization": "OpenAI",
272
+ "overall_score": 18.5,
273
+ "component_scores": {
274
+ "approach": 5.2,
275
+ "orchestration": 3.1,
276
+ "quality": 5.8,
277
+ "feasibility": 2.5,
278
+ "novelty": 0.9,
279
+ "diversity": 1.0
280
+ },
281
+ "taxonomy_scores": {
282
+ "de_novo_binder": {"ab": 20, "enz": 16, "sig": 19},
283
+ "sequence_optimization": {"ab": 23, "enz": 18, "sig": 14, "str": 19, "flu": 26},
284
+ "de_novo_backbone": {"str": 10},
285
+ "complex_engineering": {"enz": 16, "sig": 18, "str": 20},
286
+ "conformational_design": {"enz": 14, "sig": 17, "str": 16, "flu": 18}
287
+ },
288
+ "tasks_completed": 76,
289
+ "tasks_total": 76,
290
+ "tasks_with_zero": 16,
291
+ "avg_latency_sec": 42.0,
292
+ "submission_date": "2026-03-01"
293
+ },
294
+ {
295
+ "agent_name": "Deepseek-v3.2",
296
+ "agent_id": "deepseek32-benchmark",
297
+ "mode": "benchmark",
298
+ "mcp_custom": false,
299
+ "submission_type": "llm",
300
+ "organization": "Deepseek",
301
+ "overall_score": 16.0,
302
+ "component_scores": {
303
+ "approach": 4.5,
304
+ "orchestration": 2.8,
305
+ "quality": 5.0,
306
+ "feasibility": 2.2,
307
+ "novelty": 0.7,
308
+ "diversity": 0.8
309
+ },
310
+ "taxonomy_scores": {
311
+ "de_novo_binder": {"ab": 18, "enz": 14, "sig": 17},
312
+ "sequence_optimization": {"ab": 20, "enz": 16, "sig": 12, "str": 17, "flu": 22},
313
+ "de_novo_backbone": {"str": 8},
314
+ "complex_engineering": {"enz": 14, "sig": 16, "str": 18},
315
+ "conformational_design": {"enz": 12, "sig": 15, "str": 14, "flu": 16}
316
+ },
317
+ "tasks_completed": 76,
318
+ "tasks_total": 76,
319
+ "tasks_with_zero": 18,
320
+ "avg_latency_sec": 35.2,
321
+ "submission_date": "2026-03-02"
322
+ },
323
+ {
324
+ "agent_name": "Gemini-2.5-Pro",
325
+ "agent_id": "gemini25-benchmark",
326
+ "mode": "benchmark",
327
+ "mcp_custom": false,
328
+ "submission_type": "llm",
329
+ "organization": "Google",
330
+ "overall_score": 15.0,
331
+ "component_scores": {
332
+ "approach": 4.2,
333
+ "orchestration": 2.5,
334
+ "quality": 4.5,
335
+ "feasibility": 2.0,
336
+ "novelty": 0.8,
337
+ "diversity": 1.0
338
+ },
339
+ "taxonomy_scores": {
340
+ "de_novo_binder": {"ab": 16, "enz": 12, "sig": 16},
341
+ "sequence_optimization": {"ab": 18, "enz": 15, "sig": 10, "str": 16, "flu": 20},
342
+ "de_novo_backbone": {"str": 8},
343
+ "complex_engineering": {"enz": 12, "sig": 15, "str": 16},
344
+ "conformational_design": {"enz": 10, "sig": 14, "str": 12, "flu": 15}
345
+ },
346
+ "tasks_completed": 76,
347
+ "tasks_total": 76,
348
+ "tasks_with_zero": 20,
349
+ "avg_latency_sec": 50.3,
350
+ "submission_date": "2026-03-02"
351
+ },
352
+ {
353
+ "agent_name": "QWEN-3.5",
354
+ "agent_id": "qwen35-benchmark",
355
+ "mode": "benchmark",
356
+ "mcp_custom": false,
357
+ "submission_type": "llm",
358
+ "organization": "Alibaba",
359
+ "overall_score": 14.0,
360
+ "component_scores": {
361
+ "approach": 3.8,
362
+ "orchestration": 2.2,
363
+ "quality": 4.2,
364
+ "feasibility": 2.0,
365
+ "novelty": 0.8,
366
+ "diversity": 1.0
367
+ },
368
+ "taxonomy_scores": {
369
+ "de_novo_binder": {"ab": 15, "enz": 11, "sig": 14},
370
+ "sequence_optimization": {"ab": 17, "enz": 14, "sig": 10, "str": 15, "flu": 18},
371
+ "de_novo_backbone": {"str": 7},
372
+ "complex_engineering": {"enz": 11, "sig": 14, "str": 15},
373
+ "conformational_design": {"enz": 10, "sig": 13, "str": 11, "flu": 14}
374
+ },
375
+ "tasks_completed": 76,
376
+ "tasks_total": 76,
377
+ "tasks_with_zero": 22,
378
+ "avg_latency_sec": 39.5,
379
+ "submission_date": "2026-03-02"
380
+ }
381
+ ]
382
+ }