Upload summary_data.csv
Browse files
runs/agro1_2025-12-10/summary_data.csv
CHANGED
|
@@ -1,41 +1,41 @@
|
|
| 1 |
-
Model,Iterations,
|
| 2 |
-
Claude-3.5-haiku,205,3.676
|
| 3 |
-
Claude-haiku-4.5,196,4.445
|
| 4 |
-
Claude-opus-4.5,194,4.600
|
| 5 |
-
Claude-sonnet-4.5,203,4.453
|
| 6 |
-
DeepSeek-R1-0528,198,4.536
|
| 7 |
-
Deepseek-v3.1,205,4.377
|
| 8 |
-
Deepseek-v3.2-exp,194,4.378
|
| 9 |
-
DeepSeek-V3-0324,205,4.183
|
| 10 |
-
Gemini-2.5-flash,204,4.475
|
| 11 |
-
Gemini-2.5-flash-lite,200,4.329
|
| 12 |
-
Gemini-2.5-pro,205,4.630
|
| 13 |
-
Gemini-3-pro-preview,194,4.642
|
| 14 |
-
Gemma-3-27b-it,204,4.339
|
| 15 |
-
GLM-4.5,204,4.556
|
| 16 |
-
GLM-4.5-Air,196,4.279
|
| 17 |
-
Gpt-5,192,4.827
|
| 18 |
-
Gpt-5.1,195,4.849
|
| 19 |
-
Gpt-5-mini,196,4.594
|
| 20 |
-
Gpt-oss-120b,205,4.574
|
| 21 |
-
Grok-3-mini,204,4.320
|
| 22 |
-
Grok-4,197,4.535
|
| 23 |
-
Grok-4.1-fast,197,4.582
|
| 24 |
-
Grok-4.1-fast-thinking,197,4.640
|
| 25 |
-
Kimi-K2-Instruct,205,4.517
|
| 26 |
-
Kimi-k2-thinking,192,4.559
|
| 27 |
-
Llama-3.1-nemotron-ultra-253b-v1,203,4.163
|
| 28 |
-
Llama-3.3-nemotron-super-49b-v1.5,196,4.269
|
| 29 |
-
Llama-4-maverick,205,3.659
|
| 30 |
-
Llama-4-scout,205,3.611
|
| 31 |
-
Magistral-small-2506,203,3.911
|
| 32 |
-
Minimax-m2,193,4.524
|
| 33 |
-
Mistral-large-2512,175,4.586
|
| 34 |
-
Nemotron-nano-9b-v2,194,3.434
|
| 35 |
-
Nova-lite-v1,205,3.513
|
| 36 |
-
Nova-pro-v1,205,3.476
|
| 37 |
-
Phi-3-mini-128k-instruct,186,2.900
|
| 38 |
-
Phi-4,205,3.444
|
| 39 |
-
Qwen3-235B-A22B-Thinking-2507,193,4.585
|
| 40 |
-
Qwen3-30b-a3b-instruct-2507,204,4.460
|
| 41 |
-
Qwen3-next-80b-a3b-thinking,204,4.439
|
|
|
|
| 1 |
+
Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
|
| 2 |
+
Claude-3.5-haiku,205,3.676,,,,0.0067,12.37,73.19,0.00%
|
| 3 |
+
Claude-haiku-4.5,196,4.445,,,,0.0195,52.84,365.38,0.51%
|
| 4 |
+
Claude-opus-4.5,194,4.600,,,,0.0731,66.00,238.10,1.52%
|
| 5 |
+
Claude-sonnet-4.5,203,4.453,,,,0.0208,42.23,283.44,0.98%
|
| 6 |
+
DeepSeek-R1-0528,198,4.536,,,,0.0030,53.70,159.18,3.41%
|
| 7 |
+
Deepseek-v3.1,205,4.377,,,,0.0010,29.33,155.68,0.00%
|
| 8 |
+
Deepseek-v3.2-exp,194,4.378,,,,0.0008,71.34,381.40,1.52%
|
| 9 |
+
DeepSeek-V3-0324,205,4.183,,,,0.0007,26.09,100.63,0.00%
|
| 10 |
+
Gemini-2.5-flash,204,4.475,,,,0.0043,16.98,90.11,0.49%
|
| 11 |
+
Gemini-2.5-flash-lite,200,4.329,,,,0.0007,10.98,79.85,2.44%
|
| 12 |
+
Gemini-2.5-pro,205,4.630,,,,0.0395,50.43,186.92,0.00%
|
| 13 |
+
Gemini-3-pro-preview,194,4.642,,,,0.0388,46.15,143.02,1.52%
|
| 14 |
+
Gemma-3-27b-it,204,4.339,,,,0.0003,30.64,111.58,0.49%
|
| 15 |
+
GLM-4.5,204,4.556,,,,0.0034,50.84,200.73,0.49%
|
| 16 |
+
GLM-4.5-Air,196,4.279,,,,0.0016,35.26,144.36,4.39%
|
| 17 |
+
Gpt-5,192,4.827,,,,0.0543,112.19,312.34,1.54%
|
| 18 |
+
Gpt-5.1,195,4.849,,,,0.0770,140.66,347.66,1.02%
|
| 19 |
+
Gpt-5-mini,196,4.594,,,,0.0081,74.34,224.19,4.39%
|
| 20 |
+
Gpt-oss-120b,205,4.574,,,,0.0007,34.63,152.50,0.00%
|
| 21 |
+
Grok-3-mini,204,4.320,,,,0.0010,23.30,97.02,0.49%
|
| 22 |
+
Grok-4,197,4.535,,,,0.0341,70.41,219.87,3.90%
|
| 23 |
+
Grok-4.1-fast,197,4.582,,,,0.0008,24.09,64.98,0.00%
|
| 24 |
+
Grok-4.1-fast-thinking,197,4.640,,,,0.0007,45.41,176.91,0.00%
|
| 25 |
+
Kimi-K2-Instruct,205,4.517,,,,0.0021,21.11,86.47,0.00%
|
| 26 |
+
Kimi-k2-thinking,192,4.559,,,,0.0080,68.03,360.26,2.54%
|
| 27 |
+
Llama-3.1-nemotron-ultra-253b-v1,203,4.163,,,,0.0021,35.68,162.15,0.98%
|
| 28 |
+
Llama-3.3-nemotron-super-49b-v1.5,196,4.269,,,,0.0011,35.56,166.08,0.51%
|
| 29 |
+
Llama-4-maverick,205,3.659,,,,0.0005,12.09,65.33,0.00%
|
| 30 |
+
Llama-4-scout,205,3.611,,,,0.0002,15.16,60.13,0.00%
|
| 31 |
+
Magistral-small-2506,203,3.911,,,,0.0010,7.51,56.42,0.98%
|
| 32 |
+
Minimax-m2,193,4.524,,,,0.0036,68.36,238.55,2.03%
|
| 33 |
+
Mistral-large-2512,175,4.586,,,,0.0033,61.60,143.01,0.00%
|
| 34 |
+
Nemotron-nano-9b-v2,194,3.434,,,,0.0003,17.50,88.99,1.52%
|
| 35 |
+
Nova-lite-v1,205,3.513,,,,0.0002,6.53,41.75,0.00%
|
| 36 |
+
Nova-pro-v1,205,3.476,,,,0.0016,7.84,45.94,0.00%
|
| 37 |
+
Phi-3-mini-128k-instruct,186,2.900,,,,0.0002,19.89,142.96,5.58%
|
| 38 |
+
Phi-4,205,3.444,,,,0.0001,14.87,59.91,0.00%
|
| 39 |
+
Qwen3-235B-A22B-Thinking-2507,193,4.585,,,,0.0013,74.18,254.75,5.85%
|
| 40 |
+
Qwen3-30b-a3b-instruct-2507,204,4.460,,,,0.0003,21.87,174.47,0.49%
|
| 41 |
+
Qwen3-next-80b-a3b-thinking,204,4.439,,,,0.0040,32.19,126.90,0.49%
|