Upload summary_data.csv
Browse files
runs/agent1_2026-04-16/summary_data.csv
CHANGED
|
@@ -1,33 +1,33 @@
|
|
| 1 |
Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
|
| 2 |
-
Claude-haiku-4.5,193,2.987257,40,27,34,55
|
| 3 |
-
Claude-opus-4.6,198,3.241761,68,46,56,92
|
| 4 |
Claude-opus-4.7,187,3.295774,,,,,0.027191898,20.84747735,47.0074,6.03%
|
| 5 |
-
Claude-sonnet-4.6,193,3.157161,63,53,58,76
|
| 6 |
-
Deepseek-v3.2,197,2.642664,53,36,35,91
|
| 7 |
-
Gemini-3.1-flash-lite-preview,179,2.817498,26,24,21,31
|
| 8 |
-
Gemini-3.1-pro-preview,198,3.214863,59,54,41,96
|
| 9 |
-
Gemini-3-flash-preview,198,2.984602,50,39,35,80
|
| 10 |
-
Gemma-4-26b-a4b-it,192,2.60664,32,14,26,44
|
| 11 |
-
Gemma-4-31b-it,183,2.792605,41,36,31,60
|
| 12 |
-
GLM-4.7,194,2.916867,55,32,35,96
|
| 13 |
-
GLM-5,197,3.14801,67,43,52,98
|
| 14 |
-
Gpt-5.4 1 (xhigh),145,3.127315,68,58,59,87
|
| 15 |
-
Gpt-5.4-mini (xhigh),104,2.907268,59,52,46,83
|
| 16 |
-
Gpt-5.4-nano (xhigh),113,2.781217,48,42,34,76
|
| 17 |
-
Gpt-oss-120b,198,2.762188,38,24,22,66
|
| 18 |
-
Gpt-oss-20b,197,2.648577,28,11,8,60
|
| 19 |
-
Grok-4.1-fast,197,2.843729,49,24,27,93
|
| 20 |
-
Grok-4.20,189,3.00445,54,38,27,93
|
| 21 |
-
Kimi-K2.5,187,3.022371,59,35,39,96
|
| 22 |
-
Llama-4-maverick,195,2.269251,7,7,0,18
|
| 23 |
-
Mimo-V2-Pro,199,3.103969,63,41,46,95
|
| 24 |
-
Minimax-m2.5,184,2.790468,56,35,34,95
|
| 25 |
-
Minimax-m2.7,193,3.011179,61,39,51,85
|
| 26 |
-
Mistral-large-2512,193,2.624168,22,16,18,41
|
| 27 |
-
Mistral-small-4,194,2.68593,26,17,18,25
|
| 28 |
-
Nemotron-3-nano-30b-a3b,190,2.713783,19,14,4,41
|
| 29 |
-
Nemotron-3-super-120b-a12b,187,2.796666,40,29,25,68
|
| 30 |
-
Nova-2-lite-v1,181,2.65909,37,17,17,73
|
| 31 |
-
Qwen3.5-122b-a10b,198,2.835386,53,31,31,89
|
| 32 |
-
Qwen3.5-35b-a3b,198,2.824791,44,27,21,94
|
| 33 |
-
Qwen3.6-plus,198,3.072465,62,44,43,
|
|
|
|
| 1 |
Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
|
| 2 |
+
Claude-haiku-4.5,193,2.987257,40,27,34,55,0.008421041,43.07080917,151.0514,3.02%
|
| 3 |
+
Claude-opus-4.6,198,3.241761,68,46,56,92,0.025840455,37.87360967,97.9209,0.50%
|
| 4 |
Claude-opus-4.7,187,3.295774,,,,,0.027191898,20.84747735,47.0074,6.03%
|
| 5 |
+
Claude-sonnet-4.6,193,3.157161,63,53,58,76,0.019781285,47.09331481,149.1208,3.02%
|
| 6 |
+
Deepseek-v3.2,197,2.642664,53,36,35,91,0.000590106,53.5962432,129.2483,1.01%
|
| 7 |
+
Gemini-3.1-flash-lite-preview,179,2.817498,26,24,21,31,0.001152242,23.16857653,114.3943,10.05%
|
| 8 |
+
Gemini-3.1-pro-preview,198,3.214863,59,54,41,96,0.013335061,25.91624477,58.2089,0.50%
|
| 9 |
+
Gemini-3-flash-preview,198,2.984602,50,39,35,80,0.002816407,12.50534394,22.5291,0.50%
|
| 10 |
+
Gemma-4-26b-a4b-it,192,2.60664,32,14,26,44,0.000200836,12.38208868,40.9958,3.52%
|
| 11 |
+
Gemma-4-31b-it,183,2.792605,41,36,31,60,0.00024191,45.28161148,174.1861,8.04%
|
| 12 |
+
GLM-4.7,194,2.916867,55,32,35,96,0.001388525,43.54876781,134.6278,2.51%
|
| 13 |
+
GLM-5,197,3.14801,67,43,52,98,0.005112927,60.30060454,183.2705,1.01%
|
| 14 |
+
Gpt-5.4 1 (xhigh),145,3.127315,68,58,59,87,0.063282983,131.0088557,325.7716,27.14%
|
| 15 |
+
Gpt-5.4-mini (xhigh),104,2.907268,59,52,46,83,0.020302969,86.91001719,240.8822,47.74%
|
| 16 |
+
Gpt-5.4-nano (xhigh),113,2.781217,48,42,34,76,0.004317464,93.33818571,262.2088,43.22%
|
| 17 |
+
Gpt-oss-120b,198,2.762188,38,24,22,66,0.000170961,18.02638485,63.3616,0.50%
|
| 18 |
+
Gpt-oss-20b,197,2.648577,28,11,8,60,0.000132809,42.86725918,165.5231,1.01%
|
| 19 |
+
Grok-4.1-fast,197,2.843729,49,24,27,93,0.001201663,36.20348296,95.8181,1.01%
|
| 20 |
+
Grok-4.20,189,3.00445,54,38,27,93,0.015201661,33.09318092,78.5717,5.03%
|
| 21 |
+
Kimi-K2.5,187,3.022371,59,35,39,96,0.001254149,53.60984548,177.5503,6.03%
|
| 22 |
+
Llama-4-maverick,195,2.269251,7,7,0,18,0.000278018,41.27163619,76.1007,2.01%
|
| 23 |
+
Mimo-V2-Pro,199,3.103969,63,41,46,95,0.003325538,25.88939247,54.6709,0.00%
|
| 24 |
+
Minimax-m2.5,184,2.790468,56,35,34,95,0.000530249,75.14721297,240.8106,7.54%
|
| 25 |
+
Minimax-m2.7,193,3.011179,61,39,51,85,0.000953587,27.06517621,57.5405,3.02%
|
| 26 |
+
Mistral-large-2512,193,2.624168,22,16,18,41,0.000962583,9.267698618,21.5524,3.02%
|
| 27 |
+
Mistral-small-4,194,2.68593,26,17,18,25,0.000501963,10.55675923,41.2537,2.51%
|
| 28 |
+
Nemotron-3-nano-30b-a3b,190,2.713783,19,14,4,41,0.000824281,105.9488772,284.5594,4.52%
|
| 29 |
+
Nemotron-3-super-120b-a12b,187,2.796666,40,29,25,68,0.000634388,69.39686943,245.2075,6.03%
|
| 30 |
+
Nova-2-lite-v1,181,2.65909,37,17,17,73,0.015561471,56.85229296,138.8302,9.05%
|
| 31 |
+
Qwen3.5-122b-a10b,198,2.835386,53,31,31,89,0.001435478,13.13992066,36.451,0.50%
|
| 32 |
+
Qwen3.5-35b-a3b,198,2.824791,44,27,21,94,0.001115424,14.33161555,30.7295,0.50%
|
| 33 |
+
Qwen3.6-plus,198,3.072465,62,44,43,95,0.001947453,46.4302181,106.8828,0.50%
|