PeterKruger committed on
Commit
1d2d479
·
verified ·
1 Parent(s): b516aad

Upload summary_data.csv

Browse files
runs/agent1_2026-04-16/summary_data.csv CHANGED
@@ -1,33 +1,33 @@
1
  Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
- Claude-haiku-4.5,193,2.987257,40,27,34,55%,0.008421041,43.07080917,151.0514,3.02%
3
- Claude-opus-4.6,198,3.241761,68,46,56,92%,0.025840455,37.87360967,97.9209,0.50%
4
  Claude-opus-4.7,187,3.295774,,,,,0.027191898,20.84747735,47.0074,6.03%
5
- Claude-sonnet-4.6,193,3.157161,63,53,58,76%,0.019781285,47.09331481,149.1208,3.02%
6
- Deepseek-v3.2,197,2.642664,53,36,35,91%,0.000590106,53.5962432,129.2483,1.01%
7
- Gemini-3.1-flash-lite-preview,179,2.817498,26,24,21,31%,0.001152242,23.16857653,114.3943,10.05%
8
- Gemini-3.1-pro-preview,198,3.214863,59,54,41,96%,0.013335061,25.91624477,58.2089,0.50%
9
- Gemini-3-flash-preview,198,2.984602,50,39,35,80%,0.002816407,12.50534394,22.5291,0.50%
10
- Gemma-4-26b-a4b-it,192,2.60664,32,14,26,44%,0.000200836,12.38208868,40.9958,3.52%
11
- Gemma-4-31b-it,183,2.792605,41,36,31,60%,0.00024191,45.28161148,174.1861,8.04%
12
- GLM-4.7,194,2.916867,55,32,35,96%,0.001388525,43.54876781,134.6278,2.51%
13
- GLM-5,197,3.14801,67,43,52,98%,0.005112927,60.30060454,183.2705,1.01%
14
- Gpt-5.4 1 (xhigh),145,3.127315,68,58,59,87%,0.063282983,131.0088557,325.7716,27.14%
15
- Gpt-5.4-mini (xhigh),104,2.907268,59,52,46,83%,0.020302969,86.91001719,240.8822,47.74%
16
- Gpt-5.4-nano (xhigh),113,2.781217,48,42,34,76%,0.004317464,93.33818571,262.2088,43.22%
17
- Gpt-oss-120b,198,2.762188,38,24,22,66%,0.000170961,18.02638485,63.3616,0.50%
18
- Gpt-oss-20b,197,2.648577,28,11,8,60%,0.000132809,42.86725918,165.5231,1.01%
19
- Grok-4.1-fast,197,2.843729,49,24,27,93%,0.001201663,36.20348296,95.8181,1.01%
20
- Grok-4.20,189,3.00445,54,38,27,93%,0.015201661,33.09318092,78.5717,5.03%
21
- Kimi-K2.5,187,3.022371,59,35,39,96%,0.001254149,53.60984548,177.5503,6.03%
22
- Llama-4-maverick,195,2.269251,7,7,0,18%,0.000278018,41.27163619,76.1007,2.01%
23
- Mimo-V2-Pro,199,3.103969,63,41,46,95%,0.003325538,25.88939247,54.6709,0.00%
24
- Minimax-m2.5,184,2.790468,56,35,34,95%,0.000530249,75.14721297,240.8106,7.54%
25
- Minimax-m2.7,193,3.011179,61,39,51,85%,0.000953587,27.06517621,57.5405,3.02%
26
- Mistral-large-2512,193,2.624168,22,16,18,41%,0.000962583,9.267698618,21.5524,3.02%
27
- Mistral-small-4,194,2.68593,26,17,18,25%,0.000501963,10.55675923,41.2537,2.51%
28
- Nemotron-3-nano-30b-a3b,190,2.713783,19,14,4,41%,0.000824281,105.9488772,284.5594,4.52%
29
- Nemotron-3-super-120b-a12b,187,2.796666,40,29,25,68%,0.000634388,69.39686943,245.2075,6.03%
30
- Nova-2-lite-v1,181,2.65909,37,17,17,73%,0.015561471,56.85229296,138.8302,9.05%
31
- Qwen3.5-122b-a10b,198,2.835386,53,31,31,89%,0.001435478,13.13992066,36.451,0.50%
32
- Qwen3.5-35b-a3b,198,2.824791,44,27,21,94%,0.001115424,14.33161555,30.7295,0.50%
33
- Qwen3.6-plus,198,3.072465,62,44,43,0.95,0.001947453,46.4302181,106.8828,0.005025126
 
1
  Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
+ Claude-haiku-4.5,193,2.987257,40,27,34,55,0.008421041,43.07080917,151.0514,3.02%
3
+ Claude-opus-4.6,198,3.241761,68,46,56,92,0.025840455,37.87360967,97.9209,0.50%
4
  Claude-opus-4.7,187,3.295774,,,,,0.027191898,20.84747735,47.0074,6.03%
5
+ Claude-sonnet-4.6,193,3.157161,63,53,58,76,0.019781285,47.09331481,149.1208,3.02%
6
+ Deepseek-v3.2,197,2.642664,53,36,35,91,0.000590106,53.5962432,129.2483,1.01%
7
+ Gemini-3.1-flash-lite-preview,179,2.817498,26,24,21,31,0.001152242,23.16857653,114.3943,10.05%
8
+ Gemini-3.1-pro-preview,198,3.214863,59,54,41,96,0.013335061,25.91624477,58.2089,0.50%
9
+ Gemini-3-flash-preview,198,2.984602,50,39,35,80,0.002816407,12.50534394,22.5291,0.50%
10
+ Gemma-4-26b-a4b-it,192,2.60664,32,14,26,44,0.000200836,12.38208868,40.9958,3.52%
11
+ Gemma-4-31b-it,183,2.792605,41,36,31,60,0.00024191,45.28161148,174.1861,8.04%
12
+ GLM-4.7,194,2.916867,55,32,35,96,0.001388525,43.54876781,134.6278,2.51%
13
+ GLM-5,197,3.14801,67,43,52,98,0.005112927,60.30060454,183.2705,1.01%
14
+ Gpt-5.4 1 (xhigh),145,3.127315,68,58,59,87,0.063282983,131.0088557,325.7716,27.14%
15
+ Gpt-5.4-mini (xhigh),104,2.907268,59,52,46,83,0.020302969,86.91001719,240.8822,47.74%
16
+ Gpt-5.4-nano (xhigh),113,2.781217,48,42,34,76,0.004317464,93.33818571,262.2088,43.22%
17
+ Gpt-oss-120b,198,2.762188,38,24,22,66,0.000170961,18.02638485,63.3616,0.50%
18
+ Gpt-oss-20b,197,2.648577,28,11,8,60,0.000132809,42.86725918,165.5231,1.01%
19
+ Grok-4.1-fast,197,2.843729,49,24,27,93,0.001201663,36.20348296,95.8181,1.01%
20
+ Grok-4.20,189,3.00445,54,38,27,93,0.015201661,33.09318092,78.5717,5.03%
21
+ Kimi-K2.5,187,3.022371,59,35,39,96,0.001254149,53.60984548,177.5503,6.03%
22
+ Llama-4-maverick,195,2.269251,7,7,0,18,0.000278018,41.27163619,76.1007,2.01%
23
+ Mimo-V2-Pro,199,3.103969,63,41,46,95,0.003325538,25.88939247,54.6709,0.00%
24
+ Minimax-m2.5,184,2.790468,56,35,34,95,0.000530249,75.14721297,240.8106,7.54%
25
+ Minimax-m2.7,193,3.011179,61,39,51,85,0.000953587,27.06517621,57.5405,3.02%
26
+ Mistral-large-2512,193,2.624168,22,16,18,41,0.000962583,9.267698618,21.5524,3.02%
27
+ Mistral-small-4,194,2.68593,26,17,18,25,0.000501963,10.55675923,41.2537,2.51%
28
+ Nemotron-3-nano-30b-a3b,190,2.713783,19,14,4,41,0.000824281,105.9488772,284.5594,4.52%
29
+ Nemotron-3-super-120b-a12b,187,2.796666,40,29,25,68,0.000634388,69.39686943,245.2075,6.03%
30
+ Nova-2-lite-v1,181,2.65909,37,17,17,73,0.015561471,56.85229296,138.8302,9.05%
31
+ Qwen3.5-122b-a10b,198,2.835386,53,31,31,89,0.001435478,13.13992066,36.451,0.50%
32
+ Qwen3.5-35b-a3b,198,2.824791,44,27,21,94,0.001115424,14.33161555,30.7295,0.50%
33
+ Qwen3.6-plus,198,3.072465,62,44,43,95,0.001947453,46.4302181,106.8828,0.50%