PeterKruger committed on
Commit
3d84e78
·
verified ·
1 Parent(s): cf5cf07

Upload 8 files

Browse files

Adding run agentic 1

runs/agent1_2026-04-16/avg_latency.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,46.1336,46.1122,48.0425,60.335,36.1667,55.4396,39.5512,35.164,24.5206,52.1675,43.95979553
3
+ Claude-opus-4.6,42.1124,53.3346,45.461,43.0368,39.8221,37.2462,45.1839,28.905,16.9896,22.3545,37.85942643
4
+ Claude-sonnet-4.6,51.1068,52.8177,76.5625,50.1423,52.0923,42.691,52.3426,38.5508,18.7954,28.4199,46.45277089
5
+ Deepseek-v3.2,70.4417,60.617,46.1213,56.2128,79.4036,77.5038,55.4781,35.3541,26.0408,27.0283,54.31511897
6
+ GLM-4.7,50.4502,40.405,43.2821,66.6907,39.1247,48.016,52.613,36.519,23.0467,22.9732,43.54876781
7
+ GLM-5.1,70.5772,71.5014,86.1363,80.6663,75.4139,67.054,95.9933,32.9641,26.5569,35.1629,66.08806343
8
+ Gemini-3-flash-preview,16.7706,13.8003,14.096,13.6193,14.8726,12.9421,15.4606,9.8464,8.0062,8.2219,12.95436087
9
+ Gemini-3.1-flash-lite-preview,25.9681,15.6794,18.291,29.8768,31.6073,36.0709,29.2888,13.9176,12.3058,17.3742,23.16857653
10
+ Gemini-3.1-pro-preview,25.2172,28.3457,25.5406,28.3863,26.0591,29.5616,35.0487,23.2719,16.5858,20.0105,25.96268468
11
+ Gemma-4-26b-a4b-it,16.9871,10.217,13.0184,16.2429,16.5371,16.1576,20.9129,6.4718,7.1763,5.9912,13.64022252
12
+ Gemma-4-31b-it,41.6696,53.5776,81.3055,54.9655,52.0971,59.0855,83.5473,22.6931,24.6705,49.91,51.69145245
13
+ Gpt-5.4,190.6972,125.9768,111.2079,140.3793,81.5197,131.3381,101.409,160.1292,108.8256,175.7158,129.263534
14
+ Gpt-5.4-mini,153.3592,90.6251,81.1314,90.317,101.9265,103.8755,49.8503,91.5657,78.5646,26.5415,86.91001719
15
+ Gpt-5.4-nano,128.935,68.2781,67.8651,145.881,77.745,140.7391,82.563,78.7779,80.9436,78.7016,93.33818571
16
+ Gpt-oss-120b,18.207,12.6311,20.5413,25.8277,20.1329,22.4405,20.2597,14.1992,14.5309,6.511,18.02638485
17
+ Gpt-oss-20b,43.7038,43.953,49.1998,60.1144,48.8196,37.083,49.6733,32.327,22.8055,30.2565,42.86725918
18
+ Grok-4.1-fast,36.4589,39.2228,40.1179,31.148,35.6533,64.629,41.2036,30.4571,21.3232,29.431,36.20348296
19
+ Grok-4.20,35.1335,48.2891,37.4329,51.0896,35.0874,40.0045,36.0676,40.4575,19.2072,24.2121,36.77304221
20
+ Kimi-K2.5,87.0184,40.7341,54.4748,62.1407,52.2043,52.926,76.3145,40.2496,39.7803,31.1301,54.67825758
21
+ Llama-4-maverick,44.0942,43.6807,36.7344,44.4963,42.2188,45.3713,34.3164,42.2113,36.3075,45.0926,41.27163619
22
+ Mimo-V2-Pro,28.5039,28.9949,27.0924,30.5006,26.2146,27.5217,30.3201,19.0163,17.842,15.4146,25.60343568
23
+ Minimax-m2.5,93.1916,84.3699,103.6352,118.2817,98.4981,93.134,86.3949,45.7514,43.5987,60.6954,85.19656068
24
+ Minimax-m2.7,33.9994,25.9516,30.3238,34.9334,31.0331,37.5538,31.5957,17.6385,17.5101,15.506,28.14855769
25
+ Mistral-large-2512,11.0209,11.3873,12.334,9.9634,9.4495,9.7352,12.1589,5.7922,4.0873,5.7834,9.267698618
26
+ Mistral-small-4,10.4777,9.5762,11.4058,9.2103,22.2899,9.7938,12.1189,4.8819,4.3778,6.4862,10.55675923
27
+ Nemotron-3-nano-30b-a3b,136.4421,120.1786,121.2343,103.1525,107.5256,131.6642,127.4537,68.5201,53.644,60.038,102.9642784
28
+ Nemotron-3-super-120b-a12b,109.4218,65.4083,83.8886,86.9403,89.8686,82.8918,72.4423,33.2578,37.0007,44.3284,71.86705572
29
+ Nova-2-lite-v1,84.8424,69.59,61.1813,79.1771,69.1531,58.651,81.8847,23.9343,12.6582,16.2989,56.85229296
30
+ Qwen3.5-122b-a10b,15.3923,14.5582,13.6746,13.4825,13.4436,14.0513,19.8363,8.3142,8.0972,8.4222,13.13992066
31
+ Qwen3.5-35b-a3b,18.6065,17.7434,17.0379,17.1596,14.0943,16.2087,18.5391,6.9951,6.5958,8.4414,14.33161555
32
+ Qwen3.6-plus,55.5548,40.2061,61.5001,60.691,43.9526,47.4626,60.6145,24.8841,32.3799,40.6555,47.51772478
runs/agent1_2026-04-16/correlations.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "correlations": {
3
+ "Artificial Analysis Intelligence Index": 82.71,
4
+ "Terminal-Bench Hard": 80.25,
5
+ "GDPval-AA":81.81,
6
+ "Tau2-Bench Telecom":66.45
7
+
8
+ },
9
+ "description": "Correlation percentages between AutoBench scores and main agentic benchmark scores"
10
+ }
runs/agent1_2026-04-16/cost_data.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,0.00808905,0.00876072,0.01240729,0.01352842,0.00610779,0.00782664,0.00615164,0.00776982,0.00667186,0.0082366,0.00860
3
+ Claude-opus-4.6,0.027098,0.03036395,0.02974176,0.02710231,0.0267796,0.02575733,0.03427523,0.02213794,0.01376205,0.017109,0.02563
4
+ Claude-sonnet-4.6,0.01848345,0.01957926,0.02999362,0.02050589,0.02137464,0.01709979,0.02698677,0.01397471,0.01081964,0.0146042,0.01948
5
+ Deepseek-v3.2,0.00072267,0.00063554,0.00049758,0.00058632,0.00067593,0.00070094,0.00056376,0.00051767,0.00045667,0.00051202,0.00059
6
+ GLM-4.7,0.00147033,0.00124156,0.00162454,0.00190286,0.00131191,0.00139006,0.0018656,0.00102319,0.00086426,0.00080832,0.00139
7
+ GLM-5.1,0.00510934,0.00532995,0.00614466,0.00567328,0.00549074,0.00465484,0.00616262,0.00451313,0.00289774,0.00309761,0.00499
8
+ Gemini-3-flash-preview,0.00303385,0.00267647,0.002966,0.00268887,0.00277996,0.00250525,0.00312325,0.00368191,0.0019587,0.0024742,0.00278
9
+ Gemini-3.1-flash-lite-preview,0.00118982,0.00120504,0.00110204,0.00110311,0.00109502,0.0009483,0.001208,0.00144613,0.00113141,0.00113895,0.00115
10
+ Gemini-3.1-pro-preview,0.0121991,0.01238768,0.013072,0.01407541,0.01194816,0.01342957,0.01415764,0.01647259,0.01054545,0.01578933,0.01328
11
+ Gemma-4-26b-a4b-it,0.0001865,0.00018407,0.00023366,0.00022579,0.00019316,0.0002439,0.00028535,0.00014517,0.00010597,0.0001335,0.00020
12
+ Gemma-4-31b-it,0.000262,0.0002922,0.00026882,0.00027561,0.0002497,0.00026655,0.00027881,0.00021136,0.00013407,0.00018046,0.00024
13
+ Gpt-5.4,0.06666437,0.06379091,0.0584385,0.0612925,0.04071687,0.05445833,0.02721679,0.07230909,0.05625375,0.09743614,0.05821
14
+ Gpt-5.4-mini,0.03089784,0.01762582,0.01512068,0.01276102,0.02595707,0.01934788,0.01180568,0.03532892,0.02447409,0.00615875,0.02030
15
+ Gpt-5.4-nano,0.00457758,0.00190104,0.00452011,0.00490867,0.00343015,0.0061414,0.00485002,0.00545835,0.00433444,0.00393266,0.00432
16
+ Gpt-oss-120b,0.00015626,0.00021258,0.00017285,0.00019027,0.00016331,0.00021977,0.00017995,0.00013667,0.0001373,0.00014125,0.00017
17
+ Gpt-oss-20b,0.00012129,0.00013599,0.00013857,0.00013626,0.00011116,0.00012501,0.00014351,0.0001672,0.00010762,0.00015682,0.00013
18
+ Grok-4.1-fast,0.00118526,0.00146777,0.00127826,0.00110943,0.00118303,0.00121426,0.00119787,0.00121695,0.00102116,0.00123915,0.00120
19
+ Grok-4.20,0.01342326,0.018248,0.01405763,0.01805544,0.01293952,0.01611514,0.01163133,0.02519412,0.01138838,0.01545813,0.01536
20
+ Kimi-K2.5,0.00133754,0.00132217,0.00140378,0.00141836,0.00115815,0.00111338,0.00149777,0.00132587,0.00094932,0.00101612,0.00126
21
+ Llama-4-maverick,0.00023614,0.00027203,0.00030222,0.00024075,0.00027289,0.00029246,0.00027234,0.00037564,0.00025891,0.00030767,0.00028
22
+ Mimo-V2-Pro,0.00346965,0.00348921,0.00345641,0.00368526,0.00328376,0.0031878,0.00370886,0.00311506,0.00248505,0.00210693,0.00324
23
+ Minimax-m2.5,0.00051993,0.00059681,0.00065405,0.00056667,0.00054284,0.0005375,0.00060684,0.00044259,0.0003387,0.00033676,0.00052
24
+ Minimax-m2.7,0.00096638,0.00099688,0.00113488,0.00098879,0.00097712,0.00093218,0.00114299,0.00083556,0.00057865,0.00078994,0.00094
25
+ Mistral-large-2512,0.00101718,0.00098131,0.00120288,0.00106283,0.00098152,0.0010059,0.0012328,0.00081516,0.00053648,0.00070873,0.00096
26
+ Mistral-small-4,0.00048613,0.00051284,0.00058871,0.00042199,0.00080302,0.00043751,0.00056428,0.00038369,0.00029427,0.00043662,0.00050
27
+ Nemotron-3-nano-30b-a3b,0.00088842,0.0008176,0.00082007,0.00065529,0.00081284,0.00093832,0.00069204,0.00121825,0.00075232,0.00071001,0.00082
28
+ Nemotron-3-super-120b-a12b,0.00065421,0.00045663,0.00069701,0.00106853,0.00055697,0.00107016,0.00094218,0.00042322,0.00037991,0.00051843,0.00068
29
+ Nova-2-lite-v1,0.0192584,0.01805111,0.01775971,0.0203214,0.01862289,0.01734272,0.02235556,0.00855767,0.00497041,0.00613215,0.01556
30
+ Qwen3.5-122b-a10b,0.00162306,0.00118032,0.00122518,0.00135294,0.00142354,0.00105206,0.00181228,0.00153387,0.00144515,0.00162699,0.00144
31
+ Qwen3.5-35b-a3b,0.00116103,0.001385,0.00132452,0.00107441,0.00102931,0.00110916,0.00114048,0.00102208,0.00082191,0.00120101,0.00112
32
+ Qwen3.6-plus,0.00160726,0.00113903,0.00224072,0.00287606,0.00137679,0.00148291,0.00189326,0.00202712,0.00227825,0.00252956,0.00197
runs/agent1_2026-04-16/domain_ranks.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,3.0614,2.8375,2.9419,3.07,3.221,2.9925,2.8819,2.4656,2.8192,2.6517,2.9151
3
+ Claude-opus-4.6,3.2254,3.2528,3.1685,3.1008,3.6112,3.143,3.3686,2.6688,2.972,2.9478,3.1682
4
+ Claude-sonnet-4.6,3.0719,2.8159,3.3619,3.1191,3.622,3.0645,3.3271,2.7493,2.9482,2.9773,3.1262
5
+ Deepseek-v3.2,2.6536,2.5936,2.5558,2.3426,2.6388,2.4324,2.5576,2.5272,2.6462,2.5903,2.5518
6
+ GLM-4.7,2.8156,2.7517,3.0237,2.8089,3.1913,3.1423,3.2166,2.4618,2.9373,2.6377,2.9172
7
+ GLM-5.1,2.9743,3.1265,3.1676,3.0665,3.2953,3.0381,3.2561,2.6182,3.0416,2.7205,3.0559
8
+ Gemini-3-flash-preview,3.0713,3.0608,3.107,2.6955,3.0183,3.0539,2.9513,2.7208,2.6366,2.6772,2.8926
9
+ Gemini-3.1-flash-lite-preview,2.7071,2.9008,2.7554,2.6945,3.0595,2.9101,3.1382,2.5291,2.6783,2.7213,2.8201
10
+ Gemini-3.1-pro-preview,3.1791,3.2185,3.2664,2.9015,3.2574,2.8581,3.2808,2.5238,3.1878,3.1821,3.0959
11
+ Gemma-4-26b-a4b-it,2.2699,2.5902,2.6241,2.4963,2.5757,2.7545,2.8982,2.1141,2.4297,2.5124,2.5328
12
+ Gemma-4-31b-it,2.8553,2.7678,2.9285,2.5916,2.7969,2.7997,2.7872,2.2097,2.6403,2.6407,2.6980
13
+ Gpt-5.4,3.102,3.0287,3.2705,3.0277,3.0711,3.2832,3.0165,2.5045,2.922,3.0751,3.0246
14
+ Gpt-5.4-mini,3.3417,3.0365,2.8875,2.5461,3.2261,3.1224,2.791,2.393,2.9426,2.8658,2.9075
15
+ Gpt-5.4-nano,2.9039,2.764,2.9425,2.269,2.913,2.7384,2.7862,2.5623,2.9551,2.9552,2.7823
16
+ Gpt-oss-120b,2.8299,2.9759,2.8175,2.6792,2.9697,2.6809,2.6406,2.3854,2.8958,2.6491,2.7639
17
+ Gpt-oss-20b,2.4834,2.9056,2.65,2.4672,2.7635,2.6706,2.5309,2.4596,2.8156,2.8078,2.6495
18
+ Grok-4.1-fast,2.7256,2.7798,2.8956,2.7138,2.9297,2.9768,3.0699,2.4118,2.8696,3.0586,2.8449
19
+ Grok-4.20,2.8101,2.992,2.8084,2.8254,3.069,3.0622,2.9404,2.6096,3.0932,2.8865,2.9155
20
+ Kimi-K2.5,2.7529,2.9016,3.1202,2.8565,3.0409,3.2633,3.1816,2.5213,2.9282,2.7499,2.9215
21
+ Llama-4-maverick,2.2207,2.1623,2.2359,2.2829,2.1537,2.4089,2.1973,2.2927,2.3858,2.4429,2.2695
22
+ Mimo-V2-Pro,3.165,3.1059,3.0574,2.8061,3.2456,2.9782,3.2639,2.4309,2.8583,2.8374,2.9878
23
+ Minimax-m2.5,2.6819,2.5663,2.7079,2.49,3.1716,2.8785,2.9233,2.4104,2.6426,2.5862,2.7226
24
+ Minimax-m2.7,2.9146,3.1791,3.111,2.733,3.1253,2.7982,3.1518,2.5031,2.6978,2.7335,2.9030
25
+ Mistral-large-2512,2.4911,2.9176,2.6082,2.4303,2.5595,2.5928,2.9121,2.4414,2.7411,2.5481,2.6249
26
+ Mistral-small-4,2.6428,2.7426,2.6579,2.5289,2.7196,2.669,2.785,2.6008,2.861,2.636,2.6874
27
+ Nemotron-3-nano-30b-a3b,2.6756,2.6616,2.387,2.5064,2.8724,2.6711,2.6363,2.5052,2.7506,2.574,2.6313
28
+ Nemotron-3-super-120b-a12b,2.5963,2.6885,2.7757,2.6367,2.5973,2.9963,2.5992,2.7177,2.7433,2.7751,2.6956
29
+ Nova-2-lite-v1,2.4537,2.6576,2.7964,2.5301,2.6316,2.8817,2.6522,2.4999,2.8745,2.6604,2.6603
30
+ Qwen3.5-122b-a10b,2.6317,2.7519,2.9903,2.613,3.1279,2.8175,3.0919,2.5094,2.8085,2.9913,2.8363
31
+ Qwen3.5-35b-a3b,2.9033,2.7141,2.9996,2.5789,2.9716,2.8106,2.9955,2.3478,2.9025,3.0803,2.8262
32
+ Qwen3.6-plus,2.9985,3.0064,3.0355,2.8342,3.114,2.9565,3.192,2.7408,2.9821,3.041,2.9923
runs/agent1_2026-04-16/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "run_2026-04-16",
3
+ "title": "AutoBench Agentic Run 1 - April 2026",
4
+ "date": "2026-04-16",
5
+ "description": "The first AutoBench run to measure agentic performance of top LLMs",
6
+ "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-agentic-1",
7
+ "model_count": 31,
8
+ "is_latest": true
9
+ }
runs/agent1_2026-04-16/models.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ company_id,model_name,model_display_name,model_logo_url,api_type,context,parameters,model_version,release_date,thinking_mode,description
2
+ 3,Gemini-3-flash-preview,Gemini 3 Flash Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,17/12/2025,1,Gemini 3 Flash Preview is a highly efficient model delivering Gemini 3 Pro-level reasoning and near real-time agentic tool orchestration with significantly lower latency and cost.
3
+ 3,Gemini-3.1-flash-lite-preview,Gemini 3.1 Flash Lite Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,03/03/2026,1,"Gemini 3.1 Flash Lite Preview is an ultra-efficient, high-volume workhorse model featuring a 1M context window and 2.5x faster time-to-first-token than previous generations."
4
+ 3,Gemma-4-31b-it,Gemma 4 31B IT,/logos/models/gemma.svg,OpenRouter,262144,31000000000,,31/03/2026,1,"Gemma 4 31B IT is an open-weights dense multimodal model from Google DeepMind featuring a 256K context window, native video/audio processing, and advanced configurable reasoning capabilities under an Apache 2.0 license."
5
+ 3,Gemma-4-26b-a4b-it,Gemma 4 26B A4B IT,/logos/models/gemma.svg,OpenRouter,262144,26000000000,,31/03/2026,1,Gemma 4 26B A4B IT is a latency-optimized open-weights MoE model activating only 3.8B parameters per token. It delivers near-31B dense quality while preserving hardware constraints for edge and enterprise deployments.
6
+ 3,Gemini-3.1-pro-preview,Gemini 3.1 Pro Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,19/02/2026,1,"Gemini 3.1 Pro Preview is Google's flagship reasoning model featuring a 1M token context, three-tier adjustable reasoning depth controls, and unparalleled complex problem-solving capabilities across multimodal inputs."
7
+ 1,Gpt-5.4-mini,GPT-5.4 Mini,/logos/models/openai.svg,OpenRouter,400000,,,17/03/2026,1,"GPT-5.4 Mini is OpenAI's highly efficient small model offering 2x faster execution than GPT-5 Mini. It supports a 400K context window and achieves 54.4% on SWE-Bench Pro, ideal for responsive coding assistants."
8
+ 1,Gpt-5.4-nano,GPT-5.4 Nano,/logos/models/openai.svg,OpenRouter,400000,,,16/03/2026,1,"GPT-5.4 Nano is OpenAI's most cost-effective tier, optimized for massive-scale classification and supporting subagents. It features a 400K context window at just $0.20 per million input tokens."
9
+ 1,Gpt-oss-120b,Gpt oss 120b,/logos/models/openai.svg,OpenRouter,131000,1.17E+11,,05/08/2025,1,"GPT-OSS-120B is an open-weight MoE model from OpenAI containing 116.8B total parameters (5.1B active). Licensed under Apache 2.0, it is post-trained with MXFP4 quantization to run inference efficiently on a single 80GB GPU."
10
+ 1,Gpt-oss-20b,Gpt oss 20b,/logos/models/openai.svg,OpenRouter,131000,21000000000,,05/08/2025,1,GPT-OSS-20B is a compact open-weight MoE model from OpenAI containing 21B parameters (3.6B active). It uses grouped multi-query attention for low-latency inference on consumer hardware under an Apache 2.0 license.
11
+ 1,Gpt-5.4,GPT-5.4,/logos/models/openai.svg,OpenRouter,1048576,,,05/03/2026,1,"GPT-5.4 is OpenAI's flagship frontier model, natively integrating frontier coding (57.7% SWE-bench Pro), state-of-the-art computer-use abilities, and deep agentic workflows over a 1.05M token context window."
12
+ 30,Deepseek-v3.2,Deepseek v3.2,/logos/models/deepseek.svg,OpenRouter,163840,6.85E+11,,01/12/2025,1,"DeepSeek V3.2 is a 685B parameter MoE model leveraging DeepSeek Sparse Attention (DSA). It excels in complex mathematical reasoning and programming competitions, featuring integrated tool-use thinking modes."
13
+ 23,Grok-4.1-fast,Grok 4.1 fast,/logos/models/grok.svg,OpenRouter,2000000,,,19/11/2025,1,"Grok 4.1 Fast provides an immense 2M token context window. It is specifically optimized for high-speed document retrieval, customer support automation, and processing massive data pipelines."
14
+ 23,Grok-4.20,Grok 4.20,/logos/models/grok.svg,OpenRouter,2000000,6E+12,,03/03/2026,1,Grok 4.20 is a revolutionary ~6-trillion parameter MoE model that runs four specialized agents simultaneously on a shared backbone. It utilizes persona adapters to coordinate multi-agent workflows within a 2M token context.
15
+ 2,Claude-haiku-4.5,Claude haiku 4.5,/logos/models/claude.svg,OpenRouter,200000,,,16/10/2025,1,"Claude Haiku 4.5 is Anthropic's fastest and most cost-effective model, featuring a 200K context window. It delivers near-frontier reasoning and coding speeds suitable for real-time agentic applications."
16
+ 2,Claude-opus-4.6,Claude Opus 4.6,/logos/models/claude.svg,OpenRouter,1000000,,,05/02/2026,1,"Claude Opus 4.6 is Anthropic's pinnacle reasoning engine, featuring a 1M token context and leading scores on Terminal-Bench 2.0. It leverages Context Compaction to sustain infinitely long agentic coding workflows."
17
+ 2,Claude-sonnet-4.6,Claude Sonnet 4.6,/logos/models/claude.svg,OpenRouter,1000000,,,17/02/2026,1,"Claude Sonnet 4.6 represents a total upgrade in knowledge work and design. It achieves unprecedented computer-use reliability, executing complex UI automation and software engineering across a 1M token context window."
18
+ 27,Nemotron-3-nano-30b-a3b,Nemotron 3 Nano 30B A3B,/logos/companies/nvidia.svg,OpenRouter,262000,3.16E+10,,15/12/2025,1,Nemotron 3 Nano 30B A3B is a highly efficient 31.6B total parameter MoE model activating only 3.2B parameters. It offers a 1M token context window and up to 3.3x higher throughput for agentic systems.
19
+ 27,Nemotron-3-super-120b-a12b,Nemotron 3 Super 120B A12B,/logos/companies/nvidia.svg,OpenRouter,262000,1.20E+11,,11/03/2026,1,Nemotron 3 Super is a 120B parameter hybrid Mamba-Transformer model (12B active). It utilizes LatentMoE and Multi-Token Prediction (MTP) to maximize compute efficiency for complex RAG and IT ticket automation.
20
+ 7,Nova-2-lite-v1,Nova 2 lite v1,/logos/models/nova.svg,OpenRouter,1000000,,,02/12/2025,1,"Amazon Nova 2 Lite is a cost-efficient multimodal engine with a 1M token context. It seamlessly processes text, code, images, and video, natively supporting python interpreter tools for data analysis workflows."
21
+ 13,Mistral-large-2512,Mistral large 2512,/logos/models/mistral.svg,OpenRouter,262144,6.75E+11,,01/12/2025,,Mistral Large 3 is a massive open-weight granular MoE model featuring 675B total parameters (41B active). It offers top-tier reliability for production-grade assistants and long-context code comprehension.
22
+ 13,Mistral-small-4,Mistral Small 4,/logos/models/mistral.svg,OpenRouter,262144,1.19E+11,,16/03/2026,1,"Mistral Small 4 unifies Instruct, Magistral, and Devstral capabilities into a single 119B MoE architecture activating just 6.5B parameters. It offers configurable reasoning effort and native multimodality."
23
+ 29,Kimi-K2.5,Kimi K2.5,/logos/models/kimi.svg,OpenRouter,262144,1E+12,,27/01/2026,1,Kimi K2.5 is a 1-trillion parameter open-weight MoE (32B active). It features a native MoonViT encoder and self-directed Agent Swarm technology capable of orchestrating 100 sub-agents in parallel.
24
+ 31,Minimax-m2.7,MiniMax M2.7,/logos/models/minimax.svg,OpenRouter,204800,2.3E+11,,11/04/2026,1,"MiniMax M2.7 is a 230B parameter MoE model (10B active) utilizing RoPE and QK RMSNorm. It features recursive self-optimization, updating its own memory to execute highly complex software engineering tasks."
25
+ 31,Minimax-m2.5,MiniMax M2.5,/logos/models/minimax.svg,OpenRouter,196608,2.3E+11,,15/02/2026,1,"MiniMax M2.5 is a hyper-efficient 230B MoE model (10B active) trained via large-scale RL in 200,000+ environments. It excels in office productivity, outputting at 100 tokens/sec at unprecedented cost efficiency."
26
+ 27,Llama-4-maverick,Llama 4 Maverick,/logos/companies/nvidia.svg,OpenRouter,1000000,4E+11,,05/04/2025,,Llama 4 Maverick is Meta's natively multimodal 400B MoE model (17B active). It utilizes early fusion of text and vision tokens and was codistilled using online RL to master complex visual-reasoning tasks.
27
+ 18,Qwen3.5-35b-a3b,Qwen3.5 35B A3B,/logos/models/qwen.svg,OpenRouter,262144,35000000000,,25/02/2026,1,Qwen3.5 35B A3B is an efficient hybrid Gated DeltaNet + MoE transformer activating 3B of its 35B parameters. It delivers massive multimodal capabilities and 201-language support under an Apache 2.0 license.
28
+ 18,Qwen3.6-plus,Qwen3.6 Plus,/logos/models/qwen.svg,OpenRouter,1000000,,,30/03/2026,1,"Qwen3.6 Plus is Alibaba's proprietary flagship featuring a 1M token context. It provides a superior ""vibe coding"" experience through highly stable hybrid thinking modes and repository-level problem solving."
29
+ 18,Qwen3.5-122b-a10b,Qwen3.5 122B A10B,/logos/models/qwen.svg,OpenRouter,262144,1.22E+11,,25/02/2026,1,"Qwen3.5 122B A10B balances high performance with efficiency, activating 10B of its 122B parameters. It achieves 72.4% on SWE-bench Verified, making it a premier open-weight model for agentic workflows."
30
+ 22,GLM-5.1,GLM 5.1,/logos/models/glm.svg,OpenRouter,202752,7.44E+11,,27/03/2026,1,"GLM-5.1 is an open-weight 744B MoE model (40B active) released under the MIT license. Integrating DeepSeek Sparse Attention, it matches proprietary frontier models on SWE-Bench Pro (58.4%)."
31
+ 22,GLM-4.7,GLM 4.7,/logos/models/glm.svg,OpenRouter,202752,3.58E+11,,22/12/2025,1,GLM-4.7 is a highly stable 358B parameter model optimized for coding and UI generation. It utilizes Interleaved Thinking and Turn-level Thinking for reliable execution of complex mathematical tasks.
32
+ 40,Mimo-V2-Pro,Mimo V2 Pro,/logos/models/mimo.svg,OpenRouter,1000000,1E+12,,18/03/2026,1,"MiMo V2 Pro is Xiaomi's flagship ~1-Trillion parameter MoE (42B active) agentic engine. Achieving an Elo of 1426 on GDPval-AA, it is designed for extreme reliability in long-horizon autonomous task execution."
runs/agent1_2026-04-16/p99_latency.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,91.0773,110.2682,127.0755,276.1924,113.2281,243.8365,95.0228,97.5884,77.9215,290.4749,152.2685
3
+ Claude-opus-4.6,80.041,197.754,100.162,80.4153,86.5288,110.9398,150.5253,69.7232,44.9602,52.2452,97.3295
4
+ Claude-sonnet-4.6,154.3386,189.2712,278.3433,153.4241,207.9232,104.0079,141.6592,134.9327,43.7089,63.8222,147.1431
5
+ Deepseek-v3.2,188.9782,107.3153,112.5801,185.4105,259.0915,250.2864,125.7354,221.9516,57.3445,55.6669,156.436
6
+ GLM-4.7,129.6235,110.1411,95.4605,303.947,131.7817,89.6153,211.9996,160.5917,63.7457,49.3721,134.6278
7
+ GLM-5.1,167.4809,251.4347,285.7352,312.5488,233.0885,168.078,295.6721,104.8135,72.4833,86.1206,197.7456
8
+ Gemini-3-flash-preview,27.2873,27.2941,25.3661,24.226,26.8827,19.5807,55.168,22.6091,15.7135,13.7824,25.791
9
+ Gemini-3.1-flash-lite-preview,158.9503,43.2124,83.2036,191.2867,237.7518,119.9766,140.9744,86.1881,24.6225,57.7764,114.3943
10
+ Gemini-3.1-pro-preview,40.5014,53.4536,43.3775,55.1287,87.1046,52.6185,113.2668,46.1479,43.2473,44.1445,57.8991
11
+ Gemma-4-26b-a4b-it,41.4358,37.5319,33.5177,49.9711,58.6268,47.8931,109.4397,22.3967,23.3193,15.6139,43.9746
12
+ Gemma-4-31b-it,94.5327,193.1804,250.3586,189.0602,168.4594,146.1042,288.4547,43.589,84.5247,313.5323,177.1796
13
+ Gpt-5.4,357.3064,304.6053,300.9001,395.2089,208.725,333.4138,313.7329,396.4187,330.2345,353.3553,329.3901
14
+ Gpt-5.4-mini,333.0933,217.5008,303.0277,239.124,302.5534,198.3588,179.3315,267.1145,313.2008,55.5172,240.8822
15
+ Gpt-5.4-nano,254.3095,188.8738,196.3221,291.0544,196.7436,301.9089,327.645,320.1109,320.8538,224.2656,262.2088
16
+ Gpt-oss-120b,51.5995,33.3092,73.1303,66.8018,76.0803,43.4072,91.5204,56.4928,126.5068,14.7681,63.3616
17
+ Gpt-oss-20b,185.9629,182.1386,169.189,208.175,174.7058,141.3277,205.7395,137.3125,145.4453,105.2351,165.5231
18
+ Grok-4.1-fast,98.0764,86.5536,110.6121,54.0332,88.8177,236.0511,86.7175,73.692,56.2482,67.3797,95.8181
19
+ Grok-4.20,60.9856,180.7621,61.1629,297.1904,70.4811,86.8368,72.1829,67.2458,40.2727,41.1219,97.8242
20
+ Kimi-K2.5,334.1608,122.3857,189.8468,158.9584,191.315,104.8266,274.5484,124.984,162.1486,81.6347,174.4809
21
+ Llama-4-maverick,64.5353,73.8019,67.0179,63.9908,82.1587,118.3985,76.4981,72.3758,68.6175,73.6123,76.1007
22
+ Mimo-V2-Pro,45.9749,69.6757,43.2072,53.6061,57.1214,57.5479,72.8357,49.422,63.7333,28.8174,54.1942
23
+ Minimax-m2.5,239.6571,217.6127,267.7839,395.5708,245.0672,268.9264,239.7041,113.4059,200.6302,218.9969,240.7355
24
+ Minimax-m2.7,70.5145,63.2912,70.5855,73.8646,65.1628,147.4334,62.5303,45.931,39.8098,42.5468,68.167
25
+ Mistral-large-2512,26.3446,27.9479,30.0158,21.8918,21.6876,15.6656,29.0625,18.6591,10.0165,14.2327,21.5524
26
+ Mistral-small-4,22.5922,18.2761,26.9347,17.2356,243.0557,20.7109,26.2265,8.4539,11.0912,17.9602,41.2537
27
+ Nemotron-3-nano-30b-a3b,312.2043,358.6536,290.4211,228.6499,278.3621,372.3838,282.7243,194.9471,267.3245,208.9367,279.4607
28
+ Nemotron-3-super-120b-a12b,346.3498,232.4492,206.6109,306.4442,295.0199,227.9731,316.5712,169.1652,212.1549,141.617,245.4356
29
+ Nova-2-lite-v1,174.3589,185.7674,131.2926,142.2304,201.7181,135.5685,185.4033,118.5547,43.3698,70.0387,138.8302
30
+ Qwen3.5-122b-a10b,40.9727,34.9177,36.3253,30.2049,46.4513,32.0967,78.2364,17.6285,25.204,22.4722,36.451
31
+ Qwen3.5-35b-a3b,34.734,51.2018,34.9103,37.8562,30.8286,29.5903,43.737,13.6506,14.1181,16.6684,30.7295
32
+ Qwen3.6-plus,99.6381,113.883,180.359,163.168,85.5265,90.039,114.3871,58.4227,90.5119,97.8585,109.3794
runs/agent1_2026-04-16/summary_data.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
+ Claude-haiku-4.5,195,2.92,40,27,34,55%,0.00860,44,152,1.52%
3
+ Claude-opus-4.6,198,3.17,68,46,56,92%,0.02563,38,97,0.00%
4
+ Claude-sonnet-4.6,197,3.13,63,53,58,76%,0.01948,46,147,0.51%
5
+ Deepseek-v3.2,198,2.55,53,36,35,91%,0.00059,54,156,0.00%
6
+ Gemini-3.1-flash-lite-preview,179,2.82,26,24,21,31%,0.00115,23,114,9.60%
7
+ Gemini-3.1-pro-preview,198,3.10,59,54,41,96%,0.01328,26,58,0.00%
8
+ Gemini-3-flash-preview,198,2.89,50,39,35,80%,0.00278,13,26,0.00%
9
+ Gemma-4-26b-a4b-it,186,2.53,32,14,26,44%,0.00020,14,44,6.06%
10
+ Gemma-4-31b-it,169,2.70,41,36,31,60%,0.00024,52,177,14.65%
11
+ GLM-4.7,194,2.92,55,32,35,96%,0.00139,44,135,2.02%
12
+ GLM-5.1,197,3.06,67,43,52,98%,0.00499,66,198,0.51%
13
+ Gpt-5.4,132,3.02,68,58,59,87%,0.05821,129,329,33.33%
14
+ Gpt-5.4-mini,104,2.91,59,52,46,83%,0.02030,87,241,47.47%
15
+ Gpt-5.4-nano,113,2.78,48,42,34,76%,0.00432,93,262,42.93%
16
+ Gpt-oss-120b,198,2.76,38,24,22,66%,0.00017,18,63,0.00%
17
+ Gpt-oss-20b,197,2.65,28,11,8,60%,0.00013,43,166,0.51%
18
+ Grok-4.1-fast,197,2.84,49,24,27,93%,0.00120,36,96,0.51%
19
+ Grok-4.20,187,2.92,54,38,27,93%,0.01536,37,98,5.56%
20
+ Kimi-K2.5,182,2.92,59,35,39,96%,0.00126,55,174,8.08%
21
+ Llama-4-maverick,195,2.27,7,7,0,18%,0.00028,41,76,1.52%
22
+ Mimo-V2-Pro,198,2.99,63,41,46,95%,0.00324,26,54,0.00%
23
+ Minimax-m2.5,192,2.72,56,35,34,95%,0.00052,85,241,3.03%
24
+ Minimax-m2.7,193,2.90,61,39,51,85%,0.00094,28,68,2.53%
25
+ Mistral-large-2512,193,2.62,22,16,18,41%,0.00096,9,22,2.53%
26
+ Mistral-small-4,194,2.69,26,17,18,25%,0.00050,11,41,2.02%
27
+ Nemotron-3-nano-30b-a3b,189,2.63,19,14,4,41%,0.00082,103,279,4.55%
28
+ Nemotron-3-super-120b-a12b,183,2.70,40,29,25,68%,0.00068,72,245,7.58%
29
+ Nova-2-lite-v1,181,2.66,37,17,17,73%,0.01556,57,139,8.59%
30
+ Qwen3.5-122b-a10b,198,2.84,53,31,31,89%,0.00144,13,36,0.00%
31
+ Qwen3.5-35b-a3b,198,2.83,44,27,21,94%,0.00112,14,31,0.00%
32
+ Qwen3.6-plus,197,2.99,62,44,43,95%,0.00197,48,109,0.51%