Spaces:

AutoBench
/

AutoBench-Leaderboard

Running

App Files Files Community

PeterKruger committed on Apr 19

Commit

46682e3

verified ·

1 Parent(s): a3ad7b1

Upload 8 files

Browse files

Updated benchmark with Opus 4.7

Files changed (8) hide show

runs/agent1_2026-04-16/avg_latency.csv +33 -0
runs/agent1_2026-04-16/correlations.json +10 -0
runs/agent1_2026-04-16/cost_data.csv +33 -0
runs/agent1_2026-04-16/domain_ranks.csv +33 -0
runs/agent1_2026-04-16/metadata.json +10 -0
runs/agent1_2026-04-16/models.csv +33 -0
runs/agent1_2026-04-16/p99_latency.csv +33 -0
runs/agent1_2026-04-16/summary_data.csv +33 -0

runs/agent1_2026-04-16/avg_latency.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
+Claude-haiku-4.5,43.618,46.5163,46.0829,60.58,36.0973,51.8384,37.1467,36.2839,23.389,50.41,43.07080917
+Claude-opus-4.6,42.316,54.4556,46.7135,42.7605,38.1893,38.7216,44.2887,29.0244,17.2886,21.895,37.87360967
+Claude-opus-4.7,23.4598,23.4768,26.6156,26.4562,24.31,21.8974,27.6346,11.3351,9.4661,11.3215,20.84747735
+Claude-sonnet-4.6,50.6279,54.7831,79.7373,49.8303,51.1988,41.3607,53.7757,44.2152,18.7172,29.9558,47.09331481
+Deepseek-v3.2,73.6767,59.4786,43.4286,49.8724,81.7804,73.9997,62.3873,25.2281,26.8864,29.6302,53.5962432
+Gemini-3.1-flash-lite-preview,25.9681,15.6794,18.291,29.8768,31.6073,36.0709,29.2888,13.9176,12.3058,17.3742,23.16857653
+Gemini-3.1-pro-preview,26.1382,28.8217,25.2886,27.8934,26.6653,27.3855,34.4956,23.248,16.6175,20.4602,25.91624477
+Gemini-3-flash-preview,15.8823,13.766,13.9335,12.9937,14.3633,11.8534,13.6619,10.2985,8.6949,7.8138,12.50534394
+Gemma-4-26b-a4b-it,13.7075,11.139,13.8939,15.1396,13.8836,14.9236,19.4782,6.3775,5.6871,5.4832,12.38208868
+Gemma-4-31b-it,39.1957,47.4958,70.79,50.0589,43.4831,51.4507,69.1917,20.9786,19.5977,45.3389,45.28161148
+GLM-4.7,50.4502,40.405,43.2821,66.6907,39.1247,48.016,52.613,36.519,23.0467,22.9732,43.54876781
+GLM-5.1,66.5905,67.1936,72.3457,67.9401,71.3968,61.2796,87.5068,31.5902,25.7274,36.1258,60.30060454
+Gpt-5.4,181.3289,117.5913,108.796,140.5637,86.8286,96.5208,117.2563,196.7529,113.1724,174.4537,131.0088557
+Gpt-5.4-mini,153.3592,90.6251,81.1314,90.317,101.9265,103.8755,49.8503,91.5657,78.5646,26.5415,86.91001719
+Gpt-5.4-nano,128.935,68.2781,67.8651,145.881,77.745,140.7391,82.563,78.7779,80.9436,78.7016,93.33818571
+Gpt-oss-120b,18.207,12.6311,20.5413,25.8277,20.1329,22.4405,20.2597,14.1992,14.5309,6.511,18.02638485
+Gpt-oss-20b,43.7038,43.953,49.1998,60.1144,48.8196,37.083,49.6733,32.327,22.8055,30.2565,42.86725918
+Grok-4.1-fast,36.4589,39.2228,40.1179,31.148,35.6533,64.629,41.2036,30.4571,21.3232,29.431,36.20348296
+Grok-4.20,32.9577,46.9123,36.7891,35.3105,31.7187,36.9108,35.6628,35.6581,19.6062,23.8123,33.09318092
+Kimi-K2.5,70.4371,41.202,53.0478,70.3109,49.9411,46.826,73.5591,46.9076,40.0547,31.1301,53.60984548
+Llama-4-maverick,44.0942,43.6807,36.7344,44.4963,42.2188,45.3713,34.3164,42.2113,36.3075,45.0926,41.27163619
+Mimo-V2-Pro,26.9537,29.1939,26.7587,32.303,27.6827,28.3103,30.7455,18.8794,17.5233,15.3474,25.88939247
+Minimax-m2.5,76.0348,77.7358,102.6096,108.8648,97.952,73.6815,66.3169,38.8389,35.1812,53.124,75.14721297
+Minimax-m2.7,34.2989,26.1851,29.2574,34.729,29.8218,30.4692,31.8216,18.048,14.8376,14.5781,27.06517621
+Mistral-large-2512,11.0209,11.3873,12.334,9.9634,9.4495,9.7352,12.1589,5.7922,4.0873,5.7834,9.267698618
+Mistral-small-4,10.4777,9.5762,11.4058,9.2103,22.2899,9.7938,12.1189,4.8819,4.3778,6.4862,10.55675923
+Nemotron-3-nano-30b-a3b,128.8521,126.3225,119.7762,117.2076,102.2264,140.1875,127.5032,78.0586,52.4569,67.385,105.9488772
+Nemotron-3-super-120b-a12b,100.8029,63.7416,82.0725,79.3436,88.1448,83.4456,69.5431,33.7658,37.3912,44.5288,69.39686943
+Nova-2-lite-v1,84.8424,69.59,61.1813,79.1771,69.1531,58.651,81.8847,23.9343,12.6582,16.2989,56.85229296
+Qwen3.5-122b-a10b,15.3923,14.5582,13.6746,13.4825,13.4436,14.0513,19.8363,8.3142,8.0972,8.4222,13.13992066
+Qwen3.5-35b-a3b,18.6065,17.7434,17.0379,17.1596,14.0943,16.2087,18.5391,6.9951,6.5958,8.4414,14.33161555
+Qwen3.6-plus,57.8645,40.9226,59.862,58.9875,41.7657,43.7416,58.851,22.2883,31.2027,40.9666,46.4302181

runs/agent1_2026-04-16/correlations.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "correlations": {
+    	"Artificial Analysis Intelligence Index": 85.15,
+    	"Terminal-Bench Hard": 83.00,
+	"GDPval-AA":84.56,
+	"Tau2-Bench Telecom":68.07
+  },
+  "description": "Correlation percentages between AutoBench scores and main agentic benchmark scores"
+}

runs/agent1_2026-04-16/cost_data.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
+Claude-haiku-4.5,0.00871937,0.00833217,0.012498,0.01372931,0.00610275,0.00605973,0.00636281,0.00816841,0.00674671,0.00591073,0.008421041
+Claude-opus-4.6,0.02765625,0.03058842,0.03050588,0.02743635,0.0273924,0.025655,0.03520682,0.02116382,0.01409205,0.01574433,0.025840455
+Claude-opus-4.7,0.02974789,0.03200719,0.03957885,0.03091185,0.03111022,0.03060286,0.03335786,0.01598559,0.01206295,0.01845233,0.027191898
+Claude-sonnet-4.6,0.01847625,0.01977212,0.0310388,0.02054589,0.02176963,0.017019,0.027312,0.01504006,0.01108377,0.0153794,0.019781285
+Deepseek-v3.2,0.00072667,0.00066987,0.00052314,0.00056449,0.00068195,0.00070084,0.00056044,0.00051727,0.00043447,0.00052287,0.000590106
+Gemini-3.1-flash-lite-preview,0.00118982,0.00120504,0.00110204,0.00110311,0.00109502,0.0009483,0.001208,0.00144613,0.00113141,0.00113895,0.001152242
+Gemini-3.1-pro-preview,0.0128166,0.01252432,0.0130415,0.01398126,0.01313246,0.01390243,0.01415045,0.01587529,0.00982373,0.0151,0.013335061
+Gemini-3-flash-preview,0.00313542,0.002562,0.00292724,0.00257744,0.0030312,0.00266611,0.00310289,0.00380832,0.00203841,0.00239683,0.002816407
+Gemma-4-26b-a4b-it,0.00019457,0.00019054,0.00023961,0.00023651,0.00019933,0.00024419,0.00029169,0.00014424,0.00010769,0.00013016,0.000200836
+Gemma-4-31b-it,0.00025536,0.00028379,0.00027337,0.0002714,0.00024733,0.00026902,0.00028653,0.00021209,0.00013454,0.00018181,0.00024191
+GLM-4.7,0.00147033,0.00124156,0.00162454,0.00190286,0.00131191,0.00139006,0.0018656,0.00102319,0.00086426,0.00080832,0.001388525
+GLM-5.1,0.00544603,0.00534523,0.00618099,0.00573527,0.00569496,0.00463782,0.00652936,0.00438933,0.00306728,0.00311005,0.005112927
+Gpt-5.4,0.06739089,0.06297568,0.06590694,0.06407274,0.04384528,0.04925558,0.03925617,0.09757729,0.0626006,0.09974705,0.063282983
+Gpt-5.4-mini,0.03089784,0.01762582,0.01512068,0.01276102,0.02595707,0.01934788,0.01180568,0.03532892,0.02447409,0.00615875,0.020302969
+Gpt-5.4-nano,0.00457758,0.00190104,0.00452011,0.00490867,0.00343015,0.0061414,0.00485002,0.00545835,0.00433444,0.00393266,0.004317464
+Gpt-oss-120b,0.00015626,0.00021257,0.00017285,0.00019027,0.0001633,0.00021977,0.00017995,0.00013667,0.0001373,0.00014125,0.000170961
+Gpt-oss-20b,0.00012129,0.00013599,0.00013857,0.00013626,0.00011116,0.00012501,0.00014351,0.0001672,0.00010762,0.00015682,0.000132809
+Grok-4.1-fast,0.00118526,0.00146777,0.00127826,0.00110943,0.00118303,0.00121426,0.00119787,0.00121695,0.00102116,0.00123915,0.001201663
+Grok-4.20,0.01323968,0.0182328,0.01391675,0.01641488,0.01261768,0.01774271,0.01274095,0.02400718,0.01164318,0.01462373,0.015201661
+Kimi-K2.5,0.0013356,0.00130087,0.00137106,0.00141092,0.0011434,0.0011052,0.00149473,0.0013332,0.00098521,0.00101612,0.001254149
+Llama-4-maverick,0.00023614,0.00027203,0.00030222,0.00024075,0.00027289,0.00029246,0.00027234,0.00037564,0.00025891,0.00030767,0.000278018
+Mimo-V2-Pro,0.00357415,0.00343263,0.00347788,0.00385474,0.003419,0.00344807,0.00376945,0.00303994,0.00262857,0.00212227,0.003325538
+Minimax-m2.5,0.00052565,0.0006371,0.00062919,0.00057725,0.00055945,0.0005181,0.00067571,0.00050515,0.0003416,0.00032803,0.000530249
+Minimax-m2.7,0.00100449,0.00100103,0.00115352,0.0010163,0.00099474,0.00095862,0.00115718,0.00086569,0.00055672,0.00079226,0.000953587
+Mistral-large-2512,0.00101718,0.00098131,0.00120288,0.00106283,0.00098152,0.0010059,0.0012328,0.00081516,0.00053648,0.00070873,0.000962583
+Mistral-small-4,0.00048613,0.00051284,0.00058871,0.00042199,0.00080302,0.00043751,0.00056428,0.00038369,0.00029427,0.00043662,0.000501963
+Nemotron-3-nano-30b-a3b,0.00084413,0.00083324,0.00084611,0.00072194,0.00079633,0.00087668,0.00073996,0.00125087,0.0007248,0.00073306,0.000824281
+Nemotron-3-super-120b-a12b,0.00060291,0.00045153,0.00072142,0.00093154,0.00058488,0.00067359,0.00090865,0.00047968,0.00034974,0.00051965,0.000634388
+Nova-2-lite-v1,0.0192584,0.01805111,0.01775971,0.0203214,0.01862289,0.01734272,0.02235556,0.00855767,0.00497041,0.00613215,0.015561471
+Qwen3.5-122b-a10b,0.00162306,0.00118032,0.00122518,0.00135294,0.00142354,0.00105206,0.00181228,0.00153387,0.00144515,0.00162699,0.001435478
+Qwen3.5-35b-a3b,0.00116103,0.001385,0.00132452,0.00107441,0.00102931,0.00110916,0.00114048,0.00102208,0.00082191,0.00120101,0.001115424
+Qwen3.6-plus,0.00161363,0.00121225,0.00220098,0.00285437,0.00141107,0.00130197,0.00192546,0.00183986,0.00228926,0.00252657,0.001947453

runs/agent1_2026-04-16/domain_ranks.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
+Claude-haiku-4.5,3.0685,2.9945,2.9619,3.1588,3.2391,3.2152,2.9821,2.5381,2.8195,2.7273,2.99
+Claude-opus-4.6,3.3153,3.2988,3.1821,3.1723,3.6863,3.3626,3.4559,2.7453,3.0134,2.9812,3.24
+Claude-opus-4.7,3.6111,3.3953,3.4149,3.3508,3.656,3.5898,3.3328,2.5825,2.9968,2.9562,3.30
+Claude-sonnet-4.6,3.1005,2.9142,3.3171,3.1366,3.6949,3.1871,3.2957,2.7665,2.9313,3.0678,3.16
+Deepseek-v3.2,2.732,2.7265,2.6495,2.4248,2.7263,2.6023,2.6619,2.5768,2.7301,2.6278,2.64
+Gemini-3.1-flash-lite-preview,2.7018,2.8987,2.7502,2.6935,3.0584,2.9111,3.1334,2.5258,2.6758,2.7188,2.82
+Gemini-3.1-pro-preview,3.2131,3.23,3.3875,2.977,3.4081,3.4466,3.3539,2.6809,3.2485,3.2427,3.21
+Gemini-3-flash-preview,3.2134,3.1037,3.1573,2.7569,3.0998,3.3029,3.0746,2.7353,2.7489,2.7499,2.98
+Gemma-4-26b-a4b-it,2.3694,2.5989,2.6659,2.5914,2.6424,2.8519,2.9495,2.2481,2.5338,2.5877,2.61
+Gemma-4-31b-it,2.974,2.7978,3.0587,2.6114,2.9091,2.9358,2.8219,2.378,2.7631,2.7266,2.79
+GLM-4.7,2.8162,2.752,3.0233,2.8063,3.1921,3.1412,3.2188,2.4608,2.9363,2.6366,2.92
+GLM-5.1,3.0682,3.1621,3.3615,3.1207,3.4322,3.1704,3.402,2.6728,3.0799,2.7559,3.15
+Gpt-5.4,3.1567,3.0461,3.3889,2.9513,3.1372,3.6612,3.1879,2.6879,2.9794,3.3251,3.13
+Gpt-5.4-mini,3.3448,3.0339,2.8881,2.5449,3.2292,3.1226,2.7936,2.3893,2.9396,2.8656,2.91
+Gpt-5.4-nano,2.9012,2.7599,2.9453,2.2699,2.9143,2.7349,2.7846,2.5624,2.9505,2.9565,2.78
+Gpt-oss-120b,2.8272,2.972,2.8172,2.6788,2.9679,2.6775,2.6397,2.3838,2.8949,2.6475,2.76
+Gpt-oss-20b,2.4855,2.9026,2.6505,2.4662,2.7621,2.6703,2.5331,2.4562,2.8125,2.8065,2.65
+Grok-4.1-fast,2.7251,2.7808,2.8935,2.7132,2.927,2.9732,3.0701,2.4106,2.8691,3.0558,2.84
+Grok-4.20,2.9655,3.0134,2.8431,2.9051,3.1483,3.2588,3.0393,2.7399,3.0864,3.0362,3.00
+Kimi-K2.5,2.946,2.9359,3.2898,2.9312,3.1442,3.3314,3.3351,2.5718,3.0124,2.7494,3.02
+Llama-4-maverick,2.2209,2.1618,2.236,2.2826,2.1544,2.4093,2.1975,2.2938,2.3814,2.444,2.27
+Mimo-V2-Pro,3.3098,3.1473,3.1718,2.9618,3.3235,3.1543,3.3443,2.5667,3.0858,2.8552,3.10
+Minimax-m2.5,2.7363,2.744,2.6707,2.6015,3.2291,2.9967,2.8798,2.4281,2.8761,2.5907,2.79
+Minimax-m2.7,3.0275,3.2217,3.1855,2.8278,3.3091,2.9885,3.185,2.5437,2.9389,2.7495,3.01
+Mistral-large-2512,2.4915,2.9178,2.6059,2.4284,2.5585,2.5909,2.9145,2.4412,2.7398,2.5458,2.62
+Mistral-small-4,2.6405,2.7424,2.6556,2.5279,2.7171,2.6746,2.7835,2.5975,2.8568,2.6352,2.69
+Nemotron-3-nano-30b-a3b,2.8337,2.6704,2.4984,2.5504,2.9257,2.8728,2.7019,2.5324,2.8386,2.6755,2.71
+Nemotron-3-super-120b-a12b,2.8701,2.7866,2.8974,2.6892,2.671,3.2788,2.6295,2.8184,2.7808,2.7926,2.80
+Nova-2-lite-v1,2.4534,2.6585,2.7978,2.5307,2.6296,2.881,2.6485,2.4957,2.8718,2.6597,2.66
+Qwen3.5-122b-a10b,2.6337,2.7507,2.9868,2.6132,3.1269,2.8147,3.0913,2.5081,2.8061,2.9914,2.84
+Qwen3.5-35b-a3b,2.9045,2.714,2.9957,2.5777,2.9714,2.8092,2.9918,2.3457,2.9,3.0801,2.82
+Qwen3.6-plus,3.1695,3.0611,3.0995,2.8888,3.1893,3.0903,3.2422,2.821,3.0304,3.1365,3.07

runs/agent1_2026-04-16/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "run_id": "run_2026-04-16",
+  "title": "AutoBench Agentic Run 1 - April 2026",
+  "date": "2026-04-19",
+  "description": "The first AutoBench run to measure agentic performance of top LLMs",
+  "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-agentic-1",
+  "model_count": 32,
+  "is_latest": true,
+  "benchmark_comparison_note": "This run targets agentic performance; reference scores are drawn from agentic benchmarks alongside the Artificial Analysis Intelligence Index."
+}

runs/agent1_2026-04-16/models.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+company_id,model_name,model_display_name,model_logo_url,api_type,context,parameters,model_version,release_date,thinking_mode,description
+3,Gemini-3-flash-preview,Gemini 3 Flash Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,17/12/2025,1,Gemini 3 Flash Preview is a highly efficient model delivering Gemini 3 Pro-level reasoning and near real-time agentic tool orchestration with significantly lower latency and cost.
+3,Gemini-3.1-flash-lite-preview,Gemini 3.1 Flash Lite Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,03/03/2026,1,"Gemini 3.1 Flash Lite Preview is an ultra-efficient, high-volume workhorse model featuring a 1M context window and 2.5x faster time-to-first-token than previous generations."
+3,Gemma-4-31b-it,Gemma 4 31B IT,/logos/models/gemini.svg,OpenRouter,262144,31000000000,,31/03/2026,1,"Gemma 4 31B IT is an open-weights dense multimodal model from Google DeepMind featuring a 256K context window, native video/audio processing, and advanced configurable reasoning capabilities under an Apache 2.0 license."
+3,Gemma-4-26b-a4b-it,Gemma 4 26B A4B IT,/logos/models/gemini.svg,OpenRouter,262144,26000000000,,31/03/2026,1,Gemma 4 26B A4B IT is a latency-optimized open-weights MoE model activating only 3.8B parameters per token. It delivers near-31B dense quality while preserving hardware constraints for edge and enterprise deployments.
+3,Gemini-3.1-pro-preview,Gemini 3.1 Pro Preview,/logos/models/gemini.svg,OpenRouter,1048576,,,19/02/2026,1,"Gemini 3.1 Pro Preview is Google's flagship reasoning model featuring a 1M token context, three-tier adjustable reasoning depth controls, and unparalleled complex problem-solving capabilities across multimodal inputs."
+1,Gpt-5.4-mini,GPT-5.4 Mini,/logos/models/openai.svg,OpenRouter,400000,,,17/03/2026,1,"GPT-5.4 Mini is OpenAI's highly efficient small model offering 2x faster execution than GPT-5 Mini. It supports a 400K context window and achieves 54.4% on SWE-Bench Pro, ideal for responsive coding assistants."
+1,Gpt-5.4-nano,GPT-5.4 Nano,/logos/models/openai.svg,OpenRouter,400000,,,16/03/2026,1,"GPT-5.4 Nano is OpenAI's most cost-effective tier, optimized for massive-scale classification and supporting subagents. It features a 400K context window at just $0.20 per million input tokens."
+1,Gpt-oss-120b,Gpt oss 120b,/logos/models/openai.svg,OpenRouter,131000,1.17E+11,,05/08/2025,1,"GPT-OSS-120B is an open-weight MoE model from OpenAI containing 116.8B total parameters (5.1B active). Licensed under Apache 2.0, it is post-trained with MXFP4 quantization to run inference efficiently on a single 80GB GPU."
+1,Gpt-oss-20b,Gpt oss 20b,/logos/models/openai.svg,OpenRouter,131000,21000000000,,05/08/2025,1,GPT-OSS-20B is a compact open-weight MoE model from OpenAI containing 21B parameters (3.6B active). It uses grouped multi-query attention for low-latency inference on consumer hardware under an Apache 2.0 license.
+1,Gpt-5.4,GPT-5.4,/logos/models/openai.svg,OpenRouter,1048576,,,05/03/2026,1,"GPT-5.4 is OpenAI's flagship frontier model, natively integrating frontier coding (57.7% SWE-bench Pro), state-of-the-art computer-use abilities, and deep agentic workflows over a 1.05M token context window."
+30,Deepseek-v3.2,Deepseek v3.2,/logos/models/deepseek.svg,OpenRouter,163840,6.85E+11,,01/12/2025,1,"DeepSeek V3.2 is a 685B parameter MoE model leveraging DeepSeek Sparse Attention (DSA). It excels in complex mathematical reasoning and programming competitions, featuring integrated tool-use thinking modes."
+23,Grok-4.1-fast,Grok 4.1 fast,/logos/models/grok.svg,OpenRouter,2000000,,,19/11/2025,1,"Grok 4.1 Fast provides an immense 2M token context window. It is specifically optimized for high-speed document retrieval, customer support automation, and processing massive data pipelines."
+23,Grok-4.20,Grok 4.20,/logos/models/grok.svg,OpenRouter,2000000,6.00E+12,,03/03/2026,1,Grok 4.20 is a revolutionary ~6-trillion parameter MoE model that runs four specialized agents simultaneously on a shared backbone. It utilizes persona adapters to coordinate multi-agent workflows within a 2M token context.
+2,Claude-haiku-4.5,Claude haiku 4.5,/logos/models/claude.svg,OpenRouter,200000,,,16/10/2025,1,"Claude Haiku 4.5 is Anthropic's fastest and most cost-effective model, featuring a 200K context window. It delivers near-frontier reasoning and coding speeds suitable for real-time agentic applications."
+2,Claude-opus-4.6,Claude Opus 4.6,/logos/models/claude.svg,OpenRouter,1000000,,,05/02/2026,1,Claude Opus 4.6 features a 1M token context and leading scores on Terminal-Bench 2.0. It leverages Context Compaction to sustain infinitely long agentic coding workflows.
+2,Claude-opus-4.7,Claude Opus 4.7,/logos/models/claude.svg,OpenRouter,1000000,,,16/04/2026,1,"Claude Opus 4.7 is the next generation of Anthropic's Opus family, built for long-running, asynchronous agents. Building on the coding and agentic strengths of Opus 4.6, it delivers stronger performance on complex, multi-step tasks and more reliable agentic execution across extended workflows. "
+2,Claude-sonnet-4.6,Claude Sonnet 4.6,/logos/models/claude.svg,OpenRouter,1000000,,,17/02/2026,1,"Claude Sonnet 4.6 represents a total upgrade in knowledge work and design. It achieves unprecedented computer-use reliability, executing complex UI automation and software engineering across a 1M token context window."
+27,Nemotron-3-nano-30b-a3b,Nemotron 3 Nano 30B A3B,/logos/companies/nvidia.svg,OpenRouter,262000,3.16E+10,,15/12/2025,1,Nemotron 3 Nano 30B A3B is a highly efficient 31.6B total parameter MoE model activating only 3.2B parameters. It offers a 1M token context window and up to 3.3x higher throughput for agentic systems.
+27,Nemotron-3-super-120b-a12b,Nemotron 3 Super 120B A12B,/logos/companies/nvidia.svg,OpenRouter,262000,1.20E+11,,11/03/2026,1,Nemotron 3 Super is a 120B parameter hybrid Mamba-Transformer model (12B active). It utilizes LatentMoE and Multi-Token Prediction (MTP) to maximize compute efficiency for complex RAG and IT ticket automation.
+7,Nova-2-lite-v1,Nova 2 lite v1,/logos/models/nova.svg,OpenRouter,1000000,,,02/12/2025,1,"Amazon Nova 2 Lite is a cost-efficient multimodal engine with a 1M token context. It seamlessly processes text, code, images, and video, natively supporting python interpreter tools for data analysis workflows."
+13,Mistral-large-2512,Mistral large 2512,/logos/models/mistral.svg,OpenRouter,262144,6.75E+11,,01/12/2025,,Mistral Large 3 is a massive open-weight granular MoE model featuring 675B total parameters (41B active). It offers top-tier reliability for production-grade assistants and long-context code comprehension.
+13,Mistral-small-4,Mistral Small 4,/logos/models/mistral.svg,OpenRouter,262144,1.19E+11,,16/03/2026,1,"Mistral Small 4 unifies Instruct, Magistral, and Devstral capabilities into a single 119B MoE architecture activating just 6.5B parameters. It offers configurable reasoning effort and native multimodality."
+29,Kimi-K2.5,Kimi K2.5,/logos/models/kimi.svg,OpenRouter,262144,1.00E+12,,27/01/2026,1,Kimi K2.5 is a 1-trillion parameter open-weight MoE (32B active). It features a native MoonViT encoder and self-directed Agent Swarm technology capable of orchestrating 100 sub-agents in parallel.
+31,Minimax-m2.7,MiniMax M2.7,/logos/models/minimax.svg,OpenRouter,204800,2.30E+11,,11/04/2026,1,"MiniMax M2.7 is a 230B parameter MoE model (10B active) utilizing RoPE and QK RMSNorm. It features recursive self-optimization, updating its own memory to execute highly complex software engineering tasks."
+31,Minimax-m2.5,MiniMax M2.5,/logos/models/minimax.svg,OpenRouter,196608,2.30E+11,,15/02/2026,1,"MiniMax M2.5 is a hyper-efficient 230B MoE model (10B active) trained via large-scale RL in 200,000+ environments. It excels in office productivity, outputting at 100 tokens/sec at unprecedented cost efficiency."
+27,Llama-4-maverick,Llama 4 Maverick,/logos/companies/nvidia.svg,OpenRouter,1000000,4.00E+11,,05/04/2025,,Llama 4 Maverick is Meta's natively multimodal 400B MoE model (17B active). It utilizes early fusion of text and vision tokens and was codistilled using online RL to master complex visual-reasoning tasks.
+18,Qwen3.5-35b-a3b,Qwen3.5 35B A3B,/logos/models/qwen.svg,OpenRouter,262144,35000000000,,25/02/2026,1,Qwen3.5 35B A3B is an efficient hybrid Gated DeltaNet + MoE transformer activating 3B of its 35B parameters. It delivers massive multimodal capabilities and 201-language support under an Apache 2.0 license.
+18,Qwen3.6-plus,Qwen3.6 Plus,/logos/models/qwen.svg,OpenRouter,1000000,,,30/03/2026,1,"Qwen3.6 Plus is Alibaba's proprietary flagship featuring a 1M token context. It provides a superior ""vibe coding"" experience through highly stable hybrid thinking modes and repository-level problem solving."
+18,Qwen3.5-122b-a10b,Qwen3.5 122B A10B,/logos/models/qwen.svg,OpenRouter,262144,1.22E+11,,25/02/2026,1,"Qwen3.5 122B A10B balances high performance with efficiency, activating 10B of its 122B parameters. It achieves 72.4% on SWE-bench Verified, making it a premier open-weight model for agentic workflows."
+22,GLM-5.1,GLM 5.1,/logos/models/glm.svg,OpenRouter,202752,7.44E+11,,27/03/2026,1,"GLM-5.1 is an open-weight 744B MoE model (40B active) released under the MIT license. Integrating DeepSeek Sparse Attention, it matches proprietary frontier models on SWE-Bench Pro (58.4%)."
+22,GLM-4.7,GLM 4.7,/logos/models/glm.svg,OpenRouter,202752,3.58E+11,,22/12/2025,1,GLM-4.7 is a highly stable 358B parameter model optimized for coding and UI generation. It utilizes Interleaved Thinking and Turn-level Thinking for reliable execution of complex mathematical tasks.
+40,Mimo-V2-Pro,Mimo V2 Pro,/logos/models/mimo.svg,OpenRouter,1000000,1.00E+12,,18/03/2026,1,"MiMo V2 Pro is Xiaomi's flagship ~1-Trillion parameter MoE (42B active) agentic engine. Achieving an Elo of 1426 on GDPval-AA, it is designed for extreme reliability in long-horizon autonomous task execution."

runs/agent1_2026-04-16/p99_latency.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
+Claude-haiku-4.5,80.5908,110.2682,127.0755,276.1924,113.2281,241.8884,95.1526,97.5884,78.0551,290.4749,151.0514
+Claude-opus-4.6,84.539,197.754,100.162,80.4153,86.5288,111.4439,150.5253,70.6348,44.9602,52.2452,97.9209
+Claude-opus-4.7,37.4855,63.4129,58.6555,64.3319,47.6816,33.142,86.6684,33.1777,17.9879,27.5307,47.0074
+Claude-sonnet-4.6,154.3386,191.4218,279.7576,153.4241,208.8329,106.8715,141.6592,147.4867,43.5937,63.8222,149.1208
+Deepseek-v3.2,188.9782,112.562,79.2116,88.0738,259.0915,250.2864,152.1987,49.5855,56.8288,55.6669,129.2483
+Gemini-3.1-flash-lite-preview,158.9503,43.2124,83.2036,191.2867,237.7518,119.9766,140.9744,86.1881,24.6225,57.7764,114.3943
+Gemini-3.1-pro-preview,44.9399,53.4536,43.3885,55.1287,86.8721,51.5003,113.2668,46.1479,43.2473,44.1445,58.2089
+Gemini-3-flash-preview,25.0785,27.2941,25.3661,23.1182,26.8827,19.4345,24.8071,22.6091,18.7772,11.9231,22.5291
+Gemma-4-26b-a4b-it,28.1927,38.4315,33.8465,46.8857,47.6488,47.8931,106.9718,22.323,22.4683,15.2962,40.9958
+Gemma-4-31b-it,94.1848,192.0041,250.3586,188.6529,168.3033,141.9093,283.4239,43.556,65.9362,313.5323,174.1861
+GLM-4.7,129.6235,110.1411,95.4605,303.947,131.7817,89.6153,211.9996,160.5917,63.7457,49.3721,134.6278
+GLM-5.1,167.4809,251.4347,275.4299,192.384,233.0885,168.078,281.3911,104.8135,72.4833,86.1206,183.2705
+Gpt-5.4,356.8408,304.6053,268.4902,393.5952,205.1735,323.543,321.9148,396.1398,334.0582,353.3553,325.7716
+Gpt-5.4-mini,333.0933,217.5008,303.0277,239.124,302.5534,198.3588,179.3315,267.1145,313.2008,55.5172,240.8822
+Gpt-5.4-nano,254.3095,188.8738,196.3221,291.0544,196.7436,301.9089,327.645,320.1109,320.8538,224.2656,262.2088
+Gpt-oss-120b,51.5995,33.3092,73.1303,66.8018,76.0803,43.4072,91.5204,56.4928,126.5068,14.7681,63.3616
+Gpt-oss-20b,185.9629,182.1386,169.189,208.175,174.7058,141.3277,205.7395,137.3125,145.4453,105.2351,165.5231
+Grok-4.1-fast,98.0764,86.5536,110.6121,54.0332,88.8177,236.0511,86.7175,73.692,56.2482,67.3797,95.8181
+Grok-4.20,60.9232,180.7621,61.1629,111.7952,70.4811,86.7911,72.1829,60.295,40.2018,41.1219,78.5717
+Kimi-K2.5,334.1608,122.3857,189.8468,167.2839,191.315,97.5412,274.5484,139.5256,177.2613,81.6347,177.5503
+Llama-4-maverick,64.5353,73.8019,67.0179,63.9908,82.1587,118.3985,76.4981,72.3758,68.6175,73.6123,76.1007
+Mimo-V2-Pro,45.9749,69.6757,43.2072,53.6061,57.1214,64.1453,72.8357,49.3485,61.9769,28.8174,54.6709
+Minimax-m2.5,240.499,217.9588,268.4486,395.5708,245.1988,268.9264,238.9894,112.8876,200.6302,218.9969,240.8106
+Minimax-m2.7,70.458,63.2912,70.5855,73.8646,64.2769,60.1743,62.5303,45.931,32.2267,32.067,57.5405
+Mistral-large-2512,26.3446,27.9479,30.0158,21.8918,21.6876,15.6656,29.0625,18.6591,10.0165,14.2327,21.5524
+Mistral-small-4,22.5922,18.2761,26.9347,17.2356,243.0557,20.7109,26.2265,8.4539,11.0912,17.9602,41.2537
+Nemotron-3-nano-30b-a3b,312.2043,357.3106,290.4211,252.0452,278.3621,387.1998,282.7243,200.5241,267.3245,217.4783,284.5594
+Nemotron-3-super-120b-a12b,346.3498,232.4492,206.6109,305.1337,295.0199,227.8794,315.6944,169.1652,212.1549,141.617,245.2075
+Nova-2-lite-v1,174.3589,185.7674,131.2926,142.2304,201.7181,135.5685,185.4033,118.5547,43.3698,70.0387,138.8302
+Qwen3.5-122b-a10b,40.9727,34.9177,36.3253,30.2049,46.4513,32.0967,78.2364,17.6285,25.204,22.4722,36.451
+Qwen3.5-35b-a3b,34.734,51.2018,34.9103,37.8562,30.8286,29.5903,43.737,13.6506,14.1181,16.6684,30.7295
+Qwen3.6-plus,99.6131,113.883,180.359,163.168,75.9546,90.039,114.3871,43.0535,90.5119,97.8585,106.8828

runs/agent1_2026-04-16/summary_data.csv ADDED Viewed

	@@ -0,0 +1,33 @@

+Model,Iterations,AutoBench,AAI Index,Terminal-bench,GDPval-AA,Tau2-bench Telecom,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
+Claude-haiku-4.5,193,2.987257,40,27,34,55%,0.008421041,43.07080917,151.0514,3.02%
+Claude-opus-4.6,198,3.241761,68,46,56,92%,0.025840455,37.87360967,97.9209,0.50%
+Claude-opus-4.7,187,3.295774,,,,,0.027191898,20.84747735,47.0074,6.03%
+Claude-sonnet-4.6,193,3.157161,63,53,58,76%,0.019781285,47.09331481,149.1208,3.02%
+Deepseek-v3.2,197,2.642664,53,36,35,91%,0.000590106,53.5962432,129.2483,1.01%
+Gemini-3.1-flash-lite-preview,179,2.817498,26,24,21,31%,0.001152242,23.16857653,114.3943,10.05%
+Gemini-3.1-pro-preview,198,3.214863,59,54,41,96%,0.013335061,25.91624477,58.2089,0.50%
+Gemini-3-flash-preview,198,2.984602,50,39,35,80%,0.002816407,12.50534394,22.5291,0.50%
+Gemma-4-26b-a4b-it,192,2.60664,32,14,26,44%,0.000200836,12.38208868,40.9958,3.52%
+Gemma-4-31b-it,183,2.792605,41,36,31,60%,0.00024191,45.28161148,174.1861,8.04%
+GLM-4.7,194,2.916867,55,32,35,96%,0.001388525,43.54876781,134.6278,2.51%
+GLM-5.1 (xhigh),197,3.14801,67,43,52,98%,0.005112927,60.30060454,183.2705,1.01%
+Gpt-5.4,145,3.127315,68,58,59,87%,0.063282983,131.0088557,325.7716,27.14%
+Gpt-5.4-mini (xhigh),104,2.907268,59,52,46,83%,0.020302969,86.91001719,240.8822,47.74%
+Gpt-5.4-nano (xhigh),113,2.781217,48,42,34,76%,0.004317464,93.33818571,262.2088,43.22%
+Gpt-oss-120b,198,2.762188,38,24,22,66%,0.000170961,18.02638485,63.3616,0.50%
+Gpt-oss-20b,197,2.648577,28,11,8,60%,0.000132809,42.86725918,165.5231,1.01%
+Grok-4.1-fast,197,2.843729,49,24,27,93%,0.001201663,36.20348296,95.8181,1.01%
+Grok-4.20,189,3.00445,54,38,27,93%,0.015201661,33.09318092,78.5717,5.03%
+Kimi-K2.5,187,3.022371,59,35,39,96%,0.001254149,53.60984548,177.5503,6.03%
+Llama-4-maverick,195,2.269251,7,7,0,18%,0.000278018,41.27163619,76.1007,2.01%
+Mimo-V2-Pro,199,3.103969,63,41,46,95%,0.003325538,25.88939247,54.6709,0.00%
+Minimax-m2.5,184,2.790468,56,35,34,95%,0.000530249,75.14721297,240.8106,7.54%
+Minimax-m2.7,193,3.011179,61,39,51,85%,0.000953587,27.06517621,57.5405,3.02%
+Mistral-large-2512,193,2.624168,22,16,18,41%,0.000962583,9.267698618,21.5524,3.02%
+Mistral-small-4,194,2.68593,26,17,18,25%,0.000501963,10.55675923,41.2537,2.51%
+Nemotron-3-nano-30b-a3b,190,2.713783,19,14,4,41%,0.000824281,105.9488772,284.5594,4.52%
+Nemotron-3-super-120b-a12b,187,2.796666,40,29,25,68%,0.000634388,69.39686943,245.2075,6.03%
+Nova-2-lite-v1,181,2.65909,37,17,17,73%,0.015561471,56.85229296,138.8302,9.05%
+Qwen3.5-122b-a10b,198,2.835386,53,31,31,89%,0.001435478,13.13992066,36.451,0.50%
+Qwen3.5-35b-a3b,198,2.824791,44,27,21,94%,0.001115424,14.33161555,30.7295,0.50%
+Qwen3.6-plus,198,3.072465,62,44,43,95%,0.001947453,46.4302181,106.8828,0.50%