PeterKruger committed on
Commit
aaff80f
·
verified ·
1 Parent(s): a888f48

fixed ordering of main averages

Browse files
runs/agent1_2026-04-16/avg_latency.csv CHANGED
@@ -3,13 +3,13 @@ Claude-haiku-4.5,46.1336,46.1122,48.0425,60.335,36.1667,55.4396,39.5512,35.164,2
3
  Claude-opus-4.6,42.1124,53.3346,45.461,43.0368,39.8221,37.2462,45.1839,28.905,16.9896,22.3545,37.85942643
4
  Claude-sonnet-4.6,51.1068,52.8177,76.5625,50.1423,52.0923,42.691,52.3426,38.5508,18.7954,28.4199,46.45277089
5
  Deepseek-v3.2,70.4417,60.617,46.1213,56.2128,79.4036,77.5038,55.4781,35.3541,26.0408,27.0283,54.31511897
6
- GLM-4.7,50.4502,40.405,43.2821,66.6907,39.1247,48.016,52.613,36.519,23.0467,22.9732,23.16857653
7
- GLM-5.1,70.5772,71.5014,86.1363,80.6663,75.4139,67.054,95.9933,32.9641,26.5569,35.1629,25.96268468
8
  Gemini-3-flash-preview,16.7706,13.8003,14.096,13.6193,14.8726,12.9421,15.4606,9.8464,8.0062,8.2219,12.95436087
9
- Gemini-3.1-flash-lite-preview,25.9681,15.6794,18.291,29.8768,31.6073,36.0709,29.2888,13.9176,12.3058,17.3742,13.64022252
10
- Gemini-3.1-pro-preview,25.2172,28.3457,25.5406,28.3863,26.0591,29.5616,35.0487,23.2719,16.5858,20.0105,51.69145245
11
- Gemma-4-26b-a4b-it,16.9871,10.217,13.0184,16.2429,16.5371,16.1576,20.9129,6.4718,7.1763,5.9912,43.54876781
12
- Gemma-4-31b-it,41.6696,53.5776,81.3055,54.9655,52.0971,59.0855,83.5473,22.6931,24.6705,49.91,66.08806343
13
  Gpt-5.4,190.6972,125.9768,111.2079,140.3793,81.5197,131.3381,101.409,160.1292,108.8256,175.7158,129.263534
14
  Gpt-5.4-mini,153.3592,90.6251,81.1314,90.317,101.9265,103.8755,49.8503,91.5657,78.5646,26.5415,86.91001719
15
  Gpt-5.4-nano,128.935,68.2781,67.8651,145.881,77.745,140.7391,82.563,78.7779,80.9436,78.7016,93.33818571
 
3
  Claude-opus-4.6,42.1124,53.3346,45.461,43.0368,39.8221,37.2462,45.1839,28.905,16.9896,22.3545,37.85942643
4
  Claude-sonnet-4.6,51.1068,52.8177,76.5625,50.1423,52.0923,42.691,52.3426,38.5508,18.7954,28.4199,46.45277089
5
  Deepseek-v3.2,70.4417,60.617,46.1213,56.2128,79.4036,77.5038,55.4781,35.3541,26.0408,27.0283,54.31511897
6
+ Gemini-3.1-flash-lite-preview,25.9681,15.6794,18.291,29.8768,31.6073,36.0709,29.2888,13.9176,12.3058,17.3742,23.16857653
7
+ Gemini-3.1-pro-preview,25.2172,28.3457,25.5406,28.3863,26.0591,29.5616,35.0487,23.2719,16.5858,20.0105,25.96268468
8
  Gemini-3-flash-preview,16.7706,13.8003,14.096,13.6193,14.8726,12.9421,15.4606,9.8464,8.0062,8.2219,12.95436087
9
+ Gemma-4-26b-a4b-it,16.9871,10.217,13.0184,16.2429,16.5371,16.1576,20.9129,6.4718,7.1763,5.9912,13.64022252
10
+ Gemma-4-31b-it,41.6696,53.5776,81.3055,54.9655,52.0971,59.0855,83.5473,22.6931,24.6705,49.91,51.69145245
11
+ GLM-4.7,50.4502,40.405,43.2821,66.6907,39.1247,48.016,52.613,36.519,23.0467,22.9732,43.54876781
12
+ GLM-5.1,70.5772,71.5014,86.1363,80.6663,75.4139,67.054,95.9933,32.9641,26.5569,35.1629,66.08806343
13
  Gpt-5.4,190.6972,125.9768,111.2079,140.3793,81.5197,131.3381,101.409,160.1292,108.8256,175.7158,129.263534
14
  Gpt-5.4-mini,153.3592,90.6251,81.1314,90.317,101.9265,103.8755,49.8503,91.5657,78.5646,26.5415,86.91001719
15
  Gpt-5.4-nano,128.935,68.2781,67.8651,145.881,77.745,140.7391,82.563,78.7779,80.9436,78.7016,93.33818571
runs/agent1_2026-04-16/cost_data.csv CHANGED
@@ -1,32 +1,32 @@
1
  model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
- Claude-haiku-4.5,0.00808905,0.00876072,0.01240729,0.01352842,0.00610779,0.00782664,0.00615164,0.00776982,0.00667186,0.0082366,0.00860
3
- Claude-opus-4.6,0.027098,0.03036395,0.02974176,0.02710231,0.0267796,0.02575733,0.03427523,0.02213794,0.01376205,0.017109,0.02563
4
- Claude-sonnet-4.6,0.01848345,0.01957926,0.02999362,0.02050589,0.02137464,0.01709979,0.02698677,0.01397471,0.01081964,0.0146042,0.01948
5
- Deepseek-v3.2,0.00072267,0.00063554,0.00049758,0.00058632,0.00067593,0.00070094,0.00056376,0.00051767,0.00045667,0.00051202,0.00059
6
- GLM-4.7,0.00147033,0.00124156,0.00162454,0.00190286,0.00131191,0.00139006,0.0018656,0.00102319,0.00086426,0.00080832,0.00115
7
- GLM-5.1,0.00510934,0.00532995,0.00614466,0.00567328,0.00549074,0.00465484,0.00616262,0.00451313,0.00289774,0.00309761,0.01328
8
- Gemini-3-flash-preview,0.00303385,0.00267647,0.002966,0.00268887,0.00277996,0.00250525,0.00312325,0.00368191,0.0019587,0.0024742,0.00278
9
- Gemini-3.1-flash-lite-preview,0.00118982,0.00120504,0.00110204,0.00110311,0.00109502,0.0009483,0.001208,0.00144613,0.00113141,0.00113895,0.00020
10
- Gemini-3.1-pro-preview,0.0121991,0.01238768,0.013072,0.01407541,0.01194816,0.01342957,0.01415764,0.01647259,0.01054545,0.01578933,0.00024
11
- Gemma-4-26b-a4b-it,0.0001865,0.00018407,0.00023366,0.00022579,0.00019316,0.0002439,0.00028535,0.00014517,0.00010597,0.0001335,0.00139
12
- Gemma-4-31b-it,0.000262,0.0002922,0.00026882,0.00027561,0.0002497,0.00026655,0.00027881,0.00021136,0.00013407,0.00018046,0.00499
13
- Gpt-5.4,0.06666437,0.06379091,0.0584385,0.0612925,0.04071687,0.05445833,0.02721679,0.07230909,0.05625375,0.09743614,0.05821
14
- Gpt-5.4-mini,0.03089784,0.01762582,0.01512068,0.01276102,0.02595707,0.01934788,0.01180568,0.03532892,0.02447409,0.00615875,0.02030
15
- Gpt-5.4-nano,0.00457758,0.00190104,0.00452011,0.00490867,0.00343015,0.0061414,0.00485002,0.00545835,0.00433444,0.00393266,0.00432
16
- Gpt-oss-120b,0.00015626,0.00021258,0.00017285,0.00019027,0.00016331,0.00021977,0.00017995,0.00013667,0.0001373,0.00014125,0.00017
17
- Gpt-oss-20b,0.00012129,0.00013599,0.00013857,0.00013626,0.00011116,0.00012501,0.00014351,0.0001672,0.00010762,0.00015682,0.00013
18
- Grok-4.1-fast,0.00118526,0.00146777,0.00127826,0.00110943,0.00118303,0.00121426,0.00119787,0.00121695,0.00102116,0.00123915,0.00120
19
- Grok-4.20,0.01342326,0.018248,0.01405763,0.01805544,0.01293952,0.01611514,0.01163133,0.02519412,0.01138838,0.01545813,0.01536
20
- Kimi-K2.5,0.00133754,0.00132217,0.00140378,0.00141836,0.00115815,0.00111338,0.00149777,0.00132587,0.00094932,0.00101612,0.00126
21
- Llama-4-maverick,0.00023614,0.00027203,0.00030222,0.00024075,0.00027289,0.00029246,0.00027234,0.00037564,0.00025891,0.00030767,0.00028
22
- Mimo-V2-Pro,0.00346965,0.00348921,0.00345641,0.00368526,0.00328376,0.0031878,0.00370886,0.00311506,0.00248505,0.00210693,0.00324
23
- Minimax-m2.5,0.00051993,0.00059681,0.00065405,0.00056667,0.00054284,0.0005375,0.00060684,0.00044259,0.0003387,0.00033676,0.00052
24
- Minimax-m2.7,0.00096638,0.00099688,0.00113488,0.00098879,0.00097712,0.00093218,0.00114299,0.00083556,0.00057865,0.00078994,0.00094
25
- Mistral-large-2512,0.00101718,0.00098131,0.00120288,0.00106283,0.00098152,0.0010059,0.0012328,0.00081516,0.00053648,0.00070873,0.00096
26
- Mistral-small-4,0.00048613,0.00051284,0.00058871,0.00042199,0.00080302,0.00043751,0.00056428,0.00038369,0.00029427,0.00043662,0.00050
27
- Nemotron-3-nano-30b-a3b,0.00088842,0.0008176,0.00082007,0.00065529,0.00081284,0.00093832,0.00069204,0.00121825,0.00075232,0.00071001,0.00082
28
- Nemotron-3-super-120b-a12b,0.00065421,0.00045663,0.00069701,0.00106853,0.00055697,0.00107016,0.00094218,0.00042322,0.00037991,0.00051843,0.00068
29
- Nova-2-lite-v1,0.0192584,0.01805111,0.01775971,0.0203214,0.01862289,0.01734272,0.02235556,0.00855767,0.00497041,0.00613215,0.01556
30
- Qwen3.5-122b-a10b,0.00162306,0.00118032,0.00122518,0.00135294,0.00142354,0.00105206,0.00181228,0.00153387,0.00144515,0.00162699,0.00144
31
- Qwen3.5-35b-a3b,0.00116103,0.001385,0.00132452,0.00107441,0.00102931,0.00110916,0.00114048,0.00102208,0.00082191,0.00120101,0.00112
32
- Qwen3.6-plus,0.00160726,0.00113903,0.00224072,0.00287606,0.00137679,0.00148291,0.00189326,0.00202712,0.00227825,0.00252956,0.00197
 
1
  model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,0.00808905,0.00876072,0.01240729,0.01352842,0.00610779,0.00782664,0.00615164,0.00776982,0.00667186,0.0082366,0.008595128
3
+ Claude-opus-4.6,0.027098,0.03036395,0.02974176,0.02710231,0.0267796,0.02575733,0.03427523,0.02213794,0.01376205,0.017109,0.025630278
4
+ Claude-sonnet-4.6,0.01848345,0.01957926,0.02999362,0.02050589,0.02137464,0.01709979,0.02698677,0.01397471,0.01081964,0.0146042,0.01947903
5
+ Deepseek-v3.2,0.00072267,0.00063554,0.00049758,0.00058632,0.00067593,0.00070094,0.00056376,0.00051767,0.00045667,0.00051202,0.000588874
6
+ Gemini-3.1-flash-lite-preview,0.00118982,0.00120504,0.00110204,0.00110311,0.00109502,0.0009483,0.001208,0.00144613,0.00113141,0.00113895,0.001152242
7
+ Gemini-3.1-pro-preview,0.0121991,0.01238768,0.013072,0.01407541,0.01194816,0.01342957,0.01415764,0.01647259,0.01054545,0.01578933,0.013276101
8
+ Gemini-3-flash-preview,0.00303385,0.00267647,0.002966,0.00268887,0.00277996,0.00250525,0.00312325,0.00368191,0.0019587,0.0024742,0.002780972
9
+ Gemma-4-26b-a4b-it,0.0001865,0.00018407,0.00023366,0.00022579,0.00019316,0.0002439,0.00028535,0.00014517,0.00010597,0.0001335,0.000197155
10
+ Gemma-4-31b-it,0.000262,0.0002922,0.00026882,0.00027561,0.0002497,0.00026655,0.00027881,0.00021136,0.00013407,0.00018046,0.000241598
11
+ GLM-4.7,0.00147033,0.00124156,0.00162454,0.00190286,0.00131191,0.00139006,0.0018656,0.00102319,0.00086426,0.00080832,0.001388525
12
+ GLM-5.1,0.00510934,0.00532995,0.00614466,0.00567328,0.00549074,0.00465484,0.00616262,0.00451313,0.00289774,0.00309761,0.004990303
13
+ Gpt-5.4,0.06666437,0.06379091,0.0584385,0.0612925,0.04071687,0.05445833,0.02721679,0.07230909,0.05625375,0.09743614,0.058210019
14
+ Gpt-5.4-mini,0.03089784,0.01762582,0.01512068,0.01276102,0.02595707,0.01934788,0.01180568,0.03532892,0.02447409,0.00615875,0.020302969
15
+ Gpt-5.4-nano,0.00457758,0.00190104,0.00452011,0.00490867,0.00343015,0.0061414,0.00485002,0.00545835,0.00433444,0.00393266,0.004317464
16
+ Gpt-oss-120b,0.00015626,0.00021258,0.00017285,0.00019027,0.00016331,0.00021977,0.00017995,0.00013667,0.0001373,0.00014125,0.000170961
17
+ Gpt-oss-20b,0.00012129,0.00013599,0.00013857,0.00013626,0.00011116,0.00012501,0.00014351,0.0001672,0.00010762,0.00015682,0.000132809
18
+ Grok-4.1-fast,0.00118526,0.00146777,0.00127826,0.00110943,0.00118303,0.00121426,0.00119787,0.00121695,0.00102116,0.00123915,0.001201663
19
+ Grok-4.20,0.01342326,0.018248,0.01405763,0.01805544,0.01293952,0.01611514,0.01163133,0.02519412,0.01138838,0.01545813,0.015361294
20
+ Kimi-K2.5,0.00133754,0.00132217,0.00140378,0.00141836,0.00115815,0.00111338,0.00149777,0.00132587,0.00094932,0.00101612,0.001258469
21
+ Llama-4-maverick,0.00023614,0.00027203,0.00030222,0.00024075,0.00027289,0.00029246,0.00027234,0.00037564,0.00025891,0.00030767,0.000278018
22
+ Mimo-V2-Pro,0.00346965,0.00348921,0.00345641,0.00368526,0.00328376,0.0031878,0.00370886,0.00311506,0.00248505,0.00210693,0.003243439
23
+ Minimax-m2.5,0.00051993,0.00059681,0.00065405,0.00056667,0.00054284,0.0005375,0.00060684,0.00044259,0.0003387,0.00033676,0.000518891
24
+ Minimax-m2.7,0.00096638,0.00099688,0.00113488,0.00098879,0.00097712,0.00093218,0.00114299,0.00083556,0.00057865,0.00078994,0.000937895
25
+ Mistral-large-2512,0.00101718,0.00098131,0.00120288,0.00106283,0.00098152,0.0010059,0.0012328,0.00081516,0.00053648,0.00070873,0.000962583
26
+ Mistral-small-4,0.00048613,0.00051284,0.00058871,0.00042199,0.00080302,0.00043751,0.00056428,0.00038369,0.00029427,0.00043662,0.000501963
27
+ Nemotron-3-nano-30b-a3b,0.00088842,0.0008176,0.00082007,0.00065529,0.00081284,0.00093832,0.00069204,0.00121825,0.00075232,0.00071001,0.000816494
28
+ Nemotron-3-super-120b-a12b,0.00065421,0.00045663,0.00069701,0.00106853,0.00055697,0.00107016,0.00094218,0.00042322,0.00037991,0.00051843,0.000676263
29
+ Nova-2-lite-v1,0.0192584,0.01805111,0.01775971,0.0203214,0.01862289,0.01734272,0.02235556,0.00855767,0.00497041,0.00613215,0.015561471
30
+ Qwen3.5-122b-a10b,0.00162306,0.00118032,0.00122518,0.00135294,0.00142354,0.00105206,0.00181228,0.00153387,0.00144515,0.00162699,0.001435478
31
+ Qwen3.5-35b-a3b,0.00116103,0.001385,0.00132452,0.00107441,0.00102931,0.00110916,0.00114048,0.00102208,0.00082191,0.00120101,0.001115424
32
+ Qwen3.6-plus,0.00160726,0.00113903,0.00224072,0.00287606,0.00137679,0.00148291,0.00189326,0.00202712,0.00227825,0.00252956,0.001968282
runs/agent1_2026-04-16/domain_ranks.csv CHANGED
@@ -1,32 +1,32 @@
1
  model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
- Claude-haiku-4.5,3.0614,2.8375,2.9419,3.07,3.221,2.9925,2.8819,2.4656,2.8192,2.6517,2.9151
3
- Claude-opus-4.6,3.2254,3.2528,3.1685,3.1008,3.6112,3.143,3.3686,2.6688,2.972,2.9478,3.1682
4
- Claude-sonnet-4.6,3.0719,2.8159,3.3619,3.1191,3.622,3.0645,3.3271,2.7493,2.9482,2.9773,3.1262
5
- Deepseek-v3.2,2.6536,2.5936,2.5558,2.3426,2.6388,2.4324,2.5576,2.5272,2.6462,2.5903,2.5518
6
- GLM-4.7,2.8156,2.7517,3.0237,2.8089,3.1913,3.1423,3.2166,2.4618,2.9373,2.6377,2.8201
7
- GLM-5.1,2.9743,3.1265,3.1676,3.0665,3.2953,3.0381,3.2561,2.6182,3.0416,2.7205,3.0959
8
- Gemini-3-flash-preview,3.0713,3.0608,3.107,2.6955,3.0183,3.0539,2.9513,2.7208,2.6366,2.6772,2.8926
9
- Gemini-3.1-flash-lite-preview,2.7071,2.9008,2.7554,2.6945,3.0595,2.9101,3.1382,2.5291,2.6783,2.7213,2.5328
10
- Gemini-3.1-pro-preview,3.1791,3.2185,3.2664,2.9015,3.2574,2.8581,3.2808,2.5238,3.1878,3.1821,2.6980
11
- Gemma-4-26b-a4b-it,2.2699,2.5902,2.6241,2.4963,2.5757,2.7545,2.8982,2.1141,2.4297,2.5124,2.9172
12
- Gemma-4-31b-it,2.8553,2.7678,2.9285,2.5916,2.7969,2.7997,2.7872,2.2097,2.6403,2.6407,3.0559
13
- Gpt-5.4,3.102,3.0287,3.2705,3.0277,3.0711,3.2832,3.0165,2.5045,2.922,3.0751,3.0246
14
- Gpt-5.4-mini,3.3417,3.0365,2.8875,2.5461,3.2261,3.1224,2.791,2.393,2.9426,2.8658,2.9075
15
- Gpt-5.4-nano,2.9039,2.764,2.9425,2.269,2.913,2.7384,2.7862,2.5623,2.9551,2.9552,2.7823
16
- Gpt-oss-120b,2.8299,2.9759,2.8175,2.6792,2.9697,2.6809,2.6406,2.3854,2.8958,2.6491,2.7639
17
- Gpt-oss-20b,2.4834,2.9056,2.65,2.4672,2.7635,2.6706,2.5309,2.4596,2.8156,2.8078,2.6495
18
- Grok-4.1-fast,2.7256,2.7798,2.8956,2.7138,2.9297,2.9768,3.0699,2.4118,2.8696,3.0586,2.8449
19
- Grok-4.20,2.8101,2.992,2.8084,2.8254,3.069,3.0622,2.9404,2.6096,3.0932,2.8865,2.9155
20
- Kimi-K2.5,2.7529,2.9016,3.1202,2.8565,3.0409,3.2633,3.1816,2.5213,2.9282,2.7499,2.9215
21
- Llama-4-maverick,2.2207,2.1623,2.2359,2.2829,2.1537,2.4089,2.1973,2.2927,2.3858,2.4429,2.2695
22
- Mimo-V2-Pro,3.165,3.1059,3.0574,2.8061,3.2456,2.9782,3.2639,2.4309,2.8583,2.8374,2.9878
23
- Minimax-m2.5,2.6819,2.5663,2.7079,2.49,3.1716,2.8785,2.9233,2.4104,2.6426,2.5862,2.7226
24
- Minimax-m2.7,2.9146,3.1791,3.111,2.733,3.1253,2.7982,3.1518,2.5031,2.6978,2.7335,2.9030
25
- Mistral-large-2512,2.4911,2.9176,2.6082,2.4303,2.5595,2.5928,2.9121,2.4414,2.7411,2.5481,2.6249
26
- Mistral-small-4,2.6428,2.7426,2.6579,2.5289,2.7196,2.669,2.785,2.6008,2.861,2.636,2.6874
27
- Nemotron-3-nano-30b-a3b,2.6756,2.6616,2.387,2.5064,2.8724,2.6711,2.6363,2.5052,2.7506,2.574,2.6313
28
- Nemotron-3-super-120b-a12b,2.5963,2.6885,2.7757,2.6367,2.5973,2.9963,2.5992,2.7177,2.7433,2.7751,2.6956
29
- Nova-2-lite-v1,2.4537,2.6576,2.7964,2.5301,2.6316,2.8817,2.6522,2.4999,2.8745,2.6604,2.6603
30
- Qwen3.5-122b-a10b,2.6317,2.7519,2.9903,2.613,3.1279,2.8175,3.0919,2.5094,2.8085,2.9913,2.8363
31
- Qwen3.5-35b-a3b,2.9033,2.7141,2.9996,2.5789,2.9716,2.8106,2.9955,2.3478,2.9025,3.0803,2.8262
32
- Qwen3.6-plus,2.9985,3.0064,3.0355,2.8342,3.114,2.9565,3.192,2.7408,2.9821,3.041,2.9923
 
1
  model_name,adaptive_replanning,api_workflow,domain_workflow,error_handling,failure_recovery,multi_step_orchestration,parallel_execution,parameter_complexity,single_tool_call,tool_selection,Average (All Topics)
2
+ Claude-haiku-4.5,3.0614,2.8375,2.9419,3.07,3.221,2.9925,2.8819,2.4656,2.8192,2.6517,2.915128
3
+ Claude-opus-4.6,3.2254,3.2528,3.1685,3.1008,3.6112,3.143,3.3686,2.6688,2.972,2.9478,3.168179
4
+ Claude-sonnet-4.6,3.0719,2.8159,3.3619,3.1191,3.622,3.0645,3.3271,2.7493,2.9482,2.9773,3.126162
5
+ Deepseek-v3.2,2.6536,2.5936,2.5558,2.3426,2.6388,2.4324,2.5576,2.5272,2.6462,2.5903,2.55175
6
+ Gemini-3.1-flash-lite-preview,2.7071,2.9008,2.7554,2.6945,3.0595,2.9101,3.1382,2.5291,2.6783,2.7213,2.82011
7
+ Gemini-3.1-pro-preview,3.1791,3.2185,3.2664,2.9015,3.2574,2.8581,3.2808,2.5238,3.1878,3.1821,3.095938
8
+ Gemini-3-flash-preview,3.0713,3.0608,3.107,2.6955,3.0183,3.0539,2.9513,2.7208,2.6366,2.6772,2.892602
9
+ Gemma-4-26b-a4b-it,2.2699,2.5902,2.6241,2.4963,2.5757,2.7545,2.8982,2.1141,2.4297,2.5124,2.532792
10
+ Gemma-4-31b-it,2.8553,2.7678,2.9285,2.5916,2.7969,2.7997,2.7872,2.2097,2.6403,2.6407,2.69796
11
+ GLM-4.7,2.8156,2.7517,3.0237,2.8089,3.1913,3.1423,3.2166,2.4618,2.9373,2.6377,2.917214
12
+ GLM-5.1,2.9743,3.1265,3.1676,3.0665,3.2953,3.0381,3.2561,2.6182,3.0416,2.7205,3.055908
13
+ Gpt-5.4,3.102,3.0287,3.2705,3.0277,3.0711,3.2832,3.0165,2.5045,2.922,3.0751,3.024563
14
+ Gpt-5.4-mini,3.3417,3.0365,2.8875,2.5461,3.2261,3.1224,2.791,2.393,2.9426,2.8658,2.907451
15
+ Gpt-5.4-nano,2.9039,2.764,2.9425,2.269,2.913,2.7384,2.7862,2.5623,2.9551,2.9552,2.782302
16
+ Gpt-oss-120b,2.8299,2.9759,2.8175,2.6792,2.9697,2.6809,2.6406,2.3854,2.8958,2.6491,2.763865
17
+ Gpt-oss-20b,2.4834,2.9056,2.65,2.4672,2.7635,2.6706,2.5309,2.4596,2.8156,2.8078,2.649476
18
+ Grok-4.1-fast,2.7256,2.7798,2.8956,2.7138,2.9297,2.9768,3.0699,2.4118,2.8696,3.0586,2.844892
19
+ Grok-4.20,2.8101,2.992,2.8084,2.8254,3.069,3.0622,2.9404,2.6096,3.0932,2.8865,2.915472
20
+ Kimi-K2.5,2.7529,2.9016,3.1202,2.8565,3.0409,3.2633,3.1816,2.5213,2.9282,2.7499,2.921457
21
+ Llama-4-maverick,2.2207,2.1623,2.2359,2.2829,2.1537,2.4089,2.1973,2.2927,2.3858,2.4429,2.269504
22
+ Mimo-V2-Pro,3.165,3.1059,3.0574,2.8061,3.2456,2.9782,3.2639,2.4309,2.8583,2.8374,2.987781
23
+ Minimax-m2.5,2.6819,2.5663,2.7079,2.49,3.1716,2.8785,2.9233,2.4104,2.6426,2.5862,2.722556
24
+ Minimax-m2.7,2.9146,3.1791,3.111,2.733,3.1253,2.7982,3.1518,2.5031,2.6978,2.7335,2.902963
25
+ Mistral-large-2512,2.4911,2.9176,2.6082,2.4303,2.5595,2.5928,2.9121,2.4414,2.7411,2.5481,2.624904
26
+ Mistral-small-4,2.6428,2.7426,2.6579,2.5289,2.7196,2.669,2.785,2.6008,2.861,2.636,2.687354
27
+ Nemotron-3-nano-30b-a3b,2.6756,2.6616,2.387,2.5064,2.8724,2.6711,2.6363,2.5052,2.7506,2.574,2.631261
28
+ Nemotron-3-super-120b-a12b,2.5963,2.6885,2.7757,2.6367,2.5973,2.9963,2.5992,2.7177,2.7433,2.7751,2.695599
29
+ Nova-2-lite-v1,2.4537,2.6576,2.7964,2.5301,2.6316,2.8817,2.6522,2.4999,2.8745,2.6604,2.660295
30
+ Qwen3.5-122b-a10b,2.6317,2.7519,2.9903,2.613,3.1279,2.8175,3.0919,2.5094,2.8085,2.9913,2.836341
31
+ Qwen3.5-35b-a3b,2.9033,2.7141,2.9996,2.5789,2.9716,2.8106,2.9955,2.3478,2.9025,3.0803,2.826184
32
+ Qwen3.6-plus,2.9985,3.0064,3.0355,2.8342,3.114,2.9565,3.192,2.7408,2.9821,3.041,2.992333