Jongyoon Song committed on
Commit ef2b66d · 1 Parent(s): 7d35c34

Update evaluation results (251224) & Remove time and speed-related results

app.py CHANGED
@@ -242,127 +242,6 @@ def create_benchmark_tab_content(data_prefix: str):
  )
  gr.HTML('</div>')
 
- # --- Speed Med Bar Plot Section (NEW) ---
- import json
- with open(f"src/data/{data_prefix}/time_data.json", "r") as f:
- time_data = json.load(f)
- time_data_state = gr.State(value=time_data)
-
- gr.HTML("""
- <div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
- <div class="section-header">
- <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
- Speed per GPU
- </h3>
- </div>
- <p style="color: var(--text-secondary); margin-bottom: 8px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
- Speed per GPU represents the number of tokens generated per second divided by the number of GPUs during the inference.<br>
- </p>
- <p style="font-size:0.95em; color:var(--text-secondary); margin-top:0.5px;">
- <b>Setting</b>: We measured the speed in an H100 GPU environment consisting of 4 nodes with 8 GPUs each, using vLLM and Ray to set the tensor parallel size between 1 and 32 (In the plot, <i>GPU</i> refers to the tensor parallel size).<br>
- We performed inference by sending an asynchronous request to the served model, and we set the concurrency to 32. <br>
- <b>Note</b>: We measured the speed by directly serving open-source models, and proprietary models were excluded from the plot.
- </p>
- """)
-
- # --- Speed Bar Plot UI: Row with left (category selector) and right (min/max dials) ---
- category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
- default_category = "Overall"
- default_x_axis_sort_by = "Overall Score"
- with gr.Row():
- with gr.Column(scale=1):
- x_axis_sort_by = gr.Radio(
- choices=["Overall Score", "Speed per GPU"],
- value="Overall Score",
- label="Sort X-Axis by",
- elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}", # Make elem_id unique
- elem_classes=["x-axis-btn-radio"],
- interactive=True,
- show_label=True
- )
-
- with gr.Column(scale=1):
- min_max_score_slider = RangeSlider(
- minimum=0,
- maximum=100,
- value=(0, 100),
- step=1,
- label="Minimum and Maximum Overall Score",
- interactive=True
- )
-
- with gr.Column(scale=1):
- min_max_param_size_slider = RangeSlider(
- minimum=0,
- maximum=1000,
- value=(0, 1000),
- step=1,
- label="Minimum and Maximum Parameter Size (B)",
- interactive=True
- )
-
- # Speed Bar Plot
- from vis_utils import create_speed_med_bar_plot
- speed_med_bar_plot = gr.Plot(
- label="",
- value=create_speed_med_bar_plot(
- initial_df_cat,
- time_data,
- min_size=0,
- max_size=1000,
- min_score=0,
- max_score=100,
- category=default_category,
- theme="light",
- x_axis_sort_by=default_x_axis_sort_by,
- mode=args.mode
- ),
- elem_classes=["speed-med-bar-plot", "plot-container"]
- )
- gr.HTML("</div>")
-
- # --- Event handler: update Speed bar plot and dials when category or dials change ---
- def update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, current_time_data_state, current_leaderboard_df=None):
- df = current_leaderboard_df if current_leaderboard_df is not None else initial_df_cat
- return create_speed_med_bar_plot(
- df,
- current_time_data_state,
- min_size=min_max_size[0],
- max_size=min_max_size[1],
- min_score=min_max_score[0],
- max_score=min_max_score[1],
- theme="light",
- x_axis_sort_by=x_axis_sort_by,
- mode=args.mode
- )
-
- # Connect category selector to dials and plot
- x_axis_sort_by.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- min_max_param_size_slider.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- min_max_score_slider.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- # Connect leaderboard filters to dials and plot (if leaderboard_tab_cat provides a filtered DataFrame state)
- if "df_state" in leaderboard_tab_cat:
- leaderboard_tab_cat["df_state"].change(
- fn=lambda df, x_axis_sort_by, min_max_size, min_max_score, time_data: update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, time_data, df),
- inputs=[leaderboard_tab_cat["df_state"], x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
  # Update radar chart when model_selector_cat selection changes
  def update_radar_chart_cat(selected_display_names):
  # If no selection, fallback to top-5
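For reference, the "Speed per GPU" metric removed above is simply tokens generated per second divided by the number of GPUs (the tensor parallel size). A minimal sketch of that computation, assuming hypothetical inputs (per-request generated-token counts, a wall-clock window, and the TP size); the function name and signature are illustrative and are not part of app.py or vis_utils:

from typing import Sequence

# Illustrative only -- names and inputs are assumptions, not the Space's actual code.
def speed_per_gpu(generated_tokens: Sequence[int], elapsed_seconds: float, tensor_parallel_size: int) -> float:
    """Tokens generated per second, divided by the number of GPUs serving the model."""
    if elapsed_seconds <= 0 or tensor_parallel_size <= 0:
        raise ValueError("elapsed_seconds and tensor_parallel_size must be positive")
    tokens_per_second = sum(generated_tokens) / elapsed_seconds
    return tokens_per_second / tensor_parallel_size

# Example: requests generating 120,000 tokens in 60 s on a TP=4 deployment -> 500.0 tokens/s per GPU
print(speed_per_gpu([120_000], 60.0, 4))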
src/about.py CHANGED
@@ -28,7 +28,7 @@ LINK = """
  <a href="https://arxiv.org/abs/2509.22715" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Paper</a> |
  <span>🌠</span>
  <a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
- <span>🔭</span> Updated: 2025-10-15
+ <span>🔭</span> Updated: 2025-12-24
  </h3>
  """
 
src/data/open/length_data.json CHANGED
@@ -1223,7 +1223,75 @@
1223
  "Med Resp": 2282.5
1224
  }
1225
  },
1226
- "Claude 4.5 Opus (think)": {
1227
  "Overall": {
1228
  "Min": -10,
1229
  "Max": -2,
@@ -1291,7 +1359,7 @@
1291
  "Med Resp": -3.0
1292
  }
1293
  },
1294
- "GLM-4.5 FP8 (think)": {
1295
  "Overall": {
1296
  "Min": 75,
1297
  "Max": 65432,
@@ -1427,6 +1495,74 @@
1427
  "Med Resp": 1208.5
1428
  }
1429
  },
1430
  "MiniMax-M2 (230B A10B)": {
1431
  "Overall": {
1432
  "Min": 64,
@@ -1631,6 +1767,74 @@
1631
  "Med Resp": 1526.0
1632
  }
1633
  },
1634
  "Qwen3 32B (think)": {
1635
  "Overall": {
1636
  "Min": 164,
@@ -1903,6 +2107,278 @@
1903
  "Med Resp": -3.0
1904
  }
1905
  },
1906
  "Claude 4 Opus (20250514) (think)": {
1907
  "Overall": {
1908
  "Min": -10,
@@ -2243,6 +2719,74 @@
2243
  "Med Resp": 1558.0
2244
  }
2245
  },
2246
  "GPT-5 nano (Reasoning: medium)": {
2247
  "Overall": {
2248
  "Min": -10,
@@ -2787,6 +3331,74 @@
2787
  "Med Resp": 1279.0
2788
  }
2789
  },
2790
  "Mi:dm 2.0 Base Instruct": {
2791
  "Overall": {
2792
  "Min": 1,
 
1223
  "Med Resp": 2282.5
1224
  }
1225
  },
1226
+ "DeepSeek V3.2 Speciale": {
1227
+ "Overall": {
1228
+ "Min": 160,
1229
+ "Max": 65513,
1230
+ "Med": 3226.5,
1231
+ "Med Resp": 249.5
1232
+ },
1233
+ "Content Generation": {
1234
+ "Min": 186,
1235
+ "Max": 46347,
1236
+ "Med": 3634.0,
1237
+ "Med Resp": 364.0
1238
+ },
1239
+ "Editing": {
1240
+ "Min": 329,
1241
+ "Max": 24883,
1242
+ "Med": 3043.0,
1243
+ "Med Resp": 178.0
1244
+ },
1245
+ "Data Analysis": {
1246
+ "Min": 191,
1247
+ "Max": 64268,
1248
+ "Med": 1640.0,
1249
+ "Med Resp": 67.0
1250
+ },
1251
+ "Reasoning": {
1252
+ "Min": 228,
1253
+ "Max": 65472,
1254
+ "Med": 2211.5,
1255
+ "Med Resp": 165.0
1256
+ },
1257
+ "Hallucination": {
1258
+ "Min": 373,
1259
+ "Max": 23653,
1260
+ "Med": 3253.5,
1261
+ "Med Resp": 258.0
1262
+ },
1263
+ "Safety": {
1264
+ "Min": 331,
1265
+ "Max": 39236,
1266
+ "Med": 2575.0,
1267
+ "Med Resp": 158.0
1268
+ },
1269
+ "Repetition": {
1270
+ "Min": 356,
1271
+ "Max": 65513,
1272
+ "Med": 3357.0,
1273
+ "Med Resp": 246.0
1274
+ },
1275
+ "Summarization": {
1276
+ "Min": 160,
1277
+ "Max": 56309,
1278
+ "Med": 1500.0,
1279
+ "Med Resp": 189.5
1280
+ },
1281
+ "Translation": {
1282
+ "Min": 522,
1283
+ "Max": 25619,
1284
+ "Med": 5143.5,
1285
+ "Med Resp": 281.5
1286
+ },
1287
+ "Multi-Turn": {
1288
+ "Min": 244,
1289
+ "Max": 32258,
1290
+ "Med": 4282.0,
1291
+ "Med Resp": 854.0
1292
+ }
1293
+ },
1294
+ "Claude 4.5 Opus (think, budget: 16K)": {
1295
  "Overall": {
1296
  "Min": -10,
1297
  "Max": -2,
 
1359
  "Med Resp": -3.0
1360
  }
1361
  },
1362
+ "GLM-4.5 FP8": {
1363
  "Overall": {
1364
  "Min": 75,
1365
  "Max": 65432,
 
1495
  "Med Resp": 1208.5
1496
  }
1497
  },
1498
+ "DeepSeek V3.2": {
1499
+ "Overall": {
1500
+ "Min": 134,
1501
+ "Max": 22816,
1502
+ "Med": 762.5,
1503
+ "Med Resp": 312.0
1504
+ },
1505
+ "Content Generation": {
1506
+ "Min": 153,
1507
+ "Max": 5977,
1508
+ "Med": 845.0,
1509
+ "Med Resp": 462.0
1510
+ },
1511
+ "Editing": {
1512
+ "Min": 141,
1513
+ "Max": 6055,
1514
+ "Med": 587.5,
1515
+ "Med Resp": 245.5
1516
+ },
1517
+ "Data Analysis": {
1518
+ "Min": 157,
1519
+ "Max": 13414,
1520
+ "Med": 695.0,
1521
+ "Med Resp": 166.0
1522
+ },
1523
+ "Reasoning": {
1524
+ "Min": 272,
1525
+ "Max": 22816,
1526
+ "Med": 1440.5,
1527
+ "Med Resp": 245.0
1528
+ },
1529
+ "Hallucination": {
1530
+ "Min": 213,
1531
+ "Max": 9501,
1532
+ "Med": 938.5,
1533
+ "Med Resp": 532.5
1534
+ },
1535
+ "Safety": {
1536
+ "Min": 184,
1537
+ "Max": 5304,
1538
+ "Med": 617.0,
1539
+ "Med Resp": 238.0
1540
+ },
1541
+ "Repetition": {
1542
+ "Min": 216,
1543
+ "Max": 7227,
1544
+ "Med": 919.5,
1545
+ "Med Resp": 399.0
1546
+ },
1547
+ "Summarization": {
1548
+ "Min": 134,
1549
+ "Max": 1750,
1550
+ "Med": 471.0,
1551
+ "Med Resp": 197.5
1552
+ },
1553
+ "Translation": {
1554
+ "Min": 154,
1555
+ "Max": 6364,
1556
+ "Med": 565.0,
1557
+ "Med Resp": 301.0
1558
+ },
1559
+ "Multi-Turn": {
1560
+ "Min": 401,
1561
+ "Max": 14066,
1562
+ "Med": 2538.5,
1563
+ "Med Resp": 1261.0
1564
+ }
1565
+ },
1566
  "MiniMax-M2 (230B A10B)": {
1567
  "Overall": {
1568
  "Min": 64,
 
1767
  "Med Resp": 1526.0
1768
  }
1769
  },
1770
+ "MiMo V2 Flash": {
1771
+ "Overall": {
1772
+ "Min": 125,
1773
+ "Max": 69375,
1774
+ "Med": 1477.5,
1775
+ "Med Resp": 373.0
1776
+ },
1777
+ "Content Generation": {
1778
+ "Min": 222,
1779
+ "Max": 65445,
1780
+ "Med": 1321.5,
1781
+ "Med Resp": 500.5
1782
+ },
1783
+ "Editing": {
1784
+ "Min": 265,
1785
+ "Max": 65423,
1786
+ "Med": 1194.0,
1787
+ "Med Resp": 314.0
1788
+ },
1789
+ "Data Analysis": {
1790
+ "Min": 262,
1791
+ "Max": 65439,
1792
+ "Med": 1296.0,
1793
+ "Med Resp": 235.0
1794
+ },
1795
+ "Reasoning": {
1796
+ "Min": 319,
1797
+ "Max": 65430,
1798
+ "Med": 2559.5,
1799
+ "Med Resp": 402.5
1800
+ },
1801
+ "Hallucination": {
1802
+ "Min": 129,
1803
+ "Max": 65447,
1804
+ "Med": 1179.5,
1805
+ "Med Resp": 499.0
1806
+ },
1807
+ "Safety": {
1808
+ "Min": 133,
1809
+ "Max": 5184,
1810
+ "Med": 717.0,
1811
+ "Med Resp": 294.0
1812
+ },
1813
+ "Repetition": {
1814
+ "Min": 295,
1815
+ "Max": 65472,
1816
+ "Med": 2153.5,
1817
+ "Med Resp": 573.5
1818
+ },
1819
+ "Summarization": {
1820
+ "Min": 188,
1821
+ "Max": 64302,
1822
+ "Med": 789.5,
1823
+ "Med Resp": 220.5
1824
+ },
1825
+ "Translation": {
1826
+ "Min": 125,
1827
+ "Max": 65041,
1828
+ "Med": 1738.5,
1829
+ "Med Resp": 339.5
1830
+ },
1831
+ "Multi-Turn": {
1832
+ "Min": 323,
1833
+ "Max": 69375,
1834
+ "Med": 3331.5,
1835
+ "Med Resp": 1361.0
1836
+ }
1837
+ },
1838
  "Qwen3 32B (think)": {
1839
  "Overall": {
1840
  "Min": 164,
 
2107
  "Med Resp": -3.0
2108
  }
2109
  },
2110
+ "GPT-5.2 (Reasoning: medium)": {
2111
+ "Overall": {
2112
+ "Min": 11,
2113
+ "Max": 7735,
2114
+ "Med": 347.0,
2115
+ "Med Resp": 264.0
2116
+ },
2117
+ "Content Generation": {
2118
+ "Min": 12,
2119
+ "Max": 7735,
2120
+ "Med": 537.0,
2121
+ "Med Resp": 370.0
2122
+ },
2123
+ "Editing": {
2124
+ "Min": 11,
2125
+ "Max": 1562,
2126
+ "Med": 173.5,
2127
+ "Med Resp": 166.0
2128
+ },
2129
+ "Data Analysis": {
2130
+ "Min": 18,
2131
+ "Max": 3954,
2132
+ "Med": 222.0,
2133
+ "Med Resp": 98.0
2134
+ },
2135
+ "Reasoning": {
2136
+ "Min": 29,
2137
+ "Max": 6895,
2138
+ "Med": 445.5,
2139
+ "Med Resp": 246.5
2140
+ },
2141
+ "Hallucination": {
2142
+ "Min": 72,
2143
+ "Max": 3525,
2144
+ "Med": 633.0,
2145
+ "Med Resp": 357.5
2146
+ },
2147
+ "Safety": {
2148
+ "Min": 58,
2149
+ "Max": 2808,
2150
+ "Med": 434.0,
2151
+ "Med Resp": 285.0
2152
+ },
2153
+ "Repetition": {
2154
+ "Min": 34,
2155
+ "Max": 5202,
2156
+ "Med": 272.0,
2157
+ "Med Resp": 223.0
2158
+ },
2159
+ "Summarization": {
2160
+ "Min": 37,
2161
+ "Max": 2339,
2162
+ "Med": 201.0,
2163
+ "Med Resp": 194.5
2164
+ },
2165
+ "Translation": {
2166
+ "Min": 12,
2167
+ "Max": 3684,
2168
+ "Med": 307.0,
2169
+ "Med Resp": 283.5
2170
+ },
2171
+ "Multi-Turn": {
2172
+ "Min": 41,
2173
+ "Max": 7003,
2174
+ "Med": 983.5,
2175
+ "Med Resp": 844.5
2176
+ }
2177
+ },
2178
+ "Gemini 3 Flash Preview (Thinking Level: High)": {
2179
+ "Overall": {
2180
+ "Min": 137,
2181
+ "Max": 24472,
2182
+ "Med": 1296.5,
2183
+ "Med Resp": 424.5
2184
+ },
2185
+ "Content Generation": {
2186
+ "Min": 248,
2187
+ "Max": 16374,
2188
+ "Med": 1368.5,
2189
+ "Med Resp": 535.5
2190
+ },
2191
+ "Editing": {
2192
+ "Min": 137,
2193
+ "Max": 10610,
2194
+ "Med": 1113.5,
2195
+ "Med Resp": 338.0
2196
+ },
2197
+ "Data Analysis": {
2198
+ "Min": 166,
2199
+ "Max": 13595,
2200
+ "Med": 923.0,
2201
+ "Med Resp": 232.0
2202
+ },
2203
+ "Reasoning": {
2204
+ "Min": 318,
2205
+ "Max": 24472,
2206
+ "Med": 1210.5,
2207
+ "Med Resp": 556.0
2208
+ },
2209
+ "Hallucination": {
2210
+ "Min": 349,
2211
+ "Max": 5023,
2212
+ "Med": 1295.5,
2213
+ "Med Resp": 639.5
2214
+ },
2215
+ "Safety": {
2216
+ "Min": 380,
2217
+ "Max": 5510,
2218
+ "Med": 1297.0,
2219
+ "Med Resp": 482.0
2220
+ },
2221
+ "Repetition": {
2222
+ "Min": 309,
2223
+ "Max": 7743,
2224
+ "Med": 1477.5,
2225
+ "Med Resp": 389.5
2226
+ },
2227
+ "Summarization": {
2228
+ "Min": 306,
2229
+ "Max": 18709,
2230
+ "Med": 905.5,
2231
+ "Med Resp": 195.0
2232
+ },
2233
+ "Translation": {
2234
+ "Min": 289,
2235
+ "Max": 17871,
2236
+ "Med": 1421.0,
2237
+ "Med Resp": 381.5
2238
+ },
2239
+ "Multi-Turn": {
2240
+ "Min": 231,
2241
+ "Max": 11926,
2242
+ "Med": 3075.5,
2243
+ "Med Resp": 1466.5
2244
+ }
2245
+ },
2246
+ "Kanana 2 30B A3B Thinking": {
2247
+ "Overall": {
2248
+ "Min": 584,
2249
+ "Max": 247274,
2250
+ "Med": 4263.0,
2251
+ "Med Resp": 854.5
2252
+ },
2253
+ "Content Generation": {
2254
+ "Min": 1055,
2255
+ "Max": 139421,
2256
+ "Med": 3898.5,
2257
+ "Med Resp": 1028.0
2258
+ },
2259
+ "Editing": {
2260
+ "Min": 747,
2261
+ "Max": 134253,
2262
+ "Med": 3199.0,
2263
+ "Med Resp": 606.5
2264
+ },
2265
+ "Data Analysis": {
2266
+ "Min": 618,
2267
+ "Max": 120325,
2268
+ "Med": 3402.0,
2269
+ "Med Resp": 509.0
2270
+ },
2271
+ "Reasoning": {
2272
+ "Min": 1042,
2273
+ "Max": 160440,
2274
+ "Med": 6428.5,
2275
+ "Med Resp": 925.5
2276
+ },
2277
+ "Hallucination": {
2278
+ "Min": 760,
2279
+ "Max": 137639,
2280
+ "Med": 4215.0,
2281
+ "Med Resp": 1061.5
2282
+ },
2283
+ "Safety": {
2284
+ "Min": 787,
2285
+ "Max": 116591,
2286
+ "Med": 3686.0,
2287
+ "Med Resp": 867.0
2288
+ },
2289
+ "Repetition": {
2290
+ "Min": 1238,
2291
+ "Max": 134651,
2292
+ "Med": 8164.0,
2293
+ "Med Resp": 517.5
2294
+ },
2295
+ "Summarization": {
2296
+ "Min": 584,
2297
+ "Max": 59519,
2298
+ "Med": 2540.0,
2299
+ "Med Resp": 656.5
2300
+ },
2301
+ "Translation": {
2302
+ "Min": 899,
2303
+ "Max": 131258,
2304
+ "Med": 4796.0,
2305
+ "Med Resp": 894.0
2306
+ },
2307
+ "Multi-Turn": {
2308
+ "Min": 1560,
2309
+ "Max": 247274,
2310
+ "Med": 12632.5,
2311
+ "Med Resp": 2593.0
2312
+ }
2313
+ },
2314
+ "Kanana 2 30B A3B Instruct": {
2315
+ "Overall": {
2316
+ "Min": 51,
2317
+ "Max": 177683,
2318
+ "Med": 1195.0,
2319
+ "Med Resp": 1195.0
2320
+ },
2321
+ "Content Generation": {
2322
+ "Min": 58,
2323
+ "Max": 12603,
2324
+ "Med": 1448.0,
2325
+ "Med Resp": 1448.0
2326
+ },
2327
+ "Editing": {
2328
+ "Min": 69,
2329
+ "Max": 51628,
2330
+ "Med": 836.5,
2331
+ "Med Resp": 836.5
2332
+ },
2333
+ "Data Analysis": {
2334
+ "Min": 51,
2335
+ "Max": 11567,
2336
+ "Med": 916.0,
2337
+ "Med Resp": 916.0
2338
+ },
2339
+ "Reasoning": {
2340
+ "Min": 51,
2341
+ "Max": 122001,
2342
+ "Med": 1587.0,
2343
+ "Med Resp": 1587.0
2344
+ },
2345
+ "Hallucination": {
2346
+ "Min": 104,
2347
+ "Max": 17989,
2348
+ "Med": 1419.0,
2349
+ "Med Resp": 1419.0
2350
+ },
2351
+ "Safety": {
2352
+ "Min": 96,
2353
+ "Max": 7485,
2354
+ "Med": 1377.0,
2355
+ "Med Resp": 1377.0
2356
+ },
2357
+ "Repetition": {
2358
+ "Min": 255,
2359
+ "Max": 177683,
2360
+ "Med": 844.0,
2361
+ "Med Resp": 844.0
2362
+ },
2363
+ "Summarization": {
2364
+ "Min": 108,
2365
+ "Max": 4592,
2366
+ "Med": 778.0,
2367
+ "Med Resp": 778.0
2368
+ },
2369
+ "Translation": {
2370
+ "Min": 69,
2371
+ "Max": 30611,
2372
+ "Med": 1059.0,
2373
+ "Med Resp": 1059.0
2374
+ },
2375
+ "Multi-Turn": {
2376
+ "Min": 119,
2377
+ "Max": 74203,
2378
+ "Med": 3252.5,
2379
+ "Med Resp": 3252.5
2380
+ }
2381
+ },
2382
  "Claude 4 Opus (20250514) (think)": {
2383
  "Overall": {
2384
  "Min": -10,
 
2719
  "Med Resp": 1558.0
2720
  }
2721
  },
2722
+ "GLM-4.7 FP8": {
2723
+ "Overall": {
2724
+ "Min": 212,
2725
+ "Max": 131072,
2726
+ "Med": 2252.5,
2727
+ "Med Resp": 328.0
2728
+ },
2729
+ "Content Generation": {
2730
+ "Min": 383,
2731
+ "Max": 18712,
2732
+ "Med": 2094.0,
2733
+ "Med Resp": 423.0
2734
+ },
2735
+ "Editing": {
2736
+ "Min": 384,
2737
+ "Max": 14538,
2738
+ "Med": 2070.5,
2739
+ "Med Resp": 263.0
2740
+ },
2741
+ "Data Analysis": {
2742
+ "Min": 396,
2743
+ "Max": 13525,
2744
+ "Med": 1477.0,
2745
+ "Med Resp": 162.0
2746
+ },
2747
+ "Reasoning": {
2748
+ "Min": 254,
2749
+ "Max": 40295,
2750
+ "Med": 2298.5,
2751
+ "Med Resp": 465.5
2752
+ },
2753
+ "Hallucination": {
2754
+ "Min": 443,
2755
+ "Max": 19838,
2756
+ "Med": 2156.5,
2757
+ "Med Resp": 481.0
2758
+ },
2759
+ "Safety": {
2760
+ "Min": 212,
2761
+ "Max": 10792,
2762
+ "Med": 2121.0,
2763
+ "Med Resp": 197.0
2764
+ },
2765
+ "Repetition": {
2766
+ "Min": 768,
2767
+ "Max": 131072,
2768
+ "Med": 2963.5,
2769
+ "Med Resp": 289.0
2770
+ },
2771
+ "Summarization": {
2772
+ "Min": 599,
2773
+ "Max": 10452,
2774
+ "Med": 1426.0,
2775
+ "Med Resp": 182.5
2776
+ },
2777
+ "Translation": {
2778
+ "Min": 796,
2779
+ "Max": 12247,
2780
+ "Med": 3159.5,
2781
+ "Med Resp": 312.5
2782
+ },
2783
+ "Multi-Turn": {
2784
+ "Min": 420,
2785
+ "Max": 15706,
2786
+ "Med": 5514.5,
2787
+ "Med Resp": 1361.0
2788
+ }
2789
+ },
2790
  "GPT-5 nano (Reasoning: medium)": {
2791
  "Overall": {
2792
  "Min": -10,
 
3331
  "Med Resp": 1279.0
3332
  }
3333
  },
3334
+ "Mistral Large 3 675B Instruct 2512": {
3335
+ "Overall": {
3336
+ "Min": 1,
3337
+ "Max": 12120,
3338
+ "Med": 448.0,
3339
+ "Med Resp": 448.0
3340
+ },
3341
+ "Content Generation": {
3342
+ "Min": 13,
3343
+ "Max": 6162,
3344
+ "Med": 565.0,
3345
+ "Med Resp": 565.0
3346
+ },
3347
+ "Editing": {
3348
+ "Min": 12,
3349
+ "Max": 2369,
3350
+ "Med": 299.0,
3351
+ "Med Resp": 299.0
3352
+ },
3353
+ "Data Analysis": {
3354
+ "Min": 1,
3355
+ "Max": 3902,
3356
+ "Med": 295.0,
3357
+ "Med Resp": 295.0
3358
+ },
3359
+ "Reasoning": {
3360
+ "Min": 1,
3361
+ "Max": 6293,
3362
+ "Med": 530.0,
3363
+ "Med Resp": 530.0
3364
+ },
3365
+ "Hallucination": {
3366
+ "Min": 54,
3367
+ "Max": 4461,
3368
+ "Med": 896.0,
3369
+ "Med Resp": 896.0
3370
+ },
3371
+ "Safety": {
3372
+ "Min": 27,
3373
+ "Max": 4250,
3374
+ "Med": 589.0,
3375
+ "Med Resp": 589.0
3376
+ },
3377
+ "Repetition": {
3378
+ "Min": 89,
3379
+ "Max": 5264,
3380
+ "Med": 448.0,
3381
+ "Med Resp": 448.0
3382
+ },
3383
+ "Summarization": {
3384
+ "Min": 31,
3385
+ "Max": 1357,
3386
+ "Med": 251.5,
3387
+ "Med Resp": 251.5
3388
+ },
3389
+ "Translation": {
3390
+ "Min": 22,
3391
+ "Max": 3529,
3392
+ "Med": 354.5,
3393
+ "Med Resp": 354.5
3394
+ },
3395
+ "Multi-Turn": {
3396
+ "Min": 4,
3397
+ "Max": 12120,
3398
+ "Med": 2191.5,
3399
+ "Med Resp": 2191.5
3400
+ }
3401
+ },
3402
  "Mi:dm 2.0 Base Instruct": {
3403
  "Overall": {
3404
  "Min": 1,
src/data/open/stats.csv CHANGED
@@ -1,83 +1,98 @@
1
- "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Time to First Answer Token" "End-to-End Response Time" "Speed" "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
2
- "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
3
- "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
4
- "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "11.673096776008606" "" "" "Proprietary" "Think" "On" "64.57" "67.0" "70.0" "72.51" "82.64" "65.52" "52.07" "51.43" "67.06" "59.55" "45.64"
5
- "Claude 4.5 Opus (think)" "https://www.anthropic.com/claude/opus" "" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "63.5" "62.5" "73.71" "77.69" "82.76" "52.89" "58.57" "63.49" "56.74" "45.97"
6
- "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
7
- "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
8
- "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
9
- "Gemini 3 Pro Preview (Thinking Level: High)" "" "" "Gemini" "1930.5" "378.0" "" "27.89457416534424" "" "" "Proprietary" "Think" "On" "62.48" "59.5" "64.38" "76.49" "78.93" "70.69" "39.67" "65.71" "61.51" "58.15" "48.99"
10
- "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
11
- "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
12
- "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
13
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
14
- top-p: 0.95" "Grok" "" "" "" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
15
- "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
16
- "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
17
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
18
- top-p: 0.95" "moonshot" "1692.0" "330.0" "45.35071495282816" "70.24291145801544" "24.28866627458008" "1000.0" "Open" "Think" "On" "56.84" "58.25" "50.31" "69.72" "77.27" "60.92" "44.63" "38.57" "59.92" "52.25" "44.3"
19
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
20
- top-p: 0.95" "Qwen" "2404.5" "423.0" "58.364528823897146" "80.01045334339142" "31.05335185752473" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
21
- "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
22
- "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
23
- top-p: 0.95" "GLM" "1442.0" "604.0" "25.261904125875603" "62.74959444999695" "23.293980879127712" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
24
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
25
- top-p: 0.95" "GLM" "2645.5" "522.0" "81.414294828216" "110.0251989364624" "24.034975709814915" "355.0" "Open" "Hybrid" "On" "53.3" "57.5" "51.25" "71.31" "71.9" "53.45" "24.79" "28.57" "58.33" "44.38" "43.29"
26
- "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "53.06" "55.0" "55.94" "68.13" "70.25" "47.7" "23.97" "30.0" "60.71" "46.63" "42.28"
27
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
28
- top-p: 0.8" "Qwen" "433.0" "433.0" "0.1387630701065063" "14.262101531028748" "31.359207215387023" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
29
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
30
- top-p: 0.95" "DeepSeek" "710.5" "356.0" "14.323043732258654" "35.32915151119232" "16.64962453842425" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
31
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
32
- top-p: 0.95" "DeepSeek" "831.5" "377.0" "17.055466594943752" "47.552645206451416" "17.890508425613742" "671.0" "Open" "Hybrid" "On" "51.37" "51.5" "52.19" "69.32" "73.14" "51.72" "25.62" "38.57" "57.14" "38.76" "40.94"
33
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
34
- top-p: 0.95" "Qwen" "2830.0" "351.0" "76.69636714346468" "82.98819828033447" "72.08537789542703" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
35
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
36
  temperature: 1.0
37
- top-p: 1.0" "GPT" "759.5" "370.5" "7.694922740481965" "12.121336698532104" "103.31935460342277" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
38
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
39
  temperature: 0.6
40
- top-p: 0.95" "DeepSeek" "1177.5" "554.0" "28.558620557701" "70.60028326511383" "17.625838630215213" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
41
- "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "6.750162363052368" "17.980867981910706" "42.58336125102582" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
 
42
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
43
  temperature: 1.3
44
- top-p: 0.95" "DeepSeek" "408.0" "408.0" "0.211452841758728" "23.47111320495605" "17.62487523518351" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
45
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
46
- top-p: 0.95" "Qwen" "1113.0" "390.0" "27.26490248867746" "39.635579228401184" "37.74973909656839" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
47
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
48
- top-p: 0.8" "Qwen" "441.5" "441.5" "7.902002811431885" "19.310550212860107" "42.44958664990833" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
49
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
50
- top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "" "" "" "230.0" "Open" "Think" "On" "42.43" "48.75" "35.62" "53.39" "57.02" "43.1" "44.63" "28.57" "49.21" "30.06" "31.21"
51
- "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "0.6553128957748413" "7.924791574478149" "57.95526130360478" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
52
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
53
  temperature: 1.0
54
- top-p: 1.0" "GPT" "953.5" "326.0" "26.04652036871504" "29.767700791358948" "108.53633696847938" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
55
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
56
- top-p: 0.95" "Gemma" "380.0" "380.0" "3.391351342201233" "13.303653597831726" "39.94050750809835" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
57
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
58
- top-p: 0.95" "Alibaba" "1147.0" "408.0" "45.23295979184195" "52.38741266727448" "62.676624491545525" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
59
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
60
- top-p: 0.95" "mistralai" "369.0" "369.0" "3.2450859546661377" "13.907460689544678" "36.382163796915904" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
61
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
62
- top-p: 0.95" "LLM360" "1835.0" "486.0" "24.29692639716904" "43.2994556427002" "42.72123101353567" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
63
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
64
- top-p: 0.95" "KAT" "397.0" "397.0" "0.0622165203094482" "8.492375493049622" "50.601864763867184" "72.0" "Open" "Instruct" "Off" "33.94" "29.25" "44.06" "46.22" "46.69" "25.86" "18.18" "20.0" "42.86" "25.56" "25.5"
65
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
66
  top-p: 0.95
67
- top-k: 50" "allenai" "3360.5" "473.0" "60.18788400716624" "77.51256728172302" "44.30514641537086" "32.0" "Open" "Think" "On" "33.94" "35.25" "30.94" "57.37" "66.53" "33.33" "28.93" "24.29" "34.52" "11.8" "19.8"
68
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
69
- top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "51.19312170664125" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
70
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
71
- top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
72
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
73
- top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
 
74
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
75
- top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
76
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
77
- top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
78
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
79
- top-p: 0.95" "Solar" "260.0" "260.0" "12.68759036064148" "39.93266606330872" "11.341528558845871" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
80
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
81
- top-p: 0.7" "KT" "316.0" "316.0" "3.07414984703064" "11.089128971099854" "41.13515299318637" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
82
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
83
- top-p: 0.95" "Kakao" "414.0" "414.0" "2.999279260635376" "14.037613034248352" "39.50831768498445" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
4
+ "GPT-5.2 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5.2" "Reasoning: medium" "GPT" "347.0" "264.0" "" "Proprietary" "Hybrid" "On" "66.18" "69.25" "65.62" "71.31" "78.51" "70.69" "52.07" "51.43" "80.56" "55.9" "55.03"
5
+ "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "Proprietary" "Hybrid" "On" "64.57" "67.0" "70.0" "72.51" "82.64" "65.52" "52.07" "51.43" "67.06" "59.55" "45.64"
6
+ "Claude 4.5 Opus (think, budget: 16K)" "https://www.anthropic.com/claude/opus" "thinking budget: 16K" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "63.5" "62.5" "73.71" "77.69" "82.76" "52.89" "58.57" "63.49" "56.74" "45.97"
7
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
8
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
9
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
10
+ "Gemini 3 Pro Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/pro/" "Thinking Level: High" "Gemini" "1930.5" "378.0" "" "Proprietary" "Think" "On" "62.48" "59.5" "64.38" "76.49" "78.93" "70.69" "39.67" "65.71" "61.51" "58.15" "48.99"
11
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
12
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
13
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
14
+ "Gemini 3 Flash Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/flash/" "Thinking Level: High" "Gemini" "1296.5" "424.5" "" "Proprietary" "Think" "On" "59.26" "59.5" "59.69" "75.3" "79.34" "63.22" "34.71" "57.14" "59.92" "50.84" "46.31"
15
+ "GLM-4.7 FP8" "https://huggingface.co/zai-org/GLM-4.7-FP8" "temperature: 1.0
16
+ top-p: 0.95" "GLM" "2252.5" "328.0" "358.0" "Open" "Hybrid" "On" "59.22" "62.75" "60.0" "75.3" "75.21" "58.05" "29.75" "35.71" "66.67" "53.93" "45.3"
17
+ "DeepSeek V3.2 Speciale" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale" "temperature: 1.0
18
+ top-p: 0.95" "DeepSeek" "3226.5" "249.5" "671.0" "Open" "Think" "On" "59.14" "64.0" "67.19" "74.5" "78.1" "48.28" "20.66" "58.57" "66.27" "53.09" "38.93"
19
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
20
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
21
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
22
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
23
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
24
+ top-p: 0.95" "moonshot" "1692.0" "330.0" "1000.0" "Open" "Think" "On" "56.84" "58.25" "50.31" "69.72" "77.27" "60.92" "44.63" "38.57" "59.92" "52.25" "44.3"
25
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
26
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
27
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
28
+ "GLM-4.5 FP8" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
29
+ top-p: 0.95" "GLM" "1442.0" "604.0" "358.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
30
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
31
+ top-p: 0.95" "GLM" "2645.5" "522.0" "358.0" "Open" "Hybrid" "On" "53.3" "57.5" "51.25" "71.31" "71.9" "53.45" "24.79" "28.57" "58.33" "44.38" "43.29"
32
+ "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "Proprietary" "Think" "On" "53.06" "55.0" "55.94" "68.13" "70.25" "47.7" "23.97" "30.0" "60.71" "46.63" "42.28"
33
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
34
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
35
+ "DeepSeek V3.2" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" "temperature: 1.0
36
+ top-p: 0.95" "DeepSeek" "762.5" "312.0" "671.0" "Open" "Think" "On" "52.17" "51.25" "51.56" "70.92" "72.31" "51.15" "36.36" "37.14" "60.32" "40.17" "39.93"
37
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
38
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
39
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
40
+ top-p: 0.95" "DeepSeek" "831.5" "377.0" "671.0" "Open" "Hybrid" "On" "51.37" "51.5" "52.19" "69.32" "73.14" "51.72" "25.62" "38.57" "57.14" "38.76" "40.94"
41
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
42
+ top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
43
+ "MiMo V2 Flash" "https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash" "temperature: 0.8
44
+ top-p: 0.95" "XiaomiMiMo" "1477.5" "373.0" "309.0" "Open" "Think" "On" "50.32" "54.0" "48.12" "67.73" "68.18" "44.83" "48.76" "28.57" "53.97" "40.73" "35.91"
45
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
46
  temperature: 1.0
47
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
48
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
49
  temperature: 0.6
50
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
51
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
52
+ "Mistral Large 3 675B Instruct 2512" "https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512" "temperature: 0.15" "mistralai" "448.0" "448.0" "675.0" "Open" "Instruct" "Off" "45.21" "44.0" "50.62" "65.34" "60.33" "33.33" "14.88" "37.14" "53.97" "36.52" "35.91"
53
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
54
  temperature: 1.3
55
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
56
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
57
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
58
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
59
+ top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
60
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
61
+ top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "230.0" "Open" "Think" "On" "42.43" "48.75" "35.62" "53.39" "57.02" "43.1" "44.63" "28.57" "49.21" "30.06" "31.21"
62
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
63
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
64
  temperature: 1.0
65
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
66
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
67
+ top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
68
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
69
+ top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
70
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
71
+ top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
72
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
73
+ top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
74
+ "Kanana 2 30B A3B Thinking" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-thinking" "temperature: 0.6
75
+ top-p: 0.95
76
+ top-k: 20" "Kakao" "4263.0" "854.5" "31.0" "Open" "Think" "On" "34.5" "37.5" "25.0" "57.77" "54.55" "39.66" "20.66" "15.71" "38.1" "24.72" "20.47"
77
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
78
+ top-p: 0.95" "KAT" "397.0" "397.0" "72.0" "Open" "Instruct" "Off" "33.94" "29.25" "44.06" "46.22" "46.69" "25.86" "18.18" "20.0" "42.86" "25.56" "25.5"
79
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
80
  top-p: 0.95
81
+ top-k: 50" "allenai" "3360.5" "473.0" "32.0" "Open" "Think" "On" "33.94" "35.25" "30.94" "57.37" "66.53" "33.33" "28.93" "24.29" "34.52" "11.8" "19.8"
82
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
83
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
84
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
85
+ top-p: 0.95" "Apriel" "2238.0" "375.0" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
86
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
87
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
88
+ "Kanana 2 30B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct" "temperature: 0" "Kakao" "1195.0" "1195.0" "31.0" "Open" "Instruct" "Off" "30.84" "38.0" "25.62" "35.86" "47.11" "37.93" "23.97" "18.57" "35.32" "20.51" "19.46"
89
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
90
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
91
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
92
+ top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
93
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
94
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
95
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
96
+ top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
97
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
98
+ top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
src/data/open/stats_lang.csv CHANGED
@@ -1,83 +1,98 @@
1
- "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Time to First Answer Token" "End-to-End Response Time" "Speed" "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
2
- "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
3
- "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
4
- "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "11.673096776008606" "" "" "Proprietary" "Think" "On" "64.57" "57.78" "62.5" "65.06" "62.8" "65.56" "60.22" "65.36" "68.11" "74.46" "70.49" "67.42" "63.41"
5
- "Claude 4.5 Opus (think)" "https://www.anthropic.com/claude/opus" "" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "59.44" "60.28" "66.27" "64.02" "66.67" "65.19" "63.69" "62.16" "63.59" "64.48" "65.73" "67.07"
6
- "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
7
- "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
8
- "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
9
- "Gemini 3 Pro Preview (Thinking Level: High)" "" "" "Gemini" "1930.5" "378.0" "" "27.89457416534424" "" "" "Proprietary" "Think" "On" "62.48" "59.44" "60.56" "60.24" "62.2" "61.67" "65.19" "63.13" "64.32" "65.76" "65.57" "64.04" "62.2"
10
- "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
11
- "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
12
- "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
13
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
14
- top-p: 0.95" "Grok" "" "" "" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
15
- "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
16
- "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
17
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
18
- top-p: 0.95" "moonshot" "1692.0" "330.0" "45.35071495282816" "70.24291145801544" "24.28866627458008" "1000.0" "Open" "Think" "On" "56.84" "50.0" "57.5" "60.84" "62.2" "53.33" "54.14" "61.45" "53.51" "59.24" "59.56" "56.18" "61.59"
19
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
20
- top-p: 0.95" "Qwen" "2404.5" "423.0" "58.364528823897146" "80.01045334339142" "31.05335185752473" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
21
- "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
22
- "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
23
- top-p: 0.95" "GLM" "1442.0" "604.0" "25.261904125875603" "62.74959444999695" "23.293980879127712" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
24
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
25
- top-p: 0.95" "GLM" "2645.5" "522.0" "81.414294828216" "110.0251989364624" "24.034975709814915" "355.0" "Open" "Hybrid" "On" "53.3" "49.17" "54.17" "54.22" "56.71" "52.22" "53.04" "49.16" "56.76" "56.52" "56.28" "53.93" "50.61"
26
- "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "53.06" "47.78" "51.11" "51.2" "53.66" "51.67" "54.7" "59.22" "51.89" "57.07" "55.74" "57.87" "51.83"
27
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
28
- top-p: 0.8" "Qwen" "433.0" "433.0" "0.1387630701065063" "14.262101531028748" "31.359207215387023" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
29
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
30
- top-p: 0.95" "DeepSeek" "710.5" "356.0" "14.323043732258654" "35.32915151119232" "16.64962453842425" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
31
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
32
- top-p: 0.95" "DeepSeek" "831.5" "377.0" "17.055466594943752" "47.552645206451416" "17.890508425613742" "671.0" "Open" "Hybrid" "On" "51.37" "46.94" "50.83" "51.81" "53.66" "50.0" "53.59" "51.96" "55.14" "53.8" "54.64" "48.31" "50.61"
33
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
34
- top-p: 0.95" "Qwen" "2830.0" "351.0" "76.69636714346468" "82.98819828033447" "72.08537789542703" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
35
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
36
  temperature: 1.0
37
- top-p: 1.0" "GPT" "759.5" "370.5" "7.694922740481965" "12.121336698532104" "103.31935460342277" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
38
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
39
  temperature: 0.6
40
- top-p: 0.95" "DeepSeek" "1177.5" "554.0" "28.558620557701" "70.60028326511383" "17.625838630215213" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
41
- "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "6.750162363052368" "17.980867981910706" "42.58336125102582" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
42
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
43
  temperature: 1.3
44
- top-p: 0.95" "DeepSeek" "408.0" "408.0" "0.211452841758728" "23.47111320495605" "17.62487523518351" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
45
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
46
- top-p: 0.95" "Qwen" "1113.0" "390.0" "27.26490248867746" "39.635579228401184" "37.74973909656839" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
47
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
48
- top-p: 0.8" "Qwen" "441.5" "441.5" "7.902002811431885" "19.310550212860107" "42.44958664990833" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
49
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
50
- top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "" "" "" "230.0" "Open" "Think" "On" "42.43" "31.94" "46.11" "37.35" "45.73" "38.33" "45.3" "45.25" "48.65" "41.3" "46.45" "42.7" "46.95"
51
- "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "0.6553128957748413" "7.924791574478149" "57.95526130360478" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
52
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
53
  temperature: 1.0
54
- top-p: 1.0" "GPT" "953.5" "326.0" "26.04652036871504" "29.767700791358948" "108.53633696847938" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
55
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
56
- top-p: 0.95" "Gemma" "380.0" "380.0" "3.391351342201233" "13.303653597831726" "39.94050750809835" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
57
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
58
- top-p: 0.95" "Alibaba" "1147.0" "408.0" "45.23295979184195" "52.38741266727448" "62.676624491545525" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
59
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
60
- top-p: 0.95" "mistralai" "369.0" "369.0" "3.2450859546661377" "13.907460689544678" "36.382163796915904" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
61
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
62
- top-p: 0.95" "LLM360" "1835.0" "486.0" "24.29692639716904" "43.2994556427002" "42.72123101353567" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
63
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
64
- top-p: 0.95" "KAT" "397.0" "397.0" "0.0622165203094482" "8.492375493049622" "50.601864763867184" "72.0" "Open" "Instruct" "Off" "33.94" "25.0" "32.22" "31.93" "37.2" "34.44" "33.15" "43.02" "37.84" "36.96" "37.7" "30.34" "38.41"
65
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
66
  top-p: 0.95
67
- top-k: 50" "allenai" "3360.5" "473.0" "60.18788400716624" "77.51256728172302" "44.30514641537086" "32.0" "Open" "Think" "On" "33.94" "30.56" "41.39" "30.12" "31.1" "25.0" "34.25" "35.75" "33.51" "36.41" "37.16" "31.46" "35.98"
68
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
69
- top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "51.19312170664125" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
70
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
71
- top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
72
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
73
- top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
74
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
75
- top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "23.33" "27.22" "30.12" "32.32" "20.56" "20.99" "26.26" "25.95" "25.54" "30.6" "23.6" "25.0"
76
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
77
- top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
78
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
79
- top-p: 0.95" "Solar" "260.0" "260.0" "12.68759036064148" "39.93266606330872" "11.341528558845871" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
80
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
81
- top-p: 0.7" "KT" "316.0" "316.0" "3.07414984703064" "11.089128971099854" "41.13515299318637" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
82
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
83
- top-p: 0.95" "Kakao" "414.0" "414.0" "2.999279260635376" "14.037613034248352" "39.50831768498445" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
4
+ "GPT-5.2 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5.2" "Reasoning: medium" "GPT" "347.0" "264.0" "" "Proprietary" "Hybrid" "On" "66.18" "61.67" "61.39" "69.28" "64.63" "68.89" "66.3" "70.95" "63.24" "68.48" "70.49" "70.22" "68.29"
5
+ "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "Proprietary" "Hybrid" "On" "64.57" "57.78" "62.5" "65.06" "62.8" "65.56" "60.22" "65.36" "68.11" "74.46" "70.49" "67.42" "63.41"
6
+ "Claude 4.5 Opus (think, budget: 16K)" "https://www.anthropic.com/claude/opus" "thinking budget: 16K" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "59.44" "60.28" "66.27" "64.02" "66.67" "65.19" "63.69" "62.16" "63.59" "64.48" "65.73" "67.07"
7
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
8
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
9
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
10
+ "Gemini 3 Pro Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/pro/" "Thinking Level: High" "Gemini" "1930.5" "378.0" "" "Proprietary" "Think" "On" "62.48" "59.44" "60.56" "60.24" "62.2" "61.67" "65.19" "63.13" "64.32" "65.76" "65.57" "64.04" "62.2"
11
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
12
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
13
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
14
+ "Gemini 3 Flash Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/flash/" "Thinking Level: High" "Gemini" "1296.5" "424.5" "" "Proprietary" "Think" "On" "59.26" "53.89" "57.22" "61.45" "57.32" "56.67" "61.33" "57.54" "58.92" "64.67" "67.76" "60.11" "61.59"
15
+ "GLM-4.7 FP8" "https://huggingface.co/zai-org/GLM-4.7-FP8" "temperature: 1.0
16
+ top-p: 0.95" "GLM" "2252.5" "328.0" "358.0" "Open" "Hybrid" "On" "59.22" "54.17" "55.28" "63.86" "63.41" "55.0" "58.56" "62.01" "61.08" "63.59" "61.75" "66.29" "54.88"
17
+ "DeepSeek V3.2 Speciale" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale" "temperature: 1.0
18
+ top-p: 0.95" "DeepSeek" "3226.5" "249.5" "671.0" "Open" "Think" "On" "59.14" "50.83" "58.06" "63.25" "57.93" "58.89" "58.56" "58.66" "60.0" "65.22" "66.12" "59.55" "62.2"
19
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
20
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
21
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
22
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
23
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
24
+ top-p: 0.95" "moonshot" "1692.0" "330.0" "1000.0" "Open" "Think" "On" "56.84" "50.0" "57.5" "60.84" "62.2" "53.33" "54.14" "61.45" "53.51" "59.24" "59.56" "56.18" "61.59"
25
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
26
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
27
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
28
+ "GLM-4.5 FP8" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
29
+ top-p: 0.95" "GLM" "1442.0" "604.0" "358.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
30
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
31
+ top-p: 0.95" "GLM" "2645.5" "522.0" "358.0" "Open" "Hybrid" "On" "53.3" "49.17" "54.17" "54.22" "56.71" "52.22" "53.04" "49.16" "56.76" "56.52" "56.28" "53.93" "50.61"
32
+ "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "Proprietary" "Think" "On" "53.06" "47.78" "51.11" "51.2" "53.66" "51.67" "54.7" "59.22" "51.89" "57.07" "55.74" "57.87" "51.83"
33
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
34
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
35
+ "DeepSeek V3.2" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" "temperature: 1.0
36
+ top-p: 0.95" "DeepSeek" "762.5" "312.0" "671.0" "Open" "Think" "On" "52.17" "47.5" "49.44" "53.61" "50.61" "50.56" "54.14" "59.22" "52.43" "57.07" "56.28" "44.94" "57.93"
37
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
38
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
39
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
40
+ top-p: 0.95" "DeepSeek" "831.5" "377.0" "671.0" "Open" "Hybrid" "On" "51.37" "46.94" "50.83" "51.81" "53.66" "50.0" "53.59" "51.96" "55.14" "53.8" "54.64" "48.31" "50.61"
41
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
42
+ top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
43
+ "MiMo V2 Flash" "https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash" "temperature: 0.8
44
+ top-p: 0.95" "XiaomiMiMo" "1477.5" "373.0" "309.0" "Open" "Think" "On" "50.32" "42.22" "53.06" "49.4" "54.27" "47.78" "51.93" "53.63" "52.97" "54.89" "54.64" "42.13" "52.44"
45
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
46
  temperature: 1.0
47
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
48
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
49
  temperature: 0.6
50
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
51
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
52
+ "Mistral Large 3 675B Instruct 2512" "https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512" "temperature: 0.15" "mistralai" "448.0" "448.0" "675.0" "Open" "Instruct" "Off" "45.21" "41.39" "44.17" "50.6" "46.34" "46.11" "43.65" "45.81" "44.32" "49.46" "49.18" "42.13" "44.51"
53
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
54
  temperature: 1.3
55
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
56
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
57
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
58
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
59
+ top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
60
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
61
+ top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "230.0" "Open" "Think" "On" "42.43" "31.94" "46.11" "37.35" "45.73" "38.33" "45.3" "45.25" "48.65" "41.3" "46.45" "42.7" "46.95"
62
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
63
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
64
  temperature: 1.0
65
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
66
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
67
+ top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
68
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
69
+ top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
70
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
71
+ top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
72
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
73
+ top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
74
+ "Kanana 2 30B A3B Thinking" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-thinking" "temperature: 0.6
75
+ top-p: 0.95
76
+ top-k: 20" "Kakao" "4263.0" "854.5" "31.0" "Open" "Think" "On" "34.5" "25.28" "43.06" "38.55" "40.24" "25.0" "34.25" "37.99" "32.43" "34.24" "37.7" "28.65" "38.41"
77
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
78
+ top-p: 0.95" "KAT" "397.0" "397.0" "72.0" "Open" "Instruct" "Off" "33.94" "25.0" "32.22" "31.93" "37.2" "34.44" "33.15" "43.02" "37.84" "36.96" "37.7" "30.34" "38.41"
79
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
80
  top-p: 0.95
81
+ top-k: 50" "allenai" "3360.5" "473.0" "32.0" "Open" "Think" "On" "33.94" "30.56" "41.39" "30.12" "31.1" "25.0" "34.25" "35.75" "33.51" "36.41" "37.16" "31.46" "35.98"
82
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
83
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
84
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
85
+ top-p: 0.95" "Apriel" "2238.0" "375.0" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
86
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
87
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
88
+ "Kanana 2 30B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct" "temperature: 0" "Kakao" "1195.0" "1195.0" "31.0" "Open" "Instruct" "Off" "30.84" "33.06" "39.44" "37.35" "33.54" "17.78" "26.52" "25.14" "30.81" "29.35" "31.15" "23.03" "32.93"
89
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
90
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "14.8" "Open" "Think" "On" "25.81" "23.33" "27.22" "30.12" "32.32" "20.56" "20.99" "26.26" "25.95" "25.54" "30.6" "23.6" "25.0"
91
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
92
+ top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
93
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
94
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
95
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
96
+ top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
97
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
98
+ top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
src/data/open/time_data.json DELETED
The diff for this file is too large to render. See raw diff
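For reference, below is a minimal sketch of how the updated results file could be loaded after this change. It assumes the file really stores double-quoted, whitespace-separated fields exactly as rendered in the diff above (including multi-line sampling-parameter comments); the path and the load_results helper are hypothetical, and only the 23-column header ("Model Name" through "VI") is taken from the diff.

import shlex
import pandas as pd

def load_results(path: str) -> pd.DataFrame:
    # Every field is double-quoted and whitespace-separated; comment fields may
    # span multiple lines (e.g. "temperature: 0.6\ntop-p: 0.95"), which shlex
    # keeps as a single token because the newline sits inside the quotes.
    with open(path, encoding="utf-8") as f:
        tokens = shlex.split(f.read())

    n_cols = 23  # header row: "Model Name" ... "VI" (speed/time columns removed)
    header, body = tokens[:n_cols], tokens[n_cols:]
    rows = [body[i:i + n_cols] for i in range(0, len(body), n_cols)]
    df = pd.DataFrame(rows, columns=header)

    # Length, parameter-size, and score columns arrive as strings; coerce them
    # to numbers (empty strings for proprietary models become NaN).
    numeric_cols = header[4:7] + header[10:]
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    return df

# Example usage (the path is an assumption, not part of the commit):
# df = load_results("src/data/open/results.txt")
# print(df.sort_values("Overall", ascending=False)[["Model Name", "Overall"]].head())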