Jongyoon Song committed on
Commit ef2b66d · 1 Parent(s): 7d35c34

Update evaluation results (251224) & Remove time and speed-related results

app.py CHANGED
@@ -242,127 +242,6 @@ def create_benchmark_tab_content(data_prefix: str):
  )
  gr.HTML('</div>')
 
- # --- Speed Med Bar Plot Section (NEW) ---
- import json
- with open(f"src/data/{data_prefix}/time_data.json", "r") as f:
- time_data = json.load(f)
- time_data_state = gr.State(value=time_data)
-
- gr.HTML("""
- <div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
- <div class="section-header">
- <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
- Speed per GPU
- </h3>
- </div>
- <p style="color: var(--text-secondary); margin-bottom: 8px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
- Speed per GPU represents the number of tokens generated per second divided by the number of GPUs during the inference.<br>
- </p>
- <p style="font-size:0.95em; color:var(--text-secondary); margin-top:0.5px;">
- <b>Setting</b>: We measured the speed in an H100 GPU environment consisting of 4 nodes with 8 GPUs each, using vLLM and Ray to set the tensor parallel size between 1 and 32 (In the plot, <i>GPU</i> refers to the tensor parallel size).<br>
- We performed inference by sending an asynchronous request to the served model, and we set the concurrency to 32. <br>
- <b>Note</b>: We measured the speed by directly serving open-source models, and proprietary models were excluded from the plot.
- </p>
- """)
-
- # --- Speed Bar Plot UI: Row with left (category selector) and right (min/max dials) ---
- category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
- default_category = "Overall"
- default_x_axis_sort_by = "Overall Score"
- with gr.Row():
- with gr.Column(scale=1):
- x_axis_sort_by = gr.Radio(
- choices=["Overall Score", "Speed per GPU"],
- value="Overall Score",
- label="Sort X-Axis by",
- elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}", # Make elem_id unique
- elem_classes=["x-axis-btn-radio"],
- interactive=True,
- show_label=True
- )
-
- with gr.Column(scale=1):
- min_max_score_slider = RangeSlider(
- minimum=0,
- maximum=100,
- value=(0, 100),
- step=1,
- label="Minimum and Maximum Overall Score",
- interactive=True
- )
-
- with gr.Column(scale=1):
- min_max_param_size_slider = RangeSlider(
- minimum=0,
- maximum=1000,
- value=(0, 1000),
- step=1,
- label="Minimum and Maximum Parameter Size (B)",
- interactive=True
- )
-
- # Speed Bar Plot
- from vis_utils import create_speed_med_bar_plot
- speed_med_bar_plot = gr.Plot(
- label="",
- value=create_speed_med_bar_plot(
- initial_df_cat,
- time_data,
- min_size=0,
- max_size=1000,
- min_score=0,
- max_score=100,
- category=default_category,
- theme="light",
- x_axis_sort_by=default_x_axis_sort_by,
- mode=args.mode
- ),
- elem_classes=["speed-med-bar-plot", "plot-container"]
- )
- gr.HTML("</div>")
-
- # --- Event handler: update Speed bar plot and dials when category or dials change ---
- def update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, current_time_data_state, current_leaderboard_df=None):
- df = current_leaderboard_df if current_leaderboard_df is not None else initial_df_cat
- return create_speed_med_bar_plot(
- df,
- current_time_data_state,
- min_size=min_max_size[0],
- max_size=min_max_size[1],
- min_score=min_max_score[0],
- max_score=min_max_score[1],
- theme="light",
- x_axis_sort_by=x_axis_sort_by,
- mode=args.mode
- )
-
- # Connect category selector to dials and plot
- x_axis_sort_by.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- min_max_param_size_slider.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- min_max_score_slider.change(
- fn=update_speed_med_bar_plot,
- inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
- # Connect leaderboard filters to dials and plot (if leaderboard_tab_cat provides a filtered DataFrame state)
- if "df_state" in leaderboard_tab_cat:
- leaderboard_tab_cat["df_state"].change(
- fn=lambda df, x_axis_sort_by, min_max_size, min_max_score, time_data: update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, time_data, df),
- inputs=[leaderboard_tab_cat["df_state"], x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
- outputs=speed_med_bar_plot
- )
-
  # Update radar chart when model_selector_cat selection changes
  def update_radar_chart_cat(selected_display_names):
  # If no selection, fallback to top-5
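For reference, the "Speed per GPU" metric removed above is simply tokens generated per second divided by the number of GPUs (the tensor parallel size). A minimal sketch of that computation, assuming hypothetical inputs (per-request generated-token counts, a wall-clock window, and the TP size); the function name and signature are illustrative and are not part of app.py or vis_utils:

from typing import Sequence

# Illustrative only -- names and inputs are assumptions, not the Space's actual code.
def speed_per_gpu(generated_tokens: Sequence[int], elapsed_seconds: float, tensor_parallel_size: int) -> float:
    """Tokens generated per second, divided by the number of GPUs serving the model."""
    if elapsed_seconds <= 0 or tensor_parallel_size <= 0:
        raise ValueError("elapsed_seconds and tensor_parallel_size must be positive")
    tokens_per_second = sum(generated_tokens) / elapsed_seconds
    return tokens_per_second / tensor_parallel_size

# Example: requests generating 120,000 tokens in 60 s on a TP=4 deployment -> 500.0 tokens/s per GPU
print(speed_per_gpu([120_000], 60.0, 4))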
src/about.py CHANGED
@@ -28,7 +28,7 @@ LINK = """
  <a href="https://arxiv.org/abs/2509.22715" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Paper</a> |
  <span>🌠</span>
  <a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
- <span>🔭</span> Updated: 2025-10-15
+ <span>🔭</span> Updated: 2025-12-24
  </h3>
  """
 
src/data/open/length_data.json CHANGED
@@ -1223,7 +1223,75 @@
1223
  "Med Resp": 2282.5
1224
  }
1225
  },
1226
- "Claude 4.5 Opus (think)": {
1227
  "Overall": {
1228
  "Min": -10,
1229
  "Max": -2,
@@ -1291,7 +1359,7 @@
1291
  "Med Resp": -3.0
1292
  }
1293
  },
1294
- "GLM-4.5 FP8 (think)": {
1295
  "Overall": {
1296
  "Min": 75,
1297
  "Max": 65432,
@@ -1427,6 +1495,74 @@
1427
  "Med Resp": 1208.5
1428
  }
1429
  },
1430
  "MiniMax-M2 (230B A10B)": {
1431
  "Overall": {
1432
  "Min": 64,
@@ -1631,6 +1767,74 @@
1631
  "Med Resp": 1526.0
1632
  }
1633
  },
1634
  "Qwen3 32B (think)": {
1635
  "Overall": {
1636
  "Min": 164,
@@ -1903,6 +2107,278 @@
1903
  "Med Resp": -3.0
1904
  }
1905
  },
1906
  "Claude 4 Opus (20250514) (think)": {
1907
  "Overall": {
1908
  "Min": -10,
@@ -2243,6 +2719,74 @@
2243
  "Med Resp": 1558.0
2244
  }
2245
  },
2246
  "GPT-5 nano (Reasoning: medium)": {
2247
  "Overall": {
2248
  "Min": -10,
@@ -2787,6 +3331,74 @@
2787
  "Med Resp": 1279.0
2788
  }
2789
  },
2790
  "Mi:dm 2.0 Base Instruct": {
2791
  "Overall": {
2792
  "Min": 1,
 
1223
  "Med Resp": 2282.5
1224
  }
1225
  },
1226
+ "DeepSeek V3.2 Speciale": {
1227
+ "Overall": {
1228
+ "Min": 160,
1229
+ "Max": 65513,
1230
+ "Med": 3226.5,
1231
+ "Med Resp": 249.5
1232
+ },
1233
+ "Content Generation": {
1234
+ "Min": 186,
1235
+ "Max": 46347,
1236
+ "Med": 3634.0,
1237
+ "Med Resp": 364.0
1238
+ },
1239
+ "Editing": {
1240
+ "Min": 329,
1241
+ "Max": 24883,
1242
+ "Med": 3043.0,
1243
+ "Med Resp": 178.0
1244
+ },
1245
+ "Data Analysis": {
1246
+ "Min": 191,
1247
+ "Max": 64268,
1248
+ "Med": 1640.0,
1249
+ "Med Resp": 67.0
1250
+ },
1251
+ "Reasoning": {
1252
+ "Min": 228,
1253
+ "Max": 65472,
1254
+ "Med": 2211.5,
1255
+ "Med Resp": 165.0
1256
+ },
1257
+ "Hallucination": {
1258
+ "Min": 373,
1259
+ "Max": 23653,
1260
+ "Med": 3253.5,
1261
+ "Med Resp": 258.0
1262
+ },
1263
+ "Safety": {
1264
+ "Min": 331,
1265
+ "Max": 39236,
1266
+ "Med": 2575.0,
1267
+ "Med Resp": 158.0
1268
+ },
1269
+ "Repetition": {
1270
+ "Min": 356,
1271
+ "Max": 65513,
1272
+ "Med": 3357.0,
1273
+ "Med Resp": 246.0
1274
+ },
1275
+ "Summarization": {
1276
+ "Min": 160,
1277
+ "Max": 56309,
1278
+ "Med": 1500.0,
1279
+ "Med Resp": 189.5
1280
+ },
1281
+ "Translation": {
1282
+ "Min": 522,
1283
+ "Max": 25619,
1284
+ "Med": 5143.5,
1285
+ "Med Resp": 281.5
1286
+ },
1287
+ "Multi-Turn": {
1288
+ "Min": 244,
1289
+ "Max": 32258,
1290
+ "Med": 4282.0,
1291
+ "Med Resp": 854.0
1292
+ }
1293
+ },
1294
+ "Claude 4.5 Opus (think, budget: 16K)": {
1295
  "Overall": {
1296
  "Min": -10,
1297
  "Max": -2,
 
1359
  "Med Resp": -3.0
1360
  }
1361
  },
1362
+ "GLM-4.5 FP8": {
1363
  "Overall": {
1364
  "Min": 75,
1365
  "Max": 65432,
 
1495
  "Med Resp": 1208.5
1496
  }
1497
  },
1498
+ "DeepSeek V3.2": {
1499
+ "Overall": {
1500
+ "Min": 134,
1501
+ "Max": 22816,
1502
+ "Med": 762.5,
1503
+ "Med Resp": 312.0
1504
+ },
1505
+ "Content Generation": {
1506
+ "Min": 153,
1507
+ "Max": 5977,
1508
+ "Med": 845.0,
1509
+ "Med Resp": 462.0
1510
+ },
1511
+ "Editing": {
1512
+ "Min": 141,
1513
+ "Max": 6055,
1514
+ "Med": 587.5,
1515
+ "Med Resp": 245.5
1516
+ },
1517
+ "Data Analysis": {
1518
+ "Min": 157,
1519
+ "Max": 13414,
1520
+ "Med": 695.0,
1521
+ "Med Resp": 166.0
1522
+ },
1523
+ "Reasoning": {
1524
+ "Min": 272,
1525
+ "Max": 22816,
1526
+ "Med": 1440.5,
1527
+ "Med Resp": 245.0
1528
+ },
1529
+ "Hallucination": {
1530
+ "Min": 213,
1531
+ "Max": 9501,
1532
+ "Med": 938.5,
1533
+ "Med Resp": 532.5
1534
+ },
1535
+ "Safety": {
1536
+ "Min": 184,
1537
+ "Max": 5304,
1538
+ "Med": 617.0,
1539
+ "Med Resp": 238.0
1540
+ },
1541
+ "Repetition": {
1542
+ "Min": 216,
1543
+ "Max": 7227,
1544
+ "Med": 919.5,
1545
+ "Med Resp": 399.0
1546
+ },
1547
+ "Summarization": {
1548
+ "Min": 134,
1549
+ "Max": 1750,
1550
+ "Med": 471.0,
1551
+ "Med Resp": 197.5
1552
+ },
1553
+ "Translation": {
1554
+ "Min": 154,
1555
+ "Max": 6364,
1556
+ "Med": 565.0,
1557
+ "Med Resp": 301.0
1558
+ },
1559
+ "Multi-Turn": {
1560
+ "Min": 401,
1561
+ "Max": 14066,
1562
+ "Med": 2538.5,
1563
+ "Med Resp": 1261.0
1564
+ }
1565
+ },
1566
  "MiniMax-M2 (230B A10B)": {
1567
  "Overall": {
1568
  "Min": 64,
 
1767
  "Med Resp": 1526.0
1768
  }
1769
  },
1770
+ "MiMo V2 Flash": {
1771
+ "Overall": {
1772
+ "Min": 125,
1773
+ "Max": 69375,
1774
+ "Med": 1477.5,
1775
+ "Med Resp": 373.0
1776
+ },
1777
+ "Content Generation": {
1778
+ "Min": 222,
1779
+ "Max": 65445,
1780
+ "Med": 1321.5,
1781
+ "Med Resp": 500.5
1782
+ },
1783
+ "Editing": {
1784
+ "Min": 265,
1785
+ "Max": 65423,
1786
+ "Med": 1194.0,
1787
+ "Med Resp": 314.0
1788
+ },
1789
+ "Data Analysis": {
1790
+ "Min": 262,
1791
+ "Max": 65439,
1792
+ "Med": 1296.0,
1793
+ "Med Resp": 235.0
1794
+ },
1795
+ "Reasoning": {
1796
+ "Min": 319,
1797
+ "Max": 65430,
1798
+ "Med": 2559.5,
1799
+ "Med Resp": 402.5
1800
+ },
1801
+ "Hallucination": {
1802
+ "Min": 129,
1803
+ "Max": 65447,
1804
+ "Med": 1179.5,
1805
+ "Med Resp": 499.0
1806
+ },
1807
+ "Safety": {
1808
+ "Min": 133,
1809
+ "Max": 5184,
1810
+ "Med": 717.0,
1811
+ "Med Resp": 294.0
1812
+ },
1813
+ "Repetition": {
1814
+ "Min": 295,
1815
+ "Max": 65472,
1816
+ "Med": 2153.5,
1817
+ "Med Resp": 573.5
1818
+ },
1819
+ "Summarization": {
1820
+ "Min": 188,
1821
+ "Max": 64302,
1822
+ "Med": 789.5,
1823
+ "Med Resp": 220.5
1824
+ },
1825
+ "Translation": {
1826
+ "Min": 125,
1827
+ "Max": 65041,
1828
+ "Med": 1738.5,
1829
+ "Med Resp": 339.5
1830
+ },
1831
+ "Multi-Turn": {
1832
+ "Min": 323,
1833
+ "Max": 69375,
1834
+ "Med": 3331.5,
1835
+ "Med Resp": 1361.0
1836
+ }
1837
+ },
1838
  "Qwen3 32B (think)": {
1839
  "Overall": {
1840
  "Min": 164,
 
2107
  "Med Resp": -3.0
2108
  }
2109
  },
2110
+ "GPT-5.2 (Reasoning: medium)": {
2111
+ "Overall": {
2112
+ "Min": 11,
2113
+ "Max": 7735,
2114
+ "Med": 347.0,
2115
+ "Med Resp": 264.0
2116
+ },
2117
+ "Content Generation": {
2118
+ "Min": 12,
2119
+ "Max": 7735,
2120
+ "Med": 537.0,
2121
+ "Med Resp": 370.0
2122
+ },
2123
+ "Editing": {
2124
+ "Min": 11,
2125
+ "Max": 1562,
2126
+ "Med": 173.5,
2127
+ "Med Resp": 166.0
2128
+ },
2129
+ "Data Analysis": {
2130
+ "Min": 18,
2131
+ "Max": 3954,
2132
+ "Med": 222.0,
2133
+ "Med Resp": 98.0
2134
+ },
2135
+ "Reasoning": {
2136
+ "Min": 29,
2137
+ "Max": 6895,
2138
+ "Med": 445.5,
2139
+ "Med Resp": 246.5
2140
+ },
2141
+ "Hallucination": {
2142
+ "Min": 72,
2143
+ "Max": 3525,
2144
+ "Med": 633.0,
2145
+ "Med Resp": 357.5
2146
+ },
2147
+ "Safety": {
2148
+ "Min": 58,
2149
+ "Max": 2808,
2150
+ "Med": 434.0,
2151
+ "Med Resp": 285.0
2152
+ },
2153
+ "Repetition": {
2154
+ "Min": 34,
2155
+ "Max": 5202,
2156
+ "Med": 272.0,
2157
+ "Med Resp": 223.0
2158
+ },
2159
+ "Summarization": {
2160
+ "Min": 37,
2161
+ "Max": 2339,
2162
+ "Med": 201.0,
2163
+ "Med Resp": 194.5
2164
+ },
2165
+ "Translation": {
2166
+ "Min": 12,
2167
+ "Max": 3684,
2168
+ "Med": 307.0,
2169
+ "Med Resp": 283.5
2170
+ },
2171
+ "Multi-Turn": {
2172
+ "Min": 41,
2173
+ "Max": 7003,
2174
+ "Med": 983.5,
2175
+ "Med Resp": 844.5
2176
+ }
2177
+ },
2178
+ "Gemini 3 Flash Preview (Thinking Level: High)": {
2179
+ "Overall": {
2180
+ "Min": 137,
2181
+ "Max": 24472,
2182
+ "Med": 1296.5,
2183
+ "Med Resp": 424.5
2184
+ },
2185
+ "Content Generation": {
2186
+ "Min": 248,
2187
+ "Max": 16374,
2188
+ "Med": 1368.5,
2189
+ "Med Resp": 535.5
2190
+ },
2191
+ "Editing": {
2192
+ "Min": 137,
2193
+ "Max": 10610,
2194
+ "Med": 1113.5,
2195
+ "Med Resp": 338.0
2196
+ },
2197
+ "Data Analysis": {
2198
+ "Min": 166,
2199
+ "Max": 13595,
2200
+ "Med": 923.0,
2201
+ "Med Resp": 232.0
2202
+ },
2203
+ "Reasoning": {
2204
+ "Min": 318,
2205
+ "Max": 24472,
2206
+ "Med": 1210.5,
2207
+ "Med Resp": 556.0
2208
+ },
2209
+ "Hallucination": {
2210
+ "Min": 349,
2211
+ "Max": 5023,
2212
+ "Med": 1295.5,
2213
+ "Med Resp": 639.5
2214
+ },
2215
+ "Safety": {
2216
+ "Min": 380,
2217
+ "Max": 5510,
2218
+ "Med": 1297.0,
2219
+ "Med Resp": 482.0
2220
+ },
2221
+ "Repetition": {
2222
+ "Min": 309,
2223
+ "Max": 7743,
2224
+ "Med": 1477.5,
2225
+ "Med Resp": 389.5
2226
+ },
2227
+ "Summarization": {
2228
+ "Min": 306,
2229
+ "Max": 18709,
2230
+ "Med": 905.5,
2231
+ "Med Resp": 195.0
2232
+ },
2233
+ "Translation": {
2234
+ "Min": 289,
2235
+ "Max": 17871,
2236
+ "Med": 1421.0,
2237
+ "Med Resp": 381.5
2238
+ },
2239
+ "Multi-Turn": {
2240
+ "Min": 231,
2241
+ "Max": 11926,
2242
+ "Med": 3075.5,
2243
+ "Med Resp": 1466.5
2244
+ }
2245
+ },
2246
+ "Kanana 2 30B A3B Thinking": {
2247
+ "Overall": {
2248
+ "Min": 584,
2249
+ "Max": 247274,
2250
+ "Med": 4263.0,
2251
+ "Med Resp": 854.5
2252
+ },
2253
+ "Content Generation": {
2254
+ "Min": 1055,
2255
+ "Max": 139421,
2256
+ "Med": 3898.5,
2257
+ "Med Resp": 1028.0
2258
+ },
2259
+ "Editing": {
2260
+ "Min": 747,
2261
+ "Max": 134253,
2262
+ "Med": 3199.0,
2263
+ "Med Resp": 606.5
2264
+ },
2265
+ "Data Analysis": {
2266
+ "Min": 618,
2267
+ "Max": 120325,
2268
+ "Med": 3402.0,
2269
+ "Med Resp": 509.0
2270
+ },
2271
+ "Reasoning": {
2272
+ "Min": 1042,
2273
+ "Max": 160440,
2274
+ "Med": 6428.5,
2275
+ "Med Resp": 925.5
2276
+ },
2277
+ "Hallucination": {
2278
+ "Min": 760,
2279
+ "Max": 137639,
2280
+ "Med": 4215.0,
2281
+ "Med Resp": 1061.5
2282
+ },
2283
+ "Safety": {
2284
+ "Min": 787,
2285
+ "Max": 116591,
2286
+ "Med": 3686.0,
2287
+ "Med Resp": 867.0
2288
+ },
2289
+ "Repetition": {
2290
+ "Min": 1238,
2291
+ "Max": 134651,
2292
+ "Med": 8164.0,
2293
+ "Med Resp": 517.5
2294
+ },
2295
+ "Summarization": {
2296
+ "Min": 584,
2297
+ "Max": 59519,
2298
+ "Med": 2540.0,
2299
+ "Med Resp": 656.5
2300
+ },
2301
+ "Translation": {
2302
+ "Min": 899,
2303
+ "Max": 131258,
2304
+ "Med": 4796.0,
2305
+ "Med Resp": 894.0
2306
+ },
2307
+ "Multi-Turn": {
2308
+ "Min": 1560,
2309
+ "Max": 247274,
2310
+ "Med": 12632.5,
2311
+ "Med Resp": 2593.0
2312
+ }
2313
+ },
2314
+ "Kanana 2 30B A3B Instruct": {
2315
+ "Overall": {
2316
+ "Min": 51,
2317
+ "Max": 177683,
2318
+ "Med": 1195.0,
2319
+ "Med Resp": 1195.0
2320
+ },
2321
+ "Content Generation": {
2322
+ "Min": 58,
2323
+ "Max": 12603,
2324
+ "Med": 1448.0,
2325
+ "Med Resp": 1448.0
2326
+ },
2327
+ "Editing": {
2328
+ "Min": 69,
2329
+ "Max": 51628,
2330
+ "Med": 836.5,
2331
+ "Med Resp": 836.5
2332
+ },
2333
+ "Data Analysis": {
2334
+ "Min": 51,
2335
+ "Max": 11567,
2336
+ "Med": 916.0,
2337
+ "Med Resp": 916.0
2338
+ },
2339
+ "Reasoning": {
2340
+ "Min": 51,
2341
+ "Max": 122001,
2342
+ "Med": 1587.0,
2343
+ "Med Resp": 1587.0
2344
+ },
2345
+ "Hallucination": {
2346
+ "Min": 104,
2347
+ "Max": 17989,
2348
+ "Med": 1419.0,
2349
+ "Med Resp": 1419.0
2350
+ },
2351
+ "Safety": {
2352
+ "Min": 96,
2353
+ "Max": 7485,
2354
+ "Med": 1377.0,
2355
+ "Med Resp": 1377.0
2356
+ },
2357
+ "Repetition": {
2358
+ "Min": 255,
2359
+ "Max": 177683,
2360
+ "Med": 844.0,
2361
+ "Med Resp": 844.0
2362
+ },
2363
+ "Summarization": {
2364
+ "Min": 108,
2365
+ "Max": 4592,
2366
+ "Med": 778.0,
2367
+ "Med Resp": 778.0
2368
+ },
2369
+ "Translation": {
2370
+ "Min": 69,
2371
+ "Max": 30611,
2372
+ "Med": 1059.0,
2373
+ "Med Resp": 1059.0
2374
+ },
2375
+ "Multi-Turn": {
2376
+ "Min": 119,
2377
+ "Max": 74203,
2378
+ "Med": 3252.5,
2379
+ "Med Resp": 3252.5
2380
+ }
2381
+ },
2382
  "Claude 4 Opus (20250514) (think)": {
2383
  "Overall": {
2384
  "Min": -10,
 
2719
  "Med Resp": 1558.0
2720
  }
2721
  },
2722
+ "GLM-4.7 FP8": {
2723
+ "Overall": {
2724
+ "Min": 212,
2725
+ "Max": 131072,
2726
+ "Med": 2252.5,
2727
+ "Med Resp": 328.0
2728
+ },
2729
+ "Content Generation": {
2730
+ "Min": 383,
2731
+ "Max": 18712,
2732
+ "Med": 2094.0,
2733
+ "Med Resp": 423.0
2734
+ },
2735
+ "Editing": {
2736
+ "Min": 384,
2737
+ "Max": 14538,
2738
+ "Med": 2070.5,
2739
+ "Med Resp": 263.0
2740
+ },
2741
+ "Data Analysis": {
2742
+ "Min": 396,
2743
+ "Max": 13525,
2744
+ "Med": 1477.0,
2745
+ "Med Resp": 162.0
2746
+ },
2747
+ "Reasoning": {
2748
+ "Min": 254,
2749
+ "Max": 40295,
2750
+ "Med": 2298.5,
2751
+ "Med Resp": 465.5
2752
+ },
2753
+ "Hallucination": {
2754
+ "Min": 443,
2755
+ "Max": 19838,
2756
+ "Med": 2156.5,
2757
+ "Med Resp": 481.0
2758
+ },
2759
+ "Safety": {
2760
+ "Min": 212,
2761
+ "Max": 10792,
2762
+ "Med": 2121.0,
2763
+ "Med Resp": 197.0
2764
+ },
2765
+ "Repetition": {
2766
+ "Min": 768,
2767
+ "Max": 131072,
2768
+ "Med": 2963.5,
2769
+ "Med Resp": 289.0
2770
+ },
2771
+ "Summarization": {
2772
+ "Min": 599,
2773
+ "Max": 10452,
2774
+ "Med": 1426.0,
2775
+ "Med Resp": 182.5
2776
+ },
2777
+ "Translation": {
2778
+ "Min": 796,
2779
+ "Max": 12247,
2780
+ "Med": 3159.5,
2781
+ "Med Resp": 312.5
2782
+ },
2783
+ "Multi-Turn": {
2784
+ "Min": 420,
2785
+ "Max": 15706,
2786
+ "Med": 5514.5,
2787
+ "Med Resp": 1361.0
2788
+ }
2789
+ },
2790
  "GPT-5 nano (Reasoning: medium)": {
2791
  "Overall": {
2792
  "Min": -10,
 
3331
  "Med Resp": 1279.0
3332
  }
3333
  },
3334
+ "Mistral Large 3 675B Instruct 2512": {
3335
+ "Overall": {
3336
+ "Min": 1,
3337
+ "Max": 12120,
3338
+ "Med": 448.0,
3339
+ "Med Resp": 448.0
3340
+ },
3341
+ "Content Generation": {
3342
+ "Min": 13,
3343
+ "Max": 6162,
3344
+ "Med": 565.0,
3345
+ "Med Resp": 565.0
3346
+ },
3347
+ "Editing": {
3348
+ "Min": 12,
3349
+ "Max": 2369,
3350
+ "Med": 299.0,
3351
+ "Med Resp": 299.0
3352
+ },
3353
+ "Data Analysis": {
3354
+ "Min": 1,
3355
+ "Max": 3902,
3356
+ "Med": 295.0,
3357
+ "Med Resp": 295.0
3358
+ },
3359
+ "Reasoning": {
3360
+ "Min": 1,
3361
+ "Max": 6293,
3362
+ "Med": 530.0,
3363
+ "Med Resp": 530.0
3364
+ },
3365
+ "Hallucination": {
3366
+ "Min": 54,
3367
+ "Max": 4461,
3368
+ "Med": 896.0,
3369
+ "Med Resp": 896.0
3370
+ },
3371
+ "Safety": {
3372
+ "Min": 27,
3373
+ "Max": 4250,
3374
+ "Med": 589.0,
3375
+ "Med Resp": 589.0
3376
+ },
3377
+ "Repetition": {
3378
+ "Min": 89,
3379
+ "Max": 5264,
3380
+ "Med": 448.0,
3381
+ "Med Resp": 448.0
3382
+ },
3383
+ "Summarization": {
3384
+ "Min": 31,
3385
+ "Max": 1357,
3386
+ "Med": 251.5,
3387
+ "Med Resp": 251.5
3388
+ },
3389
+ "Translation": {
3390
+ "Min": 22,
3391
+ "Max": 3529,
3392
+ "Med": 354.5,
3393
+ "Med Resp": 354.5
3394
+ },
3395
+ "Multi-Turn": {
3396
+ "Min": 4,
3397
+ "Max": 12120,
3398
+ "Med": 2191.5,
3399
+ "Med Resp": 2191.5
3400
+ }
3401
+ },
3402
  "Mi:dm 2.0 Base Instruct": {
3403
  "Overall": {
3404
  "Min": 1,
src/data/open/stats.csv CHANGED
@@ -1,83 +1,98 @@
1
- "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Time to First Answer Token" "End-to-End Response Time" "Speed" "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
2
- "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
3
- "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
4
- "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "11.673096776008606" "" "" "Proprietary" "Think" "On" "64.57" "67.0" "70.0" "72.51" "82.64" "65.52" "52.07" "51.43" "67.06" "59.55" "45.64"
5
- "Claude 4.5 Opus (think)" "https://www.anthropic.com/claude/opus" "" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "63.5" "62.5" "73.71" "77.69" "82.76" "52.89" "58.57" "63.49" "56.74" "45.97"
6
- "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
7
- "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
8
- "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
9
- "Gemini 3 Pro Preview (Thinking Level: High)" "" "" "Gemini" "1930.5" "378.0" "" "27.89457416534424" "" "" "Proprietary" "Think" "On" "62.48" "59.5" "64.38" "76.49" "78.93" "70.69" "39.67" "65.71" "61.51" "58.15" "48.99"
10
- "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
11
- "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
12
- "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
13
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
14
- top-p: 0.95" "Grok" "" "" "" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
15
- "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
16
- "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
17
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
18
- top-p: 0.95" "moonshot" "1692.0" "330.0" "45.35071495282816" "70.24291145801544" "24.28866627458008" "1000.0" "Open" "Think" "On" "56.84" "58.25" "50.31" "69.72" "77.27" "60.92" "44.63" "38.57" "59.92" "52.25" "44.3"
19
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
20
- top-p: 0.95" "Qwen" "2404.5" "423.0" "58.364528823897146" "80.01045334339142" "31.05335185752473" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
21
- "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
22
- "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
23
- top-p: 0.95" "GLM" "1442.0" "604.0" "25.261904125875603" "62.74959444999695" "23.293980879127712" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
24
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
25
- top-p: 0.95" "GLM" "2645.5" "522.0" "81.414294828216" "110.0251989364624" "24.034975709814915" "355.0" "Open" "Hybrid" "On" "53.3" "57.5" "51.25" "71.31" "71.9" "53.45" "24.79" "28.57" "58.33" "44.38" "43.29"
26
- "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "53.06" "55.0" "55.94" "68.13" "70.25" "47.7" "23.97" "30.0" "60.71" "46.63" "42.28"
27
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
28
- top-p: 0.8" "Qwen" "433.0" "433.0" "0.1387630701065063" "14.262101531028748" "31.359207215387023" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
29
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
30
- top-p: 0.95" "DeepSeek" "710.5" "356.0" "14.323043732258654" "35.32915151119232" "16.64962453842425" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
31
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
32
- top-p: 0.95" "DeepSeek" "831.5" "377.0" "17.055466594943752" "47.552645206451416" "17.890508425613742" "671.0" "Open" "Hybrid" "On" "51.37" "51.5" "52.19" "69.32" "73.14" "51.72" "25.62" "38.57" "57.14" "38.76" "40.94"
33
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
34
- top-p: 0.95" "Qwen" "2830.0" "351.0" "76.69636714346468" "82.98819828033447" "72.08537789542703" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
35
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
36
  temperature: 1.0
37
- top-p: 1.0" "GPT" "759.5" "370.5" "7.694922740481965" "12.121336698532104" "103.31935460342277" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
38
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
39
  temperature: 0.6
40
- top-p: 0.95" "DeepSeek" "1177.5" "554.0" "28.558620557701" "70.60028326511383" "17.625838630215213" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
41
- "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "6.750162363052368" "17.980867981910706" "42.58336125102582" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
 
42
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
43
  temperature: 1.3
44
- top-p: 0.95" "DeepSeek" "408.0" "408.0" "0.211452841758728" "23.47111320495605" "17.62487523518351" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
45
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
46
- top-p: 0.95" "Qwen" "1113.0" "390.0" "27.26490248867746" "39.635579228401184" "37.74973909656839" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
47
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
48
- top-p: 0.8" "Qwen" "441.5" "441.5" "7.902002811431885" "19.310550212860107" "42.44958664990833" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
49
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
50
- top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "" "" "" "230.0" "Open" "Think" "On" "42.43" "48.75" "35.62" "53.39" "57.02" "43.1" "44.63" "28.57" "49.21" "30.06" "31.21"
51
- "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "0.6553128957748413" "7.924791574478149" "57.95526130360478" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
52
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
53
  temperature: 1.0
54
- top-p: 1.0" "GPT" "953.5" "326.0" "26.04652036871504" "29.767700791358948" "108.53633696847938" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
55
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
56
- top-p: 0.95" "Gemma" "380.0" "380.0" "3.391351342201233" "13.303653597831726" "39.94050750809835" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
57
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
58
- top-p: 0.95" "Alibaba" "1147.0" "408.0" "45.23295979184195" "52.38741266727448" "62.676624491545525" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
59
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
60
- top-p: 0.95" "mistralai" "369.0" "369.0" "3.2450859546661377" "13.907460689544678" "36.382163796915904" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
61
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
62
- top-p: 0.95" "LLM360" "1835.0" "486.0" "24.29692639716904" "43.2994556427002" "42.72123101353567" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
63
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
64
- top-p: 0.95" "KAT" "397.0" "397.0" "0.0622165203094482" "8.492375493049622" "50.601864763867184" "72.0" "Open" "Instruct" "Off" "33.94" "29.25" "44.06" "46.22" "46.69" "25.86" "18.18" "20.0" "42.86" "25.56" "25.5"
65
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
66
  top-p: 0.95
67
- top-k: 50" "allenai" "3360.5" "473.0" "60.18788400716624" "77.51256728172302" "44.30514641537086" "32.0" "Open" "Think" "On" "33.94" "35.25" "30.94" "57.37" "66.53" "33.33" "28.93" "24.29" "34.52" "11.8" "19.8"
68
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
69
- top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "51.19312170664125" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
70
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
71
- top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
72
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
73
- top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
 
74
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
75
- top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
76
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
77
- top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
78
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
79
- top-p: 0.95" "Solar" "260.0" "260.0" "12.68759036064148" "39.93266606330872" "11.341528558845871" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
80
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
81
- top-p: 0.7" "KT" "316.0" "316.0" "3.07414984703064" "11.089128971099854" "41.13515299318637" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
82
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
83
- top-p: 0.95" "Kakao" "414.0" "414.0" "2.999279260635376" "14.037613034248352" "39.50831768498445" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
4
+ "GPT-5.2 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5.2" "Reasoning: medium" "GPT" "347.0" "264.0" "" "Proprietary" "Hybrid" "On" "66.18" "69.25" "65.62" "71.31" "78.51" "70.69" "52.07" "51.43" "80.56" "55.9" "55.03"
5
+ "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "Proprietary" "Hybrid" "On" "64.57" "67.0" "70.0" "72.51" "82.64" "65.52" "52.07" "51.43" "67.06" "59.55" "45.64"
6
+ "Claude 4.5 Opus (think, budget: 16K)" "https://www.anthropic.com/claude/opus" "thinking budget: 16K" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "63.5" "62.5" "73.71" "77.69" "82.76" "52.89" "58.57" "63.49" "56.74" "45.97"
7
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
8
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
9
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
10
+ "Gemini 3 Pro Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/pro/" "Thinking Level: High" "Gemini" "1930.5" "378.0" "" "Proprietary" "Think" "On" "62.48" "59.5" "64.38" "76.49" "78.93" "70.69" "39.67" "65.71" "61.51" "58.15" "48.99"
11
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
12
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
13
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
14
+ "Gemini 3 Flash Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/flash/" "Thinking Level: High" "Gemini" "1296.5" "424.5" "" "Proprietary" "Think" "On" "59.26" "59.5" "59.69" "75.3" "79.34" "63.22" "34.71" "57.14" "59.92" "50.84" "46.31"
15
+ "GLM-4.7 FP8" "https://huggingface.co/zai-org/GLM-4.7-FP8" "temperature: 1.0
16
+ top-p: 0.95" "GLM" "2252.5" "328.0" "358.0" "Open" "Hybrid" "On" "59.22" "62.75" "60.0" "75.3" "75.21" "58.05" "29.75" "35.71" "66.67" "53.93" "45.3"
17
+ "DeepSeek V3.2 Speciale" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale" "temperature: 1.0
18
+ top-p: 0.95" "DeepSeek" "3226.5" "249.5" "671.0" "Open" "Think" "On" "59.14" "64.0" "67.19" "74.5" "78.1" "48.28" "20.66" "58.57" "66.27" "53.09" "38.93"
19
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
20
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
21
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
22
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
23
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
24
+ top-p: 0.95" "moonshot" "1692.0" "330.0" "1000.0" "Open" "Think" "On" "56.84" "58.25" "50.31" "69.72" "77.27" "60.92" "44.63" "38.57" "59.92" "52.25" "44.3"
25
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
26
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
27
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
28
+ "GLM-4.5 FP8" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
29
+ top-p: 0.95" "GLM" "1442.0" "604.0" "358.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
30
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
31
+ top-p: 0.95" "GLM" "2645.5" "522.0" "358.0" "Open" "Hybrid" "On" "53.3" "57.5" "51.25" "71.31" "71.9" "53.45" "24.79" "28.57" "58.33" "44.38" "43.29"
32
+ "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "Proprietary" "Think" "On" "53.06" "55.0" "55.94" "68.13" "70.25" "47.7" "23.97" "30.0" "60.71" "46.63" "42.28"
33
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
34
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
35
+ "DeepSeek V3.2" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" "temperature: 1.0
36
+ top-p: 0.95" "DeepSeek" "762.5" "312.0" "671.0" "Open" "Think" "On" "52.17" "51.25" "51.56" "70.92" "72.31" "51.15" "36.36" "37.14" "60.32" "40.17" "39.93"
37
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
38
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
39
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
40
+ top-p: 0.95" "DeepSeek" "831.5" "377.0" "671.0" "Open" "Hybrid" "On" "51.37" "51.5" "52.19" "69.32" "73.14" "51.72" "25.62" "38.57" "57.14" "38.76" "40.94"
41
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
42
+ top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
43
+ "MiMo V2 Flash" "https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash" "temperature: 0.8
44
+ top-p: 0.95" "XiaomiMiMo" "1477.5" "373.0" "309.0" "Open" "Think" "On" "50.32" "54.0" "48.12" "67.73" "68.18" "44.83" "48.76" "28.57" "53.97" "40.73" "35.91"
45
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
46
  temperature: 1.0
47
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
48
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
49
  temperature: 0.6
50
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
51
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
52
+ "Mistral Large 3 675B Instruct 2512" "https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512" "temperature: 0.15" "mistralai" "448.0" "448.0" "675.0" "Open" "Instruct" "Off" "45.21" "44.0" "50.62" "65.34" "60.33" "33.33" "14.88" "37.14" "53.97" "36.52" "35.91"
53
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
54
  temperature: 1.3
55
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
56
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
57
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
58
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
59
+ top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
60
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
61
+ top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "230.0" "Open" "Think" "On" "42.43" "48.75" "35.62" "53.39" "57.02" "43.1" "44.63" "28.57" "49.21" "30.06" "31.21"
62
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
63
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
64
  temperature: 1.0
65
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
66
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
67
+ top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
68
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
69
+ top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
70
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
71
+ top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
72
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
73
+ top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
74
+ "Kanana 2 30B A3B Thinking" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-thinking" "temperature: 0.6
75
+ top-p: 0.95
76
+ top-k: 20" "Kakao" "4263.0" "854.5" "31.0" "Open" "Think" "On" "34.5" "37.5" "25.0" "57.77" "54.55" "39.66" "20.66" "15.71" "38.1" "24.72" "20.47"
77
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
78
+ top-p: 0.95" "KAT" "397.0" "397.0" "72.0" "Open" "Instruct" "Off" "33.94" "29.25" "44.06" "46.22" "46.69" "25.86" "18.18" "20.0" "42.86" "25.56" "25.5"
79
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
80
  top-p: 0.95
81
+ top-k: 50" "allenai" "3360.5" "473.0" "32.0" "Open" "Think" "On" "33.94" "35.25" "30.94" "57.37" "66.53" "33.33" "28.93" "24.29" "34.52" "11.8" "19.8"
82
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
83
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
84
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
85
+ top-p: 0.95" "Apriel" "2238.0" "375.0" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
86
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
87
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
88
+ "Kanana 2 30B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct" "temperature: 0" "Kakao" "1195.0" "1195.0" "31.0" "Open" "Instruct" "Off" "30.84" "38.0" "25.62" "35.86" "47.11" "37.93" "23.97" "18.57" "35.32" "20.51" "19.46"
89
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
90
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
91
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
92
+ top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
93
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
94
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
95
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
96
+ top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
97
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
98
+ top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
src/data/open/stats_lang.csv CHANGED
@@ -1,83 +1,98 @@
1
- "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Time to First Answer Token" "End-to-End Response Time" "Speed" "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
2
- "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
3
- "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
4
- "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "11.673096776008606" "" "" "Proprietary" "Think" "On" "64.57" "57.78" "62.5" "65.06" "62.8" "65.56" "60.22" "65.36" "68.11" "74.46" "70.49" "67.42" "63.41"
5
- "Claude 4.5 Opus (think)" "https://www.anthropic.com/claude/opus" "" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "59.44" "60.28" "66.27" "64.02" "66.67" "65.19" "63.69" "62.16" "63.59" "64.48" "65.73" "67.07"
6
- "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
7
- "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
8
- "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
9
- "Gemini 3 Pro Preview (Thinking Level: High)" "" "" "Gemini" "1930.5" "378.0" "" "27.89457416534424" "" "" "Proprietary" "Think" "On" "62.48" "59.44" "60.56" "60.24" "62.2" "61.67" "65.19" "63.13" "64.32" "65.76" "65.57" "64.04" "62.2"
10
- "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
11
- "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
12
- "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
13
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
14
- top-p: 0.95" "Grok" "" "" "" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
15
- "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
16
- "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
17
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
18
- top-p: 0.95" "moonshot" "1692.0" "330.0" "45.35071495282816" "70.24291145801544" "24.28866627458008" "1000.0" "Open" "Think" "On" "56.84" "50.0" "57.5" "60.84" "62.2" "53.33" "54.14" "61.45" "53.51" "59.24" "59.56" "56.18" "61.59"
19
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
20
- top-p: 0.95" "Qwen" "2404.5" "423.0" "58.364528823897146" "80.01045334339142" "31.05335185752473" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
21
- "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
22
- "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
23
- top-p: 0.95" "GLM" "1442.0" "604.0" "25.261904125875603" "62.74959444999695" "23.293980879127712" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
24
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
25
- top-p: 0.95" "GLM" "2645.5" "522.0" "81.414294828216" "110.0251989364624" "24.034975709814915" "355.0" "Open" "Hybrid" "On" "53.3" "49.17" "54.17" "54.22" "56.71" "52.22" "53.04" "49.16" "56.76" "56.52" "56.28" "53.93" "50.61"
26
- "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "" "" "" "Proprietary" "Think" "On" "53.06" "47.78" "51.11" "51.2" "53.66" "51.67" "54.7" "59.22" "51.89" "57.07" "55.74" "57.87" "51.83"
27
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
28
- top-p: 0.8" "Qwen" "433.0" "433.0" "0.1387630701065063" "14.262101531028748" "31.359207215387023" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
29
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
30
- top-p: 0.95" "DeepSeek" "710.5" "356.0" "14.323043732258654" "35.32915151119232" "16.64962453842425" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
31
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
32
- top-p: 0.95" "DeepSeek" "831.5" "377.0" "17.055466594943752" "47.552645206451416" "17.890508425613742" "671.0" "Open" "Hybrid" "On" "51.37" "46.94" "50.83" "51.81" "53.66" "50.0" "53.59" "51.96" "55.14" "53.8" "54.64" "48.31" "50.61"
33
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
34
- top-p: 0.95" "Qwen" "2830.0" "351.0" "76.69636714346468" "82.98819828033447" "72.08537789542703" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
35
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
36
  temperature: 1.0
37
- top-p: 1.0" "GPT" "759.5" "370.5" "7.694922740481965" "12.121336698532104" "103.31935460342277" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
38
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
39
  temperature: 0.6
40
- top-p: 0.95" "DeepSeek" "1177.5" "554.0" "28.558620557701" "70.60028326511383" "17.625838630215213" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
41
- "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "6.750162363052368" "17.980867981910706" "42.58336125102582" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
42
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
43
  temperature: 1.3
44
- top-p: 0.95" "DeepSeek" "408.0" "408.0" "0.211452841758728" "23.47111320495605" "17.62487523518351" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
45
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
46
- top-p: 0.95" "Qwen" "1113.0" "390.0" "27.26490248867746" "39.635579228401184" "37.74973909656839" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
47
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
48
- top-p: 0.8" "Qwen" "441.5" "441.5" "7.902002811431885" "19.310550212860107" "42.44958664990833" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
49
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
50
- top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "" "" "" "230.0" "Open" "Think" "On" "42.43" "31.94" "46.11" "37.35" "45.73" "38.33" "45.3" "45.25" "48.65" "41.3" "46.45" "42.7" "46.95"
51
- "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "0.6553128957748413" "7.924791574478149" "57.95526130360478" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
52
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
53
  temperature: 1.0
54
- top-p: 1.0" "GPT" "953.5" "326.0" "26.04652036871504" "29.767700791358948" "108.53633696847938" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
55
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
56
- top-p: 0.95" "Gemma" "380.0" "380.0" "3.391351342201233" "13.303653597831726" "39.94050750809835" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
57
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
58
- top-p: 0.95" "Alibaba" "1147.0" "408.0" "45.23295979184195" "52.38741266727448" "62.676624491545525" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
59
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
60
- top-p: 0.95" "mistralai" "369.0" "369.0" "3.2450859546661377" "13.907460689544678" "36.382163796915904" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
61
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
62
- top-p: 0.95" "LLM360" "1835.0" "486.0" "24.29692639716904" "43.2994556427002" "42.72123101353567" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
63
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
64
- top-p: 0.95" "KAT" "397.0" "397.0" "0.0622165203094482" "8.492375493049622" "50.601864763867184" "72.0" "Open" "Instruct" "Off" "33.94" "25.0" "32.22" "31.93" "37.2" "34.44" "33.15" "43.02" "37.84" "36.96" "37.7" "30.34" "38.41"
65
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
66
  top-p: 0.95
67
- top-k: 50" "allenai" "3360.5" "473.0" "60.18788400716624" "77.51256728172302" "44.30514641537086" "32.0" "Open" "Think" "On" "33.94" "30.56" "41.39" "30.12" "31.1" "25.0" "34.25" "35.75" "33.51" "36.41" "37.16" "31.46" "35.98"
68
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
69
- top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "51.19312170664125" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
70
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
71
- top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
72
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
73
- top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
74
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
75
- top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "23.33" "27.22" "30.12" "32.32" "20.56" "20.99" "26.26" "25.95" "25.54" "30.6" "23.6" "25.0"
76
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
77
- top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
78
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
79
- top-p: 0.95" "Solar" "260.0" "260.0" "12.68759036064148" "39.93266606330872" "11.341528558845871" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
80
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
81
- top-p: 0.7" "KT" "316.0" "316.0" "3.07414984703064" "11.089128971099854" "41.13515299318637" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
82
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
83
- top-p: 0.95" "Kakao" "414.0" "414.0" "2.999279260635376" "14.037613034248352" "39.50831768498445" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
4
+ "GPT-5.2 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5.2" "Reasoning: medium" "GPT" "347.0" "264.0" "" "Proprietary" "Hybrid" "On" "66.18" "61.67" "61.39" "69.28" "64.63" "68.89" "66.3" "70.95" "63.24" "68.48" "70.49" "70.22" "68.29"
5
+ "GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "Proprietary" "Hybrid" "On" "64.57" "57.78" "62.5" "65.06" "62.8" "65.56" "60.22" "65.36" "68.11" "74.46" "70.49" "67.42" "63.41"
6
+ "Claude 4.5 Opus (think, budget: 16K)" "https://www.anthropic.com/claude/opus" "thinking budget: 16K" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "59.44" "60.28" "66.27" "64.02" "66.67" "65.19" "63.69" "62.16" "63.59" "64.48" "65.73" "67.07"
7
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
8
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
9
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
10
+ "Gemini 3 Pro Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/pro/" "Thinking Level: High" "Gemini" "1930.5" "378.0" "" "Proprietary" "Think" "On" "62.48" "59.44" "60.56" "60.24" "62.2" "61.67" "65.19" "63.13" "64.32" "65.76" "65.57" "64.04" "62.2"
11
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
12
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
13
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
14
+ "Gemini 3 Flash Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/flash/" "Thinking Level: High" "Gemini" "1296.5" "424.5" "" "Proprietary" "Think" "On" "59.26" "53.89" "57.22" "61.45" "57.32" "56.67" "61.33" "57.54" "58.92" "64.67" "67.76" "60.11" "61.59"
15
+ "GLM-4.7 FP8" "https://huggingface.co/zai-org/GLM-4.7-FP8" "temperature: 1.0
16
+ top-p: 0.95" "GLM" "2252.5" "328.0" "358.0" "Open" "Hybrid" "On" "59.22" "54.17" "55.28" "63.86" "63.41" "55.0" "58.56" "62.01" "61.08" "63.59" "61.75" "66.29" "54.88"
17
+ "DeepSeek V3.2 Speciale" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale" "temperature: 1.0
18
+ top-p: 0.95" "DeepSeek" "3226.5" "249.5" "671.0" "Open" "Think" "On" "59.14" "50.83" "58.06" "63.25" "57.93" "58.89" "58.56" "58.66" "60.0" "65.22" "66.12" "59.55" "62.2"
19
  "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
20
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
21
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
22
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
23
  "Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
24
+ top-p: 0.95" "moonshot" "1692.0" "330.0" "1000.0" "Open" "Think" "On" "56.84" "50.0" "57.5" "60.84" "62.2" "53.33" "54.14" "61.45" "53.51" "59.24" "59.56" "56.18" "61.59"
25
  "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
26
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
27
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
28
+ "GLM-4.5 FP8" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
29
+ top-p: 0.95" "GLM" "1442.0" "604.0" "358.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
30
  "GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
31
+ top-p: 0.95" "GLM" "2645.5" "522.0" "358.0" "Open" "Hybrid" "On" "53.3" "49.17" "54.17" "54.22" "56.71" "52.22" "53.04" "49.16" "56.76" "56.52" "56.28" "53.93" "50.61"
32
+ "Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "Proprietary" "Think" "On" "53.06" "47.78" "51.11" "51.2" "53.66" "51.67" "54.7" "59.22" "51.89" "57.07" "55.74" "57.87" "51.83"
33
  "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
34
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
35
+ "DeepSeek V3.2" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" "temperature: 1.0
36
+ top-p: 0.95" "DeepSeek" "762.5" "312.0" "671.0" "Open" "Think" "On" "52.17" "47.5" "49.44" "53.61" "50.61" "50.56" "54.14" "59.22" "52.43" "57.07" "56.28" "44.94" "57.93"
37
  "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
38
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
39
  "DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
40
+ top-p: 0.95" "DeepSeek" "831.5" "377.0" "671.0" "Open" "Hybrid" "On" "51.37" "46.94" "50.83" "51.81" "53.66" "50.0" "53.59" "51.96" "55.14" "53.8" "54.64" "48.31" "50.61"
41
  "Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
42
+ top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
43
+ "MiMo V2 Flash" "https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash" "temperature: 0.8
44
+ top-p: 0.95" "XiaomiMiMo" "1477.5" "373.0" "309.0" "Open" "Think" "On" "50.32" "42.22" "53.06" "49.4" "54.27" "47.78" "51.93" "53.63" "52.97" "54.89" "54.64" "42.13" "52.44"
45
  "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
46
  temperature: 1.0
47
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
48
  "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
49
  temperature: 0.6
50
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
51
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
52
+ "Mistral Large 3 675B Instruct 2512" "https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512" "temperature: 0.15" "mistralai" "448.0" "448.0" "675.0" "Open" "Instruct" "Off" "45.21" "41.39" "44.17" "50.6" "46.34" "46.11" "43.65" "45.81" "44.32" "49.46" "49.18" "42.13" "44.51"
53
  "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
54
  temperature: 1.3
55
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
56
  "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
57
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
58
  "Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
59
+ top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
60
  "MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
61
+ top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "230.0" "Open" "Think" "On" "42.43" "31.94" "46.11" "37.35" "45.73" "38.33" "45.3" "45.25" "48.65" "41.3" "46.45" "42.7" "46.95"
62
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
63
  "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
64
  temperature: 1.0
65
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
66
  "Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
67
+ top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
68
  "Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
69
+ top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
70
  "Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
71
+ top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
72
  "K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
73
+ top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
74
+ "Kanana 2 30B A3B Thinking" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-thinking" "temperature: 0.6
75
+ top-p: 0.95
76
+ top-k: 20" "Kakao" "4263.0" "854.5" "31.0" "Open" "Think" "On" "34.5" "25.28" "43.06" "38.55" "40.24" "25.0" "34.25" "37.99" "32.43" "34.24" "37.7" "28.65" "38.41"
77
  "KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
78
+ top-p: 0.95" "KAT" "397.0" "397.0" "72.0" "Open" "Instruct" "Off" "33.94" "25.0" "32.22" "31.93" "37.2" "34.44" "33.15" "43.02" "37.84" "36.96" "37.7" "30.34" "38.41"
79
  "Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
80
  top-p: 0.95
81
+ top-k: 50" "allenai" "3360.5" "473.0" "32.0" "Open" "Think" "On" "33.94" "30.56" "41.39" "30.12" "31.1" "25.0" "34.25" "35.75" "33.51" "36.41" "37.16" "31.46" "35.98"
82
  "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
83
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
84
  "Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
85
+ top-p: 0.95" "Apriel" "2238.0" "375.0" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
86
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
87
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
88
+ "Kanana 2 30B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct" "temperature: 0" "Kakao" "1195.0" "1195.0" "31.0" "Open" "Instruct" "Off" "30.84" "33.06" "39.44" "37.35" "33.54" "17.78" "26.52" "25.14" "30.81" "29.35" "31.15" "23.03" "32.93"
89
  "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
90
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "14.8" "Open" "Think" "On" "25.81" "23.33" "27.22" "30.12" "32.32" "20.56" "20.99" "26.26" "25.95" "25.54" "30.6" "23.6" "25.0"
91
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
92
+ top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
93
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
94
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
95
  "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
96
+ top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
97
  "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
98
+ top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
src/data/open/time_data.json DELETED
The diff for this file is too large to render. See raw diff
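For reference, below is a minimal sketch of how the updated results file could be loaded after this change. It assumes the file really stores double-quoted, whitespace-separated fields exactly as rendered in the diff above (including multi-line sampling-parameter comments); the path and the load_results helper are hypothetical, and only the 23-column header ("Model Name" through "VI") is taken from the diff.

import shlex
import pandas as pd

def load_results(path: str) -> pd.DataFrame:
    # Every field is double-quoted and whitespace-separated; comment fields may
    # span multiple lines (e.g. "temperature: 0.6\ntop-p: 0.95"), which shlex
    # keeps as a single token because the newline sits inside the quotes.
    with open(path, encoding="utf-8") as f:
        tokens = shlex.split(f.read())

    n_cols = 23  # header row: "Model Name" ... "VI" (speed/time columns removed)
    header, body = tokens[:n_cols], tokens[n_cols:]
    rows = [body[i:i + n_cols] for i in range(0, len(body), n_cols)]
    df = pd.DataFrame(rows, columns=header)

    # Length, parameter-size, and score columns arrive as strings; coerce them
    # to numbers (empty strings for proprietary models become NaN).
    numeric_cols = header[4:7] + header[10:]
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    return df

# Example usage (the path is an assumption, not part of the commit):
# df = load_results("src/data/open/results.txt")
# print(df.sort_values("Overall", ascending=False)[["Model Name", "Overall"]].head())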