Delta-Vector committed on
Commit
ae48413
·
verified ·
1 Parent(s): 051e390

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +12 -0
  2. app.py +354 -0
  3. requirements.txt +62 -0
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MMLU + IFEVAL Leaderboard
3
+ emoji: 👀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+
8
def create_benchmark_plot(df):
    """Draw a grouped bar chart of benchmark scores for the top models.

    Models are ranked by the sum of the six benchmark columns and at most the
    ten best are plotted: one group of bars per benchmark, one bar per model.

    Args:
        df: leaderboard DataFrame with a 'Model' column plus the six score
            columns (non-numeric scores are coerced to 0).

    Returns:
        A matplotlib Figure, or None when *df* is empty.
    """
    if df.empty:
        return None

    score_cols = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                  'mmlu_high_school', 'mmlu_other']

    ranked = df.copy()
    for column in score_cols:
        ranked[column] = pd.to_numeric(ranked[column], errors='coerce').fillna(0)
    ranked['Total_Score'] = ranked[score_cols].sum(axis=1)
    ranked = ranked.sort_values(by='Total_Score', ascending=False)

    # Cap the chart at the ten highest-scoring models to keep it readable.
    top = ranked.head(10) if len(ranked) > 10 else ranked

    model_names = top['Model'].unique()
    positions = np.arange(len(score_cols))
    bar_width = 0.8 / len(model_names) if len(model_names) > 0 else 0.8

    fig, ax = plt.subplots(figsize=(30, 10))

    plotted_values = []
    for idx, name in enumerate(model_names):
        subset = top[top['Model'] == name]
        values = [subset[c].values[0] if not subset[c].empty else 0
                  for c in score_cols]
        plotted_values.extend(values)
        # Centre each model's bars around the benchmark tick position.
        shift = bar_width * idx - (bar_width * (len(model_names) - 1) / 2)
        bars = ax.bar(positions + shift, values, bar_width, label=name)
        ax.bar_label(bars, padding=3)

    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(score_cols, rotation=45, ha="right")
    ax.legend(loc='lower right')

    # Leave 15% headroom above the tallest bar so its label stays visible.
    if plotted_values:
        ax.set_ylim(top=max(plotted_values) * 1.15)

    plt.tight_layout()

    return fig
55
+
56
def load_leaderboard_data():
    """Load benchmark result files and build the leaderboard DataFrame.

    Scans the ``benchmarks`` directory for ``results_*.json`` files, extracts
    the IFEval and overall MMLU scores plus every per-subject MMLU accuracy,
    averages the subjects into four category columns (professional, college,
    high_school, other), de-duplicates models keeping the highest-scoring
    entry, rounds scores to 4 decimals and replaces missing scores with 0.

    Robustness fixes over the original: a missing ``benchmarks`` directory or
    an empty one no longer raises (FileNotFoundError / KeyError on the score
    columns of an empty frame) — an empty DataFrame with the expected columns
    is returned instead.

    Returns:
        pandas.DataFrame sorted by total score (descending), with 'Model',
        'IFEval', 'MMLU', the four category columns and every per-subject
        MMLU column.
    """
    benchmarks_dir = "benchmarks"

    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }

    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]

    # Everything not explicitly categorized falls into the "mmlu_other" bucket.
    categorized = sum(mmlu_categories.values(), [])
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorized]

    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                     'mmlu_high_school', 'mmlu_other']

    data = []
    # Guard: a missing benchmarks directory previously raised FileNotFoundError.
    filenames = os.listdir(benchmarks_dir) if os.path.isdir(benchmarks_dir) else []
    for filename in filenames:
        if not (filename.endswith(".json") and filename.startswith("results_")):
            continue
        filepath = os.path.join(benchmarks_dir, filename)
        with open(filepath, 'r') as f:
            content = json.load(f)

        # Fall back to the file name when the payload carries no model name.
        model_name = content.get("model_name") or os.path.splitext(filename)[0]
        # Normalize "org/model/" style names down to the bare model name.
        model_name = os.path.basename(model_name.rstrip('/'))

        results = content.get("results", {})
        row = {
            "Model": model_name,
            "IFEval": results.get("ifeval", {}).get("prompt_level_strict_acc,none"),
            "MMLU": results.get("mmlu", {}).get("acc,none"),
        }
        for score_name in all_mmlu_scores:
            row[score_name] = results.get(score_name, {}).get("acc,none")

        # Category score = mean of the subject scores that are present.
        for category, scores in mmlu_categories.items():
            vals = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
            vals = [v for v in vals if pd.notna(v)]
            row[category] = sum(vals) / len(vals) if vals else np.nan

        data.append(row)

    # Guard: with no result files the original crashed with KeyError when
    # indexing the score columns of a column-less empty frame.
    if not data:
        empty_cols = list(dict.fromkeys(["Model"] + score_columns + all_mmlu_scores))
        return pd.DataFrame(columns=empty_cols)

    df_raw = pd.DataFrame(data)

    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
    for col in score_columns:
        df_raw[col] = df_raw[col].fillna(0)

    # Rank by total, keep the best entry per model, then drop the helper column.
    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
    df = (df_raw.sort_values(by='Total_Score', ascending=False)
                .drop_duplicates(subset=['Model'], keep='first')
                .drop(columns=['Total_Score'])
                .copy())

    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)

    df.fillna(0, inplace=True)

    return df
166
+
167
def style_diff(df, all_data_df):
    """Return a Styler highlighting each numeric column's best and worst cell.

    For every column other than 'Model' that has at least one numeric value,
    the maximum is painted green (#68a055) and the smallest strictly-positive
    value red (#d4605b); zeros are treated as "no score" and never flagged as
    the minimum. *all_data_df* is accepted for interface compatibility but is
    not used.
    """
    def _mark_best(col):
        nums = pd.to_numeric(col, errors='coerce')
        top = nums.max()
        return ['background-color: #68a055' if v == top else '' for v in nums]

    def _mark_worst(col):
        nums = pd.to_numeric(col, errors='coerce')
        positive = nums[nums > 0]
        if positive.empty:
            return ['' for _ in nums]
        bottom = positive.min()
        return ['background-color: #d4605b' if v == bottom else '' for v in nums]

    styler = df.style
    for column in df.columns:
        if column == 'Model':
            continue
        as_numbers = pd.to_numeric(df[column], errors='coerce')
        if as_numbers.isnull().all():
            continue
        styler = styler.apply(_mark_best, subset=[column], axis=0)
        styler = styler.apply(_mark_worst, subset=[column], axis=0)
    return styler
189
+
190
def prepare_plot_data(df, all_cols=False):
    """Return a copy of *df* prepared for plotting.

    With ``all_cols=True`` the frame is ranked by the sum of the six benchmark
    columns, truncated to the ten best models, and given a 'Ranked_Model'
    label column ("01. name", "02. name", ...). Otherwise a
    'MMLU_IFEval_Combined' column is added and the frame is sorted by it,
    descending. An empty input comes back unchanged (as a copy).
    """
    prepared = df.copy()
    if prepared.empty:
        return prepared

    if all_cols:
        benchmark_cols = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                          'mmlu_high_school', 'mmlu_other']
        for name in benchmark_cols:
            prepared[name] = pd.to_numeric(prepared[name], errors='coerce').fillna(0)
        prepared['Total_Score'] = prepared[benchmark_cols].sum(axis=1)
        prepared = (prepared.sort_values(by='Total_Score', ascending=False)
                            .reset_index(drop=True)
                            .head(10))
        # Prefix each model with its zero-padded rank so bar-plot rows sort.
        prepared['Ranked_Model'] = [
            f"{rank + 1:02d}. {model}"
            for rank, model in enumerate(prepared['Model'])
        ]
    else:
        prepared['MMLU_IFEval_Combined'] = (
            prepared['MMLU'].fillna(0) + prepared['IFEval'].fillna(0)
        )
        prepared = (prepared.sort_values(by='MMLU_IFEval_Combined', ascending=False)
                            .reset_index(drop=True))

    return prepared
207
+
208
# Build the leaderboard once at import time, and keep a display-only view
# restricted to the summary columns shown in the table.
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        # Missing scores render as 0 in the table.
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)
214
+
215
with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Rebuild every plot and the table for the dropdown selection.

        Args:
            selected_models: list of model names chosen in the dropdown; an
                empty/None selection means "show all models".

        Returns:
            (scatter plot, bar plot, matplotlib figure, styled DataFrame) in
            the order the ``change`` handler's ``outputs`` list expects.
        """
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]

        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)

        # Axis padding: 10% of the data range, but never less than 0.05.
        padding_factor = 0.1
        min_padding = 0.05

        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]

            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            # Nothing matched: fall back to unit axes and an empty frame with
            # the columns the ScatterPlot component expects.
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])

        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )

        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)

        if not bar_plot_df.empty:
            # Long format: one row per (model, benchmark) pair for the bar plot.
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])

        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )

        benchmark_plot_update = create_benchmark_plot(df_to_plot)

        # The table mirrors the plot filter, re-styled against the full data.
        if not selected_models:
            df_to_display = display_df
            styled_df = style_diff(df_to_display, initial_df)
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
            styled_df = style_diff(df_to_display, initial_df)

        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    # Initial (unfiltered) render — same axis-padding logic as
                    # update_plots, computed inline for the first page load.
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)

                    padding_factor = 0.1
                    min_padding = 0.05

                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]

                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]

                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )

                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')

                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )

    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )

    # Re-render all four outputs whenever the dropdown selection changes.
    model_selector.change(update_plots, inputs=model_selector, outputs=[scatterplot, barplot, benchmark_plot, dataframe])

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ certifi==2025.4.26
5
+ charset-normalizer==3.4.2
6
+ click==8.1.8
7
+ contourpy==1.3.0
8
+ cycler==0.12.1
9
+ exceptiongroup==1.3.0
10
+ fastapi==0.115.12
11
+ ffmpy==0.6.0
12
+ filelock==3.18.0
13
+ fonttools==4.58.2
14
+ fsspec==2025.5.1
15
+ gradio==4.44.1
16
+ gradio_client==1.3.0
17
+ h11==0.16.0
18
+ hf-xet==1.1.3
19
+ httpcore==1.0.9
20
+ httpx==0.28.1
21
+ huggingface-hub==0.32.4
22
+ idna==3.10
23
+ importlib_resources==6.5.2
24
+ Jinja2==3.1.6
25
+ kiwisolver==1.4.7
26
+ markdown-it-py==3.0.0
27
+ MarkupSafe==2.1.5
28
+ matplotlib==3.9.4
29
+ mdurl==0.1.2
30
+ narwhals==1.41.1
31
+ numpy==2.0.2
32
+ orjson==3.10.18
33
+ packaging==25.0
34
+ pandas==2.3.0
35
+ pillow==10.4.0
36
+ pydantic==2.11.5
37
+ pydantic_core==2.33.2
38
+ pydub==0.25.1
39
+ Pygments==2.19.1
40
+ pyparsing==3.2.3
41
+ python-dateutil==2.9.0.post0
42
+ python-multipart==0.0.20
43
+ pytz==2025.2
44
+ PyYAML==6.0.2
45
+ requests==2.32.3
46
+ rich==14.0.0
47
+ ruff==0.11.13
48
+ semantic-version==2.10.0
49
+ shellingham==1.5.4
50
+ six==1.17.0
51
+ sniffio==1.3.1
52
+ starlette==0.46.2
53
+ tomlkit==0.12.0
54
+ tqdm==4.67.1
55
+ typer==0.16.0
56
+ typing-inspection==0.4.1
57
+ typing_extensions==4.14.0
58
+ tzdata==2025.2
59
+ urllib3==2.4.0
60
+ uvicorn==0.34.3
61
+ websockets==12.0
62
+ zipp==3.22.0