Delta-Vector committed on
Commit
ae48413
·
verified ·
1 Parent(s): 051e390

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +12 -0
  2. app.py +354 -0
  3. requirements.txt +62 -0
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MMLU + IFEVAL Leaderboard
3
+ emoji: 👀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+
8
def create_benchmark_plot(df):
    """Draw a grouped bar chart of benchmark scores for the top models.

    Models are ranked by the sum of the six benchmark columns and at most the
    ten best are plotted: one group of bars per benchmark, one bar per model.

    Args:
        df: leaderboard DataFrame with a 'Model' column plus the six score
            columns (non-numeric scores are coerced to 0).

    Returns:
        A matplotlib Figure, or None when *df* is empty.
    """
    if df.empty:
        return None

    score_cols = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                  'mmlu_high_school', 'mmlu_other']

    ranked = df.copy()
    for column in score_cols:
        ranked[column] = pd.to_numeric(ranked[column], errors='coerce').fillna(0)
    ranked['Total_Score'] = ranked[score_cols].sum(axis=1)
    ranked = ranked.sort_values(by='Total_Score', ascending=False)

    # Cap the chart at the ten highest-scoring models to keep it readable.
    top = ranked.head(10) if len(ranked) > 10 else ranked

    model_names = top['Model'].unique()
    positions = np.arange(len(score_cols))
    bar_width = 0.8 / len(model_names) if len(model_names) > 0 else 0.8

    fig, ax = plt.subplots(figsize=(30, 10))

    plotted_values = []
    for idx, name in enumerate(model_names):
        subset = top[top['Model'] == name]
        values = [subset[c].values[0] if not subset[c].empty else 0
                  for c in score_cols]
        plotted_values.extend(values)
        # Centre each model's bars around the benchmark tick position.
        shift = bar_width * idx - (bar_width * (len(model_names) - 1) / 2)
        bars = ax.bar(positions + shift, values, bar_width, label=name)
        ax.bar_label(bars, padding=3)

    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(score_cols, rotation=45, ha="right")
    ax.legend(loc='lower right')

    # Leave 15% headroom above the tallest bar so its label stays visible.
    if plotted_values:
        ax.set_ylim(top=max(plotted_values) * 1.15)

    plt.tight_layout()

    return fig
55
+
56
def load_leaderboard_data():
    """Load benchmark result files and build the leaderboard DataFrame.

    Scans the ``benchmarks`` directory for ``results_*.json`` files, extracts
    the IFEval and overall MMLU scores plus every per-subject MMLU accuracy,
    averages the subjects into four category columns (professional, college,
    high_school, other), de-duplicates models keeping the highest-scoring
    entry, rounds scores to 4 decimals and replaces missing scores with 0.

    Robustness fixes over the original: a missing ``benchmarks`` directory or
    an empty one no longer raises (FileNotFoundError / KeyError on the score
    columns of an empty frame) — an empty DataFrame with the expected columns
    is returned instead.

    Returns:
        pandas.DataFrame sorted by total score (descending), with 'Model',
        'IFEval', 'MMLU', the four category columns and every per-subject
        MMLU column.
    """
    benchmarks_dir = "benchmarks"

    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }

    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]

    # Everything not explicitly categorized falls into the "mmlu_other" bucket.
    categorized = sum(mmlu_categories.values(), [])
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorized]

    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                     'mmlu_high_school', 'mmlu_other']

    data = []
    # Guard: a missing benchmarks directory previously raised FileNotFoundError.
    filenames = os.listdir(benchmarks_dir) if os.path.isdir(benchmarks_dir) else []
    for filename in filenames:
        if not (filename.endswith(".json") and filename.startswith("results_")):
            continue
        filepath = os.path.join(benchmarks_dir, filename)
        with open(filepath, 'r') as f:
            content = json.load(f)

        # Fall back to the file name when the payload carries no model name.
        model_name = content.get("model_name") or os.path.splitext(filename)[0]
        # Normalize "org/model/" style names down to the bare model name.
        model_name = os.path.basename(model_name.rstrip('/'))

        results = content.get("results", {})
        row = {
            "Model": model_name,
            "IFEval": results.get("ifeval", {}).get("prompt_level_strict_acc,none"),
            "MMLU": results.get("mmlu", {}).get("acc,none"),
        }
        for score_name in all_mmlu_scores:
            row[score_name] = results.get(score_name, {}).get("acc,none")

        # Category score = mean of the subject scores that are present.
        for category, scores in mmlu_categories.items():
            vals = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
            vals = [v for v in vals if pd.notna(v)]
            row[category] = sum(vals) / len(vals) if vals else np.nan

        data.append(row)

    # Guard: with no result files the original crashed with KeyError when
    # indexing the score columns of a column-less empty frame.
    if not data:
        empty_cols = list(dict.fromkeys(["Model"] + score_columns + all_mmlu_scores))
        return pd.DataFrame(columns=empty_cols)

    df_raw = pd.DataFrame(data)

    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
    for col in score_columns:
        df_raw[col] = df_raw[col].fillna(0)

    # Rank by total, keep the best entry per model, then drop the helper column.
    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
    df = (df_raw.sort_values(by='Total_Score', ascending=False)
                .drop_duplicates(subset=['Model'], keep='first')
                .drop(columns=['Total_Score'])
                .copy())

    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)

    df.fillna(0, inplace=True)

    return df
166
+
167
def style_diff(df, all_data_df):
    """Return a Styler highlighting each numeric column's best and worst cell.

    For every column other than 'Model' that has at least one numeric value,
    the maximum is painted green (#68a055) and the smallest strictly-positive
    value red (#d4605b); zeros are treated as "no score" and never flagged as
    the minimum. *all_data_df* is accepted for interface compatibility but is
    not used.
    """
    def _mark_best(col):
        nums = pd.to_numeric(col, errors='coerce')
        top = nums.max()
        return ['background-color: #68a055' if v == top else '' for v in nums]

    def _mark_worst(col):
        nums = pd.to_numeric(col, errors='coerce')
        positive = nums[nums > 0]
        if positive.empty:
            return ['' for _ in nums]
        bottom = positive.min()
        return ['background-color: #d4605b' if v == bottom else '' for v in nums]

    styler = df.style
    for column in df.columns:
        if column == 'Model':
            continue
        as_numbers = pd.to_numeric(df[column], errors='coerce')
        if as_numbers.isnull().all():
            continue
        styler = styler.apply(_mark_best, subset=[column], axis=0)
        styler = styler.apply(_mark_worst, subset=[column], axis=0)
    return styler
189
+
190
def prepare_plot_data(df, all_cols=False):
    """Return a copy of *df* prepared for plotting.

    With ``all_cols=True`` the frame is ranked by the sum of the six benchmark
    columns, truncated to the ten best models, and given a 'Ranked_Model'
    label column ("01. name", "02. name", ...). Otherwise a
    'MMLU_IFEval_Combined' column is added and the frame is sorted by it,
    descending. An empty input comes back unchanged (as a copy).
    """
    prepared = df.copy()
    if prepared.empty:
        return prepared

    if all_cols:
        benchmark_cols = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                          'mmlu_high_school', 'mmlu_other']
        for name in benchmark_cols:
            prepared[name] = pd.to_numeric(prepared[name], errors='coerce').fillna(0)
        prepared['Total_Score'] = prepared[benchmark_cols].sum(axis=1)
        prepared = (prepared.sort_values(by='Total_Score', ascending=False)
                            .reset_index(drop=True)
                            .head(10))
        # Prefix each model with its zero-padded rank so bar-plot rows sort.
        prepared['Ranked_Model'] = [
            f"{rank + 1:02d}. {model}"
            for rank, model in enumerate(prepared['Model'])
        ]
    else:
        prepared['MMLU_IFEval_Combined'] = (
            prepared['MMLU'].fillna(0) + prepared['IFEval'].fillna(0)
        )
        prepared = (prepared.sort_values(by='MMLU_IFEval_Combined', ascending=False)
                            .reset_index(drop=True))

    return prepared
207
+
208
# Build the leaderboard once at import time, and keep a display-only view
# restricted to the summary columns shown in the table.
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        # Missing scores render as 0 in the table.
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)
214
+
215
with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Rebuild every plot and the table for the dropdown selection.

        Args:
            selected_models: list of model names chosen in the dropdown; an
                empty/None selection means "show all models".

        Returns:
            (scatter plot, bar plot, matplotlib figure, styled DataFrame) in
            the order the ``change`` handler's ``outputs`` list expects.
        """
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]

        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)

        # Axis padding: 10% of the data range, but never less than 0.05.
        padding_factor = 0.1
        min_padding = 0.05

        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]

            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            # Nothing matched: fall back to unit axes and an empty frame with
            # the columns the ScatterPlot component expects.
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])

        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )

        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)

        if not bar_plot_df.empty:
            # Long format: one row per (model, benchmark) pair for the bar plot.
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])

        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )

        benchmark_plot_update = create_benchmark_plot(df_to_plot)

        # The table mirrors the plot filter, re-styled against the full data.
        if not selected_models:
            df_to_display = display_df
            styled_df = style_diff(df_to_display, initial_df)
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
            styled_df = style_diff(df_to_display, initial_df)

        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    # Initial (unfiltered) render — same axis-padding logic as
                    # update_plots, computed inline for the first page load.
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)

                    padding_factor = 0.1
                    min_padding = 0.05

                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]

                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]

                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )

                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')

                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )

    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )

    # Re-render all four outputs whenever the dropdown selection changes.
    model_selector.change(update_plots, inputs=model_selector, outputs=[scatterplot, barplot, benchmark_plot, dataframe])

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ certifi==2025.4.26
5
+ charset-normalizer==3.4.2
6
+ click==8.1.8
7
+ contourpy==1.3.0
8
+ cycler==0.12.1
9
+ exceptiongroup==1.3.0
10
+ fastapi==0.115.12
11
+ ffmpy==0.6.0
12
+ filelock==3.18.0
13
+ fonttools==4.58.2
14
+ fsspec==2025.5.1
15
+ gradio==4.44.1
16
+ gradio_client==1.3.0
17
+ h11==0.16.0
18
+ hf-xet==1.1.3
19
+ httpcore==1.0.9
20
+ httpx==0.28.1
21
+ huggingface-hub==0.32.4
22
+ idna==3.10
23
+ importlib_resources==6.5.2
24
+ Jinja2==3.1.6
25
+ kiwisolver==1.4.7
26
+ markdown-it-py==3.0.0
27
+ MarkupSafe==2.1.5
28
+ matplotlib==3.9.4
29
+ mdurl==0.1.2
30
+ narwhals==1.41.1
31
+ numpy==2.0.2
32
+ orjson==3.10.18
33
+ packaging==25.0
34
+ pandas==2.3.0
35
+ pillow==10.4.0
36
+ pydantic==2.11.5
37
+ pydantic_core==2.33.2
38
+ pydub==0.25.1
39
+ Pygments==2.19.1
40
+ pyparsing==3.2.3
41
+ python-dateutil==2.9.0.post0
42
+ python-multipart==0.0.20
43
+ pytz==2025.2
44
+ PyYAML==6.0.2
45
+ requests==2.32.3
46
+ rich==14.0.0
47
+ ruff==0.11.13
48
+ semantic-version==2.10.0
49
+ shellingham==1.5.4
50
+ six==1.17.0
51
+ sniffio==1.3.1
52
+ starlette==0.46.2
53
+ tomlkit==0.12.0
54
+ tqdm==4.67.1
55
+ typer==0.16.0
56
+ typing-inspection==0.4.1
57
+ typing_extensions==4.14.0
58
+ tzdata==2025.2
59
+ urllib3==2.4.0
60
+ uvicorn==0.34.3
61
+ websockets==12.0
62
+ zipp==3.22.0