Update app.py
app.py (CHANGED)
@@ -321,10 +321,10 @@ def save_text(text_content):
         gr.Error(f"Error saving file: {e}")
         return None
 
-def load_leaderboard():
+def load_leaderboard(benchmark_filter):
     """
-    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for …
-    and prepares data for …
+    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for the selected benchmark,
+    and prepares data for the leaderboard table.
     """
     try:
         df = pd.read_json("eval.jsonl", lines=True)
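The new `benchmark_filter` parameter replaces the old zero-argument signature, so one function can now serve either leaderboard. Because the loader ingests `eval.jsonl` via `pd.read_json(..., lines=True)`, each line of that file must be one self-contained JSON object. A minimal sketch of the record shape the function relies on; the `model_id`, `benchmark`, `subject`, and `accuracy` keys are the ones the code references, while the concrete models and scores here are invented placeholders:

```python
import json

# Illustrative eval.jsonl records. Field names match those used in app.py;
# the model names and accuracy values are made-up placeholder data.
records = [
    {"model_id": "org/model-a", "benchmark": "MMLU", "subject": "ALL", "accuracy": 67.3},
    {"model_id": "org/model-b", "benchmark": "MMLU-Pro", "subject": "ALL", "accuracy": 41.8},
]

with open("eval.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")  # one JSON object per line
```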
@@ -335,68 +335,36 @@ def load_leaderboard():
 
         if df.empty:
             gr.Warning("No valid evaluation data found to populate the leaderboard.")
-
-            return (
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-            )
-
-        # Filter for MMLU data
-        df_mmlu = df[df['benchmark'] == 'MMLU']
-        if 'subject' in df_mmlu.columns:
-            # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
-            # Otherwise, average specific subject accuracies.
-            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            # If a model only has specific subject evaluations, average those.
-            # This is a simplification; a more robust approach might be to calculate weighted average.
-            # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
-
-            # If no 'ALL' subject records, average across available subjects for MMLU
-            if df_mmlu_grouped.empty:
-                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
-        else: # Handle older eval.jsonl
-            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+        # Filter data based on the selected benchmark
+        df_filtered = df[df['benchmark'] == benchmark_filter]
 
+        if df_filtered.empty:
+            gr.Warning(f"No evaluation data for {benchmark_filter} found yet.")
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
-        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
+        # For the leaderboard, we typically want the average across all subjects within that benchmark.
+        # So we group by model_id and take the mean of accuracy.
+        df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
+        df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
 
-        # Filter for MMLU-Pro data
-        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
-        if 'subject' in df_mmlu_pro.columns:
-            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            if df_mmlu_pro_grouped.empty:
-                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-        else: # Handle older eval.jsonl
-            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-
-
-        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
-
-        # Return two dataframes as lists of dictionaries
-        return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')
+        return df_sorted.to_dict('records')
 
     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
    except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc() # Print full traceback for debugging
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
 
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
     /* Import Google Font - Inter */
-    @import url('https://fonts.…
+    @import url('https://fonts.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
 
     /* General body and container styling */
     body {
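One nit on the CSS change: `https://fonts.com/css2?...` is not the Google Fonts host. The Inter stylesheet is normally served from `fonts.googleapis.com`, so the new `@import` URL looks accidentally truncated and will likely fail to load the font.

The refactor itself collapses the duplicated MMLU and MMLU-Pro branches into a single filter, group, and sort pipeline. A standalone sketch of that aggregation on toy data; the column names match the function, while the rows are invented:

```python
import pandas as pd

# Toy evaluation rows mirroring the columns load_leaderboard expects.
df = pd.DataFrame({
    "model_id":  ["m1", "m1", "m2"],
    "benchmark": ["MMLU", "MMLU", "MMLU"],
    "subject":   ["math", "history", "ALL"],
    "accuracy":  [60.0, 70.0, 55.0],
})

# Same steps as the new code: filter by benchmark, average per model, sort.
df_filtered = df[df["benchmark"] == "MMLU"]
df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

print(df_sorted.to_dict("records"))
# [{'Model ID': 'm1', 'Average Accuracy (%)': 65.0},
#  {'Model ID': 'm2', 'Average Accuracy (%)': 55.0}]
```

Note one behavior change: the old code preferred `subject == 'ALL'` rows and only fell back to per-subject means when none existed, while the new code takes an unweighted mean over every row for the benchmark. A model logged with both an 'ALL' row and per-subject rows now gets all of them averaged together.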
@@ -728,31 +696,30 @@ with gr.Blocks(css="""
         </div>
         """)
 
-    # …
-    gr.…
-        headers=["Model ID", "Average Accuracy (%)"],
-        interactive=False,
-        datatype=["str", "number"],
-        row_count=10,
-        col_count=2,
-        label="MMLU Leaderboard Data",
-        elem_classes="leaderboard-table" # Apply custom class for styling
+    # Leaderboard Type Toggle
+    leaderboard_type_toggle = gr.Radio(
+        ["MMLU", "MMLU-Pro"],
+        label="Select Benchmark for Leaderboard",
+        value="MMLU", # Default to MMLU
+        interactive=True,
+        container=False, # Make it inline with content
+        elem_id="leaderboard-toggle"
     )
 
-    # …
-    gr.…
+    # Leaderboard Table
+    leaderboard_table_output = gr.Dataframe(
         headers=["Model ID", "Average Accuracy (%)"],
         interactive=False,
         datatype=["str", "number"],
         row_count=10,
         col_count=2,
-        label="…
+        label="Benchmark Leaderboard Data",
         elem_classes="leaderboard-table" # Apply custom class for styling
     )
 
-    # …
-    demo.load(load_leaderboard, inputs=[], outputs=[…
+    # Initial load and dynamic update for the leaderboard
+    demo.load(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
+    leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 
     # Launch the Gradio app
-    demo.launch()
+demo.launch()
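On the UI side, the commit replaces the two fixed tables with one `gr.Radio` toggle driving a single `gr.Dataframe`, registered twice: `demo.load(...)` fills the table when the page opens, using the radio's default value of "MMLU", and `.change(...)` refreshes it whenever the user flips the toggle. A self-contained sketch of that wiring, with shortened component names and fabricated stand-in scores instead of the real `eval.jsonl` aggregation:

```python
import gradio as gr
import pandas as pd

# Stand-in leaderboard data; the real app recomputes this from eval.jsonl.
FAKE_SCORES = {
    "MMLU": pd.DataFrame({"Model ID": ["m1", "m2"],
                          "Average Accuracy (%)": [65.0, 55.0]}),
    "MMLU-Pro": pd.DataFrame({"Model ID": ["m2", "m1"],
                              "Average Accuracy (%)": [40.0, 38.0]}),
}

def load_leaderboard(benchmark_filter):
    # Return a pandas DataFrame, a type gr.Dataframe accepts directly.
    return FAKE_SCORES[benchmark_filter]

with gr.Blocks() as demo:
    toggle = gr.Radio(["MMLU", "MMLU-Pro"], value="MMLU",
                      label="Select Benchmark for Leaderboard")
    table = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"],
                         interactive=False)
    # Populate once at startup, then refresh on every toggle change.
    demo.load(load_leaderboard, inputs=[toggle], outputs=[table])
    toggle.change(load_leaderboard, inputs=[toggle], outputs=[table])

demo.launch()
```

The commit returns `df_sorted.to_dict('records')` instead of a DataFrame; if the deployed Gradio version does not accept a list of dicts as a `gr.Dataframe` value, returning the sorted DataFrame directly is the safer choice.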