Update app.py

app.py CHANGED
@@ -18,9 +18,6 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # --- Constants for Benchmarks ---
 MMLU_DATASET = "cais/mmlu"
 MMLU_PRO_DATASET = "cais/mmlu_pro"
-# Humanity's Last Exam is a composite benchmark, not a single dataset readily available like MMLU/MMLU-Pro.
-# For this implementation, we will focus on MMLU and MMLU-Pro, which are direct datasets.
-# Integrating HLE would require evaluating across multiple specific datasets.
 
 def get_all_benchmark_options():
     """
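For reference, these constants are Hugging Face dataset IDs that `datasets.load_dataset` resolves to benchmark rows. A minimal sketch, assuming cais/mmlu exposes an "all" config alongside its per-subject configs:

    from datasets import load_dataset

    MMLU_DATASET = "cais/mmlu"

    # "all" is an assumed config name; cais/mmlu also exposes per-subject configs.
    mmlu_test = load_dataset(MMLU_DATASET, "all", split="test")
    print(mmlu_test[0]["question"], mmlu_test[0]["choices"])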
@@ -68,12 +65,12 @@ def load_model(model_id):
         return model_cache[model_id]
     try:
         # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
-        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             token=HF_TOKEN,
-
-
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            trust_remote_code=True
         ).to("cuda" if torch.cuda.is_available() else "cpu")
 
         # Create a text-generation pipeline
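The new torch_dtype argument follows the common Transformers pattern: bfloat16 on CUDA to roughly halve memory, float32 on CPU. A minimal standalone sketch of the same idea ("gpt2" is only a stand-in model ID for illustration):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def pick_dtype() -> torch.dtype:
        # bfloat16 saves memory on modern GPUs; CPUs generally run float32.
        return torch.bfloat16 if torch.cuda.is_available() else torch.float32

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=pick_dtype())
    model.to("cuda" if torch.cuda.is_available() else "cpu")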
@@ -107,7 +104,7 @@ def extract_choice_letter(output):
     It prioritizes an exact match after "Answer:", then looks for any single capital letter.
     """
     # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
-    match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
+    match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
     if match:
         return match.group(1).upper()  # Ensure it's uppercase
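Because the search is case-insensitive and the captured letter is normalized with .upper(), lowercase model outputs still map to a canonical choice. A quick illustration of the matching behavior:

    import re

    def extract_choice_letter(output: str):
        # Same pattern as in app.py: find "Answer: X" where X is one of A-D.
        match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
        return match.group(1).upper() if match else None

    print(extract_choice_letter("Reasoning...\nAnswer: b"))  # -> "B"
    print(extract_choice_letter("The answer:   C"))          # -> "C" (case-insensitive match)
    print(extract_choice_letter("No letter here"))           # -> None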
@@ -270,8 +267,6 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
 
         # Format detailed results for display in the text box
-        # The key change here is to wrap the entire multi-line string construction for each item
-        # within parentheses to ensure it's treated as a single element in the list comprehension.
         formatted_details = "\n\n".join([
             (
                 f"### Question:\n{item['question']}\n\n"
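The removed comments explained the parenthesization: adjacent f-strings inside parentheses concatenate into a single string, so each comprehension element stays one value. A small sketch of the pattern (the field names are illustrative):

    items = [{"question": "2+2?", "answer": "B"}]

    formatted = "\n\n".join([
        (
            f"### Question:\n{item['question']}\n\n"  # adjacent f-strings inside parentheses
            f"**Answer:** {item['answer']}"           # concatenate into one list element
        )
        for item in items
    ])
    print(formatted)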
@@ -300,7 +295,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         gr.Info("Evaluation completed successfully!")
         return score_string, \
                gr.update(value="", visible=False), gr.update(visible=False), \
-               gr.update(visible=
+               gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False)
 
     except Exception as e:
         error_message = str(e)
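Each gr.update in the return tuple targets one declared output component, toggling visibility or value without recreating the component. A minimal sketch of the same toggle pattern, assuming current Gradio Blocks semantics:

    import gradio as gr

    def reveal(text):
        # One gr.update per declared output: show the box and fill it.
        return gr.update(visible=True, value=f"You said: {text}")

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Input")
        out = gr.Textbox(label="Output", visible=False)
        inp.submit(reveal, inputs=inp, outputs=out)

    # demo.launch()  # uncomment to run locally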
@@ -328,158 +323,284 @@ def save_text(text_content):
 
 def load_leaderboard():
     """
-    Loads evaluation data from 'eval.jsonl', computes average accuracy per model,
-    and prepares data for
+    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for MMLU and MMLU-Pro,
+    and prepares data for two separate leaderboard tables.
     """
     try:
-        # Read the JSONL file into a pandas DataFrame
         df = pd.read_json("eval.jsonl", lines=True)
 
-        #
-
-
-        #
-
-        #
-
-
-        # Create the matplotlib plot
-        fig, ax = plt.subplots(figsize=(10, 6))  # Adjust figure size for better readability
-        # For horizontal bars, it's often better to plot data sorted in ascending order
-        # so the highest bar appears at the top of the chart.
-        top_models_plot = top_models.sort_values(by="Average Accuracy (%)", ascending=True)
-
-        ax.barh(top_models_plot['Model ID'], top_models_plot['Average Accuracy (%)'], color='#007bff')  # Use a nice blue color
-        ax.set_xlabel("Average Accuracy (%)", fontsize=12)
-        ax.set_ylabel("Model ID", fontsize=12)
-        ax.set_title("Top 10 Models by Average MMLU/MMLU-Pro Accuracy", fontsize=14)
-        ax.set_xlim(0, 100)  # Ensure accuracy scale is 0-100%
-        ax.tick_params(axis='x', labelsize=10)
-        ax.tick_params(axis='y', labelsize=10)
-        ax.grid(axis='x', linestyle='--', alpha=0.7)  # Add grid lines
-        plt.tight_layout()  # Adjust layout to prevent labels overlapping
-
-        # Return the figure and the sorted dataframe as a list of dictionaries for Gradio Dataframe
-        return fig, df_sorted.to_dict('records')
+        # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them
+        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
+        df = df.dropna(subset=['accuracy'])
+
+        if df.empty:
+            gr.Warning("No valid evaluation data found to populate the leaderboard.")
+            # Return empty dataframes for both MMLU and MMLU-Pro
+            return (
+                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+            )
+
+        # Filter for MMLU data
+        df_mmlu = df[df['benchmark'] == 'MMLU']
+        if 'subject' in df_mmlu.columns:
+            # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
+            # Otherwise, average specific subject accuracies.
+            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
+            # If a model only has specific subject evaluations, average those.
+            # This is a simplification; a more robust approach might be to calculate a weighted average.
+            # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
+
+            # If no 'ALL' subject records, average across available subjects for MMLU
+            if df_mmlu_grouped.empty:
+                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+
+        else:  # Handle older eval.jsonl without 'subject' column or if only MMLU was run
+            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+
+        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
 
+        # Filter for MMLU-Pro data
+        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
+        if 'subject' in df_mmlu_pro.columns:
+            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
+            if df_mmlu_pro_grouped.empty:
+                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
+        else:  # Handle older eval.jsonl
+            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
+
+        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
 
+        # Return two dataframes as lists of dictionaries
+        return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')
+
     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return
+        return (
+            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+        )
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
-        #
-        return
+        traceback.print_exc()  # Print full traceback for debugging
+        return (
+            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+        )
 
 
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
+/* Import Google Font - Inter */
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+
 /* General body and container styling */
-body {
+body {
+    font-family: 'Inter', sans-serif;
+    background-color: #eef2f6; /* Lighter background */
+    margin: 0;
+    padding: 20px;
+}
 .gradio-container {
     max-width: 1200px;
     margin: 20px auto;
-    padding:
-    box-shadow: 0
-    border-radius:
+    padding: 40px; /* Increased padding */
+    box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
+    border-radius: 15px; /* More rounded corners */
     background-color: #ffffff;
-    border: 1px solid #
+    border: 1px solid #e0e6ed; /* Subtle border */
 }
 
 /* Headings */
 h1 {
-    color: #
+    color: #1a202c; /* Darker, more professional heading color */
     text-align: center;
     margin-bottom: 30px;
-    font-size: 2.
+    font-size: 2.8em; /* Slightly larger H1 */
     font-weight: 700;
-    letter-spacing: -0.
+    letter-spacing: -0.03em;
+    text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
+}
+h3 {
+    color: #2d3748;
+    font-size: 1.3em; /* Slightly larger H3 */
+    margin-bottom: 15px;
+    font-weight: 600;
 }
-h3 { color: #34495e; font-size: 1.2em; margin-bottom: 10px; }
 
 /* Markdown text */
-.markdown-text {
-
+.markdown-text {
+    text-align: center;
+    color: #4a5568;
+    line-height: 1.7;
+    font-size: 1.05em;
+    margin-bottom: 30px;
+}
+.markdown-text div {
+    font-size: 1.1em;
+    max-width: 800px; /* Constrain width for readability */
+    margin: 0 auto;
+}
 
 /* Buttons */
 .gr-button {
-    background-color: #
+    background-color: #2f80ed; /* A vibrant, professional blue */
     color: white;
     border: none;
-    padding:
-    border-radius:
+    padding: 14px 30px; /* More padding */
+    border-radius: 10px; /* More rounded */
     cursor: pointer;
-    transition: background-color 0.3s ease, transform 0.2s ease;
-    font-size: 1.
+    transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
+    font-size: 1.15em; /* Slightly larger font */
     font-weight: 600;
-    box-shadow: 0
+    box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
+    margin: 5px; /* Add some margin for spacing between buttons */
 }
 .gr-button:hover {
-    background-color: #
-    transform: translateY(-
+    background-color: #1a6dcd; /* Darker blue on hover */
+    transform: translateY(-3px); /* More pronounced lift effect */
+    box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
 }
 .gr-button:active {
     transform: translateY(0);
-    box-shadow: 0 2px
+    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
 }
 /* Specific button styling for debug/show details */
 #debug-button, #show-details-button {
-    background-color: #
+    background-color: #718096; /* Professional grey */
+    box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
 }
 #debug-button:hover, #show-details-button:hover {
-    background-color: #
+    background-color: #5d6d81;
+    box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
 }
 #download-button {
-    background-color: #
+    background-color: #38a169; /* Muted green for download */
+    box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
 }
 #download-button:hover {
-    background-color: #
+    background-color: #277e50;
+    box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
 }
 
-
-/* Input/Output Boxes */
+/* Input/Output Boxes (Containers) */
 .gr-box {
-    border: 1px solid #
-    border-radius:
-    padding:
-    margin-bottom:
-    background-color: #
-    box-shadow: inset 0
+    border: 1px solid #cbd5e0; /* Lighter, subtle border */
+    border-radius: 12px;
+    padding: 25px; /* Increased padding */
+    margin-bottom: 25px;
+    background-color: #f8fafc; /* Very light background */
+    box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
 }
+/* Specific text output boxes (the content inside the containers) */
 .gr-output-text {
     white-space: pre-wrap;
     word-wrap: break-word;
-    background-color: #
-    border: 1px solid #
+    background-color: #ffffff; /* White background for readability */
+    border: 1px solid #e2e8f0;
     border-radius: 8px;
-    padding:
-    min-height:
+    padding: 18px; /* More padding */
+    min-height: 120px; /* Ensure a minimum height */
+    box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
+    color: #2d3748; /* Darker text for readability */
+    font-size: 0.95em;
+    line-height: 1.6;
 }
 /* Specific error output style */
 #error-message-output {
-    background-color: #
-    border-color: #
-    color: #
+    background-color: #ffe0e6; /* Light red */
+    border-color: #ff99aa; /* Slightly darker red border */
+    color: #c53030; /* Stronger red text */
+    font-weight: 500;
+    padding: 20px;
 }
 
 
 /* Labels for inputs */
 .gr-textbox label, .gr-dropdown label, .gr-slider label {
     font-weight: 600;
-    color: #
-    margin-bottom:
+    color: #2d3748; /* Darker label text */
+    margin-bottom: 10px;
     display: block;
-    font-size:
+    font-size: 1.05em; /* Slightly larger label font */
 }
 
-/*
-.gr-tab-item { padding: 25px; } /* More padding inside tabs */
+/* Tabs styling */
 .gr-tabs-nav button {
     font-weight: 600;
     font-size: 1.1em;
-    padding:
-    border-top-left-radius:
-    border-top-right-radius:
+    padding: 12px 25px; /* More padding for tabs */
+    border-top-left-radius: 10px;
+    border-top-right-radius: 10px;
+    background-color: #ebf4f8; /* Light blueish tab background */
+    color: #4a5568;
+    border: 1px solid #cce0eb; /* Subtle border for tabs */
+    border-bottom: none;
+    transition: background-color 0.3s ease, color 0.3s ease;
+}
+.gr-tabs-nav button.selected {
+    background-color: #ffffff; /* White for selected tab */
+    color: #2f80ed; /* Blue for selected text */
+    border-color: #2f80ed;
+    border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
+}
+
+/* Leaderboard specific table styling (general for all leaderboard tables) */
+.leaderboard-table {
+    border-radius: 12px;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
+    overflow: hidden;
+    margin-bottom: 25px; /* Space between tables */
+}
+.leaderboard-table table {
+    border-collapse: separate;
+    border-spacing: 0;
+    width: 100%;
+    background-color: #ffffff;
+}
+.leaderboard-table thead th {
+    background-color: #edf2f7; /* Light grey header */
+    color: #2d3748;
+    font-weight: 700;
+    padding: 15px 20px;
+    text-align: left;
+    border-bottom: 2px solid #e2e8f0;
+}
+.leaderboard-table tbody tr {
+    transition: background-color 0.2s ease;
+}
+.leaderboard-table tbody tr:nth-child(odd) {
+    background-color: #f7fafc; /* Zebra striping */
+}
+.leaderboard-table tbody tr:hover {
+    background-color: #e6fffa; /* Light teal on hover for rows */
+}
+.leaderboard-table tbody td {
+    padding: 12px 20px;
+    border-bottom: 1px solid #ebf4f8;
+    color: #4a5568;
+}
+.leaderboard-table tbody tr:last-child td {
+    border-bottom: none;
+}
+.leaderboard-table tbody tr:first-child td {
+    border-top-left-radius: 12px;
+    border-top-right-radius: 12px;
+}
+.leaderboard-table tbody tr:last-child td {
+    border-bottom-left-radius: 12px;
+    border-bottom-right-radius: 12px;
+}
+
+/* Horizontal line for separation */
+hr {
+    border: none;
+    border-top: 1px solid #e2e8f0;
+    margin: 30px 0;
 }
 """) as demo:
     gr.Markdown("""
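The rewritten loader drops the matplotlib chart in favor of plain tables: filter by benchmark, group by model, average accuracy, sort descending. A self-contained sketch of that aggregation on toy rows (column names follow the eval.jsonl schema assumed above):

    import pandas as pd

    records = [  # toy rows mirroring eval.jsonl's model_id/benchmark/subject/accuracy fields
        {"model_id": "m1", "benchmark": "MMLU", "subject": "ALL", "accuracy": 61.2},
        {"model_id": "m2", "benchmark": "MMLU", "subject": "law", "accuracy": 48.0},
        {"model_id": "m2", "benchmark": "MMLU", "subject": "math", "accuracy": 52.0},
    ]
    df = pd.DataFrame(records)

    # Prefer "ALL"-subject rows; fall back to averaging per-subject rows.
    grouped = df[df["subject"] == "ALL"].groupby("model_id")["accuracy"].mean().reset_index()
    if grouped.empty:
        grouped = df.groupby("model_id")["accuracy"].mean().reset_index()
    print(grouped.sort_values("accuracy", ascending=False))

Note that once any "ALL" row exists, m2's subject-level rows are dropped entirely; that is exactly the simplification the inline comments in the diff call out.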
@@ -489,10 +610,11 @@ with gr.Blocks(css="""
     with gr.Tabs():
         with gr.TabItem("π Run Evaluation"):
             gr.Markdown("""
-            <div
+            <div class="markdown-text">
             Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
             select a subject (or 'ALL' for a comprehensive evaluation),
             and specify the number of samples per subject.
+            Ensure your Hugging Face token is set as an environment variable for private models.
             </div>
             """)
 
@@ -521,6 +643,8 @@ with gr.Blocks(css="""
             )
             run_button = gr.Button("π Run Evaluation", elem_classes="gr-button")
 
+            gr.Markdown("<hr>")  # Visual separator
+
             with gr.Column(elem_classes="gr-box"):
                 acc_output = gr.Textbox(
                     label="Benchmark Accuracy Results",
@@ -598,24 +722,37 @@ with gr.Blocks(css="""
 
         with gr.TabItem("π Leaderboard"):
             gr.Markdown("""
-            <div
-
-            This leaderboard
+            <div class="markdown-text">
+            Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
+            This leaderboard is updated automatically with each new evaluation.
             </div>
             """)
-
-
-
-
-
-
-
-
-
-
+
+            # MMLU Leaderboard Table
+            gr.Markdown("### MMLU Top Models")
+            mmlu_leaderboard_table = gr.Dataframe(
+                headers=["Model ID", "Average Accuracy (%)"],
+                interactive=False,
+                datatype=["str", "number"],
+                row_count=10,
+                col_count=2,
+                label="MMLU Leaderboard Data",
+                elem_classes="leaderboard-table"  # Apply custom class for styling
+            )
+
+            gr.Markdown("### MMLU-Pro Top Models")
+            mmlu_pro_leaderboard_table = gr.Dataframe(
+                headers=["Model ID", "Average Accuracy (%)"],
+                interactive=False,
+                datatype=["str", "number"],
+                row_count=10,
+                col_count=2,
+                label="MMLU-Pro Leaderboard Data",
+                elem_classes="leaderboard-table"  # Apply custom class for styling
+            )
 
     # Load leaderboard when the tab is selected or when the app loads
-    demo.load(load_leaderboard, inputs=[], outputs=[
+    demo.load(load_leaderboard, inputs=[], outputs=[mmlu_leaderboard_table, mmlu_pro_leaderboard_table])
 
 # Launch the Gradio app
-demo.launch()
+demo.launch()
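demo.load fires its callback once when the page renders, distributing the return values across the listed outputs in order, which is why load_leaderboard now returns a pair. A stripped-down sketch of the same wiring, with toy data in place of eval.jsonl:

    import gradio as gr
    import pandas as pd

    def load_tables():
        # Return one value per declared output, in order.
        mmlu = pd.DataFrame([{"Model ID": "m1", "Average Accuracy (%)": 61.2}])
        mmlu_pro = pd.DataFrame([{"Model ID": "m1", "Average Accuracy (%)": 40.5}])
        return mmlu, mmlu_pro

    with gr.Blocks() as demo:
        t1 = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"], interactive=False)
        t2 = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"], interactive=False)
        demo.load(load_tables, inputs=[], outputs=[t1, t2])

    # demo.launch()  # uncomment to serve locally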