Spaces:

Enderchef
/

SuperBench-Eval

Sleeping

App Files Community

Enderchef committed on Jun 25

Commit

1c17342

verified ·

1 Parent(s): 6cc6a40

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -12

app.py CHANGED Viewed

@@ -20,11 +20,7 @@ MMLU_DATASET = "cais/mmlu"
 MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 def get_all_benchmark_options():
-    """
-    Dynamically fetches all available subjects for MMLU and MMLU-Pro.
-    Returns a dictionary mapping benchmark dataset IDs to their subjects,
-    and a flattened list suitable for a Gradio dropdown.
-    """
     all_options = {}
     gr_dropdown_options = [] # This is for initial display only, not used for dynamic updates directly
@@ -89,10 +85,7 @@ def load_model(model_id):
 def format_prompt(item):
-    """
-    Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM.
-    The prompt is designed for the model to output a single letter answer (A, B, C, D).
-    """
     prompt = f"""{item['question']}
 A. {item['choices'][0]}
 B. {item['choices'][1]}
@@ -647,7 +640,7 @@ with gr.Blocks(css="""
         with gr.TabItem("🚀 Run Evaluation"):
             gr.Markdown("""
             <div class="markdown-text">
-                Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
                 select a subject (or 'ALL' for a comprehensive evaluation),
                 and specify the number of samples per subject.
                 Ensure your Hugging Face token is set as an environment variable for private models.
@@ -750,14 +743,14 @@ with gr.Blocks(css="""
         with gr.TabItem("📊 Leaderboard"):
             gr.Markdown("""
             <div class="markdown-text">
-                Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
                 This leaderboard is updated automatically with each new evaluation.
             </div>
             """)
             # Leaderboard Type Toggle
             leaderboard_type_toggle = gr.Radio(
-                ["MMLU", "MMLU-Pro"],
                 label="Select Benchmark for Leaderboard",
                 value="MMLU", # Default to MMLU
                 interactive=True,

 MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 def get_all_benchmark_options():
     all_options = {}
     gr_dropdown_options = [] # This is for initial display only, not used for dynamic updates directly
 def format_prompt(item):
     prompt = f"""{item['question']}
 A. {item['choices'][0]}
 B. {item['choices'][1]}
         with gr.TabItem("🚀 Run Evaluation"):
             gr.Markdown("""
             <div class="markdown-text">
+                Enter your Hugging Face Model ID, choose a benchmark (MMLU only for now),
                 select a subject (or 'ALL' for a comprehensive evaluation),
                 and specify the number of samples per subject.
                 Ensure your Hugging Face token is set as an environment variable for private models.
         with gr.TabItem("📊 Leaderboard"):
             gr.Markdown("""
             <div class="markdown-text">
+                Explore the performance of various LLMs on a chunk of MMLU called MMLU Small.
                 This leaderboard is updated automatically with each new evaluation.
             </div>
             """)
             # Leaderboard Type Toggle
             leaderboard_type_toggle = gr.Radio(
+                ["MMLU Small"],
                 label="Select Benchmark for Leaderboard",
                 value="MMLU", # Default to MMLU
                 interactive=True,