Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

App Files Files Community

Enderchef committed on Jun 25, 2025

Commit

4e79574

verified ·

1 Parent(s): 0a040f1

Update app.py

Browse files

Files changed (1) hide show

app.py +564 -121

app.py CHANGED Viewed

@@ -1,172 +1,615 @@
 import os
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from datasets import load_dataset, get_dataset_config_names # Import get_dataset_config_names
 import torch
 import re
 import json
 import pandas as pd
 import matplotlib.pyplot as plt
 # Cache to avoid reloading the model
 model_cache = {}
 HF_TOKEN = os.environ.get("HF_TOKEN")
 def load_model(model_id):
     if model_id in model_cache:
         return model_cache[model_id]
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN).to("cuda" if torch.cuda.is_available() else "cpu")
-    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-    model_cache[model_id] = generator
-    return generator
 def format_prompt(item):
-    # Simplified prompt: rely on max_new_tokens=1 and model's understanding for single-letter answer
     prompt = f"""{item['question']}
 A. {item['choices'][0]}
 B. {item['choices'][1]}
 C. {item['choices'][2]}
 D. {item['choices'][3]}
-Answer:""" # Removed direct instruction from here
-    return prompt, item['answer']
 def extract_choice_letter(output):
-    # This function should now be more reliable as max_new_tokens is set to 1
     match = re.search(r"\b([ABCD])\b", output.strip())
-    return match.group(1) if match else None
 def get_choice_letter(index):
     """Converts a numerical choice index (0-3) to a capital letter (A-D)."""
-    return chr(ord('A') + index)
-def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
-    if config_name == "ALL":
-        # Dynamically get all MMLU subjects
-        subjects = get_dataset_config_names("cais/mmlu", token=HF_TOKEN)
-        gen = load_model(model_id)
-        total_correct = 0
-        total_samples = 0
-        all_results = []
-        for i, subject in enumerate(progress.tqdm(subjects, desc="Evaluating subjects")):
-            dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
-            dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
-            correct_subject = 0
-            for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
-                prompt, answer_idx = format_prompt(item) # answer_idx is 0, 1, 2, or 3
-                expected_letter = get_choice_letter(answer_idx) # Convert to 'A', 'B', 'C', 'D'
-                # Crucial change: Limit generation to 1 new token
-                output = gen(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
-                output_letter = extract_choice_letter(output) # Extract the letter from model's output
-                is_correct = output_letter == expected_letter
-                correct_subject += is_correct
-                all_results.append((prompt, output.strip(), expected_letter, output_letter, is_correct)) # Store expected_letter
-            total_correct += correct_subject
-            total_samples += len(dataset)
-        avg_accuracy = total_correct / total_samples * 100
-        return avg_accuracy, all_results
-    gen = load_model(model_id)
-    dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
-    dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
-    correct = 0
-    results = []
-    for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
-        prompt, answer_idx = format_prompt(item) # answer_idx is 0, 1, 2, or 3
-        expected_letter = get_choice_letter(answer_idx) # Convert to 'A', 'B', 'C', 'D'
-        # Crucial change: Limit generation to 1 new token
-        output = gen(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
-        output_letter = extract_choice_letter(output) # Extract the letter from model's output
-        is_correct = output_letter == expected_letter
-        correct += is_correct
-        results.append((prompt, output.strip(), expected_letter, output_letter, is_correct)) # Store expected_letter
-    accuracy = correct / len(dataset) * 100
-    return accuracy, results
-def run(model_id, sample_count, config_name, progress=gr.Progress()):
-    accuracy_value, details = evaluate(model_id, sample_count, config_name, progress)
-    formatted = "\n\n".join([
-        f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
-        for q, o, a, g, c in details
-    ])
-    if config_name == "ALL":
-        score_string = f"Average Accuracy: {accuracy_value:.2f}% across all subjects"
-    else:
-        score_string = f"Accuracy: {accuracy_value:.2f}%, out of {len(details)} samples"
-    record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
-    with open("eval.jsonl", "a") as f:
-        f.write(json.dumps(record) + "\n")
-    return score_string, formatted
-def save_text(text):
-    return "evaluation_results.txt", text
-with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-width: 900px; margin: auto;}", analytics_enabled=False) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
-    Currently, only **MMLU** (`cais/mmlu`) is available for evaluation.
-    **MMLU-Pro** and **Humanity's Last Exam** will be coming soon.
-    Enter your model ID, pick MMLU, choose a subject, and hit evaluate.
-    """)
-    # Get all MMLU subject config names dynamically
-    mmlu_subjects = ["ALL"] + get_dataset_config_names("cais/mmlu", token=HF_TOKEN)
-    with gr.Row():
-        model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
-        config_name = gr.Dropdown(
-            label="Choose MMLU Subject",
-            choices=mmlu_subjects, # Populate with all subjects
-            value="ALL",
-            interactive=True # Make interactive now that there are more choices
-        )
-        sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
-    run_button = gr.Button("🚀 Run Evaluation")
-    acc_output = gr.Textbox(label="Benchmark Accuracy", interactive=False)
-    detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
-    download_button = gr.Button("📥 Download Full Evaluation")
-    run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
-    download_button.click(save_text, inputs=detail_output, outputs=gr.File())
-    with gr.Row():
-        leaderboard_plot = gr.Plot(label="Leaderboard Chart")
-        leaderboard_table = gr.Dataframe(headers=["Model ID", "Average Accuracy"], interactive=False, datatype=["str", "number"], row_count=20, col_count=2)
-    def load_leaderboard():
-        try:
-            df = pd.read_json("eval.jsonl", lines=True)
-            df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
-            df_avg.columns = ["model_id", "average_accuracy"]
-            df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
-            top10 = df_sorted.head(10)
-            fig, ax = plt.subplots()
-            ax.barh(top10['model_id'], top10['average_accuracy'])
-            ax.set_xlabel("Average Accuracy")
-            ax.set_ylabel("Model")
-            ax.set_title("Top 10 Models by Average Accuracy")
-            return fig, df_sorted
-        except Exception as e:
-            # Handle the case where eval.jsonl might not exist yet
-            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
-    demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 demo.launch()

 import os
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from datasets import load_dataset, get_dataset_config_names
 import torch
 import re
 import json
 import pandas as pd
 import matplotlib.pyplot as plt
+import traceback # Import traceback for detailed error logging
 # Cache to avoid reloading the model
 model_cache = {}
 HF_TOKEN = os.environ.get("HF_TOKEN")
+# --- Constants for Benchmarks ---
+MMLU_DATASET = "cais/mmlu"
+MMLU_PRO_DATASET = "cais/mmlu_pro"
+# Humanity's Last Exam is a composite benchmark, not a single dataset readily available like MMLU/MMLU-Pro.
+# For this implementation, we will focus on MMLU and MMLU-Pro, which are direct datasets.
+# Integrating HLE would require evaluating across multiple specific datasets.
def get_all_benchmark_options():
    """
    Look up the subject (config) names of every supported benchmark.

    Returns:
        tuple: ``(subjects_by_dataset, dropdown_labels)``.
        ``subjects_by_dataset`` maps each dataset ID to ``["ALL", *subjects]``,
        or to an empty list when the lookup for that dataset failed.
        ``dropdown_labels`` is the flat list of ``"<Benchmark> - <subject>"``
        strings used to populate the Gradio dropdown.
    """
    # One row per benchmark: (dataset ID, dropdown prefix, warning on failure).
    benchmarks = (
        (
            MMLU_DATASET,
            "MMLU",
            "Warning: Could not load MMLU dataset configs.",
        ),
        (
            MMLU_PRO_DATASET,
            "MMLU-Pro",
            "Warning: Could not load MMLU-Pro dataset configs. "
            "It might not be accessible or available.",
        ),
    )

    subjects_by_dataset = {}
    dropdown_labels = []
    for dataset_id, label, warning in benchmarks:
        try:
            subjects = ["ALL"] + get_dataset_config_names(dataset_id, token=HF_TOKEN)
        except Exception as e:
            # Best effort: a benchmark that cannot be listed is simply not offered.
            print(f"{warning} Error: {e}")
            subjects_by_dataset[dataset_id] = []
            continue
        subjects_by_dataset[dataset_id] = subjects
        dropdown_labels += [f"{label} - {s}" for s in subjects]
    return subjects_by_dataset, dropdown_labels


# Resolved once at import time so the dropdown and the evaluator share one list.
ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
 def load_model(model_id):
+    """
+    Loads a Hugging Face model and its tokenizer, then creates a text-generation pipeline.
+    Uses a cache to avoid re-loading if the model is already in memory.
+    Provides Gradio Info/Error messages for user feedback.
+    Raises an exception if model loading fails.
+    """
+    gr.Info(f"Attempting to load model: {model_id}...")
     if model_id in model_cache:
+        gr.Info(f"Model '{model_id}' already loaded from cache.")
         return model_cache[model_id]
+    try:
+        # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
+        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            token=HF_TOKEN,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        ).to("cuda" if torch.cuda.is_available() else "cpu")
+        # Create a text-generation pipeline
+        generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+        # Cache the loaded generator
+        model_cache[model_id] = generator
+        gr.Info(f"Model '{model_id}' loaded successfully.")
+        return generator
+    except Exception as e:
+        # Re-raise the exception to be caught by the outer run_evaluation try-except
+        raise ValueError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
 def format_prompt(item):
+    """
+    Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM.
+    The prompt is designed for the model to output a single letter answer (A, B, C, D).
+    """
     prompt = f"""{item['question']}
 A. {item['choices'][0]}
 B. {item['choices'][1]}
 C. {item['choices'][2]}
 D. {item['choices'][3]}
+Answer:"""
+    return prompt, item['answer'] # Returns the prompt string and the correct choice index (0-3)
 def extract_choice_letter(output):
+    """
+    Extracts the most likely choice letter (A, B, C, D) from the model's generated output.
+    It prioritizes an exact match after "Answer:", then looks for any single capital letter.
+    """
+    # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
+    match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE) # Added IGNORECASE for robustness
+    if match:
+        return match.group(1).upper() # Ensure it's uppercase
+    # Fallback: look for a single capital letter A-D anywhere in the output
     match = re.search(r"\b([ABCD])\b", output.strip())
+    if match:
+        return match.group(1)
+    return None # Return None if no valid choice letter is found
 def get_choice_letter(index):
     """Converts a numerical choice index (0-3) to a capital letter (A-D)."""
+    if 0 <= index <= 3:
+        return chr(ord('A') + index)
+    return None # Return None for invalid indices
def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
    Evaluate a text-generation pipeline on one subject of a benchmark dataset.

    Args:
        generator: The Hugging Face text-generation pipeline to query.
        dataset_id (str): Dataset repository ID (e.g. "cais/mmlu").
        subject (str): Subject/config name within the dataset.
        sample_count (int): Maximum number of test samples to evaluate.
        progress (gr.Progress): Gradio progress tracker used for the loop.

    Returns:
        tuple: ``(accuracy, results)`` where ``accuracy`` is a percentage
        (0 when no samples were evaluated) and ``results`` is a list with one
        dict per evaluated question.

    Raises:
        RuntimeError: If the dataset's "test" split cannot be loaded. The
            underlying exception is chained as ``__cause__``.
    """
    gr.Info(f"Loading dataset: {dataset_id} - {subject}...")
    try:
        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"]
    except Exception as e:
        raise RuntimeError(
            f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}"
        ) from e

    # Fixed seed so the same subset is evaluated on every run.
    # int(): the value comes from a UI slider and must be usable in range().
    num_samples_to_evaluate = min(int(sample_count), len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate))

    correct_count = 0
    subject_results = []
    for item in progress.tqdm(dataset, desc=f"Processing {subject} samples"):
        prompt, answer_idx = format_prompt(item)
        expected_letter = get_choice_letter(answer_idx)

        # One greedy token is enough for a letter answer.
        # return_full_text=False makes the pipeline return only the completion;
        # by default it echoes the prompt, whose "A. ..." option line and
        # question text would pollute letter extraction and the reasoning check.
        generation = generator(
            prompt,
            max_new_tokens=1,
            do_sample=False,
            return_full_text=False,
        )
        output_raw = generation[0]["generated_text"]

        # Heuristic flag for models that start with markup or reasoning words.
        is_reasoning_model_output = (
            '<' in output_raw
            or re.search(r"\b(because|therefore|thus|reasoning)\b", output_raw, re.IGNORECASE) is not None
        )

        predicted_letter = extract_choice_letter(output_raw)
        # A missing prediction is never correct, even if the expected letter
        # could not be derived either.
        is_correct = predicted_letter is not None and predicted_letter == expected_letter
        correct_count += is_correct

        subject_results.append({
            "question": item['question'],
            "choices": item['choices'],
            "model_raw_output": output_raw.strip(),
            "expected_answer_letter": expected_letter,
            "predicted_answer_letter": predicted_letter,
            "is_correct": is_correct,
            "is_reasoning_model_output": is_reasoning_model_output,
        })

    accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
    return accuracy, subject_results
def _hidden_outputs(message=""):
    """Build the output tuple that shows only ``message`` and hides every optional panel."""
    return (
        message,                             # acc_output
        gr.update(value="", visible=False),  # error_message_output
        gr.update(visible=False),            # debug_error_column
        gr.update(visible=False),            # show_details_button
        gr.update(visible=False),            # download_button
        gr.update(value="", visible=False),  # detail_output
    )


def _format_result_entry(entry):
    """Render one per-question result dict as a Markdown-style text section."""
    choice_lines = "\n".join(
        f"{chr(ord('A') + i)}. {c}" for i, c in enumerate(entry['choices'])
    )
    note = (
        "**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n"
        if entry.get('is_reasoning_model_output')
        else ""
    )
    return (
        f"### Question:\n{entry['question']}\n\n"
        f"**Choices:**\n{choice_lines}\n\n"
        f"{note}"
        f"**Model Raw Output:** {entry['model_raw_output']}\n"
        f"**Expected Answer:** {entry['expected_answer_letter']}\n"
        f"**Predicted Answer:** {entry['predicted_answer_letter']}\n"
        f"**Correct:** {'Yes' if entry['is_correct'] else 'No'}"
    )


def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
    """
    Run an evaluation for the selected benchmark/subject and build the UI updates.

    Handles a single subject or every subject ("ALL") of MMLU / MMLU-Pro,
    appends a summary record to ``eval.jsonl`` and returns six values matching
    the click handler's outputs: accuracy text, error textbox, debug column,
    show-details button, download button and detailed-log textbox.

    Args:
        model_id (str): Hugging Face model ID entered by the user.
        selected_benchmark_subject (str): Dropdown value of the form
            ``"<Benchmark> - <subject>"``.
        sample_count (int): Maximum number of samples per subject.
        progress (gr.Progress): Gradio progress tracker.

    Returns:
        tuple: ``(score_text, error_update, debug_column_update,
        show_details_update, download_update, details_update)``.
    """
    gr.Info("Starting evaluation...")
    if not model_id:
        gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
        return _hidden_outputs()

    # The dropdown value is None when no benchmark could be listed at startup.
    parts = (selected_benchmark_subject or "").split(" - ", 1)
    if len(parts) != 2:
        # gr.Error only displays when raised; use a toast so we can still return updates.
        gr.Warning("Invalid benchmark selection format. Please select from the dropdown.")
        return _hidden_outputs()
    benchmark_name, subject_name = parts

    dataset_id_map = {
        "MMLU": MMLU_DATASET,
        "MMLU-Pro": MMLU_PRO_DATASET,
    }
    current_dataset_id = dataset_id_map.get(benchmark_name)
    if not current_dataset_id:
        gr.Warning(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
        return _hidden_outputs()

    try:
        generator = load_model(model_id)  # raises on failure
        all_evaluation_results = []

        if subject_name == "ALL":
            # Build a new list: the global subject list must not be mutated.
            subjects_to_evaluate = [
                s for s in ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if s != "ALL"
            ]
            if not subjects_to_evaluate:
                gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
                return _hidden_outputs()

            total_correct_overall = 0
            total_samples_overall = 0
            eval_summary_lines = []
            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
                gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                try:
                    accuracy, subject_details = evaluate_single_subject(
                        generator, current_dataset_id, sub, sample_count, progress
                    )
                except Exception as e:
                    # One failing subject must not abort the whole sweep.
                    gr.Warning(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
                    continue
                all_evaluation_results.extend(subject_details)
                num_evaluated_samples = len(subject_details)
                num_correct_in_subject = sum(d['is_correct'] for d in subject_details)
                total_correct_overall += num_correct_in_subject
                total_samples_overall += num_evaluated_samples
                eval_summary_lines.append(
                    f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)"
                )

            overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
            score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
            score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
        else:
            accuracy, subject_details = evaluate_single_subject(
                generator, current_dataset_id, subject_name, sample_count, progress
            )
            all_evaluation_results.extend(subject_details)
            overall_accuracy = accuracy
            score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {len(subject_details)} samples."

        formatted_details = "\n\n".join(
            _format_result_entry(entry) for entry in all_evaluation_results
        )

        # Append one summary record; the leaderboard reads this file.
        record = {
            "model_id": model_id,
            "benchmark": benchmark_name,
            "subject": subject_name,
            "accuracy": overall_accuracy,
            "sample_count": len(all_evaluation_results),
            "timestamp": pd.Timestamp.now().isoformat(),
        }
        with open("eval.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

        gr.Info("Evaluation completed successfully!")
        return (
            score_string,
            gr.update(value="", visible=False),   # error_message_output
            gr.update(visible=False),             # debug_error_column
            # Reset the label because the log panel below starts hidden again.
            gr.update(value="🔍 Show Detailed Logs", visible=True),
            gr.update(visible=True),              # download_button
            gr.update(value=formatted_details, visible=False),
        )
    except Exception:
        detailed_error_traceback = traceback.format_exc()
        gr.Warning("An error occurred during evaluation.")
        return (
            "Error occurred during evaluation. We'll evaluate for you! If this persists, please open a community support tab for assistance.",
            gr.update(value=detailed_error_traceback, visible=True),
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(value="", visible=False),
        )
def save_text(text_content):
    """
    Write ``text_content`` to ``evaluation_results.txt`` for download.

    Args:
        text_content (str): The text to save.

    Returns:
        str | None: The path of the written file, or ``None`` when there is
        nothing to save or the file could not be written (a warning toast is
        shown in both cases).
    """
    if not text_content:
        gr.Warning("No evaluation results to download.")
        return None

    file_path = "evaluation_results.txt"
    try:
        # Explicit UTF-8: the logs contain model output and arbitrary question
        # text, which the platform default encoding may not be able to encode.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text_content)
    except OSError as e:
        # gr.Error is only shown when raised; a warning toast lets us return None.
        gr.Warning(f"Error saving file: {e}")
        return None
    return file_path
def load_leaderboard():
    """
    Build the leaderboard chart and table from ``eval.jsonl``.

    Accuracy is averaged per model over every recorded evaluation. The chart
    shows the ten best models; the table lists all models, best first.

    Returns:
        tuple: ``(figure, dataframe)``. When the file is missing or cannot be
        processed, an empty figure and an empty DataFrame with the expected
        columns are returned and a warning toast is shown.
    """
    # A plain Figure is not registered with pyplot's global figure manager,
    # so figures created on each page load are not kept alive.
    from matplotlib.figure import Figure

    columns = ["Model ID", "Average Accuracy (%)"]
    try:
        df = pd.read_json("eval.jsonl", lines=True)

        df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
        df_avg.columns = columns
        df_sorted = df_avg.sort_values(by="Average Accuracy (%)", ascending=False)

        # barh draws the first row at the bottom, so plot in ascending order
        # to put the best model at the top.
        top_models_plot = df_sorted.head(10).sort_values(by="Average Accuracy (%)", ascending=True)

        fig = Figure(figsize=(10, 6))
        ax = fig.subplots()
        ax.barh(top_models_plot['Model ID'], top_models_plot['Average Accuracy (%)'], color='#007bff')
        ax.set_xlabel("Average Accuracy (%)", fontsize=12)
        ax.set_ylabel("Model ID", fontsize=12)
        ax.set_title("Top 10 Models by Average MMLU/MMLU-Pro Accuracy", fontsize=14)
        ax.set_xlim(0, 100)  # accuracy is a percentage
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=10)
        ax.grid(axis='x', linestyle='--', alpha=0.7)
        fig.tight_layout()

        return fig, df_sorted
    except FileNotFoundError:
        gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
        return Figure(), pd.DataFrame(columns=columns)
    except Exception as e:
        # gr.Error is only shown when raised; warn and fall back to an empty board.
        gr.Warning(f"Error loading leaderboard: {e}")
        return Figure(), pd.DataFrame(columns=columns)
+# --- Gradio Interface Definition ---
+with gr.Blocks(css="""
+    /* General body and container styling */
+    body { font-family: 'Inter', sans-serif; background-color: #f0f2f5; margin: 0; padding: 20px; }
+    .gradio-container {
+        max-width: 1200px;
+        margin: 20px auto;
+        padding: 30px;
+        box-shadow: 0 8px 16px rgba(0,0,0,0.15);
+        border-radius: 12px;
+        background-color: #ffffff;
+        border: 1px solid #e0e0e0;
+    }
+    /* Headings */
+    h1 {
+        color: #2c3e50;
+        text-align: center;
+        margin-bottom: 30px;
+        font-size: 2.5em;
+        font-weight: 700;
+        letter-spacing: -0.02em;
+    }
+    h3 { color: #34495e; font-size: 1.2em; margin-bottom: 10px; }
+    /* Markdown text */
+    .markdown-text { text-align: center; color: #555; line-height: 1.6; }
+    .markdown-text div { font-size: 1.1em; }
+    /* Buttons */
+    .gr-button {
+        background-color: #007bff; /* Primary blue */
+        color: white;
+        border: none;
+        padding: 12px 25px;
+        border-radius: 8px;
+        cursor: pointer;
+        transition: background-color 0.3s ease, transform 0.2s ease;
+        font-size: 1.1em;
+        font-weight: 600;
+        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+    }
+    .gr-button:hover {
+        background-color: #0056b3; /* Darker blue on hover */
+        transform: translateY(-2px); /* Slight lift effect */
+    }
+    .gr-button:active {
+        transform: translateY(0);
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    }
+    /* Specific button styling for debug/show details */
+    #debug-button, #show-details-button {
+        background-color: #6c757d; /* Grey for secondary actions */
+    }
+    #debug-button:hover, #show-details-button:hover {
+        background-color: #5a6268;
+    }
+    #download-button {
+        background-color: #28a745; /* Green for download */
+    }
+    #download-button:hover {
+        background-color: #218838;
+    }
+    /* Input/Output Boxes */
+    .gr-box {
+        border: 1px solid #dee2e6;
+        border-radius: 10px;
+        padding: 20px;
+        margin-bottom: 20px;
+        background-color: #fdfdfd;
+        box-shadow: inset 0 1px 3px rgba(0,0,0,0.05);
+    }
+    .gr-output-text {
+        white-space: pre-wrap;
+        word-wrap: break-word;
+        background-color: #f9f9fb;
+        border: 1px solid #e9ecef;
+        border-radius: 8px;
+        padding: 15px;
+        min-height: 100px; /* Ensure a minimum height */
+    }
+    /* Specific error output style */
+    #error-message-output {
+        background-color: #ffe0e0;
+        border-color: #ff9999;
+        color: #cc0000;
+    }
+    /* Labels for inputs */
+    .gr-textbox label, .gr-dropdown label, .gr-slider label {
+        font-weight: 600;
+        color: #495057;
+        margin-bottom: 8px;
+        display: block;
+        font-size: 1em;
+    }
+    /* Tab styling */
+    .gr-tab-item { padding: 25px; } /* More padding inside tabs */
+    .gr-tabs-nav button {
+        font-weight: 600;
+        font-size: 1.1em;
+        padding: 10px 20px;
+        border-top-left-radius: 8px;
+        border-top-right-radius: 8px;
+    }
+""") as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
+    """)
+    with gr.Tabs():
+        with gr.TabItem("🚀 Run Evaluation"):
+            gr.Markdown("""
+            <div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
+                Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
+                select a subject (or 'ALL' for a comprehensive evaluation),
+                and specify the number of samples per subject.
+            </div>
+            """)
+            with gr.Column(elem_classes="gr-box"):
+                model_id_input = gr.Textbox(
+                    label="Your Hugging Face Model ID",
+                    placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
+                    interactive=True
+                )
+                with gr.Row():
+                    benchmark_subject_dropdown = gr.Dropdown(
+                        label="Choose Benchmark and Subject",
+                        choices=GRADIO_DROPDOWN_OPTIONS,
+                        value="MMLU - ALL", # Default to MMLU ALL for initial load
+                        interactive=True,
+                        min_width=400 # Ensure sufficient width
+                    )
+                    sample_count_slider = gr.Slider(
+                        label="Number of Samples per Subject (1-100)",
+                        minimum=1,
+                        maximum=100,
+                        value=10, # Default to 10 samples
+                        step=1,
+                        interactive=True,
+                        min_width=200
+                    )
+                run_button = gr.Button("🚀 Run Evaluation", elem_classes="gr-button")
+            with gr.Column(elem_classes="gr-box"):
+                acc_output = gr.Textbox(
+                    label="Benchmark Accuracy Results",
+                    interactive=False,
+                    elem_classes="gr-output-text",
+                    lines=5,
+                    placeholder="Evaluation results will appear here."
+                )
+                # Container for debug info, initially hidden
+                with gr.Column(visible=False, elem_id="debug-error-column") as debug_error_column:
+                    error_message_output = gr.Textbox(
+                        label="Debug Information (Error Details)",
+                        lines=10, interactive=False, elem_classes="gr-output-text", elem_id="error-message-output",
+                        placeholder="Error details will appear here if an error occurs."
+                    )
+                    debug_button = gr.Button("🐛 Hide Debug Info", visible=True, elem_id="debug-button", elem_classes="gr-button")
+                with gr.Row():
+                    show_details_button = gr.Button("🔍 Show Detailed Logs", visible=False, elem_id="show-details-button", elem_classes="gr-button")
+                    download_button = gr.Button("📥 Download Full Evaluation Logs", visible=False, elem_id="download-button", elem_classes="gr-button")
+                # Detailed output, initially hidden
+                detail_output = gr.Textbox(
+                    label="Detailed Evaluation Logs",
+                    lines=20,
+                    interactive=False,
+                    elem_classes="gr-output-text",
+                    placeholder="Detailed logs for each question will appear here upon successful evaluation.",
+                    visible=False # Initially hidden
+                )
+            # Define button click actions
+            run_button.click(
+                run_evaluation,
+                inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
+                outputs=[
+                    acc_output,
+                    error_message_output, debug_error_column, # For error state
+                    show_details_button, download_button, detail_output # For success state
+                ]
+            )
+            # Toggle visibility of detail_output
+            show_details_button.click(
+                lambda s: gr.update(visible=not s), # Toggle visibility
+                inputs=[detail_output], # Pass the component itself as input
+                outputs=[detail_output] # The component to update
+            )
+            # Change button text based on visibility
+            show_details_button.click(
+                lambda s: "🙈 Hide Detailed Logs" if not s else "🔍 Show Detailed Logs",
+                inputs=[detail_output],
+                outputs=[show_details_button]
+            )
+            # Toggle visibility of debug error column
+            debug_button.click(
+                lambda s: gr.update(visible=not s), # Toggle visibility
+                inputs=[debug_error_column], # Pass the component itself as input
+                outputs=[debug_error_column] # The component to update
+            )
+            # Change debug button text based on visibility
+            debug_button.click(
+                lambda s: "🐛 Show Debug Info" if not s else "🐛 Hide Debug Info",
+                inputs=[debug_error_column],
+                outputs=[debug_button]
+            )
+            download_button.click(
+                save_text,
+                inputs=[detail_output],
+                outputs=gr.File(label="Download Evaluation Results", file_count="single", type="filepath")
+            )
+        with gr.TabItem("📊 Leaderboard"):
+            gr.Markdown("""
+            <div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
+                See how different models perform on average across all evaluated benchmarks.
+                This leaderboard updates with every new evaluation.
+            </div>
+            """)
+            with gr.Row():
+                leaderboard_plot_output = gr.Plot(label="Top 10 Models by Average Accuracy", scale=2) # Scale for better visibility
+                leaderboard_table_output = gr.Dataframe(
+                    headers=["Model ID", "Average Accuracy (%)"],
+                    interactive=False,
+                    datatype=["str", "number"],
+                    row_count=10, # Display top 10 rows initially, but can scroll
+                    col_count=2,
+                    label="Full Leaderboard Data"
+                )
+            # Load leaderboard when the tab is selected or when the app loads
+            demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot_output, leaderboard_table_output])
+# Launch the Gradio app
 demo.launch()