Spaces:

Enderchef
/

SuperBench-Eval

Sleeping

App Files Files Community

Enderchef committed on Jun 24, 2025

Commit

5177cd2

verified ·

1 Parent(s): b30005f

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -14

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from datasets import load_dataset
@@ -6,11 +7,13 @@ import torch
 # Cache to avoid reloading the model
 model_cache = {}
 def load_model(model_id):
     if model_id in model_cache:
         return model_cache[model_id]
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
     model_cache[model_id] = generator
     return generator
@@ -20,7 +23,11 @@ def format_prompt(item, source):
         prompt = f"{item['question']}\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\nAnswer:"
         answer = item['answer']
     elif source == "TIGER-Lab/MMLU-Pro":
-        prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\nAnswer:"
         answer = item['answer']
     elif source == "cais/hle":
         prompt = f"{item['question']}\n{item['A']}\n{item['B']}\n{item['C']}\n{item['D']}\nAnswer:"
@@ -31,7 +38,7 @@ def format_prompt(item, source):
 def evaluate(model_id, dataset_name, sample_count):
     gen = load_model(model_id)
-    dataset = load_dataset(dataset_name)
     if 'test' in dataset:
         dataset = dataset['test']
     else:
@@ -50,33 +57,38 @@ def evaluate(model_id, dataset_name, sample_count):
         results.append((prompt, output.strip(), answer, output_letter, is_correct))
     accuracy = correct / len(dataset) * 100
-    return f"Accuracy: {accuracy:.2f}%", results
 def run(model_id, benchmark, sample_count):
-    score, details = evaluate(model_id, benchmark, sample_count)
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
-    return score, formatted
 with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-width: 900px; margin: auto;}", analytics_enabled=False) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
-    Easily evaluate your Hugging Face-hosted model on:
-    - **MMLU** (`cais/mmlu`)
-    - **MMLU-Pro** (`TIGER-Lab/MMLU-Pro`)
-    - **Humanity's Last Exam** (`cais/hle`)
-    Enter your model ID, pick a benchmark, and hit evaluate.
     """)
     with gr.Row():
         model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
         benchmark = gr.Dropdown(
             label="Choose Benchmark",
-            choices=["cais/mmlu", "TIGER-Lab/MMLU-Pro", "cais/hle"],
             value="cais/mmlu"
         )
         sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
@@ -84,7 +96,9 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     run_button = gr.Button("🚀 Run Evaluation")
     acc_output = gr.Textbox(label="Benchmark Accuracy", interactive=False)
     detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
     run_button.click(run, inputs=[model_id, benchmark, sample_count], outputs=[acc_output, detail_output])
-demo.launch()

+import os
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from datasets import load_dataset
 # Cache to avoid reloading the model
 model_cache = {}
+HF_TOKEN = os.environ.get("HF_TOKEN")
 def load_model(model_id):
     """Return a text-generation pipeline for ``model_id``, building it once.

     Pipelines are memoised in the module-level ``model_cache`` dict, so a
     repeated request for the same model ID skips the download/load step.
     ``HF_TOKEN`` is forwarded to both ``from_pretrained`` calls.
     """
     # Fast path: this model ID has already been turned into a pipeline.
     try:
         return model_cache[model_id]
     except KeyError:
         pass

     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN)

     # Place the model on the GPU when one is visible, otherwise on the CPU;
     # the pipeline is given the matching device index (0 for GPU, -1 for CPU).
     on_gpu = torch.cuda.is_available()
     model = model.to("cuda" if on_gpu else "cpu")
     device_index = 0 if on_gpu else -1

     generator = pipeline(
         "text-generation",
         model=model,
         tokenizer=tokenizer,
         device=device_index,
     )
     model_cache[model_id] = generator
     return generator
         prompt = f"{item['question']}\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\nAnswer:"
         answer = item['answer']
     elif source == "TIGER-Lab/MMLU-Pro":
+        if all(opt in item for opt in ['A', 'B', 'C', 'D']):
+            prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\nAnswer:"
+        else:
+            choices = item.get("choices", ["", "", "", ""])
+            prompt = f"{item['question']}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\nAnswer:"
         answer = item['answer']
     elif source == "cais/hle":
         prompt = f"{item['question']}\n{item['A']}\n{item['B']}\n{item['C']}\n{item['D']}\nAnswer:"
 def evaluate(model_id, dataset_name, sample_count):
     gen = load_model(model_id)
+    dataset = load_dataset(dataset_name, token=HF_TOKEN)
     if 'test' in dataset:
         dataset = dataset['test']
     else:
         results.append((prompt, output.strip(), answer, output_letter, is_correct))
     accuracy = correct / len(dataset) * 100
+    return accuracy, results
 def run(model_id, benchmark, sample_count):
     """Evaluate ``model_id`` on ``benchmark`` and format the outcome for the UI.

     Returns a pair of strings: a one-line summary (the accuracy, or a notice
     that the chosen benchmark is unavailable) and a per-question report.
     Only ``cais/mmlu`` is evaluated; any other benchmark returns the notice
     together with an empty report, without calling ``evaluate``.
     """
     if benchmark == "cais/mmlu":
         accuracy, details = evaluate(model_id, benchmark, sample_count)

         # Each detail row is (question, model output, expected answer,
         # predicted letter, correctness flag); render one section per row.
         sections = []
         for q, o, a, g, c in details:
             sections.append(
                 f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
             )
         formatted = "\n\n".join(sections)
         return f"Accuracy: {accuracy:.2f}%", formatted

     return "Only MMLU (cais/mmlu) is available now. MMLU-Pro and Humanity's Last Exam are coming soon.", ""
def save_text(text):
    """Write ``text`` to ``evaluation_results.txt`` and return the file's path.

    This is the click handler feeding a single ``gr.File`` output, which is
    expected to receive the path of a file that exists on disk. The previous
    implementation returned a ``(filename, text)`` tuple and never created the
    file, so there was nothing to download.

    Args:
        text: The evaluation report to save. ``None`` is treated as an empty
            report so that clicking download before running an evaluation
            still yields a valid (empty) file.

    Returns:
        The path, as a string, of the newly written UTF-8 text file.
    """
    # Imported locally so this function does not rely on module-level imports.
    import tempfile
    from pathlib import Path

    # A fresh directory per call keeps the download name stable
    # ("evaluation_results.txt") without separate calls overwriting each other.
    out_dir = Path(tempfile.mkdtemp(prefix="superbench_eval_"))
    out_path = out_dir / "evaluation_results.txt"
    out_path.write_text(text or "", encoding="utf-8")
    return str(out_path)
 with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-width: 900px; margin: auto;}", analytics_enabled=False) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
+    Currently, only **MMLU** (`cais/mmlu`) is available for evaluation.
+    **MMLU-Pro** and **Humanity's Last Exam** will be coming soon.
+    Enter your model ID, pick MMLU, and hit evaluate.
     """)
     with gr.Row():
         model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
         benchmark = gr.Dropdown(
             label="Choose Benchmark",
+            choices=["cais/mmlu"],
             value="cais/mmlu"
         )
         sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
     run_button = gr.Button("🚀 Run Evaluation")
     acc_output = gr.Textbox(label="Benchmark Accuracy", interactive=False)
     detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
+    download_button = gr.Button("📥 Download Full Evaluation")
     run_button.click(run, inputs=[model_id, benchmark, sample_count], outputs=[acc_output, detail_output])
+    download_button.click(save_text, inputs=detail_output, outputs=gr.File())
+demo.launch(share=True)