# app.py — FlameF0X's benchmarking Space (commit c282fa8, verified)
import gradio as gr
import json
import os
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi
import lm_eval
from lm_eval.models.huggingface import HFLM
# Configuration
# Hub write token for uploading results; None when unset (uploads are skipped).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repo that accumulates per-run result JSON files.
DATASET_REPO = "FlameF0X/benchmark-results"
def run_benchmarks(model_name):
    """Run lm-eval-harness benchmarks on a Hugging Face model.

    Args:
        model_name: Hub repo id, e.g. "meta-llama/Llama-3.2-1B".

    Returns:
        A ``(summary_markdown, table_rows)`` tuple. ``table_rows`` is a
        single-row list ``[[model, avg, MMLU, GSM8K, HumanEval]]`` on
        success, or ``None`` on error (the summary then holds the message).
    """
    # Guard clause: reject empty/None input before any heavy work.
    if not model_name:
        return "### ❌ Error\nPlease enter a valid Hugging Face model name (e.g., 'meta-llama/Llama-3.2-1B')", None
    filename = None
    try:
        # 1. Initialize the model; use GPU only when CUDA is visible.
        print(f"Loading model: {model_name}...")
        lm_obj = HFLM(
            pretrained=model_name,
            device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
        )

        # 2. Tasks actually evaluated. NOTE: 'humaneval' is NOT run (it
        # requires executing generated code), so it is reported as 0.0 and
        # deliberately excluded from the average.
        tasks = ["mmlu", "gsm8k", "truthfulqa_mc2"]
        print(f"Starting evaluation on tasks: {tasks}...")
        results_raw = lm_eval.simple_evaluate(
            model=lm_obj,
            tasks=tasks,
            num_fewshot=0,  # 0-shot evaluation; increase for few-shot
            batch_size="auto",
        )

        # 3. Extract scores as percentages (0-100). Metric keys follow the
        # harness convention: "acc,none" / "exact_match,strict-match".
        task_results = results_raw["results"]
        evaluated = {
            "MMLU": round(task_results.get("mmlu", {}).get("acc,none", 0) * 100, 2),
            "GSM8K": round(task_results.get("gsm8k", {}).get("exact_match,strict-match", 0) * 100, 2),
            "TruthfulQA": round(task_results.get("truthfulqa_mc2", {}).get("acc,none", 0) * 100, 2),
        }
        # BUG FIX: the original averaged in a hard-coded HumanEval score of 0
        # even though that task never ran, deflating the mean. Average only
        # over the tasks that were actually evaluated.
        avg_score = round(sum(evaluated.values()) / len(evaluated), 2)
        results = {**evaluated, "HumanEval": 0.0}  # placeholder: task not run

        # 4. Prepare the data entry (matches the existing storage schema).
        entry = {
            "model_name": model_name,
            "average": avg_score,
            **results,
            "timestamp": datetime.now().isoformat(),
        }

        # 5. Save locally, then (best-effort) upload to the dataset repo.
        filename = f"result_{model_name.replace('/', '_')}_{datetime.now().strftime('%H%M%S')}.json"
        with open(filename, "w") as f:
            json.dump(entry, f)

        if HF_TOKEN:
            try:
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=filename,
                    # BUG FIX: the original uploaded everything to the literal
                    # path "results/(unknown)"; use the real result filename.
                    path_in_repo=f"results/{filename}",
                    repo_id=DATASET_REPO,
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
            except Exception as e:
                # Upload is best-effort: log and keep the local result flow.
                print(f"Upload failed: {e}")

        summary = f"### ✅ Results for {model_name}\n**Average Score: {avg_score}%**"
        return summary, [[model_name, avg_score, results["MMLU"], results["GSM8K"], results["HumanEval"]]]
    except Exception as e:
        # Surface any load/eval failure to the UI instead of crashing the app.
        return f"### ❌ Evaluation Failed\n**Error:** {str(e)}", None
    finally:
        # BUG FIX: always remove the local temp file, even when eval or
        # upload raised (the original only cleaned up on the success path).
        if filename and os.path.exists(filename):
            os.remove(filename)
# --- Gradio UI ---
# Wires a model-name textbox and a run button to run_benchmarks; results are
# shown as a Markdown summary plus a one-row DataFrame.
# BUG FIX: repaired mojibake emojis in user-facing strings ("πŸ†" -> 🏆,
# "πŸš€" -> 🚀), which rendered as garbage in the browser.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🏆 AI Model Benchmarking Hub (Live Eval)")
    gr.Markdown("Note: Running this requires a GPU and time for model inference.")
    with gr.Row():
        with gr.Column(scale=1):
            model_input = gr.Textbox(label="Model Name", placeholder="e.g., meta-llama/Llama-3.2-1B")
            submit_btn = gr.Button("🚀 Run Full Evaluation", variant="primary")
        with gr.Column(scale=2):
            output_summary = gr.Markdown("Enter a model name to start actual evaluation.")
            results_table = gr.DataFrame(
                headers=["Model", "Avg", "MMLU", "GSM8K", "HumanEval"],
                datatype=["str", "number", "number", "number", "number"],
                label="Benchmark Results",
            )
    # Button click runs the (potentially long) evaluation and updates both outputs.
    submit_btn.click(
        run_benchmarks,
        inputs=model_input,
        outputs=[output_summary, results_table],
    )

if __name__ == "__main__":
    app.launch()