import gradio as gr
import json
import os
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi
import lm_eval
from lm_eval.models.huggingface import HFLM

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = "FlameF0X/benchmark-results"


def run_benchmarks(model_name):
    """Run a suite of lm-eval benchmarks against a Hugging Face model.

    Args:
        model_name: Hub model id, e.g. 'meta-llama/Llama-3.2-1B'.

    Returns:
        A ``(summary_markdown, table_rows)`` pair, where ``table_rows`` is a
        single-row list for the Gradio DataFrame, or ``(error_markdown, None)``
        when the input is empty or the evaluation fails.
    """
    if not model_name:
        return "### ❌ Error\nPlease enter a valid Hugging Face model name (e.g., 'meta-llama/Llama-3.2-1B')", None

    try:
        # 1. Initialize the model for evaluation.
        # 'pretrained' is the model weight path; use the GPU only when one is
        # actually visible to this process.
        print(f"Loading model: {model_name}...")
        lm_obj = HFLM(
            pretrained=model_name,
            device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
        )

        # 2. Tasks to run.
        # 'mmlu' is a task group; 'gsm8k' is math; 'truthfulqa_mc2' is the
        # standard TruthfulQA variant.
        tasks = ["mmlu", "gsm8k", "truthfulqa_mc2"]
        print(f"Starting evaluation on tasks: {tasks}...")

        results_raw = lm_eval.simple_evaluate(
            model=lm_obj,
            tasks=tasks,
            num_fewshot=0,  # 0-shot evaluation; increase for few-shot
            batch_size="auto",
        )

        # 3. Extract scores as percentages (0-100).  Metric keys vary by
        # task: accuracy tasks report 'acc,none'; GSM8K reports exact match.
        # NOTE(review): 'humaneval' is NOT in `tasks`, so its score is always
        # the 0 default here — either add it to `tasks` (it requires the
        # code-execution opt-in) or drop the column.  To avoid that permanent
        # 0 dragging the score down, it is excluded from the average below.
        results = {
            "MMLU": round(results_raw["results"].get("mmlu", {}).get("acc,none", 0) * 100, 2),
            "GSM8K": round(results_raw["results"].get("gsm8k", {}).get("exact_match,strict-match", 0) * 100, 2),
            "HumanEval": round(results_raw["results"].get("humaneval", {}).get("pass@1", 0) * 100, 2),
            "TruthfulQA": round(results_raw["results"].get("truthfulqa_mc2", {}).get("acc,none", 0) * 100, 2),
        }

        # Average only the benchmarks that were actually evaluated.
        evaluated = [results["MMLU"], results["GSM8K"], results["TruthfulQA"]]
        avg_score = round(sum(evaluated) / len(evaluated), 2)

        # 4. Prepare the data entry (matches the existing storage schema).
        entry = {
            "model_name": model_name,
            "average": avg_score,
            **results,
            "timestamp": datetime.now().isoformat(),
        }

        # 5. Save locally, upload to the results dataset, then clean up.
        filename = f"result_{model_name.replace('/', '_')}_{datetime.now().strftime('%H%M%S')}.json"
        try:
            with open(filename, 'w') as f:
                json.dump(entry, f)

            # Skip the upload while the repo id is still the template default.
            if HF_TOKEN and DATASET_REPO != "user/benchmark-results":
                try:
                    api = HfApi()
                    api.upload_file(
                        path_or_fileobj=filename,
                        # BUG FIX: was the literal placeholder "results/(unknown)",
                        # so every run overwrote one bogus file in the dataset.
                        path_in_repo=f"results/{filename}",
                        repo_id=DATASET_REPO,
                        repo_type="dataset",
                        token=HF_TOKEN,
                    )
                except Exception as e:
                    # Best-effort upload: a failure should not kill the run.
                    print(f"Upload failed: {e}")
        finally:
            # Always remove the temp file, even if saving/uploading raised.
            if os.path.exists(filename):
                os.remove(filename)

        summary = f"### ✅ Results for {model_name}\n**Average Score: {avg_score}%**"
        return summary, [[model_name, avg_score, results["MMLU"], results["GSM8K"], results["HumanEval"]]]

    except Exception as e:
        # Surface any evaluation failure to the UI instead of crashing Gradio.
        error_msg = f"### ❌ Evaluation Failed\n**Error:** {str(e)}"
        return error_msg, None


# --- Gradio UI remains the same ---
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🏆 AI Model Benchmarking Hub (Live Eval)")
    gr.Markdown("Note: Running this requires a GPU and time for model inference.")

    with gr.Row():
        with gr.Column(scale=1):
            model_input = gr.Textbox(label="Model Name", placeholder="e.g., meta-llama/Llama-3.2-1B")
            submit_btn = gr.Button("🚀 Run Full Evaluation", variant="primary")
        with gr.Column(scale=2):
            output_summary = gr.Markdown("Enter a model name to start actual evaluation.")

    results_table = gr.DataFrame(
        headers=["Model", "Avg", "MMLU", "GSM8K", "HumanEval"],
        datatype=["str", "number", "number", "number", "number"],
        label="Benchmark Results"
    )

    submit_btn.click(
        run_benchmarks,
        inputs=model_input,
        outputs=[output_summary, results_table]
    )

if __name__ == "__main__":
    app.launch()