# Hugging Face Space: live model-benchmarking app (Space status was "Running")
import gradio as gr
import json
import os
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi
import lm_eval
from lm_eval.models.huggingface import HFLM

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hub write token, read from the environment (unset -> uploads are skipped)
DATASET_REPO = "FlameF0X/benchmark-results"  # dataset repo that receives the per-run result JSON files
def run_benchmarks(model_name):
    """Run zero-shot lm-eval benchmarks on a Hub model and publish the scores.

    Evaluates ``model_name`` on MMLU, GSM8K and TruthfulQA (mc2), writes the
    scores to a local JSON file, best-effort uploads that file to
    ``DATASET_REPO``, and returns Gradio-friendly outputs.

    Args:
        model_name: Hugging Face Hub repo id, e.g. 'meta-llama/Llama-3.2-1B'.

    Returns:
        ``(summary_markdown, table_rows)`` where ``table_rows`` is a
        single-row list for the results DataFrame, or ``(error_markdown,
        None)`` when validation or evaluation fails.
    """
    if not model_name:
        return "### β Error\nPlease enter a valid Hugging Face model name (e.g., 'meta-llama/Llama-3.2-1B')", None
    filename = None  # set once results exist; cleaned up in `finally`
    try:
        # 1. Initialize the model for evaluation.
        # Use the GPU only when one is actually exposed to the process.
        print(f"Loading model: {model_name}...")
        lm_obj = HFLM(
            pretrained=model_name,
            device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
        )

        # 2. Tasks to run. 'mmlu' is a group; 'truthfulqa_mc2' is the
        # standard TruthfulQA variant. NOTE: 'humaneval' is deliberately NOT
        # evaluated (it requires executing generated code); it is reported
        # as 0 and excluded from the average.
        tasks = ["mmlu", "gsm8k", "truthfulqa_mc2"]
        print(f"Starting evaluation on tasks: {tasks}...")
        results_raw = lm_eval.simple_evaluate(
            model=lm_obj,
            tasks=tasks,
            num_fewshot=0,  # 0-shot evaluation; increase for few-shot
            batch_size="auto",
        )

        # 3. Extract scores as percentages (0-100). Harness metric keys are
        # named '<metric>,<filter>'.
        task_scores = results_raw["results"]
        evaluated = {
            "MMLU": round(task_scores.get("mmlu", {}).get("acc,none", 0) * 100, 2),
            "GSM8K": round(task_scores.get("gsm8k", {}).get("exact_match,strict-match", 0) * 100, 2),
            "TruthfulQA": round(task_scores.get("truthfulqa_mc2", {}).get("acc,none", 0) * 100, 2),
        }
        # Bug fix: average only over tasks actually evaluated — previously
        # the never-run HumanEval contributed a constant 0 to the mean.
        avg_score = round(sum(evaluated.values()) / len(evaluated), 2)
        results = {**evaluated, "HumanEval": 0.0}  # placeholder; task not run

        # 4. Data entry matching the existing storage schema.
        entry = {
            "model_name": model_name,
            "average": avg_score,
            **results,
            "timestamp": datetime.now().isoformat(),
        }

        # 5. Save locally, then best-effort upload to the dataset repo.
        filename = f"result_{model_name.replace('/', '_')}_{datetime.now().strftime('%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(entry, f)
        # Skip upload when no token is configured or the repo id is still
        # the template placeholder.
        if HF_TOKEN and DATASET_REPO != "user/benchmark-results":
            try:
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=filename,
                    # Bug fix: this previously uploaded every run to the
                    # literal path "results/(unknown)".
                    path_in_repo=f"results/{filename}",
                    repo_id=DATASET_REPO,
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
            except Exception as e:
                print(f"Upload failed: {e}")  # best-effort: keep the local result flow going

        summary = f"### β Results for {model_name}\n**Average Score: {avg_score}%**"
        return summary, [[model_name, avg_score, results["MMLU"], results["GSM8K"], results["HumanEval"]]]
    except Exception as e:
        error_msg = f"### β Evaluation Failed\n**Error:** {str(e)}"
        return error_msg, None
    finally:
        # Bug fix: always remove the temp file, even when evaluation or
        # serialization raised part-way through.
        if filename and os.path.exists(filename):
            os.remove(filename)
# --- Gradio UI (wires the form to run_benchmarks) ---
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# π AI Model Benchmarking Hub (Live Eval)")
    gr.Markdown("Note: Running this requires a GPU and time for model inference.")

    with gr.Row():
        # Left column: model picker and the trigger button.
        with gr.Column(scale=1):
            name_box = gr.Textbox(
                label="Model Name",
                placeholder="e.g., meta-llama/Llama-3.2-1B",
            )
            run_button = gr.Button("π Run Full Evaluation", variant="primary")
        # Right column: markdown summary plus the per-benchmark score table.
        with gr.Column(scale=2):
            summary_md = gr.Markdown("Enter a model name to start actual evaluation.")
            scores_df = gr.DataFrame(
                headers=["Model", "Avg", "MMLU", "GSM8K", "HumanEval"],
                datatype=["str", "number", "number", "number", "number"],
                label="Benchmark Results",
            )

    run_button.click(run_benchmarks, inputs=name_box, outputs=[summary_md, scores_df])

if __name__ == "__main__":
    app.launch()