# Hugging Face Space: live model-benchmarking app (Space status was "Running")
import gradio as gr
import json
import os
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi
import lm_eval
from lm_eval.models.huggingface import HFLM

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hub write token, read from the environment (unset -> uploads are skipped)
DATASET_REPO = "FlameF0X/benchmark-results"  # dataset repo that receives the per-run result JSON files
def run_benchmarks(model_name):
    """Run zero-shot lm-eval benchmarks on a Hub model and publish the scores.

    Evaluates ``model_name`` on MMLU, GSM8K and TruthfulQA (mc2), writes the
    scores to a local JSON file, best-effort uploads that file to
    ``DATASET_REPO``, and returns Gradio-friendly outputs.

    Args:
        model_name: Hugging Face Hub repo id, e.g. 'meta-llama/Llama-3.2-1B'.

    Returns:
        ``(summary_markdown, table_rows)`` where ``table_rows`` is a
        single-row list for the results DataFrame, or ``(error_markdown,
        None)`` when validation or evaluation fails.
    """
    if not model_name:
        return "### β Error\nPlease enter a valid Hugging Face model name (e.g., 'meta-llama/Llama-3.2-1B')", None
    filename = None  # set once results exist; cleaned up in `finally`
    try:
        # 1. Initialize the model for evaluation.
        # Use the GPU only when one is actually exposed to the process.
        print(f"Loading model: {model_name}...")
        lm_obj = HFLM(
            pretrained=model_name,
            device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
        )

        # 2. Tasks to run. 'mmlu' is a group; 'truthfulqa_mc2' is the
        # standard TruthfulQA variant. NOTE: 'humaneval' is deliberately NOT
        # evaluated (it requires executing generated code); it is reported
        # as 0 and excluded from the average.
        tasks = ["mmlu", "gsm8k", "truthfulqa_mc2"]
        print(f"Starting evaluation on tasks: {tasks}...")
        results_raw = lm_eval.simple_evaluate(
            model=lm_obj,
            tasks=tasks,
            num_fewshot=0,  # 0-shot evaluation; increase for few-shot
            batch_size="auto",
        )

        # 3. Extract scores as percentages (0-100). Harness metric keys are
        # named '<metric>,<filter>'.
        task_scores = results_raw["results"]
        evaluated = {
            "MMLU": round(task_scores.get("mmlu", {}).get("acc,none", 0) * 100, 2),
            "GSM8K": round(task_scores.get("gsm8k", {}).get("exact_match,strict-match", 0) * 100, 2),
            "TruthfulQA": round(task_scores.get("truthfulqa_mc2", {}).get("acc,none", 0) * 100, 2),
        }
        # Bug fix: average only over tasks actually evaluated — previously
        # the never-run HumanEval contributed a constant 0 to the mean.
        avg_score = round(sum(evaluated.values()) / len(evaluated), 2)
        results = {**evaluated, "HumanEval": 0.0}  # placeholder; task not run

        # 4. Data entry matching the existing storage schema.
        entry = {
            "model_name": model_name,
            "average": avg_score,
            **results,
            "timestamp": datetime.now().isoformat(),
        }

        # 5. Save locally, then best-effort upload to the dataset repo.
        filename = f"result_{model_name.replace('/', '_')}_{datetime.now().strftime('%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(entry, f)
        # Skip upload when no token is configured or the repo id is still
        # the template placeholder.
        if HF_TOKEN and DATASET_REPO != "user/benchmark-results":
            try:
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=filename,
                    # Bug fix: this previously uploaded every run to the
                    # literal path "results/(unknown)".
                    path_in_repo=f"results/{filename}",
                    repo_id=DATASET_REPO,
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
            except Exception as e:
                print(f"Upload failed: {e}")  # best-effort: keep the local result flow going

        summary = f"### β Results for {model_name}\n**Average Score: {avg_score}%**"
        return summary, [[model_name, avg_score, results["MMLU"], results["GSM8K"], results["HumanEval"]]]
    except Exception as e:
        error_msg = f"### β Evaluation Failed\n**Error:** {str(e)}"
        return error_msg, None
    finally:
        # Bug fix: always remove the temp file, even when evaluation or
        # serialization raised part-way through.
        if filename and os.path.exists(filename):
            os.remove(filename)
# --- Gradio UI (wires the form to run_benchmarks) ---
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# π AI Model Benchmarking Hub (Live Eval)")
    gr.Markdown("Note: Running this requires a GPU and time for model inference.")

    with gr.Row():
        # Left column: model picker and the trigger button.
        with gr.Column(scale=1):
            name_box = gr.Textbox(
                label="Model Name",
                placeholder="e.g., meta-llama/Llama-3.2-1B",
            )
            run_button = gr.Button("π Run Full Evaluation", variant="primary")
        # Right column: markdown summary plus the per-benchmark score table.
        with gr.Column(scale=2):
            summary_md = gr.Markdown("Enter a model name to start actual evaluation.")
            scores_df = gr.DataFrame(
                headers=["Model", "Avg", "MMLU", "GSM8K", "HumanEval"],
                datatype=["str", "number", "number", "number", "number"],
                label="Benchmark Results",
            )

    run_button.click(run_benchmarks, inputs=name_box, outputs=[summary_md, scores_df])

if __name__ == "__main__":
    app.launch()