import gradio as gr
import json
import os
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi
import lm_eval
from lm_eval.models.huggingface import HFLM

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = "FlameF0X/benchmark-results"


def run_benchmarks(model_name):
    """Run a suite of lm-eval benchmarks against a Hugging Face model.

    Args:
        model_name: Hub model id, e.g. 'meta-llama/Llama-3.2-1B'.

    Returns:
        A ``(summary_markdown, table_rows)`` pair, where ``table_rows`` is a
        single-row list for the Gradio DataFrame, or ``(error_markdown, None)``
        when the input is empty or the evaluation fails.
    """
    if not model_name:
        return "### ❌ Error\nPlease enter a valid Hugging Face model name (e.g., 'meta-llama/Llama-3.2-1B')", None

    try:
        # 1. Initialize the model for evaluation.
        # 'pretrained' is the model weight path; use the GPU only when one is
        # actually visible to this process.
        print(f"Loading model: {model_name}...")
        lm_obj = HFLM(
            pretrained=model_name,
            device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
        )

        # 2. Tasks to run.
        # 'mmlu' is a task group; 'gsm8k' is math; 'truthfulqa_mc2' is the
        # standard TruthfulQA variant.
        tasks = ["mmlu", "gsm8k", "truthfulqa_mc2"]
        print(f"Starting evaluation on tasks: {tasks}...")

        results_raw = lm_eval.simple_evaluate(
            model=lm_obj,
            tasks=tasks,
            num_fewshot=0,  # 0-shot evaluation; increase for few-shot
            batch_size="auto",
        )

        # 3. Extract scores as percentages (0-100).  Metric keys vary by
        # task: accuracy tasks report 'acc,none'; GSM8K reports exact match.
        # NOTE(review): 'humaneval' is NOT in `tasks`, so its score is always
        # the 0 default here — either add it to `tasks` (it requires the
        # code-execution opt-in) or drop the column.  To avoid that permanent
        # 0 dragging the score down, it is excluded from the average below.
        results = {
            "MMLU": round(results_raw["results"].get("mmlu", {}).get("acc,none", 0) * 100, 2),
            "GSM8K": round(results_raw["results"].get("gsm8k", {}).get("exact_match,strict-match", 0) * 100, 2),
            "HumanEval": round(results_raw["results"].get("humaneval", {}).get("pass@1", 0) * 100, 2),
            "TruthfulQA": round(results_raw["results"].get("truthfulqa_mc2", {}).get("acc,none", 0) * 100, 2),
        }

        # Average only the benchmarks that were actually evaluated.
        evaluated = [results["MMLU"], results["GSM8K"], results["TruthfulQA"]]
        avg_score = round(sum(evaluated) / len(evaluated), 2)

        # 4. Prepare the data entry (matches the existing storage schema).
        entry = {
            "model_name": model_name,
            "average": avg_score,
            **results,
            "timestamp": datetime.now().isoformat(),
        }

        # 5. Save locally, upload to the results dataset, then clean up.
        filename = f"result_{model_name.replace('/', '_')}_{datetime.now().strftime('%H%M%S')}.json"
        try:
            with open(filename, 'w') as f:
                json.dump(entry, f)

            # Skip the upload while the repo id is still the template default.
            if HF_TOKEN and DATASET_REPO != "user/benchmark-results":
                try:
                    api = HfApi()
                    api.upload_file(
                        path_or_fileobj=filename,
                        # BUG FIX: was the literal placeholder "results/(unknown)",
                        # so every run overwrote one bogus file in the dataset.
                        path_in_repo=f"results/{filename}",
                        repo_id=DATASET_REPO,
                        repo_type="dataset",
                        token=HF_TOKEN,
                    )
                except Exception as e:
                    # Best-effort upload: a failure should not kill the run.
                    print(f"Upload failed: {e}")
        finally:
            # Always remove the temp file, even if saving/uploading raised.
            if os.path.exists(filename):
                os.remove(filename)

        summary = f"### ✅ Results for {model_name}\n**Average Score: {avg_score}%**"
        return summary, [[model_name, avg_score, results["MMLU"], results["GSM8K"], results["HumanEval"]]]

    except Exception as e:
        # Surface any evaluation failure to the UI instead of crashing Gradio.
        error_msg = f"### ❌ Evaluation Failed\n**Error:** {str(e)}"
        return error_msg, None


# --- Gradio UI remains the same ---
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🏆 AI Model Benchmarking Hub (Live Eval)")
    gr.Markdown("Note: Running this requires a GPU and time for model inference.")

    with gr.Row():
        with gr.Column(scale=1):
            model_input = gr.Textbox(label="Model Name", placeholder="e.g., meta-llama/Llama-3.2-1B")
            submit_btn = gr.Button("🚀 Run Full Evaluation", variant="primary")
        with gr.Column(scale=2):
            output_summary = gr.Markdown("Enter a model name to start actual evaluation.")

    results_table = gr.DataFrame(
        headers=["Model", "Avg", "MMLU", "GSM8K", "HumanEval"],
        datatype=["str", "number", "number", "number", "number"],
        label="Benchmark Results"
    )

    submit_btn.click(
        run_benchmarks,
        inputs=model_input,
        outputs=[output_summary, results_table]
    )

if __name__ == "__main__":
    app.launch()