import os
import json
import re
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sklearn.metrics import f1_score
from huggingface_hub import login


# Normalization functions (identical to extractor)
def normalize_answer(s):
    """Lowercase, strip punctuation/articles, and collapse whitespace (SQuAD-style)."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between predicted and ground-truth answers."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


# Identical confidence calculation to extractor
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True,
    )

    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Geometric mean of start/end probabilities as the confidence score
    confidence = np.sqrt(
        start_probs[0, answer_start].item() * end_probs[0, answer_end - 1].item()
    )

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer.strip(), float(confidence)


def run_evaluation(num_samples, progress=gr.Progress()):
    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        login(token=hf_token)

    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)

    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token,
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
        print(f"✓ Loaded {len(test_data)} samples")
    except Exception as e:
        return f"❌ Dataset load failed: {str(e)}", pd.DataFrame(
            columns=["Question", "Prediction", "Truth", "Confidence", "Exact Match", "F1"]
        ), None

    results = []
    for i, example in enumerate(test_data):
        progress(0.2 + 0.7 * i / len(test_data), desc=f"Evaluating {i + 1}/{len(test_data)}")

        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

        results.append({
            "Question": question[:100] + "..." if len(question) > 100 else question,
            "Prediction": pred_answer,
            "Truth": gt_answer,
            "Confidence": confidence,
            "Exact Match": exact_match_score(pred_answer, gt_answer),
            "F1": f1_score_qa(pred_answer, gt_answer),
        })

    # Generate report
    df = pd.DataFrame(results)
    report = f"""
Evaluation Results (n={len(df)})
=================
- Exact Match: {df['Exact Match'].mean():.1%}
- F1 Score: {df['F1'].mean():.1%}
- Avg Confidence: {df['Confidence'].mean():.1%}
- High-Confidence (>80%) Accuracy: {df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
"""

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(df['Exact Match'].mean()),
                "f1": float(df['F1'].mean()),
                "avg_confidence": float(df['Confidence'].mean()),
            },
            "samples": results,
        }, f, indent=2)

    return report, df, results_file


def create_gradio_interface():
    with gr.Blocks(title="CUAD Evaluator") as demo:
        gr.Markdown("## 🏛️ CUAD QA Model Evaluation")

        with gr.Row():
            num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of Samples")
            eval_btn = gr.Button("🚀 Run Evaluation", variant="primary")

        with gr.Row():
            report = gr.Markdown("Results will appear here...")
            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])

        download = gr.File(label="Download Results", visible=False)

        def run_and_display(num_samples):
            report_text, df, file = run_evaluation(num_samples)
            return (
                report_text,
                df[["Question", "Prediction", "Confidence", "Exact Match"]],
                gr.File(visible=True, value=file),
            )

        eval_btn.click(
            fn=run_and_display,
            inputs=num_samples,
            outputs=[report, results_table, download],
        )

    return demo


if __name__ == "__main__":
    # Verify CUDA
    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("! Using CPU")

    # Launch Gradio
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )