import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, pipeline
import textstat
import math

class QGEvaluator:
    """Judge generated questions on fluency and answerability.

    Fluency is measured as the GPT-2 perplexity of the question text.
    Answerability is measured by running an extractive question-answering
    pipeline on (question, context) and comparing the answer it extracts with
    the answer the question was generated from.
    """

    # A question counts as fluent when its perplexity is strictly below this.
    FLUENCY_PPL_THRESHOLD = 100
    # A QA score strictly above this counts as "high confidence".
    CONFIDENCE_THRESHOLD = 0.5

    def __init__(self):
        """Load the GPT-2 language model and the QA pipeline (both on CPU)."""
        print("Loading Evaluation Models...")
        self.device = "cpu"

        # 1. Fluency Model
        self.ppl_model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.ppl_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

        # 2. Answerability Model
        self.qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

    def calculate_perplexity(self, text):
        """Metric 1: Fluency (lower is better).

        Args:
            text: The string to score.

        Returns:
            exp(loss) of the GPT-2 model on ``text`` as a float, or
            ``float("inf")`` when ``text`` tokenizes to fewer than two tokens.
        """
        encodings = self.ppl_tokenizer(text, return_tensors="pt")
        input_ids = encodings.input_ids.to(self.device)

        # Perplexity scores next-token predictions, so it is undefined for
        # empty or single-token input. Report "infinitely bad" instead of
        # letting the model fail or produce NaN; callers comparing against a
        # threshold then treat such text as not fluent.
        if input_ids.size(-1) < 2:
            return float("inf")

        with torch.no_grad():
            # Passing the inputs as labels makes the model return its LM loss.
            outputs = self.ppl_model(input_ids, labels=input_ids)
        return math.exp(outputs.loss.item())

    @staticmethod
    def _answers_match(original_answer, predicted_answer):
        """Return True when one non-empty answer contains the other.

        The comparison ignores case and surrounding whitespace. An empty (or
        ``None``) answer on either side never matches: the empty string is a
        substring of every string, so a plain ``in`` test would otherwise
        report a match whenever one side is empty.
        """
        original = (original_answer or "").strip().lower()
        predicted = (predicted_answer or "").strip().lower()
        if not original or not predicted:
            return False
        return original in predicted or predicted in original

    def evaluate_question(self, question, context, original_answer):
        """Run both checks on one generated question and produce a verdict.

        Args:
            question: The generated question text.
            context: The passage the QA pipeline searches for an answer.
            original_answer: The answer the question was generated from.

        Returns:
            A dict with the keys "Question", "Original Answer",
            "Predicted Answer", "Match" ("Yes"/"No") and "Verdict".
        """
        # --- 1. Fluency Check ---
        ppl = self.calculate_perplexity(question)
        is_fluent = ppl < self.FLUENCY_PPL_THRESHOLD

        # --- 2. Answerability Check ---
        qa_result = self.qa_pipeline(question=question, context=context)
        predicted_answer = qa_result['answer']
        confidence = qa_result['score']

        # Case-insensitive containment in either direction, e.g. "Mike" vs
        # "mike" or "political party" vs "A political party".
        is_correct_answer = self._answers_match(original_answer, predicted_answer)

        # Verdict: GOOD when the question is fluent AND (the QA model found
        # the right answer OR it answered with high confidence).
        is_answerable = is_correct_answer or confidence > self.CONFIDENCE_THRESHOLD
        verdict = "✅ Good" if is_fluent and is_answerable else "❌ Bad"

        return {
            "Question": question,
            "Original Answer": original_answer,
            "Predicted Answer": predicted_answer,  # what the QA model extracted
            "Match": "Yes" if is_correct_answer else "No",
            "Verdict": verdict
        }

# --- Usage Example ---
if __name__ == "__main__":
    # Demo run: load the models once, then score a few hand-written
    # (context, answer, question) samples and print each result dict.
    qg_evaluator = QGEvaluator()

    samples = (
        dict(
            context="A political party is a political organization that typically seeks to attain and maintain political power within government, usually by participating in political campaigns, educational outreach, or protest actions.",
            original_answer="A political party",
            question="What is a political organization that seeks to attain and maintain political power within government?",
        ),
        dict(
            context="A political system is a framework which defines acceptable political methods within a society.",
            original_answer="A political system",
            question="What defines acceptable political methods within a society?",
        ),
        dict(
            context=" The branch of social science that studies politics and government is referred to as political science.",
            original_answer="political science",
            question="What is the branch of social science that studies politics and government called?",
        ),
    )

    print("\n--- Evaluation Report ---")
    for sample in samples:
        report = qg_evaluator.evaluate_question(
            question=sample["question"],
            context=sample["context"],
            original_answer=sample["original_answer"],
        )
        print(report)