import math

import textstat  # not used in this part of the file; kept in case other code relies on it
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, pipeline


class QGEvaluator:
    """Judge generated questions on fluency and answerability.

    Fluency is measured as the GPT-2 perplexity of the question text.
    Answerability is checked by running an extractive question-answering
    pipeline over the context and comparing the answer it finds with the
    answer the question was generated from.
    """

    def __init__(self, ppl_threshold=100.0, confidence_threshold=0.5):
        """Load the evaluation models.

        Args:
            ppl_threshold: A question counts as fluent when its perplexity is
                strictly below this value.
            confidence_threshold: QA score strictly above which a question is
                accepted as answerable even if the answers do not match.
        """
        print("Loading Evaluation Models...")
        self.device = "cpu"
        self.ppl_threshold = ppl_threshold
        self.confidence_threshold = confidence_threshold

        # 1. Fluency model
        self.ppl_model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.ppl_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

        # 2. Answerability model
        self.qa_pipeline = pipeline(
            "question-answering", model="deepset/roberta-base-squad2"
        )

    def calculate_perplexity(self, text):
        """Metric 1: fluency as language-model perplexity (lower is better).

        Args:
            text: The text to score.

        Returns:
            exp(mean token loss) as a float. ``float("inf")`` is returned when
            the text is empty/blank, tokenizes to fewer than two tokens (there
            is no next-token prediction to score), or the loss is too large
            for ``math.exp``.
        """
        if not text or not text.strip():
            return float("inf")

        # Truncate to the tokenizer's maximum length so over-long text cannot
        # exceed the model's context window.
        encodings = self.ppl_tokenizer(text, return_tensors="pt", truncation=True)
        input_ids = encodings.input_ids.to(self.device)
        if input_ids.shape[-1] < 2:
            return float("inf")

        with torch.no_grad():
            outputs = self.ppl_model(input_ids, labels=input_ids)
        loss = outputs.loss.item()

        try:
            return math.exp(loss)
        except OverflowError:
            return float("inf")

    @staticmethod
    def _answers_match(original_answer, predicted_answer):
        """Return True when one non-empty answer contains the other (case-insensitive)."""
        expected = (original_answer or "").strip().lower()
        predicted = (predicted_answer or "").strip().lower()
        # An empty string is a substring of every string, so without this
        # guard a missing answer on either side would count as a match.
        if not expected or not predicted:
            return False
        return expected in predicted or predicted in expected

    def evaluate_question(self, question, context, original_answer):
        """Run both checks on one generated question and return a verdict.

        Args:
            question: The generated question.
            context: The passage the question was generated from.
            original_answer: The answer span the question is supposed to elicit.

        Returns:
            A dict with the keys "Question", "Original Answer",
            "Predicted Answer", "Match" ("Yes"/"No") and "Verdict".
        """
        # --- 1. Fluency check ---
        ppl = self.calculate_perplexity(question)
        is_fluent = ppl < self.ppl_threshold

        # --- 2. Answerability check ---
        if question and question.strip():
            qa_result = self.qa_pipeline(question=question, context=context)
            predicted_answer = qa_result["answer"]
            confidence = qa_result["score"]
        else:
            # A blank question has infinite perplexity and is already
            # non-fluent, so there is nothing to send to the QA model.
            predicted_answer = ""
            confidence = 0.0

        # Lowercased containment in either direction (e.g. "Mike" in "Mike").
        is_correct_answer = self._answers_match(original_answer, predicted_answer)

        # Verdict: good when fluent AND (right answer OR high QA confidence).
        if is_fluent and (
            is_correct_answer or confidence > self.confidence_threshold
        ):
            verdict = "✅ Good"
        else:
            verdict = "❌ Bad"

        return {
            "Question": question,
            "Original Answer": original_answer,
            "Predicted Answer": predicted_answer,  # what the QA model extracted
            "Match": "Yes" if is_correct_answer else "No",
            "Verdict": verdict,
        }


# --- Usage Example ---
if __name__ == "__main__":
    evaluator = QGEvaluator()

    data = [
        {
            "context": "A political party is a political organization that typically seeks to attain and maintain political power within government, usually by participating in political campaigns, educational outreach, or protest actions.",
            "original_answer": "A political party",
            "question": "What is a political organization that seeks to attain and maintain political power within government?",
        },
        {
            "context": "A political system is a framework which defines acceptable political methods within a society.",
            "original_answer": "A political system",
            "question": "What defines acceptable political methods within a society?",
        },
        {
            "context": " The branch of social science that studies politics and government is referred to as political science.",
            "original_answer": "political science",
            "question": "What is the branch of social science that studies politics and government called?",
        },
    ]

    print("\n--- Evaluation Report ---")
    for item in data:
        metrics = evaluator.evaluate_question(
            item["question"],
            item["context"],
            item["original_answer"],
        )
        print(metrics)