Spaces:
Build error
Build error
| import torch | |
| from transformers import GPT2LMHeadModel, GPT2TokenizerFast, pipeline | |
| import textstat | |
| import math | |
class QGEvaluator:
    """Judge generated questions on fluency and answerability.

    Fluency is the GPT-2 perplexity of the question text (lower is better).
    Answerability is checked by running an extractive question-answering
    pipeline over the context and comparing what it finds with the answer
    the question was generated from.
    """

    # Class-level defaults so the thresholds exist even on an instance that
    # was created without running __init__ (which downloads/loads models).
    ppl_threshold = 100.0
    confidence_threshold = 0.5

    def __init__(self, ppl_threshold=100.0, confidence_threshold=0.5):
        """Load both evaluation models on the CPU.

        Args:
            ppl_threshold: A question counts as fluent when its perplexity
                is strictly below this value.
            confidence_threshold: QA score strictly above which a question
                is accepted even if the predicted answer does not match.
        """
        print("Loading Evaluation Models...")
        self.device = "cpu"
        self.ppl_threshold = ppl_threshold
        self.confidence_threshold = confidence_threshold

        # 1. Fluency model
        self.ppl_model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.ppl_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

        # 2. Answerability model
        self.qa_pipeline = pipeline(
            "question-answering", model="deepset/roberta-base-squad2"
        )

    def calculate_perplexity(self, text: str) -> float:
        """Metric 1: fluency as GPT-2 perplexity (lower is better).

        Args:
            text: The text to score.

        Returns:
            ``exp(loss)`` of the language model on ``text``, or
            ``float("inf")`` when the text is empty, tokenizes to fewer than
            two tokens, or the loss is too large to exponentiate.
        """
        if not text or not text.strip():
            return float("inf")

        # Truncate to the model's positional limit so very long inputs do
        # not index past the position embeddings.
        encodings = self.ppl_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.ppl_model.config.n_positions,
        )
        input_ids = encodings.input_ids.to(self.device)

        # The LM loss predicts token i+1 from token i, so at least two
        # tokens are needed; with fewer the loss is undefined (NaN).
        if input_ids.size(1) < 2:
            return float("inf")

        with torch.no_grad():
            outputs = self.ppl_model(input_ids, labels=input_ids)

        try:
            return math.exp(outputs.loss.item())
        except OverflowError:
            return float("inf")

    @staticmethod
    def _answers_match(original_answer, predicted_answer) -> bool:
        """Return True when one answer contains the other, ignoring case.

        Surrounding whitespace is ignored. An empty (or ``None``) answer on
        either side never matches: ``"" in s`` is always true in Python, so
        without this guard an empty prediction would count as correct.
        """
        expected = (original_answer or "").strip().lower()
        predicted = (predicted_answer or "").strip().lower()
        if not expected or not predicted:
            return False
        return expected in predicted or predicted in expected

    def evaluate_question(self, question, context, original_answer):
        """Run the fluency and answerability checks and return a verdict.

        Args:
            question: The generated question to evaluate.
            context: The passage the question should be answerable from.
            original_answer: The answer the question was generated for.

        Returns:
            A dict with the keys ``"Question"``, ``"Original Answer"``,
            ``"Predicted Answer"``, ``"Match"`` (``"Yes"``/``"No"``) and
            ``"Verdict"``.
        """
        # --- 1. Fluency check ---
        ppl = self.calculate_perplexity(question)
        is_fluent = ppl < self.ppl_threshold

        # --- 2. Answerability check ---
        qa_result = self.qa_pipeline(question=question, context=context)
        predicted_answer = qa_result['answer']
        confidence = qa_result['score']

        is_correct_answer = self._answers_match(original_answer, predicted_answer)

        # Verdict: good when the question is fluent AND the QA model either
        # recovers the right answer or is highly confident in its answer.
        if is_fluent and (
            is_correct_answer or confidence > self.confidence_threshold
        ):
            verdict = "✅ Good"
        else:
            verdict = "❌ Bad"

        return {
            "Question": question,
            "Original Answer": original_answer,
            "Predicted Answer": predicted_answer,  # what the QA model found
            "Match": "Yes" if is_correct_answer else "No",
            "Verdict": verdict,
        }
# --- Demo: evaluate a few hand-written samples when run as a script ---
if __name__ == "__main__":
    # Each sample carries the generated question, the passage it was
    # generated from, and the answer it is supposed to elicit.
    samples = [
        {
            "question": "What is a political organization that seeks to attain and maintain political power within government?",
            "original_answer": "A political party",
            "context": "A political party is a political organization that typically seeks to attain and maintain political power within government, usually by participating in political campaigns, educational outreach, or protest actions.",
        },
        {
            "question": "What defines acceptable political methods within a society?",
            "original_answer": "A political system",
            "context": "A political system is a framework which defines acceptable political methods within a society.",
        },
        {
            "question": "What is the branch of social science that studies politics and government called?",
            "original_answer": "political science",
            "context": " The branch of social science that studies politics and government is referred to as political science.",
        },
    ]

    scorer = QGEvaluator()

    print("\n--- Evaluation Report ---")
    for sample in samples:
        report = scorer.evaluate_question(
            question=sample["question"],
            context=sample["context"],
            original_answer=sample["original_answer"],
        )
        print(report)