# Neural-Assessment-Generator / Question_Eval4.py
# sangyan5's picture
# Upload 11 files
# 7312afb verified
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, pipeline
import textstat
import math
class QGEvaluator:
    """Score generated questions for fluency and answerability.

    Fluency is the GPT-2 perplexity of the question text. Answerability is
    checked by running an extractive question-answering pipeline on the
    question/context pair and comparing its answer with the answer the
    question was generated from.
    """

    # Class-level defaults for the decision thresholds; ``__init__`` may
    # override them per instance.
    ppl_threshold = 100.0
    confidence_threshold = 0.5

    def __init__(self, device="cpu", ppl_threshold=100.0, confidence_threshold=0.5):
        """Load the evaluation models.

        Args:
            device: Device both models are placed on (defaults to ``"cpu"``).
            ppl_threshold: A question counts as fluent when its perplexity is
                strictly below this value.
            confidence_threshold: QA score strictly above which a fluent
                question is accepted even when the answers do not match.
        """
        print("Loading Evaluation Models...")
        self.device = device
        self.ppl_threshold = ppl_threshold
        self.confidence_threshold = confidence_threshold
        # 1. Fluency Model
        self.ppl_model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
        self.ppl_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        # 2. Answerability Model (pinned to the same device as the fluency model)
        self.qa_pipeline = pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            device=self.device,
        )

    @staticmethod
    def _answers_match(expected, predicted):
        """Return True when one non-empty answer contains the other (case-insensitive)."""
        expected = (expected or "").strip().lower()
        predicted = (predicted or "").strip().lower()
        # An empty string is a substring of everything, so it must never
        # count as a match on its own.
        if not expected or not predicted:
            return False
        return expected in predicted or predicted in expected

    def calculate_perplexity(self, text):
        """Metric 1: Fluency (lower is better).

        Args:
            text: The text to score.

        Returns:
            ``exp(loss)`` of ``text`` under the fluency model, or
            ``float("inf")`` when the text is empty, tokenizes to fewer than
            two tokens, or produces a loss that is NaN or too large to
            exponentiate.
        """
        if not text or not text.strip():
            return float("inf")

        # truncation=True asks the tokenizer to cut over-long text to its
        # configured maximum length rather than passing it through whole.
        encodings = self.ppl_tokenizer(text, return_tensors="pt", truncation=True)
        input_ids = encodings.input_ids.to(self.device)

        # The language-model loss predicts each token from the previous ones,
        # so at least two tokens are required for it to be defined.
        if input_ids.shape[-1] < 2:
            return float("inf")

        with torch.no_grad():
            outputs = self.ppl_model(input_ids, labels=input_ids)

        loss = outputs.loss.item()
        if math.isnan(loss):
            return float("inf")
        try:
            return math.exp(loss)
        except OverflowError:
            return float("inf")

    def evaluate_question(self, question, context, original_answer):
        """Run the fluency and answerability checks for one generated question.

        Args:
            question: The generated question text.
            context: The passage the question was generated from.
            original_answer: The answer the question is supposed to elicit.

        Returns:
            A dict with the keys ``"Question"``, ``"Original Answer"``,
            ``"Predicted Answer"``, ``"Match"`` (``"Yes"``/``"No"``) and
            ``"Verdict"`` (``"✅ Good"``/``"❌ Bad"``).
        """
        # --- 1. Fluency Check ---
        ppl = self.calculate_perplexity(question)
        is_fluent = ppl < self.ppl_threshold

        # --- 2. Answerability Check ---
        if question and question.strip() and context and context.strip():
            qa_result = self.qa_pipeline(question=question, context=context)
            predicted_answer = qa_result['answer']
            confidence = qa_result['score']
        else:
            # A blank question or context cannot be answered; skip the QA model.
            predicted_answer, confidence = "", 0.0

        is_correct_answer = self._answers_match(original_answer, predicted_answer)

        # Verdict Logic
        # It is GOOD if: (It is Fluent) AND (It gets the Right Answer OR High Confidence)
        if is_fluent and (is_correct_answer or confidence > self.confidence_threshold):
            verdict = "✅ Good"
        else:
            verdict = "❌ Bad"

        return {
            "Question": question,
            "Original Answer": original_answer,
            "Predicted Answer": predicted_answer,  # What the QA model extracted
            "Match": "Yes" if is_correct_answer else "No",
            "Verdict": verdict,
        }
# --- Usage Example ---
if __name__ == "__main__":
    # Each sample's keys mirror the parameter names of
    # QGEvaluator.evaluate_question, so a sample can be unpacked directly
    # into the call as keyword arguments.
    samples = (
        {
            "context": "A political party is a political organization that typically seeks to attain and maintain political power within government, usually by participating in political campaigns, educational outreach, or protest actions.",
            "original_answer": "A political party",
            "question": "What is a political organization that seeks to attain and maintain political power within government?",
        },
        {
            "context": "A political system is a framework which defines acceptable political methods within a society.",
            "original_answer": "A political system",
            "question": "What defines acceptable political methods within a society?",
        },
        {
            "context": " The branch of social science that studies politics and government is referred to as political science.",
            "original_answer": "political science",
            "question": "What is the branch of social science that studies politics and government called?",
        },
    )

    evaluator = QGEvaluator()

    print("\n--- Evaluation Report ---")
    for sample in samples:
        # One result dict is printed per sample, in order.
        print(evaluator.evaluate_question(**sample))