mathstutor / app /validation /answer_checker.py
ghadgemadhuri92's picture
agent tested with the prompt: Calculate 15 * 12 then add 50.
565a379
from typing import Any, Dict, List, Optional, Tuple
class AnswerValidator:
    """
    Validates AI-generated answers against quality and safety standards.

    A response is a dictionary that must carry every key listed in
    ``required_fields``. Beyond presence, the validator rejects blank
    answers/reasoning, blank LaTeX for math problems, and confidence scores
    that are non-numeric, NaN, or below ``confidence_threshold``.
    """

    def __init__(self, confidence_threshold: float = 0.5):
        """
        Initialize the AnswerValidator.

        Args:
            confidence_threshold: Minimum confidence score required. Defaults to 0.5.
        """
        self.confidence_threshold = confidence_threshold
        self.required_fields = ["latex", "reasoning", "final_answer", "confidence_score"]

    @staticmethod
    def _is_blank(value: Any) -> bool:
        """Return True when value is falsy or only whitespace once stringified."""
        return not value or str(value).strip() == ""

    def validate(self, response: Dict[str, Any], is_math_problem: bool = True) -> Tuple[bool, List[str]]:
        """
        Validates the AI response.

        Args:
            response: The JSON response dictionary from the AI.
            is_math_problem: Whether the input was identified as a math problem.
                If True, checks for LaTeX content.

        Returns:
            Tuple[bool, List[str]]: (IsValid, List of error reasons)
        """
        # Imported locally so the block does not depend on file-level imports.
        import math

        # 1. Check required fields. Stop here when any are missing: the
        #    remaining checks would only add noise for an incomplete response.
        errors: List[str] = [
            f"Missing required field: {field}"
            for field in self.required_fields
            if field not in response
        ]
        if errors:
            return False, errors

        # 2. Check for hallucinated/empty content; models sometimes "succeed"
        #    but return empty strings.
        answer = response.get("final_answer")
        # A numeric answer such as 0 or 0.0 is falsy but perfectly valid, so
        # real numbers are exempt from the blank check (bool is excluded
        # because it is not a meaningful numeric answer).
        is_numeric_answer = isinstance(answer, (int, float)) and not isinstance(answer, bool)
        if not is_numeric_answer and self._is_blank(answer):
            errors.append("Final answer is empty.")

        if self._is_blank(response.get("reasoning")):
            errors.append("Reasoning is empty.")

        # 3. Verify LaTeX presence for math problems. Only non-emptiness is
        #    enforced.
        # NOTE: a stricter heuristic (requiring "\\" or "$" in the content)
        # was considered but is intentionally not applied.
        if is_math_problem and self._is_blank(response.get("latex", "")):
            errors.append("LaTeX content is missing for a math problem.")

        # 4. Confidence threshold check
        try:
            score = float(response.get("confidence_score", 0.0))
        except (ValueError, TypeError):
            errors.append("Invalid confidence score format.")
        else:
            if math.isnan(score):
                # NaN compares False against everything, so it would slip
                # past the threshold comparison below; reject it explicitly.
                errors.append("Invalid confidence score format.")
            elif score < self.confidence_threshold:
                errors.append(
                    f"Confidence score {score} is below threshold {self.confidence_threshold}."
                )

        return len(errors) == 0, errors