import json import re from pathlib import Path from dotenv import load_dotenv BASE_DIR = Path(__file__).resolve().parent.parent EVAL_PATH = BASE_DIR / "tests" / "eval_question.json" from app.web import answer_query from core.config import DEFAULT_GROUNDED_FALLBACK_MESSAGE def load_eval_cases(): return json.loads(EVAL_PATH.read_text(encoding="utf-8")) def evaluate_case(case): answer, sources, citations, debug_data = answer_query( case["question"], debug_mode=True, ) answer_verdict, matched_concepts = evaluate_answer(case, answer) grounding_verdict = evaluate_grounding(answer, citations, debug_data) return { "question": case["question"], "expected_behavior": case.get("expected_behavior", "answer"), "answer": answer, "answer_verdict": answer_verdict, "grounding_verdict": grounding_verdict, "matched_concepts": matched_concepts, "total_concepts": len(case.get("expected_concepts", [])), "sources": [source["chunk_id"] for source in sources], "citations": [citation["number"] for citation in citations], "grounding_reason": ((debug_data or {}).get("grounding") or {}).get("reason", "unknown"), "notes": case.get("notes", ""), } def evaluate_answer(case, answer): expected_behavior = case.get("expected_behavior", "answer") answer_text = normalize_text(answer) is_fallback = answer_text == normalize_text(DEFAULT_GROUNDED_FALLBACK_MESSAGE) if expected_behavior == "fallback": return ("correct" if is_fallback else "wrong"), 0 expected_concepts = case.get("expected_concepts", []) forbidden_concepts = case.get("forbidden_concepts", []) if not expected_concepts: return ("partially_correct", 0) for concept_group in forbidden_concepts: if any(normalize_text(phrase) in answer_text for phrase in concept_group): return "wrong", 0 matched_concepts = 0 for concept_group in expected_concepts: if any(normalize_text(phrase) in answer_text for phrase in concept_group): matched_concepts += 1 if matched_concepts == len(expected_concepts): return "correct", matched_concepts if matched_concepts > 0: return "partially_correct", matched_concepts return "wrong", matched_concepts def evaluate_grounding(answer, citations, debug_data): grounding = (debug_data or {}).get("grounding") or {} answer_text = normalize_text(answer) is_fallback = answer_text == normalize_text(DEFAULT_GROUNDED_FALLBACK_MESSAGE) if grounding.get("passed") is True and citations: return "grounded_answer" if is_fallback and grounding.get("passed") is False: return "safe_fallback" if citations: return "partially_grounded" return "not_grounded" def build_summary(results): correct_answers = [result for result in results if result["answer_verdict"] == "correct"] partially_correct_answers = [ result for result in results if result["answer_verdict"] == "partially_correct" ] grounded_answers = [ result for result in results if result["grounding_verdict"] == "grounded_answer" ] safe_fallbacks = [ result for result in results if result["grounding_verdict"] == "safe_fallback" ] partially_grounded_answers = [ result for result in results if result["grounding_verdict"] == "partially_grounded" ] return { "cases": len(results), "correct_answers": len(correct_answers), "partially_correct_answers": len(partially_correct_answers), "grounded_answers": len(grounded_answers), "safe_fallbacks": len(safe_fallbacks), "partially_grounded_answers": len(partially_grounded_answers), } def print_report(summary, results): print("RAG Answer Evaluation Report") print("=" * 32) print(f"Cases: {summary['cases']}") print(f"Correct Answers: {summary['correct_answers']}/{summary['cases']}") print(f"Partially Correct Answers: {summary['partially_correct_answers']}/{summary['cases']}") print(f"Grounded Answers: {summary['grounded_answers']}/{summary['cases']}") print(f"Safe Fallbacks: {summary['safe_fallbacks']}/{summary['cases']}") print( f"Partially Grounded Answers: " f"{summary['partially_grounded_answers']}/{summary['cases']}" ) print() print("Question Results") print("-" * 32) for result in results: print(result["question"]) print(f"Answer Verdict: {result['answer_verdict']}") print(f"Grounding Verdict: {result['grounding_verdict']}") print(f"Grounding Reason: {result['grounding_reason']}") if result["total_concepts"]: print( f"Matched Concepts: {result['matched_concepts']}/{result['total_concepts']}" ) print(f"Cited Sources: {result['citations']}") print(f"Retrieved Chunks: {result['sources']}") print(f"Answer: {result['answer']}") if result["notes"]: print(f"Notes: {result['notes']}") print() def normalize_text(text): return re.sub(r"\s+", " ", text.lower()).strip() def main(): load_dotenv() cases = load_eval_cases() results = [evaluate_case(case) for case in cases] summary = build_summary(results) print_report(summary, results) if __name__ == "__main__": raise SystemExit( "Run this from the project root with `python evaluate_retrieval.py`, not `python tests/evaluate_retrieval.py`." )