| """Offline LLM-as-a-judge evaluation. Grades the agent on two metrics per question: |
| faithfulness (grounded in the retrieved context?) and accuracy (matches ground truth?). |
| Run from the project root: python -m tests.evaluate""" |
|
|
| import sys |
| import os |
| import re |
| import pandas as pd |
| from langchain_google_genai import ChatGoogleGenerativeAI |
| from dotenv import load_dotenv |
|
|
# Put the project root (the parent of this file's directory) at the front of
# sys.path so that the ``src`` package can be imported below.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Load variables from a .env file into the environment; run_evaluation() reads
# GOOGLE_API_KEY from there.
load_dotenv()


try:
    from src.agent import get_agent_executor, file_processor, _fallback_kb
except ImportError as exc:
    print("Run this from the project root: python -m tests.evaluate")
    # Also report the underlying cause: a missing dependency inside src.agent
    # raises ImportError too, and the hint above alone would be misleading.
    print(f"Import error: {exc}", file=sys.stderr)
    sys.exit(1)
|
|
|
|
# Judge prompt for the faithfulness metric. It is filled in with str.format()
# using the {question}, {context} and {answer} placeholders, and asks the judge
# to reply with a "Score:" line followed by a "Reason:" line.
FAITHFULNESS_PROMPT = """\
You are evaluating whether an AI answer is grounded in the provided source context.

Question: {question}
Retrieved Context: {context}
AI Answer: {answer}

Does the answer contain claims NOT supported by the retrieved context?
A faithful answer only uses information present in the context.
A hallucinated answer invents facts or adds information not in the context.

Score 1-10 where:
9-10 = fully grounded, no unsupported claims
6-8 = mostly grounded, minor additions
3-5 = several unsupported claims
1-2 = mostly fabricated

Format:
Score: [1-10]
Reason: [one sentence]"""
|
|
# Judge prompt for the accuracy metric. It is filled in with str.format() using
# the {question}, {ground_truth} and {answer} placeholders, and asks the judge
# to reply with a "Score:" line followed by a "Reason:" line.
ACCURACY_PROMPT = """\
You are a strict teacher grading a student's answer.

Question: {question}
Ground Truth: {ground_truth}
Student Answer: {answer}

On a scale of 1-10, how accurate and complete is the student's answer compared to the ground truth?
- 9-10: matches ground truth meaning, may add correct extra details
- 6-8: partially correct, missing some key points
- 3-5: relevant but significantly incomplete or partially wrong
- 1-2: incorrect or completely off-topic

Format:
Score: [1-10]
Reason: [one sentence]"""
|
|
|
|
def extract_content(message) -> str:
    """Return the content of *message* as one string.

    ``message.content`` is either a single value, which is stringified as-is,
    or a list of blocks. In a list, a dict block contributes its ``"text"``
    value only when its ``"type"`` is ``"text"``; other dict blocks are
    dropped. Every non-dict block is stringified. The collected pieces are
    joined with single spaces.
    """
    payload = message.content
    if not isinstance(payload, list):
        return str(payload)

    pieces = []
    for part in payload:
        if not isinstance(part, dict):
            pieces.append(str(part))
        elif part.get("type") == "text":
            pieces.append(part["text"])
    return " ".join(pieces)
|
|
|
|
# Case-insensitive "Score:" followed by optional decoration before the digits.
# Judges frequently echo the prompt's template brackets ("Score: [8]") or wrap
# the label in markdown emphasis ("**Score:** 8"); both must still parse.
_SCORE_PATTERN = re.compile(r"score\s*:[\s\[(*_`]*(\d+)", re.IGNORECASE)


def parse_score(text: str) -> int:
    """Extract the judge's 1-10 score from *text*.

    Args:
        text: Raw judge reply, expected to contain a line such as ``Score: 8``.

    Returns:
        The first score found, as an int between 1 and 10. Returns 0 when no
        score is present or the number lies outside the 1-10 scale, so 0 always
        means "no valid score".
    """
    match = _SCORE_PATTERN.search(text)
    if match is None:
        return 0
    score = int(match.group(1))
    return score if 1 <= score <= 10 else 0
|
|
|
|
def get_context(question: str) -> str:
    """Return retrieval context for *question*.

    The result from ``file_processor`` is used when it reports having
    documents and its retrieval is non-empty; in every other case the query is
    sent to ``_fallback_kb``. Intended to mirror what the agent retrieves for
    RAG questions.
    """
    uploaded_hit = file_processor.retrieve(question) if file_processor.has_documents() else None
    if uploaded_hit:
        return uploaded_hit
    return _fallback_kb.retrieve(question)
|
|
|
|
| |
| |
# Default evaluation set. Each entry holds the question put to the agent, the
# reference answer given to the accuracy judge, and a "source" tag
# (faithfulness is graded only for entries tagged "rag").
TEST_CASES = [
    {
        "question": "What are the reporting requirements for State Parties?",
        "ground_truth": (
            "State Parties must submit a comprehensive report initially, "
            "followed by further information included in reports to the "
            "Committee on the Rights of the Child. "
            "Other State Parties need to submit reports every five years."
        ),
        "source": "rag",
    },
    {
        "question": "What happens if a State Party denounces the Protocol?",
        "ground_truth": (
            "Denunciation does not affect acts or situations occurring before "
            "the denunciation becomes effective. "
            "It also does not prejudice the continued consideration of "
            "matters already under consideration."
        ),
        "source": "rag",
    },
]
|
|
|
|
def _grade(judge, prompt: str) -> tuple:
    """Send *prompt* to *judge* and return ``(score, reason)``.

    ``score`` is an int in 1-10, or ``"-"`` when the reply carries no score in
    that range (``parse_score`` reports a failed parse as 0). Keeping such
    replies non-numeric stops a parse failure from being averaged in as a
    real grade.
    """
    reply = extract_content(judge.invoke(prompt))
    score = parse_score(reply)
    if not 1 <= score <= 10:
        return "-", f"Unparseable judge reply: {reply.strip()}"
    return score, reply.split("Reason:")[-1].strip()


def run_evaluation(test_cases: list = None):
    """Run the agent on each test case, grade the answers, and save a report.

    Args:
        test_cases: Dicts with a "question" key and optional "ground_truth"
            and "source" keys ("source" defaults to "rag"). ``None`` selects
            ``TEST_CASES``; an empty list evaluates nothing.

    Returns:
        None. Progress and average scores are printed, and one row per
        question is written to ``evaluation_report.csv`` in the current
        working directory. Returns early, without writing a report, when
        GOOGLE_API_KEY is unset, the agent cannot be built, or there are no
        results.
    """
    # ``is None`` rather than truthiness: an explicitly passed empty list
    # means "evaluate nothing", not "fall back to the defaults".
    cases = TEST_CASES if test_cases is None else test_cases
    print(f"Starting evaluation ({len(cases)} test case(s))\n")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("Set GOOGLE_API_KEY in your .env to run the offline evaluation.")
        return

    # temperature=0 keeps the judge's grading as repeatable as possible.
    judge = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0, google_api_key=api_key)

    try:
        agent = get_agent_executor(api_key)
    except Exception as e:
        print(f"Could not initialize agent: {e}")
        return

    results = []

    for i, case in enumerate(cases, 1):
        question = case["question"]
        ground_truth = case.get("ground_truth")
        source = case.get("source", "rag")

        print(f"[{i}/{len(cases)}] {question}")

        try:
            result = agent.invoke({"messages": [("user", question)]})
            answer = extract_content(result["messages"][-1])
        except Exception as e:
            # Record the failure as a row so the report still lists the question.
            print(f" Agent error: {e}\n")
            results.append({"Question": question, "Answer": f"ERROR: {e}",
                            "Ground Truth": ground_truth or "",
                            "Faithfulness Score": "-", "Faithfulness Reason": str(e),
                            "Accuracy Score": "-", "Accuracy Reason": str(e)})
            continue

        print(f" Answer: {answer[:120]}...")

        # Faithfulness is only meaningful when the answer should come from
        # retrieved documents.
        faithfulness_score, faithfulness_reason = "-", "N/A (web search question)"
        if source == "rag":
            try:
                context = get_context(question)
                faithfulness_score, faithfulness_reason = _grade(
                    judge,
                    FAITHFULNESS_PROMPT.format(question=question, context=context, answer=answer),
                )
                if faithfulness_score == "-":
                    print(" Faithfulness: no parseable score")
                else:
                    print(f" Faithfulness: {faithfulness_score}/10")
            except Exception as e:
                faithfulness_reason = str(e)
                print(f" Faithfulness check failed: {e}")

        # Accuracy needs a reference answer to compare against.
        accuracy_score, accuracy_reason = "-", "N/A (no ground truth)"
        if ground_truth:
            try:
                accuracy_score, accuracy_reason = _grade(
                    judge,
                    ACCURACY_PROMPT.format(question=question, ground_truth=ground_truth, answer=answer),
                )
                if accuracy_score == "-":
                    print(" Accuracy: no parseable score")
                else:
                    print(f" Accuracy: {accuracy_score}/10")
            except Exception as e:
                accuracy_reason = str(e)
                print(f" Accuracy check failed: {e}")

        results.append({
            "Question": question,
            "Answer": answer,
            "Ground Truth": ground_truth or "",
            "Faithfulness Score": faithfulness_score,
            "Faithfulness Reason": faithfulness_reason,
            "Accuracy Score": accuracy_score,
            "Accuracy Reason": accuracy_reason,
        })
        print()

    if not results:
        print("No results to save.")
        return

    df = pd.DataFrame(results)
    df.to_csv("evaluation_report.csv", index=False)

    print("=" * 50)
    # Only real integer grades are averaged; "-" placeholders are skipped.
    numeric_faith = [r["Faithfulness Score"] for r in results if isinstance(r["Faithfulness Score"], int)]
    numeric_acc = [r["Accuracy Score"] for r in results if isinstance(r["Accuracy Score"], int)]
    if numeric_faith:
        print(f"Avg Faithfulness (hallucination): {sum(numeric_faith)/len(numeric_faith):.1f}/10")
    if numeric_acc:
        print(f"Avg Accuracy: {sum(numeric_acc)/len(numeric_acc):.1f}/10")
    print("\nReport saved to evaluation_report.csv")
|
|
|
|
# Script entry point: evaluate the default TEST_CASES when this module is
# executed directly (e.g. ``python -m tests.evaluate``).
if __name__ == "__main__":
    run_evaluation()
|
|