Devisri515's picture
fix CI
2b63102
Raw
History Blame Contribute Delete
7.54 kB
"""Offline LLM-as-a-judge evaluation. Grades the agent on two metrics per question:
faithfulness (grounded in the retrieved context?) and accuracy (matches ground truth?).
Run from the project root: python -m tests.evaluate"""
import sys
import os
import re
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
# Put the parent of this file's directory on sys.path so the `src` package
# resolves regardless of how the script was launched.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Load variables from a .env file (run_evaluation reads GOOGLE_API_KEY).
load_dotenv()

try:
    from src.agent import get_agent_executor, file_processor, _fallback_kb
except ImportError as exc:
    # ImportError is also raised when src.agent itself fails to import one of
    # its dependencies, so show the underlying message next to the usage hint
    # instead of only suggesting a wrong working directory.
    print("Run this from the project root: python -m tests.evaluate", file=sys.stderr)
    print(f"(import failed: {exc})", file=sys.stderr)
    sys.exit(1)
# Judge prompt for the faithfulness metric. It is filled with str.format using
# the `question`, `context` and `answer` fields, and asks the judge for a 1-10
# score plus a one-sentence reason. The "Score:" / "Reason:" labels matter:
# parse_score() searches for "Score:" and run_evaluation() splits on "Reason:".
FAITHFULNESS_PROMPT = """\
You are evaluating whether an AI answer is grounded in the provided source context.
Question: {question}
Retrieved Context: {context}
AI Answer: {answer}
Does the answer contain claims NOT supported by the retrieved context?
A faithful answer only uses information present in the context.
A hallucinated answer invents facts or adds information not in the context.
Score 1-10 where:
9-10 = fully grounded, no unsupported claims
6-8 = mostly grounded, minor additions
3-5 = several unsupported claims
1-2 = mostly fabricated
Format:
Score: [1-10]
Reason: [one sentence]"""
# Judge prompt for the accuracy metric. It is filled with str.format using the
# `question`, `ground_truth` and `answer` fields, and asks the judge for a 1-10
# score plus a one-sentence reason. The "Score:" / "Reason:" labels matter:
# parse_score() searches for "Score:" and run_evaluation() splits on "Reason:".
ACCURACY_PROMPT = """\
You are a strict teacher grading a student's answer.
Question: {question}
Ground Truth: {ground_truth}
Student Answer: {answer}
On a scale of 1-10, how accurate and complete is the student's answer compared to the ground truth?
- 9-10: matches ground truth meaning, may add correct extra details
- 6-8: partially correct, missing some key points
- 3-5: relevant but significantly incomplete or partially wrong
- 1-2: incorrect or completely off-topic
Format:
Score: [1-10]
Reason: [one sentence]"""
def extract_content(message) -> str:
    """Return the textual content of a chat message as a single string.

    Args:
        message: Any object exposing a ``content`` attribute. The attribute
            may be a plain value (returned via ``str``) or a list of blocks.

    Returns:
        For list content, the space-joined text of every block: dict blocks
        contribute their ``"text"`` value only when their ``"type"`` is
        ``"text"`` (other dict blocks are skipped); non-dict blocks are
        converted with ``str``. A text block without a ``"text"`` key
        contributes an empty string instead of raising ``KeyError``.
    """
    content = message.content
    if not isinstance(content, list):
        return str(content)

    parts = []
    for block in content:
        if not isinstance(block, dict):
            parts.append(str(block))
        elif block.get("type") == "text":
            # .get + str(): tolerate a missing or non-string "text" payload.
            parts.append(str(block.get("text", "")))
    return " ".join(parts)
def parse_score(text: str) -> int:
    """Extract the judge's 1-10 score from its reply.

    The label match is case-insensitive and tolerates markdown emphasis and
    square brackets between the colon and the number, so ``Score: 8``,
    ``Score: [8]``, ``**Score:** 8`` and ``score: 8/10`` all parse as 8.

    Args:
        text: Raw text returned by the judge model.

    Returns:
        The score as an int in the range 1-10, or 0 when no score is found
        or the number is outside the rubric's 1-10 scale.
    """
    # After the colon, skip any run of whitespace, '*' or '[' before the digits.
    match = re.search(r"score\s*:[\s*\[]*(\d+)", text, flags=re.IGNORECASE)
    if match is None:
        return 0
    score = int(match.group(1))
    # The prompts define a 1-10 scale; anything else is not a usable grade.
    return score if 1 <= score <= 10 else 0
def get_context(question: str) -> str:
    """Fetch the retrieval context used to judge faithfulness for *question*.

    ``file_processor`` is queried first, but only when it reports having
    documents. If it has none, or its lookup comes back empty, the result of
    ``_fallback_kb.retrieve`` is returned instead. This is meant to mirror the
    context the agent sees for RAG questions.
    """
    from_documents = (
        file_processor.retrieve(question) if file_processor.has_documents() else None
    )
    # A falsy document hit (or no documents at all) defers to the fallback KB.
    return from_documents or _fallback_kb.retrieve(question)
# Default evaluation set, used by run_evaluation() when no cases are passed in.
# Add your own Q&A pairs here. Each entry is a dict with these keys:
#   "question"     - text sent to the agent (required).
#   "ground_truth" - reference answer for the accuracy metric. Leave it as None
#                    for web questions (accuracy is then skipped, since there is
#                    nothing to compare against).
#   "source"       - "rag" (also the default when the key is omitted) enables
#                    the faithfulness check; any other value skips it.
TEST_CASES = [
    {
        "question": "What are the reporting requirements for State Parties?",
        "ground_truth": (
            "State Parties must submit a comprehensive report initially, followed by further "
            "information included in reports to the Committee on the Rights of the Child. "
            "Other State Parties need to submit reports every five years."
        ),
        "source": "rag",
    },
    {
        "question": "What happens if a State Party denounces the Protocol?",
        "ground_truth": (
            "Denunciation does not affect acts or situations occurring before the denunciation "
            "becomes effective. It also does not prejudice the continued consideration of matters "
            "already under consideration."
        ),
        "source": "rag",
    },
]
def _judge_verdict(judge, prompt: str) -> tuple:
    """Send *prompt* to the judge model and return ``(score, reason)``.

    The score comes from ``parse_score``; the reason is the text after the
    last ``Reason:`` marker, or the whole reply when the marker is absent.
    """
    verdict = extract_content(judge.invoke(prompt))
    return parse_score(verdict), verdict.split("Reason:")[-1].strip()


def _mean_score(results: list, column: str):
    """Average the usable scores stored under *column*, or None if there are none.

    Non-integer placeholders ("-") are skipped, and so is 0: the rubric is
    1-10 and parse_score returns 0 when it could not read a score, so a zero
    is a parse failure rather than a grade.
    """
    scores = [
        row[column]
        for row in results
        if isinstance(row[column], int) and row[column] > 0
    ]
    return sum(scores) / len(scores) if scores else None


def run_evaluation(test_cases: list = None, report_path: str = "evaluation_report.csv"):
    """Run every test case through the agent, grade the answers and save a CSV report.

    For each case the agent answers the question; a judge model then grades
    faithfulness (only when the case's ``source`` is ``"rag"``) and accuracy
    (only when a ``ground_truth`` is given). Progress and a final summary of
    average scores are printed to stdout.

    Args:
        test_cases: List of dicts with a ``"question"`` key and optional
            ``"ground_truth"`` / ``"source"`` keys. ``None`` (the default)
            uses the module-level ``TEST_CASES``; an empty list runs nothing.
        report_path: Where the CSV report is written.

    Returns:
        None. The function returns early, without writing a report, when
        GOOGLE_API_KEY is unset, the agent cannot be initialised, or there
        are no results.
    """
    # `is None`, not truthiness: an explicitly empty list must not silently
    # fall back to the built-in cases.
    cases = TEST_CASES if test_cases is None else test_cases
    print(f"Starting evaluation ({len(cases)} test case(s))\n")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("Set GOOGLE_API_KEY in your .env to run the offline evaluation.")
        return

    judge = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0, google_api_key=api_key)
    try:
        agent = get_agent_executor(api_key)
    except Exception as e:
        print(f"Could not initialize agent: {e}")
        return

    results = []
    for i, case in enumerate(cases, 1):
        question = case["question"]
        ground_truth = case.get("ground_truth")
        source = case.get("source", "rag")
        print(f"[{i}/{len(cases)}] {question}")

        try:
            result = agent.invoke({"messages": [("user", question)]})
            answer = extract_content(result["messages"][-1])
        except Exception as e:
            # Record the failure as a row so the report still lists the case.
            print(f" Agent error: {e}\n")
            results.append({
                "Question": question,
                "Answer": f"ERROR: {e}",
                "Ground Truth": ground_truth or "",
                "Faithfulness Score": "-",
                "Faithfulness Reason": str(e),
                "Accuracy Score": "-",
                "Accuracy Reason": str(e),
            })
            continue
        print(f" Answer: {answer[:120]}...")

        # Faithfulness only applies to document-grounded answers
        faithfulness_score, faithfulness_reason = "-", "N/A (web search question)"
        if source == "rag":
            try:
                context = get_context(question)
                faithfulness_score, faithfulness_reason = _judge_verdict(
                    judge,
                    FAITHFULNESS_PROMPT.format(question=question, context=context, answer=answer),
                )
                print(f" Faithfulness: {faithfulness_score}/10")
            except Exception as e:
                # Keep the "-" placeholder score; surface the error as the reason.
                faithfulness_reason = str(e)
                print(f" Faithfulness check failed: {e}")

        # Accuracy only when a ground truth is provided
        accuracy_score, accuracy_reason = "-", "N/A (no ground truth)"
        if ground_truth:
            try:
                accuracy_score, accuracy_reason = _judge_verdict(
                    judge,
                    ACCURACY_PROMPT.format(question=question, ground_truth=ground_truth, answer=answer),
                )
                print(f" Accuracy: {accuracy_score}/10")
            except Exception as e:
                accuracy_reason = str(e)
                print(f" Accuracy check failed: {e}")

        results.append({
            "Question": question,
            "Answer": answer,
            "Ground Truth": ground_truth or "",
            "Faithfulness Score": faithfulness_score,
            "Faithfulness Reason": faithfulness_reason,
            "Accuracy Score": accuracy_score,
            "Accuracy Reason": accuracy_reason,
        })
        print()

    # Save report
    if not results:
        print("No results to save.")
        return
    pd.DataFrame(results).to_csv(report_path, index=False)

    # Print summary
    print("=" * 50)
    avg_faithfulness = _mean_score(results, "Faithfulness Score")
    avg_accuracy = _mean_score(results, "Accuracy Score")
    if avg_faithfulness is not None:
        print(f"Avg Faithfulness (hallucination): {avg_faithfulness:.1f}/10")
    if avg_accuracy is not None:
        print(f"Avg Accuracy: {avg_accuracy:.1f}/10")
    print(f"\nReport saved to {report_path}")
if __name__ == "__main__":
    # Script entry point: evaluate the built-in TEST_CASES with default settings.
    run_evaluation()