Spaces:

Devisri515
/

Agentic_RAG_Knowledge_Search

Sleeping

App Files Files Community

Agentic_RAG_Knowledge_Search / tests /evaluate.py

Devisri515

fix CI

2b63102 about 1 month ago

Raw

History Blame Contribute Delete

7.54 kB

	"""Offline LLM-as-a-judge evaluation. Grades the agent on two metrics per question:
	faithfulness (grounded in the retrieved context?) and accuracy (matches ground truth?).
	Run from the project root: python -m tests.evaluate"""

	import sys
	import os
	import re
	import pandas as pd
	from langchain_google_genai import ChatGoogleGenerativeAI
	from dotenv import load_dotenv

	# Put the parent of this file's directory (the project root) first on sys.path
	# so the `src` package resolves when this module is executed.
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	# Load environment variables (e.g. GOOGLE_API_KEY) from a .env file.
	load_dotenv()

	try:
	    from src.agent import get_agent_executor, file_processor, _fallback_kb
	except ImportError as exc:
	    # An ImportError can also come from a missing dependency inside src.agent,
	    # so report the real cause instead of only the working-directory hint.
	    print(f"Could not import src.agent: {exc}")
	    print("Run this from the project root: python -m tests.evaluate")
	    sys.exit(1)


	# Judge prompt for the faithfulness metric. It is filled in with str.format
	# using the fields {question}, {context} and {answer}. The requested
	# "Score:" / "Reason:" reply layout is what the score and reason parsing
	# in this file looks for.
	FAITHFULNESS_PROMPT = """\
	You are evaluating whether an AI answer is grounded in the provided source context.

	Question: {question}
	Retrieved Context: {context}
	AI Answer: {answer}

	Does the answer contain claims NOT supported by the retrieved context?
	A faithful answer only uses information present in the context.
	A hallucinated answer invents facts or adds information not in the context.

	Score 1-10 where:
	9-10 = fully grounded, no unsupported claims
	6-8 = mostly grounded, minor additions
	3-5 = several unsupported claims
	1-2 = mostly fabricated

	Format:
	Score: [1-10]
	Reason: [one sentence]"""

	# Judge prompt for the accuracy metric. It is filled in with str.format
	# using the fields {question}, {ground_truth} and {answer}, and asks for
	# the same "Score:" / "Reason:" reply layout as the faithfulness prompt.
	ACCURACY_PROMPT = """\
	You are a strict teacher grading a student's answer.

	Question: {question}
	Ground Truth: {ground_truth}
	Student Answer: {answer}

	On a scale of 1-10, how accurate and complete is the student's answer compared to the ground truth?
	- 9-10: matches ground truth meaning, may add correct extra details
	- 6-8: partially correct, missing some key points
	- 3-5: relevant but significantly incomplete or partially wrong
	- 1-2: incorrect or completely off-topic

	Format:
	Score: [1-10]
	Reason: [one sentence]"""


	def extract_content(message) -> str:
	    """Return the textual content of a chat message as one string.

	    ``message.content`` is either a single value, which is passed through
	    ``str``, or a list of blocks. For a list, dict blocks whose ``"type"``
	    is ``"text"`` contribute their ``"text"`` value, other dict blocks are
	    dropped, non-dict blocks are stringified, and the pieces are joined
	    with single spaces.
	    """
	    payload = message.content
	    if not isinstance(payload, list):
	        return str(payload)

	    pieces = []
	    for item in payload:
	        if not isinstance(item, dict):
	            pieces.append(str(item))
	        elif item.get("type") == "text":
	            pieces.append(item["text"])
	    return " ".join(pieces)


	def parse_score(text: str) -> int:
	    """Extract the integer that follows the ``Score:`` label in a judge reply.

	    The label is matched case-insensitively, and markdown emphasis or an
	    opening bracket around the label or the number is tolerated, so
	    replies such as ``Score: 8``, ``Score: [8]``, ``**Score:** 8`` and
	    ``score: 8/10`` all yield 8.

	    Args:
	        text: Raw text returned by the judge model.

	    Returns:
	        The first score found, or 0 when no ``Score:`` label followed by
	        digits is present.
	    """
	    # `\**` absorbs emphasis between the label and the colon ("**Score**: 8");
	    # the character class absorbs "[", "(", "*", "_" and whitespace that may
	    # precede the digits ("Score: [8]", "**Score:** 8").
	    match = re.search(r"score\**\s*:\s*[\[(*_\s]*(\d+)", text, re.IGNORECASE)
	    return int(match.group(1)) if match else 0


	def get_context(question: str) -> str:
	    """Fetch the retrieval context to grade an answer against.

	    Uses ``file_processor.retrieve`` when ``file_processor`` reports having
	    documents and the lookup returns something non-empty; in every other
	    case the result of ``_fallback_kb.retrieve`` is returned. This is meant
	    to mirror the context the agent itself sees for RAG questions.
	    """
	    uploaded = file_processor.retrieve(question) if file_processor.has_documents() else None
	    return uploaded if uploaded else _fallback_kb.retrieve(question)


	# Add your own Q&A pairs here. Each case is a dict with these keys:
	#   "question"     - the prompt sent to the agent (required).
	#   "ground_truth" - reference answer for the accuracy metric. Leave it as
	#                    None (or omit it) for web questions; accuracy is then
	#                    skipped, since there is nothing to compare against.
	#   "source"       - "rag" (also the default when the key is missing) turns
	#                    on the faithfulness check; any other value skips it.
	TEST_CASES = [
	    {
	        "question": "What are the reporting requirements for State Parties?",
	        "ground_truth": (
	            "State Parties must submit a comprehensive report initially, followed by further "
	            "information included in reports to the Committee on the Rights of the Child. "
	            "Other State Parties need to submit reports every five years."
	        ),
	        "source": "rag",
	    },
	    {
	        "question": "What happens if a State Party denounces the Protocol?",
	        "ground_truth": (
	            "Denunciation does not affect acts or situations occurring before the denunciation "
	            "becomes effective. It also does not prejudice the continued consideration of matters "
	            "already under consideration."
	        ),
	        "source": "rag",
	    },
	]


	def run_evaluation(test_cases: list = None, output_path: str = "evaluation_report.csv"):
	    """Run each test case through the agent, grade it with a judge LLM, and save a CSV report.

	    For every case the agent's final message is taken as the answer. Cases
	    whose ``source`` is ``"rag"`` get a faithfulness score against the
	    retrieved context; cases with a ``ground_truth`` get an accuracy score.
	    A metric that is skipped or fails is recorded as ``"-"`` with the reason
	    in the matching "Reason" column, and is left out of the printed averages.

	    Args:
	        test_cases: Case dicts (keys ``question``, optional ``ground_truth``
	            and ``source``). ``None`` or an empty list falls back to
	            ``TEST_CASES``.
	        output_path: Where the CSV report is written.

	    Returns:
	        None. Progress and a summary are printed; the function returns early
	        when ``GOOGLE_API_KEY`` is unset or the agent cannot be created.
	    """
	    cases = test_cases or TEST_CASES
	    print(f"Starting evaluation ({len(cases)} test case(s))\n")

	    api_key = os.getenv("GOOGLE_API_KEY")
	    if not api_key:
	        print("Set GOOGLE_API_KEY in your .env to run the offline evaluation.")
	        return

	    judge = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0, google_api_key=api_key)

	    try:
	        agent = get_agent_executor(api_key)
	    except Exception as e:
	        print(f"Could not initialize agent: {e}")
	        return

	    results = []

	    for i, case in enumerate(cases, 1):
	        question = case["question"]
	        ground_truth = case.get("ground_truth")
	        source = case.get("source", "rag")

	        print(f"[{i}/{len(cases)}] {question}")

	        try:
	            result = agent.invoke({"messages": [("user", question)]})
	            answer = extract_content(result["messages"][-1])
	        except Exception as e:
	            # Keep the failed case in the report so it is visible, then move on.
	            print(f" Agent error: {e}\n")
	            results.append({"Question": question, "Answer": f"ERROR: {e}",
	                            "Ground Truth": ground_truth or "",
	                            "Faithfulness Score": "-", "Faithfulness Reason": str(e),
	                            "Accuracy Score": "-", "Accuracy Reason": str(e)})
	            continue

	        print(f" Answer: {answer[:120]}...")

	        # Faithfulness only applies to document-grounded answers
	        faithfulness_score, faithfulness_reason = "-", "N/A (web search question)"
	        if source == "rag":
	            try:
	                context = get_context(question)
	                faithfulness_score, faithfulness_reason = _judge_answer(
	                    judge,
	                    FAITHFULNESS_PROMPT.format(question=question, context=context, answer=answer),
	                )
	                print(f" Faithfulness: {faithfulness_score}/10")
	            except Exception as e:
	                faithfulness_reason = str(e)
	                print(f" Faithfulness check failed: {e}")

	        # Accuracy only when a ground truth is provided
	        accuracy_score, accuracy_reason = "-", "N/A (no ground truth)"
	        if ground_truth:
	            try:
	                accuracy_score, accuracy_reason = _judge_answer(
	                    judge,
	                    ACCURACY_PROMPT.format(question=question, ground_truth=ground_truth, answer=answer),
	                )
	                print(f" Accuracy: {accuracy_score}/10")
	            except Exception as e:
	                accuracy_reason = str(e)
	                print(f" Accuracy check failed: {e}")

	        results.append({
	            "Question": question,
	            "Answer": answer,
	            "Ground Truth": ground_truth or "",
	            "Faithfulness Score": faithfulness_score,
	            "Faithfulness Reason": faithfulness_reason,
	            "Accuracy Score": accuracy_score,
	            "Accuracy Reason": accuracy_reason,
	        })
	        print()

	    # Save report
	    if not results:
	        print("No results to save.")
	        return

	    df = pd.DataFrame(results)
	    df.to_csv(output_path, index=False)

	    # Print summary; only integer scores count, so "-" placeholders are ignored.
	    print("=" * 50)
	    numeric_faith = [r["Faithfulness Score"] for r in results if isinstance(r["Faithfulness Score"], int)]
	    numeric_acc = [r["Accuracy Score"] for r in results if isinstance(r["Accuracy Score"], int)]
	    if numeric_faith:
	        print(f"Avg Faithfulness (hallucination): {sum(numeric_faith)/len(numeric_faith):.1f}/10")
	    if numeric_acc:
	        print(f"Avg Accuracy: {sum(numeric_acc)/len(numeric_acc):.1f}/10")
	    print(f"\nReport saved to {output_path}")


	def _judge_answer(judge, prompt: str):
	    """Send one grading prompt to the judge and return ``(score, reason)``.

	    Raises:
	        ValueError: when the reply has no usable score. ``parse_score``
	            yields 0 in that case, which is below the 1-10 scale the prompts
	            request; raising keeps it from being averaged as a real grade.
	    """
	    reply = extract_content(judge.invoke(prompt))
	    score = parse_score(reply)
	    if score < 1:
	        raise ValueError(f"judge reply had no parsable score: {reply[:120]!r}")
	    # Everything after the last "Reason:" label is the judge's explanation.
	    return score, reply.split("Reason:")[-1].strip()


	# Script entry point (python -m tests.evaluate): evaluate the default TEST_CASES.
	if __name__ == "__main__":
	    run_evaluation()