Spaces:

Devisri515
/

Agentic_RAG_Knowledge_Search

Sleeping

File size: 7,539 Bytes

"""Offline LLM-as-a-judge evaluation. Grades the agent on two metrics per question:
faithfulness (grounded in the retrieved context?) and accuracy (matches ground truth?).
Run from the project root: python -m tests.evaluate"""

import sys
import os
import re
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Put the parent of this file's directory (the project root, per the hint
# printed below) at the front of sys.path so that `src` can be imported.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Load variables from a .env file into the process environment;
# run_evaluation() later reads GOOGLE_API_KEY via os.getenv().
load_dotenv()

try:
    from src.agent import get_agent_executor, file_processor, _fallback_kb
except ImportError:
    # Nothing can be evaluated without the agent module: print a hint and exit
    # with a non-zero status.
    print("Run this from the project root: python -m tests.evaluate")
    sys.exit(1)


# Judge prompt for the faithfulness metric (is the answer grounded in the
# retrieved context?). The {question}, {context} and {answer} placeholders are
# filled with str.format(). The "Score:" / "Reason:" reply format requested at
# the end is what the score parsing and the "Reason:" split rely on, so keep
# the two in sync when editing this text.
FAITHFULNESS_PROMPT = """\
You are evaluating whether an AI answer is grounded in the provided source context.

Question: {question}
Retrieved Context: {context}
AI Answer: {answer}

Does the answer contain claims NOT supported by the retrieved context?
A faithful answer only uses information present in the context.
A hallucinated answer invents facts or adds information not in the context.

Score 1-10 where:
  9-10 = fully grounded, no unsupported claims
  6-8  = mostly grounded, minor additions
  3-5  = several unsupported claims
  1-2  = mostly fabricated

Format:
Score: [1-10]
Reason: [one sentence]"""

# Judge prompt for the accuracy metric (does the answer match the ground
# truth?). The {question}, {ground_truth} and {answer} placeholders are filled
# with str.format(). As with the faithfulness prompt, the requested
# "Score:" / "Reason:" reply format is what the result parsing depends on.
ACCURACY_PROMPT = """\
You are a strict teacher grading a student's answer.

Question: {question}
Ground Truth: {ground_truth}
Student Answer: {answer}

On a scale of 1-10, how accurate and complete is the student's answer compared to the ground truth?
- 9-10: matches ground truth meaning, may add correct extra details
- 6-8:  partially correct, missing some key points
- 3-5:  relevant but significantly incomplete or partially wrong
- 1-2:  incorrect or completely off-topic

Format:
Score: [1-10]
Reason: [one sentence]"""


def extract_content(message) -> str:
    """Return the textual content of a chat message as a single string.

    ``message.content`` may be a plain value or a list of content blocks.
    For a list, dict blocks contribute their ``"text"`` entry only when their
    ``"type"`` is ``"text"`` (other dict blocks are dropped), non-dict blocks
    contribute ``str(block)``, and the pieces are joined with single spaces.
    Anything that is not a list is simply passed through ``str()``.
    """
    payload = message.content
    if not isinstance(payload, list):
        return str(payload)

    pieces = []
    for part in payload:
        if not isinstance(part, dict):
            pieces.append(str(part))
        elif part.get("type") == "text":
            pieces.append(part["text"])
    return " ".join(pieces)


def parse_score(text: str) -> int:
    """Extract the 1-10 score from a judge reply.

    The judge is asked to answer with a ``Score: N`` line, but models often
    decorate it (``**Score:** 8``, ``score: 8/10``, ``Score: [8]``).  The
    label is therefore matched case-insensitively and markdown emphasis,
    backticks, an opening bracket or whitespace are tolerated between the
    colon and the digits.

    Args:
        text: Raw text of the judge's reply.

    Returns:
        The first score found if it lies in the 1-10 range the prompts ask
        for; ``0`` when no score is present or the number is out of range, so
        that ``0`` always means "no usable score".
    """
    match = re.search(r"score\s*:\s*[*_`\[\s]*(\d+)", text, re.IGNORECASE)
    if match is None:
        return 0
    value = int(match.group(1))
    # A number outside the requested scale is a malformed reply, not a grade.
    return value if 1 <= value <= 10 else 0


def get_context(question: str) -> str:
    """Fetch the context passage used to judge faithfulness for *question*.

    ``file_processor`` is queried only when it reports having documents; if it
    has none, or its lookup comes back empty, the result of
    ``_fallback_kb.retrieve`` is returned instead.  This is intended to mirror
    the retrieval the agent performs for RAG questions.
    """
    primary = file_processor.retrieve(question) if file_processor.has_documents() else None
    return primary or _fallback_kb.retrieve(question)


# Default evaluation set: add your own Q&A pairs here. Each case is a dict with:
#   "question"     - the prompt sent to the agent (required).
#   "ground_truth" - reference answer for the accuracy metric. Leave it as None
#                    for web questions (accuracy is then skipped, since there
#                    is nothing to compare against).
#   "source"       - "rag" enables the faithfulness check; run_evaluation()
#                    treats a missing key as "rag" and skips the check for any
#                    other value.
TEST_CASES = [
    {
        "question": "What are the reporting requirements for State Parties?",
        "ground_truth": (
            "State Parties must submit a comprehensive report initially, followed by further "
            "information included in reports to the Committee on the Rights of the Child. "
            "Other State Parties need to submit reports every five years."
        ),
        "source": "rag",
    },
    {
        "question": "What happens if a State Party denounces the Protocol?",
        "ground_truth": (
            "Denunciation does not affect acts or situations occurring before the denunciation "
            "becomes effective. It also does not prejudice the continued consideration of matters "
            "already under consideration."
        ),
        "source": "rag",
    },
]


def _judge_metric(judge, label: str, build_prompt):
    """Grade one metric with the judge model and return ``(score, reason)``.

    ``build_prompt`` is a zero-argument callable so that building the prompt
    (which may itself fail, e.g. while retrieving context) is covered by the
    same error handling as the judge call.  ``score`` is an int in 1-10, or
    ``"-"`` when the check raised or the reply carried no usable score.
    """
    try:
        reply = extract_content(judge.invoke(build_prompt()))
    except Exception as e:  # best effort: one failed check must not abort the run
        print(f"  {label} check failed: {e}")
        return "-", str(e)

    score = parse_score(reply)
    if not 1 <= score <= 10:
        # parse_score() yields 0 when it finds no score. Recording that as a
        # real grade would silently drag the averages down, so mark it missing
        # and keep the raw reply for inspection.
        print(f"  {label} check failed: no 1-10 score in judge reply")
        return "-", f"Unparseable judge reply: {reply.strip()}"

    heading = f"{label}:"
    print(f"  {heading:<13} {score}/10")
    return score, reply.split("Reason:")[-1].strip()


def _result_row(question, answer, ground_truth, faithfulness, accuracy) -> dict:
    """Build one report row; *faithfulness* and *accuracy* are (score, reason) pairs."""
    return {
        "Question": question,
        "Answer": answer,
        "Ground Truth": ground_truth or "",
        "Faithfulness Score": faithfulness[0],
        "Faithfulness Reason": faithfulness[1],
        "Accuracy Score": accuracy[0],
        "Accuracy Reason": accuracy[1],
    }


def run_evaluation(test_cases: list = None):
    """Run the agent on each test case, grade it with the judge model, and report.

    For every case the agent's answer is graded on faithfulness (only when the
    case's ``source`` is ``"rag"``) and on accuracy (only when a
    ``ground_truth`` is given).  All rows are written to
    ``evaluation_report.csv`` in the current working directory and the average
    of the numeric scores is printed.

    Args:
        test_cases: List of case dicts with a ``"question"`` key and optional
            ``"ground_truth"`` / ``"source"`` keys.  ``None`` (the default)
            selects the module-level ``TEST_CASES``; an explicit empty list is
            honoured and evaluates nothing.

    Returns:
        None.  The function prints a message and returns early when
        ``GOOGLE_API_KEY`` is not set, when the agent cannot be initialised,
        or when there are no results to save.
    """
    # Fall back to the defaults only when no list was supplied: `x or default`
    # would also replace an explicitly passed empty list.
    cases = TEST_CASES if test_cases is None else test_cases
    print(f"Starting evaluation ({len(cases)} test case(s))\n")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("Set GOOGLE_API_KEY in your .env to run the offline evaluation.")
        return

    # temperature=0 asks the judge for its least random output so that
    # repeated runs are as comparable as possible.
    judge = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0, google_api_key=api_key)

    try:
        agent = get_agent_executor(api_key)
    except Exception as e:
        print(f"Could not initialize agent: {e}")
        return

    results = []

    for i, case in enumerate(cases, 1):
        question = case["question"]
        ground_truth = case.get("ground_truth")
        source = case.get("source", "rag")

        print(f"[{i}/{len(cases)}] {question}")

        try:
            result = agent.invoke({"messages": [("user", question)]})
            answer = extract_content(result["messages"][-1])
        except Exception as e:
            # Record the failure as a row so the report still lists every case.
            print(f"  Agent error: {e}\n")
            results.append(_result_row(question, f"ERROR: {e}", ground_truth,
                                       ("-", str(e)), ("-", str(e))))
            continue

        print(f"  Answer: {answer[:120]}...")

        # Faithfulness only applies to document-grounded answers
        faithfulness = ("-", "N/A (web search question)")
        if source == "rag":
            faithfulness = _judge_metric(
                judge,
                "Faithfulness",
                lambda: FAITHFULNESS_PROMPT.format(
                    question=question, context=get_context(question), answer=answer
                ),
            )

        # Accuracy only when a ground truth is provided
        accuracy = ("-", "N/A (no ground truth)")
        if ground_truth:
            accuracy = _judge_metric(
                judge,
                "Accuracy",
                lambda: ACCURACY_PROMPT.format(
                    question=question, ground_truth=ground_truth, answer=answer
                ),
            )

        results.append(_result_row(question, answer, ground_truth, faithfulness, accuracy))
        print()

    # Save report
    if not results:
        print("No results to save.")
        return

    df = pd.DataFrame(results)
    df.to_csv("evaluation_report.csv", index=False)

    # Print summary; only genuine integer grades count, "-" placeholders do not.
    print("=" * 50)
    numeric_faith = [r["Faithfulness Score"] for r in results if isinstance(r["Faithfulness Score"], int)]
    numeric_acc = [r["Accuracy Score"] for r in results if isinstance(r["Accuracy Score"], int)]
    if numeric_faith:
        print(f"Avg Faithfulness (hallucination): {sum(numeric_faith)/len(numeric_faith):.1f}/10")
    if numeric_acc:
        print(f"Avg Accuracy:                     {sum(numeric_acc)/len(numeric_acc):.1f}/10")
    print("\nReport saved to evaluation_report.csv")


# Script entry point: evaluate the built-in TEST_CASES when this module is run
# directly (e.g. `python -m tests.evaluate`), but not when it is imported.
if __name__ == "__main__":
    run_evaluation()