File size: 7,539 Bytes
2b63102
 
 
8f6bb93
8a27bb1
 
8f6bb93
ee3f4ca
 
8f6bb93
 
 
 
8a27bb1
ee3f4ca
8f6bb93
ee3f4ca
8f6bb93
ee3f4ca
8a27bb1
ee3f4ca
8f6bb93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b63102
 
8f6bb93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b63102
8f6bb93
 
 
 
 
 
98714ab
8f6bb93
ee3f4ca
8f6bb93
ee3f4ca
8f6bb93
ee3f4ca
 
8f6bb93
 
 
 
 
 
ee3f4ca
8f6bb93
 
 
 
 
8a27bb1
8f6bb93
 
 
 
 
 
 
 
 
2b63102
8f6bb93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b63102
8f6bb93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b87551
8a27bb1
 
 
8f6bb93
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""Offline LLM-as-a-judge evaluation. Grades the agent on two metrics per question:
faithfulness (grounded in the retrieved context?) and accuracy (matches ground truth?).
Run from the project root: python -m tests.evaluate"""

import sys
import os
import re
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Put the parent of this file's directory at the front of sys.path so that the
# `src` package imported below can be resolved.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Load variables from a .env file; run_evaluation() later reads GOOGLE_API_KEY
# through os.getenv.
load_dotenv()

try:
    from src.agent import get_agent_executor, file_processor, _fallback_kb
except ImportError:
    # Exit with a usage hint rather than a traceback when the import fails.
    print("Run this from the project root: python -m tests.evaluate")
    sys.exit(1)


# Judge prompt for the faithfulness metric. run_evaluation() fills the
# {question}, {context} and {answer} placeholders with str.format. The
# "Score:" / "Reason:" reply format requested at the end is what parse_score()
# and the "Reason:" split in run_evaluation() look for.
FAITHFULNESS_PROMPT = """\
You are evaluating whether an AI answer is grounded in the provided source context.

Question: {question}
Retrieved Context: {context}
AI Answer: {answer}

Does the answer contain claims NOT supported by the retrieved context?
A faithful answer only uses information present in the context.
A hallucinated answer invents facts or adds information not in the context.

Score 1-10 where:
  9-10 = fully grounded, no unsupported claims
  6-8  = mostly grounded, minor additions
  3-5  = several unsupported claims
  1-2  = mostly fabricated

Format:
Score: [1-10]
Reason: [one sentence]"""

# Judge prompt for the accuracy metric. run_evaluation() fills the {question},
# {ground_truth} and {answer} placeholders with str.format, and only uses this
# prompt for test cases that provide a ground truth. The reply format matches
# the one requested by FAITHFULNESS_PROMPT so both replies are parsed the same way.
ACCURACY_PROMPT = """\
You are a strict teacher grading a student's answer.

Question: {question}
Ground Truth: {ground_truth}
Student Answer: {answer}

On a scale of 1-10, how accurate and complete is the student's answer compared to the ground truth?
- 9-10: matches ground truth meaning, may add correct extra details
- 6-8:  partially correct, missing some key points
- 3-5:  relevant but significantly incomplete or partially wrong
- 1-2:  incorrect or completely off-topic

Format:
Score: [1-10]
Reason: [one sentence]"""


def extract_content(message) -> str:
    """Return the textual content of a chat message as one string.

    ``message.content`` may be a plain value (normally a string) or a list of
    content blocks. For a list:

    * dict blocks are kept only when their ``"type"`` is ``"text"``;
    * any non-dict block is stringified and kept;
    * the kept pieces are joined with single spaces.

    Args:
        message: Any object exposing a ``content`` attribute.

    Returns:
        The flattened text. An empty list yields ``""``.
    """
    content = message.content
    if not isinstance(content, list):
        return str(content)

    parts = []
    for block in content:
        if not isinstance(block, dict):
            parts.append(str(block))
        elif block.get("type") == "text":
            # .get + str(): a text block with a missing or non-string "text"
            # value must not abort the whole extraction with KeyError/TypeError.
            parts.append(str(block.get("text", "")))
    return " ".join(parts)


def parse_score(text: str) -> int:
    """Extract the integer that follows the first ``Score:`` label in *text*.

    The match is case-insensitive and skips whitespace and common decoration
    between the colon and the digits (``*``, ``_``, ``[``, ``(``), so replies
    such as ``Score: 8``, ``Score: [8]``, ``**Score:** 8/10`` and ``score: 8``
    all parse to 8.

    Args:
        text: The judge model's reply.

    Returns:
        The parsed score, or 0 when no ``Score:`` label followed by digits is
        found.
    """
    match = re.search(r"score\s*:[\s*_\[(]*(\d+)", text, re.IGNORECASE)
    return int(match.group(1)) if match else 0


def get_context(question: str) -> str:
    """Fetch the context for *question*, preferring the document store.

    ``file_processor`` is queried only when it reports having documents; if it
    has none, or its retrieval result is empty, the result of
    ``_fallback_kb.retrieve`` is returned instead.
    """
    from_documents = (
        file_processor.retrieve(question) if file_processor.has_documents() else None
    )
    return from_documents or _fallback_kb.retrieve(question)


# Add your own Q&A pairs here. Leave ground_truth as None for web questions
# (accuracy is then skipped, since there is nothing to compare against).
#
# Keys read by run_evaluation():
#   "question"     - required; the text sent to the agent.
#   "ground_truth" - optional; accuracy is graded only when this is non-empty.
#   "source"       - optional, treated as "rag" when omitted; faithfulness is
#                    graded only when the value is "rag".
TEST_CASES = [
    {
        "question": "What are the reporting requirements for State Parties?",
        "ground_truth": (
            "State Parties must submit a comprehensive report initially, followed by further "
            "information included in reports to the Committee on the Rights of the Child. "
            "Other State Parties need to submit reports every five years."
        ),
        "source": "rag",
    },
    {
        "question": "What happens if a State Party denounces the Protocol?",
        "ground_truth": (
            "Denunciation does not affect acts or situations occurring before the denunciation "
            "becomes effective. It also does not prejudice the continued consideration of matters "
            "already under consideration."
        ),
        "source": "rag",
    },
]


def _grade(judge, prompt: str):
    """Send *prompt* to the judge model and return ``(score, reason)``.

    The score is taken from the reply by parse_score(); the reason is the text
    after the last "Reason:" marker (the whole reply if the marker is absent).
    """
    reply = extract_content(judge.invoke(prompt))
    return parse_score(reply), reply.split("Reason:")[-1].strip()


def run_evaluation(test_cases: list = None):
    """Run each test case through the agent, grade the answers, and save a report.

    For every case the agent is asked the question, then a judge model grades
    faithfulness (only when the case's source is "rag") and accuracy (only when
    a ground truth is given). A failure in one case or one metric is recorded
    in the report and does not stop the remaining work.

    Args:
        test_cases: Dicts with a "question" key and optional "ground_truth" and
            "source" keys. ``None`` (the default) runs the module-level
            TEST_CASES; an explicitly empty list runs nothing.

    Returns:
        None. Rows are written to ``evaluation_report.csv`` in the current
        working directory and a summary is printed. The function returns early,
        without writing a report, when GOOGLE_API_KEY is unset, the agent
        cannot be initialized, or no results were produced.
    """
    # `is None` rather than `or`: an empty list is a deliberate "no cases",
    # not a request for the defaults.
    cases = TEST_CASES if test_cases is None else test_cases
    print(f"Starting evaluation ({len(cases)} test case(s))\n")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("Set GOOGLE_API_KEY in your .env to run the offline evaluation.")
        return

    judge = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0, google_api_key=api_key)

    try:
        agent = get_agent_executor(api_key)
    except Exception as e:
        print(f"Could not initialize agent: {e}")
        return

    results = []

    for i, case in enumerate(cases, 1):
        question = case["question"]
        ground_truth = case.get("ground_truth")
        source = case.get("source", "rag")

        print(f"[{i}/{len(cases)}] {question}")

        try:
            result = agent.invoke({"messages": [("user", question)]})
            answer = extract_content(result["messages"][-1])
        except Exception as e:
            # Best-effort: record the failure as a row and move on to the next case.
            print(f"  Agent error: {e}\n")
            results.append({
                "Question": question,
                "Answer": f"ERROR: {e}",
                # Same empty-string convention as the success row below, so the
                # "Ground Truth" column is uniform in the CSV.
                "Ground Truth": ground_truth or "",
                "Faithfulness Score": "-",
                "Faithfulness Reason": str(e),
                "Accuracy Score": "-",
                "Accuracy Reason": str(e),
            })
            continue

        print(f"  Answer: {answer[:120]}...")

        # Faithfulness only applies to document-grounded answers
        faithfulness_score, faithfulness_reason = "-", "N/A (web search question)"
        if source == "rag":
            try:
                context = get_context(question)
                faithfulness_score, faithfulness_reason = _grade(
                    judge,
                    FAITHFULNESS_PROMPT.format(question=question, context=context, answer=answer),
                )
                print(f"  Faithfulness: {faithfulness_score}/10")
            except Exception as e:
                # Score stays "-" so it is excluded from the numeric average.
                faithfulness_reason = str(e)
                print(f"  Faithfulness check failed: {e}")

        # Accuracy only when a ground truth is provided
        accuracy_score, accuracy_reason = "-", "N/A (no ground truth)"
        if ground_truth:
            try:
                accuracy_score, accuracy_reason = _grade(
                    judge,
                    ACCURACY_PROMPT.format(question=question, ground_truth=ground_truth, answer=answer),
                )
                print(f"  Accuracy:     {accuracy_score}/10")
            except Exception as e:
                # Score stays "-" so it is excluded from the numeric average.
                accuracy_reason = str(e)
                print(f"  Accuracy check failed: {e}")

        results.append({
            "Question": question,
            "Answer": answer,
            "Ground Truth": ground_truth or "",
            "Faithfulness Score": faithfulness_score,
            "Faithfulness Reason": faithfulness_reason,
            "Accuracy Score": accuracy_score,
            "Accuracy Reason": accuracy_reason,
        })
        print()

    # Save report
    if not results:
        print("No results to save.")
        return

    df = pd.DataFrame(results)
    df.to_csv("evaluation_report.csv", index=False)

    # Print summary: only integer scores are averaged; "-" placeholders are skipped.
    print("=" * 50)
    numeric_faith = [r["Faithfulness Score"] for r in results if isinstance(r["Faithfulness Score"], int)]
    numeric_acc = [r["Accuracy Score"] for r in results if isinstance(r["Accuracy Score"], int)]
    if numeric_faith:
        print(f"Avg Faithfulness (hallucination): {sum(numeric_faith)/len(numeric_faith):.1f}/10")
    if numeric_acc:
        print(f"Avg Accuracy:                     {sum(numeric_acc)/len(numeric_acc):.1f}/10")
    print("\nReport saved to evaluation_report.csv")


# Script entry point: evaluate the default TEST_CASES when this module is
# executed directly (e.g. `python -m tests.evaluate`).
if __name__ == "__main__":
    run_evaluation()