import json
import os
import re
import sys
import time
import uuid

# Put the parent of this file's directory on sys.path so that the ``src``
# package imported inside run_evaluation() can be resolved.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from dotenv import load_dotenv

load_dotenv()
|
|
|
def score_faithfulness(question: str, answer: str, context: str, llm) -> float:
    """Ask ``llm`` to grade how factually consistent ``answer`` is with ``context``.

    The judge is prompted for a 0-10 rating, which is rescaled to ``[0.0, 1.0]``.

    Args:
        question: The evaluated question. Currently unused: it is not included
            in the prompt.
        answer: The generated answer; only its first 400 characters are sent.
            ``None`` is treated as an empty answer.
        context: The reference text; only its first 400 characters are sent.
            ``None`` is treated as an empty reference.
        llm: Any object exposing ``invoke(prompt)`` whose result carries the
            reply text in a ``content`` attribute.

    Returns:
        The first number found in the reply divided by 10 and clamped to
        ``[0.0, 1.0]``. The neutral fallback ``0.5`` is returned when the judge
        call raises, or when the reply is not a string containing a number.
    """
    # A missing reference/answer (e.g. a JSON null) must not abort the whole
    # evaluation with a TypeError on the slice, so coerce it to empty text.
    reference = (context or "")[:400]
    candidate = (answer or "")[:400]

    prompt = f"""Rate from 0 to 10 how factually consistent this AI answer is with the reference.
10 = all facts match, 5 = some facts match, 0 = contradicts reference.
Only output a single integer number, nothing else.

Reference: {reference}
AI Answer: {candidate}"""

    # Only the remote call is guarded: scoring is best-effort, but the failure
    # is reported so a run full of 0.5 fallbacks is not mistaken for real data.
    try:
        result = llm.invoke(prompt)
    except Exception as exc:
        print(f"  [warn] faithfulness judge call failed: {exc}", file=sys.stderr)
        return 0.5

    reply = getattr(result, "content", None)
    if not isinstance(reply, str):
        return 0.5

    # Accept an optional fractional part so a reply like "8.5" is not
    # truncated to 8; the first number in the reply wins.
    found = re.search(r"\d+(?:\.\d+)?", reply)
    if found is None:
        return 0.5

    return min(1.0, max(0.0, float(found.group()) / 10.0))
|
|
|
def score_relevancy(question: str, answer: str, llm) -> float:
    """Ask ``llm`` to grade how well ``answer`` addresses ``question``.

    The judge is prompted for a 0-10 rating, which is rescaled to ``[0.0, 1.0]``.

    Args:
        question: The question being answered; sent to the judge in full.
        answer: The generated answer; only its first 400 characters are sent.
            ``None`` is treated as an empty answer.
        llm: Any object exposing ``invoke(prompt)`` whose result carries the
            reply text in a ``content`` attribute.

    Returns:
        The first number found in the reply divided by 10 and clamped to
        ``[0.0, 1.0]``. The neutral fallback ``0.5`` is returned when the judge
        call raises, or when the reply is not a string containing a number.
    """
    # A missing answer must not abort the whole evaluation with a TypeError
    # on the slice, so coerce it to empty text.
    candidate = (answer or "")[:400]

    prompt = f"""Rate from 0 to 10 how well this answer addresses the question.
10 = completely answers it, 5 = partially, 0 = off topic.
Only output a single integer number, nothing else.

Question: {question}
Answer: {candidate}"""

    # Only the remote call is guarded: scoring is best-effort, but the failure
    # is reported so a run full of 0.5 fallbacks is not mistaken for real data.
    try:
        result = llm.invoke(prompt)
    except Exception as exc:
        print(f"  [warn] relevancy judge call failed: {exc}", file=sys.stderr)
        return 0.5

    reply = getattr(result, "content", None)
    if not isinstance(reply, str):
        return 0.5

    # Accept an optional fractional part so a reply like "8.5" is not
    # truncated to 8; the first number in the reply wins.
    found = re.search(r"\d+(?:\.\d+)?", reply)
    if found is None:
        return 0.5

    return min(1.0, max(0.0, float(found.group()) / 10.0))
|
|
|
def run_evaluation(qa_path="tests/qa_pairs.json", sample=None):
    """Run QA pairs through the agent graph, score the answers, and report.

    For each pair the question is sent to the graph on a fresh thread id, the
    wall-clock latency of that call is recorded, and the last ``AIMessage`` in
    the result is scored with ``score_faithfulness`` (against the pair's
    ``ground_truth``) and ``score_relevancy``. Averages and latency percentiles
    are printed and written to ``evaluation_report.json`` in the current
    working directory.

    Args:
        qa_path: Path to a JSON file holding a list of objects with a
            ``question`` key and an optional ``ground_truth`` key.
        sample: If truthy, only the first ``sample`` pairs are evaluated;
            ``None`` (or 0) evaluates all of them.

    Returns:
        The report dict that is also written to ``evaluation_report.json``.

    Raises:
        ValueError: If there are no QA pairs to evaluate.
    """
    from langchain_groq import ChatGroq
    from langchain_core.messages import HumanMessage, AIMessage
    from src.agents.graph import get_graph
    import src.agents.graph as g

    banner = "=" * 50
    print("\n" + banner)
    print("Evaluation Pipeline")
    print(banner)

    with open(qa_path, encoding="utf-8") as f:
        qa_pairs = json.load(f)

    if sample:
        qa_pairs = qa_pairs[:sample]

    # Fail early with a clear message instead of dividing by zero / indexing
    # an empty latency list after the graph has already been built.
    if not qa_pairs:
        raise ValueError(f"No QA pairs to evaluate in {qa_path!r}")

    total = len(qa_pairs)
    print(f"\nEvaluating {total} questions...\n")

    graph = get_graph()

    # Reset module-level state held by src.agents.graph after the graph is
    # built. NOTE(review): presumably so state from earlier sessions cannot
    # influence the evaluation - confirm against src.agents.graph.
    g._THREAD_RETRIEVERS.clear()
    g._THREAD_META.clear()
    g.search_tool = None

    # Judge model used by the scoring helpers; temperature 0 is requested.
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        api_key=os.getenv("GROQ_API_KEY"),
        temperature=0,
    )

    faithfulness_scores = []
    relevancy_scores = []
    latencies = []

    for i, pair in enumerate(qa_pairs):
        q = pair["question"]
        gt = pair.get("ground_truth", "")
        # A fresh thread id per question keeps each graph invocation separate.
        thread_id = f"eval_{uuid.uuid4().hex[:8]}"
        config = {"configurable": {"thread_id": thread_id}}

        t0 = time.perf_counter()
        try:
            result = graph.invoke(
                {"messages": [HumanMessage(content=q)]},
                config=config,
            )
            last_ai = next(
                (m for m in reversed(result["messages"]) if isinstance(m, AIMessage)),
                None,
            )
            answer = last_ai.content if last_ai else ""
        except Exception as exc:
            # Best-effort: a failed question is scored as an empty answer, but
            # the failure is surfaced instead of being silently discarded.
            print(f"  [warn] pipeline failed for question {i + 1}: {exc}", file=sys.stderr)
            answer = ""
        # Latency covers the graph call only (including failed calls), in ms.
        latency = (time.perf_counter() - t0) * 1000
        latencies.append(latency)

        faith = score_faithfulness(q, answer, gt, llm)
        relevancy = score_relevancy(q, answer, llm)

        faithfulness_scores.append(faith)
        relevancy_scores.append(relevancy)

        print(f" [{i+1:2d}/{total}] F:{faith:.2f} R:{relevancy:.2f} {latency:.0f}ms | {q[:45]}")

    faith_avg = sum(faithfulness_scores) / len(faithfulness_scores)
    rel_avg = sum(relevancy_scores) / len(relevancy_scores)
    latencies_sorted = sorted(latencies)
    p50 = latencies_sorted[len(latencies_sorted) // 2]
    p90 = latencies_sorted[int(len(latencies_sorted) * 0.9)]

    # Evaluate each target once so the report and the printout cannot diverge.
    faith_ok = faith_avg > 0.85
    rel_ok = rel_avg > 0.80
    p90_ok = p90 < 2000

    report = {
        "num_questions": total,
        "metrics": {
            "faithfulness": round(faith_avg, 4),
            "answer_relevancy": round(rel_avg, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p90": round(p90, 1),
        },
        "targets_met": {
            "faithfulness_gt_085": faith_ok,
            "answer_relevancy_gt_080": rel_ok,
            "p90_lt_2000ms": p90_ok,
        },
    }

    print("\n" + banner)
    print("RESULTS")
    print(banner)
    print(f"Faithfulness: {faith_avg:.4f} {'✅' if faith_ok else '❌'} (target >0.85)")
    print(f"Answer Relevancy: {rel_avg:.4f} {'✅' if rel_ok else '❌'} (target >0.80)")
    print(f"P50 Latency: {p50:.0f}ms")
    print(f"P90 Latency: {p90:.0f}ms {'✅' if p90_ok else '❌'} (target <2000ms)")
    print(f"\nQuestions tested: {total}")

    with open("evaluation_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print("\nSaved to evaluation_report.json")

    return report
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the evaluation.
    import argparse

    parser = argparse.ArgumentParser(
        description="Evaluate the agent graph on a set of QA pairs.",
    )
    parser.add_argument(
        "--sample",
        type=int,
        default=None,
        help="evaluate only the first N QA pairs (default: all)",
    )
    parser.add_argument(
        "--qa",
        default="tests/qa_pairs.json",
        help="path to the QA pairs JSON file (default: %(default)s)",
    )
    args = parser.parse_args()
    run_evaluation(qa_path=args.qa, sample=args.sample)