# Evaluation runner — tests the agent against curated queries
# and grades answers using LLM-as-judge (1-5 scale)
# tracks: response rate, confidence dist, latency, tool accuracy

import json
import time
import sys
import os
import re
from collections import Counter
from pathlib import Path
from statistics import mean, median

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.agents.graph import run_query
from src.llm import get_llm
from src.logger import logger
from langchain_core.messages import HumanMessage, SystemMessage

EVAL_DIR = Path(__file__).parent
RESULTS_DIR = EVAL_DIR / "results"

# The four numeric criteria the judge is asked to score.
_GRADE_KEYS = ("relevance", "accuracy", "completeness", "clarity")


def load_test_queries() -> list[dict]:
    """Load the curated test queries from ``test_queries.json`` next to this file."""
    with open(EVAL_DIR / "test_queries.json", encoding="utf-8") as f:
        return json.load(f)


def _coerce_score(value: object) -> float:
    """Turn one judge score into a number in [0, 5]; anything unusable becomes 0."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0
    if score != score:  # NaN: json.loads accepts the bare token "NaN"
        return 0
    score = min(max(score, 0.0), 5.0)
    # Keep whole scores as ints so the saved JSON reads "4", not "4.0".
    return int(score) if score.is_integer() else score


def _normalize_grades(raw: dict) -> dict:
    """Return a copy of the judge's dict with every criterion present and numeric."""
    grades = dict(raw)
    for key in _GRADE_KEYS:
        grades[key] = _coerce_score(raw.get(key))
    return grades


def grade_answer(query: str, answer: str, query_type: str) -> dict:
    """Grade an answer with the LLM itself on a 1-5 scale per criterion.

    Args:
        query: The question that was put to the agent.
        answer: The agent's answer text.
        query_type: The test-case type, passed through to the judge prompt.

    Returns:
        A dict with numeric ``relevance``, ``accuracy``, ``completeness`` and
        ``clarity`` entries plus whatever else the judge returned (normally
        ``reasoning``). If the judge call fails or its reply holds no
        parseable JSON object, every score is 0 and ``reasoning`` is
        ``"grading failed"``.
    """
    llm = get_llm()

    grading_prompt = f"""Grade this answer on a scale of 1-5 for each criterion.

Question: {query}
Question type: {query_type}
Answer: {answer}

Criteria:
1. RELEVANCE (1-5): Does the answer address the actual question?
2. ACCURACY (1-5): Are the facts/numbers plausible and consistent?
3. COMPLETENESS (1-5): Does it cover all parts of the question?
4. CLARITY (1-5): Is it well-organized and easy to understand?

For out_of_scope questions, give 5/5 if the system correctly identifies it's outside the dataset.

Respond in JSON format only:
{{"relevance": X, "accuracy": X, "completeness": X, "clarity": X, "reasoning": "brief explanation"}}"""

    try:
        response = llm.invoke([HumanMessage(content=grading_prompt)])
        # The judge may wrap the JSON in prose or code fences, so grab the
        # outermost {...} span rather than parsing the whole reply.
        json_match = re.search(r'\{.*\}', response.content, re.DOTALL)
        if json_match:
            parsed = json.loads(json_match.group())
            if isinstance(parsed, dict):
                # Judge output is untrusted: scores can be missing or strings.
                return _normalize_grades(parsed)
    except Exception as e:
        logger.error(f"Grading failed: {e}")

    return {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0, "reasoning": "grading failed"}


def _used_expected_tool(plan: object, expected: str) -> bool:
    """Check the agent's plan text for the tool the test case expects."""
    # The plan may be missing or not a plain string; stringify before searching.
    plan_text = str(plan or "").lower()
    used_sql = "sql_query" in plan_text
    used_search = "semantic_search" in plan_text

    if expected == "sql_query":
        return used_sql
    if expected == "semantic_search":
        return used_search
    if expected == "both":
        # NOTE(review): either tool satisfies "both" here — confirm this
        # leniency is intended rather than requiring both tools.
        return used_sql or used_search
    # any response is fine for out-of-scope
    return expected == "none"


def _summarize(results: list[dict], latencies: list[float]) -> dict:
    """Aggregate per-query results and latencies into the summary dict."""
    total = len(results)
    errors = sum(1 for r in results if r["confidence"] == "ERROR")
    tool_hits = sum(1 for r in results if r["tool_correct"])
    # Guard the percentages so an empty query file reports zeros, not a crash.
    response_rate = (total - errors) / total * 100 if total else 0.0
    tool_accuracy = tool_hits / total * 100 if total else 0.0

    confidence_counts = dict(Counter(r["confidence"] for r in results))

    # Errored queries are left out of the quality averages.
    graded = [r["grades"] for r in results if r["confidence"] != "ERROR"]
    averages = {
        key: mean([g.get(key, 0) for g in graded]) if graded else 0
        for key in _GRADE_KEYS
    }

    if latencies:
        ordered = sorted(latencies)
        p50 = median(ordered)
        # Nearest-rank p95: ceil(0.95 * n), in integer math to avoid float error.
        rank = (95 * len(ordered) + 99) // 100
        p95 = ordered[rank - 1]
    else:
        p50 = p95 = 0

    return {
        "total_queries": total,
        "response_rate": f"{response_rate:.1f}%",
        "confidence_distribution": confidence_counts,
        "tool_routing_accuracy": f"{tool_accuracy:.1f}%",
        "answer_quality": {
            "relevance": round(averages["relevance"], 2),
            "accuracy": round(averages["accuracy"], 2),
            "completeness": round(averages["completeness"], 2),
            "clarity": round(averages["clarity"], 2),
            "overall": round(mean(averages.values()), 2),
        },
        "latency": {
            "median_seconds": round(p50, 2),
            "p95_seconds": round(p95, 2),
        },
    }


def run_evaluation():
    """Run the full evaluation suite across the test queries.

    Each query is sent through ``run_query``, checked for tool routing and
    graded by ``grade_answer``. A query that raises is recorded as an
    ``"ERROR"`` row instead of aborting the run. The summary is printed, and
    both the summary and the per-query details are written as timestamped
    JSON files under ``RESULTS_DIR``.

    Returns:
        The summary dict (response rate, confidence distribution, tool
        routing accuracy, answer quality averages and latency).
    """
    queries = load_test_queries()
    results = []
    latencies = []

    print(f"\nRunning evaluation on {len(queries)} test queries...\n")
    print("-" * 80)

    for position, test in enumerate(queries, start=1):
        query = test["query"]
        query_type = test["type"]
        print(f"[{position}/{len(queries)}] {query[:70]}...")

        start = time.time()
        try:
            result = run_query(query)
            elapsed = time.time() - start
            latencies.append(elapsed)

            answer = result["answer"]
            confidence = result["confidence"]
            tool_correct = _used_expected_tool(
                result.get("plan", ""), test["expected_tool"]
            )
            grades = grade_answer(query, answer, query_type)

            result_entry = {
                "id": test["id"],
                "query": query,
                "type": query_type,
                "answer": answer[:500],  # truncate for storage
                "confidence": confidence,
                "tool_correct": tool_correct,
                "latency_seconds": round(elapsed, 2),
                "grades": grades,
                "retries": result.get("retries", 0),
            }
        except Exception as e:
            elapsed = time.time() - start
            print(f" → ERROR: {e} ({elapsed:.1f}s)")
            results.append({
                "id": test["id"],
                "query": query,
                "type": query_type,
                "answer": f"ERROR: {e}",
                "confidence": "ERROR",
                "tool_correct": False,
                "latency_seconds": round(elapsed, 2),
                "grades": {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0},
                "retries": 0,
            })
            continue

        # Append only after the try block so a late failure can never leave
        # both a success row and an ERROR row for the same query.
        results.append(result_entry)
        avg_grade = mean([grades.get(key, 0) for key in _GRADE_KEYS])
        print(f" → Confidence: {confidence} | Avg grade: {avg_grade:.1f}/5 | "
              f"Tool correct: {tool_correct} | Time: {elapsed:.1f}s")

    # compute aggregate metrics
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)

    summary = _summarize(results, latencies)
    print(json.dumps(summary, indent=2))

    # save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    with open(RESULTS_DIR / f"eval_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump({"summary": summary, "details": results}, f, indent=2)

    with open(RESULTS_DIR / f"summary_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to evaluation/results/eval_{timestamp}.json")
    return summary


if __name__ == "__main__":
    run_evaluation()