| |
| |
| |
|
|
| import json |
| import time |
| import sys |
| import os |
| from pathlib import Path |
| from statistics import mean, median |
|
|
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) |
|
|
| from src.agents.graph import run_query |
| from src.llm import get_llm |
| from src.logger import logger |
| from langchain_core.messages import HumanMessage, SystemMessage |
|
|
# Directory that holds this evaluation script and its fixtures. Resolved to an
# absolute path so file access keeps working when the script is launched via a
# relative path or the working directory changes during the run.
EVAL_DIR = Path(__file__).resolve().parent

# Timestamped evaluation reports are written below this directory.
RESULTS_DIR = EVAL_DIR / "results"
|
|
|
|
def load_test_queries() -> list[dict]:
    """Load the evaluation queries from ``test_queries.json`` in ``EVAL_DIR``.

    Returns:
        The parsed JSON content of the file. ``run_evaluation`` reads the
        ``id``, ``query``, ``type`` and ``expected_tool`` keys of each entry.

    Raises:
        FileNotFoundError: If ``test_queries.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding can
    # garble non-ASCII query text on systems whose locale is not UTF-8.
    with open(EVAL_DIR / "test_queries.json", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
def _as_score(value):
    """Return *value* as a numeric score, raising if it is not number-like."""
    if isinstance(value, bool):
        # bool is an int subclass, but True/False is never a meaningful grade.
        raise TypeError("boolean is not a valid score")
    if isinstance(value, (int, float)):
        return value
    # Graders sometimes quote numbers ("4"); float() raises on anything else.
    return float(value)


def grade_answer(query: str, answer: str, query_type: str) -> dict:
    """Ask the LLM to grade an answer on four 1-5 criteria.

    Args:
        query: The question that was asked.
        answer: The answer text to grade.
        query_type: Category label of the question, included in the prompt.

    Returns:
        A dict with numeric ``relevance``, ``accuracy``, ``completeness`` and
        ``clarity`` scores plus whatever other fields the grader returned
        (normally ``reasoning``). When the grader call fails or its reply
        holds no usable JSON object, all four scores are 0 and ``reasoning``
        is ``"grading failed"``.
    """
    criteria = ("relevance", "accuracy", "completeness", "clarity")
    failed = {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0, "reasoning": "grading failed"}

    llm = get_llm()
    grading_prompt = f"""Grade this answer on a scale of 1-5 for each criterion.

Question: {query}
Question type: {query_type}
Answer: {answer}

Criteria:
1. RELEVANCE (1-5): Does the answer address the actual question?
2. ACCURACY (1-5): Are the facts/numbers plausible and consistent?
3. COMPLETENESS (1-5): Does it cover all parts of the question?
4. CLARITY (1-5): Is it well-organized and easy to understand?

For out_of_scope questions, give 5/5 if the system correctly identifies it's outside the dataset.

Respond in JSON format only:
{{"relevance": X, "accuracy": X, "completeness": X, "clarity": X, "reasoning": "brief explanation"}}"""

    try:
        response = llm.invoke([HumanMessage(content=grading_prompt)])
        text = response.content

        # Scan for the first JSON object that carries all four scores.
        # raw_decode stops at the end of the object, so prose or code fences
        # around the JSON (even ones containing braces) do not break parsing.
        decoder = json.JSONDecoder()
        brace = text.find("{")
        while brace != -1:
            try:
                candidate, _ = decoder.raw_decode(text, brace)
            except json.JSONDecodeError:
                candidate = None

            if isinstance(candidate, dict):
                try:
                    scores = {name: _as_score(candidate[name]) for name in criteria}
                except (KeyError, TypeError, ValueError):
                    scores = None
                if scores is not None:
                    # Keep extra fields such as "reasoning"; scores are
                    # normalised to numbers so callers can average them.
                    return {**candidate, **scores}

            brace = text.find("{", brace + 1)

        logger.error("Grading failed: no valid JSON grades in grader response")
    except Exception as e:
        # Best-effort grading: a grader failure must not abort the evaluation.
        logger.error(f"Grading failed: {e}")

    return failed
|
|
|
|
# Criteria the grader scores; shared by per-query and summary averaging.
_GRADE_CRITERIA = ("relevance", "accuracy", "completeness", "clarity")


def _grade_value(grades, criterion):
    """Return one criterion's score as a number, or 0 if missing/non-numeric."""
    value = grades.get(criterion, 0) if isinstance(grades, dict) else 0
    if isinstance(value, bool):
        return 0
    if isinstance(value, (int, float)):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0


def _tool_routing_correct(plan, expected: str) -> bool:
    """Decide whether the plan text mentions the tool the test case expects."""
    # A missing/None plan is treated as "no tool mentioned" instead of raising.
    plan_text = str(plan or "").lower()
    used_sql = "sql_query" in plan_text
    used_search = "semantic_search" in plan_text

    if expected == "sql_query":
        return used_sql
    if expected == "semantic_search":
        return used_search
    if expected == "both":
        # NOTE(review): "both" passes when EITHER tool appears - confirm that
        # this leniency is intended rather than requiring both tools.
        return used_sql or used_search
    # NOTE(review): "none" always counts as correct, whatever the plan used.
    return expected == "none"


def _summarize(results: list, latencies: list) -> dict:
    """Aggregate per-query results and latencies into the summary report."""
    total = len(results)
    errors = sum(1 for r in results if r["confidence"] == "ERROR")
    # Guard the percentages so an empty query set yields 0% instead of
    # raising ZeroDivisionError.
    response_rate = (total - errors) / total * 100 if total else 0.0
    tool_accuracy = (
        sum(1 for r in results if r["tool_correct"]) / total * 100 if total else 0.0
    )

    confidence_counts = {}
    for r in results:
        label = r["confidence"]
        confidence_counts[label] = confidence_counts.get(label, 0) + 1

    # Only successfully answered queries contribute to answer quality.
    graded = [r["grades"] for r in results if r["confidence"] != "ERROR"]
    quality = {
        criterion: mean([_grade_value(g, criterion) for g in graded]) if graded else 0
        for criterion in _GRADE_CRITERIA
    }

    ordered = sorted(latencies)
    p50 = median(ordered) if ordered else 0
    # Index-based p95, clamped to the last element for short samples.
    p95 = ordered[min(int(len(ordered) * 0.95), len(ordered) - 1)] if ordered else 0

    answer_quality = {criterion: round(quality[criterion], 2) for criterion in _GRADE_CRITERIA}
    answer_quality["overall"] = round(mean(list(quality.values())), 2)

    return {
        "total_queries": total,
        "response_rate": f"{response_rate:.1f}%",
        "confidence_distribution": confidence_counts,
        "tool_routing_accuracy": f"{tool_accuracy:.1f}%",
        "answer_quality": answer_quality,
        "latency": {
            "median_seconds": round(p50, 2),
            "p95_seconds": round(p95, 2),
        },
    }


def run_evaluation():
    """Run every test query, grade the answers, and report a summary.

    For each query loaded by ``load_test_queries`` this calls ``run_query``,
    checks tool routing against the plan text, and grades the answer with
    ``grade_answer``. A query that raises is recorded as an ``ERROR`` entry
    and the run continues.

    Side effects:
        Prints progress and the summary to stdout and writes
        ``eval_<timestamp>.json`` (summary plus details) and
        ``summary_<timestamp>.json`` into ``RESULTS_DIR``.

    Returns:
        The summary dict (response rate, confidence distribution, tool
        routing accuracy, answer quality averages, latency statistics).
    """
    queries = load_test_queries()
    results = []
    latencies = []

    print(f"\nRunning evaluation on {len(queries)} test queries...\n")
    print("-" * 80)

    for position, test in enumerate(queries, start=1):
        query = test["query"]
        query_type = test["type"]

        print(f"[{position}/{len(queries)}] {query[:70]}...")

        # perf_counter is monotonic, so durations are immune to clock changes.
        start = time.perf_counter()
        try:
            result = run_query(query)
            elapsed = time.perf_counter() - start
            latencies.append(elapsed)

            answer = result["answer"]
            confidence = result["confidence"]
            tool_correct = _tool_routing_correct(result.get("plan", ""), test["expected_tool"])

            grades = grade_answer(query, answer, query_type)
            avg_grade = mean([_grade_value(grades, c) for c in _GRADE_CRITERIA])

            result_entry = {
                "id": test["id"],
                "query": query,
                "type": query_type,
                "answer": answer[:500],
                "confidence": confidence,
                "tool_correct": tool_correct,
                "latency_seconds": round(elapsed, 2),
                "grades": grades,
                "retries": result.get("retries", 0),
            }
        except Exception as e:
            elapsed = time.perf_counter() - start
            print(f" → ERROR: {e} ({elapsed:.1f}s)")
            results.append({
                "id": test["id"],
                "query": query,
                "type": query_type,
                "answer": f"ERROR: {e}",
                "confidence": "ERROR",
                "tool_correct": False,
                "latency_seconds": round(elapsed, 2),
                "grades": {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0},
                "retries": 0,
            })
        else:
            # Append only once everything succeeded, so a failure above can
            # never leave both a normal and an ERROR entry for one query.
            results.append(result_entry)
            print(f" → Confidence: {confidence} | Avg grade: {avg_grade:.1f}/5 | "
                  f"Tool correct: {tool_correct} | Time: {elapsed:.1f}s")

    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)

    summary = _summarize(results, latencies)
    print(json.dumps(summary, indent=2))

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    with open(RESULTS_DIR / f"eval_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump({"summary": summary, "details": results}, f, indent=2)

    with open(RESULTS_DIR / f"summary_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to evaluation/results/eval_{timestamp}.json")
    return summary
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the full evaluation. The returned summary is
    # not used here; run_evaluation prints and saves it itself.
    run_evaluation()
|
|