| |
| """ |
| eval/run_eval.py |
| Runs the agent against eval/datasets/queries.json and prints a results table. |
| Usage: python eval/run_eval.py |
| """ |
|
|
| import json |
| import sys |
| import time |
| import uuid |
| from pathlib import Path |
|
|
| from dotenv import load_dotenv |
| load_dotenv() |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
| from agent.graph import get_graph |
|
|
# Eval cases live in datasets/queries.json next to this script.
DATASET = Path(__file__).parent.joinpath("datasets", "queries.json")

# Status labels used in the per-case results and the printed table.
PASS = "PASS"
FAIL = "FAIL"
WARN = "WARN"
|
|
|
|
def _build_initial_state(case: dict) -> dict:
    """Return a fresh agent state for one eval case.

    Only ``user_query`` and ``connector_id`` come from the case; every other
    field starts at the empty/default value listed below.

    Raises:
        KeyError: if the case lacks ``query`` or ``connector_id``.
    """
    return {
        "session_id": str(uuid.uuid4()),
        "user_id": "eval",
        "user_query": case["query"],
        "connector_id": case["connector_id"],
        "intent": "",
        "query_plan": {},
        "relevant_tables": [],
        "schema_context": "",
        "memory_context": "",
        "conversation_history": [],
        "generated_code": "",
        "code_type": "sql",
        "sql_dialect": "postgres",
        "execution_result": None,
        "execution_error": None,
        "from_cache": False,
        "error_class": None,
        "correction_attempts": 0,
        "max_corrections": 3,
        "insight_text": "",
        "chart_spec": None,
        "anomalies": [],
        "history_id": None,
        "latency_ms": None,
        "stream_tokens": [],
    }


def _score_case(case: dict, result: dict, elapsed_ms: int) -> dict:
    """Compare the graph output for one case with its expectations.

    A case passes when the intent matches, the run produced an execution
    result (or the intent is "unsupported"), no execution error was reported,
    and every expected keyword appears in the insight text or generated code
    (case-insensitive).

    Returns:
        The result row consumed by the report printer.
    """
    intent = result.get("intent")
    expected_intent = case["expected_intent"]

    intent_ok = intent == expected_intent
    # Once intent_ok holds, intent == expected_intent, so checking the actual
    # intent for "unsupported" also covers the expected one.
    has_result = bool(result.get("execution_result")) or intent == "unsupported"
    no_error = not result.get("execution_error")

    # Lower-case the haystacks once instead of once per keyword.
    insight = (result.get("insight_text") or "").lower()
    code = (result.get("generated_code") or "").lower()
    # `or []` also covers an explicit null in the dataset.
    keywords = case.get("expected_contains") or []
    contains_ok = all(
        kw.lower() in insight or kw.lower() in code for kw in keywords
    )

    passed = intent_ok and has_result and no_error and contains_ok

    return {
        "id": case["id"],
        "query": case["query"][:55],
        "intent": intent,
        "expected_intent": expected_intent,
        # A key that is present but None must not break len() or formatting.
        "corrections": result.get("correction_attempts") or 0,
        "anomalies": len(result.get("anomalies") or []),
        "latency_ms": elapsed_ms,
        "passed": passed,
        "status": PASS if passed else FAIL,
        "exec_error": result.get("execution_error"),
    }


def _print_report(results: list) -> None:
    """Print the per-case table followed by the pass-rate/latency summary.

    Expects a non-empty list of result rows; the caller handles the empty case.
    """
    # Column widths match between header and rows ("Status" needs 6 chars).
    print(f"{'ID':<5} {'Status':<6} {'Intent':<12} {'Fixes':<6} {'Warns':<6} {'ms':<7} Query")
    print("-" * 90)
    for r in results:
        # str() guards against None, which rejects a non-empty format spec.
        print(
            f"{str(r['id']):<5} {r['status']:<6} {str(r['intent']):<12} {r['corrections']:<6} "
            f"{r.get('anomalies', 0):<6} {r['latency_ms']:<7} {r['query']}"
        )
        if r.get("error"):
            print(f" -> SYS ERROR: {r['error']}")
        if r.get("exec_error"):
            print(f" -> DB ERROR: {r['exec_error']}")

    total = len(results)
    passed = sum(1 for r in results if r["passed"])
    avg_lat = sum(r["latency_ms"] for r in results) // total
    print(f"\n{'-' * 90}")
    print(f"Passed: {passed}/{total} ({100 * passed // total}%) | Avg Latency: {avg_lat}ms")


def run_eval() -> bool:
    """Run every case in DATASET through the agent graph and print a report.

    Each case is invoked in isolation: any exception raised while building
    the state, invoking the graph or scoring the output is recorded as a
    failed row instead of aborting the whole run.

    Returns:
        True when at least one case ran and all cases passed, else False.
    """
    cases = json.loads(DATASET.read_text(encoding="utf-8"))
    if not cases:
        # Nothing was verified, so report failure rather than dividing by zero
        # in the summary or signalling a vacuous success.
        print(f"\nNo eval cases found in {DATASET}\n")
        return False

    graph = get_graph()
    results = []

    print(f"\nRunning {len(cases)} eval cases...\n")

    for case in cases:
        # perf_counter is monotonic, so latency is immune to clock adjustments.
        started = time.perf_counter()
        try:
            result = graph.invoke(_build_initial_state(case))
            elapsed_ms = int((time.perf_counter() - started) * 1000)
            results.append(_score_case(case, result, elapsed_ms))
        except Exception as exc:  # harness boundary: record and keep going
            # .get() keeps the handler itself from raising on a malformed case.
            results.append({
                "id": case.get("id", "?"),
                "query": str(case.get("query", ""))[:55],
                "intent": "ERROR",
                "expected_intent": case.get("expected_intent"),
                "corrections": 0,
                "anomalies": 0,
                "latency_ms": int((time.perf_counter() - started) * 1000),
                "passed": False,
                "status": FAIL,
                "error": str(exc),
            })

    _print_report(results)
    return all(r["passed"] for r in results)
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit status mirrors the eval outcome: 0 only when run_eval() is truthy.
    sys.exit(0 if run_eval() else 1)
|
|