Spaces:

Misbah17311
/

financial-intelligence-agent

Sleeping

financial-intelligence-agent / evaluation /evaluate.py

Misbah

clean up comments and formatting across all modules

4506ba8 2 months ago

7.27 kB

	# Evaluation runner — tests the agent against curated queries
	# and grades answers using LLM-as-judge (1-5 scale)
	# tracks: response rate, confidence dist, latency, tool accuracy

	import json
	import time
	import sys
	import os
	from pathlib import Path
	from statistics import mean, median

	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

	from src.agents.graph import run_query
	from src.llm import get_llm
	from src.logger import logger
	from langchain_core.messages import HumanMessage, SystemMessage

	EVAL_DIR = Path(__file__).parent
	RESULTS_DIR = EVAL_DIR / "results"


	def load_test_queries() -> list[dict]:
	with open(EVAL_DIR / "test_queries.json") as f:
	return json.load(f)


	def grade_answer(query: str, answer: str, query_type: str) -> dict:
	# use the LLM itself to grade answer quality on 1-5 scale
	llm = get_llm()
	grading_prompt = f"""Grade this answer on a scale of 1-5 for each criterion.

	Question: {query}
	Question type: {query_type}
	Answer: {answer}

	Criteria:
	1. RELEVANCE (1-5): Does the answer address the actual question?
	2. ACCURACY (1-5): Are the facts/numbers plausible and consistent?
	3. COMPLETENESS (1-5): Does it cover all parts of the question?
	4. CLARITY (1-5): Is it well-organized and easy to understand?

	For out_of_scope questions, give 5/5 if the system correctly identifies it's outside the dataset.

	Respond in JSON format only:
	{{"relevance": X, "accuracy": X, "completeness": X, "clarity": X, "reasoning": "brief explanation"}}"""

	try:
	response = llm.invoke([HumanMessage(content=grading_prompt)])
	# parse the JSON from the response
	import re
	json_match = re.search(r'\{.*\}', response.content, re.DOTALL)
	if json_match:
	return json.loads(json_match.group())
	except Exception as e:
	logger.error(f"Grading failed: {e}")

	return {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0, "reasoning": "grading failed"}


	def run_evaluation():
	# run the full evaluation suite across test queries
	queries = load_test_queries()
	results = []
	latencies = []

	print(f"\nRunning evaluation on {len(queries)} test queries...\n")
	print("-" * 80)

	for i, test in enumerate(queries):
	query = test["query"]
	query_type = test["type"]

	print(f"[{i+1}/{len(queries)}] {query[:70]}...")

	start = time.time()
	try:
	result = run_query(query)
	elapsed = time.time() - start
	latencies.append(elapsed)

	answer = result["answer"]
	confidence = result["confidence"]

	# check if the agent used the right tool by looking at the plan
	plan = result.get("plan", "")
	used_sql = "sql_query" in plan.lower()
	used_search = "semantic_search" in plan.lower()
	expected = test["expected_tool"]

	tool_correct = False
	if expected == "sql_query" and used_sql:
	tool_correct = True
	elif expected == "semantic_search" and used_search:
	tool_correct = True
	elif expected == "both" and (used_sql or used_search):
	tool_correct = True
	elif expected == "none":
	tool_correct = True # any response is fine for out-of-scope

	# grade the answer
	grades = grade_answer(query, answer, query_type)

	result_entry = {
	"id": test["id"],
	"query": query,
	"type": query_type,
	"answer": answer[:500], # truncate for storage
	"confidence": confidence,
	"tool_correct": tool_correct,
	"latency_seconds": round(elapsed, 2),
	"grades": grades,
	"retries": result.get("retries", 0),
	}
	results.append(result_entry)

	avg_grade = mean([
	grades.get("relevance", 0),
	grades.get("accuracy", 0),
	grades.get("completeness", 0),
	grades.get("clarity", 0),
	])
	print(f" → Confidence: {confidence} \| Avg grade: {avg_grade:.1f}/5 \| "
	f"Tool correct: {tool_correct} \| Time: {elapsed:.1f}s")

	except Exception as e:
	elapsed = time.time() - start
	print(f" → ERROR: {e} ({elapsed:.1f}s)")
	results.append({
	"id": test["id"],
	"query": query,
	"type": query_type,
	"answer": f"ERROR: {e}",
	"confidence": "ERROR",
	"tool_correct": False,
	"latency_seconds": round(elapsed, 2),
	"grades": {"relevance": 0, "accuracy": 0, "completeness": 0, "clarity": 0},
	"retries": 0,
	})

	# compute aggregate metrics
	print("\n" + "=" * 80)
	print("EVALUATION SUMMARY")
	print("=" * 80)

	total = len(results)
	errors = sum(1 for r in results if r["confidence"] == "ERROR")
	response_rate = (total - errors) / total * 100

	confidence_counts = {}
	for r in results:
	c = r["confidence"]
	confidence_counts[c] = confidence_counts.get(c, 0) + 1

	tool_accuracy = sum(1 for r in results if r["tool_correct"]) / total * 100

	all_grades = [r["grades"] for r in results if r["confidence"] != "ERROR"]
	avg_relevance = mean([g["relevance"] for g in all_grades]) if all_grades else 0
	avg_accuracy = mean([g["accuracy"] for g in all_grades]) if all_grades else 0
	avg_completeness = mean([g["completeness"] for g in all_grades]) if all_grades else 0
	avg_clarity = mean([g["clarity"] for g in all_grades]) if all_grades else 0

	sorted_latencies = sorted(latencies)
	p50 = sorted_latencies[len(sorted_latencies) // 2] if sorted_latencies else 0
	p95_idx = int(len(sorted_latencies) * 0.95)
	p95 = sorted_latencies[min(p95_idx, len(sorted_latencies) - 1)] if sorted_latencies else 0

	summary = {
	"total_queries": total,
	"response_rate": f"{response_rate:.1f}%",
	"confidence_distribution": confidence_counts,
	"tool_routing_accuracy": f"{tool_accuracy:.1f}%",
	"answer_quality": {
	"relevance": round(avg_relevance, 2),
	"accuracy": round(avg_accuracy, 2),
	"completeness": round(avg_completeness, 2),
	"clarity": round(avg_clarity, 2),
	"overall": round(mean([avg_relevance, avg_accuracy, avg_completeness, avg_clarity]), 2),
	},
	"latency": {
	"median_seconds": round(p50, 2),
	"p95_seconds": round(p95, 2),
	},
	}

	print(json.dumps(summary, indent=2))

	# save results
	RESULTS_DIR.mkdir(parents=True, exist_ok=True)
	timestamp = time.strftime("%Y%m%d_%H%M%S")

	with open(RESULTS_DIR / f"eval_{timestamp}.json", "w") as f:
	json.dump({"summary": summary, "details": results}, f, indent=2)

	with open(RESULTS_DIR / f"summary_{timestamp}.json", "w") as f:
	json.dump(summary, f, indent=2)

	print(f"\nResults saved to evaluation/results/eval_{timestamp}.json")
	return summary


	if __name__ == "__main__":
	run_evaluation()