""" Agent Evaluation Tests - Measures agent quality and performance. Run with: pytest tests/test_evaluation.py -v -s """ import pytest import time import os from pathlib import Path from dataclasses import dataclass from typing import List # Skip if no API tokens available pytestmark = pytest.mark.skipif( not (os.getenv("HF_TOKEN") or os.getenv("GOOGLE_API_KEY")), reason="Requires HF_TOKEN or GOOGLE_API_KEY" ) @dataclass class EvalCase: """Test case for evaluation.""" name: str query: str expected_keywords: List[str] category: str # Evaluation test cases - keywords aligned with actual agent responses EVAL_CASES = [ EvalCase( name="action_items_query", query="What are the open action items?", expected_keywords=["action", "item", "implement", "complete", "next"], category="action_items" ), EvalCase( name="blockers_query", query="What blockers do we have?", expected_keywords=["blocker", "block", "risk", "waiting", "issue"], category="blockers" ), EvalCase( name="project_summary", query="Give me a summary of the project", expected_keywords=["project", "meeting", "discuss", "team", "work"], category="general" ), EvalCase( name="next_steps_query", query="What should we do next?", expected_keywords=["next", "action", "should", "need", "implement"], category="action_items" ), EvalCase( name="issues_query", query="What issues or problems were discussed?", expected_keywords=["issue", "problem", "blocker", "challenge", "risk"], category="blockers" ), ] class EvaluationMetrics: """Collect and compute evaluation metrics.""" def __init__(self): self.results = [] def add_result(self, case: EvalCase, response: str, latency: float): """Add a single evaluation result.""" # Keyword match score keywords_found = sum( 1 for kw in case.expected_keywords if kw.lower() in response.lower() ) keyword_score = keywords_found / len(case.expected_keywords) if case.expected_keywords else 1.0 # Response validity is_valid = ( len(response) > 50 and not response.startswith("❌") and not response.startswith("⚠️") ) # Response length (penalize too short or too long) length_score = 1.0 if len(response) < 100: length_score = 0.5 elif len(response) > 2000: length_score = 0.8 self.results.append({ "name": case.name, "category": case.category, "keyword_score": keyword_score, "is_valid": is_valid, "length_score": length_score, "latency_ms": latency, "response_length": len(response) }) def compute_summary(self) -> dict: """Compute summary metrics.""" if not self.results: return {} total = len(self.results) passed = sum(1 for r in self.results if r["keyword_score"] >= 0.4 and r["is_valid"] and r["response_length"] >= 100) avg_keyword_score = sum(r["keyword_score"] for r in self.results) / total avg_latency = sum(r["latency_ms"] for r in self.results) / total avg_length = sum(r["response_length"] for r in self.results) / total return { "total_cases": total, "passed": passed, "failed": total - passed, "pass_rate": round(passed / total * 100, 1), "avg_keyword_score": round(avg_keyword_score * 100, 1), "avg_latency_ms": round(avg_latency, 0), "avg_response_length": round(avg_length, 0) } @pytest.fixture(scope="module") def agent(): """Initialize agent for evaluation.""" from src.rag import ProjectRAG from src.agent import ProjectAgent data_dir = Path("./data") rag = ProjectRAG(data_dir) rag.load_and_index() # Use Google if available (faster), otherwise HuggingFace if os.getenv("GOOGLE_API_KEY"): agent = ProjectAgent(rag, provider="google") else: agent = ProjectAgent(rag, provider="huggingface") return agent @pytest.fixture(scope="module") 
def metrics():
    """Shared metrics collector."""
    return EvaluationMetrics()


class TestAgentEvaluation:
    """Evaluation test suite."""

    @pytest.mark.parametrize("case", EVAL_CASES, ids=lambda c: c.name)
    def test_query(self, agent, metrics, case):
        """Test individual query case."""
        start = time.time()
        response = agent.query(case.query)
        latency = (time.time() - start) * 1000

        metrics.add_result(case, response, latency)

        # Basic assertions
        assert response is not None
        assert len(response) > 0

        # Check at least one keyword found
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        assert keywords_found >= 1, f"No expected keywords found for '{case.name}'"

        print(f"\n Query: {case.query}")
        print(f" Keywords found: {keywords_found}/{len(case.expected_keywords)}")
        print(f" Latency: {latency:.0f}ms")
        print(f" Response length: {len(response)} chars")


# Module-level on purpose: pytest runs tests in definition order, so this
# executes after TestAgentEvaluation and sees the module-scoped `metrics`
# fixture with all results accumulated.
def test_evaluation_summary(metrics):
    """Print evaluation summary after all tests."""
    summary = metrics.compute_summary()
    if summary:
        print("\n" + "="*60)
        print("EVALUATION SUMMARY")
        print("="*60)
        print(f"Total Cases: {summary['total_cases']}")
        print(f"Passed: {summary['passed']}")
        print(f"Failed: {summary['failed']}")
        print(f"Pass Rate: {summary['pass_rate']}%")
        print(f"Avg Keyword Score: {summary['avg_keyword_score']}%")
        print(f"Avg Latency: {summary['avg_latency_ms']}ms")
        print(f"Avg Response Len: {summary['avg_response_length']} chars")
        print("="*60)

        # Assert minimum quality (80% pass rate required)
        assert summary["pass_rate"] >= 80, f"Pass rate too low: {summary['pass_rate']}%"
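
# Convenience entry point: running this file directly is equivalent to the
# command in the module docstring (pytest tests/test_evaluation.py -v -s).
if __name__ == "__main__":
    import sys
    sys.exit(pytest.main([__file__, "-v", "-s"]))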