| """ | |
| Agent Evaluation Tests - Measures agent quality and performance. | |
| Run with: pytest tests/test_evaluation.py -v -s | |
| """ | |
| import pytest | |
| import time | |
| import os | |
| from pathlib import Path | |
| from dataclasses import dataclass | |
| from typing import List | |
| # Skip if no API tokens available | |
| pytestmark = pytest.mark.skipif( | |
| not (os.getenv("HF_TOKEN") or os.getenv("GOOGLE_API_KEY")), | |
| reason="Requires HF_TOKEN or GOOGLE_API_KEY" | |
| ) | |
@dataclass
class EvalCase:
    """Test case for evaluation."""
    name: str
    query: str
    expected_keywords: List[str]
    category: str

# Evaluation test cases - keywords aligned with actual agent responses
EVAL_CASES = [
    EvalCase(
        name="action_items_query",
        query="What are the open action items?",
        expected_keywords=["action", "item", "implement", "complete", "next"],
        category="action_items"
    ),
    EvalCase(
        name="blockers_query",
        query="What blockers do we have?",
        expected_keywords=["blocker", "block", "risk", "waiting", "issue"],
        category="blockers"
    ),
    EvalCase(
        name="project_summary",
        query="Give me a summary of the project",
        expected_keywords=["project", "meeting", "discuss", "team", "work"],
        category="general"
    ),
    EvalCase(
        name="next_steps_query",
        query="What should we do next?",
        expected_keywords=["next", "action", "should", "need", "implement"],
        category="action_items"
    ),
    EvalCase(
        name="issues_query",
        query="What issues or problems were discussed?",
        expected_keywords=["issue", "problem", "blocker", "challenge", "risk"],
        category="blockers"
    ),
]

class EvaluationMetrics:
    """Collect and compute evaluation metrics."""

    def __init__(self):
        self.results = []

    def add_result(self, case: EvalCase, response: str, latency: float):
        """Add a single evaluation result."""
        # Keyword match score
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        keyword_score = keywords_found / len(case.expected_keywords) if case.expected_keywords else 1.0

        # Response validity
        is_valid = (
            len(response) > 50 and
            not response.startswith("❌") and
            not response.startswith("⚠️")
        )

        # Response length (penalize too short or too long)
        length_score = 1.0
        if len(response) < 100:
            length_score = 0.5
        elif len(response) > 2000:
            length_score = 0.8

        self.results.append({
            "name": case.name,
            "category": case.category,
            "keyword_score": keyword_score,
            "is_valid": is_valid,
            "length_score": length_score,
            "latency_ms": latency,
            "response_length": len(response)
        })

    def compute_summary(self) -> dict:
        """Compute summary metrics."""
        if not self.results:
            return {}

        total = len(self.results)
        passed = sum(
            1 for r in self.results
            if r["keyword_score"] >= 0.4 and r["is_valid"] and r["response_length"] >= 100
        )
        avg_keyword_score = sum(r["keyword_score"] for r in self.results) / total
        avg_latency = sum(r["latency_ms"] for r in self.results) / total
        avg_length = sum(r["response_length"] for r in self.results) / total

        return {
            "total_cases": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": round(passed / total * 100, 1),
            "avg_keyword_score": round(avg_keyword_score * 100, 1),
            "avg_latency_ms": round(avg_latency, 0),
            "avg_response_length": round(avg_length, 0)
        }

# Module-scoped so the index is built once and the same agent is reused by every test case
@pytest.fixture(scope="module")
def agent():
    """Initialize agent for evaluation."""
    from src.rag import ProjectRAG
    from src.agent import ProjectAgent

    data_dir = Path("./data")
    rag = ProjectRAG(data_dir)
    rag.load_and_index()

    # Use Google if available (faster), otherwise HuggingFace
    if os.getenv("GOOGLE_API_KEY"):
        agent = ProjectAgent(rag, provider="google")
    else:
        agent = ProjectAgent(rag, provider="huggingface")
    return agent

@pytest.fixture(scope="module")
def metrics():
    """Shared metrics collector (module-scoped so all cases report into one summary)."""
    return EvaluationMetrics()

class TestAgentEvaluation:
    """Evaluation test suite."""

    @pytest.mark.parametrize("case", EVAL_CASES, ids=lambda c: c.name)
    def test_query(self, agent, metrics, case):
        """Test individual query case."""
        start = time.time()
        response = agent.query(case.query)
        latency = (time.time() - start) * 1000

        metrics.add_result(case, response, latency)

        # Basic assertions
        assert response is not None
        assert len(response) > 0

        # Check at least one keyword found
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        assert keywords_found >= 1, f"No expected keywords found in response for {case.name}"

        print(f"\n Query: {case.query}")
        print(f" Keywords found: {keywords_found}/{len(case.expected_keywords)}")
        print(f" Latency: {latency:.0f}ms")
        print(f" Response length: {len(response)} chars")

def test_evaluation_summary(metrics):
    """Print evaluation summary after all tests."""
    summary = metrics.compute_summary()

    if summary:
        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total Cases: {summary['total_cases']}")
        print(f"Passed: {summary['passed']}")
        print(f"Failed: {summary['failed']}")
        print(f"Pass Rate: {summary['pass_rate']}%")
        print(f"Avg Keyword Score: {summary['avg_keyword_score']}%")
        print(f"Avg Latency: {summary['avg_latency_ms']}ms")
        print(f"Avg Response Len: {summary['avg_response_length']} chars")
        print("=" * 60)

        # Assert minimum quality (80% pass rate required)
        assert summary["pass_rate"] >= 80, f"Pass rate too low: {summary['pass_rate']}%"