""" |
|
|
Agent Evaluation Tests - Measures agent quality and performance. |
|
|
|
|
|
Run with: pytest tests/test_evaluation.py -v -s |
|
|
""" |
|
|
import pytest |
|
|
import time |
|
|
import os |
|
|
from pathlib import Path |
|
|
from dataclasses import dataclass |
|
|
from typing import List |
|
|
|
|
|
|
|
|
pytestmark = pytest.mark.skipif( |
|
|
not (os.getenv("HF_TOKEN") or os.getenv("GOOGLE_API_KEY")), |
|
|
reason="Requires HF_TOKEN or GOOGLE_API_KEY" |
|
|
) |
|
|
@dataclass
class EvalCase:
    """Test case for evaluation."""
    name: str
    query: str
    expected_keywords: List[str]
    category: str
|
|
EVAL_CASES = [
    EvalCase(
        name="action_items_query",
        query="What are the open action items?",
        expected_keywords=["action", "item", "implement", "complete", "next"],
        category="action_items",
    ),
    EvalCase(
        name="blockers_query",
        query="What blockers do we have?",
        expected_keywords=["blocker", "block", "risk", "waiting", "issue"],
        category="blockers",
    ),
    EvalCase(
        name="project_summary",
        query="Give me a summary of the project",
        expected_keywords=["project", "meeting", "discuss", "team", "work"],
        category="general",
    ),
    EvalCase(
        name="next_steps_query",
        query="What should we do next?",
        expected_keywords=["next", "action", "should", "need", "implement"],
        category="action_items",
    ),
    EvalCase(
        name="issues_query",
        query="What issues or problems were discussed?",
        expected_keywords=["issue", "problem", "blocker", "challenge", "risk"],
        category="blockers",
    ),
]
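# Adding coverage is just a matter of appending another EvalCase. The case
# below is illustrative only (its keywords assume decision-style content
# exists in the indexed transcripts) and is deliberately left commented out:
#
#     EvalCase(
#         name="decisions_query",
#         query="What decisions were made?",
#         expected_keywords=["decide", "decision", "agree", "approve"],
#         category="general",
#     ),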
|
|
class EvaluationMetrics:
    """Collect and compute evaluation metrics."""

    def __init__(self):
        self.results = []

    def add_result(self, case: EvalCase, response: str, latency: float):
        """Score a single response and record the result."""
        # Keyword recall: case-insensitive substring match, so "item"
        # also counts "items" or "action items".
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        keyword_score = (
            keywords_found / len(case.expected_keywords)
            if case.expected_keywords else 1.0
        )

        # A response is valid if it has some substance and is not an
        # error banner from the agent.
        is_valid = (
            len(response) > 50
            and not response.startswith("❌")
            and not response.startswith("⚠️")
        )

        # Penalize answers that are too terse or too rambling.
        length_score = 1.0
        if len(response) < 100:
            length_score = 0.5
        elif len(response) > 2000:
            length_score = 0.8

        self.results.append({
            "name": case.name,
            "category": case.category,
            "keyword_score": keyword_score,
            "is_valid": is_valid,
            "length_score": length_score,
            "latency_ms": latency,
            "response_length": len(response),
        })

    def compute_summary(self) -> dict:
        """Compute aggregate metrics across all recorded results."""
        if not self.results:
            return {}

        total = len(self.results)
        # A case passes if it is valid, at least 100 characters long, and
        # matched at least 40% of its expected keywords.
        passed = sum(
            1 for r in self.results
            if r["keyword_score"] >= 0.4
            and r["is_valid"]
            and r["response_length"] >= 100
        )

        avg_keyword_score = sum(r["keyword_score"] for r in self.results) / total
        avg_latency = sum(r["latency_ms"] for r in self.results) / total
        avg_length = sum(r["response_length"] for r in self.results) / total

        return {
            "total_cases": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": round(passed / total * 100, 1),
            "avg_keyword_score": round(avg_keyword_score * 100, 1),
            "avg_latency_ms": round(avg_latency, 0),
            "avg_response_length": round(avg_length, 0),
        }
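# Illustrative usage (not executed): the collector also works outside
# pytest, e.g. in a notebook. The case and response below are made up.
#
#     m = EvaluationMetrics()
#     demo = EvalCase("demo", "What blockers do we have?",
#                     ["blocker", "risk"], "blockers")
#     m.add_result(demo, "The main blocker is X and the key risk is Y. " * 4,
#                  latency=250.0)
#     m.compute_summary()
#     # -> {"total_cases": 1, "passed": 1, "failed": 0, "pass_rate": 100.0, ...}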
|
|
@pytest.fixture(scope="module")
def agent():
    """Initialize the agent once for the whole module."""
    from src.agent import ProjectAgent
    from src.rag import ProjectRAG

    data_dir = Path("./data")
    rag = ProjectRAG(data_dir)
    rag.load_and_index()

    # Prefer Google if configured; otherwise fall back to Hugging Face
    # (pytestmark guarantees at least one of the two tokens is set).
    provider = "google" if os.getenv("GOOGLE_API_KEY") else "huggingface"
    return ProjectAgent(rag, provider=provider)
|
|
@pytest.fixture(scope="module")
def metrics():
    """Shared metrics collector."""
    return EvaluationMetrics()
|
|
class TestAgentEvaluation:
    """Evaluation test suite."""

    @pytest.mark.parametrize("case", EVAL_CASES, ids=lambda c: c.name)
    def test_query(self, agent, metrics, case):
        """Run a single query case and record its metrics."""
        start = time.time()
        response = agent.query(case.query)
        latency = (time.time() - start) * 1000  # ms

        metrics.add_result(case, response, latency)

        # Hard assertions are deliberately minimal here; quality gating
        # happens in test_evaluation_summary.
        assert response is not None
        assert len(response) > 0

        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )

        print(f"\n  Query: {case.query}")
        print(f"  Keywords found: {keywords_found}/{len(case.expected_keywords)}")
        print(f"  Latency: {latency:.0f}ms")
        print(f"  Response length: {len(response)} chars")
|
|
def test_evaluation_summary(metrics):
    """Print the evaluation summary after all query cases have run.

    Relies on pytest's default definition-order collection, so this runs
    after TestAgentEvaluation has populated the shared metrics fixture.
    """
    summary = metrics.compute_summary()

    if not summary:
        # No results were recorded (e.g. every query case errored out
        # before calling add_result), so there is nothing to gate on.
        pytest.skip("No evaluation results collected")

    print("\n" + "=" * 60)
    print("EVALUATION SUMMARY")
    print("=" * 60)
    print(f"Total Cases: {summary['total_cases']}")
    print(f"Passed: {summary['passed']}")
    print(f"Failed: {summary['failed']}")
    print(f"Pass Rate: {summary['pass_rate']}%")
    print(f"Avg Keyword Score: {summary['avg_keyword_score']}%")
    print(f"Avg Latency: {summary['avg_latency_ms']}ms")
    print(f"Avg Response Len: {summary['avg_response_length']} chars")
    print("=" * 60)

    assert summary["pass_rate"] >= 80, f"Pass rate too low: {summary['pass_rate']}%"
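# Possible extension (sketch, not wired in): persist the summary so pass
# rate and latency can be tracked across runs. The output path is arbitrary.
#
#     import json
#     Path("eval_summary.json").write_text(json.dumps(summary, indent=2))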