"""
Agent Evaluation Tests - Measures agent quality and performance.
Run with: pytest tests/test_evaluation.py -v -s
"""
import pytest
import time
import os
from pathlib import Path
from dataclasses import dataclass
from typing import List
# Skip if no API tokens available
pytestmark = pytest.mark.skipif(
    not (os.getenv("HF_TOKEN") or os.getenv("GOOGLE_API_KEY")),
    reason="Requires HF_TOKEN or GOOGLE_API_KEY"
)


@dataclass
class EvalCase:
    """Test case for evaluation."""
    name: str
    query: str
    expected_keywords: List[str]
    category: str

# Evaluation test cases - keywords aligned with actual agent responses
EVAL_CASES = [
    EvalCase(
        name="action_items_query",
        query="What are the open action items?",
        expected_keywords=["action", "item", "implement", "complete", "next"],
        category="action_items"
    ),
    EvalCase(
        name="blockers_query",
        query="What blockers do we have?",
        expected_keywords=["blocker", "block", "risk", "waiting", "issue"],
        category="blockers"
    ),
    EvalCase(
        name="project_summary",
        query="Give me a summary of the project",
        expected_keywords=["project", "meeting", "discuss", "team", "work"],
        category="general"
    ),
    EvalCase(
        name="next_steps_query",
        query="What should we do next?",
        expected_keywords=["next", "action", "should", "need", "implement"],
        category="action_items"
    ),
    EvalCase(
        name="issues_query",
        query="What issues or problems were discussed?",
        expected_keywords=["issue", "problem", "blocker", "challenge", "risk"],
        category="blockers"
    ),
]
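
# Note: each expected keyword is matched as a case-insensitive substring of the
# agent response (see EvaluationMetrics.add_result below); new cases can simply
# be appended to EVAL_CASES.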


class EvaluationMetrics:
    """Collect and compute evaluation metrics."""

    def __init__(self):
        self.results = []

    def add_result(self, case: EvalCase, response: str, latency: float):
        """Add a single evaluation result."""
        # Keyword match score: fraction of expected keywords found (case-insensitive)
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        keyword_score = keywords_found / len(case.expected_keywords) if case.expected_keywords else 1.0

        # Response validity: long enough and not an error/warning message
        is_valid = (
            len(response) > 50 and
            not response.startswith("❌") and
            not response.startswith("⚠️")
        )

        # Response length (penalize too short or too long)
        length_score = 1.0
        if len(response) < 100:
            length_score = 0.5
        elif len(response) > 2000:
            length_score = 0.8

        self.results.append({
            "name": case.name,
            "category": case.category,
            "keyword_score": keyword_score,
            "is_valid": is_valid,
            "length_score": length_score,
            "latency_ms": latency,
            "response_length": len(response)
        })

    def compute_summary(self) -> dict:
        """Compute summary metrics."""
        if not self.results:
            return {}

        total = len(self.results)
        # A case passes if at least 40% of its expected keywords matched, the
        # response is valid, and it is at least 100 characters long
        passed = sum(
            1 for r in self.results
            if r["keyword_score"] >= 0.4 and r["is_valid"] and r["response_length"] >= 100
        )
        avg_keyword_score = sum(r["keyword_score"] for r in self.results) / total
        avg_latency = sum(r["latency_ms"] for r in self.results) / total
        avg_length = sum(r["response_length"] for r in self.results) / total

        return {
            "total_cases": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": round(passed / total * 100, 1),
            "avg_keyword_score": round(avg_keyword_score * 100, 1),
            "avg_latency_ms": round(avg_latency, 0),
            "avg_response_length": round(avg_length, 0)
        }
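

# Illustrative usage of EvaluationMetrics outside pytest (a minimal sketch; the
# response text is invented, not real agent output):
#
#     m = EvaluationMetrics()
#     m.add_result(EVAL_CASES[0], "Open action items: implement auth and complete the docs next.", 1200.0)
#     print(m.compute_summary())  # -> dict with total_cases, pass_rate, avg_keyword_score, avg_latency_ms, ...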


@pytest.fixture(scope="module")
def agent():
    """Initialize agent for evaluation."""
    from src.rag import ProjectRAG
    from src.agent import ProjectAgent

    data_dir = Path("./data")
    rag = ProjectRAG(data_dir)
    rag.load_and_index()

    # Use Google if available (faster), otherwise HuggingFace
    if os.getenv("GOOGLE_API_KEY"):
        agent = ProjectAgent(rag, provider="google")
    else:
        agent = ProjectAgent(rag, provider="huggingface")
    return agent


@pytest.fixture(scope="module")
def metrics():
    """Shared metrics collector."""
    return EvaluationMetrics()
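
# The module-scoped `metrics` fixture above is shared by every parametrized case
# below, so test_evaluation_summary can aggregate all results at the end of the run.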


class TestAgentEvaluation:
    """Evaluation test suite."""

    @pytest.mark.parametrize("case", EVAL_CASES, ids=lambda c: c.name)
    def test_query(self, agent, metrics, case):
        """Test individual query case."""
        start = time.time()
        response = agent.query(case.query)
        latency = (time.time() - start) * 1000

        metrics.add_result(case, response, latency)

        # Basic assertions
        assert response is not None
        assert len(response) > 0

        # Count matched keywords for reporting; overall pass/fail is asserted
        # in test_evaluation_summary
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        print(f"\n Query: {case.query}")
        print(f" Keywords found: {keywords_found}/{len(case.expected_keywords)}")
        print(f" Latency: {latency:.0f}ms")
        print(f" Response length: {len(response)} chars")


def test_evaluation_summary(metrics):
    """Print evaluation summary after all tests."""
    summary = metrics.compute_summary()
    if summary:
        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total Cases: {summary['total_cases']}")
        print(f"Passed: {summary['passed']}")
        print(f"Failed: {summary['failed']}")
        print(f"Pass Rate: {summary['pass_rate']}%")
        print(f"Avg Keyword Score: {summary['avg_keyword_score']}%")
        print(f"Avg Latency: {summary['avg_latency_ms']}ms")
        print(f"Avg Response Len: {summary['avg_response_length']} chars")
        print("=" * 60)

        # Assert minimum quality (80% pass rate required)
        assert summary["pass_rate"] >= 80, f"Pass rate too low: {summary['pass_rate']}%"