"""
Agent Evaluation Tests - Measures agent quality and performance.

Run with: pytest tests/test_evaluation.py -v -s
"""
import pytest
import time
import os
from pathlib import Path
from dataclasses import dataclass
from typing import List

# Skip if no API tokens available
pytestmark = pytest.mark.skipif(
    not (os.getenv("HF_TOKEN") or os.getenv("GOOGLE_API_KEY")),
    reason="Requires HF_TOKEN or GOOGLE_API_KEY"
)
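# Example (illustrative): supply one of the tokens when invoking pytest, e.g.
#   GOOGLE_API_KEY=<your-key> pytest tests/test_evaluation.py -v -s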


@dataclass
class EvalCase:
    """Test case for evaluation."""
    name: str
    query: str
    expected_keywords: List[str]
    category: str


# Evaluation test cases - keywords aligned with actual agent responses
EVAL_CASES = [
    EvalCase(
        name="action_items_query",
        query="What are the open action items?",
        expected_keywords=["action", "item", "implement", "complete", "next"],
        category="action_items"
    ),
    EvalCase(
        name="blockers_query",
        query="What blockers do we have?",
        expected_keywords=["blocker", "block", "risk", "waiting", "issue"],
        category="blockers"
    ),
    EvalCase(
        name="project_summary",
        query="Give me a summary of the project",
        expected_keywords=["project", "meeting", "discuss", "team", "work"],
        category="general"
    ),
    EvalCase(
        name="next_steps_query",
        query="What should we do next?",
        expected_keywords=["next", "action", "should", "need", "implement"],
        category="action_items"
    ),
    EvalCase(
        name="issues_query",
        query="What issues or problems were discussed?",
        expected_keywords=["issue", "problem", "blocker", "challenge", "risk"],
        category="blockers"
    ),
]
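# Note: keyword matching (see EvaluationMetrics.add_result) is case-insensitive and
# substring-based, so "item" also matches "items". New cases can be appended here.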


class EvaluationMetrics:
    """Collect and compute evaluation metrics."""

    def __init__(self):
        self.results = []

    def add_result(self, case: EvalCase, response: str, latency: float):
        """Add a single evaluation result."""
        # Keyword match score
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )
        keyword_score = keywords_found / len(case.expected_keywords) if case.expected_keywords else 1.0

        # Response validity
        is_valid = (
            len(response) > 50 and
            not response.startswith("❌") and
            not response.startswith("⚠️")
        )

        # Response length (penalize too short or too long)
        length_score = 1.0
        if len(response) < 100:
            length_score = 0.5
        elif len(response) > 2000:
            length_score = 0.8

        self.results.append({
            "name": case.name,
            "category": case.category,
            "keyword_score": keyword_score,
            "is_valid": is_valid,
            "length_score": length_score,
            "latency_ms": latency,
            "response_length": len(response)
        })

    def compute_summary(self) -> dict:
        """Compute summary metrics."""
        if not self.results:
            return {}

        total = len(self.results)
        passed = sum(
            1 for r in self.results
            if r["keyword_score"] >= 0.4 and r["is_valid"] and r["response_length"] >= 100
        )

        avg_keyword_score = sum(r["keyword_score"] for r in self.results) / total
        avg_latency = sum(r["latency_ms"] for r in self.results) / total
        avg_length = sum(r["response_length"] for r in self.results) / total

        return {
            "total_cases": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": round(passed / total * 100, 1),
            "avg_keyword_score": round(avg_keyword_score * 100, 1),
            "avg_latency_ms": round(avg_latency, 0),
            "avg_response_length": round(avg_length, 0)
        }
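
# Illustrative usage sketch (not collected by pytest; the response string is a placeholder):
#
#   m = EvaluationMetrics()
#   m.add_result(EVAL_CASES[0], "<agent response text>", 850.0)
#   m.compute_summary()  # -> dict with total_cases, passed, failed, pass_rate,
#                        #    avg_keyword_score, avg_latency_ms, avg_response_length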


@pytest.fixture(scope="module")
def agent():
    """Initialize agent for evaluation."""
    from src.rag import ProjectRAG
    from src.agent import ProjectAgent

    data_dir = Path("./data")
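    # Assumes ./data exists and holds the documents that ProjectRAG indexes.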
    rag = ProjectRAG(data_dir)
    rag.load_and_index()

    # Use Google if available (faster), otherwise HuggingFace
    if os.getenv("GOOGLE_API_KEY"):
        agent = ProjectAgent(rag, provider="google")
    else:
        agent = ProjectAgent(rag, provider="huggingface")

    return agent


@pytest.fixture(scope="module")
def metrics():
    """Shared metrics collector."""
    return EvaluationMetrics()


class TestAgentEvaluation:
    """Evaluation test suite."""

    @pytest.mark.parametrize("case", EVAL_CASES, ids=lambda c: c.name)
    def test_query(self, agent, metrics, case):
        """Test individual query case."""
        start = time.time()
        response = agent.query(case.query)
        latency = (time.time() - start) * 1000

        metrics.add_result(case, response, latency)

        # Basic assertions
        assert response is not None
        assert len(response) > 0

        # Count matched keywords (reported below; aggregated in the metrics fixture)
        keywords_found = sum(
            1 for kw in case.expected_keywords
            if kw.lower() in response.lower()
        )

        print(f"\n  Query: {case.query}")
        print(f"  Keywords found: {keywords_found}/{len(case.expected_keywords)}")
        print(f"  Latency: {latency:.0f}ms")
        print(f"  Response length: {len(response)} chars")


def test_evaluation_summary(metrics):
    """Print evaluation summary after all tests."""
    summary = metrics.compute_summary()

    if summary:
        print("\n" + "="*60)
        print("EVALUATION SUMMARY")
        print("="*60)
        print(f"Total Cases:        {summary['total_cases']}")
        print(f"Passed:             {summary['passed']}")
        print(f"Failed:             {summary['failed']}")
        print(f"Pass Rate:          {summary['pass_rate']}%")
        print(f"Avg Keyword Score:  {summary['avg_keyword_score']}%")
        print(f"Avg Latency:        {summary['avg_latency_ms']}ms")
        print(f"Avg Response Len:   {summary['avg_response_length']} chars")
        print("="*60)

        # Assert minimum quality (80% pass rate required)
        assert summary["pass_rate"] >= 80, f"Pass rate too low: {summary['pass_rate']}%"