"""Tests for evaluation metrics, failure modes, and report generation."""
from eval.metrics import (
    schema_pass_rate,
    evidence_coverage_rate,
    review_required_rate,
    unsupported_recommendation_rate,
    root_cause_consistency,
    review_routing_precision_recall,
    compute_all_metrics,
)
from eval.failure_modes import (
    detect_hallucination,
    detect_omission,
    detect_ambiguity,
    detect_overconfidence,
    detect_language_drift,
    tag_failure_modes,
    summarize_failure_modes,
)
from eval.run_eval import generate_report
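
# These tests encode the API shapes assumed throughout (inferred from usage below):
#   * metric functions take a list of extraction dicts and return a float in [0, 1];
#   * detect_* helpers take (extraction, case) and return a (detected, detail) tuple;
#   * tag_failure_modes returns tag objects exposing a .mode attribute;
#   * generate_report takes a results dict and returns a Markdown string.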


# --- Helpers ---
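# Each helper returns a schema-valid baseline dict; keyword overrides let each
# test mutate only the fields it is probing.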

def _valid_extraction(**overrides) -> dict:
    base = {
        "case_id": "test-001",
        "root_cause_l1": "billing",
        "root_cause_l2": "overcharge",
        "sentiment_score": -0.5,
        "risk_level": "medium",
        "review_required": False,
        "next_best_actions": ["Issue refund"],
        "evidence_quotes": ["I was charged twice"],
        "confidence": 0.85,
        "churn_risk": 0.3,
        "sentiment_rationale": "Frustrated",
        "draft_notes": "Check billing.",
    }
    base.update(overrides)
    return base


def _valid_case(**overrides) -> dict:
    base = {
        "case_id": "test-001",
        "ticket_text": "I was charged twice for the same service last month.",
        "conversation_snippet": "",
        "email_thread": [],
        "vip_tier": "standard",
        "priority": "medium",
        "source_dataset": "tickets",
        "language": "en",
    }
    base.update(overrides)
    return base


# --- Metrics tests ---

def test_schema_pass_rate_all_pass():
    exts = [_valid_extraction() for _ in range(5)]
    assert schema_pass_rate(exts) == 1.0


def test_schema_pass_rate_some_fail():
    exts = [_valid_extraction(), {"root_cause_l1": ""}]  # second fails validation: empty string violates minLength, other fields missing
    rate = schema_pass_rate(exts)
    assert rate == 0.5


def test_schema_pass_rate_empty():
    assert schema_pass_rate([]) == 0.0


def test_evidence_coverage_all_covered():
    exts = [_valid_extraction() for _ in range(3)]
    assert evidence_coverage_rate(exts) == 1.0


def test_evidence_coverage_some_missing():
    exts = [_valid_extraction(), _valid_extraction(evidence_quotes=[])]
    assert evidence_coverage_rate(exts) == 0.5


def test_review_required_rate():
    exts = [
        _valid_extraction(confidence=0.9, risk_level="low", churn_risk=0.1),
        _valid_extraction(confidence=0.3, risk_level="high", churn_risk=0.8),
    ]
    rate = review_required_rate(exts)
    assert rate == 0.5  # second triggers review


def test_unsupported_recommendation_rate():
    exts = [
        _valid_extraction(),  # has evidence
        _valid_extraction(evidence_quotes=[], next_best_actions=["Do X"]),  # unsupported
    ]
    rate = unsupported_recommendation_rate(exts)
    assert rate == 0.5


def test_root_cause_consistency_perfect():
    exts = [
        _valid_extraction(case_id="a", root_cause_l1="billing"),
        _valid_extraction(case_id="b", root_cause_l1="billing"),
    ]
    cases = [
        _valid_case(case_id="a", source_dataset="tickets"),
        _valid_case(case_id="b", source_dataset="tickets"),
    ]
    assert root_cause_consistency(exts, cases) == 1.0


def test_root_cause_consistency_mixed():
    exts = [
        _valid_extraction(case_id="a", root_cause_l1="billing"),
        _valid_extraction(case_id="b", root_cause_l1="network"),
    ]
    cases = [
        _valid_case(case_id="a", source_dataset="tickets"),
        _valid_case(case_id="b", source_dataset="tickets"),
    ]
    assert root_cause_consistency(exts, cases) == 0.5


def test_review_routing_precision_recall():
    pred = [True, True, False, False]
    gold = [True, False, False, True]
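    # Per index: 0 = TP, 1 = FP, 2 = TN, 3 = FN.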
    result = review_routing_precision_recall(pred, gold)
    assert result["precision"] == 0.5  # 1 TP / (1 TP + 1 FP)
    assert result["recall"] == 0.5     # 1 TP / (1 TP + 1 FN)


def test_compute_all_metrics_returns_all_keys():
    exts = [_valid_extraction()]
    result = compute_all_metrics(exts)
    assert "schema_pass_rate" in result
    assert "evidence_coverage_rate" in result
    assert "review_required_rate" in result
    assert "unsupported_recommendation_rate" in result
    assert "root_cause_consistency" in result


# --- Failure mode tests ---
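# Each detector is exercised with a firing case and a non-firing case.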

def test_detect_hallucination_no_evidence():
    ext = _valid_extraction(evidence_quotes=[])
    case = _valid_case()
    detected, detail = detect_hallucination(ext, case)
    assert detected is True


def test_detect_hallucination_fabricated_quote():
    ext = _valid_extraction(evidence_quotes=["This quote does not exist in the text at all whatsoever"])
    case = _valid_case(ticket_text="My bill is wrong.")
    detected, detail = detect_hallucination(ext, case)
    assert detected is True


def test_detect_hallucination_valid_quote():
    ext = _valid_extraction(evidence_quotes=["charged twice"])
    case = _valid_case(ticket_text="I was charged twice for the same service.")
    detected, detail = detect_hallucination(ext, case)
    assert detected is False


def test_detect_omission_urgent_signal():
    ext = _valid_extraction(risk_level="low")
    case = _valid_case(ticket_text="I will take legal action if this is not resolved.")
    detected, detail = detect_omission(ext, case)
    assert detected is True


def test_detect_omission_no_signal():
    ext = _valid_extraction(risk_level="medium", root_cause_l1="billing")
    case = _valid_case(ticket_text="I was charged twice for the same service.")
    detected, detail = detect_omission(ext, case)
    assert detected is False


def test_detect_ambiguity_short_ticket():
    ext = _valid_extraction(confidence=0.95, review_required=False)
    case = _valid_case(ticket_text="Help please")
    detected, detail = detect_ambiguity(ext, case)
    assert detected is True


def test_detect_ambiguity_normal_ticket():
    ext = _valid_extraction(confidence=0.85, review_required=False)
    case = _valid_case(ticket_text="I was charged twice for the same service and want a refund.")
    detected, detail = detect_ambiguity(ext, case)
    assert detected is False


def test_detect_overconfidence_wrong_label():
    ext = _valid_extraction(confidence=0.95, root_cause_l1="network")
    case = _valid_case(gold_root_cause="billing")
    detected, detail = detect_overconfidence(ext, case)
    assert detected is True


def test_detect_overconfidence_correct_label():
    ext = _valid_extraction(confidence=0.95, root_cause_l1="billing")
    case = _valid_case(gold_root_cause="billing")
    detected, detail = detect_overconfidence(ext, case)
    assert detected is False


def test_detect_language_drift():
    ext = _valid_extraction(confidence=0.3)
    case = _valid_case(language="mixed")
    detected, detail = detect_language_drift(ext, case)
    assert detected is True


def test_detect_language_drift_english():
    ext = _valid_extraction(confidence=0.9)
    case = _valid_case(language="en")
    detected, detail = detect_language_drift(ext, case)
    assert detected is False


def test_tag_failure_modes_returns_list():
    ext = _valid_extraction(evidence_quotes=[])
    case = _valid_case()
    tags = tag_failure_modes(ext, case)
    assert isinstance(tags, list)
    assert any(t.mode == "hallucination" for t in tags)


def test_summarize_failure_modes():
    ext = _valid_extraction(evidence_quotes=[])
    case = _valid_case()
    tags = tag_failure_modes(ext, case)
    summary = summarize_failure_modes(tags)
    assert "total_failures" in summary
    assert "by_mode" in summary
    assert "affected_cases" in summary


# --- Report tests ---

def test_generate_report_not_empty():
    results = {
        "total_cases": 10,
        "metrics": compute_all_metrics([_valid_extraction()]),
        "failure_modes": {"total_failures": 0, "by_mode": {}, "affected_cases": 0},
        "gate_distribution": {"auto": 8, "review": 2},
        "review_reason_codes": {"low_confidence": 2},
    }
    report = generate_report(results)
    assert "# Evaluation Report" in report
    assert "schema_pass_rate" in report
    assert "PASS" in report or "FAIL" in report


def test_generate_report_empty_results():
    report = generate_report({})
    assert "No results" in report