# Spaces:
# Running
# Running
# File size: 8,238 Bytes
# c4fe0a4
"""Tests for evaluation metrics, failure modes, and report generation."""
from eval.metrics import (
schema_pass_rate,
evidence_coverage_rate,
review_required_rate,
unsupported_recommendation_rate,
root_cause_consistency,
review_routing_precision_recall,
compute_all_metrics,
)
from eval.failure_modes import (
detect_hallucination,
detect_omission,
detect_ambiguity,
detect_overconfidence,
detect_language_drift,
tag_failure_modes,
summarize_failure_modes,
)
from eval.run_eval import generate_report
# --- Helpers ---
def _valid_extraction(**overrides) -> dict:
base = {
"case_id": "test-001",
"root_cause_l1": "billing",
"root_cause_l2": "overcharge",
"sentiment_score": -0.5,
"risk_level": "medium",
"review_required": False,
"next_best_actions": ["Issue refund"],
"evidence_quotes": ["I was charged twice"],
"confidence": 0.85,
"churn_risk": 0.3,
"sentiment_rationale": "Frustrated",
"draft_notes": "Check billing.",
}
base.update(overrides)
return base
def _valid_case(**overrides) -> dict:
base = {
"case_id": "test-001",
"ticket_text": "I was charged twice for the same service last month.",
"conversation_snippet": "",
"email_thread": [],
"vip_tier": "standard",
"priority": "medium",
"source_dataset": "tickets",
"language": "en",
}
base.update(overrides)
return base
# --- Metrics tests ---
def test_schema_pass_rate_all_pass():
    """Five default extractions are expected to yield a pass rate of 1.0."""
    assert schema_pass_rate([_valid_extraction() for _ in range(5)]) == 1.0
def test_schema_pass_rate_some_fail():
    """One valid and one invalid extraction are expected to give 0.5."""
    good = _valid_extraction()
    # Per the original author's note, this entry fails the minLength check.
    bad = {"root_cause_l1": ""}
    assert schema_pass_rate([good, bad]) == 0.5
def test_schema_pass_rate_empty():
    """An empty extraction list is expected to give a pass rate of 0.0."""
    no_extractions = []
    assert schema_pass_rate(no_extractions) == 0.0
def test_evidence_coverage_all_covered():
    """Three default extractions (each with a quote) should give 1.0."""
    assert evidence_coverage_rate([_valid_extraction() for _ in range(3)]) == 1.0
def test_evidence_coverage_some_missing():
    """One extraction with quotes and one without should give 0.5."""
    with_quotes = _valid_extraction()
    without_quotes = _valid_extraction(evidence_quotes=[])
    coverage = evidence_coverage_rate([with_quotes, without_quotes])
    assert coverage == 0.5
def test_review_required_rate():
    """A confident low-risk case plus a shaky high-risk case should give 0.5."""
    calm = _valid_extraction(confidence=0.9, risk_level="low", churn_risk=0.1)
    risky = _valid_extraction(confidence=0.3, risk_level="high", churn_risk=0.8)
    # Only the second extraction is expected to trigger review.
    assert review_required_rate([calm, risky]) == 0.5
def test_unsupported_recommendation_rate():
    """An action with no evidence quotes counts as unsupported (rate 0.5)."""
    supported = _valid_extraction()  # default carries an evidence quote
    unsupported = _valid_extraction(evidence_quotes=[], next_best_actions=["Do X"])
    assert unsupported_recommendation_rate([supported, unsupported]) == 0.5
def test_root_cause_consistency_perfect():
    """Two same-dataset cases with the same root cause should score 1.0."""
    case_ids = ("a", "b")
    extractions = [
        _valid_extraction(case_id=cid, root_cause_l1="billing") for cid in case_ids
    ]
    inputs = [_valid_case(case_id=cid, source_dataset="tickets") for cid in case_ids]
    assert root_cause_consistency(extractions, inputs) == 1.0
def test_root_cause_consistency_mixed():
    """Two same-dataset cases with different root causes should score 0.5."""
    labelled = (("a", "billing"), ("b", "network"))
    extractions = [
        _valid_extraction(case_id=cid, root_cause_l1=label) for cid, label in labelled
    ]
    inputs = [
        _valid_case(case_id=cid, source_dataset="tickets") for cid, _label in labelled
    ]
    assert root_cause_consistency(extractions, inputs) == 0.5
def test_review_routing_precision_recall():
    """One TP, one FP and one FN should give precision and recall of 0.5."""
    predicted = [True, True, False, False]
    expected = [True, False, False, True]
    scores = review_routing_precision_recall(predicted, expected)
    # precision: 1 TP / (1 TP + 1 FP)
    assert scores["precision"] == 0.5
    # recall: 1 TP / (1 TP + 1 FN)
    assert scores["recall"] == 0.5
def test_compute_all_metrics_returns_all_keys():
    """The combined metrics result must expose every individual metric key."""
    metrics = compute_all_metrics([_valid_extraction()])
    expected_keys = (
        "schema_pass_rate",
        "evidence_coverage_rate",
        "review_required_rate",
        "unsupported_recommendation_rate",
        "root_cause_consistency",
    )
    for key in expected_keys:
        assert key in metrics
# --- Failure mode tests ---
def test_detect_hallucination_no_evidence():
    """An extraction with no evidence quotes should be flagged."""
    flagged, _detail = detect_hallucination(
        _valid_extraction(evidence_quotes=[]),
        _valid_case(),
    )
    assert flagged is True
def test_detect_hallucination_fabricated_quote():
    """A quote that does not appear in the ticket text should be flagged."""
    fabricated = "This quote does not exist in the text at all whatsoever"
    extraction = _valid_extraction(evidence_quotes=[fabricated])
    ticket = _valid_case(ticket_text="My bill is wrong.")
    flagged, _detail = detect_hallucination(extraction, ticket)
    assert flagged is True
def test_detect_hallucination_valid_quote():
    """A quote found verbatim in the ticket text should not be flagged."""
    extraction = _valid_extraction(evidence_quotes=["charged twice"])
    ticket = _valid_case(ticket_text="I was charged twice for the same service.")
    flagged, _detail = detect_hallucination(extraction, ticket)
    assert flagged is False
def test_detect_omission_urgent_signal():
    """A low risk level on a ticket threatening legal action is an omission."""
    extraction = _valid_extraction(risk_level="low")
    ticket = _valid_case(
        ticket_text="I will take legal action if this is not resolved."
    )
    flagged, _detail = detect_omission(extraction, ticket)
    assert flagged is True
def test_detect_omission_no_signal():
    """A plain billing ticket with medium risk should not be an omission."""
    extraction = _valid_extraction(risk_level="medium", root_cause_l1="billing")
    ticket = _valid_case(ticket_text="I was charged twice for the same service.")
    flagged, _detail = detect_omission(extraction, ticket)
    assert flagged is False
def test_detect_ambiguity_short_ticket():
    """High confidence without review on a two-word ticket is flagged."""
    extraction = _valid_extraction(confidence=0.95, review_required=False)
    ticket = _valid_case(ticket_text="Help please")
    flagged, _detail = detect_ambiguity(extraction, ticket)
    assert flagged is True
def test_detect_ambiguity_normal_ticket():
    """A full-sentence ticket at 0.85 confidence should not be flagged."""
    extraction = _valid_extraction(confidence=0.85, review_required=False)
    ticket = _valid_case(
        ticket_text="I was charged twice for the same service and want a refund."
    )
    flagged, _detail = detect_ambiguity(extraction, ticket)
    assert flagged is False
def test_detect_overconfidence_wrong_label():
    """High confidence on a root cause that differs from gold is flagged."""
    extraction = _valid_extraction(confidence=0.95, root_cause_l1="network")
    ticket = _valid_case(gold_root_cause="billing")
    flagged, _detail = detect_overconfidence(extraction, ticket)
    assert flagged is True
def test_detect_overconfidence_correct_label():
    """High confidence on a root cause matching gold is not flagged."""
    extraction = _valid_extraction(confidence=0.95, root_cause_l1="billing")
    ticket = _valid_case(gold_root_cause="billing")
    flagged, _detail = detect_overconfidence(extraction, ticket)
    assert flagged is False
def test_detect_language_drift():
    """Low confidence on a case whose language is "mixed" is flagged."""
    extraction = _valid_extraction(confidence=0.3)
    ticket = _valid_case(language="mixed")
    flagged, _detail = detect_language_drift(extraction, ticket)
    assert flagged is True
def test_detect_language_drift_english():
    """High confidence on an English case is not flagged."""
    extraction = _valid_extraction(confidence=0.9)
    ticket = _valid_case(language="en")
    flagged, _detail = detect_language_drift(extraction, ticket)
    assert flagged is False
def test_tag_failure_modes_returns_list():
    """Tagging returns a list that includes a "hallucination" tag here."""
    found = tag_failure_modes(_valid_extraction(evidence_quotes=[]), _valid_case())
    assert isinstance(found, list)
    assert any(tag.mode == "hallucination" for tag in found)
def test_summarize_failure_modes():
    """The failure-mode summary exposes its three top-level keys."""
    found = tag_failure_modes(_valid_extraction(evidence_quotes=[]), _valid_case())
    overview = summarize_failure_modes(found)
    for key in ("total_failures", "by_mode", "affected_cases"):
        assert key in overview
# --- Report tests ---
def test_generate_report_not_empty():
    """A populated results dict renders a report with a heading and verdict."""
    metrics = compute_all_metrics([_valid_extraction()])
    no_failures = {"total_failures": 0, "by_mode": {}, "affected_cases": 0}
    rendered = generate_report(
        {
            "total_cases": 10,
            "metrics": metrics,
            "failure_modes": no_failures,
            "gate_distribution": {"auto": 8, "review": 2},
            "review_reason_codes": {"low_confidence": 2},
        }
    )
    assert "# Evaluation Report" in rendered
    assert "schema_pass_rate" in rendered
    # At least one of the two verdict words must appear.
    assert any(verdict in rendered for verdict in ("PASS", "FAIL"))
def test_generate_report_empty_results():
    """An empty results dict renders a report containing "No results"."""
    rendered = generate_report({})
    assert "No results" in rendered
|