File size: 2,511 Bytes
6bef416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from __future__ import annotations

import math

import pandas as pd

from src.analytics import evidence_strength_proxy, policy_curve, retrieval_outcomes, risk_slices


def _toy_eval() -> pd.DataFrame:
    return pd.DataFrame(
        {
            "example_id": ["e1", "e2", "e3", "e4"],
            "domain": ["A", "A", "B", "B"],
            "scenario_type": ["fact", "fact", "extract", "extract"],
            "difficulty": ["easy", "easy", "hard", "hard"],
            "is_correct": [1, 0, 0, 1],
            "hallucination_flag": [0, 1, 0, 1],
            "recall_at_10": [0.9, 0.9, 0.2, 0.1],
            "mrr_at_10": [0.5, 0.1, 0.2, 0.0],
            "total_latency_ms": [100, 200, 300, 400],
            "total_cost_usd": [0.01, 0.02, 0.03, 0.04],
        }
    )


def test_risk_slices_uses_expected_weighted_formula() -> None:
    """Pin the per-domain ``risk_score`` values produced for the toy frame.

    Groups the toy data by ``domain`` with ``min_n=1`` and checks that the
    first row returned is domain ``B`` and that the scores are 0.42 for ``A``
    and 0.57 for ``B``.
    """
    sliced = risk_slices(_toy_eval(), group_cols=["domain"], min_n=1)
    indexed = sliced.set_index("domain")

    # NOTE(review): presumably the output is ordered riskiest-first, which
    # is why "B" (the higher score) is expected at the top — confirm.
    first_domain = indexed.index.tolist()[0]
    assert first_domain == "B"

    expected_scores = (("A", 0.42), ("B", 0.57))
    for domain, expected in expected_scores:
        actual = indexed.loc[domain, "risk_score"]
        assert math.isclose(actual, expected, abs_tol=1e-12)


def test_retrieval_outcomes_classifies_failure_modes_once() -> None:
    """Check that each toy example is counted under exactly one failure mode.

    With ``threshold=0.8`` the four toy rows are expected to yield four
    distinct ``failure_mode`` labels, each with a count ``n`` of 1.
    """
    expected_counts = {
        "hallucination_risk_correct_answer": 1,
        "hallucination_failure": 1,
        "retrieval_failure": 1,
        "healthy": 1,
    }

    outcomes = retrieval_outcomes(_toy_eval(), threshold=0.8)
    per_mode = outcomes.set_index("failure_mode")
    observed_counts = per_mode["n"].to_dict()

    assert observed_counts == expected_counts


def test_evidence_strength_proxy_uses_reference_anchors() -> None:
    """Pin the proxy scores when a separate reference frame is supplied.

    The evaluated and reference frames share identical ``recall_at_10`` and
    ``mrr_at_10`` columns and differ only in ``top1_score`` (10/20 versus
    0/20). The scores, rounded to six decimals, must equal
    ``[0.205882, 1.0]``.
    """
    # Columns that are the same in both frames.
    shared_metrics = {
        "recall_at_10": [0.0, 1.0],
        "mrr_at_10": [0.0, 1.0],
    }
    eval_df = pd.DataFrame({"top1_score": [10.0, 20.0], **shared_metrics})
    # NOTE(review): presumably the reference frame's top1_score range is what
    # the evaluated scores are scaled against — confirm in src.analytics.
    reference_df = pd.DataFrame({"top1_score": [0.0, 20.0], **shared_metrics})

    proxy = evidence_strength_proxy(eval_df, reference_df=reference_df)
    scores = proxy.round(6).tolist()

    assert scores == [0.205882, 1.0]


def test_policy_curve_review_queue_is_monotonic_with_threshold() -> None:
    """Check the policy curve columns are monotonic across the output rows.

    The toy frame is extended with ``top1_score`` and ``mean_retrieved_score``
    and used as both the evaluated and the reference data. In the returned row
    order, ``review_queue_size`` must be non-decreasing and
    ``auto_approve_rate`` must be non-increasing.
    """
    scored = _toy_eval().assign(
        top1_score=[0.9, 0.7, 0.3, 0.2],
        mean_retrieved_score=[0.8, 0.7, 0.2, 0.1],
    )
    # NOTE(review): assumes rows come back in the order of ``thresholds``
    # (ascending here) — confirm against policy_curve.
    curve = policy_curve(scored, thresholds=[0.2, 0.5, 0.8], reference_df=scored)

    queue_sizes = curve["review_queue_size"].tolist()
    assert queue_sizes == sorted(queue_sizes)

    approve_rates = curve["auto_approve_rate"].tolist()
    assert approve_rates == sorted(approve_rates, reverse=True)