from __future__ import annotations import math import pandas as pd from src.analytics import evidence_strength_proxy, policy_curve, retrieval_outcomes, risk_slices def _toy_eval() -> pd.DataFrame: return pd.DataFrame( { "example_id": ["e1", "e2", "e3", "e4"], "domain": ["A", "A", "B", "B"], "scenario_type": ["fact", "fact", "extract", "extract"], "difficulty": ["easy", "easy", "hard", "hard"], "is_correct": [1, 0, 0, 1], "hallucination_flag": [0, 1, 0, 1], "recall_at_10": [0.9, 0.9, 0.2, 0.1], "mrr_at_10": [0.5, 0.1, 0.2, 0.0], "total_latency_ms": [100, 200, 300, 400], "total_cost_usd": [0.01, 0.02, 0.03, 0.04], } ) def test_risk_slices_uses_expected_weighted_formula() -> None: out = risk_slices(_toy_eval(), group_cols=["domain"], min_n=1) by_domain = out.set_index("domain") assert by_domain.index.tolist()[0] == "B" assert math.isclose(by_domain.loc["A", "risk_score"], 0.42, abs_tol=1e-12) assert math.isclose(by_domain.loc["B", "risk_score"], 0.57, abs_tol=1e-12) def test_retrieval_outcomes_classifies_failure_modes_once() -> None: out = retrieval_outcomes(_toy_eval(), threshold=0.8) counts = out.set_index("failure_mode")["n"].to_dict() assert counts == { "hallucination_risk_correct_answer": 1, "hallucination_failure": 1, "retrieval_failure": 1, "healthy": 1, } def test_evidence_strength_proxy_uses_reference_anchors() -> None: eval_df = pd.DataFrame( { "top1_score": [10.0, 20.0], "recall_at_10": [0.0, 1.0], "mrr_at_10": [0.0, 1.0], } ) reference_df = pd.DataFrame( { "top1_score": [0.0, 20.0], "recall_at_10": [0.0, 1.0], "mrr_at_10": [0.0, 1.0], } ) scores = evidence_strength_proxy(eval_df, reference_df=reference_df).round(6).tolist() assert scores == [0.205882, 1.0] def test_policy_curve_review_queue_is_monotonic_with_threshold() -> None: eval_df = _toy_eval().assign(top1_score=[0.9, 0.7, 0.3, 0.2], mean_retrieved_score=[0.8, 0.7, 0.2, 0.1]) out = policy_curve(eval_df, thresholds=[0.2, 0.5, 0.8], reference_df=eval_df) assert out["review_queue_size"].tolist() == sorted(out["review_queue_size"].tolist()) assert out["auto_approve_rate"].tolist() == sorted(out["auto_approve_rate"].tolist(), reverse=True)