# Tests for src.analytics: risk slicing, retrieval outcomes, evidence-strength proxy, and policy curves.
| from __future__ import annotations | |
| import math | |
| import pandas as pd | |
| from src.analytics import evidence_strength_proxy, policy_curve, retrieval_outcomes, risk_slices | |
| def _toy_eval() -> pd.DataFrame: | |
| return pd.DataFrame( | |
| { | |
| "example_id": ["e1", "e2", "e3", "e4"], | |
| "domain": ["A", "A", "B", "B"], | |
| "scenario_type": ["fact", "fact", "extract", "extract"], | |
| "difficulty": ["easy", "easy", "hard", "hard"], | |
| "is_correct": [1, 0, 0, 1], | |
| "hallucination_flag": [0, 1, 0, 1], | |
| "recall_at_10": [0.9, 0.9, 0.2, 0.1], | |
| "mrr_at_10": [0.5, 0.1, 0.2, 0.0], | |
| "total_latency_ms": [100, 200, 300, 400], | |
| "total_cost_usd": [0.01, 0.02, 0.03, 0.04], | |
| } | |
| ) | |
def test_risk_slices_uses_expected_weighted_formula() -> None:
    """risk_slices should rank domain B first and reproduce the documented scores."""
    result = risk_slices(_toy_eval(), group_cols=["domain"], min_n=1)
    indexed = result.set_index("domain")
    # The highest-risk slice must be first in the output ordering.
    assert indexed.index[0] == "B"
    expected = {"A": 0.42, "B": 0.57}
    for domain, score in expected.items():
        assert math.isclose(indexed.loc[domain, "risk_score"], score, abs_tol=1e-12)
def test_retrieval_outcomes_classifies_failure_modes_once() -> None:
    """Each of the four fixture rows should fall into exactly one failure-mode bucket."""
    result = retrieval_outcomes(_toy_eval(), threshold=0.8)
    observed = dict(zip(result["failure_mode"], result["n"]))
    expected = {
        "hallucination_risk_correct_answer": 1,
        "hallucination_failure": 1,
        "retrieval_failure": 1,
        "healthy": 1,
    }
    assert observed == expected
def test_evidence_strength_proxy_uses_reference_anchors() -> None:
    """Scores must be scaled against the reference frame's anchors, not the eval frame's own range."""
    eval_df = pd.DataFrame(
        {
            "top1_score": [10.0, 20.0],
            "recall_at_10": [0.0, 1.0],
            "mrr_at_10": [0.0, 1.0],
        }
    )
    # Reference spans a wider top1_score range, so the first row lands mid-scale.
    reference_df = pd.DataFrame(
        {
            "top1_score": [0.0, 20.0],
            "recall_at_10": [0.0, 1.0],
            "mrr_at_10": [0.0, 1.0],
        }
    )
    proxy = evidence_strength_proxy(eval_df, reference_df=reference_df)
    assert proxy.round(6).tolist() == [0.205882, 1.0]
def test_policy_curve_review_queue_is_monotonic_with_threshold() -> None:
    """Raising the threshold should grow the review queue and shrink auto-approvals."""
    eval_df = _toy_eval().assign(
        top1_score=[0.9, 0.7, 0.3, 0.2],
        mean_retrieved_score=[0.8, 0.7, 0.2, 0.1],
    )
    curve = policy_curve(eval_df, thresholds=[0.2, 0.5, 0.8], reference_df=eval_df)
    queue_sizes = curve["review_queue_size"].tolist()
    approve_rates = curve["auto_approve_rate"].tolist()
    # Queue size is non-decreasing in threshold; approval rate is non-increasing.
    assert queue_sizes == sorted(queue_sizes)
    assert approve_rates == sorted(approve_rates, reverse=True)