Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from src.analytics import ( | |
| MIN_CONFIG_N, | |
| config_leaderboard, | |
| make_decision_brief, | |
| overview_metrics, | |
| policy_at_threshold, | |
| policy_curve, | |
| retrieval_outcomes, | |
| risk_slices, | |
| top_examples, | |
| ) | |
| from src.data import DataBundle, filter_eval, filter_retrieval_events | |
def test_overview_metrics_shape(bundle: DataBundle) -> None:
    """Check that the overview metrics expose the expected metric keys."""
    required_keys = {"evaluations", "correct_rate", "hallucination_rate", "recall_at_10", "mrr_at_10"}
    summary = overview_metrics(
        bundle.eval_runs,
        bundle.documents,
        bundle.chunks,
        bundle.retrieval_events,
    )
    assert required_keys.issubset(summary)
def test_risk_slices_have_risk_score(bundle: DataBundle) -> None:
    """Risk slices must be non-empty and include a ``risk_score`` column."""
    slices = risk_slices(bundle.eval_runs)
    # Two separate asserts so a failure points at the exact condition.
    assert not slices.empty
    assert "risk_score" in slices.columns
def test_retrieval_outcomes_have_failure_modes(bundle: DataBundle) -> None:
    """Retrieval outcomes must be non-empty and include a ``failure_mode`` column."""
    outcomes = retrieval_outcomes(bundle.eval_runs)
    # Two separate asserts so a failure points at the exact condition.
    assert not outcomes.empty
    assert "failure_mode" in outcomes.columns
def test_config_leaderboard_has_config_and_score(bundle: DataBundle) -> None:
    """The leaderboard built with ``MIN_CONFIG_N`` is non-empty and has the scoring columns."""
    required_columns = {"config", "score", "correct_rate", "hallucination_rate"}
    leaderboard = config_leaderboard(bundle.eval_runs, min_n=MIN_CONFIG_N)
    assert not leaderboard.empty
    assert required_columns.issubset(leaderboard.columns)
def test_policy_curve_has_required_columns(bundle: DataBundle) -> None:
    """The policy curve must carry the threshold and review-policy columns."""
    required_columns = {
        "threshold",
        "auto_approve_rate",
        "review_queue_size",
        "risk_captured_in_review",
    }
    curve = policy_curve(bundle.eval_runs)
    assert required_columns.issubset(curve.columns)
def test_policy_at_threshold_returns_decision_metrics(bundle: DataBundle) -> None:
    """Evaluating the policy at threshold 0.55 yields the decision metric keys."""
    required_keys = {
        "auto_approve_rate",
        "review_queue_size",
        "auto_correct_rate",
        "risk_captured_in_review",
    }
    decision = policy_at_threshold(
        bundle.eval_runs,
        0.55,
        reference_df=bundle.eval_runs,
    )
    assert required_keys.issubset(decision)
def test_decision_brief_has_action(bundle: DataBundle) -> None:
    """The decision brief must come back with a truthy ``recommended_action``."""
    decision_brief = make_decision_brief(
        bundle.eval_runs,
        bundle.documents,
        bundle.chunks,
        bundle.retrieval_events,
    )
    assert decision_brief.recommended_action
def test_high_risk_examples_are_returned(bundle: DataBundle) -> None:
    """Requesting 20 "High risk" examples returns between 1 and 20 of them."""
    limit = 20
    examples = top_examples(
        bundle.eval_runs,
        mode="High risk",
        n=limit,
        reference_df=bundle.eval_runs,
    )
    returned = len(examples)
    assert returned > 0
    assert returned <= limit
def test_domain_filter_keeps_retrieval_alignment(bundle: DataBundle) -> None:
    """After a domain filter, every retrieval ``example_id`` also appears in the filtered eval rows."""
    eval_subset = filter_eval(bundle.eval_runs, domains=["financial_reports"])
    retrieval_subset = filter_retrieval_events(bundle.retrieval_events, eval_subset)

    # Compare ids as strings so differing dtypes between the two frames cannot
    # cause spurious mismatches.
    retrieval_ids = set(retrieval_subset["example_id"].astype(str))
    eval_ids = set(eval_subset["example_id"].astype(str))
    assert retrieval_ids <= eval_ids
def test_empty_filter_returns_empty_retrieval(bundle: DataBundle) -> None:
    """Filtering on ``missing_domain`` must leave both the eval rows and the retrieval events empty."""
    eval_subset = filter_eval(bundle.eval_runs, domains=["missing_domain"])
    retrieval_subset = filter_retrieval_events(bundle.retrieval_events, eval_subset)
    # Two separate asserts so a failure points at the frame that was not empty.
    assert eval_subset.empty
    assert retrieval_subset.empty