# rag-qa-command-cente / tests /test_analytics_expected_values.py
# Tarek Masryo
# chore: update project files
# 6bef416
from __future__ import annotations
import math
import pandas as pd
from src.analytics import evidence_strength_proxy, policy_curve, retrieval_outcomes, risk_slices
def _toy_eval() -> pd.DataFrame:
return pd.DataFrame(
{
"example_id": ["e1", "e2", "e3", "e4"],
"domain": ["A", "A", "B", "B"],
"scenario_type": ["fact", "fact", "extract", "extract"],
"difficulty": ["easy", "easy", "hard", "hard"],
"is_correct": [1, 0, 0, 1],
"hallucination_flag": [0, 1, 0, 1],
"recall_at_10": [0.9, 0.9, 0.2, 0.1],
"mrr_at_10": [0.5, 0.1, 0.2, 0.0],
"total_latency_ms": [100, 200, 300, 400],
"total_cost_usd": [0.01, 0.02, 0.03, 0.04],
}
)
def test_risk_slices_uses_expected_weighted_formula() -> None:
    """Pin the per-domain ``risk_score`` values and row order from ``risk_slices``.

    The toy frame is grouped by ``domain`` with ``min_n=1``. The expected
    scores (0.42 for ``A``, 0.57 for ``B``) are regression anchors for the
    scoring formula implemented in ``risk_slices``.
    """
    out = risk_slices(_toy_eval(), group_cols=["domain"], min_n=1)
    by_domain = out.set_index("domain")

    # Domain B has the larger expected score and must be the first row.
    # NOTE(review): presumably the output is sorted by descending risk_score;
    # confirm against risk_slices.
    assert by_domain.index[0] == "B"

    assert math.isclose(by_domain.loc["A", "risk_score"], 0.42, abs_tol=1e-12)
    assert math.isclose(by_domain.loc["B", "risk_score"], 0.57, abs_tol=1e-12)
def test_retrieval_outcomes_classifies_failure_modes_once() -> None:
    """Check that ``retrieval_outcomes`` reports each expected mode exactly once.

    The toy frame has four rows and four failure modes are expected, each with
    a count of one, so the counts also sum to the number of input rows.
    """
    out = retrieval_outcomes(_toy_eval(), threshold=0.8)

    # Map failure_mode -> n so the comparison does not depend on row order.
    counts = out.set_index("failure_mode")["n"].to_dict()

    assert counts == {
        "hallucination_risk_correct_answer": 1,
        "hallucination_failure": 1,
        "retrieval_failure": 1,
        "healthy": 1,
    }
def test_evidence_strength_proxy_uses_reference_anchors() -> None:
    """Pin ``evidence_strength_proxy`` scores when a separate reference frame is given.

    The evaluated frame's ``top1_score`` values (10, 20) differ from the
    reference frame's (0, 20), while ``recall_at_10`` and ``mrr_at_10`` are
    identical in both. The second row, which matches the reference maximum in
    every column, is expected to score exactly 1.0.
    """
    eval_df = pd.DataFrame(
        {
            "top1_score": [10.0, 20.0],
            "recall_at_10": [0.0, 1.0],
            "mrr_at_10": [0.0, 1.0],
        }
    )
    reference_df = pd.DataFrame(
        {
            "top1_score": [0.0, 20.0],
            "recall_at_10": [0.0, 1.0],
            "mrr_at_10": [0.0, 1.0],
        }
    )

    scores = evidence_strength_proxy(eval_df, reference_df=reference_df).round(6).tolist()

    # NOTE(review): 0.205882 is 7/34 rounded to six places; the weighting that
    # produces it lives in evidence_strength_proxy - confirm there.
    assert scores == [0.205882, 1.0]
def test_policy_curve_review_queue_is_monotonic_with_threshold() -> None:
    """Check that ``policy_curve`` output is monotonic across its rows.

    The toy frame is extended with ``top1_score`` and ``mean_retrieved_score``
    and is used as both the evaluated frame and the reference frame. Across
    the returned rows, ``review_queue_size`` must be non-decreasing and
    ``auto_approve_rate`` must be non-increasing.
    """
    eval_df = _toy_eval().assign(
        top1_score=[0.9, 0.7, 0.3, 0.2],
        mean_retrieved_score=[0.8, 0.7, 0.2, 0.1],
    )
    out = policy_curve(eval_df, thresholds=[0.2, 0.5, 0.8], reference_df=eval_df)

    queue_sizes = out["review_queue_size"].tolist()
    approve_rates = out["auto_approve_rate"].tolist()

    # An empty curve would satisfy both ordering checks trivially.
    assert queue_sizes, "policy_curve returned no rows"

    # NOTE(review): assumes rows come back in the order the thresholds were
    # passed (ascending here) - confirm against policy_curve.
    assert queue_sizes == sorted(queue_sizes)
    assert approve_rates == sorted(approve_rates, reverse=True)