"""Smoke and value tests for the helpers exposed by ``evaluation.stats``."""

import math

import numpy as np
import pytest

from evaluation.stats import (
    corr_ci,
    wilcoxon_signed_rank,
    holm_bonferroni,
    delta_metric,
    conditional_failure_rate,
    chi2_error_propagation,
)
# NOTE: this second import deliberately follows the package-level one so that
# ``delta_metric`` and ``conditional_failure_rate`` end up bound to the
# ``robustness`` submodule's objects, as in the original module.
from evaluation.stats.robustness import delta_metric, conditional_failure_rate


def test_corr_ci():
    """corr_ci returns a coefficient, a CI and a p-value within valid ranges."""
    # Seeded local generator: the input data is identical on every run and the
    # global NumPy RNG state is left untouched.
    rng = np.random.default_rng(0)
    x = np.arange(10)
    y = np.arange(10) + rng.normal(scale=1e-6, size=10)

    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)

    assert -1 <= rho <= 1
    assert 0 <= lo <= hi <= 1
    assert 0 <= p <= 1


def test_wilcoxon():
    """wilcoxon_signed_rank returns a p-value that is a valid probability."""
    x = [1, 2, 3]
    y = [1, 3, 5]

    _, p = wilcoxon_signed_rank(x, y)

    # only smoke-check that p is a valid probability
    assert 0 <= p <= 1


def test_holm():
    """holm_bonferroni scales the sorted raw p-values by m, m-1, ..., 1."""
    raw = {"a": 0.01, "b": 0.04, "c": 0.20}

    adj = holm_bonferroni(raw)

    # For m=3, sorted raw = [0.01,0.04,0.20]
    # a_adj = 3*0.01=0.03; b_adj = 2*0.04=0.08; c_adj = 1*0.20=0.20
    assert adj["a"] == pytest.approx(0.03, rel=1e-6)
    assert adj["b"] == pytest.approx(0.08, rel=1e-6)
    assert adj["c"] == pytest.approx(0.2, rel=1e-6)


def test_delta_and_failure_rate():
    """Check delta_metric and conditional_failure_rate values and errors."""
    # --- delta_metric: every perturbed score is 0.05 below its original ---
    orig = [0.9, 0.8, 0.7]
    pert = [0.85, 0.75, 0.65]

    delta, cohen_d = delta_metric(orig, pert)

    assert isinstance(delta, float)
    assert delta == pytest.approx(-0.05, rel=1e-6)
    assert isinstance(cohen_d, float)
    assert cohen_d == pytest.approx(-0.5, rel=1e-6)

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        delta_metric([1.0, 2.0], [1.0])

    # --- conditional_failure_rate: both outcomes present ---
    retrieval_errors = [0, 1, 0, 1]
    hallucinations = [1, 0, 0, 1]

    rates = conditional_failure_rate(retrieval_errors, hallucinations)

    assert set(rates.keys()) == {
        "p_hallucination_given_error",
        "p_hallucination_given_success",
    }
    assert rates["p_hallucination_given_error"] == pytest.approx(0.5, rel=1e-6)
    assert rates["p_hallucination_given_success"] == pytest.approx(0.5, rel=1e-6)

    # --- conditional_failure_rate: no retrieval errors at all ---
    # With nothing to condition on, the "given error" rate is expected to be
    # NaN rather than a number.
    only_success = [0, 0, 0]
    hall2 = [1, 1, 0]

    rates2 = conditional_failure_rate(only_success, hall2)

    assert math.isnan(rates2["p_hallucination_given_error"])
    assert rates2["p_hallucination_given_success"] == pytest.approx(2 / 3, rel=1e-6)

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        conditional_failure_rate([0, 1], [1])


def test_chi2_error_propagation():
    """chi2_error_propagation returns a dict with float "chi2" and "p" entries."""
    arr1 = [10, 20, 30]
    arr2 = [15, 25, 35]

    err = chi2_error_propagation(arr1, arr2)

    assert isinstance(err, dict)
    assert isinstance(err.get("chi2"), float)
    assert isinstance(err.get("p"), float)