Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pytest | |
| from evaluation.stats import ( | |
| corr_ci, | |
| wilcoxon_signed_rank, | |
| holm_bonferroni, | |
| delta_metric, | |
| conditional_failure_rate, | |
| chi2_error_propagation, | |
| ) | |
def test_corr_ci():
    """Sanity-check the triple returned by ``corr_ci`` for Spearman correlation.

    ``y`` is ``x`` plus noise of scale 1e-6, i.e. an almost perfectly
    monotone relationship. The test verifies that the coefficient lies in
    [-1, 1], that the confidence bounds are ordered and lie in [0, 1], and
    that the p-value is a valid probability.
    """
    # Use a locally seeded generator so the input data is identical on every
    # run instead of depending on NumPy's unseeded global random state.
    rng = np.random.default_rng(0)
    x = np.arange(10)
    y = np.arange(10) + rng.normal(scale=1e-6, size=10)

    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)

    assert -1 <= rho <= 1
    assert 0 <= lo <= hi <= 1
    assert 0 <= p <= 1
def test_wilcoxon():
    """Smoke-test ``wilcoxon_signed_rank`` on two short samples."""
    first_sample = [1, 2, 3]
    second_sample = [1, 3, 5]

    _statistic, p_value = wilcoxon_signed_rank(first_sample, second_sample)

    # Smoke check only: the returned p-value must be a valid probability.
    assert 0 <= p_value <= 1
def test_holm():
    """Check ``holm_bonferroni`` against hand-computed adjusted p-values.

    With m = 3 hypotheses and sorted raw p-values [0.01, 0.04, 0.20], the
    expected adjusted values are 3 * 0.01, 2 * 0.04 and 1 * 0.20.
    """
    raw_p_values = {"a": 0.01, "b": 0.04, "c": 0.20}
    adjusted = holm_bonferroni(raw_p_values)

    # Checked in the order a, b, c so the first failing key is reported.
    expected_by_key = (
        ("a", 0.03),
        ("b", 0.08),
        ("c", 0.2),
    )
    for key, expected in expected_by_key:
        assert adjusted[key] == pytest.approx(expected, rel=1e-6)
| import pytest | |
| import math | |
| import numpy as np | |
| from evaluation.stats.robustness import delta_metric, conditional_failure_rate | |
def test_delta_and_failure_rate():
    """Exercise ``delta_metric`` and ``conditional_failure_rate``.

    Covers the expected values on small hand-checkable inputs, the NaN
    result when no retrieval error is present, and the ``ValueError``
    raised for inputs of unequal length.
    """
    # --- delta_metric: each perturbed score is 0.05 below its original ---
    original_scores = [0.9, 0.8, 0.7]
    perturbed_scores = [0.85, 0.75, 0.65]
    delta, cohen_d = delta_metric(original_scores, perturbed_scores)

    assert isinstance(delta, float)
    assert delta == pytest.approx(-0.05, rel=1e-6)
    assert isinstance(cohen_d, float)
    assert cohen_d == pytest.approx(-0.5, rel=1e-6)

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        delta_metric([1.0, 2.0], [1.0])

    # --- conditional_failure_rate: mixed errors and successes ---
    error_flags = [0, 1, 0, 1]
    hallucination_flags = [1, 0, 0, 1]
    rates = conditional_failure_rate(error_flags, hallucination_flags)

    expected_keys = {
        "p_hallucination_given_error",
        "p_hallucination_given_success",
    }
    assert set(rates.keys()) == expected_keys
    # One of the two error cases and one of the two success cases hallucinate.
    assert rates["p_hallucination_given_error"] == pytest.approx(0.5, rel=1e-6)
    assert rates["p_hallucination_given_success"] == pytest.approx(0.5, rel=1e-6)

    # --- conditional_failure_rate: no retrieval errors at all ---
    no_error_flags = [0, 0, 0]
    hallucination_flags_no_error = [1, 1, 0]
    rates_no_error = conditional_failure_rate(
        no_error_flags, hallucination_flags_no_error
    )

    # With no error cases, the error-conditioned rate is expected to be NaN.
    assert math.isnan(rates_no_error["p_hallucination_given_error"])
    assert rates_no_error["p_hallucination_given_success"] == pytest.approx(
        2 / 3, rel=1e-6
    )

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        conditional_failure_rate([0, 1], [1])
def test_chi2_error_propagation():
    """Check that ``chi2_error_propagation`` returns a dict with float results.

    Only the shape of the result is verified: it must be a ``dict`` whose
    ``"chi2"`` and ``"p"`` entries are floats.
    """
    first_values = [10, 20, 30]
    second_values = [15, 25, 35]

    result = chi2_error_propagation(first_values, second_values)

    assert isinstance(result, dict)
    for key in ("chi2", "p"):
        # .get() makes a missing key fail the assertion (None is not a float)
        # instead of raising KeyError.
        assert isinstance(result.get(key), float)