File size: 2,548 Bytes
fc20fed
 
 
bdb49ae
 
 
 
 
 
 
 
 
 
 
 
fc20fed
 
 
 
 
bdb49ae
 
 
 
 
fc20fed
 
bdb49ae
 
 
 
 
fc20fed
 
 
 
 
 
 
3b8840f
 
 
 
 
 
 
fc20fed
3b8840f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc20fed
 
 
 
 
 
4dc151e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import pytest

from evaluation.stats import (
    corr_ci,
    wilcoxon_signed_rank,
    holm_bonferroni,
    delta_metric,
    conditional_failure_rate,
    chi2_error_propagation,
)


def test_corr_ci():
    """Smoke-test ``corr_ci`` on a near-perfectly increasing pair of series.

    The noise comes from a locally seeded generator so the fixture is the
    same on every run and the global NumPy random state is left untouched.
    Only the ranges of the returned values are checked: the coefficient
    lies in [-1, 1], the interval bounds are ordered within [0, 1], and the
    p-value lies in [0, 1].
    """
    rng = np.random.default_rng(0)
    x = np.arange(10)
    # Noise of scale 1e-6 is negligible next to the unit steps of the ramp.
    y = x + rng.normal(scale=1e-6, size=x.size)
    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)
    assert -1 <= rho <= 1
    assert 0 <= lo <= hi <= 1
    assert 0 <= p <= 1


def test_wilcoxon():
    """Smoke-test: ``wilcoxon_signed_rank`` yields a p-value within [0, 1]."""
    first_sample = [1, 2, 3]
    second_sample = [1, 3, 5]
    outcome = wilcoxon_signed_rank(first_sample, second_sample)
    _, p_value = outcome
    # Only the range of the p-value is verified, not its exact value.
    assert 0 <= p_value <= 1


def test_holm():
    """Check the Holm-Bonferroni adjustment of three raw p-values."""
    raw_pvalues = {"a": 0.01, "b": 0.04, "c": 0.20}
    adjusted = holm_bonferroni(raw_pvalues)
    # With m = 3 and the raw values sorted ascending (0.01, 0.04, 0.20),
    # the multipliers are 3, 2 and 1, giving 0.03, 0.08 and 0.20.
    expected = {"a": 0.03, "b": 0.08, "c": 0.2}
    for label, value in expected.items():
        assert adjusted[label] == pytest.approx(value, rel=1e-6)


import pytest
import math
import numpy as np

from evaluation.stats.robustness import delta_metric, conditional_failure_rate


def test_delta_and_failure_rate():
    """Exercise ``delta_metric`` and ``conditional_failure_rate``.

    Covers the regular result of each function, the NaN reported when no
    retrieval error is present, and the ``ValueError`` raised for inputs of
    unequal length.
    """
    # delta_metric: every perturbed score sits 0.05 below its original.
    baseline = [0.9, 0.8, 0.7]
    perturbed = [0.85, 0.75, 0.65]
    mean_shift, effect_size = delta_metric(baseline, perturbed)

    assert isinstance(mean_shift, float)
    assert mean_shift == pytest.approx(-0.05, rel=1e-6)

    assert isinstance(effect_size, float)
    assert effect_size == pytest.approx(-0.5, rel=1e-6)

    # Inputs of different lengths are rejected.
    with pytest.raises(ValueError):
        delta_metric([1.0, 2.0], [1.0])

    # conditional_failure_rate: one hallucination among the two error cases
    # and one among the two success cases.
    error_flags = [0, 1, 0, 1]
    hallucination_flags = [1, 0, 0, 1]
    rates = conditional_failure_rate(error_flags, hallucination_flags)

    expected_keys = {
        "p_hallucination_given_error",
        "p_hallucination_given_success",
    }
    assert set(rates.keys()) == expected_keys

    for key in ("p_hallucination_given_error", "p_hallucination_given_success"):
        assert rates[key] == pytest.approx(0.5, rel=1e-6)

    # With no retrieval errors the error-conditioned rate is expected to be NaN.
    no_errors = [0, 0, 0]
    hallucination_flags_2 = [1, 1, 0]
    rates_no_errors = conditional_failure_rate(no_errors, hallucination_flags_2)
    assert math.isnan(rates_no_errors["p_hallucination_given_error"])
    assert rates_no_errors["p_hallucination_given_success"] == pytest.approx(
        2 / 3, rel=1e-6
    )

    # Inputs of different lengths are rejected here as well.
    with pytest.raises(ValueError):
        conditional_failure_rate([0, 1], [1])



def test_chi2_error_propagation():
    """Check that ``chi2_error_propagation`` returns a dict with float fields.

    Only the result type and the types of the "chi2" and "p" entries are
    verified, not their values.
    """
    first_counts = [10, 20, 30]
    second_counts = [15, 25, 35]
    outcome = chi2_error_propagation(first_counts, second_counts)
    assert isinstance(outcome, dict)
    for field in ("chi2", "p"):
        assert isinstance(outcome.get(field), float)