File size: 5,803 Bytes
798602c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# stats/inference/estimators.py

import numpy as np
from scipy.stats import trim_mean, iqr, median_abs_deviation, norm
from scipy.stats.mstats import gmean, hmean, winsorize


# ---------------
# Mean estimators
# ---------------

def _parse_winsor_limits(winsor_limits):
    """Normalise and validate the ``winsor_limits`` argument.

    Accepted forms:

    * a string holding one or two comma-separated numbers
      (``"0.1"`` or ``"0.05,0.2"``);
    * a list/tuple of exactly two numbers ``(lower, upper)``;
    * any other object convertible with ``float()`` (one symmetric limit).

    Returns
    -------
    float or tuple of two floats
        A single float when one limit was supplied, otherwise
        ``(lower, upper)``.

    Raises
    ------
    ValueError
        If the value is not numeric, has the wrong number of components,
        or any limit lies outside ``[0, 0.5)``.
    """
    not_numeric = "winsor_limits must be numeric (e.g. '0.1' or '0.05,0.2')"

    if isinstance(winsor_limits, str):
        # Empty fragments (e.g. a trailing comma) are dropped before parsing.
        tokens = [tok.strip() for tok in winsor_limits.split(",") if tok.strip()]
        try:
            values = [float(tok) for tok in tokens]
        except ValueError:
            raise ValueError(not_numeric) from None

        if len(values) == 1:
            limits = values[0]
        elif len(values) == 2:
            limits = (values[0], values[1])
        else:
            raise ValueError("winsor_limits must have one or two values")

    elif isinstance(winsor_limits, (list, tuple)):
        if len(winsor_limits) != 2:
            raise ValueError(
                "winsor_limits list/tuple must have exactly two values"
            )
        try:
            limits = (float(winsor_limits[0]), float(winsor_limits[1]))
        except (TypeError, ValueError, OverflowError):
            # Report non-numeric entries the same way as the string path
            # instead of leaking a raw float() error.
            raise ValueError(not_numeric) from None

    else:
        try:
            limits = float(winsor_limits)
        except (TypeError, ValueError, OverflowError):
            raise ValueError(not_numeric) from None

    bounds = limits if isinstance(limits, tuple) else (limits,)
    if not all(0 <= bound < 0.5 for bound in bounds):
        raise ValueError("winsor_limits must be in [0, 0.5)")

    return limits


def estimate_mean(
    data,
    estimator,
    *,
    trim_param=None,
    winsor_limits=None,
    weights=None,
):
    """Compute a location estimate of ``data`` with the named estimator.

    Parameters
    ----------
    data : array-like
        Observations; converted with ``np.asarray``.
    estimator : str
        One of ``"Sample Mean"``, ``"Geometric Mean"``, ``"Harmonic Mean"``,
        ``"Trimmed Mean"``, ``"Interquartile Mean"``, ``"Winsorized Mean"``
        or ``"Weighted Mean"``.
    trim_param : float-like, optional
        Proportion passed to ``trim_mean`` for ``"Trimmed Mean"``; must lie
        strictly inside ``(0, 0.5)``.
    winsor_limits : str, float-like or 2-sequence, optional
        Limits for ``"Winsorized Mean"``: one value or a ``(lower, upper)``
        pair, each in ``[0, 0.5)``. Strings such as ``"0.1"`` or
        ``"0.05,0.2"`` are parsed.
    weights : array-like, optional
        Non-negative weights for ``"Weighted Mean"``; same length as
        ``data`` and not summing to zero.

    Returns
    -------
    scalar
        The value produced by the selected estimator.

    Raises
    ------
    ValueError
        If the estimator name is unknown, a required option is missing or
        invalid, or the data violate the estimator's positivity requirement.
    """
    data = np.asarray(data)

    if estimator == "Sample Mean":
        return np.mean(data)

    if estimator == "Geometric Mean":
        if np.any(data <= 0):
            raise ValueError("Geometric mean requires positive data")
        return gmean(data)

    if estimator == "Harmonic Mean":
        if np.any(data <= 0):
            raise ValueError("Harmonic mean requires positive data")
        return hmean(data)

    if estimator == "Trimmed Mean":
        if trim_param is None:
            raise ValueError("trim_param must be provided")

        try:
            trim_param = float(trim_param)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("trim_param must be a numeric value") from None

        if not (0 < trim_param < 0.5):
            raise ValueError("trim_param must be in (0, 0.5)")

        return trim_mean(data, trim_param)

    if estimator == "Interquartile Mean":
        # Implemented as a trimmed mean with proportion 0.25.
        return trim_mean(data, 0.25)

    if estimator == "Winsorized Mean":
        if winsor_limits is None:
            raise ValueError("winsor_limits must be provided")

        limits = _parse_winsor_limits(winsor_limits)
        return np.mean(winsorize(data, limits=limits))

    if estimator == "Weighted Mean":
        if weights is None:
            raise ValueError("weights must be provided for weighted mean")

        weights = np.asarray(weights)

        if len(weights) != len(data):
            raise ValueError("weights must have same length as data")

        if np.any(weights < 0):
            raise ValueError("weights must be non-negative")

        # np.average raises ZeroDivisionError when the weights cannot be
        # normalised; report it as a validation error like the other checks.
        if np.sum(weights) == 0:
            raise ValueError("weights must not sum to zero")

        return np.average(data, weights=weights)

    raise ValueError(f"Unknown mean estimator: {estimator}")


# --------------------
# Deviation estimators
# --------------------

def estimate_sigma(data, estimator):
    """Estimate the scale (σ) of ``data`` with the named deviation estimator.

    Supported ``estimator`` names and the formula each applies:

    * ``"Deviation (1 ddof)"``     - ``np.std`` with ``ddof=1``
    * ``"Range (bias corrected)"`` - ``(max - min) / d2(n)``
    * ``"IQR (bias corrected)"``   - ``IQR / (2 * Φ⁻¹(0.75))``
    * ``"MAD (bias corrected)"``   - ``MAD / Φ⁻¹(0.75)``
    * ``"AAD (bias corrected)"``   - ``mean(|x - mean(x)|) * sqrt(π / 2)``

    Raises ``ValueError`` when fewer than two observations are supplied or
    when the estimator name is not one of the above.
    """
    sample = np.asarray(data)
    size = len(sample)

    if size < 2:
        raise ValueError("At least two observations are required to estimate deviation.")

    # Each entry pairs an estimator label with a thunk, so only the selected
    # formula is ever evaluated.
    recipes = (
        (
            "Deviation (1 ddof)",
            lambda: np.std(sample, ddof=1),
        ),
        (
            "Range (bias corrected)",
            lambda: (np.max(sample) - np.min(sample)) / d2(size),
        ),
        (
            "IQR (bias corrected)",
            lambda: iqr(sample) / (2 * norm.ppf(0.75)),
        ),
        (
            "MAD (bias corrected)",
            lambda: median_abs_deviation(sample) / norm.ppf(0.75),
        ),
        (
            "AAD (bias corrected)",
            lambda: np.mean(np.abs(sample - np.mean(sample))) * np.sqrt(np.pi / 2),
        ),
    )

    for label, compute in recipes:
        if estimator == label:
            return compute()

    raise ValueError(f"Unknown deviation estimator: {estimator}")


def d2(n: int) -> float:
    """Look up the range-to-σ correction constant for sample size ``n``.

    Values are tabulated for ``n`` from 2 to 25 inclusive; any other ``n``
    raises ``ValueError``. The original note states this is the same table
    as in ci_deviation.py (that file is not visible here).
    """
    # Constants listed in order of sample size, starting at n = 2.
    constants = (
        1.128, 1.693, 2.059, 2.326, 2.534,   # n = 2..6
        2.704, 2.847, 2.970, 3.078,          # n = 7..10
        3.173, 3.258, 3.336, 3.407,          # n = 11..14
        3.472, 3.532, 3.588, 3.640,          # n = 15..18
        3.689, 3.735, 3.778, 3.819,          # n = 19..22
        3.858, 3.895, 3.931,                 # n = 23..25
    )
    by_size = dict(zip(range(2, 26), constants))

    constant = by_size.get(n)
    if constant is None:
        raise ValueError("Range-based estimator only supported for 2 ≤ n ≤ 25.")
    return constant