# ui/stats/estimation/descriptive.py
from functools import lru_cache
import numpy as np
import pandas as pd
from scipy.stats import (
trim_mean,
gmean,
hmean,
skew,
kurtosis,
norm
)
from scipy.special import loggamma
from scipy.integrate import quad
from scipy.stats import median_abs_deviation
# ------------------------------------------------------------------
# Bias-correction constants (user-approved implementations)
# ------------------------------------------------------------------
@lru_cache(maxsize=None)
def c4(n: int) -> float:
    """Bias correction constant c4 for the standard deviation.

    Evaluates sqrt(2 / (n - 1)) * Gamma(n / 2) / Gamma((n - 1) / 2)
    entirely in log space (via ``loggamma``) so the gamma ratio stays
    numerically stable for large n.
    """
    half_log_scale = 0.5 * np.log(2.0 / (n - 1))
    log_gamma_ratio = loggamma(n / 2) - loggamma((n - 1) / 2)
    return np.exp(half_log_scale + log_gamma_ratio)
@lru_cache(maxsize=None)
def d2(n: int) -> float:
    """Bias correction constant d2 for the range.

    Computes E[R / sigma] for a sample of size n from a standard normal,
    via the identity d2 = integral of 1 - (1 - Phi(x))**n - Phi(x)**n
    over the whole real line.
    """
    def _integrand(x, m):
        cdf = norm.cdf(x)
        return 1.0 - (1.0 - cdf) ** m - cdf ** m

    value, _abs_error = quad(_integrand, -np.inf, np.inf, args=(n,))
    return value
# ------------------------------------------------------------------
# Main computation function
# ------------------------------------------------------------------
def compute_descriptive_statistics(
    data,
    *,
    quantile_probs=(0.25, 0.5, 0.75),
    trim_alpha=None,
    winsor_limits=None,
    weights=None,
):
    """
    Compute all descriptive statistics for a single numeric variable.

    Parameters
    ----------
    data : array-like
        Numeric observations; NaN values are dropped before any computation.
    quantile_probs : sequence of float, keyword-only
        Probabilities at which to report quantiles (default: quartiles).
    trim_alpha : float or None, keyword-only
        If given, additionally report the trimmed mean at this proportion.
    winsor_limits : pair of float or None, keyword-only
        If given, additionally report the winsorized mean at these limits.
    weights : array-like or None, keyword-only
        If given, additionally report a weighted mean (it never replaces
        the plain mean).
        NOTE(review): weights are aligned to the post-dropna index via
        ``.loc``, which assumes *weights* shares the input data's index
        (positional for plain lists/arrays) -- confirm against callers.

    Returns
    -------
    pandas.DataFrame
        One row per statistic with columns
        ``Statistic Type, Measure, Value, Bias Corrected, Robust``.

    Raises
    ------
    ValueError
        If fewer than 2 non-NaN observations remain: the bias-correction
        constants c4(n)/d2(n) and the ddof=1 estimators are undefined
        for n < 2.
    """
    # --- preparation ------------------------------------------------
    x = pd.Series(data).dropna().astype(float)
    n = len(x)
    # Guard early: without this, n < 2 causes a divide-by-zero inside
    # c4/d2 and NaN variance warnings far from the actual cause.
    if n < 2:
        raise ValueError(
            f"compute_descriptive_statistics requires at least 2 non-NaN "
            f"observations, got {n}"
        )
    rows = []
    # ----------------------------------------------------------------
    # Quantiles
    # ----------------------------------------------------------------
    probs = np.atleast_1d(quantile_probs)
    q_vals = np.quantile(x, probs)
    for p, q in zip(probs, q_vals):
        rows.append([
            "Quantiles",
            f"Q{p}",
            q,
            np.nan,
            0
        ])
    # ----------------------------------------------------------------
    # Central Tendency
    # ----------------------------------------------------------------
    mean = x.mean()
    median = np.median(x)
    iq_mean = trim_mean(x, 0.25)  # mean of the middle 50%
    rows.extend([
        ["Central Tendency", "Mean", mean, np.nan, 0],
        ["Central Tendency", "Median", median, np.nan, 1],
        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
    ])
    # Weighted mean (additional, never replaces mean)
    if weights is not None:
        # Align weights to the surviving (non-NaN) observations.
        w = pd.Series(weights).loc[x.index].astype(float)
        w_mean = np.average(x, weights=w)
        rows.append([
            "Central Tendency",
            "Weighted Mean",
            w_mean,
            np.nan,
            0
        ])
    # Trimmed mean
    if trim_alpha is not None:
        t_mean = trim_mean(x, trim_alpha)
        rows.append([
            "Central Tendency",
            f"Trimmed Mean ({trim_alpha})",
            t_mean,
            np.nan,
            1
        ])
    # Winsorized mean
    if winsor_limits is not None:
        # Lazy import: only needed when winsorizing is requested.
        from scipy.stats.mstats import winsorize
        xw = winsorize(x, winsor_limits)
        rows.append([
            "Central Tendency",
            f"Winsorized Mean {tuple(winsor_limits)}",
            np.mean(xw),
            np.nan,
            1
        ])
    # Geometric & harmonic means (only defined for strictly positive data)
    if np.all(x > 0):
        rows.extend([
            ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0],
            ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0],
        ])
    # ----------------------------------------------------------------
    # Dispersion
    # ----------------------------------------------------------------
    var0 = np.var(x, ddof=0)
    var1 = np.var(x, ddof=1)  # unbiased
    std0 = np.std(x, ddof=0)
    std1 = np.std(x, ddof=1)
    rng = x.max() - x.min()
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    mad = median_abs_deviation(x)
    aad = np.mean(np.abs(x - mean))
    # "Bias Corrected" column: classical normal-theory corrections.
    # std0 * sqrt(n/(n-1)) == std1, so both std rows correct to std1/c4(n).
    # IQR and MAD are rescaled to be consistent for sigma under normality.
    rows.extend([
        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
        ["Dispersion", "Std (ddof=0)", std0, std0 * np.sqrt(n / (n - 1)) / c4(n), 0],
        ["Dispersion", "Std (ddof=1)", std1, std1 / c4(n), 0],
        ["Dispersion", "Range", rng, rng / d2(n), 0],
        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
        ["Dispersion", "IQR", iqr, iqr / (2 * norm.ppf(0.75)), 1],
        ["Dispersion", "MAD", mad, mad / norm.ppf(0.75), 1],
    ])
    # ----------------------------------------------------------------
    # Shape
    # ----------------------------------------------------------------
    # Hoist each scipy call so every statistic is computed exactly once
    # (the original recomputed kurtosis up to three times).
    skew_c = skew(x)                                 # moment-based
    skew_k = skew(x, bias=False)                     # bias-corrected (k-stat)
    kurt_c = kurtosis(x, fisher=False)               # Pearson (non-excess)
    kurt_k = kurtosis(x, fisher=False, bias=False)   # bias-corrected Pearson
    rows.extend([
        ["Shape", "Skewness (central moments)", skew_c, np.nan, 0],
        ["Shape", "Skewness (k-statistic)", skew_k, np.nan, 0],
        ["Shape", "Kurtosis (central moments)", kurt_c, np.nan, 0],
        ["Shape", "Kurtosis (k-statistic)", kurt_k, np.nan, 0],
        ["Shape", "Excess Kurtosis (central moments)", kurt_c - 3, np.nan, 0],
        ["Shape", "Excess Kurtosis (k-statistic)", kurt_k - 3, np.nan, 0],
    ])
    # ----------------------------------------------------------------
    # Final table
    # ----------------------------------------------------------------
    return pd.DataFrame(
        rows,
        columns=[
            "Statistic Type",
            "Measure",
            "Value",
            "Bias Corrected",
            "Robust",
        ],
    )
|