# ui/stats/estimation/descriptive.py from functools import lru_cache import numpy as np import pandas as pd from scipy.stats import ( trim_mean, gmean, hmean, skew, kurtosis, norm ) from scipy.special import loggamma from scipy.integrate import quad from scipy.stats import median_abs_deviation # ------------------------------------------------------------------ # Bias-correction constants (user-approved implementations) # ------------------------------------------------------------------ @lru_cache(maxsize=None) def c4(n: int) -> float: """Bias correction constant for standard deviation.""" return np.exp( np.log(np.sqrt(2 / (n - 1))) + loggamma(n / 2) - loggamma((n - 1) / 2) ) @lru_cache(maxsize=None) def d2(n: int) -> float: """Bias correction constant for the range.""" f = lambda x, n: 1 - (1 - norm.cdf(x)) ** n - (norm.cdf(x)) ** n return quad(f, -np.inf, np.inf, args=(n,))[0] # ------------------------------------------------------------------ # Main computation function # ------------------------------------------------------------------ def compute_descriptive_statistics( data, *, quantile_probs=(0.25, 0.5, 0.75), trim_alpha=None, winsor_limits=None, weights=None, ): """ Compute all descriptive statistics for a single numeric variable. """ # --- preparation ------------------------------------------------ x = pd.Series(data).dropna().astype(float) n = len(x) rows = [] # ---------------------------------------------------------------- # Quantiles # ---------------------------------------------------------------- probs = np.atleast_1d(quantile_probs) q_vals = np.quantile(x, probs) for p, q in zip(probs, q_vals): rows.append([ "Quantiles", f"Q{p}", q, np.nan, 0 ]) # ---------------------------------------------------------------- # Central Tendency # ---------------------------------------------------------------- mean = x.mean() median = np.median(x) iq_mean = trim_mean(x, 0.25) rows.extend([ ["Central Tendency", "Mean", mean, np.nan, 0], ["Central Tendency", "Median", median, np.nan, 1], ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1], ]) # Weighted mean (additional, never replaces mean) if weights is not None: w = pd.Series(weights).loc[x.index].astype(float) w_mean = np.average(x, weights=w) rows.append([ "Central Tendency", "Weighted Mean", w_mean, np.nan, 0 ]) # Trimmed mean if trim_alpha is not None: t_mean = trim_mean(x, trim_alpha) rows.append([ "Central Tendency", f"Trimmed Mean ({trim_alpha})", t_mean, np.nan, 1 ]) # Winsorized mean if winsor_limits is not None: from scipy.stats.mstats import winsorize xw = winsorize(x, winsor_limits) rows.append([ "Central Tendency", f"Winsorized Mean {tuple(winsor_limits)}", np.mean(xw), np.nan, 1 ]) # Geometric & harmonic means if np.all(x > 0): rows.extend([ ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0], ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0], ]) # ---------------------------------------------------------------- # Dispersion # ---------------------------------------------------------------- var0 = np.var(x, ddof=0) var1 = np.var(x, ddof=1) # unbiased std0 = np.std(x, ddof=0) std1 = np.std(x, ddof=1) rng = x.max() - x.min() iqr = np.subtract(*np.percentile(x, [75, 25])) mad = median_abs_deviation(x) aad = np.mean(np.abs(x - mean)) rows.extend([ ["Dispersion", "Variance (ddof=0)", var0, var1, 0], ["Dispersion", "Variance (ddof=1)", var1, var1, 0], ["Dispersion", "Std (ddof=0)", std0, std0 * np.sqrt(n / (n - 1)) / c4(n), 0], ["Dispersion", "Std (ddof=1)", std1, std1 / c4(n), 0], ["Dispersion", "Range", rng, rng / d2(n), 0], ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0], ["Dispersion", "IQR", iqr, iqr / (2 * norm.ppf(0.75)), 1], ["Dispersion", "MAD", mad, mad / norm.ppf(0.75), 1], ]) # ---------------------------------------------------------------- # Shape # ---------------------------------------------------------------- rows.extend([ ["Shape", "Skewness (central moments)", skew(x), np.nan, 0], ["Shape", "Skewness (k-statistic)", skew(x, bias=False), np.nan, 0], ["Shape", "Kurtosis (central moments)", kurtosis(x, fisher=False), np.nan, 0], ["Shape", "Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False), np.nan, 0], ["Shape", "Excess Kurtosis (central moments)", kurtosis(x, fisher=False) - 3, np.nan, 0], ["Shape", "Excess Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False) - 3, np.nan, 0], ]) # ---------------------------------------------------------------- # Final table # ---------------------------------------------------------------- return pd.DataFrame( rows, columns=[ "Statistic Type", "Measure", "Value", "Bias Corrected", "Robust", ], )