|
|
|
|
| from functools import lru_cache
|
| import numpy as np
|
| import pandas as pd
|
| from scipy.stats import (
|
| trim_mean,
|
| gmean,
|
| hmean,
|
| skew,
|
| kurtosis,
|
| norm
|
| )
|
| from scipy.special import loggamma
|
| from scipy.integrate import quad
|
| from scipy.stats import median_abs_deviation
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=None)
def c4(n: int) -> float:
    """Return the c4 constant for a sample of size ``n``.

    The value is ``sqrt(2 / (n - 1)) * Gamma(n / 2) / Gamma((n - 1) / 2)``,
    evaluated in log space through ``loggamma`` and exponentiated at the end.
    Results are memoised per ``n`` by ``lru_cache``.
    """
    dof = n - 1
    # Accumulate the logarithm term by term, in the order of the formula.
    log_value = np.log(np.sqrt(2 / dof))
    log_value = log_value + loggamma(n / 2)
    log_value = log_value - loggamma(dof / 2)
    return np.exp(log_value)
|
|
|
|
|
@lru_cache(maxsize=None)
def d2(n: int) -> float:
    """Return the d2 constant used to rescale the range of ``n`` values.

    Numerically integrates ``1 - (1 - Phi(t)) ** n - Phi(t) ** n`` over the
    whole real line, where ``Phi`` is the standard normal CDF. Results are
    memoised per ``n`` by ``lru_cache``.
    """

    def integrand(t, size):
        cdf = norm.cdf(t)
        return 1 - (1 - cdf) ** size - cdf ** size

    # quad returns (integral, absolute error estimate); only the integral
    # is needed.
    integral, _abs_error = quad(integrand, -np.inf, np.inf, args=(n,))
    return integral
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_descriptive_statistics(
    data,
    *,
    quantile_probs=(0.25, 0.5, 0.75),
    trim_alpha=None,
    winsor_limits=None,
    weights=None,
):
    """
    Compute descriptive statistics for a single numeric variable.

    Missing values are dropped and the remaining observations are cast to
    float before any statistic is computed.

    Parameters
    ----------
    data : array-like
        Observations; anything accepted by ``pandas.Series``.
    quantile_probs : float or sequence of float, optional
        Probabilities passed to ``numpy.quantile``; one "Quantiles" row is
        emitted per probability, labelled ``Q<p>``.
    trim_alpha : float, optional
        When given, adds a trimmed mean computed with
        ``scipy.stats.trim_mean(x, trim_alpha)``.
    winsor_limits : float or pair of float, optional
        When given, adds a winsorized mean. A scalar is applied to both
        tails; a pair is forwarded to ``scipy.stats.mstats.winsorize``.
    weights : array-like, optional
        When given, adds a weighted mean. The weights are converted to a
        ``pandas.Series`` and selected by the index labels of the
        non-missing observations.

    Returns
    -------
    pandas.DataFrame
        One row per statistic with columns ``"Statistic Type"``,
        ``"Measure"``, ``"Value"``, ``"Bias Corrected"`` (NaN where no
        correction is computed) and ``"Robust"`` (0 or 1).

    Raises
    ------
    ValueError
        If ``data`` contains no non-missing observations.
    """
    x = pd.Series(data).dropna().astype(float)
    n = len(x)
    if n == 0:
        raise ValueError(
            "compute_descriptive_statistics requires at least one "
            "non-missing observation"
        )

    # scipy routines convert to ndarray internally; hand them the raw values.
    values = x.to_numpy()

    rows = []

    # ---- Quantiles -------------------------------------------------------
    probs = np.atleast_1d(quantile_probs)
    rows.extend(
        ["Quantiles", f"Q{p}", q, np.nan, 0]
        for p, q in zip(probs, np.quantile(x, probs))
    )

    # ---- Central tendency ------------------------------------------------
    mean = x.mean()
    median = np.median(x)
    # Interquartile mean: drop the lowest and highest quarter of the sample.
    iq_mean = trim_mean(values, 0.25)

    rows.extend([
        ["Central Tendency", "Mean", mean, np.nan, 0],
        ["Central Tendency", "Median", median, np.nan, 1],
        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
    ])

    if weights is not None:
        # Keep only the weights whose labels survived the dropna above.
        w = pd.Series(weights).loc[x.index].astype(float)
        w_mean = np.average(x, weights=w)
        rows.append(["Central Tendency", "Weighted Mean", w_mean, np.nan, 0])

    if trim_alpha is not None:
        t_mean = trim_mean(values, trim_alpha)
        rows.append([
            "Central Tendency",
            f"Trimmed Mean ({trim_alpha})",
            t_mean,
            np.nan,
            1,
        ])

    if winsor_limits is not None:
        from scipy.stats.mstats import winsorize

        # A scalar limit means "same fraction on both tails"; normalising it
        # here keeps the label construction from failing on tuple(scalar).
        if np.isscalar(winsor_limits):
            limits = (winsor_limits, winsor_limits)
        else:
            limits = tuple(winsor_limits)
        xw = winsorize(values, limits)
        rows.append([
            "Central Tendency",
            f"Winsorized Mean {limits}",
            np.mean(xw),
            np.nan,
            1,
        ])

    # Geometric and harmonic means are only reported for strictly positive
    # data.
    if np.all(x > 0):
        rows.extend([
            ["Central Tendency", "Geometric Mean", gmean(values), np.nan, 0],
            ["Central Tendency", "Harmonic Mean", hmean(values), np.nan, 0],
        ])

    # ---- Dispersion ------------------------------------------------------
    var0 = np.var(x, ddof=0)
    std0 = np.std(x, ddof=0)
    rng = x.max() - x.min()
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    mad = median_abs_deviation(values)
    aad = np.mean(np.abs(x - mean))

    if n > 1:
        var1 = np.var(x, ddof=1)
        std1 = np.std(x, ddof=1)
        std0_corrected = std0 * np.sqrt(n / (n - 1)) / c4(n)
        std1_corrected = std1 / c4(n)
        rng_corrected = rng / d2(n)
    else:
        # With a single observation the ddof=1 estimators and the
        # n-dependent corrections divide by zero; report them as NaN.
        var1 = std1 = np.nan
        std0_corrected = std1_corrected = rng_corrected = np.nan

    # 0.75 quantile of the standard normal, used to rescale IQR and MAD.
    z75 = norm.ppf(0.75)

    rows.extend([
        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
        ["Dispersion", "Std (ddof=0)", std0, std0_corrected, 0],
        ["Dispersion", "Std (ddof=1)", std1, std1_corrected, 0],
        ["Dispersion", "Range", rng, rng_corrected, 0],
        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
        ["Dispersion", "IQR", iqr, iqr / (2 * z75), 1],
        ["Dispersion", "MAD", mad, mad / z75, 1],
    ])

    # ---- Shape -----------------------------------------------------------
    # Each kurtosis variant is computed once; the excess rows subtract 3.
    kurt_moment = kurtosis(values, fisher=False)
    kurt_kstat = kurtosis(values, fisher=False, bias=False)

    rows.extend([
        ["Shape", "Skewness (central moments)", skew(values), np.nan, 0],
        ["Shape", "Skewness (k-statistic)", skew(values, bias=False), np.nan, 0],
        ["Shape", "Kurtosis (central moments)", kurt_moment, np.nan, 0],
        ["Shape", "Kurtosis (k-statistic)", kurt_kstat, np.nan, 0],
        ["Shape", "Excess Kurtosis (central moments)", kurt_moment - 3, np.nan, 0],
        ["Shape", "Excess Kurtosis (k-statistic)", kurt_kstat - 3, np.nan, 0],
    ])

    return pd.DataFrame(
        rows,
        columns=[
            "Statistic Type",
            "Measure",
            "Value",
            "Bias Corrected",
            "Robust",
        ],
    )
|
|
|