Spaces:

Beam2513
/

again

Sleeping

App Files Files Community

again / core /estimation /descriptive.py

Beam2513

Upload 127 files

798602c verified about 1 month ago

raw

history blame contribute delete

5.77 kB

	# ui/stats/estimation/descriptive.py

	from functools import lru_cache
	import numpy as np
	import pandas as pd
	from scipy.stats import (
	trim_mean,
	gmean,
	hmean,
	skew,
	kurtosis,
	norm
	)
	from scipy.special import loggamma
	from scipy.integrate import quad
	from scipy.stats import median_abs_deviation

	# ------------------------------------------------------------------
	# Bias-correction constants (user-approved implementations)
	# ------------------------------------------------------------------

	@lru_cache(maxsize=None)
	def c4(n: int) -> float:
	    """Return the c4(n) bias-correction constant for the sample standard deviation.

	    Evaluates ``sqrt(2 / (n - 1)) * Gamma(n / 2) / Gamma((n - 1) / 2)``.
	    The gamma ratio is formed in log-space with ``loggamma`` so that large
	    ``n`` does not overflow. Results are memoised per ``n`` by ``lru_cache``.

	    Args:
	        n: Sample size; must be at least 2.

	    Returns:
	        The constant as a float. Dividing the ``ddof=1`` standard deviation
	        by it gives the bias-corrected estimate used elsewhere in this module.

	    Raises:
	        ValueError: If ``n < 2``; the formula divides by ``n - 1`` and is
	            undefined for smaller samples.
	    """
	    # Fail with a clear message instead of a bare ZeroDivisionError (n == 1)
	    # or a silent NaN (n < 1).
	    if n < 2:
	        raise ValueError(f"c4(n) requires n >= 2, got n={n}")

	    log_c4 = (
	        np.log(np.sqrt(2 / (n - 1)))
	        + loggamma(n / 2)
	        - loggamma((n - 1) / 2)
	    )
	    return float(np.exp(log_c4))


	@lru_cache(maxsize=None)
	def d2(n: int) -> float:
	    """Return the d2(n) bias-correction constant for the sample range.

	    Numerically integrates ``1 - (1 - Phi(x))**n - Phi(x)**n`` over the whole
	    real line with ``scipy.integrate.quad``, where ``Phi`` is the standard
	    normal CDF. Results are memoised per ``n`` by ``lru_cache``.

	    Args:
	        n: Sample size; must be at least 1.

	    Returns:
	        The value of the integral (first element of ``quad``'s result).
	        Dividing a sample range by it gives the bias-corrected estimate
	        used elsewhere in this module.

	    Raises:
	        ValueError: If ``n < 1``; the integrand does not vanish in the tails
	            and the integral diverges.
	    """
	    if n < 1:
	        raise ValueError(f"d2(n) requires n >= 1, got n={n}")

	    def integrand(x):
	        # Evaluate the CDF once and reuse it for both tail terms.
	        p = norm.cdf(x)
	        return 1 - (1 - p) ** n - p ** n

	    return quad(integrand, -np.inf, np.inf)[0]


	# ------------------------------------------------------------------
	# Main computation function
	# ------------------------------------------------------------------

	def compute_descriptive_statistics(
	    data,
	    *,
	    quantile_probs=(0.25, 0.5, 0.75),
	    trim_alpha=None,
	    winsor_limits=None,
	    weights=None,
	):
	    """
	    Compute descriptive statistics for a single numeric variable.

	    Missing values are dropped and the remainder is cast to float before
	    anything is computed.

	    Args:
	        data: Values convertible to a ``pandas.Series``.
	        quantile_probs: Probability or probabilities passed to
	            ``np.quantile``; one "Quantiles" row is emitted per value.
	        trim_alpha: If given, proportion cut from each end for an extra
	            "Trimmed Mean" row (passed to ``scipy.stats.trim_mean``).
	        winsor_limits: If given, limits for an extra "Winsorized Mean" row.
	            Either a ``(lower, upper)`` pair or a single number applied to
	            both tails.
	        weights: If given, weights for an extra "Weighted Mean" row. A list,
	            tuple or ndarray of the same length as ``data`` is matched to
	            ``data`` by position; anything else is wrapped in a Series and
	            matched by index label.

	    Returns:
	        A ``pandas.DataFrame`` with one row per measure and the columns
	        "Statistic Type", "Measure", "Value", "Bias Corrected" (NaN where no
	        correction is defined) and "Robust" (a 0/1 flag).

	    Raises:
	        ValueError: If ``data`` has no non-missing observations.
	    """

	    # --- preparation ------------------------------------------------
	    raw = pd.Series(data)
	    x = raw.dropna().astype(float)
	    n = len(x)
	    if n == 0:
	        raise ValueError("data contains no non-missing observations")

	    rows = []

	    # ----------------------------------------------------------------
	    # Quantiles
	    # ----------------------------------------------------------------
	    probs = np.atleast_1d(quantile_probs)
	    rows.extend(
	        ["Quantiles", f"Q{p}", q, np.nan, 0]
	        for p, q in zip(probs, np.quantile(x, probs))
	    )

	    # ----------------------------------------------------------------
	    # Central Tendency
	    # ----------------------------------------------------------------
	    mean = x.mean()
	    median = np.median(x)
	    iq_mean = trim_mean(x, 0.25)

	    rows.extend([
	        ["Central Tendency", "Mean", mean, np.nan, 0],
	        ["Central Tendency", "Median", median, np.nan, 1],
	        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
	    ])

	    # Weighted mean (additional, never replaces mean)
	    if weights is not None:
	        if isinstance(weights, (list, tuple, np.ndarray)) and len(weights) == len(raw):
	            # Plain sequences carry no labels: pair them with the data by
	            # position and drop the entries whose observation was missing.
	            w = np.asarray(weights, dtype=float)[raw.notna().to_numpy()]
	        else:
	            # Labelled weights are aligned on the surviving index labels.
	            w = pd.Series(weights).loc[x.index].astype(float)
	        rows.append([
	            "Central Tendency",
	            "Weighted Mean",
	            np.average(x, weights=w),
	            np.nan,
	            0,
	        ])

	    # Trimmed mean
	    if trim_alpha is not None:
	        rows.append([
	            "Central Tendency",
	            f"Trimmed Mean ({trim_alpha})",
	            trim_mean(x, trim_alpha),
	            np.nan,
	            1,
	        ])

	    # Winsorized mean
	    if winsor_limits is not None:
	        from scipy.stats.mstats import winsorize

	        # A single number means the same limit on both tails; normalising it
	        # here keeps the label construction from failing on a scalar.
	        if np.isscalar(winsor_limits):
	            limits = (winsor_limits, winsor_limits)
	        else:
	            limits = tuple(winsor_limits)
	        xw = winsorize(x, limits)
	        rows.append([
	            "Central Tendency",
	            f"Winsorized Mean {limits}",
	            np.mean(xw),
	            np.nan,
	            1,
	        ])

	    # Geometric & harmonic means (only defined for strictly positive data)
	    if np.all(x > 0):
	        rows.extend([
	            ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0],
	            ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0],
	        ])

	    # ----------------------------------------------------------------
	    # Dispersion
	    # ----------------------------------------------------------------
	    var0 = np.var(x, ddof=0)
	    var1 = np.var(x, ddof=1)  # unbiased
	    std0 = np.std(x, ddof=0)
	    std1 = np.std(x, ddof=1)
	    rng = x.max() - x.min()
	    iqr = np.subtract(*np.percentile(x, [75, 25]))
	    mad = median_abs_deviation(x)
	    aad = np.mean(np.abs(x - mean))

	    if n >= 2:
	        c4_n = c4(n)
	        std0_corrected = std0 * np.sqrt(n / (n - 1)) / c4_n
	        std1_corrected = std1 / c4_n
	        rng_corrected = rng / d2(n)
	    else:
	        # With a single observation the n - 1 denominators are zero, so the
	        # corrections are undefined; report NaN instead of crashing.
	        std0_corrected = std1_corrected = rng_corrected = np.nan

	    # norm.ppf(0.75) is the standard normal upper quartile used to rescale
	    # the IQR and MAD.
	    z75 = norm.ppf(0.75)

	    rows.extend([
	        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
	        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
	        ["Dispersion", "Std (ddof=0)", std0, std0_corrected, 0],
	        ["Dispersion", "Std (ddof=1)", std1, std1_corrected, 0],
	        ["Dispersion", "Range", rng, rng_corrected, 0],
	        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
	        ["Dispersion", "IQR", iqr, iqr / (2 * z75), 1],
	        ["Dispersion", "MAD", mad, mad / z75, 1],
	    ])

	    # ----------------------------------------------------------------
	    # Shape
	    # ----------------------------------------------------------------
	    # Each kurtosis variant is computed once and reused for its excess form.
	    kurt_moment = kurtosis(x, fisher=False)
	    kurt_kstat = kurtosis(x, fisher=False, bias=False)

	    rows.extend([
	        ["Shape", "Skewness (central moments)", skew(x), np.nan, 0],
	        ["Shape", "Skewness (k-statistic)", skew(x, bias=False), np.nan, 0],
	        ["Shape", "Kurtosis (central moments)", kurt_moment, np.nan, 0],
	        ["Shape", "Kurtosis (k-statistic)", kurt_kstat, np.nan, 0],
	        ["Shape", "Excess Kurtosis (central moments)", kurt_moment - 3, np.nan, 0],
	        ["Shape", "Excess Kurtosis (k-statistic)", kurt_kstat - 3, np.nan, 0],
	    ])

	    # ----------------------------------------------------------------
	    # Final table
	    # ----------------------------------------------------------------
	    return pd.DataFrame(
	        rows,
	        columns=[
	            "Statistic Type",
	            "Measure",
	            "Value",
	            "Bias Corrected",
	            "Robust",
	        ],
	    )