Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified 12 months ago

16.4 kB

	"""
	Additional statistics functions with support for masked arrays.

	"""

	# Original author (2007): Pierre GF Gerard-Marchant


	__all__ = ['compare_medians_ms',
	'hdquantiles', 'hdmedian', 'hdquantiles_sd',
	'idealfourths',
	'median_cihs','mjci','mquantiles_cimj',
	'rsh',
	'trimmed_mean_ci',]


	import numpy as np
	from numpy import float64, ndarray

	import numpy.ma as ma
	from numpy.ma import MaskedArray

	from . import _mstats_basic as mstats

	from scipy.stats.distributions import norm, beta, t, binom


	def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,):
	"""
	Computes quantile estimates with the Harrell-Davis method.

	The quantile estimates are calculated as a weighted linear combination
	of order statistics.

	Parameters
	----------
	data : array_like
	Data array.
	prob : sequence, optional
	Sequence of probabilities at which to compute the quantiles.
	axis : int or None, optional
	Axis along which to compute the quantiles. If None, use a flattened
	array.
	var : bool, optional
	Whether to return the variance of the estimate.

	Returns
	-------
	hdquantiles : MaskedArray
	A (p,) array of quantiles (if `var` is False), or a (2,p) array of
	quantiles and variances (if `var` is True), where ``p`` is the
	number of quantiles.

	See Also
	--------
	hdquantiles_sd

	Examples
	--------
	>>> import numpy as np
	>>> from scipy.stats.mstats import hdquantiles
	>>>
	>>> # Sample data
	>>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
	>>>
	>>> # Probabilities at which to compute quantiles
	>>> probabilities = [0.25, 0.5, 0.75]
	>>>
	>>> # Compute Harrell-Davis quantile estimates
	>>> quantile_estimates = hdquantiles(data, prob=probabilities)
	>>>
	>>> # Display the quantile estimates
	>>> for i, quantile in enumerate(probabilities):
	... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
	25th percentile: 3.1505820231763066 # may vary
	50th percentile: 5.194344084883956
	75th percentile: 7.430626414674935

	"""
	def _hd_1D(data,prob,var):
	"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
	xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
	# Don't use length here, in case we have a numpy scalar
	n = xsorted.size

	hd = np.empty((2,len(prob)), float64)
	if n < 2:
	hd.flat = np.nan
	if var:
	return hd
	return hd[0]

	v = np.arange(n+1) / float(n)
	betacdf = beta.cdf
	for (i,p) in enumerate(prob):
	_w = betacdf(v, (n+1)p, (n+1)(1-p))
	w = _w[1:] - _w[:-1]
	hd_mean = np.dot(w, xsorted)
	hd[0,i] = hd_mean
	#
	hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
	#
	hd[0, prob == 0] = xsorted[0]
	hd[0, prob == 1] = xsorted[-1]
	if var:
	hd[1, prob == 0] = hd[1, prob == 1] = np.nan
	return hd
	return hd[0]
	# Initialization & checks
	data = ma.array(data, copy=False, dtype=float64)
	p = np.atleast_1d(np.asarray(prob))
	# Computes quantiles along axis (or globally)
	if (axis is None) or (data.ndim == 1):
	result = _hd_1D(data, p, var)
	else:
	if data.ndim > 2:
	raise ValueError("Array 'data' must be at most two dimensional, "
	"but got data.ndim = %d" % data.ndim)
	result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

	return ma.fix_invalid(result, copy=False)


	def hdmedian(data, axis=-1, var=False):
	"""
	Returns the Harrell-Davis estimate of the median along the given axis.

	Parameters
	----------
	data : ndarray
	Data array.
	axis : int, optional
	Axis along which to compute the quantiles. If None, use a flattened
	array.
	var : bool, optional
	Whether to return the variance of the estimate.

	Returns
	-------
	hdmedian : MaskedArray
	The median values. If ``var=True``, the variance is returned inside
	the masked array. E.g. for a 1-D array the shape change from (1,) to
	(2,).

	"""
	result = hdquantiles(data,[0.5], axis=axis, var=var)
	return result.squeeze()


	def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None):
	"""
	The standard error of the Harrell-Davis quantile estimates by jackknife.

	Parameters
	----------
	data : array_like
	Data array.
	prob : sequence, optional
	Sequence of quantiles to compute.
	axis : int, optional
	Axis along which to compute the quantiles. If None, use a flattened
	array.

	Returns
	-------
	hdquantiles_sd : MaskedArray
	Standard error of the Harrell-Davis quantile estimates.

	See Also
	--------
	hdquantiles

	"""
	def _hdsd_1D(data, prob):
	"Computes the std error for 1D arrays."
	xsorted = np.sort(data.compressed())
	n = len(xsorted)

	hdsd = np.empty(len(prob), float64)
	if n < 2:
	hdsd.flat = np.nan

	vv = np.arange(n) / float(n-1)
	betacdf = beta.cdf

	for (i,p) in enumerate(prob):
	_w = betacdf(vv, np, n(1-p))
	w = _w[1:] - _w[:-1]
	# cumulative sum of weights and data points if
	# ith point is left out for jackknife
	mx_ = np.zeros_like(xsorted)
	mx_[1:] = np.cumsum(w * xsorted[:-1])
	# similar but from the right
	mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
	hdsd[i] = np.sqrt(mx_.var() * (n - 1))
	return hdsd

	# Initialization & checks
	data = ma.array(data, copy=False, dtype=float64)
	p = np.atleast_1d(np.asarray(prob))
	# Computes quantiles along axis (or globally)
	if (axis is None):
	result = _hdsd_1D(data, p)
	else:
	if data.ndim > 2:
	raise ValueError("Array 'data' must be at most two dimensional, "
	"but got data.ndim = %d" % data.ndim)
	result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

	return ma.fix_invalid(result, copy=False).ravel()


	def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
	alpha=0.05, axis=None):
	"""
	Selected confidence interval of the trimmed mean along the given axis.

	Parameters
	----------
	data : array_like
	Input data.
	limits : {None, tuple}, optional
	None or a two item tuple.
	Tuple of the percentages to cut on each side of the array, with respect
	to the number of unmasked data, as floats between 0. and 1. If ``n``
	is the number of unmasked data before trimming, then
	(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
	largest data are masked. The total number of unmasked data after
	trimming is ``n * (1. - sum(limits))``.
	The value of one limit can be set to None to indicate an open interval.

	Defaults to (0.2, 0.2).
	inclusive : (2,) tuple of boolean, optional
	If relative==False, tuple indicating whether values exactly equal to
	the absolute limits are allowed.
	If relative==True, tuple indicating whether the number of data being
	masked on each side should be rounded (True) or truncated (False).

	Defaults to (True, True).
	alpha : float, optional
	Confidence level of the intervals.

	Defaults to 0.05.
	axis : int, optional
	Axis along which to cut. If None, uses a flattened version of `data`.

	Defaults to None.

	Returns
	-------
	trimmed_mean_ci : (2,) ndarray
	The lower and upper confidence intervals of the trimmed data.

	"""
	data = ma.array(data, copy=False)
	trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
	tmean = trimmed.mean(axis)
	tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
	df = trimmed.count(axis) - 1
	tppf = t.ppf(1-alpha/2.,df)
	return np.array((tmean - tppftstde, tmean+tppftstde))


	def mjci(data, prob=(0.25, 0.5, 0.75), axis=None):
	"""
	Returns the Maritz-Jarrett estimators of the standard error of selected
	experimental quantiles of the data.

	Parameters
	----------
	data : ndarray
	Data array.
	prob : sequence, optional
	Sequence of quantiles to compute.
	axis : int or None, optional
	Axis along which to compute the quantiles. If None, use a flattened
	array.

	"""
	def _mjci_1D(data, p):
	data = np.sort(data.compressed())
	n = data.size
	prob = (np.array(p) * n + 0.5).astype(int)
	betacdf = beta.cdf

	mj = np.empty(len(prob), float64)
	x = np.arange(1,n+1, dtype=float64) / n
	y = x - 1./n
	for (i,m) in enumerate(prob):
	W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
	C1 = np.dot(W,data)
	C2 = np.dot(W,data**2)
	mj[i] = np.sqrt(C2 - C1**2)
	return mj

	data = ma.array(data, copy=False)
	if data.ndim > 2:
	raise ValueError("Array 'data' must be at most two dimensional, "
	"but got data.ndim = %d" % data.ndim)

	p = np.atleast_1d(np.asarray(prob))
	# Computes quantiles along axis (or globally)
	if (axis is None):
	return _mjci_1D(data, p)
	else:
	return ma.apply_along_axis(_mjci_1D, axis, data, p)


	def mquantiles_cimj(data, prob=(0.25, 0.50, 0.75), alpha=0.05, axis=None):
	"""
	Computes the alpha confidence interval for the selected quantiles of the
	data, with Maritz-Jarrett estimators.

	Parameters
	----------
	data : ndarray
	Data array.
	prob : sequence, optional
	Sequence of quantiles to compute.
	alpha : float, optional
	Confidence level of the intervals.
	axis : int or None, optional
	Axis along which to compute the quantiles.
	If None, use a flattened array.

	Returns
	-------
	ci_lower : ndarray
	The lower boundaries of the confidence interval. Of the same length as
	`prob`.
	ci_upper : ndarray
	The upper boundaries of the confidence interval. Of the same length as
	`prob`.

	"""
	alpha = min(alpha, 1 - alpha)
	z = norm.ppf(1 - alpha/2.)
	xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
	smj = mjci(data, prob, axis=axis)
	return (xq - z * smj, xq + z * smj)


	def median_cihs(data, alpha=0.05, axis=None):
	"""
	Computes the alpha-level confidence interval for the median of the data.

	Uses the Hettmasperger-Sheather method.

	Parameters
	----------
	data : array_like
	Input data. Masked values are discarded. The input should be 1D only,
	or `axis` should be set to None.
	alpha : float, optional
	Confidence level of the intervals.
	axis : int or None, optional
	Axis along which to compute the quantiles. If None, use a flattened
	array.

	Returns
	-------
	median_cihs
	Alpha level confidence interval.

	"""
	def _cihs_1D(data, alpha):
	data = np.sort(data.compressed())
	n = len(data)
	alpha = min(alpha, 1-alpha)
	k = int(binom._ppf(alpha/2., n, 0.5))
	gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
	if gk < 1-alpha:
	k -= 1
	gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
	gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
	I = (gk - 1 + alpha)/(gk - gkk)
	lambd = (n-k) * I / float(k + (n-2k)I)
	lims = (lambddata[k] + (1-lambd)data[k-1],
	lambddata[n-k-1] + (1-lambd)data[n-k])
	return lims
	data = ma.array(data, copy=False)
	# Computes quantiles along axis (or globally)
	if (axis is None):
	result = _cihs_1D(data, alpha)
	else:
	if data.ndim > 2:
	raise ValueError("Array 'data' must be at most two dimensional, "
	"but got data.ndim = %d" % data.ndim)
	result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

	return result


	def compare_medians_ms(group_1, group_2, axis=None):
	"""
	Compares the medians from two independent groups along the given axis.

	The comparison is performed using the McKean-Schrader estimate of the
	standard error of the medians.

	Parameters
	----------
	group_1 : array_like
	First dataset. Has to be of size >=7.
	group_2 : array_like
	Second dataset. Has to be of size >=7.
	axis : int, optional
	Axis along which the medians are estimated. If None, the arrays are
	flattened. If `axis` is not None, then `group_1` and `group_2`
	should have the same shape.

	Returns
	-------
	compare_medians_ms : {float, ndarray}
	If `axis` is None, then returns a float, otherwise returns a 1-D
	ndarray of floats with a length equal to the length of `group_1`
	along `axis`.

	Examples
	--------

	>>> from scipy import stats
	>>> a = [1, 2, 3, 4, 5, 6, 7]
	>>> b = [8, 9, 10, 11, 12, 13, 14]
	>>> stats.mstats.compare_medians_ms(a, b, axis=None)
	1.0693225866553746e-05

	The function is vectorized to compute along a given axis.

	>>> import numpy as np
	>>> rng = np.random.default_rng()
	>>> x = rng.random(size=(3, 7))
	>>> y = rng.random(size=(3, 8))
	>>> stats.mstats.compare_medians_ms(x, y, axis=1)
	array([0.36908985, 0.36092538, 0.2765313 ])

	References
	----------
	.. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
	for studentizing the sample median." Communications in
	Statistics-Simulation and Computation 13.6 (1984): 751-773.

	"""
	(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
	(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
	mstats.stde_median(group_2, axis=axis))
	W = np.abs(med_1 - med_2) / ma.sqrt(std_12 + std_22)
	return 1 - norm.cdf(W)


	def idealfourths(data, axis=None):
	"""
	Returns an estimate of the lower and upper quartiles.

	Uses the ideal fourths algorithm.

	Parameters
	----------
	data : array_like
	Input array.
	axis : int, optional
	Axis along which the quartiles are estimated. If None, the arrays are
	flattened.

	Returns
	-------
	idealfourths : {list of floats, masked array}
	Returns the two internal values that divide `data` into four parts
	using the ideal fourths algorithm either along the flattened array
	(if `axis` is None) or along `axis` of `data`.

	"""
	def _idf(data):
	x = data.compressed()
	n = len(x)
	if n < 3:
	return [np.nan,np.nan]
	(j,h) = divmod(n/4. + 5/12.,1)
	j = int(j)
	qlo = (1-h)x[j-1] + hx[j]
	k = n - j
	qup = (1-h)x[k] + hx[k-1]
	return [qlo, qup]
	data = ma.sort(data, axis=axis).view(MaskedArray)
	if (axis is None):
	return _idf(data)
	else:
	return ma.apply_along_axis(_idf, axis, data)


	def rsh(data, points=None):
	"""
	Evaluates Rosenblatt's shifted histogram estimators for each data point.

	Rosenblatt's estimator is a centered finite-difference approximation to the
	derivative of the empirical cumulative distribution function.

	Parameters
	----------
	data : sequence
	Input data, should be 1-D. Masked values are ignored.
	points : sequence or None, optional
	Sequence of points where to evaluate Rosenblatt shifted histogram.
	If None, use the data.

	"""
	data = ma.array(data, copy=False)
	if points is None:
	points = data
	else:
	points = np.atleast_1d(np.asarray(points))

	if data.ndim != 1:
	raise AttributeError("The input array should be 1D only !")

	n = data.count()
	r = idealfourths(data, axis=None)
	h = 1.2 * (r[-1]-r[0]) / n**(1./5)
	nhi = (data[:,None] <= points[None,:] + h).sum(0)
	nlo = (data[:,None] < points[None,:] - h).sum(0)
	return (nhi-nlo) / (2.nh)