| """ |
| Additional statistics functions with support for masked arrays. |
| |
| """ |
|
|
| |
|
|
|
|
| __all__ = ['compare_medians_ms', |
| 'hdquantiles', 'hdmedian', 'hdquantiles_sd', |
| 'idealfourths', |
| 'median_cihs','mjci','mquantiles_cimj', |
| 'rsh', |
| 'trimmed_mean_ci',] |
|
|
|
|
| import numpy as np |
| from numpy import float64, ndarray |
|
|
| import numpy.ma as ma |
| from numpy.ma import MaskedArray |
|
|
| from . import _mstats_basic as mstats |
|
|
| from scipy.stats.distributions import norm, beta, t, binom |
|
|
|
|
| def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,): |
| """ |
| Computes quantile estimates with the Harrell-Davis method. |
| |
| The quantile estimates are calculated as a weighted linear combination |
| of order statistics. |
| |
| Parameters |
| ---------- |
| data : array_like |
| Data array. |
| prob : sequence, optional |
| Sequence of probabilities at which to compute the quantiles. |
| axis : int or None, optional |
| Axis along which to compute the quantiles. If None, use a flattened |
| array. |
| var : bool, optional |
| Whether to return the variance of the estimate. |
| |
| Returns |
| ------- |
| hdquantiles : MaskedArray |
| A (p,) array of quantiles (if `var` is False), or a (2,p) array of |
| quantiles and variances (if `var` is True), where ``p`` is the |
| number of quantiles. |
| |
| See Also |
| -------- |
| hdquantiles_sd |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from scipy.stats.mstats import hdquantiles |
| >>> |
| >>> # Sample data |
| >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4]) |
| >>> |
| >>> # Probabilities at which to compute quantiles |
| >>> probabilities = [0.25, 0.5, 0.75] |
| >>> |
| >>> # Compute Harrell-Davis quantile estimates |
| >>> quantile_estimates = hdquantiles(data, prob=probabilities) |
| >>> |
| >>> # Display the quantile estimates |
| >>> for i, quantile in enumerate(probabilities): |
| ... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}") |
| 25th percentile: 3.1505820231763066 # may vary |
| 50th percentile: 5.194344084883956 |
| 75th percentile: 7.430626414674935 |
| |
| """ |
| def _hd_1D(data,prob,var): |
| "Computes the HD quantiles for a 1D array. Returns nan for invalid data." |
| xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) |
| |
| n = xsorted.size |
|
|
| hd = np.empty((2,len(prob)), float64) |
| if n < 2: |
| hd.flat = np.nan |
| if var: |
| return hd |
| return hd[0] |
|
|
| v = np.arange(n+1) / float(n) |
| betacdf = beta.cdf |
| for (i,p) in enumerate(prob): |
| _w = betacdf(v, (n+1)*p, (n+1)*(1-p)) |
| w = _w[1:] - _w[:-1] |
| hd_mean = np.dot(w, xsorted) |
| hd[0,i] = hd_mean |
| |
| hd[1,i] = np.dot(w, (xsorted-hd_mean)**2) |
| |
| hd[0, prob == 0] = xsorted[0] |
| hd[0, prob == 1] = xsorted[-1] |
| if var: |
| hd[1, prob == 0] = hd[1, prob == 1] = np.nan |
| return hd |
| return hd[0] |
| |
| data = ma.array(data, copy=False, dtype=float64) |
| p = np.atleast_1d(np.asarray(prob)) |
| |
| if (axis is None) or (data.ndim == 1): |
| result = _hd_1D(data, p, var) |
| else: |
| if data.ndim > 2: |
| raise ValueError("Array 'data' must be at most two dimensional, " |
| "but got data.ndim = %d" % data.ndim) |
| result = ma.apply_along_axis(_hd_1D, axis, data, p, var) |
|
|
| return ma.fix_invalid(result, copy=False) |
|
|
|
|
| def hdmedian(data, axis=-1, var=False): |
| """ |
| Returns the Harrell-Davis estimate of the median along the given axis. |
| |
| Parameters |
| ---------- |
| data : ndarray |
| Data array. |
| axis : int, optional |
| Axis along which to compute the quantiles. If None, use a flattened |
| array. |
| var : bool, optional |
| Whether to return the variance of the estimate. |
| |
| Returns |
| ------- |
| hdmedian : MaskedArray |
| The median values. If ``var=True``, the variance is returned inside |
| the masked array. E.g. for a 1-D array the shape change from (1,) to |
| (2,). |
| |
| """ |
| result = hdquantiles(data,[0.5], axis=axis, var=var) |
| return result.squeeze() |
|
|
|
|
| def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None): |
| """ |
| The standard error of the Harrell-Davis quantile estimates by jackknife. |
| |
| Parameters |
| ---------- |
| data : array_like |
| Data array. |
| prob : sequence, optional |
| Sequence of quantiles to compute. |
| axis : int, optional |
| Axis along which to compute the quantiles. If None, use a flattened |
| array. |
| |
| Returns |
| ------- |
| hdquantiles_sd : MaskedArray |
| Standard error of the Harrell-Davis quantile estimates. |
| |
| See Also |
| -------- |
| hdquantiles |
| |
| """ |
| def _hdsd_1D(data, prob): |
| "Computes the std error for 1D arrays." |
| xsorted = np.sort(data.compressed()) |
| n = len(xsorted) |
|
|
| hdsd = np.empty(len(prob), float64) |
| if n < 2: |
| hdsd.flat = np.nan |
|
|
| vv = np.arange(n) / float(n-1) |
| betacdf = beta.cdf |
|
|
| for (i,p) in enumerate(prob): |
| _w = betacdf(vv, n*p, n*(1-p)) |
| w = _w[1:] - _w[:-1] |
| |
| |
| mx_ = np.zeros_like(xsorted) |
| mx_[1:] = np.cumsum(w * xsorted[:-1]) |
| |
| mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1] |
| hdsd[i] = np.sqrt(mx_.var() * (n - 1)) |
| return hdsd |
|
|
| |
| data = ma.array(data, copy=False, dtype=float64) |
| p = np.atleast_1d(np.asarray(prob)) |
| |
| if (axis is None): |
| result = _hdsd_1D(data, p) |
| else: |
| if data.ndim > 2: |
| raise ValueError("Array 'data' must be at most two dimensional, " |
| "but got data.ndim = %d" % data.ndim) |
| result = ma.apply_along_axis(_hdsd_1D, axis, data, p) |
|
|
| return ma.fix_invalid(result, copy=False).ravel() |
|
|
|
|
| def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True), |
| alpha=0.05, axis=None): |
| """ |
| Selected confidence interval of the trimmed mean along the given axis. |
| |
| Parameters |
| ---------- |
| data : array_like |
| Input data. |
| limits : {None, tuple}, optional |
| None or a two item tuple. |
| Tuple of the percentages to cut on each side of the array, with respect |
| to the number of unmasked data, as floats between 0. and 1. If ``n`` |
| is the number of unmasked data before trimming, then |
| (``n * limits[0]``)th smallest data and (``n * limits[1]``)th |
| largest data are masked. The total number of unmasked data after |
| trimming is ``n * (1. - sum(limits))``. |
| The value of one limit can be set to None to indicate an open interval. |
| |
| Defaults to (0.2, 0.2). |
| inclusive : (2,) tuple of boolean, optional |
| If relative==False, tuple indicating whether values exactly equal to |
| the absolute limits are allowed. |
| If relative==True, tuple indicating whether the number of data being |
| masked on each side should be rounded (True) or truncated (False). |
| |
| Defaults to (True, True). |
| alpha : float, optional |
| Confidence level of the intervals. |
| |
| Defaults to 0.05. |
| axis : int, optional |
| Axis along which to cut. If None, uses a flattened version of `data`. |
| |
| Defaults to None. |
| |
| Returns |
| ------- |
| trimmed_mean_ci : (2,) ndarray |
| The lower and upper confidence intervals of the trimmed data. |
| |
| """ |
| data = ma.array(data, copy=False) |
| trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis) |
| tmean = trimmed.mean(axis) |
| tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis) |
| df = trimmed.count(axis) - 1 |
| tppf = t.ppf(1-alpha/2.,df) |
| return np.array((tmean - tppf*tstde, tmean+tppf*tstde)) |
|
|
|
|
| def mjci(data, prob=(0.25, 0.5, 0.75), axis=None): |
| """ |
| Returns the Maritz-Jarrett estimators of the standard error of selected |
| experimental quantiles of the data. |
| |
| Parameters |
| ---------- |
| data : ndarray |
| Data array. |
| prob : sequence, optional |
| Sequence of quantiles to compute. |
| axis : int or None, optional |
| Axis along which to compute the quantiles. If None, use a flattened |
| array. |
| |
| """ |
| def _mjci_1D(data, p): |
| data = np.sort(data.compressed()) |
| n = data.size |
| prob = (np.array(p) * n + 0.5).astype(int) |
| betacdf = beta.cdf |
|
|
| mj = np.empty(len(prob), float64) |
| x = np.arange(1,n+1, dtype=float64) / n |
| y = x - 1./n |
| for (i,m) in enumerate(prob): |
| W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m) |
| C1 = np.dot(W,data) |
| C2 = np.dot(W,data**2) |
| mj[i] = np.sqrt(C2 - C1**2) |
| return mj |
|
|
| data = ma.array(data, copy=False) |
| if data.ndim > 2: |
| raise ValueError("Array 'data' must be at most two dimensional, " |
| "but got data.ndim = %d" % data.ndim) |
|
|
| p = np.atleast_1d(np.asarray(prob)) |
| |
| if (axis is None): |
| return _mjci_1D(data, p) |
| else: |
| return ma.apply_along_axis(_mjci_1D, axis, data, p) |
|
|
|
|
| def mquantiles_cimj(data, prob=(0.25, 0.50, 0.75), alpha=0.05, axis=None): |
| """ |
| Computes the alpha confidence interval for the selected quantiles of the |
| data, with Maritz-Jarrett estimators. |
| |
| Parameters |
| ---------- |
| data : ndarray |
| Data array. |
| prob : sequence, optional |
| Sequence of quantiles to compute. |
| alpha : float, optional |
| Confidence level of the intervals. |
| axis : int or None, optional |
| Axis along which to compute the quantiles. |
| If None, use a flattened array. |
| |
| Returns |
| ------- |
| ci_lower : ndarray |
| The lower boundaries of the confidence interval. Of the same length as |
| `prob`. |
| ci_upper : ndarray |
| The upper boundaries of the confidence interval. Of the same length as |
| `prob`. |
| |
| """ |
| alpha = min(alpha, 1 - alpha) |
| z = norm.ppf(1 - alpha/2.) |
| xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) |
| smj = mjci(data, prob, axis=axis) |
| return (xq - z * smj, xq + z * smj) |
|
|
|
|
| def median_cihs(data, alpha=0.05, axis=None): |
| """ |
| Computes the alpha-level confidence interval for the median of the data. |
| |
| Uses the Hettmasperger-Sheather method. |
| |
| Parameters |
| ---------- |
| data : array_like |
| Input data. Masked values are discarded. The input should be 1D only, |
| or `axis` should be set to None. |
| alpha : float, optional |
| Confidence level of the intervals. |
| axis : int or None, optional |
| Axis along which to compute the quantiles. If None, use a flattened |
| array. |
| |
| Returns |
| ------- |
| median_cihs |
| Alpha level confidence interval. |
| |
| """ |
| def _cihs_1D(data, alpha): |
| data = np.sort(data.compressed()) |
| n = len(data) |
| alpha = min(alpha, 1-alpha) |
| k = int(binom._ppf(alpha/2., n, 0.5)) |
| gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) |
| if gk < 1-alpha: |
| k -= 1 |
| gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) |
| gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5) |
| I = (gk - 1 + alpha)/(gk - gkk) |
| lambd = (n-k) * I / float(k + (n-2*k)*I) |
| lims = (lambd*data[k] + (1-lambd)*data[k-1], |
| lambd*data[n-k-1] + (1-lambd)*data[n-k]) |
| return lims |
| data = ma.array(data, copy=False) |
| |
| if (axis is None): |
| result = _cihs_1D(data, alpha) |
| else: |
| if data.ndim > 2: |
| raise ValueError("Array 'data' must be at most two dimensional, " |
| "but got data.ndim = %d" % data.ndim) |
| result = ma.apply_along_axis(_cihs_1D, axis, data, alpha) |
|
|
| return result |
|
|
|
|
| def compare_medians_ms(group_1, group_2, axis=None): |
| """ |
| Compares the medians from two independent groups along the given axis. |
| |
| The comparison is performed using the McKean-Schrader estimate of the |
| standard error of the medians. |
| |
| Parameters |
| ---------- |
| group_1 : array_like |
| First dataset. Has to be of size >=7. |
| group_2 : array_like |
| Second dataset. Has to be of size >=7. |
| axis : int, optional |
| Axis along which the medians are estimated. If None, the arrays are |
| flattened. If `axis` is not None, then `group_1` and `group_2` |
| should have the same shape. |
| |
| Returns |
| ------- |
| compare_medians_ms : {float, ndarray} |
| If `axis` is None, then returns a float, otherwise returns a 1-D |
| ndarray of floats with a length equal to the length of `group_1` |
| along `axis`. |
| |
| Examples |
| -------- |
| |
| >>> from scipy import stats |
| >>> a = [1, 2, 3, 4, 5, 6, 7] |
| >>> b = [8, 9, 10, 11, 12, 13, 14] |
| >>> stats.mstats.compare_medians_ms(a, b, axis=None) |
| 1.0693225866553746e-05 |
| |
| The function is vectorized to compute along a given axis. |
| |
| >>> import numpy as np |
| >>> rng = np.random.default_rng() |
| >>> x = rng.random(size=(3, 7)) |
| >>> y = rng.random(size=(3, 8)) |
| >>> stats.mstats.compare_medians_ms(x, y, axis=1) |
| array([0.36908985, 0.36092538, 0.2765313 ]) |
| |
| References |
| ---------- |
| .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods |
| for studentizing the sample median." Communications in |
| Statistics-Simulation and Computation 13.6 (1984): 751-773. |
| |
| """ |
| (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis)) |
| (std_1, std_2) = (mstats.stde_median(group_1, axis=axis), |
| mstats.stde_median(group_2, axis=axis)) |
| W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2) |
| return 1 - norm.cdf(W) |
|
|
|
|
| def idealfourths(data, axis=None): |
| """ |
| Returns an estimate of the lower and upper quartiles. |
| |
| Uses the ideal fourths algorithm. |
| |
| Parameters |
| ---------- |
| data : array_like |
| Input array. |
| axis : int, optional |
| Axis along which the quartiles are estimated. If None, the arrays are |
| flattened. |
| |
| Returns |
| ------- |
| idealfourths : {list of floats, masked array} |
| Returns the two internal values that divide `data` into four parts |
| using the ideal fourths algorithm either along the flattened array |
| (if `axis` is None) or along `axis` of `data`. |
| |
| """ |
| def _idf(data): |
| x = data.compressed() |
| n = len(x) |
| if n < 3: |
| return [np.nan,np.nan] |
| (j,h) = divmod(n/4. + 5/12.,1) |
| j = int(j) |
| qlo = (1-h)*x[j-1] + h*x[j] |
| k = n - j |
| qup = (1-h)*x[k] + h*x[k-1] |
| return [qlo, qup] |
| data = ma.sort(data, axis=axis).view(MaskedArray) |
| if (axis is None): |
| return _idf(data) |
| else: |
| return ma.apply_along_axis(_idf, axis, data) |
|
|
|
|
| def rsh(data, points=None): |
| """ |
| Evaluates Rosenblatt's shifted histogram estimators for each data point. |
| |
| Rosenblatt's estimator is a centered finite-difference approximation to the |
| derivative of the empirical cumulative distribution function. |
| |
| Parameters |
| ---------- |
| data : sequence |
| Input data, should be 1-D. Masked values are ignored. |
| points : sequence or None, optional |
| Sequence of points where to evaluate Rosenblatt shifted histogram. |
| If None, use the data. |
| |
| """ |
| data = ma.array(data, copy=False) |
| if points is None: |
| points = data |
| else: |
| points = np.atleast_1d(np.asarray(points)) |
|
|
| if data.ndim != 1: |
| raise AttributeError("The input array should be 1D only !") |
|
|
| n = data.count() |
| r = idealfourths(data, axis=None) |
| h = 1.2 * (r[-1]-r[0]) / n**(1./5) |
| nhi = (data[:,None] <= points[None,:] + h).sum(0) |
| nlo = (data[:,None] < points[None,:] - h).sum(0) |
| return (nhi-nlo) / (2.*n*h) |
|
|