import os

import numpy as np
import numpy.testing as npt
from numpy.testing import assert_allclose, assert_equal
import pytest
from scipy import stats
from scipy.optimize import differential_evolution

from .test_continuous_basic import distcont
from scipy.stats._distn_infrastructure import FitError
from scipy.stats._distr_params import distdiscrete
from scipy.stats import goodness_of_fit


# This is not a proper statistical test for convergence; it only verifies
# that the estimated and true parameter values don't differ by too much.

fit_sizes = [1000, 5000, 10000]  # sample sizes to try

thresh_percent = 0.25  # percent of true parameter magnitude for fail cut-off
thresh_min = 0.75  # minimum absolute difference (estimated - true) to fail

# MLE fits of these distributions are expected to fail
# (xfail unless SCIPY_XFAIL=1).
mle_failing_fits = [
    'dpareto_lognorm',
    'gausshyper',
    'genexpon',
    'gengamma',
    'irwinhall',
    'kappa4',
    'ksone',
    'kstwo',
    'ncf',
    'ncx2',
    'truncexpon',
    'tukeylambda',
    'vonmises',
    'levy_stable',
    'trapezoid',
    'truncweibull_min',
    'studentized_range',
]

# MLE fits of these pass but are very slow (run with SCIPY_XSLOW=1).
mle_Xslow_fits = ['betaprime', 'crystalball', 'exponweib', 'f', 'geninvgauss',
                  'jf_skew_t', 'recipinvgauss', 'rel_breitwigner',
                  'vonmises_line']

# The MLE fit method of these distributions doesn't perform well when all
# parameters are fit, so test them with the location fixed at 0.
mle_use_floc0 = [
    'burr',
    'chi',
    'chi2',
    'mielke',
    'pearson3',
    'genhalflogistic',
    'rdist',
    'pareto',
    'powerlaw',
    'powerlognorm',
    'wrapcauchy',
    'rel_breitwigner',
]

# Method-of-moments (MM) fits of these distributions are expected to fail.
mm_failing_fits = ['alpha', 'betaprime', 'burr', 'burr12', 'cauchy', 'chi',
                   'chi2', 'crystalball', 'dgamma', 'dpareto_lognorm',
                   'dweibull', 'f', 'fatiguelife', 'fisk', 'foldcauchy',
                   'genextreme', 'gengamma', 'genhyperbolic', 'gennorm',
                   'genpareto', 'halfcauchy', 'invgamma', 'invweibull',
                   'irwinhall', 'jf_skew_t', 'johnsonsu', 'kappa3', 'ksone',
                   'kstwo', 'landau', 'levy', 'levy_l', 'levy_stable',
                   'loglaplace', 'lomax', 'mielke', 'nakagami', 'ncf', 'nct',
                   'ncx2', 'pareto', 'powerlognorm', 'powernorm',
                   'rel_breitwigner', 'skewcauchy', 't', 'trapezoid', 'triang',
                   'truncpareto', 'truncweibull_min', 'tukeylambda',
                   'studentized_range']

# MM fits of these are so slow that they are treated as expected failures.
mm_XXslow_fits = ['argus', 'exponpow', 'exponweib', 'gausshyper', 'genexpon',
                  'genhalflogistic', 'halfgennorm', 'gompertz', 'johnsonsb',
                  'kappa4', 'kstwobign', 'recipinvgauss',
                  'truncexpon', 'vonmises', 'vonmises_line']

# MM fits of these pass but are very slow (run with SCIPY_XSLOW=1).
mm_Xslow_fits = ['wrapcauchy']

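# Map each fit method to the distributions whose fit is expected to fail or
# to be very slow; `test_cont_fit` below consults these and honors the
# SCIPY_XFAIL / SCIPY_XSLOW environment variables.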
| failing_fits = {"MM": mm_failing_fits + mm_XXslow_fits, "MLE": mle_failing_fits} |
| xslow_fits = {"MM": mm_Xslow_fits, "MLE": mle_Xslow_fits} |
| fail_interval_censored = {"truncpareto"} |
|
|
| |
| skip_fit = [ |
| 'erlang', |
| 'genhyperbolic', 'norminvgauss', |
| ] |
|
|
|
|


def cases_test_cont_fit():
    # This tests the closeness of the estimated parameters to the true
    # parameters for the `fit` method of continuous distributions.
    # Note: it is slow; some distributions don't converge with sample
    # size <= 10000.
    for distname, arg in distcont:
        if distname not in skip_fit:
            yield distname, arg


@pytest.mark.slow
@pytest.mark.parametrize('distname,arg', cases_test_cont_fit())
@pytest.mark.parametrize('method', ["MLE", "MM"])
def test_cont_fit(distname, arg, method):
    run_xfail = int(os.getenv('SCIPY_XFAIL', default=False))
    run_xslow = int(os.getenv('SCIPY_XSLOW', default=False))

    if distname in failing_fits[method] and not run_xfail:
        # The generic `fit` method can't be expected to work perfectly for
        # all distributions, data, and guesses; these cases are known to fail.
        msg = "Failure expected; set environment variable SCIPY_XFAIL=1 to run."
        pytest.xfail(msg)

    if distname in xslow_fits[method] and not run_xslow:
        msg = "Very slow; set environment variable SCIPY_XSLOW=1 to run."
        pytest.skip(msg)

    distfn = getattr(stats, distname)

    truearg = np.hstack([arg, [0.0, 1.0]])
    diffthreshold = np.max(np.vstack([truearg*thresh_percent,
                                      np.full(distfn.numargs+2, thresh_min)]),
                           0)
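    # i.e. each estimate may deviate from the true parameter by at most
    # `thresh_percent` of the parameter's magnitude, with `thresh_min` as an
    # absolute floor; the location threshold is refined per sample below.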

    for fit_size in fit_sizes:
        # Note that if a fit succeeds, the remaining fit_sizes are skipped.
        np.random.seed(1234)

        with np.errstate(all='ignore'):
            rvs = distfn.rvs(*arg, size=fit_size)
            if method == 'MLE' and distfn.name in mle_use_floc0:
                kwds = {'floc': 0}
            else:
                kwds = {}
            # start with default values
            est = distfn.fit(rvs, method=method, **kwds)
            if method == 'MLE':
                # Trivial check of CensoredData: none of the data is actually
                # censored, so the fit should match the fit to the raw data.
                data1 = stats.CensoredData(rvs)
                est1 = distfn.fit(data1, **kwds)
                msg = ('Different results fitting uncensored data wrapped as'
                       f' CensoredData: {distfn.name}: est={est} est1={est1}')
                assert_allclose(est1, est, rtol=1e-10, err_msg=msg)
            if method == 'MLE' and distname not in fail_interval_censored:
                # Convert the first `nic` values to interval-censored values.
                # The intervals are small, so the fit should be close to the
                # fit to the uncensored data.
                nic = 15
                interval = np.column_stack((rvs, rvs))
                interval[:nic, 0] *= 0.99
                interval[:nic, 1] *= 1.01
                interval.sort(axis=1)
                data2 = stats.CensoredData(interval=interval)
                est2 = distfn.fit(data2, **kwds)
                msg = ('Different results fitting interval-censored'
                       f' data: {distfn.name}: est={est} est2={est2}')
                assert_allclose(est2, est, rtol=0.05, err_msg=msg)

        diff = est - truearg

        # threshold for location
        diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,
                                    thresh_min])

        if np.any(np.isnan(est)):
            raise AssertionError('nan returned in fit')
        else:
            if np.all(np.abs(diff) <= diffthreshold):
                break
    else:
        txt = f'parameter: {truearg}\n'
        txt += f'estimated: {est}\n'
        txt += f'diff     : {diff}\n'
        raise AssertionError(f'fit not very good in {distfn.name}\n' + txt)


def _check_loc_scale_mle_fit(name, data, desired, atol=None):
    d = getattr(stats, name)
    actual = d.fit(data)[-2:]
    assert_allclose(actual, desired, atol=atol,
                    err_msg=f'poor mle fit of (loc, scale) in {name}')


def test_non_default_loc_scale_mle_fit():
    data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
    _check_loc_scale_mle_fit('uniform', data, [1.01, 0.99], 1e-3)
    _check_loc_scale_mle_fit('expon', data, [1.01, 0.73875], 1e-3)


def test_expon_fit():
    """Regression test for gh-6167."""
    data = [0, 0, 0, 0, 2, 2, 2, 2]
    phat = stats.expon.fit(data, floc=0)
    assert_allclose(phat, [0, 1.0], atol=1e-3)


def test_fit_error():
    data = np.concatenate([np.zeros(29), np.ones(21)])
    message = "Optimization converged to parameters that are..."
    with pytest.raises(FitError, match=message), \
            pytest.warns(RuntimeWarning):
        stats.beta.fit(data)


@pytest.mark.parametrize("dist, params",
                         [(stats.norm, (0.5, 2.5)),  # continuous
                          (stats.binom, (10, 0.3, 2))])  # discrete
def test_nnlf_and_related_methods(dist, params):
    rng = np.random.default_rng(983459824)

    if hasattr(dist, 'pdf'):
        logpxf = dist.logpdf
    else:
        logpxf = dist.logpmf

    x = dist.rvs(*params, size=100, random_state=rng)
    ref = -logpxf(x, *params).sum()  # the negative log-likelihood
    res1 = dist.nnlf(params, x)
    res2 = dist._penalized_nnlf(params, x)
    assert_allclose(res1, ref)
    assert_allclose(res2, ref)


def cases_test_fit_mle():
    # These fail the generic test below or hang; several have dedicated
    # tests elsewhere in this file.
    skip_basic_fit = {'argus', 'irwinhall', 'foldnorm', 'truncpareto',
                      'truncweibull_min', 'ksone', 'levy_stable',
                      'studentized_range', 'kstwo',
                      'beta', 'nakagami', 'truncnorm',
                      'poisson_binom'}

    # These are marked `slow` (runtime roughly 0.25 s or more each).
    slow_basic_fit = {'alpha', 'arcsine', 'betaprime', 'binom', 'bradford',
                      'burr12', 'chi', 'crystalball', 'dweibull', 'erlang',
                      'exponnorm', 'exponpow', 'f', 'fatiguelife', 'fisk',
                      'foldcauchy', 'gamma', 'genexpon', 'genextreme',
                      'gennorm', 'genpareto', 'gompertz', 'halfgennorm',
                      'invgamma', 'invgauss', 'invweibull', 'jf_skew_t',
                      'johnsonsb', 'johnsonsu', 'kappa3', 'kstwobign',
                      'loglaplace', 'lognorm', 'lomax', 'mielke', 'nbinom',
                      'norminvgauss', 'pareto', 'pearson3', 'powerlaw',
                      'powernorm', 'randint', 'rdist', 'recipinvgauss',
                      'rice', 'skewnorm', 't', 'uniform', 'weibull_max',
                      'weibull_min', 'wrapcauchy'}

    # These are marked `xslow` (runtime roughly 1 s or more each).
    xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'dpareto_lognorm',
                       'exponweib', 'gausshyper', 'gengamma',
                       'genhalflogistic', 'genhyperbolic', 'geninvgauss',
                       'hypergeom', 'kappa4', 'loguniform',
                       'ncf', 'nchypergeom_fisher', 'nchypergeom_wallenius',
                       'nct', 'ncx2', 'nhypergeom',
                       'powerlognorm', 'reciprocal', 'rel_breitwigner',
                       'skellam', 'trapezoid', 'triang',
                       'tukeylambda', 'vonmises', 'zipfian'}

    for dist in dict(distdiscrete + distcont):
        if dist in skip_basic_fit or not isinstance(dist, str):
            reason = "tested separately"
            yield pytest.param(dist, marks=pytest.mark.skip(reason=reason))
        elif dist in slow_basic_fit:
            reason = "too slow (>= 0.25s)"
            yield pytest.param(dist, marks=pytest.mark.slow(reason=reason))
        elif dist in xslow_basic_fit:
            reason = "too slow (>= 1.0s)"
            yield pytest.param(dist, marks=pytest.mark.xslow(reason=reason))
        else:
            yield dist


def cases_test_fit_mse():
    # These fail the generic test below, are far too slow, or are otherwise
    # unsuitable for it.
    skip_basic_fit = {'levy_stable', 'studentized_range', 'ksone', 'skewnorm',
                      'irwinhall', 'norminvgauss', 'kstwo', 'geninvgauss',
                      'gausshyper', 'genhyperbolic', 'tukeylambda', 'vonmises',
                      'arcsine', 'argus', 'powerlaw', 'rdist', 'poisson_binom'}

    # These are marked `slow` (runtime roughly 0.25 s or more each).
    slow_basic_fit = {'alpha', 'anglit', 'betabinom', 'bradford',
                      'chi', 'chi2', 'crystalball', 'dweibull',
                      'erlang', 'exponnorm', 'exponpow', 'exponweib',
                      'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm',
                      'gamma', 'genexpon', 'genextreme', 'genhalflogistic',
                      'genlogistic', 'genpareto', 'gompertz',
                      'hypergeom', 'invweibull',
                      'johnsonsu', 'kappa3', 'kstwobign',
                      'laplace_asymmetric', 'loggamma', 'loglaplace',
                      'lognorm', 'lomax',
                      'maxwell', 'nhypergeom',
                      'pareto', 'powernorm', 'randint', 'recipinvgauss',
                      'semicircular',
                      't', 'triang', 'truncexpon', 'truncpareto',
                      'uniform',
                      'wald', 'weibull_max', 'weibull_min', 'wrapcauchy'}

    # These are marked `xslow` (runtime roughly 1 s or more each).
    xslow_basic_fit = {'argus', 'beta', 'betaprime', 'burr', 'burr12',
                       'dgamma', 'dpareto_lognorm', 'f', 'gengamma', 'gennorm',
                       'halfgennorm', 'invgamma', 'invgauss', 'jf_skew_t',
                       'johnsonsb', 'kappa4', 'loguniform', 'mielke',
                       'nakagami', 'ncf', 'nchypergeom_fisher',
                       'nchypergeom_wallenius', 'nct', 'ncx2',
                       'pearson3', 'powerlognorm',
                       'reciprocal', 'rel_breitwigner', 'rice',
                       'trapezoid', 'truncnorm', 'truncweibull_min',
                       'vonmises_line', 'zipfian'}

    # These emit RuntimeWarnings during the fit; ignore them.
    warns_basic_fit = {'skellam'}

    for dist in dict(distdiscrete + distcont):
        if dist in skip_basic_fit or not isinstance(dist, str):
            reason = "Fails. Oh well."
            yield pytest.param(dist, marks=pytest.mark.skip(reason=reason))
        elif dist in slow_basic_fit:
            reason = "too slow (>= 0.25s)"
            yield pytest.param(dist, marks=pytest.mark.slow(reason=reason))
        elif dist in xslow_basic_fit:
            reason = "too slow (>= 1.0s)"
            yield pytest.param(dist, marks=pytest.mark.xslow(reason=reason))
        elif dist in warns_basic_fit:
            mark = pytest.mark.filterwarnings('ignore::RuntimeWarning')
            yield pytest.param(dist, marks=mark)
        else:
            yield dist


def cases_test_fitstart():
    for distname, shapes in dict(distcont).items():
        if (not isinstance(distname, str) or
                distname in {'studentized_range', 'recipinvgauss'}):  # slow
            continue
        yield distname, shapes


@pytest.mark.parametrize('distname, shapes', cases_test_fitstart())
def test_fitstart(distname, shapes):
    dist = getattr(stats, distname)
    rng = np.random.default_rng(216342614)
    data = rng.random(10)

    # `_fitstart` shouldn't raise, even on data that doesn't resemble the
    # distribution...
    with np.errstate(invalid='ignore', divide='ignore'):
        guess = dist._fitstart(data)

    # ...and the shape guess it returns must be valid for the distribution.
    assert dist._argcheck(*guess[:-2])


def assert_nlff_less_or_close(dist, data, params1, params0, rtol=1e-7, atol=0,
                              nlff_name='nnlf'):
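    # Check that the negative log-fit-function (e.g. the negative
    # log-likelihood) evaluated at the fitted parameters `params1` is no
    # greater than at the reference parameters `params0`; if the fit merely
    # ties the reference, the two values must agree to within `rtol`/`atol`.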
    nlff = getattr(dist, nlff_name)
    nlff1 = nlff(params1, data)
    nlff0 = nlff(params0, data)
    if not (nlff1 < nlff0):
        np.testing.assert_allclose(nlff1, nlff0, rtol=rtol, atol=atol)


class TestFit:
    dist = stats.binom
    seed = 654634816187
    rng = np.random.default_rng(seed)
    data = stats.binom.rvs(5, 0.5, size=100, random_state=rng)
    shape_bounds_a = [(1, 10), (0, 1)]
    shape_bounds_d = {'n': (1, 10), 'p': (0, 1)}
    atol = 5e-2
    rtol = 1e-2
    tols = {'atol': atol, 'rtol': rtol}
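
    # `stats.fit` performs global optimization; using `differential_evolution`
    # with a fixed `rng` as the optimizer keeps these tests deterministic.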
    def opt(self, *args, rng=1, **kwds):
        return differential_evolution(*args, rng=rng, **kwds)

    def test_dist_iv(self):
        message = "`dist` must be an instance of..."
        with pytest.raises(ValueError, match=message):
            stats.fit(10, self.data, self.shape_bounds_a)

    def test_data_iv(self):
        message = "`data` must be exactly one-dimensional."
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, [[1, 2, 3]], self.shape_bounds_a)

        message = "All elements of `data` must be finite numbers."
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, [1, 2, 3, np.nan], self.shape_bounds_a)
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, [1, 2, 3, np.inf], self.shape_bounds_a)
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, ['1', '2', '3'], self.shape_bounds_a)

    def test_bounds_iv(self):
        message = "Bounds provided for the following unrecognized..."
        shape_bounds = {'n': (1, 10), 'p': (0, 1), '1': (0, 10)}
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "Each element of a `bounds` sequence must be a tuple..."
        shape_bounds = [(1, 10, 3), (0, 1)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "Each element of `bounds` must be a tuple specifying..."
        shape_bounds = [(1, 10, 3), (0, 1, 0.5)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)
        shape_bounds = [1, 0]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "A `bounds` sequence must contain at least 2 elements..."
        shape_bounds = [(1, 10)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "A `bounds` sequence may not contain more than 3 elements..."
        bounds = [(1, 10), (1, 10), (1, 10), (1, 10)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, bounds)

        message = "There are no values for `p` on the interval..."
        shape_bounds = {'n': (1, 10), 'p': (1, 0)}
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "There are no values for `n` on the interval..."
        shape_bounds = [(10, 1), (0, 1)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "There are no integer values for `n` on the interval..."
        shape_bounds = [(1.4, 1.6), (0, 1)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

        message = "The intersection of user-provided bounds for `n`"
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data)
        shape_bounds = [(-np.inf, np.inf), (0, 1)]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, shape_bounds)

    def test_guess_iv(self):
        message = "Guesses provided for the following unrecognized..."
        guess = {'n': 1, 'p': 0.5, '1': 255}
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "Each element of `guess` must be a scalar..."
        guess = {'n': 1, 'p': 'hi'}
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
        guess = [1, 'f']
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
        guess = [[1, 2]]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "A `guess` sequence must contain at least 2..."
        guess = [1]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "A `guess` sequence may not contain more than 3..."
        guess = [1, 2, 3, 4]
        with pytest.raises(ValueError, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "Guess for parameter `n` rounded.*|Guess for parameter `p` clipped.*"
        guess = {'n': 4.5, 'p': -0.5}
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "Guess for parameter `loc` rounded..."
        guess = [5, 0.5, 0.5]
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "Guess for parameter `p` clipped..."
        guess = {'n': 5, 'p': -0.5}
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

        message = "Guess for parameter `loc` clipped..."
        guess = [5, 0.5, 1]
        with pytest.warns(RuntimeWarning, match=message):
            stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)

    def basic_fit_test(self, dist_name, method):
        N = 5000
        dist_data = dict(distcont + distdiscrete)
        rng = np.random.default_rng(self.seed)
        dist = getattr(stats, dist_name)
        shapes = np.array(dist_data[dist_name])
        bounds = np.empty((len(shapes) + 2, 2), dtype=np.float64)
        bounds[:-2, 0] = shapes/10.**np.sign(shapes)
        bounds[:-2, 1] = shapes*10.**np.sign(shapes)
        bounds[-2] = (0, 10)
        bounds[-1] = (1e-16, 10)
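        # Each shape parameter is allowed to vary by a factor of 10 in either
        # direction; the sign-dependent exponent keeps lower < upper even for
        # negative shapes.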
        loc = rng.uniform(*bounds[-2])
        scale = rng.uniform(*bounds[-1])
        ref = list(dist_data[dist_name]) + [loc, scale]

        if getattr(dist, 'pmf', False):
            ref = ref[:-1]
            ref[-1] = np.floor(loc)
            data = dist.rvs(*ref, size=N, random_state=rng)
            bounds = bounds[:-1]
        if getattr(dist, 'pdf', False):
            data = dist.rvs(*ref, size=N, random_state=rng)
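        # Discrete distributions have no scale parameter, and their `loc` is
        # an integer shift, hence the dropped scale bound and the floored
        # `loc` in the first branch above.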

        with npt.suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "overflow encountered")
            res = stats.fit(dist, data, bounds, method=method,
                            optimizer=self.opt)

        # the objective function that `stats.fit` minimizes for each method
        nlff_names = {'mle': 'nnlf', 'mse': '_penalized_nlpsf'}
        nlff_name = nlff_names[method]
        assert_nlff_less_or_close(dist, data, res.params, ref, **self.tols,
                                  nlff_name=nlff_name)
| @pytest.mark.parametrize("dist_name", cases_test_fit_mle()) |
| def test_basic_fit_mle(self, dist_name): |
| self.basic_fit_test(dist_name, "mle", rng=5) |
|
|
| @pytest.mark.parametrize("dist_name", cases_test_fit_mse()) |
| def test_basic_fit_mse(self, dist_name): |
| self.basic_fit_test(dist_name, "mse", rng=2) |
|

    @pytest.mark.slow
    def test_arcsine(self):
        # Can't guarantee that all distributions will fit all data with
        # arbitrary bounds. This one fails the generic test above, so try
        # something slightly different.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.arcsine
        shapes = (1., 2.)  # (loc, scale)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, method='mse', optimizer=self.opt)
        assert_nlff_less_or_close(dist, data, res.params, shapes,
                                  nlff_name='_penalized_nlpsf', **self.tols)

    @pytest.mark.parametrize("method", ('mle', 'mse'))
    def test_argus(self, method):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.argus
        shapes = (1., 2., 3.)  # (chi, loc, scale)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'chi': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt, method=method)
        nlff_name = {'mle': 'nnlf', 'mse': '_penalized_nlpsf'}[method]
        assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols,
                                  nlff_name=nlff_name)

    @pytest.mark.xslow
    def test_beta(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.beta
        shapes = (2.3098496451481823, 0.62687954300963677, 1., 2.)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'a': (0.1, 10), 'b': (0.1, 10),
                        'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, method='mle', optimizer=self.opt)
        assert_nlff_less_or_close(dist, data, res.params, shapes,
                                  nlff_name='nnlf', **self.tols)

    def test_foldnorm(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.foldnorm
        shapes = (1.952125337355587, 2., 3.)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'c': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)

        assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)

    def test_nakagami(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.nakagami
        shapes = (4.9673794866666237, 1., 2.)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'nu': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, method='mle', optimizer=self.opt)
        assert_nlff_less_or_close(dist, data, res.params, shapes,
                                  nlff_name='nnlf', **self.tols)

    @pytest.mark.slow
    def test_powerlaw(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.powerlaw
        shapes = (1.6591133289905851, 1., 2.)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = {'a': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
        res = stats.fit(dist, data, shape_bounds, method='mse', optimizer=self.opt)
        assert_nlff_less_or_close(dist, data, res.params, shapes,
                                  nlff_name='_penalized_nlpsf', **self.tols)

    def test_truncpareto(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.truncpareto
        shapes = (1.8, 5.3, 2.3, 4.1)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = [(0.1, 10)]*4
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)

        assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)

    @pytest.mark.slow
    def test_truncweibull_min(self):
        # Same rationale as `test_arcsine` above.
        N = 1000
        rng = np.random.default_rng(self.seed)
        dist = stats.truncweibull_min
        shapes = (2.5, 0.25, 1.75, 2., 3.)
        data = dist.rvs(*shapes, size=N, random_state=rng)
        shape_bounds = [(0.1, 10)]*5
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)

        assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)

    def test_missing_shape_bounds(self):
        # Some distributions have a small domain w.r.t. a parameter, e.g.
        # p in [0, 1] for the binomial distribution. The user does not need
        # to provide bounds for such a parameter because the intersection of
        # the user's bounds (none) and the domain is already finite.
        N = 1000
        rng = np.random.default_rng(self.seed)

        dist = stats.binom
        n, p, loc = 10, 0.65, 0
        data = dist.rvs(n, p, loc=loc, size=N, random_state=rng)
        shape_bounds = {'n': np.array([0, 20])}  # check that arrays are OK, too
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
        assert_allclose(res.params, (n, p, loc), **self.tols)

        dist = stats.bernoulli
        p, loc = 0.314159, 0
        data = dist.rvs(p, loc=loc, size=N, random_state=rng)
        res = stats.fit(dist, data, optimizer=self.opt)
        assert_allclose(res.params, (p, loc), **self.tols)

    def test_fit_only_loc_scale(self):
        # fit only loc
        N = 5000
        rng = np.random.default_rng(self.seed)

        dist = stats.norm
        loc, scale = 1.5, 1
        data = dist.rvs(loc=loc, size=N, random_state=rng)
        loc_bounds = (0, 5)
        bounds = {'loc': loc_bounds}
        res = stats.fit(dist, data, bounds, optimizer=self.opt)
        assert_allclose(res.params, (loc, scale), **self.tols)

        # fit only scale
        loc, scale = 0, 2.5
        data = dist.rvs(scale=scale, size=N, random_state=rng)
        scale_bounds = (0.01, 5)
        bounds = {'scale': scale_bounds}
        res = stats.fit(dist, data, bounds, optimizer=self.opt)
        assert_allclose(res.params, (loc, scale), **self.tols)

        # fit both loc and scale
        dist = stats.norm
        loc, scale = 1.5, 2.5
        data = dist.rvs(loc=loc, scale=scale, size=N, random_state=rng)
        bounds = {'loc': loc_bounds, 'scale': scale_bounds}
        res = stats.fit(dist, data, bounds, optimizer=self.opt)
        assert_allclose(res.params, (loc, scale), **self.tols)

    def test_everything_fixed(self):
        N = 5000
        rng = np.random.default_rng(self.seed)

        dist = stats.norm
        loc, scale = 1.5, 2.5
        data = dist.rvs(loc=loc, scale=scale, size=N, random_state=rng)

        # loc and scale are fixed to the defaults (0, 1) when no bounds are given
        res = stats.fit(dist, data)
        assert_allclose(res.params, (0, 1), **self.tols)

        # loc and scale are fixed to non-default values by degenerate bounds
        bounds = {'loc': (loc, loc), 'scale': (scale, scale)}
        res = stats.fit(dist, data, bounds)
        assert_allclose(res.params, (loc, scale), **self.tols)

        # a shape parameter can be fixed the same way
        dist = stats.binom
        n, p, loc = 10, 0.65, 0
        data = dist.rvs(n, p, loc=loc, size=N, random_state=rng)
        shape_bounds = {'n': (0, 20), 'p': (0.65, 0.65)}
        res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
        assert_allclose(res.params, (n, p, loc), **self.tols)

    def test_failure(self):
        N = 5000
        rng = np.random.default_rng(self.seed)

        dist = stats.nbinom
        shapes = (5, 0.5)
        data = dist.rvs(*shapes, size=N, random_state=rng)

        assert data.min() == 0
        # With the lower bound on loc at 0.5, the minimum of the data (0) is
        # impossible under the fitted distribution, so the fit must fail.
        bounds = [(0, 30), (0, 1), (0.5, 10)]
        res = stats.fit(dist, data, bounds)
        message = "Optimization converged to parameter values that are"
        assert res.message.startswith(message)
        assert res.success is False

    @pytest.mark.xslow
    def test_guess(self):
        # Test that `guess` helps differential evolution find the desired
        # solution.
        N = 2000
        # With some seeds, `fit` doesn't need a guess at all.
        rng = np.random.default_rng(196390444561)
        dist = stats.nhypergeom
        params = (20, 7, 12, 0)
        bounds = [(2, 200), (0.7, 70), (1.2, 120), (0, 10)]

        data = dist.rvs(*params, size=N, random_state=rng)

        # without the guess, the optimizer lands elsewhere
        res = stats.fit(dist, data, bounds, optimizer=self.opt)
        assert not np.allclose(res.params, params, **self.tols)

        # with the guess, the true parameters are recovered
        res = stats.fit(dist, data, bounds, guess=params, optimizer=self.opt)
        assert_allclose(res.params, params, **self.tols)

    def test_mse_accuracy_1(self):
        # Test maximum spacing estimation (MSE) against the exponential
        # example in the Wikipedia article "Maximum spacing estimation".
        data = [2, 4]
        dist = stats.expon
        bounds = {'loc': (0, 0), 'scale': (1e-8, 10)}
        res_mle = stats.fit(dist, data, bounds=bounds, method='mle')
        assert_allclose(res_mle.params.scale, 3, atol=1e-3)
        res_mse = stats.fit(dist, data, bounds=bounds, method='mse')
        assert_allclose(res_mse.params.scale, 3.915, atol=1e-3)

    def test_mse_accuracy_2(self):
        # Test maximum spacing estimation against the uniform-distribution
        # example in the Wikipedia article "Maximum spacing estimation".
        rng = np.random.default_rng(9843212616816518964)

        dist = stats.uniform
        n = 10
        data = dist(3, 6).rvs(size=n, random_state=rng)
        bounds = {'loc': (0, 10), 'scale': (1e-8, 10)}
        res = stats.fit(dist, data, bounds=bounds, method='mse')
        # The maximum spacing estimates of the uniform support endpoints have
        # a closed form in terms of the sample minimum and maximum:
        x = np.sort(data)
        a = (n*x[0] - x[-1])/(n - 1)
        b = (n*x[-1] - x[0])/(n - 1)
        ref = a, b-a  # (loc, scale)
        assert_allclose(res.params, ref, rtol=1e-4)


# Exam-grades data, e.g. as used in the documentation example of MATLAB's
# `lillietest`
examgrades = [65, 61, 81, 88, 69, 89, 55, 84, 86, 84, 71, 81, 84, 81, 78, 67,
              96, 66, 73, 75, 59, 71, 69, 63, 79, 76, 63, 85, 87, 88, 80, 71,
              65, 84, 71, 75, 81, 79, 64, 65, 84, 77, 70, 75, 84, 75, 73, 92,
              90, 79, 80, 71, 73, 71, 58, 79, 73, 64, 77, 82, 81, 59, 54, 82,
              57, 79, 79, 73, 74, 82, 63, 64, 73, 69, 87, 68, 81, 73, 83, 73,
              80, 73, 73, 71, 66, 78, 64, 74, 68, 67, 75, 75, 80, 85, 74, 76,
              80, 77, 93, 70, 86, 80, 81, 83, 68, 60, 85, 64, 74, 82, 81, 77,
              66, 85, 75, 81, 69, 60, 83, 72]


class TestGoodnessOfFit:

    def test_gof_iv(self):
        dist = stats.norm
        x = [1, 2, 3]

        message = r"`dist` must be a \(non-frozen\) instance of..."
        with pytest.raises(TypeError, match=message):
            goodness_of_fit(stats.norm(), x)

        message = "`data` must be a one-dimensional array of numbers."
        with pytest.raises(ValueError, match=message):
            goodness_of_fit(dist, [[1, 2, 3]])

        message = "`statistic` must be one of..."
        with pytest.raises(ValueError, match=message):
            goodness_of_fit(dist, x, statistic='mm')

        message = "`n_mc_samples` must be an integer."
        with pytest.raises(TypeError, match=message):
            goodness_of_fit(dist, x, n_mc_samples=1000.5)

        message = "SeedSequence expects int or sequence"
        with pytest.raises(TypeError, match=message):
            goodness_of_fit(dist, x, rng='herring')

    def test_against_ks(self):
        rng = np.random.default_rng(8517426291317196949)
        x = examgrades
        known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
        res = goodness_of_fit(stats.norm, x, known_params=known_params,
                              statistic='ks', rng=rng)
        ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
        assert_allclose(res.statistic, ref.statistic)
        assert_allclose(res.pvalue, ref.pvalue, atol=5e-3)

    def test_against_lilliefors(self):
        rng = np.random.default_rng(2291803665717442724)
        x = examgrades
        # When loc and scale are fit from the data, this is Lilliefors' test;
        # the reference p-value 0.0348 is that of MATLAB's `lillietest` for
        # these data.
        res = goodness_of_fit(stats.norm, x, statistic='ks', random_state=rng)
        known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
        ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
        assert_allclose(res.statistic, ref.statistic)
        assert_allclose(res.pvalue, 0.0348, atol=5e-3)

    def test_against_cvm(self):
        rng = np.random.default_rng(8674330857509546614)
        x = examgrades
        known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
        res = goodness_of_fit(stats.norm, x, known_params=known_params,
                              statistic='cvm', rng=rng)
        ref = stats.cramervonmises(x, stats.norm(**known_params).cdf)
        assert_allclose(res.statistic, ref.statistic)
        assert_allclose(res.pvalue, ref.pvalue, atol=5e-3)

    def test_against_anderson_case_0(self):
        # "Case 0": both loc and scale of the null distribution are known.
        rng = np.random.default_rng(7384539336846690410)
        x = np.arange(1, 101)
        # loc chosen by root finding so that the statistic equals the
        # published critical value for the 5% significance level
        known_params = {'loc': 45.01575354024957, 'scale': 30}
        res = goodness_of_fit(stats.norm, x, known_params=known_params,
                              statistic='ad', rng=rng)
        assert_allclose(res.statistic, 2.492)
        assert_allclose(res.pvalue, 0.05, atol=5e-3)

    def test_against_anderson_case_1(self):
        # "Case 1": scale of the null distribution is known, loc is fit.
        rng = np.random.default_rng(5040212485680146248)
        x = np.arange(1, 101)
        # scale chosen by root finding so that the statistic equals the
        # published critical value for the 10% significance level
        known_params = {'scale': 29.957112639101933}
        res = goodness_of_fit(stats.norm, x, known_params=known_params,
                              statistic='ad', rng=rng)
        assert_allclose(res.statistic, 0.908)
        assert_allclose(res.pvalue, 0.1, atol=5e-3)

    def test_against_anderson_case_2(self):
        # "Case 2": loc of the null distribution is known, scale is fit.
        rng = np.random.default_rng(726693985720914083)
        x = np.arange(1, 101)
        # loc chosen by root finding so that the statistic equals the
        # published critical value for the 2.5% significance level
        known_params = {'loc': 44.5680212261933}
        res = goodness_of_fit(stats.norm, x, known_params=known_params,
                              statistic='ad', rng=rng)
        assert_allclose(res.statistic, 2.904)
        assert_allclose(res.pvalue, 0.025, atol=5e-3)

    def test_against_anderson_case_3(self):
        # "Case 3": both loc and scale of the null distribution are fit.
        rng = np.random.default_rng(6763691329830218206)
        # skewness parameter chosen by root finding so that the statistic
        # equals the published critical value for the 15% significance level
        x = stats.skewnorm.rvs(1.4477847789132101, loc=1, scale=2, size=100,
                               random_state=rng)
        res = goodness_of_fit(stats.norm, x, statistic='ad', rng=rng)
        assert_allclose(res.statistic, 0.559)
        assert_allclose(res.pvalue, 0.15, atol=5e-3)

    @pytest.mark.xslow
    def test_against_anderson_gumbel_r(self):
        rng = np.random.default_rng(7302761058217743)
        # shape parameter chosen by root finding so that the statistic equals
        # the first critical value reported by `stats.anderson`
        x = stats.genextreme(0.051896837188595134, loc=0.5,
                             scale=1.5).rvs(size=1000, random_state=rng)
        res = goodness_of_fit(stats.gumbel_r, x, statistic='ad',
                              rng=rng)
        ref = stats.anderson(x, dist='gumbel_r')
        assert_allclose(res.statistic, ref.critical_values[0])
        assert_allclose(res.pvalue, ref.significance_level[0]/100, atol=5e-3)

    def test_against_filliben_norm(self):
        # Test the Filliben statistic against the worked example in
        # Filliben's 1975 paper on the probability plot correlation
        # coefficient test for normality.
        rng = np.random.default_rng(8024266430745011915)
        y = [6, 1, -4, 8, -2, 5, 0]
        known_params = {'loc': 0, 'scale': 1}
        res = stats.goodness_of_fit(stats.norm, y, known_params=known_params,
                                    statistic="filliben", rng=rng)
        # The statistic agrees with the paper's value to about four digits;
        # the small discrepancy is presumably due to rounding in the original
        # calculation. The paper's tables place the p-value between the
        # tabulated 75% and 90% points.
        assert_allclose(res.statistic, 0.98538, atol=1e-4)
        assert 0.75 < res.pvalue < 0.9

        # Also compare against an independent implementation of the same
        # statistic (e.g. R's `ppcc` package with Filliben plotting
        # positions), which reports a more precise statistic and a simulated
        # p-value:
        assert_allclose(res.statistic, 0.98540957187084, rtol=2e-5)
        assert_allclose(res.pvalue, 0.8875, rtol=2e-3)

    def test_filliben_property(self):
        # Filliben's statistic should be location- and scale-invariant, so
        # fitting loc/scale and fixing them should give the same statistic.
        rng = np.random.default_rng(8535677809395478813)
        x = rng.normal(loc=10, scale=0.5, size=100)
        res = stats.goodness_of_fit(stats.norm, x,
                                    statistic="filliben", rng=rng)
        known_params = {'loc': 0, 'scale': 1}
        ref = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
                                    statistic="filliben", rng=rng)
        assert_allclose(res.statistic, ref.statistic, rtol=1e-15)

    @pytest.mark.parametrize('case', [(25, [.928, .937, .950, .958, .966]),
                                      (50, [.959, .965, .972, .977, .981]),
                                      (95, [.977, .979, .983, .986, .989])])
    def test_against_filliben_norm_table(self, case):
        # Check simulated null-distribution percentiles against the
        # critical-value table in Filliben (1975).
        rng = np.random.default_rng(504569995557928957)
        n, ref = case
        x = rng.random(n)
        known_params = {'loc': 0, 'scale': 1}
        res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
                                    statistic="filliben", rng=rng)
        percentiles = np.array([0.005, 0.01, 0.025, 0.05, 0.1])
        res = stats.scoreatpercentile(res.null_distribution, percentiles*100)
        assert_allclose(res, ref, atol=2e-3)

    @pytest.mark.xslow
    @pytest.mark.parametrize('case', [(5, 0.95772790260469, 0.4755),
                                      (6, 0.95398832257958, 0.3848),
                                      (7, 0.9432692889277, 0.2328)])
    def test_against_ppcc(self, case):
        # Reference values for the Filliben statistic and p-value when
        # testing against the Rayleigh distribution, generated with an
        # independent PPCC implementation (the R `ppcc` package, per the
        # test name).
        n, ref_statistic, ref_pvalue = case
        rng = np.random.default_rng(7777775561439803116)
        x = rng.normal(size=n)
        res = stats.goodness_of_fit(stats.rayleigh, x, statistic="filliben",
                                    rng=rng)
        assert_allclose(res.statistic, ref_statistic, rtol=1e-4)
        assert_allclose(res.pvalue, ref_pvalue, atol=1.5e-2)

    def test_params_effects(self):
        # Ensure that `guessed_params`, `fit_params`, and `known_params` have
        # the intended effects.
        rng = np.random.default_rng(9121950977643805391)
        x = stats.skewnorm.rvs(-5.044559778383153, loc=1, scale=2, size=50,
                               random_state=rng)

        # Show that `guessed_params` does not fix the parameter to the
        # guessed value, while `fit_params` and `known_params` respect the
        # user-provided values.
        guessed_params = {'c': 13.4}
        fit_params = {'scale': 13.73}
        known_params = {'loc': -13.85}
        rng = np.random.default_rng(9121950977643805391)
        res1 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
                               guessed_params=guessed_params,
                               fit_params=fit_params,
                               known_params=known_params, rng=rng)
        assert not np.allclose(res1.fit_result.params.c, 13.4)
        assert_equal(res1.fit_result.params.scale, 13.73)
        assert_equal(res1.fit_result.params.loc, -13.85)

        # Show that changing the guess changes the parameter that gets fit,
        # and that it affects the null distribution.
        guessed_params = {'c': 2}
        rng = np.random.default_rng(9121950977643805391)
        res2 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
                               guessed_params=guessed_params,
                               fit_params=fit_params,
                               known_params=known_params, rng=rng)
        assert not np.allclose(res2.fit_result.params.c,
                               res1.fit_result.params.c, rtol=1e-8)
        assert not np.allclose(res2.null_distribution,
                               res1.null_distribution, rtol=1e-8)
        assert_equal(res2.fit_result.params.scale, 13.73)
        assert_equal(res2.fit_result.params.loc, -13.85)

        # Show that fixing `c` via `fit_params` (rather than fitting it from
        # a guess) is respected exactly and changes the null distribution
        # relative to `res1`.
        fit_params = {'c': 13.4, 'scale': 13.73}
        rng = np.random.default_rng(9121950977643805391)
        res3 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
                               guessed_params=guessed_params,
                               fit_params=fit_params,
                               known_params=known_params, rng=rng)
        assert_equal(res3.fit_result.params.c, 13.4)
        assert_equal(res3.fit_result.params.scale, 13.73)
        assert_equal(res3.fit_result.params.loc, -13.85)
        assert not np.allclose(res3.null_distribution, res1.null_distribution)

    def test_custom_statistic(self):
        # Test support for custom statistic functions.

        # Use the Greenwood statistic: sort the data, transform through the
        # fitted CDF, and sum the squared spacings of the result (with 0 and
        # 1 prepended/appended as the boundary values).
        def greenwood(dist, data, *, axis):
            x = np.sort(data, axis=axis)
            y = dist.cdf(x)
            d = np.diff(y, axis=axis, prepend=0, append=1)
            return np.sum(d ** 2, axis=axis)
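
        # Note: the spacings `d` along `axis` always sum to exactly 1 (the
        # CDF values are bracketed by 0 and 1), so the statistic is smallest
        # when the spacings are equal, i.e. when the data are consistent with
        # `dist`, and grows as the fit worsens.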

        # Run the Monte Carlo test on a fully specified null distribution and
        # compare the simulated quantiles of the Greenwood statistic against
        # exact quantiles from the literature (e.g. Burrows, 1979, "Selected
        # Percentage Points of Greenwood's Statistic").
        rng = np.random.default_rng(9121950977643805391)
        data = stats.expon.rvs(size=5, random_state=rng)
        result = goodness_of_fit(stats.expon, data,
                                 known_params={'loc': 0, 'scale': 1},
                                 statistic=greenwood, rng=rng)
        p = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
        exact_quantiles = [
            .183863, .199403, .210088, .226040, .239947, .253677, .268422,
            .285293, .306002, .334447, .382972, .432049, .547468]
        simulated_quantiles = np.quantile(result.null_distribution, p)
        assert_allclose(simulated_quantiles, exact_quantiles, atol=0.005)


class TestFitResult:
    def test_plot_iv(self):
        rng = np.random.default_rng(1769658657308472721)
        data = stats.norm.rvs(0, 1, size=100, random_state=rng)

        def optimizer(*args, **kwargs):
            return differential_evolution(*args, **kwargs, rng=rng)

        bounds = [(0, 30), (0, 1)]
        res = stats.fit(stats.norm, data, bounds, optimizer=optimizer)
        try:
            import matplotlib  # noqa: F401
            message = r"`plot_type` must be one of \{'..."
            with pytest.raises(ValueError, match=message):
                res.plot(plot_type='llama')
        except (ModuleNotFoundError, ImportError):
            # matplotlib is not installed; check the error message instead
            message = r"matplotlib must be installed to use method `plot`."
            with pytest.raises(ModuleNotFoundError, match=message):
                res.plot(plot_type='llama')
|