| """ |
| Created on Fri Apr 2 09:06:05 2021 |
| |
| @author: matth |
| """ |
|
|
| import math |
| import numpy as np |
| from scipy import special |
| from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays |
| from scipy._lib._array_api import array_namespace, xp_moveaxis_to_end |
|
|
| __all__ = ['entropy', 'differential_entropy'] |
|
|
|
|
| @_axis_nan_policy_factory( |
| lambda x: x, |
| n_samples=lambda kwgs: ( |
| 2 if ("qk" in kwgs and kwgs["qk"] is not None) |
| else 1 |
| ), |
| n_outputs=1, result_to_tuple=lambda x: (x,), paired=True, |
| too_small=-1 |
| ) |
| def entropy(pk: np.typing.ArrayLike, |
| qk: np.typing.ArrayLike | None = None, |
| base: float | None = None, |
| axis: int = 0 |
| ) -> np.number | np.ndarray: |
| """ |
| Calculate the Shannon entropy/relative entropy of given distribution(s). |
| |
| If only probabilities `pk` are given, the Shannon entropy is calculated as |
| ``H = -sum(pk * log(pk))``. |
| |
| If `qk` is not None, then compute the relative entropy |
| ``D = sum(pk * log(pk / qk))``. This quantity is also known |
| as the Kullback-Leibler divergence. |
| |
| This routine will normalize `pk` and `qk` if they don't sum to 1. |
| |
| Parameters |
| ---------- |
| pk : array_like |
| Defines the (discrete) distribution. Along each axis-slice of ``pk``, |
| element ``i`` is the (possibly unnormalized) probability of event |
| ``i``. |
| qk : array_like, optional |
| Sequence against which the relative entropy is computed. Should be in |
| the same format as `pk`. |
| base : float, optional |
| The logarithmic base to use, defaults to ``e`` (natural logarithm). |
| axis : int, optional |
| The axis along which the entropy is calculated. Default is 0. |
| |
| Returns |
| ------- |
| S : {float, array_like} |
| The calculated entropy. |
| |
| Notes |
| ----- |
| Informally, the Shannon entropy quantifies the expected uncertainty |
| inherent in the possible outcomes of a discrete random variable. |
| For example, |
| if messages consisting of sequences of symbols from a set are to be |
| encoded and transmitted over a noiseless channel, then the Shannon entropy |
| ``H(pk)`` gives a tight lower bound for the average number of units of |
| information needed per symbol if the symbols occur with frequencies |
| governed by the discrete distribution `pk` [1]_. The choice of base |
| determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc. |
| |
| The relative entropy, ``D(pk|qk)``, quantifies the increase in the average |
| number of units of information needed per symbol if the encoding is |
| optimized for the probability distribution `qk` instead of the true |
| distribution `pk`. Informally, the relative entropy quantifies the expected |
| excess in surprise experienced if one believes the true distribution is |
| `qk` when it is actually `pk`. |
| |
| A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the |
| equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with |
| the formula ``CE = -sum(pk * log(qk))``. It gives the average |
| number of units of information needed per symbol if an encoding is |
| optimized for the probability distribution `qk` when the true distribution |
| is `pk`. It is not computed directly by `entropy`, but it can be computed |
| using two calls to the function (see Examples). |
| |
| See [2]_ for more information. |
| |
| References |
| ---------- |
| .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication. |
| Bell System Technical Journal, 27: 379-423. |
| https://doi.org/10.1002/j.1538-7305.1948.tb01338.x |
| .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information |
| Theory (Wiley Series in Telecommunications and Signal Processing). |
| Wiley-Interscience, USA. |
| |
| |
| Examples |
| -------- |
| The outcome of a fair coin is the most uncertain: |
| |
| >>> import numpy as np |
| >>> from scipy.stats import entropy |
| >>> base = 2 # work in units of bits |
| >>> pk = np.array([1/2, 1/2]) # fair coin |
| >>> H = entropy(pk, base=base) |
| >>> H |
| 1.0 |
| >>> H == -np.sum(pk * np.log(pk)) / np.log(base) |
| True |
| |
| The outcome of a biased coin is less uncertain: |
| |
| >>> qk = np.array([9/10, 1/10]) # biased coin |
| >>> entropy(qk, base=base) |
| 0.46899559358928117 |
| |
| The relative entropy between the fair coin and biased coin is calculated |
| as: |
| |
| >>> D = entropy(pk, qk, base=base) |
| >>> D |
| 0.7369655941662062 |
| >>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0) |
| True |
| |
| The cross entropy can be calculated as the sum of the entropy and |
    relative entropy:
| |
| >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base) |
| >>> CE |
| 1.736965594166206 |
| >>> CE == -np.sum(pk * np.log(qk)) / np.log(base) |
| True |
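
    Since `pk` and `qk` are normalized internally, unnormalized counts may be
    passed directly; for example, the counts ``[5, 5]`` describe the same
    fair coin:

    >>> entropy([5, 5], base=base)
    1.0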
| |
| """ |
| if base is not None and base <= 0: |
| raise ValueError("`base` must be a positive number or `None`.") |
|
|
| xp = array_namespace(pk) if qk is None else array_namespace(pk, qk) |
|
|
| pk = xp.asarray(pk) |
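    # normalize `pk` so that each slice along `axis` sums to 1; `errstate`
    # silences the invalid-value warning raised by 0/0 when a slice sums to 0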
| with np.errstate(invalid='ignore'): |
| pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True) |
| if qk is None: |
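        # Shannon entropy: special.entr(pk) computes -pk*log(pk) elementwise
        # (and returns 0 where pk == 0)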
| vec = special.entr(pk) |
| else: |
| qk = xp.asarray(qk) |
| pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp) |
| sum_kwargs = dict(axis=axis, keepdims=True) |
| qk = 1.0*qk / xp.sum(qk, **sum_kwargs) |
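        # relative entropy: special.rel_entr(pk, qk) computes pk*log(pk/qk)
        # elementwise, with the convention 0*log(0/q) = 0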
| vec = special.rel_entr(pk, qk) |
| S = xp.sum(vec, axis=axis) |
| if base is not None: |
| S /= math.log(base) |
| return S |
|
|
|
|
| def _differential_entropy_is_too_small(samples, kwargs, axis=-1): |
| values = samples[0] |
| n = values.shape[axis] |
| window_length = kwargs.get("window_length", |
| math.floor(math.sqrt(n) + 0.5)) |
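    # the sample is too small when the requirement from `differential_entropy`
    # (1 <= window_length < n/2) cannot be satisfied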
    return not 2 <= 2 * window_length < n
|
|
|
|
| @_axis_nan_policy_factory( |
| lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,), |
| too_small=_differential_entropy_is_too_small |
| ) |
| def differential_entropy( |
| values: np.typing.ArrayLike, |
| *, |
| window_length: int | None = None, |
| base: float | None = None, |
| axis: int = 0, |
| method: str = "auto", |
| ) -> np.number | np.ndarray: |
| r"""Given a sample of a distribution, estimate the differential entropy. |
| |
| Several estimation methods are available using the `method` parameter. By |
    default, a method is selected based on the size of the sample.
| |
| Parameters |
| ---------- |
| values : sequence |
| Sample from a continuous distribution. |
| window_length : int, optional |
| Window length for computing Vasicek estimate. Must be an integer |
| between 1 and half of the sample size. If ``None`` (the default), it |
| uses the heuristic value |
| |
| .. math:: |
| \left \lfloor \sqrt{n} + 0.5 \right \rfloor |
| |
| where :math:`n` is the sample size. This heuristic was originally |
| proposed in [2]_ and has become common in the literature. |
| base : float, optional |
| The logarithmic base to use, defaults to ``e`` (natural logarithm). |
| axis : int, optional |
| The axis along which the differential entropy is calculated. |
| Default is 0. |
| method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional |
| The method used to estimate the differential entropy from the sample. |
| Default is ``'auto'``. See Notes for more information. |
| |
| Returns |
| ------- |
    entropy : float or ndarray
| The calculated differential entropy. |
| |
| Notes |
| ----- |
    This function will converge to the true differential entropy in the limit

    .. math::
        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0

    where :math:`n` is the sample size and :math:`m` is `window_length`.
| |
| The optimal choice of ``window_length`` for a given sample size depends on |
| the (unknown) distribution. Typically, the smoother the density of the |
| distribution, the larger the optimal value of ``window_length`` [1]_. |
| |
| The following options are available for the `method` parameter. |
| |
| * ``'vasicek'`` uses the estimator presented in [1]_. This is |
| one of the first and most influential estimators of differential entropy. |
| * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which |
| is not only consistent but, under some conditions, asymptotically normal. |
| * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown |
| in simulation to have smaller bias and mean squared error than |
| the Vasicek estimator. |
| * ``'correa'`` uses the estimator presented in [5]_ based on local linear |
| regression. In a simulation study, it had consistently smaller mean |
      square error than the Vasicek estimator, but it is more expensive to
| compute. |
    * ``'auto'`` selects the method automatically (default). Currently,
      this selects ``'van es'`` for very small samples (10 or fewer
      observations), ``'ebrahimi'`` for moderate sample sizes (11-1000),
      and ``'vasicek'`` for larger samples, but this behavior is subject
      to change in future versions.
| |
| All estimators are implemented as described in [6]_. |
| |
| References |
| ---------- |
| .. [1] Vasicek, O. (1976). A test for normality based on sample entropy. |
| Journal of the Royal Statistical Society: |
| Series B (Methodological), 38(1), 54-59. |
    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
| goodness-of-fit test for exponentiality. Communications in |
| Statistics-Theory and Methods, 28(5), 1183-1202. |
| .. [3] Van Es, B. (1992). Estimating functionals related to a density by a |
| class of statistics based on spacings. Scandinavian Journal of |
| Statistics, 61-72. |
| .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures |
| of sample entropy. Statistics & Probability Letters, 20(3), 225-234. |
| .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications |
| in Statistics-Theory and Methods, 24(10), 2439-2449. |
| .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods. |
| Annals of Data Science, 2(2), 231-241. |
| https://link.springer.com/article/10.1007/s40745-015-0045-9 |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from scipy.stats import differential_entropy, norm |
| |
| Entropy of a standard normal distribution: |
| |
| >>> rng = np.random.default_rng() |
| >>> values = rng.standard_normal(100) |
| >>> differential_entropy(values) |
    1.3407817436640392  # may vary
| |
| Compare with the true entropy: |
| |
| >>> float(norm.entropy()) |
| 1.4189385332046727 |
| |
| For several sample sizes between 5 and 1000, compare the accuracy of |
| the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically, |
| compare the root mean squared error (over 1000 trials) between the estimate |
| and the true differential entropy of the distribution. |
| |
| >>> from scipy import stats |
| >>> import matplotlib.pyplot as plt |
| >>> |
| >>> |
| >>> def rmse(res, expected): |
| ... '''Root mean squared error''' |
| ... return np.sqrt(np.mean((res - expected)**2)) |
| >>> |
| >>> |
| >>> a, b = np.log10(5), np.log10(1000) |
| >>> ns = np.round(np.logspace(a, b, 10)).astype(int) |
| >>> reps = 1000 # number of repetitions for each sample size |
| >>> expected = stats.expon.entropy() |
| >>> |
| >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []} |
| >>> for method in method_errors: |
| ... for n in ns: |
| ... rvs = stats.expon.rvs(size=(reps, n), random_state=rng) |
| ... res = stats.differential_entropy(rvs, method=method, axis=-1) |
| ... error = rmse(res, expected) |
| ... method_errors[method].append(error) |
| >>> |
| >>> for method, errors in method_errors.items(): |
| ... plt.loglog(ns, errors, label=method) |
| >>> |
| >>> plt.legend() |
| >>> plt.xlabel('sample size') |
| >>> plt.ylabel('RMSE (1000 trials)') |
| >>> plt.title('Entropy Estimator Error (Exponential Distribution)') |
| |
| """ |
| xp = array_namespace(values) |
| values = xp.asarray(values) |
| if xp.isdtype(values.dtype, "integral"): |
| values = xp.astype(values, xp.asarray(1.).dtype) |
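    # move the reduction axis to the end so the estimators below can operate
    # along axis=-1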
| values = xp_moveaxis_to_end(values, axis, xp=xp) |
| n = values.shape[-1] |
|
|
| if window_length is None: |
| window_length = math.floor(math.sqrt(n) + 0.5) |
|
|
| if not 2 <= 2 * window_length < n: |
| raise ValueError( |
| f"Window length ({window_length}) must be positive and less " |
| f"than half the sample size ({n}).", |
| ) |
|
|
| if base is not None and base <= 0: |
| raise ValueError("`base` must be a positive number or `None`.") |
|
|
| sorted_data = xp.sort(values, axis=-1) |
|
|
| methods = {"vasicek": _vasicek_entropy, |
| "van es": _van_es_entropy, |
| "correa": _correa_entropy, |
| "ebrahimi": _ebrahimi_entropy, |
| "auto": _vasicek_entropy} |
| method = method.lower() |
| if method not in methods: |
| message = f"`method` must be one of {set(methods)}" |
| raise ValueError(message) |
|
|
| if method == "auto": |
| if n <= 10: |
| method = 'van es' |
| elif n <= 1000: |
| method = 'ebrahimi' |
| else: |
| method = 'vasicek' |
|
|
| res = methods[method](sorted_data, window_length, xp=xp) |
|
|
| if base is not None: |
| res /= math.log(base) |
|
|
| |
| |
| return xp.astype(res, values.dtype) |
|
|
|
|
| def _pad_along_last_axis(X, m, *, xp): |
| """Pad the data for computing the rolling window difference.""" |
| |
| shape = X.shape[:-1] + (m,) |
| Xl = xp.broadcast_to(X[..., :1], shape) |
| Xr = xp.broadcast_to(X[..., -1:], shape) |
| return xp.concat((Xl, X, Xr), axis=-1) |
|
|
|
|
| def _vasicek_entropy(X, m, *, xp): |
| """Compute the Vasicek estimator as described in [6] Eq. 1.3.""" |
| n = X.shape[-1] |
| X = _pad_along_last_axis(X, m, xp=xp) |
| differences = X[..., 2 * m:] - X[..., : -2 * m:] |
| logs = xp.log(n/(2*m) * differences) |
| return xp.mean(logs, axis=-1) |
|
|
|
|
| def _van_es_entropy(X, m, *, xp): |
| """Compute the van Es estimator as described in [6].""" |
| |
| |
| n = X.shape[-1] |
| difference = X[..., m:] - X[..., :-m] |
| term1 = 1/(n-m) * xp.sum(xp.log((n+1)/m * difference), axis=-1) |
| k = xp.arange(m, n+1, dtype=term1.dtype) |
| return term1 + xp.sum(1/k) + math.log(m) - math.log(n+1) |
|
|
|
|
| def _ebrahimi_entropy(X, m, *, xp): |
| """Compute the Ebrahimi estimator as described in [6].""" |
| |
| n = X.shape[-1] |
| X = _pad_along_last_axis(X, m, xp=xp) |
|
|
| differences = X[..., 2 * m:] - X[..., : -2 * m:] |
|
|
| i = xp.arange(1, n+1, dtype=X.dtype) |
| ci = xp.ones_like(i)*2 |
| ci[i <= m] = 1 + (i[i <= m] - 1)/m |
| ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m |
|
|
| logs = xp.log(n * differences / (ci * m)) |
| return xp.mean(logs, axis=-1) |
|
|
|
|
| def _correa_entropy(X, m, *, xp): |
| """Compute the Correa estimator as described in [6].""" |
| |
| n = X.shape[-1] |
| X = _pad_along_last_axis(X, m, xp=xp) |
|
|
| i = xp.arange(1, n+1) |
| dj = xp.arange(-m, m+1)[:, None] |
| j = i + dj |
| j0 = j + m - 1 |
|
|
| Xibar = xp.mean(X[..., j0], axis=-2, keepdims=True) |
| difference = X[..., j0] - Xibar |
| num = xp.sum(difference*dj, axis=-2) |
| den = n*xp.sum(difference**2, axis=-2) |
| return -xp.mean(xp.log(num/den), axis=-1) |
|
|