| import warnings |
| import numpy as np |
|
|
| from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan |
| from scipy._lib._bunch import _make_tuple_bunch |
| from scipy.spatial.distance import cdist |
| from scipy.ndimage import _measurements |
|
|
| from ._stats import _local_correlations |
| from . import distributions |
|
|
| __all__ = ['multiscale_graphcorr'] |
|
|
| |
|
|
|
|
| class _ParallelP: |
| """Helper function to calculate parallel p-value.""" |
|
|
| def __init__(self, x, y, random_states): |
| self.x = x |
| self.y = y |
| self.random_states = random_states |
|
|
| def __call__(self, index): |
| order = self.random_states[index].permutation(self.y.shape[0]) |
| permy = self.y[order][:, order] |
|
|
| |
| perm_stat = _mgc_stat(self.x, permy)[0] |
|
|
| return perm_stat |
|
|
|
|
| def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None): |
| r"""Helper function that calculates the p-value. See below for uses. |
| |
| Parameters |
| ---------- |
| x, y : ndarray |
| `x` and `y` have shapes ``(n, p)`` and ``(n, q)``. |
| stat : float |
| The sample test statistic. |
| reps : int, optional |
| The number of replications used to estimate the null when using the |
| permutation test. The default is 1000 replications. |
| workers : int or map-like callable, optional |
| If `workers` is an int the population is subdivided into `workers` |
| sections and evaluated in parallel (uses |
| `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores |
| available to the Process. Alternatively supply a map-like callable, |
| such as `multiprocessing.Pool.map` for evaluating the population in |
| parallel. This evaluation is carried out as `workers(func, iterable)`. |
| Requires that `func` be pickleable. |
| random_state : {None, int, `numpy.random.Generator`, |
| `numpy.random.RandomState`}, optional |
| |
| If `seed` is None (or `np.random`), the `numpy.random.RandomState` |
| singleton is used. |
| If `seed` is an int, a new ``RandomState`` instance is used, |
| seeded with `seed`. |
| If `seed` is already a ``Generator`` or ``RandomState`` instance then |
| that instance is used. |
| |
| Returns |
| ------- |
| pvalue : float |
| The sample test p-value. |
| null_dist : list |
| The approximated null distribution. |
| |
| """ |
| |
| |
| random_state = check_random_state(random_state) |
| random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32, |
| size=4, dtype=np.uint32)) for _ in range(reps)] |
|
|
| |
| parallelp = _ParallelP(x=x, y=y, random_states=random_states) |
| with MapWrapper(workers) as mapwrapper: |
| null_dist = np.array(list(mapwrapper(parallelp, range(reps)))) |
|
|
| |
| pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps) |
|
|
| return pvalue, null_dist |
|
|
|
|
| def _euclidean_dist(x): |
| return cdist(x, x) |
|
|
|
|
# Result container for `multiscale_graphcorr`: a named-tuple-like bunch
# (built with scipy's `_make_tuple_bunch`) exposing the fields
# (statistic, pvalue, mgc_dict) both positionally and as attributes.
MGCResult = _make_tuple_bunch('MGCResult',
                              ['statistic', 'pvalue', 'mgc_dict'], [])
|
|
|
|
| def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000, |
| workers=1, is_twosamp=False, random_state=None): |
| r"""Computes the Multiscale Graph Correlation (MGC) test statistic. |
| |
| Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for |
| one property (e.g. cloud density), and the :math:`l`-nearest neighbors for |
| the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is |
| called the "scale". A priori, however, it is not know which scales will be |
| most informative. So, MGC computes all distance pairs, and then efficiently |
| computes the distance correlations for all scales. The local correlations |
| illustrate which scales are relatively informative about the relationship. |
| The key, therefore, to successfully discover and decipher relationships |
| between disparate data modalities is to adaptively determine which scales |
| are the most informative, and the geometric implication for the most |
| informative scales. Doing so not only provides an estimate of whether the |
| modalities are related, but also provides insight into how the |
| determination was made. This is especially important in high-dimensional |
| data, where simple visualizations do not reveal relationships to the |
| unaided human eye. Characterizations of this implementation in particular |
| have been derived from and benchmarked within in [2]_. |
| |
| Parameters |
| ---------- |
| x, y : ndarray |
| If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is |
| the number of samples and `p` and `q` are the number of dimensions, |
| then the MGC independence test will be run. Alternatively, ``x`` and |
| ``y`` can have shapes ``(n, n)`` if they are distance or similarity |
| matrices, and ``compute_distance`` must be sent to ``None``. If ``x`` |
| and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired |
| two-sample MGC test will be run. |
| compute_distance : callable, optional |
| A function that computes the distance or similarity among the samples |
| within each data matrix. Set to ``None`` if ``x`` and ``y`` are |
| already distance matrices. The default uses the euclidean norm metric. |
| If you are calling a custom function, either create the distance |
| matrix before-hand or create a function of the form |
| ``compute_distance(x)`` where `x` is the data matrix for which |
| pairwise distances are calculated. |
| reps : int, optional |
| The number of replications used to estimate the null when using the |
| permutation test. The default is ``1000``. |
| workers : int or map-like callable, optional |
| If ``workers`` is an int the population is subdivided into ``workers`` |
| sections and evaluated in parallel (uses ``multiprocessing.Pool |
| <multiprocessing>``). Supply ``-1`` to use all cores available to the |
| Process. Alternatively supply a map-like callable, such as |
| ``multiprocessing.Pool.map`` for evaluating the p-value in parallel. |
| This evaluation is carried out as ``workers(func, iterable)``. |
| Requires that `func` be pickleable. The default is ``1``. |
| is_twosamp : bool, optional |
| If `True`, a two sample test will be run. If ``x`` and ``y`` have |
| shapes ``(n, p)`` and ``(m, p)``, this optional will be overridden and |
| set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes |
| ``(n, p)`` and a two sample test is desired. The default is ``False``. |
| Note that this will not run if inputs are distance matrices. |
| random_state : {None, int, `numpy.random.Generator`, |
| `numpy.random.RandomState`}, optional |
| |
| If `seed` is None (or `np.random`), the `numpy.random.RandomState` |
| singleton is used. |
| If `seed` is an int, a new ``RandomState`` instance is used, |
| seeded with `seed`. |
| If `seed` is already a ``Generator`` or ``RandomState`` instance then |
| that instance is used. |
| |
| Returns |
| ------- |
| res : MGCResult |
| An object containing attributes: |
| |
| statistic : float |
| The sample MGC test statistic within ``[-1, 1]``. |
| pvalue : float |
| The p-value obtained via permutation. |
| mgc_dict : dict |
| Contains additional useful results: |
| |
| - mgc_map : ndarray |
| A 2D representation of the latent geometry of the |
| relationship. |
| - opt_scale : (int, int) |
| The estimated optimal scale as a ``(x, y)`` pair. |
| - null_dist : list |
| The null distribution derived from the permuted matrices. |
| |
| See Also |
| -------- |
| pearsonr : Pearson correlation coefficient and p-value for testing |
| non-correlation. |
| kendalltau : Calculates Kendall's tau. |
| spearmanr : Calculates a Spearman rank-order correlation coefficient. |
| |
| Notes |
| ----- |
| A description of the process of MGC and applications on neuroscience data |
| can be found in [1]_. It is performed using the following steps: |
| |
| #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and |
| modified to be mean zero columnwise. This results in two |
| :math:`n \times n` distance matrices :math:`A` and :math:`B` (the |
| centering and unbiased modification) [3]_. |
| |
| #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`, |
| |
| * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs |
| are calculated for each property. Here, :math:`G_k (i, j)` indicates |
| the :math:`k`-smallest values of the :math:`i`-th row of :math:`A` |
| and :math:`H_l (i, j)` indicates the :math:`l` smallested values of |
| the :math:`i`-th row of :math:`B` |
| |
| * Let :math:`\circ` denotes the entry-wise matrix product, then local |
| correlations are summed and normalized using the following statistic: |
| |
| .. math:: |
| |
| c^{kl} = \frac{\sum_{ij} A G_k B H_l} |
| {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}} |
| |
| #. The MGC test statistic is the smoothed optimal local correlation of |
| :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)` |
| (which essentially set all isolated large correlations) as 0 and |
| connected large correlations the same as before, see [3]_.) MGC is, |
| |
| .. math:: |
| |
| MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right) |
| \right) |
| |
| The test statistic returns a value between :math:`(-1, 1)` since it is |
| normalized. |
| |
| The p-value returned is calculated using a permutation test. This process |
| is completed by first randomly permuting :math:`y` to estimate the null |
| distribution and then calculating the probability of observing a test |
| statistic, under the null, at least as extreme as the observed test |
| statistic. |
| |
| MGC requires at least 5 samples to run with reliable results. It can also |
| handle high-dimensional data sets. |
| In addition, by manipulating the input data matrices, the two-sample |
| testing problem can be reduced to the independence testing problem [4]_. |
| Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` |
| :math:`p \times m`, data matrix :math:`X` and :math:`Y` can be created as |
| follows: |
| |
| .. math:: |
| |
| X = [U | V] \in \mathcal{R}^{p \times (n + m)} |
| Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)} |
| |
| Then, the MGC statistic can be calculated as normal. This methodology can |
| be extended to similar tests such as distance correlation [4]_. |
| |
| .. versionadded:: 1.4.0 |
| |
| References |
| ---------- |
| .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E., |
| Maggioni, M., & Shen, C. (2019). Discovering and deciphering |
| relationships across disparate data modalities. ELife. |
| .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A., |
| Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019). |
| mgcpy: A Comprehensive High Dimensional Independence Testing Python |
| Package. :arXiv:`1907.02088` |
| .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance |
| correlation to multiscale graph correlation. Journal of the American |
| Statistical Association. |
| .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of |
| Distance and Kernel Methods for Hypothesis Testing. |
| :arXiv:`1806.05514` |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from scipy.stats import multiscale_graphcorr |
| >>> x = np.arange(100) |
| >>> y = x |
| >>> res = multiscale_graphcorr(x, y) |
| >>> res.statistic, res.pvalue |
| (1.0, 0.001) |
| |
| To run an unpaired two-sample test, |
| |
| >>> x = np.arange(100) |
| >>> y = np.arange(79) |
| >>> res = multiscale_graphcorr(x, y) |
| >>> res.statistic, res.pvalue # doctest: +SKIP |
| (0.033258146255703246, 0.023) |
| |
| or, if shape of the inputs are the same, |
| |
| >>> x = np.arange(100) |
| >>> y = x |
| >>> res = multiscale_graphcorr(x, y, is_twosamp=True) |
| >>> res.statistic, res.pvalue # doctest: +SKIP |
| (-0.008021809890200488, 1.0) |
| |
| """ |
| if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray): |
| raise ValueError("x and y must be ndarrays") |
|
|
| |
| if x.ndim == 1: |
| x = x[:, np.newaxis] |
| elif x.ndim != 2: |
| raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}") |
| if y.ndim == 1: |
| y = y[:, np.newaxis] |
| elif y.ndim != 2: |
| raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}") |
|
|
| nx, px = x.shape |
| ny, py = y.shape |
|
|
| |
| _contains_nan(x, nan_policy='raise') |
| _contains_nan(y, nan_policy='raise') |
|
|
| |
| if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0: |
| raise ValueError("Inputs contain infinities") |
|
|
| if nx != ny: |
| if px == py: |
| |
| is_twosamp = True |
| else: |
| raise ValueError("Shape mismatch, x and y must have shape [n, p] " |
| "and [n, q] or have shape [n, p] and [m, p].") |
|
|
| if nx < 5 or ny < 5: |
| raise ValueError("MGC requires at least 5 samples to give reasonable " |
| "results.") |
|
|
| |
| x = x.astype(np.float64) |
| y = y.astype(np.float64) |
|
|
| |
| if not callable(compute_distance) and compute_distance is not None: |
| raise ValueError("Compute_distance must be a function.") |
|
|
| |
| |
| if not isinstance(reps, int) or reps < 0: |
| raise ValueError("Number of reps must be an integer greater than 0.") |
| elif reps < 1000: |
| msg = ("The number of replications is low (under 1000), and p-value " |
| "calculations may be unreliable. Use the p-value result, with " |
| "caution!") |
| warnings.warn(msg, RuntimeWarning, stacklevel=2) |
|
|
| if is_twosamp: |
| if compute_distance is None: |
| raise ValueError("Cannot run if inputs are distance matrices") |
| x, y = _two_sample_transform(x, y) |
|
|
| if compute_distance is not None: |
| |
| x = compute_distance(x) |
| y = compute_distance(y) |
|
|
| |
| stat, stat_dict = _mgc_stat(x, y) |
| stat_mgc_map = stat_dict["stat_mgc_map"] |
| opt_scale = stat_dict["opt_scale"] |
|
|
| |
| pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers, |
| random_state=random_state) |
|
|
| |
| mgc_dict = {"mgc_map": stat_mgc_map, |
| "opt_scale": opt_scale, |
| "null_dist": null_dist} |
|
|
| |
| res = MGCResult(stat, pvalue, mgc_dict) |
| res.stat = stat |
| return res |
|
|
|
|
| def _mgc_stat(distx, disty): |
| r"""Helper function that calculates the MGC stat. See above for use. |
| |
| Parameters |
| ---------- |
| distx, disty : ndarray |
| `distx` and `disty` have shapes ``(n, p)`` and ``(n, q)`` or |
| ``(n, n)`` and ``(n, n)`` |
| if distance matrices. |
| |
| Returns |
| ------- |
| stat : float |
| The sample MGC test statistic within ``[-1, 1]``. |
| stat_dict : dict |
| Contains additional useful additional returns containing the following |
| keys: |
| |
| - stat_mgc_map : ndarray |
| MGC-map of the statistics. |
| - opt_scale : (float, float) |
| The estimated optimal scale as a ``(x, y)`` pair. |
| |
| """ |
| |
| stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc') |
|
|
| n, m = stat_mgc_map.shape |
| if m == 1 or n == 1: |
| |
| |
| |
| stat = stat_mgc_map[m - 1][n - 1] |
| opt_scale = m * n |
| else: |
| samp_size = len(distx) - 1 |
|
|
| |
| sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size) |
|
|
| |
| stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map) |
|
|
| stat_dict = {"stat_mgc_map": stat_mgc_map, |
| "opt_scale": opt_scale} |
|
|
| return stat, stat_dict |
|
|
|
|
| def _threshold_mgc_map(stat_mgc_map, samp_size): |
| r""" |
| Finds a connected region of significance in the MGC-map by thresholding. |
| |
| Parameters |
| ---------- |
| stat_mgc_map : ndarray |
| All local correlations within ``[-1,1]``. |
| samp_size : int |
| The sample size of original data. |
| |
| Returns |
| ------- |
| sig_connect : ndarray |
| A binary matrix with 1's indicating the significant region. |
| |
| """ |
| m, n = stat_mgc_map.shape |
|
|
| |
| |
| |
| per_sig = 1 - (0.02 / samp_size) |
| threshold = samp_size * (samp_size - 3)/4 - 1/2 |
| threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1 |
|
|
| |
| |
| threshold = max(threshold, stat_mgc_map[m - 1][n - 1]) |
|
|
| |
| sig_connect = stat_mgc_map > threshold |
| if np.sum(sig_connect) > 0: |
| sig_connect, _ = _measurements.label(sig_connect) |
| _, label_counts = np.unique(sig_connect, return_counts=True) |
|
|
| |
| max_label = np.argmax(label_counts[1:]) + 1 |
| sig_connect = sig_connect == max_label |
| else: |
| sig_connect = np.array([[False]]) |
|
|
| return sig_connect |
|
|
|
|
| def _smooth_mgc_map(sig_connect, stat_mgc_map): |
| """Finds the smoothed maximal within the significant region R. |
| |
| If area of R is too small it returns the last local correlation. Otherwise, |
| returns the maximum within significant_connected_region. |
| |
| Parameters |
| ---------- |
| sig_connect : ndarray |
| A binary matrix with 1's indicating the significant region. |
| stat_mgc_map : ndarray |
| All local correlations within ``[-1, 1]``. |
| |
| Returns |
| ------- |
| stat : float |
| The sample MGC statistic within ``[-1, 1]``. |
| opt_scale: (float, float) |
| The estimated optimal scale as an ``(x, y)`` pair. |
| |
| """ |
| m, n = stat_mgc_map.shape |
|
|
| |
| |
| stat = stat_mgc_map[m - 1][n - 1] |
| opt_scale = [m, n] |
|
|
| if np.linalg.norm(sig_connect) != 0: |
| |
| |
| |
| if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n): |
| max_corr = max(stat_mgc_map[sig_connect]) |
|
|
| |
| |
| max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect) |
|
|
| if max_corr >= stat: |
| stat = max_corr |
|
|
| k, l = max_corr_index |
| one_d_indices = k * n + l |
| k = np.max(one_d_indices) // n |
| l = np.max(one_d_indices) % n |
| opt_scale = [k+1, l+1] |
|
|
| return stat, opt_scale |
|
|
|
|
| def _two_sample_transform(u, v): |
| """Helper function that concatenates x and y for two sample MGC stat. |
| |
| See above for use. |
| |
| Parameters |
| ---------- |
| u, v : ndarray |
| `u` and `v` have shapes ``(n, p)`` and ``(m, p)``. |
| |
| Returns |
| ------- |
| x : ndarray |
| Concatenate `u` and `v` along the ``axis = 0``. `x` thus has shape |
| ``(2n, p)``. |
| y : ndarray |
| Label matrix for `x` where 0 refers to samples that comes from `u` and |
| 1 refers to samples that come from `v`. `y` thus has shape ``(2n, 1)``. |
| |
| """ |
| nx = u.shape[0] |
| ny = v.shape[0] |
| x = np.concatenate([u, v], axis=0) |
| y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1) |
| return x, y |
|
|