| |
| |
| import numpy as np |
| from ._ckdtree import cKDTree, cKDTreeNode |
|
|
| __all__ = ['minkowski_distance_p', 'minkowski_distance', |
| 'distance_matrix', |
| 'Rectangle', 'KDTree'] |
|
|
|
|
| def minkowski_distance_p(x, y, p=2): |
| """Compute the pth power of the L**p distance between two arrays. |
| |
| For efficiency, this function computes the L**p distance but does |
| not extract the pth root. If `p` is 1 or infinity, this is equal to |
| the actual L**p distance. |
| |
| The last dimensions of `x` and `y` must be the same length. Any |
| other dimensions must be compatible for broadcasting. |
| |
| Parameters |
| ---------- |
| x : (..., K) array_like |
| Input array. |
| y : (..., K) array_like |
| Input array. |
| p : float, 1 <= p <= infinity |
| Which Minkowski p-norm to use. |
| |
| Returns |
| ------- |
| dist : ndarray |
| pth power of the distance between the input arrays. |
| |
| Examples |
| -------- |
| >>> from scipy.spatial import minkowski_distance_p |
| >>> minkowski_distance_p([[0, 0], [0, 0]], [[1, 1], [0, 1]]) |
| array([2., 1.]) |
| |
| """ |
| x = np.asarray(x) |
| y = np.asarray(y) |
|
|
| |
| |
| |
| common_datatype = np.promote_types(np.promote_types(x.dtype, y.dtype), |
| 'float64') |
|
|
| |
| x = x.astype(common_datatype) |
| y = y.astype(common_datatype) |
|
|
| if p == np.inf: |
| return np.amax(np.abs(y-x), axis=-1) |
| elif p == 1: |
| return np.sum(np.abs(y-x), axis=-1) |
| else: |
| return np.sum(np.abs(y-x)**p, axis=-1) |
|
|
|
|
| def minkowski_distance(x, y, p=2): |
| """Compute the L**p distance between two arrays. |
| |
| The last dimensions of `x` and `y` must be the same length. Any |
| other dimensions must be compatible for broadcasting. |
| |
| Parameters |
| ---------- |
| x : (..., K) array_like |
| Input array. |
| y : (..., K) array_like |
| Input array. |
| p : float, 1 <= p <= infinity |
| Which Minkowski p-norm to use. |
| |
| Returns |
| ------- |
| dist : ndarray |
| Distance between the input arrays. |
| |
| Examples |
| -------- |
| >>> from scipy.spatial import minkowski_distance |
| >>> minkowski_distance([[0, 0], [0, 0]], [[1, 1], [0, 1]]) |
| array([ 1.41421356, 1. ]) |
| |
| """ |
| x = np.asarray(x) |
| y = np.asarray(y) |
| if p == np.inf or p == 1: |
| return minkowski_distance_p(x, y, p) |
| else: |
| return minkowski_distance_p(x, y, p)**(1./p) |
|
|
|
|
| class Rectangle: |
| """Hyperrectangle class. |
| |
| Represents a Cartesian product of intervals. |
| """ |
| def __init__(self, maxes, mins): |
| """Construct a hyperrectangle.""" |
| self.maxes = np.maximum(maxes,mins).astype(float) |
| self.mins = np.minimum(maxes,mins).astype(float) |
| self.m, = self.maxes.shape |
|
|
| def __repr__(self): |
| return f"<Rectangle {list(zip(self.mins, self.maxes))}>" |
|
|
| def volume(self): |
| """Total volume.""" |
| return np.prod(self.maxes-self.mins) |
|
|
| def split(self, d, split): |
| """Produce two hyperrectangles by splitting. |
| |
| In general, if you need to compute maximum and minimum |
| distances to the children, it can be done more efficiently |
| by updating the maximum and minimum distances to the parent. |
| |
| Parameters |
| ---------- |
| d : int |
| Axis to split hyperrectangle along. |
| split : float |
| Position along axis `d` to split at. |
| |
| """ |
| mid = np.copy(self.maxes) |
| mid[d] = split |
| less = Rectangle(self.mins, mid) |
| mid = np.copy(self.mins) |
| mid[d] = split |
| greater = Rectangle(mid, self.maxes) |
| return less, greater |
|
|
| def min_distance_point(self, x, p=2.): |
| """ |
| Return the minimum distance between input and points in the |
| hyperrectangle. |
| |
| Parameters |
| ---------- |
| x : array_like |
| Input. |
| p : float, optional |
| Input. |
| |
| """ |
| return minkowski_distance( |
| 0, np.maximum(0, np.maximum(self.mins-x, x-self.maxes)), |
| p |
| ) |
|
|
| def max_distance_point(self, x, p=2.): |
| """ |
| Return the maximum distance between input and points in the hyperrectangle. |
| |
| Parameters |
| ---------- |
| x : array_like |
| Input array. |
| p : float, optional |
| Input. |
| |
| """ |
| return minkowski_distance(0, np.maximum(self.maxes-x, x-self.mins), p) |
|
|
| def min_distance_rectangle(self, other, p=2.): |
| """ |
| Compute the minimum distance between points in the two hyperrectangles. |
| |
| Parameters |
| ---------- |
| other : hyperrectangle |
| Input. |
| p : float |
| Input. |
| |
| """ |
| return minkowski_distance( |
| 0, |
| np.maximum(0, np.maximum(self.mins-other.maxes, |
| other.mins-self.maxes)), |
| p |
| ) |
|
|
| def max_distance_rectangle(self, other, p=2.): |
| """ |
| Compute the maximum distance between points in the two hyperrectangles. |
| |
| Parameters |
| ---------- |
| other : hyperrectangle |
| Input. |
| p : float, optional |
| Input. |
| |
| """ |
| return minkowski_distance( |
| 0, np.maximum(self.maxes-other.mins, other.maxes-self.mins), p) |
|
|
|
|
| class KDTree(cKDTree): |
| """kd-tree for quick nearest-neighbor lookup. |
| |
| This class provides an index into a set of k-dimensional points |
| which can be used to rapidly look up the nearest neighbors of any |
| point. |
| |
| Parameters |
| ---------- |
| data : array_like, shape (n,m) |
| The n data points of dimension m to be indexed. This array is |
| not copied unless this is necessary to produce a contiguous |
| array of doubles, and so modifying this data will result in |
| bogus results. The data are also copied if the kd-tree is built |
| with copy_data=True. |
| leafsize : positive int, optional |
| The number of points at which the algorithm switches over to |
| brute-force. Default: 10. |
| compact_nodes : bool, optional |
| If True, the kd-tree is built to shrink the hyperrectangles to |
| the actual data range. This usually gives a more compact tree that |
| is robust against degenerated input data and gives faster queries |
| at the expense of longer build time. Default: True. |
| copy_data : bool, optional |
| If True the data is always copied to protect the kd-tree against |
| data corruption. Default: False. |
| balanced_tree : bool, optional |
| If True, the median is used to split the hyperrectangles instead of |
| the midpoint. This usually gives a more compact tree and |
| faster queries at the expense of longer build time. Default: True. |
| boxsize : array_like or scalar, optional |
| Apply a m-d toroidal topology to the KDTree.. The topology is generated |
| by :math:`x_i + n_i L_i` where :math:`n_i` are integers and :math:`L_i` |
| is the boxsize along i-th dimension. The input data shall be wrapped |
| into :math:`[0, L_i)`. A ValueError is raised if any of the data is |
| outside of this bound. |
| |
| Notes |
| ----- |
| The algorithm used is described in Maneewongvatana and Mount 1999. |
| The general idea is that the kd-tree is a binary tree, each of whose |
| nodes represents an axis-aligned hyperrectangle. Each node specifies |
| an axis and splits the set of points based on whether their coordinate |
| along that axis is greater than or less than a particular value. |
| |
| During construction, the axis and splitting point are chosen by the |
| "sliding midpoint" rule, which ensures that the cells do not all |
| become long and thin. |
| |
| The tree can be queried for the r closest neighbors of any given point |
| (optionally returning only those within some maximum distance of the |
| point). It can also be queried, with a substantial gain in efficiency, |
| for the r approximate closest neighbors. |
| |
| For large dimensions (20 is already large) do not expect this to run |
| significantly faster than brute force. High-dimensional nearest-neighbor |
| queries are a substantial open problem in computer science. |
| |
| Attributes |
| ---------- |
| data : ndarray, shape (n,m) |
| The n data points of dimension m to be indexed. This array is |
| not copied unless this is necessary to produce a contiguous |
| array of doubles. The data are also copied if the kd-tree is built |
| with ``copy_data=True``. |
| leafsize : positive int |
| The number of points at which the algorithm switches over to |
| brute-force. |
| m : int |
| The dimension of a single data-point. |
| n : int |
| The number of data points. |
| maxes : ndarray, shape (m,) |
| The maximum value in each dimension of the n data points. |
| mins : ndarray, shape (m,) |
| The minimum value in each dimension of the n data points. |
| size : int |
| The number of nodes in the tree. |
| |
| """ |
|
|
| class node: |
| @staticmethod |
| def _create(ckdtree_node=None): |
| """Create either an inner or leaf node, wrapping a cKDTreeNode instance""" |
| if ckdtree_node is None: |
| return KDTree.node(ckdtree_node) |
| elif ckdtree_node.split_dim == -1: |
| return KDTree.leafnode(ckdtree_node) |
| else: |
| return KDTree.innernode(ckdtree_node) |
|
|
| def __init__(self, ckdtree_node=None): |
| if ckdtree_node is None: |
| ckdtree_node = cKDTreeNode() |
| self._node = ckdtree_node |
|
|
| def __lt__(self, other): |
| return id(self) < id(other) |
|
|
| def __gt__(self, other): |
| return id(self) > id(other) |
|
|
| def __le__(self, other): |
| return id(self) <= id(other) |
|
|
| def __ge__(self, other): |
| return id(self) >= id(other) |
|
|
| def __eq__(self, other): |
| return id(self) == id(other) |
|
|
| class leafnode(node): |
| @property |
| def idx(self): |
| return self._node.indices |
|
|
| @property |
| def children(self): |
| return self._node.children |
|
|
| class innernode(node): |
| def __init__(self, ckdtreenode): |
| assert isinstance(ckdtreenode, cKDTreeNode) |
| super().__init__(ckdtreenode) |
| self.less = KDTree.node._create(ckdtreenode.lesser) |
| self.greater = KDTree.node._create(ckdtreenode.greater) |
|
|
| @property |
| def split_dim(self): |
| return self._node.split_dim |
|
|
| @property |
| def split(self): |
| return self._node.split |
|
|
| @property |
| def children(self): |
| return self._node.children |
|
|
| @property |
| def tree(self): |
| if not hasattr(self, "_tree"): |
| self._tree = KDTree.node._create(super().tree) |
|
|
| return self._tree |
|
|
| def __init__(self, data, leafsize=10, compact_nodes=True, copy_data=False, |
| balanced_tree=True, boxsize=None): |
| data = np.asarray(data) |
| if data.dtype.kind == 'c': |
| raise TypeError("KDTree does not work with complex data") |
|
|
| |
| super().__init__(data, leafsize, compact_nodes, copy_data, |
| balanced_tree, boxsize) |
|
|
| def query( |
| self, x, k=1, eps=0, p=2, distance_upper_bound=np.inf, workers=1): |
| r"""Query the kd-tree for nearest neighbors. |
| |
| Parameters |
| ---------- |
| x : array_like, last dimension self.m |
| An array of points to query. |
| k : int or Sequence[int], optional |
| Either the number of nearest neighbors to return, or a list of the |
| k-th nearest neighbors to return, starting from 1. |
| eps : nonnegative float, optional |
| Return approximate nearest neighbors; the kth returned value |
| is guaranteed to be no further than (1+eps) times the |
| distance to the real kth nearest neighbor. |
| p : float, 1<=p<=infinity, optional |
| Which Minkowski p-norm to use. |
| 1 is the sum-of-absolute-values distance ("Manhattan" distance). |
| 2 is the usual Euclidean distance. |
| infinity is the maximum-coordinate-difference distance. |
| A large, finite p may cause a ValueError if overflow can occur. |
| distance_upper_bound : nonnegative float, optional |
| Return only neighbors within this distance. This is used to prune |
| tree searches, so if you are doing a series of nearest-neighbor |
| queries, it may help to supply the distance to the nearest neighbor |
| of the most recent point. |
| workers : int, optional |
| Number of workers to use for parallel processing. If -1 is given |
| all CPU threads are used. Default: 1. |
| |
| .. versionadded:: 1.6.0 |
| |
| Returns |
| ------- |
| d : float or array of floats |
| The distances to the nearest neighbors. |
| If ``x`` has shape ``tuple+(self.m,)``, then ``d`` has shape |
| ``tuple+(k,)``. |
| When k == 1, the last dimension of the output is squeezed. |
| Missing neighbors are indicated with infinite distances. |
| Hits are sorted by distance (nearest first). |
| |
| .. versionchanged:: 1.9.0 |
| Previously if ``k=None``, then `d` was an object array of |
| shape ``tuple``, containing lists of distances. This behavior |
| has been removed, use `query_ball_point` instead. |
| |
| i : integer or array of integers |
| The index of each neighbor in ``self.data``. |
| ``i`` is the same shape as d. |
| Missing neighbors are indicated with ``self.n``. |
| |
| Examples |
| -------- |
| |
| >>> import numpy as np |
| >>> from scipy.spatial import KDTree |
| >>> x, y = np.mgrid[0:5, 2:8] |
| >>> tree = KDTree(np.c_[x.ravel(), y.ravel()]) |
| |
| To query the nearest neighbours and return squeezed result, use |
| |
| >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=1) |
| >>> print(dd, ii, sep='\n') |
| [2. 0.2236068] |
| [ 0 13] |
| |
| To query the nearest neighbours and return unsqueezed result, use |
| |
| >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1]) |
| >>> print(dd, ii, sep='\n') |
| [[2. ] |
| [0.2236068]] |
| [[ 0] |
| [13]] |
| |
| To query the second nearest neighbours and return unsqueezed result, |
| use |
| |
| >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[2]) |
| >>> print(dd, ii, sep='\n') |
| [[2.23606798] |
| [0.80622577]] |
| [[ 6] |
| [19]] |
| |
| To query the first and second nearest neighbours, use |
| |
| >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=2) |
| >>> print(dd, ii, sep='\n') |
| [[2. 2.23606798] |
| [0.2236068 0.80622577]] |
| [[ 0 6] |
| [13 19]] |
| |
| or, be more specific |
| |
| >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1, 2]) |
| >>> print(dd, ii, sep='\n') |
| [[2. 2.23606798] |
| [0.2236068 0.80622577]] |
| [[ 0 6] |
| [13 19]] |
| |
| """ |
| x = np.asarray(x) |
| if x.dtype.kind == 'c': |
| raise TypeError("KDTree does not work with complex data") |
|
|
| if k is None: |
| raise ValueError("k must be an integer or a sequence of integers") |
|
|
| d, i = super().query(x, k, eps, p, distance_upper_bound, workers) |
| if isinstance(i, int): |
| i = np.intp(i) |
| return d, i |
|
|
| def query_ball_point(self, x, r, p=2., eps=0, workers=1, |
| return_sorted=None, return_length=False): |
| """Find all points within distance r of point(s) x. |
| |
| Parameters |
| ---------- |
| x : array_like, shape tuple + (self.m,) |
| The point or points to search for neighbors of. |
| r : array_like, float |
| The radius of points to return, must broadcast to the length of x. |
| p : float, optional |
| Which Minkowski p-norm to use. Should be in the range [1, inf]. |
| A finite large p may cause a ValueError if overflow can occur. |
| eps : nonnegative float, optional |
| Approximate search. Branches of the tree are not explored if their |
| nearest points are further than ``r / (1 + eps)``, and branches are |
| added in bulk if their furthest points are nearer than |
| ``r * (1 + eps)``. |
| workers : int, optional |
| Number of jobs to schedule for parallel processing. If -1 is given |
| all processors are used. Default: 1. |
| |
| .. versionadded:: 1.6.0 |
| return_sorted : bool, optional |
| Sorts returned indices if True and does not sort them if False. If |
| None, does not sort single point queries, but does sort |
| multi-point queries which was the behavior before this option |
| was added. |
| |
| .. versionadded:: 1.6.0 |
| return_length : bool, optional |
| Return the number of points inside the radius instead of a list |
| of the indices. |
| |
| .. versionadded:: 1.6.0 |
| |
| Returns |
| ------- |
| results : list or array of lists |
| If `x` is a single point, returns a list of the indices of the |
| neighbors of `x`. If `x` is an array of points, returns an object |
| array of shape tuple containing lists of neighbors. |
| |
| Notes |
| ----- |
| If you have many points whose neighbors you want to find, you may save |
| substantial amounts of time by putting them in a KDTree and using |
| query_ball_tree. |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from scipy import spatial |
| >>> x, y = np.mgrid[0:5, 0:5] |
| >>> points = np.c_[x.ravel(), y.ravel()] |
| >>> tree = spatial.KDTree(points) |
| >>> sorted(tree.query_ball_point([2, 0], 1)) |
| [5, 10, 11, 15] |
| |
| Query multiple points and plot the results: |
| |
| >>> import matplotlib.pyplot as plt |
| >>> points = np.asarray(points) |
| >>> plt.plot(points[:,0], points[:,1], '.') |
| >>> for results in tree.query_ball_point(([2, 0], [3, 3]), 1): |
| ... nearby_points = points[results] |
| ... plt.plot(nearby_points[:,0], nearby_points[:,1], 'o') |
| >>> plt.margins(0.1, 0.1) |
| >>> plt.show() |
| |
| """ |
| x = np.asarray(x) |
| if x.dtype.kind == 'c': |
| raise TypeError("KDTree does not work with complex data") |
| return super().query_ball_point( |
| x, r, p, eps, workers, return_sorted, return_length) |
|
|
| def query_ball_tree(self, other, r, p=2., eps=0): |
| """ |
| Find all pairs of points between `self` and `other` whose distance is |
| at most r. |
| |
| Parameters |
| ---------- |
| other : KDTree instance |
| The tree containing points to search against. |
| r : float |
| The maximum distance, has to be positive. |
| p : float, optional |
| Which Minkowski norm to use. `p` has to meet the condition |
| ``1 <= p <= infinity``. |
| eps : float, optional |
| Approximate search. Branches of the tree are not explored |
| if their nearest points are further than ``r/(1+eps)``, and |
| branches are added in bulk if their furthest points are nearer |
| than ``r * (1+eps)``. `eps` has to be non-negative. |
| |
| Returns |
| ------- |
| results : list of lists |
| For each element ``self.data[i]`` of this tree, ``results[i]`` is a |
| list of the indices of its neighbors in ``other.data``. |
| |
| Examples |
| -------- |
| You can search all pairs of points between two kd-trees within a distance: |
| |
| >>> import matplotlib.pyplot as plt |
| >>> import numpy as np |
| >>> from scipy.spatial import KDTree |
| >>> rng = np.random.default_rng() |
| >>> points1 = rng.random((15, 2)) |
| >>> points2 = rng.random((15, 2)) |
| >>> plt.figure(figsize=(6, 6)) |
| >>> plt.plot(points1[:, 0], points1[:, 1], "xk", markersize=14) |
| >>> plt.plot(points2[:, 0], points2[:, 1], "og", markersize=14) |
| >>> kd_tree1 = KDTree(points1) |
| >>> kd_tree2 = KDTree(points2) |
| >>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2) |
| >>> for i in range(len(indexes)): |
| ... for j in indexes[i]: |
| ... plt.plot([points1[i, 0], points2[j, 0]], |
| ... [points1[i, 1], points2[j, 1]], "-r") |
| >>> plt.show() |
| |
| """ |
| return super().query_ball_tree(other, r, p, eps) |
|
|
| def query_pairs(self, r, p=2., eps=0, output_type='set'): |
| """Find all pairs of points in `self` whose distance is at most r. |
| |
| Parameters |
| ---------- |
| r : positive float |
| The maximum distance. |
| p : float, optional |
| Which Minkowski norm to use. `p` has to meet the condition |
| ``1 <= p <= infinity``. |
| eps : float, optional |
| Approximate search. Branches of the tree are not explored |
| if their nearest points are further than ``r/(1+eps)``, and |
| branches are added in bulk if their furthest points are nearer |
| than ``r * (1+eps)``. `eps` has to be non-negative. |
| output_type : string, optional |
| Choose the output container, 'set' or 'ndarray'. Default: 'set' |
| |
| .. versionadded:: 1.6.0 |
| |
| Returns |
| ------- |
| results : set or ndarray |
| Set of pairs ``(i,j)``, with ``i < j``, for which the corresponding |
| positions are close. If output_type is 'ndarray', an ndarry is |
| returned instead of a set. |
| |
| Examples |
| -------- |
| You can search all pairs of points in a kd-tree within a distance: |
| |
| >>> import matplotlib.pyplot as plt |
| >>> import numpy as np |
| >>> from scipy.spatial import KDTree |
| >>> rng = np.random.default_rng() |
| >>> points = rng.random((20, 2)) |
| >>> plt.figure(figsize=(6, 6)) |
| >>> plt.plot(points[:, 0], points[:, 1], "xk", markersize=14) |
| >>> kd_tree = KDTree(points) |
| >>> pairs = kd_tree.query_pairs(r=0.2) |
| >>> for (i, j) in pairs: |
| ... plt.plot([points[i, 0], points[j, 0]], |
| ... [points[i, 1], points[j, 1]], "-r") |
| >>> plt.show() |
| |
| """ |
| return super().query_pairs(r, p, eps, output_type) |
|
|
| def count_neighbors(self, other, r, p=2., weights=None, cumulative=True): |
| """Count how many nearby pairs can be formed. |
| |
| Count the number of pairs ``(x1,x2)`` can be formed, with ``x1`` drawn |
| from ``self`` and ``x2`` drawn from ``other``, and where |
| ``distance(x1, x2, p) <= r``. |
| |
| Data points on ``self`` and ``other`` are optionally weighted by the |
| ``weights`` argument. (See below) |
| |
| This is adapted from the "two-point correlation" algorithm described by |
| Gray and Moore [1]_. See notes for further discussion. |
| |
| Parameters |
| ---------- |
| other : KDTree |
| The other tree to draw points from, can be the same tree as self. |
| r : float or one-dimensional array of floats |
| The radius to produce a count for. Multiple radii are searched with |
| a single tree traversal. |
| If the count is non-cumulative(``cumulative=False``), ``r`` defines |
| the edges of the bins, and must be non-decreasing. |
| p : float, optional |
| 1<=p<=infinity. |
| Which Minkowski p-norm to use. |
| Default 2.0. |
| A finite large p may cause a ValueError if overflow can occur. |
| weights : tuple, array_like, or None, optional |
| If None, the pair-counting is unweighted. |
| If given as a tuple, weights[0] is the weights of points in |
| ``self``, and weights[1] is the weights of points in ``other``; |
| either can be None to indicate the points are unweighted. |
| If given as an array_like, weights is the weights of points in |
| ``self`` and ``other``. For this to make sense, ``self`` and |
| ``other`` must be the same tree. If ``self`` and ``other`` are two |
| different trees, a ``ValueError`` is raised. |
| Default: None |
| |
| .. versionadded:: 1.6.0 |
| cumulative : bool, optional |
| Whether the returned counts are cumulative. When cumulative is set |
| to ``False`` the algorithm is optimized to work with a large number |
| of bins (>10) specified by ``r``. When ``cumulative`` is set to |
| True, the algorithm is optimized to work with a small number of |
| ``r``. Default: True |
| |
| .. versionadded:: 1.6.0 |
| |
| Returns |
| ------- |
| result : scalar or 1-D array |
| The number of pairs. For unweighted counts, the result is integer. |
| For weighted counts, the result is float. |
| If cumulative is False, ``result[i]`` contains the counts with |
| ``(-inf if i == 0 else r[i-1]) < R <= r[i]`` |
| |
| Notes |
| ----- |
| Pair-counting is the basic operation used to calculate the two point |
| correlation functions from a data set composed of position of objects. |
| |
| Two point correlation function measures the clustering of objects and |
| is widely used in cosmology to quantify the large scale structure |
| in our Universe, but it may be useful for data analysis in other fields |
| where self-similar assembly of objects also occur. |
| |
| The Landy-Szalay estimator for the two point correlation function of |
| ``D`` measures the clustering signal in ``D``. [2]_ |
| |
| For example, given the position of two sets of objects, |
| |
| - objects ``D`` (data) contains the clustering signal, and |
| |
| - objects ``R`` (random) that contains no signal, |
| |
| .. math:: |
| |
| \\xi(r) = \\frac{<D, D> - 2 f <D, R> + f^2<R, R>}{f^2<R, R>}, |
| |
| where the brackets represents counting pairs between two data sets |
| in a finite bin around ``r`` (distance), corresponding to setting |
| `cumulative=False`, and ``f = float(len(D)) / float(len(R))`` is the |
| ratio between number of objects from data and random. |
| |
| The algorithm implemented here is loosely based on the dual-tree |
| algorithm described in [1]_. We switch between two different |
| pair-cumulation scheme depending on the setting of ``cumulative``. |
| The computing time of the method we use when for |
| ``cumulative == False`` does not scale with the total number of bins. |
| The algorithm for ``cumulative == True`` scales linearly with the |
| number of bins, though it is slightly faster when only |
| 1 or 2 bins are used. [5]_. |
| |
| As an extension to the naive pair-counting, |
| weighted pair-counting counts the product of weights instead |
| of number of pairs. |
| Weighted pair-counting is used to estimate marked correlation functions |
| ([3]_, section 2.2), |
| or to properly calculate the average of data per distance bin |
| (e.g. [4]_, section 2.1 on redshift). |
| |
| .. [1] Gray and Moore, |
| "N-body problems in statistical learning", |
| Mining the sky, 2000, |
| https://arxiv.org/abs/astro-ph/0012333 |
| |
| .. [2] Landy and Szalay, |
| "Bias and variance of angular correlation functions", |
| The Astrophysical Journal, 1993, |
| http://adsabs.harvard.edu/abs/1993ApJ...412...64L |
| |
| .. [3] Sheth, Connolly and Skibba, |
| "Marked correlations in galaxy formation models", |
| Arxiv e-print, 2005, |
| https://arxiv.org/abs/astro-ph/0511773 |
| |
| .. [4] Hawkins, et al., |
| "The 2dF Galaxy Redshift Survey: correlation functions, |
| peculiar velocities and the matter density of the Universe", |
| Monthly Notices of the Royal Astronomical Society, 2002, |
| http://adsabs.harvard.edu/abs/2003MNRAS.346...78H |
| |
| .. [5] https://github.com/scipy/scipy/pull/5647#issuecomment-168474926 |
| |
| Examples |
| -------- |
| You can count neighbors number between two kd-trees within a distance: |
| |
| >>> import numpy as np |
| >>> from scipy.spatial import KDTree |
| >>> rng = np.random.default_rng() |
| >>> points1 = rng.random((5, 2)) |
| >>> points2 = rng.random((5, 2)) |
| >>> kd_tree1 = KDTree(points1) |
| >>> kd_tree2 = KDTree(points2) |
| >>> kd_tree1.count_neighbors(kd_tree2, 0.2) |
| 1 |
| |
| This number is same as the total pair number calculated by |
| `query_ball_tree`: |
| |
| >>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2) |
| >>> sum([len(i) for i in indexes]) |
| 1 |
| |
| """ |
| return super().count_neighbors(other, r, p, weights, cumulative) |
|
|
| def sparse_distance_matrix( |
| self, other, max_distance, p=2., output_type='dok_matrix'): |
| """Compute a sparse distance matrix. |
| |
| Computes a distance matrix between two KDTrees, leaving as zero |
| any distance greater than max_distance. |
| |
| Parameters |
| ---------- |
| other : KDTree |
| |
| max_distance : positive float |
| |
| p : float, 1<=p<=infinity |
| Which Minkowski p-norm to use. |
| A finite large p may cause a ValueError if overflow can occur. |
| |
| output_type : string, optional |
| Which container to use for output data. Options: 'dok_matrix', |
| 'coo_matrix', 'dict', or 'ndarray'. Default: 'dok_matrix'. |
| |
| .. versionadded:: 1.6.0 |
| |
| Returns |
| ------- |
| result : dok_matrix, coo_matrix, dict or ndarray |
| Sparse matrix representing the results in "dictionary of keys" |
| format. If a dict is returned the keys are (i,j) tuples of indices. |
| If output_type is 'ndarray' a record array with fields 'i', 'j', |
| and 'v' is returned, |
| |
| Examples |
| -------- |
| You can compute a sparse distance matrix between two kd-trees: |
| |
| >>> import numpy as np |
| >>> from scipy.spatial import KDTree |
| >>> rng = np.random.default_rng() |
| >>> points1 = rng.random((5, 2)) |
| >>> points2 = rng.random((5, 2)) |
| >>> kd_tree1 = KDTree(points1) |
| >>> kd_tree2 = KDTree(points2) |
| >>> sdm = kd_tree1.sparse_distance_matrix(kd_tree2, 0.3) |
| >>> sdm.toarray() |
| array([[0. , 0. , 0.12295571, 0. , 0. ], |
| [0. , 0. , 0. , 0. , 0. ], |
| [0.28942611, 0. , 0. , 0.2333084 , 0. ], |
| [0. , 0. , 0. , 0. , 0. ], |
| [0.24617575, 0.29571802, 0.26836782, 0. , 0. ]]) |
| |
| You can check distances above the `max_distance` are zeros: |
| |
| >>> from scipy.spatial import distance_matrix |
| >>> distance_matrix(points1, points2) |
| array([[0.56906522, 0.39923701, 0.12295571, 0.8658745 , 0.79428925], |
| [0.37327919, 0.7225693 , 0.87665969, 0.32580855, 0.75679479], |
| [0.28942611, 0.30088013, 0.6395831 , 0.2333084 , 0.33630734], |
| [0.31994999, 0.72658602, 0.71124834, 0.55396483, 0.90785663], |
| [0.24617575, 0.29571802, 0.26836782, 0.57714465, 0.6473269 ]]) |
| |
| """ |
| return super().sparse_distance_matrix( |
| other, max_distance, p, output_type) |
|
|
|
|
| def distance_matrix(x, y, p=2, threshold=1000000): |
| """Compute the distance matrix. |
| |
| Returns the matrix of all pair-wise distances. |
| |
| Parameters |
| ---------- |
| x : (M, K) array_like |
| Matrix of M vectors in K dimensions. |
| y : (N, K) array_like |
| Matrix of N vectors in K dimensions. |
| p : float, 1 <= p <= infinity |
| Which Minkowski p-norm to use. |
| threshold : positive int |
| If ``M * N * K`` > `threshold`, algorithm uses a Python loop instead |
| of large temporary arrays. |
| |
| Returns |
| ------- |
| result : (M, N) ndarray |
| Matrix containing the distance from every vector in `x` to every vector |
| in `y`. |
| |
| Examples |
| -------- |
| >>> from scipy.spatial import distance_matrix |
| >>> distance_matrix([[0,0],[0,1]], [[1,0],[1,1]]) |
| array([[ 1. , 1.41421356], |
| [ 1.41421356, 1. ]]) |
| |
| """ |
|
|
| x = np.asarray(x) |
| m, k = x.shape |
| y = np.asarray(y) |
| n, kk = y.shape |
|
|
| if k != kk: |
| raise ValueError(f"x contains {k}-dimensional vectors but y contains " |
| f"{kk}-dimensional vectors") |
|
|
| if m*n*k <= threshold: |
| return minkowski_distance(x[:,np.newaxis,:],y[np.newaxis,:,:],p) |
| else: |
| result = np.empty((m,n),dtype=float) |
| if m < n: |
| for i in range(m): |
| result[i,:] = minkowski_distance(x[i],y,p) |
| else: |
| for j in range(n): |
| result[:,j] = minkowski_distance(x,y[j],p) |
| return result |
|
|