| |
|
|
| import numpy as np |
| cimport cython |
|
|
| from ..metrics._dist_metrics cimport DistanceMetric64 |
| from ..utils._fast_dict cimport IntFloatDict |
| from ..utils._typedefs cimport float64_t, intp_t, uint8_t |
|
|
| |
| from cython.operator cimport dereference as deref, preincrement as inc |
| from libcpp.map cimport map as cpp_map |
| from libc.math cimport fmax, INFINITY |
|
|
|
|
| |
| |
|
|
| def compute_ward_dist( |
| const float64_t[::1] m_1, |
| const float64_t[:, ::1] m_2, |
| const intp_t[::1] coord_row, |
| const intp_t[::1] coord_col, |
| float64_t[::1] res |
| ): |
| cdef intp_t size_max = coord_row.shape[0] |
| cdef intp_t n_features = m_2.shape[1] |
| cdef intp_t i, j, row, col |
| cdef float64_t pa, n |
|
|
| for i in range(size_max): |
| row = coord_row[i] |
| col = coord_col[i] |
| n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col]) |
| pa = 0. |
| for j in range(n_features): |
| pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2 |
| res[i] = pa * n |
|
|
|
|
| |
| |
|
|
| def _hc_get_descendent(intp_t node, children, intp_t n_leaves): |
| """ |
| Function returning all the descendent leaves of a set of nodes in the tree. |
| |
| Parameters |
| ---------- |
| node : integer |
| The node for which we want the descendents. |
| |
| children : list of pairs, length n_nodes |
| The children of each non-leaf node. Values less than `n_samples` refer |
| to leaves of the tree. A greater value `i` indicates a node with |
| children `children[i - n_samples]`. |
| |
| n_leaves : integer |
| Number of leaves. |
| |
| Returns |
| ------- |
| descendent : list of int |
| """ |
| ind = [node] |
| if node < n_leaves: |
| return ind |
| descendent = [] |
|
|
| |
| |
| |
| cdef intp_t i, n_indices = 1 |
|
|
| while n_indices: |
| i = ind.pop() |
| if i < n_leaves: |
| descendent.append(i) |
| n_indices -= 1 |
| else: |
| ind.extend(children[i - n_leaves]) |
| n_indices += 1 |
| return descendent |
|
|
|
|
| def hc_get_heads(intp_t[:] parents, copy=True): |
| """Returns the heads of the forest, as defined by parents. |
| |
| Parameters |
| ---------- |
| parents : array of integers |
| The parent structure defining the forest (ensemble of trees) |
| copy : boolean |
| If copy is False, the input 'parents' array is modified inplace |
| |
| Returns |
| ------- |
| heads : array of integers of same shape as parents |
| The indices in the 'parents' of the tree heads |
| |
| """ |
| cdef intp_t parent, node0, node, size |
| if copy: |
| parents = np.copy(parents) |
| size = parents.size |
|
|
| |
| for node0 in range(size - 1, -1, -1): |
| node = node0 |
| parent = parents[node] |
| while parent != node: |
| parents[node0] = parent |
| node = parent |
| parent = parents[node] |
| return parents |
|
|
|
|
| def _get_parents( |
| nodes, |
| heads, |
| const intp_t[:] parents, |
| uint8_t[::1] not_visited |
| ): |
| """Returns the heads of the given nodes, as defined by parents. |
| |
| Modifies 'heads' and 'not_visited' in-place. |
| |
| Parameters |
| ---------- |
| nodes : list of integers |
| The nodes to start from |
| heads : list of integers |
| A list to hold the results (modified inplace) |
| parents : array of integers |
| The parent structure defining the tree |
| not_visited |
| The tree nodes to consider (modified inplace) |
| |
| """ |
| cdef intp_t parent, node |
|
|
| for node in nodes: |
| parent = parents[node] |
| while parent != node: |
| node = parent |
| parent = parents[node] |
| if not_visited[node]: |
| not_visited[node] = 0 |
| heads.append(node) |
|
|
|
|
| |
| |
|
|
| |
| |
| |
|
|
|
|
| def max_merge( |
| IntFloatDict a, |
| IntFloatDict b, |
| const intp_t[:] mask, |
| intp_t n_a, |
| intp_t n_b |
| ): |
| """Merge two IntFloatDicts with the max strategy: when the same key is |
| present in the two dicts, the max of the two values is used. |
| |
| Parameters |
| ========== |
| a, b : IntFloatDict object |
| The IntFloatDicts to merge |
| mask : ndarray array of dtype integer and of dimension 1 |
| a mask for keys to ignore: if not mask[key] the corresponding key |
| is skipped in the output dictionary |
| n_a, n_b : float |
| n_a and n_b are weights for a and b for the merge strategy. |
| They are not used in the case of a max merge. |
| |
| Returns |
| ======= |
| out : IntFloatDict object |
| The IntFloatDict resulting from the merge |
| """ |
| cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) |
| cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() |
| cdef intp_t key |
| cdef float64_t value |
| |
| while a_it != a_end: |
| key = deref(a_it).first |
| if mask[key]: |
| out_obj.my_map[key] = deref(a_it).second |
| inc(a_it) |
|
|
| |
| cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() |
| cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() |
| while b_it != b_end: |
| key = deref(b_it).first |
| value = deref(b_it).second |
| if mask[key]: |
| out_it = out_obj.my_map.find(key) |
| if out_it == out_end: |
| |
| out_obj.my_map[key] = value |
| else: |
| deref(out_it).second = fmax(deref(out_it).second, value) |
| inc(b_it) |
| return out_obj |
|
|
|
|
| def average_merge( |
| IntFloatDict a, |
| IntFloatDict b, |
| const intp_t[:] mask, |
| intp_t n_a, |
| intp_t n_b |
| ): |
| """Merge two IntFloatDicts with the average strategy: when the |
| same key is present in the two dicts, the weighted average of the two |
| values is used. |
| |
| Parameters |
| ========== |
| a, b : IntFloatDict object |
| The IntFloatDicts to merge |
| mask : ndarray array of dtype integer and of dimension 1 |
| a mask for keys to ignore: if not mask[key] the corresponding key |
| is skipped in the output dictionary |
| n_a, n_b : float |
| n_a and n_b are weights for a and b for the merge strategy. |
| They are used for a weighted mean. |
| |
| Returns |
| ======= |
| out : IntFloatDict object |
| The IntFloatDict resulting from the merge |
| """ |
| cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) |
| cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() |
| cdef intp_t key |
| cdef float64_t value |
| cdef float64_t n_out = <float64_t> (n_a + n_b) |
| |
| while a_it != a_end: |
| key = deref(a_it).first |
| if mask[key]: |
| out_obj.my_map[key] = deref(a_it).second |
| inc(a_it) |
|
|
| |
| cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() |
| cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() |
| cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() |
| while b_it != b_end: |
| key = deref(b_it).first |
| value = deref(b_it).second |
| if mask[key]: |
| out_it = out_obj.my_map.find(key) |
| if out_it == out_end: |
| |
| out_obj.my_map[key] = value |
| else: |
| deref(out_it).second = (n_a * deref(out_it).second |
| + n_b * value) / n_out |
| inc(b_it) |
| return out_obj |
|
|
|
|
| |
| |
|
|
| cdef class WeightedEdge: |
| cdef public intp_t a |
| cdef public intp_t b |
| cdef public float64_t weight |
|
|
| def __init__(self, float64_t weight, intp_t a, intp_t b): |
| self.weight = weight |
| self.a = a |
| self.b = b |
|
|
| def __richcmp__(self, WeightedEdge other, int op): |
| """Cython-specific comparison method. |
| |
| op is the comparison code:: |
| < 0 |
| == 2 |
| > 4 |
| <= 1 |
| != 3 |
| >= 5 |
| """ |
| if op == 0: |
| return self.weight < other.weight |
| elif op == 1: |
| return self.weight <= other.weight |
| elif op == 2: |
| return self.weight == other.weight |
| elif op == 3: |
| return self.weight != other.weight |
| elif op == 4: |
| return self.weight > other.weight |
| elif op == 5: |
| return self.weight >= other.weight |
|
|
| def __repr__(self): |
| return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, |
| self.weight, |
| self.a, self.b) |
|
|
|
|
| |
| |
|
|
| cdef class UnionFind(object): |
|
|
| def __init__(self, N): |
| self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C') |
| self.next_label = N |
| self.size = np.hstack((np.ones(N, dtype=np.intp), |
| np.zeros(N - 1, dtype=np.intp))) |
|
|
| cdef void union(self, intp_t m, intp_t n) noexcept: |
| self.parent[m] = self.next_label |
| self.parent[n] = self.next_label |
| self.size[self.next_label] = self.size[m] + self.size[n] |
| self.next_label += 1 |
| return |
|
|
| @cython.wraparound(True) |
| cdef intp_t fast_find(self, intp_t n) noexcept: |
| cdef intp_t p |
| p = n |
| |
| while self.parent[n] != -1: |
| n = self.parent[n] |
| |
| while self.parent[p] != n: |
| p, self.parent[p] = self.parent[p], n |
| return n |
|
|
|
|
| def _single_linkage_label(const float64_t[:, :] L): |
| """ |
| Convert an linkage array or MST to a tree by labelling clusters at merges. |
| This is done by using a Union find structure to keep track of merges |
| efficiently. This is the private version of the function that assumes that |
| ``L`` has been properly validated. See ``single_linkage_label`` for the |
| user facing version of this function. |
| |
| Parameters |
| ---------- |
| L: array of shape (n_samples - 1, 3) |
| The linkage array or MST where each row specifies two samples |
| to be merged and a distance or weight at which the merge occurs. This |
| array is assumed to be sorted by the distance/weight. |
| |
| Returns |
| ------- |
| A tree in the format used by scipy.cluster.hierarchy. |
| """ |
|
|
| cdef float64_t[:, ::1] result_arr |
|
|
| cdef intp_t left, left_cluster, right, right_cluster, index |
| cdef float64_t delta |
|
|
| result_arr = np.zeros((L.shape[0], 4), dtype=np.float64) |
| U = UnionFind(L.shape[0] + 1) |
|
|
| for index in range(L.shape[0]): |
|
|
| left = <intp_t> L[index, 0] |
| right = <intp_t> L[index, 1] |
| delta = L[index, 2] |
|
|
| left_cluster = U.fast_find(left) |
| right_cluster = U.fast_find(right) |
|
|
| result_arr[index][0] = left_cluster |
| result_arr[index][1] = right_cluster |
| result_arr[index][2] = delta |
| result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster] |
|
|
| U.union(left_cluster, right_cluster) |
|
|
| return np.asarray(result_arr) |
|
|
|
|
| @cython.wraparound(True) |
| def single_linkage_label(L): |
| """ |
| Convert an linkage array or MST to a tree by labelling clusters at merges. |
| This is done by using a Union find structure to keep track of merges |
| efficiently. |
| |
| Parameters |
| ---------- |
| L: array of shape (n_samples - 1, 3) |
| The linkage array or MST where each row specifies two samples |
| to be merged and a distance or weight at which the merge occurs. This |
| array is assumed to be sorted by the distance/weight. |
| |
| Returns |
| ------- |
| A tree in the format used by scipy.cluster.hierarchy. |
| """ |
| |
| if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: |
| raise ValueError("Input MST array is not a validly formatted MST array") |
|
|
| is_sorted = lambda x: np.all(x[:-1] <= x[1:]) |
| if not is_sorted(L[:, 2]): |
| raise ValueError("Input MST array must be sorted by weight") |
|
|
| return _single_linkage_label(L) |
|
|
|
|
| |
| def mst_linkage_core( |
| const float64_t [:, ::1] raw_data, |
| DistanceMetric64 dist_metric): |
| """ |
| Compute the necessary elements of a minimum spanning |
| tree for computation of single linkage clustering. This |
| represents the MST-LINKAGE-CORE algorithm (Figure 6) from |
| :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering |
| algorithms" <1109.2378>`. |
| |
| In contrast to the scipy implementation is never computes |
| a full distance matrix, generating distances only as they |
| are needed and releasing them when no longer needed. |
| |
| Parameters |
| ---------- |
| raw_data: array of shape (n_samples, n_features) |
| The array of feature data to be clustered. Must be C-aligned |
| |
| dist_metric: DistanceMetric64 |
| A DistanceMetric64 object conforming to the API from |
| ``sklearn.metrics._dist_metrics.pxd`` that will be |
| used to compute distances. |
| |
| Returns |
| ------- |
| mst_core_data: array of shape (n_samples, 3) |
| An array providing information from which one |
| can either compute an MST, or the linkage hierarchy |
| very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical, |
| agglomerative clustering algorithms" <1109.2378>` algorithm |
| MST-LINKAGE-CORE for more details. |
| """ |
| cdef: |
| intp_t n_samples = raw_data.shape[0] |
| uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool) |
| float64_t[:, ::1] result = np.zeros((n_samples - 1, 3)) |
|
|
| intp_t current_node = 0 |
| intp_t new_node |
| intp_t i |
| intp_t j |
| intp_t num_features = raw_data.shape[1] |
|
|
| float64_t right_value |
| float64_t left_value |
| float64_t new_distance |
|
|
| float64_t[:] current_distances = np.full(n_samples, INFINITY) |
|
|
| for i in range(n_samples - 1): |
|
|
| in_tree[current_node] = 1 |
|
|
| new_distance = INFINITY |
| new_node = 0 |
|
|
| for j in range(n_samples): |
| if in_tree[j]: |
| continue |
|
|
| right_value = current_distances[j] |
| left_value = dist_metric.dist(&raw_data[current_node, 0], |
| &raw_data[j, 0], |
| num_features) |
|
|
| if left_value < right_value: |
| current_distances[j] = left_value |
|
|
| if current_distances[j] < new_distance: |
| new_distance = current_distances[j] |
| new_node = j |
|
|
| result[i, 0] = current_node |
| result[i, 1] = new_node |
| result[i, 2] = new_distance |
| current_node = new_node |
|
|
| return np.array(result) |
|
|