| """Hierarchical Agglomerative Clustering |
| |
These routines perform hierarchical agglomerative clustering of some
input data.
| |
| Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort, |
| Gael Varoquaux |
| License: BSD 3 clause |
| """ |
|
|
| |
| |
|
|
| import warnings |
| from heapq import heapify, heappop, heappush, heappushpop |
| from numbers import Integral, Real |
|
|
| import numpy as np |
| from scipy import sparse |
| from scipy.sparse.csgraph import connected_components |
|
|
| from ..base import ( |
| BaseEstimator, |
| ClassNamePrefixFeaturesOutMixin, |
| ClusterMixin, |
| _fit_context, |
| ) |
| from ..metrics import DistanceMetric |
| from ..metrics._dist_metrics import METRIC_MAPPING64 |
| from ..metrics.pairwise import _VALID_METRICS, paired_distances |
| from ..utils import check_array |
| from ..utils._fast_dict import IntFloatDict |
| from ..utils._param_validation import ( |
| HasMethods, |
| Interval, |
| StrOptions, |
| validate_params, |
| ) |
| from ..utils.graph import _fix_connected_components |
| from ..utils.validation import check_memory, validate_data |
|
|
| |
| from . import _hierarchical_fast as _hierarchical |
| from ._feature_agglomeration import AgglomerationTransform |
|
|
| |
| |
|
|
|
|
| def _fix_connectivity(X, connectivity, affinity): |
| """ |
| Fixes the connectivity matrix. |
| |
| The different steps are: |
| |
| - copies it |
| - makes it symmetric |
| - converts it to LIL if necessary |
| - completes it if necessary. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| Feature matrix representing `n_samples` samples to be clustered. |
| |
    connectivity : sparse matrix
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.

    affinity : {"euclidean", "precomputed"}, default="euclidean"
        Which affinity to use. At the moment `precomputed` and
        ``euclidean`` are supported. `euclidean` uses the
        Euclidean distance between points.
| |
| Returns |
| ------- |
| connectivity : sparse matrix |
| The fixed connectivity matrix. |
| |
| n_connected_components : int |
| The number of connected components in the graph. |
| """ |
| n_samples = X.shape[0] |
| if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples: |
| raise ValueError( |
| "Wrong shape for connectivity matrix: %s when X is %s" |
| % (connectivity.shape, X.shape) |
| ) |
|
|
| |
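    # Make the connectivity matrix symmetric: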
| connectivity = connectivity + connectivity.T |
|
|
| |
| if not sparse.issparse(connectivity): |
| connectivity = sparse.lil_matrix(connectivity) |
|
|
| |
| if connectivity.format != "lil": |
| connectivity = connectivity.tolil() |
|
|
| |
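    # Compute the number of connected components of the graph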
| n_connected_components, labels = connected_components(connectivity) |
|
|
| if n_connected_components > 1: |
| warnings.warn( |
| "the number of connected components of the " |
| "connectivity matrix is %d > 1. Completing it to avoid " |
| "stopping the tree early." % n_connected_components, |
| stacklevel=2, |
| ) |
| |
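        # Complete the graph by adding connections between the closest
        # samples of the different connected components.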
| connectivity = _fix_connected_components( |
| X=X, |
| graph=connectivity, |
| n_connected_components=n_connected_components, |
| component_labels=labels, |
| metric=affinity, |
| mode="connectivity", |
| ) |
|
|
| return connectivity, n_connected_components |
|
|
|
|
| def _single_linkage_tree( |
| connectivity, |
| n_samples, |
| n_nodes, |
| n_clusters, |
| n_connected_components, |
| return_distance, |
| ): |
| """ |
| Perform single linkage clustering on sparse data via the minimum |
| spanning tree from scipy.sparse.csgraph, then using union-find to label. |
| The parent array is then generated by walking through the tree. |
| """ |
| from scipy.sparse.csgraph import minimum_spanning_tree |
|
|
| |
| connectivity = connectivity.astype(np.float64, copy=False) |
|
|
| |
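    # Zero distances would be dropped by the sparse minimum spanning tree
    # routine, so replace them with a tiny epsilon (restored to zero below).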
| epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps |
| connectivity.data[connectivity.data == 0] = epsilon_value |
|
|
| |
| mst = minimum_spanning_tree(connectivity.tocsr()) |
|
|
| |
| mst = mst.tocoo() |
|
|
| |
| mst.data[mst.data == epsilon_value] = 0 |
|
|
| mst_array = np.vstack([mst.row, mst.col, mst.data]).T |
|
|
| |
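    # Sort edges of the minimum spanning tree by weight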
| mst_array = mst_array[np.argsort(mst_array.T[2], kind="mergesort"), :] |
|
|
| |
| single_linkage_tree = _hierarchical._single_linkage_label(mst_array) |
| children_ = single_linkage_tree[:, :2].astype(int) |
|
|
| |
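    # Compute the parent of each node from the list of merges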
| parent = np.arange(n_nodes, dtype=np.intp) |
| for i, (left, right) in enumerate(children_, n_samples): |
| if n_clusters is not None and i >= n_nodes: |
| break |
| if left < n_nodes: |
| parent[left] = i |
| if right < n_nodes: |
| parent[right] = i |
|
|
| if return_distance: |
| distances = single_linkage_tree[:, 2] |
| return children_, n_connected_components, n_samples, parent, distances |
| return children_, n_connected_components, n_samples, parent |
|
|
|
|
| |
| |
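###############################################################################
# Hierarchical tree building functions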
|
|
|
|
| @validate_params( |
| { |
| "X": ["array-like"], |
| "connectivity": ["array-like", "sparse matrix", None], |
| "n_clusters": [Interval(Integral, 1, None, closed="left"), None], |
| "return_distance": ["boolean"], |
| }, |
| prefer_skip_nested_validation=True, |
| ) |
| def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): |
| """Ward clustering based on a Feature matrix. |
| |
| Recursively merges the pair of clusters that minimally increases |
| within-cluster variance. |
| |
| The inertia matrix uses a Heapq-based representation. |
| |
| This is the structured version, that takes into account some topological |
| structure between samples. |
| |
| Read more in the :ref:`User Guide <hierarchical_clustering>`. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| Feature matrix representing `n_samples` samples to be clustered. |
| |
| connectivity : {array-like, sparse matrix}, default=None |
| Connectivity matrix. Defines for each sample the neighboring samples |
| following a given structure of the data. The matrix is assumed to |
| be symmetric and only the upper triangular half is used. |
        Default is None, i.e., the Ward algorithm is unstructured.

    n_clusters : int, default=None
        `n_clusters` should be less than `n_samples`. Stop early the
        construction of the tree at `n_clusters`. This is useful to decrease
| computation time if the number of clusters is not small compared to the |
| number of samples. In this case, the complete tree is not computed, thus |
| the 'children' output is of limited use, and the 'parents' output should |
| rather be used. This option is valid only when specifying a connectivity |
| matrix. |
| |
| return_distance : bool, default=False |
| If `True`, return the distance between the clusters. |
| |
| Returns |
| ------- |
| children : ndarray of shape (n_nodes-1, 2) |
| The children of each non-leaf node. Values less than `n_samples` |
| correspond to leaves of the tree which are the original samples. |
| A node `i` greater than or equal to `n_samples` is a non-leaf |
| node and has children `children_[i - n_samples]`. Alternatively |
| at the i-th iteration, children[i][0] and children[i][1] |
| are merged to form node `n_samples + i`. |
| |
| n_connected_components : int |
| The number of connected components in the graph. |
| |
| n_leaves : int |
| The number of leaves in the tree. |
| |
    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.
| |
| distances : ndarray of shape (n_nodes-1,) |
| Only returned if `return_distance` is set to `True` (for compatibility). |
| The distances between the centers of the nodes. `distances[i]` |
| corresponds to a weighted Euclidean distance between |
        the nodes `children[i, 0]` and `children[i, 1]`. If the nodes refer to
| leaves of the tree, then `distances[i]` is their unweighted Euclidean |
| distance. Distances are updated in the following way |
        (from scipy.cluster.hierarchy.linkage):
| |
| The new entry :math:`d(u,v)` is computed as follows, |
| |
| .. math:: |
| |
| d(u,v) = \\sqrt{\\frac{|v|+|s|} |
| {T}d(v,s)^2 |
| + \\frac{|v|+|t|} |
| {T}d(v,t)^2 |
| - \\frac{|v|} |
| {T}d(s,t)^2} |
| |
| where :math:`u` is the newly joined cluster consisting of |
| clusters :math:`s` and :math:`t`, :math:`v` is an unused |
| cluster in the forest, :math:`T=|v|+|s|+|t|`, and |
| :math:`|*|` is the cardinality of its argument. This is also |
| known as the incremental algorithm. |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from sklearn.cluster import ward_tree |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [4, 2], [4, 4], [4, 0]]) |
| >>> children, n_connected_components, n_leaves, parents = ward_tree(X) |
| >>> children |
| array([[0, 1], |
| [3, 5], |
| [2, 6], |
| [4, 7], |
| [8, 9]]) |
| >>> n_connected_components |
| 1 |
| >>> n_leaves |
| 6 |
| """ |
| X = np.asarray(X) |
| if X.ndim == 1: |
| X = np.reshape(X, (-1, 1)) |
| n_samples, n_features = X.shape |
|
|
| if connectivity is None: |
| from scipy.cluster import hierarchy |
|
|
| if n_clusters is not None: |
| warnings.warn( |
| ( |
| "Partial build of the tree is implemented " |
| "only for structured clustering (i.e. with " |
| "explicit connectivity). The algorithm " |
| "will build the full tree and only " |
| "retain the lower branches required " |
| "for the specified number of clusters" |
| ), |
| stacklevel=2, |
| ) |
| X = np.require(X, requirements="W") |
| out = hierarchy.ward(X) |
| children_ = out[:, :2].astype(np.intp) |
|
|
| if return_distance: |
| distances = out[:, 2] |
| return children_, 1, n_samples, None, distances |
| else: |
| return children_, 1, n_samples, None |
|
|
| connectivity, n_connected_components = _fix_connectivity( |
| X, connectivity, affinity="euclidean" |
| ) |
| if n_clusters is None: |
| n_nodes = 2 * n_samples - 1 |
| else: |
| if n_clusters > n_samples: |
            raise ValueError(
                "Cannot provide more clusters than samples. "
                "%i clusters were requested, but there are only %i "
                "samples." % (n_clusters, n_samples)
            )
| n_nodes = 2 * n_samples - n_clusters |
|
|
| |
| coord_row = [] |
| coord_col = [] |
| A = [] |
| for ind, row in enumerate(connectivity.rows): |
| A.append(row) |
| |
| |
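        # Keep only neighbors with a smaller index so that each pair of
        # samples is considered once.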
| row = [i for i in row if i < ind] |
| coord_row.extend( |
| len(row) |
| * [ |
| ind, |
| ] |
| ) |
| coord_col.extend(row) |
|
|
| coord_row = np.array(coord_row, dtype=np.intp, order="C") |
| coord_col = np.array(coord_col, dtype=np.intp, order="C") |
|
|
| |
| moments_1 = np.zeros(n_nodes, order="C") |
| moments_1[:n_samples] = 1 |
| moments_2 = np.zeros((n_nodes, n_features), order="C") |
| moments_2[:n_samples] = X |
| inertia = np.empty(len(coord_row), dtype=np.float64, order="C") |
| _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) |
| inertia = list(zip(inertia, coord_row, coord_col)) |
| heapify(inertia) |
|
|
| |
| parent = np.arange(n_nodes, dtype=np.intp) |
| used_node = np.ones(n_nodes, dtype=bool) |
| children = [] |
| if return_distance: |
| distances = np.empty(n_nodes - n_samples) |
|
|
| not_visited = np.empty(n_nodes, dtype=bool, order="C") |
|
|
| |
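    # Recursive merge loop: each iteration creates node `k` by merging the
    # pair of clusters with the smallest Ward inertia.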
| for k in range(n_samples, n_nodes): |
| |
| while True: |
| inert, i, j = heappop(inertia) |
| if used_node[i] and used_node[j]: |
| break |
| parent[i], parent[j] = k, k |
| children.append((i, j)) |
| used_node[i] = used_node[j] = False |
| if return_distance: |
| distances[k - n_samples] = inert |
|
|
| |
| moments_1[k] = moments_1[i] + moments_1[j] |
| moments_2[k] = moments_2[i] + moments_2[j] |
|
|
| |
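        # Update the neighborhood structure `A` and push the new candidate
        # merges onto the inertia heap.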
| coord_col = [] |
| not_visited.fill(1) |
| not_visited[k] = 0 |
| _hierarchical._get_parents(A[i], coord_col, parent, not_visited) |
| _hierarchical._get_parents(A[j], coord_col, parent, not_visited) |
| |
| [A[col].append(k) for col in coord_col] |
| A.append(coord_col) |
| coord_col = np.array(coord_col, dtype=np.intp, order="C") |
| coord_row = np.empty(coord_col.shape, dtype=np.intp, order="C") |
| coord_row.fill(k) |
| n_additions = len(coord_row) |
| ini = np.empty(n_additions, dtype=np.float64, order="C") |
|
|
| _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) |
|
|
| |
| [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] |
|
|
| |
| n_leaves = n_samples |
| |
| children = [c[::-1] for c in children] |
| children = np.array(children) |
|
|
| if return_distance: |
| |
| distances = np.sqrt(2.0 * distances) |
| return children, n_connected_components, n_leaves, parent, distances |
| else: |
| return children, n_connected_components, n_leaves, parent |
|
|
|
|
| |
| def linkage_tree( |
| X, |
| connectivity=None, |
| n_clusters=None, |
| linkage="complete", |
| affinity="euclidean", |
| return_distance=False, |
| ): |
| """Linkage agglomerative clustering based on a Feature matrix. |
| |
| The inertia matrix uses a Heapq-based representation. |
| |
| This is the structured version, that takes into account some topological |
| structure between samples. |
| |
| Read more in the :ref:`User Guide <hierarchical_clustering>`. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| Feature matrix representing `n_samples` samples to be clustered. |
| |
    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is `None`, i.e., the hierarchical clustering algorithm is
        unstructured.
| |
| n_clusters : int, default=None |
| Stop early the construction of the tree at `n_clusters`. This is |
| useful to decrease computation time if the number of clusters is |
| not small compared to the number of samples. In this case, the |
| complete tree is not computed, thus the 'children' output is of |
| limited use, and the 'parents' output should rather be used. |
| This option is valid only when specifying a connectivity matrix. |
| |
| linkage : {"average", "complete", "single"}, default="complete" |
| Which linkage criteria to use. The linkage criterion determines which |
| distance to use between sets of observation. |
| - "average" uses the average of the distances of each observation of |
| the two sets. |
| - "complete" or maximum linkage uses the maximum distances between |
| all observations of the two sets. |
| - "single" uses the minimum of the distances between all |
| observations of the two sets. |
| |
| affinity : str or callable, default='euclidean' |
| Which metric to use. Can be 'euclidean', 'manhattan', or any |
| distance known to paired distance (see metric.pairwise). |
| |
| return_distance : bool, default=False |
| Whether or not to return the distances between the clusters. |
| |
| Returns |
| ------- |
| children : ndarray of shape (n_nodes-1, 2) |
| The children of each non-leaf node. Values less than `n_samples` |
| correspond to leaves of the tree which are the original samples. |
| A node `i` greater than or equal to `n_samples` is a non-leaf |
| node and has children `children_[i - n_samples]`. Alternatively |
| at the i-th iteration, children[i][0] and children[i][1] |
| are merged to form node `n_samples + i`. |
| |
| n_connected_components : int |
| The number of connected components in the graph. |
| |
| n_leaves : int |
| The number of leaves in the tree. |
| |
    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.
| |
| distances : ndarray of shape (n_nodes-1,) |
| Returned when `return_distance` is set to `True`. |
| |
| distances[i] refers to the distance between children[i][0] and |
| children[i][1] when they are merged. |
| |
| See Also |
| -------- |
| ward_tree : Hierarchical clustering with ward linkage. |
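
    Examples
    --------
    A small usage sketch, assuming this module is importable under its
    private path ``sklearn.cluster._agglomerative`` (the exact merge order
    depends on the data and on the chosen linkage):

    >>> import numpy as np
    >>> from sklearn.cluster._agglomerative import linkage_tree
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> children, n_connected_components, n_leaves, parents = linkage_tree(
    ...     X, linkage="complete"
    ... )
    >>> children.shape
    (5, 2)
    >>> n_connected_components
    1
    >>> n_leaves
    6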
| """ |
| X = np.asarray(X) |
| if X.ndim == 1: |
| X = np.reshape(X, (-1, 1)) |
| n_samples, n_features = X.shape |
|
|
| linkage_choices = { |
| "complete": _hierarchical.max_merge, |
| "average": _hierarchical.average_merge, |
| "single": None, |
| } |
| try: |
| join_func = linkage_choices[linkage] |
| except KeyError as e: |
| raise ValueError( |
| "Unknown linkage option, linkage should be one of %s, but %s was given" |
| % (linkage_choices.keys(), linkage) |
| ) from e |
|
|
| if affinity == "cosine" and np.any(~np.any(X, axis=1)): |
| raise ValueError("Cosine affinity cannot be used when X contains zero vectors") |
|
|
| if connectivity is None: |
| from scipy.cluster import hierarchy |
|
|
| if n_clusters is not None: |
| warnings.warn( |
| ( |
| "Partial build of the tree is implemented " |
| "only for structured clustering (i.e. with " |
| "explicit connectivity). The algorithm " |
| "will build the full tree and only " |
| "retain the lower branches required " |
| "for the specified number of clusters" |
| ), |
| stacklevel=2, |
| ) |
|
|
| if affinity == "precomputed": |
| |
| |
| |
| if X.shape[0] != X.shape[1]: |
| raise ValueError( |
| f"Distance matrix should be square, got matrix of shape {X.shape}" |
| ) |
| i, j = np.triu_indices(X.shape[0], k=1) |
| X = X[i, j] |
| elif affinity == "l2": |
| |
| affinity = "euclidean" |
| elif affinity in ("l1", "manhattan"): |
| affinity = "cityblock" |
| elif callable(affinity): |
| X = affinity(X) |
| i, j = np.triu_indices(X.shape[0], k=1) |
| X = X[i, j] |
| if ( |
| linkage == "single" |
| and affinity != "precomputed" |
| and not callable(affinity) |
| and affinity in METRIC_MAPPING64 |
| ): |
| |
| dist_metric = DistanceMetric.get_metric(affinity) |
|
|
| |
| X = np.ascontiguousarray(X, dtype=np.double) |
|
|
| mst = _hierarchical.mst_linkage_core(X, dist_metric) |
| |
| mst = mst[np.argsort(mst.T[2], kind="mergesort"), :] |
|
|
| |
| out = _hierarchical.single_linkage_label(mst) |
| else: |
| out = hierarchy.linkage(X, method=linkage, metric=affinity) |
| children_ = out[:, :2].astype(int, copy=False) |
|
|
| if return_distance: |
| distances = out[:, 2] |
| return children_, 1, n_samples, None, distances |
| return children_, 1, n_samples, None |
|
|
| connectivity, n_connected_components = _fix_connectivity( |
| X, connectivity, affinity=affinity |
| ) |
| connectivity = connectivity.tocoo() |
| |
| diag_mask = connectivity.row != connectivity.col |
| connectivity.row = connectivity.row[diag_mask] |
| connectivity.col = connectivity.col[diag_mask] |
| connectivity.data = connectivity.data[diag_mask] |
| del diag_mask |
|
|
| if affinity == "precomputed": |
| distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False) |
| else: |
| |
| |
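        # Compute the distances only along the retained edges of the
        # connectivity graph.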
| distances = paired_distances( |
| X[connectivity.row], X[connectivity.col], metric=affinity |
| ) |
| connectivity.data = distances |
|
|
| if n_clusters is None: |
| n_nodes = 2 * n_samples - 1 |
| else: |
| assert n_clusters <= n_samples |
| n_nodes = 2 * n_samples - n_clusters |
|
|
| if linkage == "single": |
| return _single_linkage_tree( |
| connectivity, |
| n_samples, |
| n_nodes, |
| n_clusters, |
| n_connected_components, |
| return_distance, |
| ) |
|
|
| if return_distance: |
| distances = np.empty(n_nodes - n_samples) |
| |
| A = np.empty(n_nodes, dtype=object) |
| inertia = list() |
|
|
| |
| |
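    # LIL gives fast row access; each row is stored below as an IntFloatDict
    # mapping a neighbor index to the corresponding edge distance.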
| connectivity = connectivity.tolil() |
| |
| for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)): |
| A[ind] = IntFloatDict( |
| np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64) |
| ) |
| |
| |
| inertia.extend( |
| _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind |
| ) |
| del connectivity |
|
|
| heapify(inertia) |
|
|
| |
| parent = np.arange(n_nodes, dtype=np.intp) |
| used_node = np.ones(n_nodes, dtype=np.intp) |
| children = [] |
|
|
| |
| for k in range(n_samples, n_nodes): |
| |
| while True: |
| edge = heappop(inertia) |
| if used_node[edge.a] and used_node[edge.b]: |
| break |
| i = edge.a |
| j = edge.b |
|
|
| if return_distance: |
| |
| distances[k - n_samples] = edge.weight |
|
|
| parent[i] = parent[j] = k |
| children.append((i, j)) |
| |
| n_i = used_node[i] |
| n_j = used_node[j] |
| used_node[k] = n_i + n_j |
| used_node[i] = used_node[j] = False |
|
|
| |
| |
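        # Merge the neighborhoods of i and j with the linkage-specific rule
        # (maximum distance for "complete", size-weighted mean for "average").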
| coord_col = join_func(A[i], A[j], used_node, n_i, n_j) |
| for col, d in coord_col: |
| A[col].append(k, d) |
| |
| |
| heappush(inertia, _hierarchical.WeightedEdge(d, k, col)) |
| A[k] = coord_col |
| |
| A[i] = A[j] = 0 |
|
|
| |
| n_leaves = n_samples |
|
|
| |
| children = np.array(children)[:, ::-1] |
|
|
| if return_distance: |
| return children, n_connected_components, n_leaves, parent, distances |
| return children, n_connected_components, n_leaves, parent |
|
|
|
|
| |
| def _complete_linkage(*args, **kwargs): |
| kwargs["linkage"] = "complete" |
| return linkage_tree(*args, **kwargs) |
|
|
|
|
| def _average_linkage(*args, **kwargs): |
| kwargs["linkage"] = "average" |
| return linkage_tree(*args, **kwargs) |
|
|
|
|
| def _single_linkage(*args, **kwargs): |
| kwargs["linkage"] = "single" |
| return linkage_tree(*args, **kwargs) |
|
|
|
|
| _TREE_BUILDERS = dict( |
| ward=ward_tree, |
| complete=_complete_linkage, |
| average=_average_linkage, |
| single=_single_linkage, |
| ) |
|
|
| |
| |
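###############################################################################
# Functions for cutting hierarchical clustering tree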
|
|
|
|
| def _hc_cut(n_clusters, children, n_leaves): |
| """Function cutting the ward tree for a given number of clusters. |
| |
| Parameters |
| ---------- |
| n_clusters : int or ndarray |
| The number of clusters to form. |
| |
| children : ndarray of shape (n_nodes-1, 2) |
| The children of each non-leaf node. Values less than `n_samples` |
| correspond to leaves of the tree which are the original samples. |
| A node `i` greater than or equal to `n_samples` is a non-leaf |
| node and has children `children_[i - n_samples]`. Alternatively |
| at the i-th iteration, children[i][0] and children[i][1] |
| are merged to form node `n_samples + i`. |
| |
| n_leaves : int |
| Number of leaves of the tree. |
| |
| Returns |
| ------- |
    labels : ndarray of shape (n_samples,)
| Cluster labels for each point. |
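
    Examples
    --------
    A minimal sketch using the tree from the `ward_tree` docstring example
    (the `children` array below is that example's output):

    >>> import numpy as np
    >>> from sklearn.cluster._agglomerative import _hc_cut
    >>> children = np.array([[0, 1], [3, 5], [2, 6], [4, 7], [8, 9]])
    >>> _hc_cut(2, children, n_leaves=6)
    array([1, 1, 1, 0, 0, 0])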
| """ |
| if n_clusters > n_leaves: |
| raise ValueError( |
| "Cannot extract more clusters than samples: " |
| f"{n_clusters} clusters were given for a tree with {n_leaves} leaves." |
| ) |
| |
| |
| |
| |
| |
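    # Nodes are stored as negated indices in a min-heap so that the most
    # recently formed node (largest index) is always at the root of the heap;
    # children[-1] holds the two children of the root of the tree.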
| nodes = [-(max(children[-1]) + 1)] |
| for _ in range(n_clusters - 1): |
| |
| these_children = children[-nodes[0] - n_leaves] |
| |
| heappush(nodes, -these_children[0]) |
| heappushpop(nodes, -these_children[1]) |
| label = np.zeros(n_leaves, dtype=np.intp) |
| for i, node in enumerate(nodes): |
| label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i |
| return label |
|
|
|
|
| |
|
|
|
|
| class AgglomerativeClustering(ClusterMixin, BaseEstimator): |
| """ |
| Agglomerative Clustering. |
| |
| Recursively merges pair of clusters of sample data; uses linkage distance. |
| |
| Read more in the :ref:`User Guide <hierarchical_clustering>`. |
| |
| Parameters |
| ---------- |
| n_clusters : int or None, default=2 |
| The number of clusters to find. It must be ``None`` if |
| ``distance_threshold`` is not ``None``. |
| |
| metric : str or callable, default="euclidean" |
| Metric used to compute the linkage. Can be "euclidean", "l1", "l2", |
| "manhattan", "cosine", or "precomputed". If linkage is "ward", only |
| "euclidean" is accepted. If "precomputed", a distance matrix is needed |
        as input for the fit method. If connectivity is None, linkage is
        "single", and the metric is not "precomputed", any valid pairwise
        distance metric can be assigned.
| |
| .. versionadded:: 1.2 |
| |
| memory : str or object with the joblib.Memory interface, default=None |
| Used to cache the output of the computation of the tree. |
| By default, no caching is done. If a string is given, it is the |
| path to the caching directory. |
| |
| connectivity : array-like, sparse matrix, or callable, default=None |
| Connectivity matrix. Defines for each sample the neighboring |
| samples following a given structure of the data. |
| This can be a connectivity matrix itself or a callable that transforms |
| the data into a connectivity matrix, such as derived from |
| `kneighbors_graph`. Default is ``None``, i.e, the |
| hierarchical clustering algorithm is unstructured. |
| |
| For an example of connectivity matrix using |
| :class:`~sklearn.neighbors.kneighbors_graph`, see |
| :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`. |
| |
| compute_full_tree : 'auto' or bool, default='auto' |
| Stop early the construction of the tree at ``n_clusters``. This is |
| useful to decrease computation time if the number of clusters is not |
| small compared to the number of samples. This option is useful only |
| when specifying a connectivity matrix. Note also that when varying the |
| number of clusters and using caching, it may be advantageous to compute |
| the full tree. It must be ``True`` if ``distance_threshold`` is not |
        ``None``. By default `compute_full_tree` is "auto", which is equivalent
        to `True` when `distance_threshold` is not `None` or when `n_clusters`
        is lower than the maximum of 100 and `0.02 * n_samples`.
        Otherwise, "auto" is equivalent to `False`.
| |
| linkage : {'ward', 'complete', 'average', 'single'}, default='ward' |
| Which linkage criterion to use. The linkage criterion determines which |
| distance to use between sets of observation. The algorithm will merge |
| the pairs of cluster that minimize this criterion. |
| |
| - 'ward' minimizes the variance of the clusters being merged. |
| - 'average' uses the average of the distances of each observation of |
| the two sets. |
| - 'complete' or 'maximum' linkage uses the maximum distances between |
| all observations of the two sets. |
| - 'single' uses the minimum of the distances between all observations |
| of the two sets. |
| |
| .. versionadded:: 0.20 |
| Added the 'single' option |
| |
| For examples comparing different `linkage` criteria, see |
| :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`. |
| |
| distance_threshold : float, default=None |
| The linkage distance threshold at or above which clusters will not be |
| merged. If not ``None``, ``n_clusters`` must be ``None`` and |
| ``compute_full_tree`` must be ``True``. |
| |
| .. versionadded:: 0.21 |
| |
| compute_distances : bool, default=False |
| Computes distances between clusters even if `distance_threshold` is not |
| used. This can be used to make dendrogram visualization, but introduces |
| a computational and memory overhead. |
| |
| .. versionadded:: 0.24 |
| |
| For an example of dendrogram visualization, see |
| :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`. |
| |
| Attributes |
| ---------- |
| n_clusters_ : int |
| The number of clusters found by the algorithm. If |
| ``distance_threshold=None``, it will be equal to the given |
| ``n_clusters``. |
| |
| labels_ : ndarray of shape (n_samples) |
| Cluster labels for each point. |
| |
| n_leaves_ : int |
| Number of leaves in the hierarchical tree. |
| |
| n_connected_components_ : int |
| The estimated number of connected components in the graph. |
| |
| .. versionadded:: 0.21 |
| ``n_connected_components_`` was added to replace ``n_components_``. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| children_ : array-like of shape (n_samples-1, 2) |
| The children of each non-leaf node. Values less than `n_samples` |
| correspond to leaves of the tree which are the original samples. |
| A node `i` greater than or equal to `n_samples` is a non-leaf |
| node and has children `children_[i - n_samples]`. Alternatively |
| at the i-th iteration, children[i][0] and children[i][1] |
| are merged to form node `n_samples + i`. |
| |
| distances_ : array-like of shape (n_nodes-1,) |
| Distances between nodes in the corresponding place in `children_`. |
| Only computed if `distance_threshold` is used or `compute_distances` |
| is set to `True`. |
| |
| See Also |
| -------- |
| FeatureAgglomeration : Agglomerative clustering but for features instead of |
| samples. |
| ward_tree : Hierarchical clustering with ward linkage. |
| |
| Examples |
| -------- |
| >>> from sklearn.cluster import AgglomerativeClustering |
| >>> import numpy as np |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [4, 2], [4, 4], [4, 0]]) |
| >>> clustering = AgglomerativeClustering().fit(X) |
| >>> clustering |
| AgglomerativeClustering() |
| >>> clustering.labels_ |
| array([1, 1, 1, 0, 0, 0]) |
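
    A sketch of the ``distance_threshold`` mode on the same toy data (the
    threshold value of 4 is chosen by hand for this example):

    >>> clustering = AgglomerativeClustering(
    ...     n_clusters=None, distance_threshold=4.0
    ... ).fit(X)
    >>> clustering.n_clusters_
    2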
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_clusters": [Interval(Integral, 1, None, closed="left"), None], |
| "metric": [ |
| StrOptions(set(_VALID_METRICS) | {"precomputed"}), |
| callable, |
| ], |
| "memory": [str, HasMethods("cache"), None], |
| "connectivity": ["array-like", "sparse matrix", callable, None], |
| "compute_full_tree": [StrOptions({"auto"}), "boolean"], |
| "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], |
| "distance_threshold": [Interval(Real, 0, None, closed="left"), None], |
| "compute_distances": ["boolean"], |
| } |
|
|
| def __init__( |
| self, |
| n_clusters=2, |
| *, |
| metric="euclidean", |
| memory=None, |
| connectivity=None, |
| compute_full_tree="auto", |
| linkage="ward", |
| distance_threshold=None, |
| compute_distances=False, |
| ): |
| self.n_clusters = n_clusters |
| self.distance_threshold = distance_threshold |
| self.memory = memory |
| self.connectivity = connectivity |
| self.compute_full_tree = compute_full_tree |
| self.linkage = linkage |
| self.metric = metric |
| self.compute_distances = compute_distances |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit(self, X, y=None): |
| """Fit the hierarchical clustering from features, or distance matrix. |
| |
| Parameters |
| ---------- |
| X : array-like, shape (n_samples, n_features) or \ |
| (n_samples, n_samples) |
| Training instances to cluster, or distances between instances if |
| ``metric='precomputed'``. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| Returns |
| ------- |
| self : object |
| Returns the fitted instance. |
| """ |
| X = validate_data(self, X, ensure_min_samples=2) |
| return self._fit(X) |
|
|
| def _fit(self, X): |
| """Fit without validation |
| |
| Parameters |
| ---------- |
| X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) |
| Training instances to cluster, or distances between instances if |
| ``metric='precomputed'``. |
| |
| Returns |
| ------- |
| self : object |
| Returns the fitted instance. |
| """ |
| memory = check_memory(self.memory) |
|
|
| if not ((self.n_clusters is None) ^ (self.distance_threshold is None)): |
| raise ValueError( |
| "Exactly one of n_clusters and " |
| "distance_threshold has to be set, and the other " |
| "needs to be None." |
| ) |
|
|
| if self.distance_threshold is not None and not self.compute_full_tree: |
| raise ValueError( |
| "compute_full_tree must be True if distance_threshold is set." |
| ) |
|
|
| if self.linkage == "ward" and self.metric != "euclidean": |
| raise ValueError( |
| f"{self.metric} was provided as metric. Ward can only " |
| "work with euclidean distances." |
| ) |
|
|
| tree_builder = _TREE_BUILDERS[self.linkage] |
|
|
| connectivity = self.connectivity |
| if self.connectivity is not None: |
| if callable(self.connectivity): |
| connectivity = self.connectivity(X) |
| connectivity = check_array( |
| connectivity, accept_sparse=["csr", "coo", "lil"] |
| ) |
|
|
| n_samples = len(X) |
| compute_full_tree = self.compute_full_tree |
| if self.connectivity is None: |
| compute_full_tree = True |
| if compute_full_tree == "auto": |
| if self.distance_threshold is not None: |
| compute_full_tree = True |
| else: |
| |
| |
| |
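                # Early stopping is only likely to give a speed-up when the
                # number of requested clusters is small compared to the number
                # of samples; the threshold used here is a heuristic.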
| compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples) |
| n_clusters = self.n_clusters |
| if compute_full_tree: |
| n_clusters = None |
|
|
| |
| kwargs = {} |
| if self.linkage != "ward": |
| kwargs["linkage"] = self.linkage |
| kwargs["affinity"] = self.metric |
|
|
| distance_threshold = self.distance_threshold |
|
|
| return_distance = (distance_threshold is not None) or self.compute_distances |
|
|
| out = memory.cache(tree_builder)( |
| X, |
| connectivity=connectivity, |
| n_clusters=n_clusters, |
| return_distance=return_distance, |
| **kwargs, |
| ) |
| (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[ |
| :4 |
| ] |
|
|
| if return_distance: |
| self.distances_ = out[-1] |
|
|
| if self.distance_threshold is not None: |
| self.n_clusters_ = ( |
| np.count_nonzero(self.distances_ >= distance_threshold) + 1 |
| ) |
| else: |
| self.n_clusters_ = self.n_clusters |
|
|
| |
| if compute_full_tree: |
| self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_) |
| else: |
| labels = _hierarchical.hc_get_heads(parents, copy=False) |
| |
| labels = np.copy(labels[:n_samples]) |
| |
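            # Reassign cluster numbers to consecutive integers starting at 0.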
| self.labels_ = np.searchsorted(np.unique(labels), labels) |
| return self |
|
|
| def fit_predict(self, X, y=None): |
| """Fit and return the result of each sample's clustering assignment. |
| |
        In addition to fitting, this method also returns the result of the
        clustering assignment for each sample in the training set.
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) or \ |
| (n_samples, n_samples) |
| Training instances to cluster, or distances between instances if |
            ``metric='precomputed'``.
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| Returns |
| ------- |
| labels : ndarray of shape (n_samples,) |
| Cluster labels. |
| """ |
| return super().fit_predict(X, y) |
|
|
|
|
| class FeatureAgglomeration( |
| ClassNamePrefixFeaturesOutMixin, AgglomerationTransform, AgglomerativeClustering |
| ): |
| """Agglomerate features. |
| |
| Recursively merges pair of clusters of features. |
| |
| Refer to |
| :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py` |
| for an example comparison of :class:`FeatureAgglomeration` strategy with a |
| univariate feature selection strategy (based on ANOVA). |
| |
| Read more in the :ref:`User Guide <hierarchical_clustering>`. |
| |
| Parameters |
| ---------- |
| n_clusters : int or None, default=2 |
| The number of clusters to find. It must be ``None`` if |
| ``distance_threshold`` is not ``None``. |
| |
| metric : str or callable, default="euclidean" |
| Metric used to compute the linkage. Can be "euclidean", "l1", "l2", |
| "manhattan", "cosine", or "precomputed". If linkage is "ward", only |
| "euclidean" is accepted. If "precomputed", a distance matrix is needed |
| as input for the fit method. |
| |
| .. versionadded:: 1.2 |
| |
| memory : str or object with the joblib.Memory interface, default=None |
| Used to cache the output of the computation of the tree. |
| By default, no caching is done. If a string is given, it is the |
| path to the caching directory. |
| |
| connectivity : array-like, sparse matrix, or callable, default=None |
| Connectivity matrix. Defines for each feature the neighboring |
| features following a given structure of the data. |
| This can be a connectivity matrix itself or a callable that transforms |
| the data into a connectivity matrix, such as derived from |
| `kneighbors_graph`. Default is `None`, i.e, the |
| hierarchical clustering algorithm is unstructured. |
| |
| compute_full_tree : 'auto' or bool, default='auto' |
| Stop early the construction of the tree at `n_clusters`. This is useful |
| to decrease computation time if the number of clusters is not small |
| compared to the number of features. This option is useful only when |
| specifying a connectivity matrix. Note also that when varying the |
| number of clusters and using caching, it may be advantageous to compute |
| the full tree. It must be ``True`` if ``distance_threshold`` is not |
        ``None``. By default `compute_full_tree` is "auto", which is equivalent
        to `True` when `distance_threshold` is not `None` or when `n_clusters`
        is lower than the maximum of 100 and `0.02 * n_samples`.
        Otherwise, "auto" is equivalent to `False`.
| |
| linkage : {"ward", "complete", "average", "single"}, default="ward" |
| Which linkage criterion to use. The linkage criterion determines which |
| distance to use between sets of features. The algorithm will merge |
| the pairs of cluster that minimize this criterion. |
| |
| - "ward" minimizes the variance of the clusters being merged. |
| - "complete" or maximum linkage uses the maximum distances between |
| all features of the two sets. |
| - "average" uses the average of the distances of each feature of |
| the two sets. |
| - "single" uses the minimum of the distances between all features |
| of the two sets. |
| |
| pooling_func : callable, default=np.mean |
| This combines the values of agglomerated features into a single |
| value, and should accept an array of shape [M, N] and the keyword |
| argument `axis=1`, and reduce it to an array of size [M]. |
| |
| distance_threshold : float, default=None |
| The linkage distance threshold at or above which clusters will not be |
| merged. If not ``None``, ``n_clusters`` must be ``None`` and |
| ``compute_full_tree`` must be ``True``. |
| |
| .. versionadded:: 0.21 |
| |
| compute_distances : bool, default=False |
| Computes distances between clusters even if `distance_threshold` is not |
| used. This can be used to make dendrogram visualization, but introduces |
| a computational and memory overhead. |
| |
| .. versionadded:: 0.24 |
| |
| Attributes |
| ---------- |
| n_clusters_ : int |
| The number of clusters found by the algorithm. If |
| ``distance_threshold=None``, it will be equal to the given |
| ``n_clusters``. |
| |
    labels_ : array-like of shape (n_features,)
| Cluster labels for each feature. |
| |
| n_leaves_ : int |
| Number of leaves in the hierarchical tree. |
| |
| n_connected_components_ : int |
| The estimated number of connected components in the graph. |
| |
| .. versionadded:: 0.21 |
| ``n_connected_components_`` was added to replace ``n_components_``. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| children_ : array-like of shape (n_nodes-1, 2) |
| The children of each non-leaf node. Values less than `n_features` |
        correspond to leaves of the tree which are the original features.
| A node `i` greater than or equal to `n_features` is a non-leaf |
| node and has children `children_[i - n_features]`. Alternatively |
| at the i-th iteration, children[i][0] and children[i][1] |
| are merged to form node `n_features + i`. |
| |
| distances_ : array-like of shape (n_nodes-1,) |
| Distances between nodes in the corresponding place in `children_`. |
| Only computed if `distance_threshold` is used or `compute_distances` |
| is set to `True`. |
| |
| See Also |
| -------- |
| AgglomerativeClustering : Agglomerative clustering samples instead of |
| features. |
| ward_tree : Hierarchical clustering with ward linkage. |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from sklearn import datasets, cluster |
| >>> digits = datasets.load_digits() |
| >>> images = digits.images |
| >>> X = np.reshape(images, (len(images), -1)) |
| >>> agglo = cluster.FeatureAgglomeration(n_clusters=32) |
| >>> agglo.fit(X) |
| FeatureAgglomeration(n_clusters=32) |
| >>> X_reduced = agglo.transform(X) |
| >>> X_reduced.shape |
| (1797, 32) |
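
    The reduced representation can be mapped back to the original feature
    space (a sketch; each original pixel is replaced by the pooled value of
    its cluster):

    >>> X_restored = agglo.inverse_transform(X_reduced)
    >>> X_restored.shape
    (1797, 64)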
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_clusters": [Interval(Integral, 1, None, closed="left"), None], |
| "metric": [ |
| StrOptions(set(_VALID_METRICS) | {"precomputed"}), |
| callable, |
| ], |
| "memory": [str, HasMethods("cache"), None], |
| "connectivity": ["array-like", "sparse matrix", callable, None], |
| "compute_full_tree": [StrOptions({"auto"}), "boolean"], |
| "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], |
| "pooling_func": [callable], |
| "distance_threshold": [Interval(Real, 0, None, closed="left"), None], |
| "compute_distances": ["boolean"], |
| } |
|
|
| def __init__( |
| self, |
| n_clusters=2, |
| *, |
| metric="euclidean", |
| memory=None, |
| connectivity=None, |
| compute_full_tree="auto", |
| linkage="ward", |
| pooling_func=np.mean, |
| distance_threshold=None, |
| compute_distances=False, |
| ): |
| super().__init__( |
| n_clusters=n_clusters, |
| memory=memory, |
| connectivity=connectivity, |
| compute_full_tree=compute_full_tree, |
| linkage=linkage, |
| metric=metric, |
| distance_threshold=distance_threshold, |
| compute_distances=compute_distances, |
| ) |
| self.pooling_func = pooling_func |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit(self, X, y=None): |
| """Fit the hierarchical clustering on the data. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_features) |
| The data. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| Returns |
| ------- |
| self : object |
| Returns the transformer. |
| """ |
| X = validate_data(self, X, ensure_min_features=2) |
| super()._fit(X.T) |
| self._n_features_out = self.n_clusters_ |
| return self |
|
|
| @property |
| def fit_predict(self): |
| """Fit and return the result of each sample's clustering assignment.""" |
        # FeatureAgglomeration clusters features rather than samples, so the
        # sample-wise `fit_predict` inherited from AgglomerativeClustering is
        # deliberately hidden.
        raise AttributeError
|
|