| """Algorithms for spectral clustering""" |
|
|
| |
| |
|
|
| import warnings |
| from numbers import Integral, Real |
|
|
| import numpy as np |
| from scipy.linalg import LinAlgError, qr, svd |
| from scipy.sparse import csc_matrix |
|
|
| from ..base import BaseEstimator, ClusterMixin, _fit_context |
| from ..manifold._spectral_embedding import _spectral_embedding |
| from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels |
| from ..neighbors import NearestNeighbors, kneighbors_graph |
| from ..utils import as_float_array, check_random_state |
| from ..utils._param_validation import Interval, StrOptions, validate_params |
| from ..utils.validation import validate_data |
| from ._kmeans import k_means |
|
|
|
|
def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <10.1093/imaiai/iay008>`

    """
    n_clusters = vectors.shape[1]
    # Column-pivoted QR on the transposed embedding selects `n_clusters`
    # representative samples, one per prospective cluster.
    _, _, pivot = qr(vectors.T, pivoting=True)
    representatives = vectors[pivot[:n_clusters], :]
    # The unitary polar factor of the representative block rotates the
    # embedding so that each cluster aligns with a coordinate axis.
    left, _, right_h = svd(representatives.T)
    rotated = np.abs(np.dot(vectors, np.dot(left, right_h.conj())))
    # Each sample is assigned to the axis on which it has the largest weight.
    return rotated.argmax(axis=1)
|
|
|
|
def discretize(
    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
    """Search for a partition matrix which is closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_clusters)
        The embedding space of the samples.

    copy : bool, default=True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, default=30
        Maximum number of attempts to restart SVD if convergence fails.

    n_iter_max : int, default=20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached.

    random_state : int, RandomState instance, default=None
        Determines random number generation for rotation matrix initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Raises
    ------
    LinAlgError
        If the SVD never converges within `max_svd_restarts` attempts.

    References
    ----------

    .. [1] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition.  First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated.  Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated.  These two
    calculations are performed until convergence.  The discrete partition
    matrix is returned as the clustering solution.  Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """

    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize each eigenvector to the length of a vector of ones and flip
    # its sign so that the first entry is non-positive; this fixes the sign
    # ambiguity of the eigenvectors before the discretization search.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the eigenvector matrix: each sample then lies on
    # the unit hypersphere, the continuous relaxation of partition matrices.
    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If the SVD fails to converge, randomize the initialization and rerun,
    # at most `max_svd_restarts` times.
    while (svd_restarts < max_svd_restarts) and not has_converged:
        # Initialize the first column of the rotation matrix with the
        # embedding of a randomly chosen sample.
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # Initialize the remaining columns greedily with the rows of the
        # embedding that are as orthogonal as possible to the columns
        # already chosen.
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # `c` accumulates similarity to all previous picks, so argmin
            # selects the row most orthogonal to every column chosen so far.
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        # Alternate between the optimal discrete partition for a fixed
        # rotation and the optimal rotation for a fixed partition.
        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            # Closest discrete partition: each sample picks its best column.
            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components),
            )

            t_svd = vectors_discrete.T @ vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
            except LinAlgError:
                svd_restarts += 1
                print("SVD did not converge, randomizing and trying again")
                break

            # The normalized-cut objective; convergence is declared when it
            # stops improving beyond machine precision.
            ncut_value = 2.0 * (n_samples - S.sum())
            if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
                has_converged = True
            else:
                # Otherwise update the rotation (the orthogonal Procrustes
                # solution for the current discrete partition) and continue.
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError("SVD did not converge")
    return labels
|
|
|
|
@validate_params(
    {"affinity": ["array-like", "sparse matrix"]},
    prefer_skip_nested_validation=False,
)
def spectral_clustering(
    affinity,
    *,
    n_clusters=8,
    n_components=None,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    eigen_tol="auto",
    assign_labels="kmeans",
    verbose=False,
):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts [1]_, [2]_.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
            - adjacency matrix of a graph,
            - heat kernel of the pairwise distance matrix of the samples,
            - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : int, default=8
        Number of clusters to extract.

    n_components : int, default=n_clusters
        Number of eigenvectors to use for the spectral embedding.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition method. If None then ``'arpack'`` is used.
        See [4]_ for more details regarding ``'lobpcg'``.
        Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
        Algebraic MultiGrid preconditioning and requires pyamg to be installed.
        It can be faster on very large sparse problems [6]_ and [7]_.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy to use to assign labels in the embedding
        space. There are three ways to assign labels after the Laplacian
        embedding. k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and is not an iterative method, yet may outperform
        k-means and discretization in terms of both quality and speed. For a detailed
        comparison of clustering strategies, refer to the following example:
        :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Notes
    -----
    The graph should contain only one connected component, elsewhere
    the results make little sense.

    This algorithm solves the normalized cut for `k=2`: it is a
    normalized spectral clustering.

    References
    ----------

    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
           for computing eigenvalues of graph Laplacians in image segmentation, 2006
           Andrew Knyazev
           <10.13140/RG.2.2.35280.02565>`

    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
           streaming graph challenge (Preliminary version at arXiv.)
           David Zhuzhunashvili, Andrew Knyazev
           <10.1109/HPEC.2017.8091045>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> from sklearn.cluster import spectral_clustering
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> affinity = pairwise_kernels(X, metric='rbf')
    >>> spectral_clustering(
    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
    ... )
    array([1, 1, 1, 0, 0, 0])
    """
    # Delegate to the estimator with `affinity="precomputed"`: the caller
    # already supplies the affinity matrix, so no kernel is computed here.
    clusterer = SpectralClustering(
        n_clusters=n_clusters,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        affinity="precomputed",
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        verbose=verbose,
    ).fit(affinity)

    return clusterer.labels_
|
|
|
|
class SpectralClustering(ClusterMixin, BaseEstimator):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex, or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster, such as when clusters are
    nested circles on the 2D plane.

    If the affinity matrix is the adjacency matrix of a graph, this method
    can be used to find normalized graph cuts [1]_, [2]_.

    When calling ``fit``, an affinity matrix is constructed using either
    a kernel function such the Gaussian (aka RBF) kernel with Euclidean
    distance ``d(X, X)``::

            np.exp(-gamma * d(X,X) ** 2)

    or a k-nearest neighbors connectivity matrix.

    Alternatively, a user-provided affinity matrix can be specified by
    setting ``affinity='precomputed'``.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    n_clusters : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used. See [4]_ for more details regarding `'lobpcg'`.

    n_components : int, default=None
        Number of eigenvectors to use for the spectral embedding. If None,
        defaults to `n_clusters`.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    gamma : float, default=1.0
        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
        Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
        or ``affinity='precomputed_nearest_neighbors'``.

    affinity : str or callable, default='rbf'
        How to construct the affinity matrix.
         - 'nearest_neighbors': construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf': construct the affinity matrix using a radial basis function
           (RBF) kernel.
         - 'precomputed': interpret ``X`` as a precomputed affinity matrix,
           where larger values indicate greater similarity between instances.
         - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
           of precomputed distances, and construct a binary affinity matrix
           from the ``n_neighbors`` nearest neighbors of each instance.
         - one of the kernels supported by
           :func:`~sklearn.metrics.pairwise.pairwise_kernels`.

        Only kernels that produce similarity scores (non-negative values that
        increase with similarity) should be used. This property is not checked
        by the clustering algorithm.

    n_neighbors : int, default=10
        Number of neighbors to use when constructing the affinity matrix using
        the nearest neighbors method. Ignored for ``affinity='rbf'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigen decomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy for assigning labels in the embedding space. There are
        three ways to assign labels after the Laplacian embedding. k-means is
        a popular choice, but it can be sensitive to initialization.
        Discretization is another approach which is less sensitive to random
        initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and runs no iterations, yet may outperform
        k-means and discretization in terms of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.

    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.

    kernel_params : dict of str to any, default=None
        Parameters (keyword arguments) and values for kernel passed as
        callable object. Ignored by other kernels.

    n_jobs : int, default=None
        The number of parallel jobs to run when `affinity='nearest_neighbors'`
        or `affinity='precomputed_nearest_neighbors'`. The neighbors search
        will be done in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Attributes
    ----------
    affinity_matrix_ : array-like of shape (n_samples, n_samples)
        Affinity matrix used for clustering. Available only after calling
        ``fit``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.cluster.KMeans : K-Means clustering.
    sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
        Applications with Noise.

    Notes
    -----
    A distance matrix for which 0 indicates identical elements and high values
    indicate very dissimilar elements can be transformed into an affinity /
    similarity matrix that is well-suited for the algorithm by
    applying the Gaussian (aka RBF, heat) kernel::

        np.exp(- dist_matrix ** 2 / (2. * delta ** 2))

    where ``delta`` is a free parameter representing the width of the Gaussian
    kernel.

    An alternative is to take a symmetric version of the k-nearest neighbors
    connectivity matrix of the points.

    When ``eigen_solver='amg'``, the pyamg package is used for
    preconditioning; it can greatly speed up computation on large, sparse
    problems.

    References
    ----------
    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralClustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralClustering(n_clusters=2,
    ...         assign_labels='discretize',
    ...         random_state=0).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering
    SpectralClustering(assign_labels='discretize', n_clusters=2,
        random_state=0)
    """

    # Declarative constraints on constructor parameters; enforced by the
    # `_fit_context` decorator before `fit` executes.
    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "n_components": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "gamma": [Interval(Real, 0, None, closed="left")],
        "affinity": [
            callable,
            StrOptions(
                set(KERNEL_PARAMS)
                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
            ),
        ],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "eigen_tol": [
            Interval(Real, 0.0, None, closed="left"),
            StrOptions({"auto"}),
        ],
        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
        "degree": [Interval(Real, 0, None, closed="left")],
        "coef0": [Interval(Real, None, None, closed="neither")],
        "kernel_params": [dict, None],
        "n_jobs": [Integral, None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        eigen_solver=None,
        n_components=None,
        random_state=None,
        n_init=10,
        gamma=1.0,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol="auto",
        assign_labels="kmeans",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=None,
        verbose=False,
    ):
        # Parameters are stored unmodified, per scikit-learn convention;
        # all validation happens in `fit`.
        self.n_clusters = n_clusters
        self.eigen_solver = eigen_solver
        self.n_components = n_components
        self.random_state = random_state
        self.n_init = n_init
        self.gamma = gamma
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform spectral clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            A fitted instance of the estimator.
        """
        X = validate_data(
            self,
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            ensure_min_samples=2,
        )
        # A square X is ambiguous: it may be a feature matrix or a
        # precomputed affinity matrix.  Warn unless the affinity mode
        # explicitly declares X as precomputed.
        allow_squared = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        if X.shape[0] == X.shape[1] and not allow_squared:
            warnings.warn(
                "The spectral clustering API has changed. ``fit``"
                "now constructs an affinity matrix from data. To use"
                " a custom affinity matrix, "
                "set ``affinity=precomputed``."
            )

        # Build `affinity_matrix_` according to the selected affinity mode.
        if self.affinity == "nearest_neighbors":
            connectivity = kneighbors_graph(
                X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
            )
            # Symmetrize the kNN graph (kNN relations are not symmetric).
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed_nearest_neighbors":
            # X holds precomputed distances; derive a symmetrized binary
            # kNN connectivity matrix from it.
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed":
            # X is already the affinity matrix.
            self.affinity_matrix_ = X
        else:
            # Any other mode: compute a kernel between samples.  Explicit
            # kernel parameters are only forwarded for named (non-callable)
            # kernels; a callable receives `kernel_params` as-is.
            params = self.kernel_params
            if params is None:
                params = {}
            if not callable(self.affinity):
                params["gamma"] = self.gamma
                params["degree"] = self.degree
                params["coef0"] = self.coef0
            self.affinity_matrix_ = pairwise_kernels(
                X, metric=self.affinity, filter_params=True, **params
            )

        random_state = check_random_state(self.random_state)
        # Number of eigenvectors for the embedding; defaults to n_clusters.
        n_components = (
            self.n_clusters if self.n_components is None else self.n_components
        )
        # Embed the samples using the eigenvectors of the normalized graph
        # Laplacian of the affinity matrix.  `drop_first=False` keeps the
        # first eigenvector (unlike spectral embedding for manifold
        # learning).
        maps = _spectral_embedding(
            self.affinity_matrix_,
            n_components=n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
            eigen_tol=self.eigen_tol,
            drop_first=False,
        )
        if self.verbose:
            print(f"Computing label assignment using {self.assign_labels}")

        # Assign discrete cluster labels in the embedding space using the
        # configured strategy.
        if self.assign_labels == "kmeans":
            _, self.labels_, _ = k_means(
                maps,
                self.n_clusters,
                random_state=random_state,
                n_init=self.n_init,
                verbose=self.verbose,
            )
        elif self.assign_labels == "cluster_qr":
            self.labels_ = cluster_qr(maps)
        else:
            self.labels_ = discretize(maps, random_state=random_state)

        return self

    def fit_predict(self, X, y=None):
        """Perform spectral clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        # With precomputed affinities/distances, X is pairwise:
        # shape (n_samples, n_samples) rather than (n_samples, n_features).
        tags.input_tags.pairwise = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        return tags
|
|