| """K-means clustering.""" |
|
|
| # Authors: The scikit-learn developers |
| # SPDX-License-Identifier: BSD-3-Clause |
|
|
| import warnings |
| from abc import ABC, abstractmethod |
| from numbers import Integral, Real |
|
|
| import numpy as np |
| import scipy.sparse as sp |
|
|
| from ..base import ( |
| BaseEstimator, |
| ClassNamePrefixFeaturesOutMixin, |
| ClusterMixin, |
| TransformerMixin, |
| _fit_context, |
| ) |
| from ..exceptions import ConvergenceWarning |
| from ..metrics.pairwise import _euclidean_distances, euclidean_distances |
| from ..utils import check_array, check_random_state |
| from ..utils._openmp_helpers import _openmp_effective_n_threads |
| from ..utils._param_validation import Interval, StrOptions, validate_params |
| from ..utils.extmath import row_norms, stable_cumsum |
| from ..utils.parallel import ( |
| _get_threadpool_controller, |
| _threadpool_controller_decorator, |
| ) |
| from ..utils.sparsefuncs import mean_variance_axis |
| from ..utils.sparsefuncs_fast import assign_rows_csr |
| from ..utils.validation import ( |
| _check_sample_weight, |
| _is_arraylike_not_scalar, |
| check_is_fitted, |
| validate_data, |
| ) |
| from ._k_means_common import ( |
| CHUNK_SIZE, |
| _inertia_dense, |
| _inertia_sparse, |
| _is_same_clustering, |
| ) |
| from ._k_means_elkan import ( |
| elkan_iter_chunked_dense, |
| elkan_iter_chunked_sparse, |
| init_bounds_dense, |
| init_bounds_sparse, |
| ) |
| from ._k_means_lloyd import lloyd_iter_chunked_dense, lloyd_iter_chunked_sparse |
| from ._k_means_minibatch import _minibatch_update_dense, _minibatch_update_sparse |
|
|
| ############################################################################### |
| # Initialization heuristic |
|
|
|
|
| @validate_params( |
| { |
| "X": ["array-like", "sparse matrix"], |
| "n_clusters": [Interval(Integral, 1, None, closed="left")], |
| "sample_weight": ["array-like", None], |
| "x_squared_norms": ["array-like", None], |
| "random_state": ["random_state"], |
| "n_local_trials": [Interval(Integral, 1, None, closed="left"), None], |
| }, |
| prefer_skip_nested_validation=True, |
| ) |
| def kmeans_plusplus( |
| X, |
| n_clusters, |
| *, |
| sample_weight=None, |
| x_squared_norms=None, |
| random_state=None, |
| n_local_trials=None, |
| ): |
| """Init n_clusters seeds according to k-means++. |
| |
| .. versionadded:: 0.24 |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The data to pick seeds from. |
| |
| n_clusters : int |
| The number of centroids to initialize. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in `X`. If `None`, all observations |
| are assigned equal weight. |
| |
| .. versionadded:: 1.3 |
| |
| x_squared_norms : array-like of shape (n_samples,), default=None |
| Squared Euclidean norm of each data point. |
| |
| random_state : int or RandomState instance, default=None |
| Determines random number generation for centroid initialization. Pass |
| an int for reproducible output across multiple function calls. |
| See :term:`Glossary <random_state>`. |
| |
| n_local_trials : int, default=None |
| The number of seeding trials for each center (except the first), |
| of which the one reducing inertia the most is greedily chosen. |
| Set to `None` to make the number of trials depend logarithmically |
| on the number of seeds (2+log(k)), which is the recommended setting. |
| Setting it to 1 disables the greedy cluster selection and recovers |
| the vanilla k-means++ algorithm, which was empirically shown to work |
| less well than its greedy variant. |
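| 
| For example, with `n_clusters=10` the default is |
| `2 + int(np.log(10)) == 4` local trials per seeding step. |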
| |
| Returns |
| ------- |
| centers : ndarray of shape (n_clusters, n_features) |
| The initial centers for k-means. |
| |
| indices : ndarray of shape (n_clusters,) |
| The index location of the chosen centers in the data array X. For a |
| given index and center, X[index] = center. |
| |
| Notes |
| ----- |
| Selects initial cluster centers for k-means clustering in a smart way |
| to speed up convergence. See: Arthur, D. and Vassilvitskii, S. |
| "k-means++: the advantages of careful seeding". ACM-SIAM Symposium |
| on Discrete Algorithms. 2007. |
| |
| Examples |
| -------- |
| |
| >>> from sklearn.cluster import kmeans_plusplus |
| >>> import numpy as np |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [10, 2], [10, 4], [10, 0]]) |
| >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) |
| >>> centers |
| array([[10, 2], |
| [ 1, 0]]) |
| >>> indices |
| array([3, 2]) |
| """ |
| |
| check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) |
| sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
| if X.shape[0] < n_clusters: |
| raise ValueError( |
| f"n_samples={X.shape[0]} should be >= n_clusters={n_clusters}." |
| ) |
|
|
| |
| if x_squared_norms is None: |
| x_squared_norms = row_norms(X, squared=True) |
| else: |
| x_squared_norms = check_array(x_squared_norms, dtype=X.dtype, ensure_2d=False) |
|
|
| if x_squared_norms.shape[0] != X.shape[0]: |
| raise ValueError( |
| f"The length of x_squared_norms {x_squared_norms.shape[0]} should " |
| f"be equal to the length of n_samples {X.shape[0]}." |
| ) |
|
|
| random_state = check_random_state(random_state) |
|
|
| |
| centers, indices = _kmeans_plusplus( |
| X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials |
| ) |
|
|
| return centers, indices |
|
|
|
|
| def _kmeans_plusplus( |
| X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials=None |
| ): |
| """Computational component for initialization of n_clusters by |
| k-means++. Prior validation of data is assumed. |
| |
| Parameters |
| ---------- |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The data to pick seeds for. |
| |
| n_clusters : int |
| The number of seeds to choose. |
| |
| x_squared_norms : ndarray of shape (n_samples,) |
| Squared Euclidean norm of each data point. |
| 
| sample_weight : ndarray of shape (n_samples,) |
| The weights for each observation in `X`. |
| |
| random_state : RandomState instance |
| The generator used to initialize the centers. |
| See :term:`Glossary <random_state>`. |
| |
| n_local_trials : int, default=None |
| The number of seeding trials for each center (except the first), |
| of which the one reducing inertia the most is greedily chosen. |
| Set to None to make the number of trials depend logarithmically |
| on the number of seeds (2+log(k)); this is the default. |
| |
| Returns |
| ------- |
| centers : ndarray of shape (n_clusters, n_features) |
| The initial centers for k-means. |
| |
| indices : ndarray of shape (n_clusters,) |
| The index location of the chosen centers in the data array X. For a |
| given index and center, X[index] = center. |
| """ |
| n_samples, n_features = X.shape |
|
|
| centers = np.empty((n_clusters, n_features), dtype=X.dtype) |
|
|
| |
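| # Default the number of local seeding trials to 2 + log(k), as in greedy k-means++. |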
| if n_local_trials is None: |
| |
| |
| |
| n_local_trials = 2 + int(np.log(n_clusters)) |
|
|
| |
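| # Pick the first center randomly, weighted by sample_weight, and record its index. |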
| center_id = random_state.choice(n_samples, p=sample_weight / sample_weight.sum()) |
| indices = np.full(n_clusters, -1, dtype=int) |
| if sp.issparse(X): |
| centers[0] = X[[center_id]].toarray() |
| else: |
| centers[0] = X[center_id] |
| indices[0] = center_id |
|
|
| |
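| # Initialize the squared distances to the closest center and the current |
| # potential (the weighted sum of those distances). |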
| closest_dist_sq = _euclidean_distances( |
| centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True |
| ) |
| current_pot = closest_dist_sq @ sample_weight |
|
|
| |
| for c in range(1, n_clusters): |
| |
| |
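| # Sample candidate centers with probability proportional to sample_weight |
| # times the squared distance to the closest existing center. |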
| rand_vals = random_state.uniform(size=n_local_trials) * current_pot |
| candidate_ids = np.searchsorted( |
| stable_cumsum(sample_weight * closest_dist_sq), rand_vals |
| ) |
| |
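| # Numerical imprecision in the cumulative sum can yield an index out of |
| # range; clip candidates to the valid range. |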
| np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids) |
|
|
| |
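| # Compute the squared distances from each candidate to all points. |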
| distance_to_candidates = _euclidean_distances( |
| X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True |
| ) |
|
|
| |
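| # Update, for each candidate, the closest squared distances and the |
| # resulting potential. |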
| np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates) |
| candidates_pot = distance_to_candidates @ sample_weight.reshape(-1, 1) |
|
|
| |
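| # Keep the candidate yielding the lowest potential. |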
| best_candidate = np.argmin(candidates_pot) |
| current_pot = candidates_pot[best_candidate] |
| closest_dist_sq = distance_to_candidates[best_candidate] |
| best_candidate = candidate_ids[best_candidate] |
|
|
| |
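| # Permanently add the best candidate found in the local trials. |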
| if sp.issparse(X): |
| centers[c] = X[[best_candidate]].toarray() |
| else: |
| centers[c] = X[best_candidate] |
| indices[c] = best_candidate |
|
|
| return centers, indices |
|
|
|
|
| ############################################################################### |
| # K-means batch estimation by EM (expectation maximization) |
|
|
|
|
| def _tolerance(X, tol): |
| """Return a tolerance which is dependent on the dataset.""" |
| if tol == 0: |
| return 0 |
| if sp.issparse(X): |
| variances = mean_variance_axis(X, axis=0)[1] |
| else: |
| variances = np.var(X, axis=0) |
| return np.mean(variances) * tol |
|
|
|
|
| @validate_params( |
| { |
| "X": ["array-like", "sparse matrix"], |
| "sample_weight": ["array-like", None], |
| "return_n_iter": [bool], |
| }, |
| prefer_skip_nested_validation=False, |
| ) |
| def k_means( |
| X, |
| n_clusters, |
| *, |
| sample_weight=None, |
| init="k-means++", |
| n_init="auto", |
| max_iter=300, |
| verbose=False, |
| tol=1e-4, |
| random_state=None, |
| copy_x=True, |
| algorithm="lloyd", |
| return_n_iter=False, |
| ): |
| """Perform K-means clustering algorithm. |
| |
| Read more in the :ref:`User Guide <k_means>`. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| The observations to cluster. It must be noted that the data |
| will be converted to C ordering, which will cause a memory copy |
| if the given data is not C-contiguous. |
| |
| n_clusters : int |
| The number of clusters to form as well as the number of |
| centroids to generate. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in `X`. If `None`, all observations |
| are assigned equal weight. `sample_weight` is not used during |
| initialization if `init` is a callable or a user provided array. |
| |
| init : {'k-means++', 'random'}, callable or array-like of shape \ |
| (n_clusters, n_features), default='k-means++' |
| Method for initialization: |
| |
| - `'k-means++'` : selects initial cluster centers for k-means |
| clustering in a smart way to speed up convergence. See the Notes |
| section of :func:`kmeans_plusplus` for more details. |
| - `'random'`: choose `n_clusters` observations (rows) at random from data |
| for the initial centroids. |
| - If an array is passed, it should be of shape `(n_clusters, n_features)` |
| and gives the initial centers. |
| - If a callable is passed, it should take arguments `X`, `n_clusters` and a |
| random state and return an initialization. |
| |
| n_init : 'auto' or int, default="auto" |
| Number of times the k-means algorithm will be run with different |
| centroid seeds. The final result will be the best output of |
| n_init consecutive runs in terms of inertia. |
| |
| When `n_init='auto'`, the number of runs depends on the value of init: |
| 10 if using `init='random'` or `init` is a callable; |
| 1 if using `init='k-means++'` or `init` is an array-like. |
| |
| .. versionadded:: 1.2 |
| Added 'auto' option for `n_init`. |
| |
| .. versionchanged:: 1.4 |
| Default value for `n_init` changed to `'auto'`. |
| |
| max_iter : int, default=300 |
| Maximum number of iterations of the k-means algorithm to run. |
| |
| verbose : bool, default=False |
| Verbosity mode. |
| |
| tol : float, default=1e-4 |
| Relative tolerance with regard to the Frobenius norm of the difference |
| in the cluster centers of two consecutive iterations to declare |
| convergence. |
| |
| random_state : int, RandomState instance or None, default=None |
| Determines random number generation for centroid initialization. Use |
| an int to make the randomness deterministic. |
| See :term:`Glossary <random_state>`. |
| |
| copy_x : bool, default=True |
| When pre-computing distances it is more numerically accurate to center |
| the data first. If `copy_x` is True (default), then the original data is |
| not modified. If False, the original data is modified, and put back |
| before the function returns, but small numerical differences may be |
| introduced by subtracting and then adding the data mean. Note that if |
| the original data is not C-contiguous, a copy will be made even if |
| `copy_x` is False. If the original data is sparse, but not in CSR format, |
| a copy will be made even if `copy_x` is False. |
| |
| algorithm : {"lloyd", "elkan"}, default="lloyd" |
| K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. |
| The `"elkan"` variation can be more efficient on some datasets with |
| well-defined clusters, by using the triangle inequality. However it's |
| more memory intensive due to the allocation of an extra array of shape |
| `(n_samples, n_clusters)`. |
| |
| .. versionchanged:: 0.18 |
| Added Elkan algorithm |
| |
| .. versionchanged:: 1.1 |
| Renamed "full" to "lloyd", and deprecated "auto" and "full". |
| Changed "auto" to use "lloyd" instead of "elkan". |
| |
| return_n_iter : bool, default=False |
| Whether or not to return the number of iterations. |
| |
| Returns |
| ------- |
| centroid : ndarray of shape (n_clusters, n_features) |
| Centroids found at the last iteration of k-means. |
| |
| label : ndarray of shape (n_samples,) |
| The `label[i]` is the code or index of the centroid the |
| i'th observation is closest to. |
| |
| inertia : float |
| The final value of the inertia criterion (sum of squared distances to |
| the closest centroid for all observations in the training set). |
| |
| best_n_iter : int |
| Number of iterations corresponding to the best results. |
| Returned only if `return_n_iter` is set to True. |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from sklearn.cluster import k_means |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [10, 2], [10, 4], [10, 0]]) |
| >>> centroid, label, inertia = k_means( |
| ... X, n_clusters=2, n_init="auto", random_state=0 |
| ... ) |
| >>> centroid |
| array([[10., 2.], |
| [ 1., 2.]]) |
| >>> label |
| array([1, 1, 1, 0, 0, 0], dtype=int32) |
| >>> inertia |
| 16.0 |
| """ |
| est = KMeans( |
| n_clusters=n_clusters, |
| init=init, |
| n_init=n_init, |
| max_iter=max_iter, |
| verbose=verbose, |
| tol=tol, |
| random_state=random_state, |
| copy_x=copy_x, |
| algorithm=algorithm, |
| ).fit(X, sample_weight=sample_weight) |
| if return_n_iter: |
| return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ |
| else: |
| return est.cluster_centers_, est.labels_, est.inertia_ |
|
|
|
|
| def _kmeans_single_elkan( |
| X, |
| sample_weight, |
| centers_init, |
| max_iter=300, |
| verbose=False, |
| tol=1e-4, |
| n_threads=1, |
| ): |
| """A single run of k-means elkan, assumes preparation completed prior. |
| |
| Parameters |
| ---------- |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The observations to cluster. If sparse matrix, must be in CSR format. |
| |
| sample_weight : array-like of shape (n_samples,) |
| The weights for each observation in X. |
| |
| centers_init : ndarray of shape (n_clusters, n_features) |
| The initial centers. |
| |
| max_iter : int, default=300 |
| Maximum number of iterations of the k-means algorithm to run. |
| |
| verbose : bool, default=False |
| Verbosity mode. |
| |
| tol : float, default=1e-4 |
| Relative tolerance with regard to the Frobenius norm of the difference |
| in the cluster centers of two consecutive iterations to declare |
| convergence. |
| It's not advised to set `tol=0` since convergence might never be |
| declared due to rounding errors. Use a very small number instead. |
| |
| n_threads : int, default=1 |
| The number of OpenMP threads to use for the computation. Parallelism is |
| sample-wise on the main cython loop which assigns each sample to its |
| closest center. |
| |
| Returns |
| ------- |
| centroid : ndarray of shape (n_clusters, n_features) |
| Centroids found at the last iteration of k-means. |
| |
| label : ndarray of shape (n_samples,) |
| label[i] is the code or index of the centroid the |
| i'th observation is closest to. |
| |
| inertia : float |
| The final value of the inertia criterion (sum of squared distances to |
| the closest centroid for all observations in the training set). |
| |
| n_iter : int |
| Number of iterations run. |
| """ |
| n_samples = X.shape[0] |
| n_clusters = centers_init.shape[0] |
|
|
| |
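| # Buffers to avoid new allocations at each iteration. |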
| centers = centers_init |
| centers_new = np.zeros_like(centers) |
| weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) |
| labels = np.full(n_samples, -1, dtype=np.int32) |
| labels_old = labels.copy() |
| center_half_distances = euclidean_distances(centers) / 2 |
| distance_next_center = np.partition( |
| np.asarray(center_half_distances), kth=1, axis=0 |
| )[1] |
| upper_bounds = np.zeros(n_samples, dtype=X.dtype) |
| lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype) |
| center_shift = np.zeros(n_clusters, dtype=X.dtype) |
|
|
| if sp.issparse(X): |
| init_bounds = init_bounds_sparse |
| elkan_iter = elkan_iter_chunked_sparse |
| _inertia = _inertia_sparse |
| else: |
| init_bounds = init_bounds_dense |
| elkan_iter = elkan_iter_chunked_dense |
| _inertia = _inertia_dense |
|
|
| init_bounds( |
| X, |
| centers, |
| center_half_distances, |
| labels, |
| upper_bounds, |
| lower_bounds, |
| n_threads=n_threads, |
| ) |
|
|
| strict_convergence = False |
|
|
| for i in range(max_iter): |
| elkan_iter( |
| X, |
| sample_weight, |
| centers, |
| centers_new, |
| weight_in_clusters, |
| center_half_distances, |
| distance_next_center, |
| upper_bounds, |
| lower_bounds, |
| labels, |
| center_shift, |
| n_threads, |
| ) |
|
|
| |
| |
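| # Recompute the pairwise half-distances between the new centers and, for |
| # each center, the distance to its next closest center, used by the Elkan |
| # bounds in the next iteration. |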
| center_half_distances = euclidean_distances(centers_new) / 2 |
| distance_next_center = np.partition( |
| np.asarray(center_half_distances), kth=1, axis=0 |
| )[1] |
|
|
| if verbose: |
| inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
| print(f"Iteration {i}, inertia {inertia}") |
|
|
| centers, centers_new = centers_new, centers |
|
|
| if np.array_equal(labels, labels_old): |
| |
| if verbose: |
| print(f"Converged at iteration {i}: strict convergence.") |
| strict_convergence = True |
| break |
| else: |
| |
| center_shift_tot = (center_shift**2).sum() |
| if center_shift_tot <= tol: |
| if verbose: |
| print( |
| f"Converged at iteration {i}: center shift " |
| f"{center_shift_tot} within tolerance {tol}." |
| ) |
| break |
|
|
| labels_old[:] = labels |
|
|
| if not strict_convergence: |
| |
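| # Rerun the E-step so that the returned labels are consistent with the |
| # final centers. |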
| elkan_iter( |
| X, |
| sample_weight, |
| centers, |
| centers, |
| weight_in_clusters, |
| center_half_distances, |
| distance_next_center, |
| upper_bounds, |
| lower_bounds, |
| labels, |
| center_shift, |
| n_threads, |
| update_centers=False, |
| ) |
|
|
| inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
|
| return labels, inertia, centers, i + 1 |
|
|
|
|
| # Limit BLAS to a single thread inside the Lloyd iterations: the outer loop |
| # is already parallelized with OpenMP, so extra BLAS threads would only |
| # oversubscribe the CPUs. |
| @_threadpool_controller_decorator(limits=1, user_api="blas") |
| def _kmeans_single_lloyd( |
| X, |
| sample_weight, |
| centers_init, |
| max_iter=300, |
| verbose=False, |
| tol=1e-4, |
| n_threads=1, |
| ): |
| """A single run of k-means lloyd, assumes preparation completed prior. |
| |
| Parameters |
| ---------- |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The observations to cluster. If sparse matrix, must be in CSR format. |
| |
| sample_weight : ndarray of shape (n_samples,) |
| The weights for each observation in X. |
| |
| centers_init : ndarray of shape (n_clusters, n_features) |
| The initial centers. |
| |
| max_iter : int, default=300 |
| Maximum number of iterations of the k-means algorithm to run. |
| |
| verbose : bool, default=False |
| Verbosity mode |
| |
| tol : float, default=1e-4 |
| Relative tolerance with regard to the Frobenius norm of the difference |
| in the cluster centers of two consecutive iterations to declare |
| convergence. |
| It's not advised to set `tol=0` since convergence might never be |
| declared due to rounding errors. Use a very small number instead. |
| |
| n_threads : int, default=1 |
| The number of OpenMP threads to use for the computation. Parallelism is |
| sample-wise on the main cython loop which assigns each sample to its |
| closest center. |
| |
| Returns |
| ------- |
| centroid : ndarray of shape (n_clusters, n_features) |
| Centroids found at the last iteration of k-means. |
| |
| label : ndarray of shape (n_samples,) |
| label[i] is the code or index of the centroid the |
| i'th observation is closest to. |
| |
| inertia : float |
| The final value of the inertia criterion (sum of squared distances to |
| the closest centroid for all observations in the training set). |
| |
| n_iter : int |
| Number of iterations run. |
| """ |
| n_clusters = centers_init.shape[0] |
|
|
| |
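| # Buffers to avoid new allocations at each iteration. |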
| centers = centers_init |
| centers_new = np.zeros_like(centers) |
| labels = np.full(X.shape[0], -1, dtype=np.int32) |
| labels_old = labels.copy() |
| weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) |
| center_shift = np.zeros(n_clusters, dtype=X.dtype) |
|
|
| if sp.issparse(X): |
| lloyd_iter = lloyd_iter_chunked_sparse |
| _inertia = _inertia_sparse |
| else: |
| lloyd_iter = lloyd_iter_chunked_dense |
| _inertia = _inertia_dense |
|
|
| strict_convergence = False |
|
|
| for i in range(max_iter): |
| lloyd_iter( |
| X, |
| sample_weight, |
| centers, |
| centers_new, |
| weight_in_clusters, |
| labels, |
| center_shift, |
| n_threads, |
| ) |
|
|
| if verbose: |
| inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
| print(f"Iteration {i}, inertia {inertia}.") |
|
|
| centers, centers_new = centers_new, centers |
|
|
| if np.array_equal(labels, labels_old): |
| |
| if verbose: |
| print(f"Converged at iteration {i}: strict convergence.") |
| strict_convergence = True |
| break |
| else: |
| |
| center_shift_tot = (center_shift**2).sum() |
| if center_shift_tot <= tol: |
| if verbose: |
| print( |
| f"Converged at iteration {i}: center shift " |
| f"{center_shift_tot} within tolerance {tol}." |
| ) |
| break |
|
|
| labels_old[:] = labels |
|
|
| if not strict_convergence: |
| |
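| # Rerun the E-step so that the returned labels are consistent with the |
| # final centers. |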
| lloyd_iter( |
| X, |
| sample_weight, |
| centers, |
| centers, |
| weight_in_clusters, |
| labels, |
| center_shift, |
| n_threads, |
| update_centers=False, |
| ) |
|
|
| inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
|
| return labels, inertia, centers, i + 1 |
|
|
|
|
| def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True): |
| """E step of the K-means EM algorithm. |
| |
| Compute the labels and the inertia of the given samples and centers. |
| |
| Parameters |
| ---------- |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The input samples to assign to the labels. If sparse matrix, must |
| be in CSR format. |
| |
| sample_weight : ndarray of shape (n_samples,) |
| The weights for each observation in X. |
| |
| centers : ndarray of shape (n_clusters, n_features) |
| The cluster centers. |
| |
| n_threads : int, default=1 |
| The number of OpenMP threads to use for the computation. Parallelism is |
| sample-wise on the main cython loop which assigns each sample to its |
| closest center. |
| |
| return_inertia : bool, default=True |
| Whether to compute and return the inertia. |
| |
| Returns |
| ------- |
| labels : ndarray of shape (n_samples,) |
| The resulting assignment. |
| |
| inertia : float |
| Sum of squared distances of samples to their closest cluster center. |
| Inertia is only returned if return_inertia is True. |
| """ |
| n_samples = X.shape[0] |
| n_clusters = centers.shape[0] |
|
|
| labels = np.full(n_samples, -1, dtype=np.int32) |
| center_shift = np.zeros(n_clusters, dtype=centers.dtype) |
|
|
| if sp.issparse(X): |
| _labels = lloyd_iter_chunked_sparse |
| _inertia = _inertia_sparse |
| else: |
| _labels = lloyd_iter_chunked_dense |
| _inertia = _inertia_dense |
|
|
| _labels( |
| X, |
| sample_weight, |
| centers, |
| centers_new=None, |
| weight_in_clusters=None, |
| labels=labels, |
| center_shift=center_shift, |
| n_threads=n_threads, |
| update_centers=False, |
| ) |
|
|
| if return_inertia: |
| inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
| return labels, inertia |
|
|
| return labels |
|
|
|
|
| # Same as _labels_inertia but with BLAS limited to a single thread, to |
| # avoid oversubscription on top of the OpenMP-level parallelism. |
| _labels_inertia_threadpool_limit = _threadpool_controller_decorator( |
| limits=1, user_api="blas" |
| )(_labels_inertia) |
|
|
|
|
| class _BaseKMeans( |
| ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC |
| ): |
| """Base class for KMeans and MiniBatchKMeans""" |
|
|
| _parameter_constraints: dict = { |
| "n_clusters": [Interval(Integral, 1, None, closed="left")], |
| "init": [StrOptions({"k-means++", "random"}), callable, "array-like"], |
| "n_init": [ |
| StrOptions({"auto"}), |
| Interval(Integral, 1, None, closed="left"), |
| ], |
| "max_iter": [Interval(Integral, 1, None, closed="left")], |
| "tol": [Interval(Real, 0, None, closed="left")], |
| "verbose": ["verbose"], |
| "random_state": ["random_state"], |
| } |
|
|
| def __init__( |
| self, |
| n_clusters, |
| *, |
| init, |
| n_init, |
| max_iter, |
| tol, |
| verbose, |
| random_state, |
| ): |
| self.n_clusters = n_clusters |
| self.init = init |
| self.max_iter = max_iter |
| self.tol = tol |
| self.n_init = n_init |
| self.verbose = verbose |
| self.random_state = random_state |
|
|
| def _check_params_vs_input(self, X, default_n_init=None): |
| |
| if X.shape[0] < self.n_clusters: |
| raise ValueError( |
| f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." |
| ) |
|
|
| |
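| # Convert the user-supplied relative tolerance into an absolute, |
| # data-dependent value. |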
| self._tol = _tolerance(X, self.tol) |
|
|
| |
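| # Resolve n_init="auto": a single run for greedy k-means++ or a |
| # user-provided array, several runs for random or callable inits. |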
| if self.n_init == "auto": |
| if isinstance(self.init, str) and self.init == "k-means++": |
| self._n_init = 1 |
| elif isinstance(self.init, str) and self.init == "random": |
| self._n_init = default_n_init |
| elif callable(self.init): |
| self._n_init = default_n_init |
| else: |
| self._n_init = 1 |
| else: |
| self._n_init = self.n_init |
|
|
| if _is_arraylike_not_scalar(self.init) and self._n_init != 1: |
| warnings.warn( |
| ( |
| "Explicit initial center position passed: performing only" |
| f" one init in {self.__class__.__name__} instead of " |
| f"n_init={self._n_init}." |
| ), |
| RuntimeWarning, |
| stacklevel=2, |
| ) |
| self._n_init = 1 |
|
|
| @abstractmethod |
| def _warn_mkl_vcomp(self, n_active_threads): |
| """Issue an estimator specific warning when vcomp and mkl are both present |
| |
| This method is called by `_check_mkl_vcomp`. |
| """ |
|
|
| def _check_mkl_vcomp(self, X, n_samples): |
| """Check when vcomp and mkl are both present""" |
| |
| |
| |
| |
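| # Only the dense path is concerned by the known memory leak (see the |
| # estimator's _warn_mkl_vcomp message): it occurs when both vcomp (MSVC |
| # OpenMP) and MKL are loaded and there are fewer chunks than threads. |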
| if sp.issparse(X): |
| return |
|
|
| n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE)) |
| if n_active_threads < self._n_threads: |
| modules = _get_threadpool_controller().info() |
| has_vcomp = "vcomp" in [module["prefix"] for module in modules] |
| has_mkl = ("mkl", "intel") in [ |
| (module["internal_api"], module.get("threading_layer", None)) |
| for module in modules |
| ] |
| if has_vcomp and has_mkl: |
| self._warn_mkl_vcomp(n_active_threads) |
|
|
| def _validate_center_shape(self, X, centers): |
| """Check if centers is compatible with X and n_clusters.""" |
| if centers.shape[0] != self.n_clusters: |
| raise ValueError( |
| f"The shape of the initial centers {centers.shape} does not " |
| f"match the number of clusters {self.n_clusters}." |
| ) |
| if centers.shape[1] != X.shape[1]: |
| raise ValueError( |
| f"The shape of the initial centers {centers.shape} does not " |
| f"match the number of features of the data {X.shape[1]}." |
| ) |
|
|
| def _check_test_data(self, X): |
| X = validate_data( |
| self, |
| X, |
| accept_sparse="csr", |
| reset=False, |
| dtype=[np.float64, np.float32], |
| order="C", |
| accept_large_sparse=False, |
| ) |
| return X |
|
|
| def _init_centroids( |
| self, |
| X, |
| x_squared_norms, |
| init, |
| random_state, |
| sample_weight, |
| init_size=None, |
| n_centroids=None, |
| ): |
| """Compute the initial centroids. |
| |
| Parameters |
| ---------- |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The input samples. |
| |
| x_squared_norms : ndarray of shape (n_samples,) |
| Squared euclidean norm of each data point. Pass it if you already |
| have it at hand to avoid recomputing it here. |
| |
| init : {'k-means++', 'random'}, callable or ndarray of shape \ |
| (n_clusters, n_features) |
| Method for initialization. |
| |
| random_state : RandomState instance |
| Determines random number generation for centroid initialization. |
| See :term:`Glossary <random_state>`. |
| |
| sample_weight : ndarray of shape (n_samples,) |
| The weights for each observation in X. `sample_weight` is not used |
| during initialization if `init` is a callable or a user provided |
| array. |
| |
| init_size : int, default=None |
| Number of samples to randomly sample for speeding up the |
| initialization (sometimes at the expense of accuracy). |
| |
| n_centroids : int, default=None |
| Number of centroids to initialize. |
| If left to `None`, the number of centroids will be equal to the |
| number of clusters to form (self.n_clusters). |
| |
| Returns |
| ------- |
| centers : ndarray of shape (n_clusters, n_features) |
| Initial centroids of clusters. |
| """ |
| n_samples = X.shape[0] |
| n_clusters = self.n_clusters if n_centroids is None else n_centroids |
|
|
| if init_size is not None and init_size < n_samples: |
| init_indices = random_state.randint(0, n_samples, init_size) |
| X = X[init_indices] |
| x_squared_norms = x_squared_norms[init_indices] |
| n_samples = X.shape[0] |
| sample_weight = sample_weight[init_indices] |
|
|
| if isinstance(init, str) and init == "k-means++": |
| centers, _ = _kmeans_plusplus( |
| X, |
| n_clusters, |
| random_state=random_state, |
| x_squared_norms=x_squared_norms, |
| sample_weight=sample_weight, |
| ) |
| elif isinstance(init, str) and init == "random": |
| seeds = random_state.choice( |
| n_samples, |
| size=n_clusters, |
| replace=False, |
| p=sample_weight / sample_weight.sum(), |
| ) |
| centers = X[seeds] |
| elif _is_arraylike_not_scalar(self.init): |
| centers = init |
| elif callable(init): |
| centers = init(X, n_clusters, random_state=random_state) |
| centers = check_array(centers, dtype=X.dtype, copy=False, order="C") |
| self._validate_center_shape(X, centers) |
|
|
| if sp.issparse(centers): |
| centers = centers.toarray() |
|
|
| return centers |
|
|
| def fit_predict(self, X, y=None, sample_weight=None): |
| """Compute cluster centers and predict cluster index for each sample. |
| |
| Convenience method; equivalent to calling fit(X) followed by |
| predict(X). |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data to transform. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. |
| |
| Returns |
| ------- |
| labels : ndarray of shape (n_samples,) |
| Index of the cluster each sample belongs to. |
| """ |
| return self.fit(X, sample_weight=sample_weight).labels_ |
|
|
| def predict(self, X): |
| """Predict the closest cluster each sample in X belongs to. |
| |
| In the vector quantization literature, `cluster_centers_` is called |
| the code book and each value returned by `predict` is the index of |
| the closest code in the code book. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data to predict. |
| |
| Returns |
| ------- |
| labels : ndarray of shape (n_samples,) |
| Index of the cluster each sample belongs to. |
| """ |
| check_is_fitted(self) |
|
|
| X = self._check_test_data(X) |
|
|
| |
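| # predict does not use sample weights, but the underlying Cython helpers |
| # expect a weight array, so pass uniform weights. |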
| sample_weight = np.ones(X.shape[0], dtype=X.dtype) |
|
|
| labels = _labels_inertia_threadpool_limit( |
| X, |
| sample_weight, |
| self.cluster_centers_, |
| n_threads=self._n_threads, |
| return_inertia=False, |
| ) |
|
|
| return labels |
|
|
| def fit_transform(self, X, y=None, sample_weight=None): |
| """Compute clustering and transform X to cluster-distance space. |
| |
| Equivalent to fit(X).transform(X), but more efficiently implemented. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data to transform. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. |
| |
| Returns |
| ------- |
| X_new : ndarray of shape (n_samples, n_clusters) |
| X transformed in the new space. |
| """ |
| return self.fit(X, sample_weight=sample_weight)._transform(X) |
|
|
| def transform(self, X): |
| """Transform X to a cluster-distance space. |
| |
| In the new space, each dimension is the distance to the cluster |
| centers. Note that even if X is sparse, the array returned by |
| `transform` will typically be dense. |
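| 
| As a minimal sketch (illustrative values): with two fitted centers at |
| (0, 0) and (3, 4), `transform([[0, 0]])` would return `[[0., 5.]]`, |
| the Euclidean distances from the sample to each center. |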
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data to transform. |
| |
| Returns |
| ------- |
| X_new : ndarray of shape (n_samples, n_clusters) |
| X transformed in the new space. |
| """ |
| check_is_fitted(self) |
|
|
| X = self._check_test_data(X) |
| return self._transform(X) |
|
|
| def _transform(self, X): |
| """Guts of transform method; no input validation.""" |
| return euclidean_distances(X, self.cluster_centers_) |
|
|
| def score(self, X, y=None, sample_weight=None): |
| """Opposite of the value of X on the K-means objective. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. |
| |
| Returns |
| ------- |
| score : float |
| Opposite of the value of X on the K-means objective. |
| """ |
| check_is_fitted(self) |
|
|
| X = self._check_test_data(X) |
| sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
| _, scores = _labels_inertia_threadpool_limit( |
| X, sample_weight, self.cluster_centers_, self._n_threads |
| ) |
| return -scores |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.input_tags.sparse = True |
| return tags |
|
|
|
|
| class KMeans(_BaseKMeans): |
| """K-Means clustering. |
| |
| Read more in the :ref:`User Guide <k_means>`. |
| |
| Parameters |
| ---------- |
| |
| n_clusters : int, default=8 |
| The number of clusters to form as well as the number of |
| centroids to generate. |
| |
| For an example of how to choose an optimal value for `n_clusters` refer to |
| :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. |
| |
| init : {'k-means++', 'random'}, callable or array-like of shape \ |
| (n_clusters, n_features), default='k-means++' |
| Method for initialization: |
| |
| * 'k-means++' : selects initial cluster centroids using sampling \ |
| based on an empirical probability distribution of the points' \ |
| contribution to the overall inertia. This technique speeds up \ |
| convergence. The algorithm implemented is "greedy k-means++". It \ |
| differs from the vanilla k-means++ by making several trials at \ |
| each sampling step and choosing the best centroid among them. |
| |
| * 'random': choose `n_clusters` observations (rows) at random from \ |
| data for the initial centroids. |
| |
| * If an array is passed, it should be of shape (n_clusters, n_features)\ |
| and gives the initial centers. |
| |
| * If a callable is passed, it should take arguments X, n_clusters and a\ |
| random state and return an initialization. |
| |
| For an example of how to use the different `init` strategies, see |
| :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. |
| |
| For an evaluation of the impact of initialization, see the example |
| :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. |
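| 
| As a minimal sketch (a hypothetical helper, not part of scikit-learn), |
| a valid callable `init` could look like:: |
| 
| def first_rows_init(X, n_clusters, random_state): |
| # Deterministic toy init: use the first n_clusters rows as centers. |
| return X[:n_clusters] |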
| |
| n_init : 'auto' or int, default='auto' |
| Number of times the k-means algorithm is run with different centroid |
| seeds. The final result is the best output of `n_init` consecutive runs |
| in terms of inertia. Several runs are recommended for sparse |
| high-dimensional problems (see :ref:`kmeans_sparse_high_dim`). |
| |
| When `n_init='auto'`, the number of runs depends on the value of init: |
| 10 if using `init='random'` or `init` is a callable; |
| 1 if using `init='k-means++'` or `init` is an array-like. |
| |
| .. versionadded:: 1.2 |
| Added 'auto' option for `n_init`. |
| |
| .. versionchanged:: 1.4 |
| Default value for `n_init` changed to `'auto'`. |
| |
| max_iter : int, default=300 |
| Maximum number of iterations of the k-means algorithm for a |
| single run. |
| |
| tol : float, default=1e-4 |
| Relative tolerance with regard to the Frobenius norm of the difference |
| in the cluster centers of two consecutive iterations to declare |
| convergence. |
| |
| verbose : int, default=0 |
| Verbosity mode. |
| |
| random_state : int, RandomState instance or None, default=None |
| Determines random number generation for centroid initialization. Use |
| an int to make the randomness deterministic. |
| See :term:`Glossary <random_state>`. |
| |
| copy_x : bool, default=True |
| When pre-computing distances it is more numerically accurate to center |
| the data first. If copy_x is True (default), then the original data is |
| not modified. If False, the original data is modified, and put back |
| before the function returns, but small numerical differences may be |
| introduced by subtracting and then adding the data mean. Note that if |
| the original data is not C-contiguous, a copy will be made even if |
| copy_x is False. If the original data is sparse, but not in CSR format, |
| a copy will be made even if copy_x is False. |
| |
| algorithm : {"lloyd", "elkan"}, default="lloyd" |
| K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. |
| The `"elkan"` variation can be more efficient on some datasets with |
| well-defined clusters, by using the triangle inequality. However it's |
| more memory intensive due to the allocation of an extra array of shape |
| `(n_samples, n_clusters)`. |
| |
| .. versionchanged:: 0.18 |
| Added Elkan algorithm |
| |
| .. versionchanged:: 1.1 |
| Renamed "full" to "lloyd", and deprecated "auto" and "full". |
| Changed "auto" to use "lloyd" instead of "elkan". |
| |
| Attributes |
| ---------- |
| cluster_centers_ : ndarray of shape (n_clusters, n_features) |
| Coordinates of cluster centers. If the algorithm stops before fully |
| converging (see ``tol`` and ``max_iter``), these will not be |
| consistent with ``labels_``. |
| |
| labels_ : ndarray of shape (n_samples,) |
| Labels of each point |
| |
| inertia_ : float |
| Sum of squared distances of samples to their closest cluster center, |
| weighted by the sample weights if provided. |
| |
| n_iter_ : int |
| Number of iterations run. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| See Also |
| -------- |
| MiniBatchKMeans : Alternative online implementation that does incremental |
| updates of the centers positions using mini-batches. |
| For large scale learning (say n_samples > 10k) MiniBatchKMeans is |
| probably much faster than the default batch implementation. |
| |
| Notes |
| ----- |
| The k-means problem is solved using either Lloyd's or Elkan's algorithm. |
| |
| The average complexity is given by O(k n T), where k is the number of |
| clusters, n the number of samples and T the number of iterations. |
| |
| The worst case complexity is given by O(n^(k+2/p)) with |
| n = n_samples, p = n_features. |
| Refer to :doi:`"How slow is the k-means method?" D. Arthur and S. Vassilvitskii - |
| SoCG2006.<10.1145/1137856.1137880>` for more details. |
| |
| In practice, the k-means algorithm is very fast (one of the fastest |
| clustering algorithms available), but it can fall into local minima. |
| That's why it can be useful to restart it several times. |
| |
| If the algorithm stops before fully converging (because of ``tol`` or |
| ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, |
| i.e. the ``cluster_centers_`` will not be the means of the points in each |
| cluster. Also, the estimator will reassign ``labels_`` after the last |
| iteration to make ``labels_`` consistent with ``predict`` on the training |
| set. |
| |
| Examples |
| -------- |
| |
| >>> from sklearn.cluster import KMeans |
| >>> import numpy as np |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [10, 2], [10, 4], [10, 0]]) |
| >>> kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X) |
| >>> kmeans.labels_ |
| array([1, 1, 1, 0, 0, 0], dtype=int32) |
| >>> kmeans.predict([[0, 0], [12, 3]]) |
| array([1, 0], dtype=int32) |
| >>> kmeans.cluster_centers_ |
| array([[10., 2.], |
| [ 1., 2.]]) |
| |
| For examples of common problems with K-Means and how to address them see |
| :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. |
| |
| For a demonstration of how K-Means can be used to cluster text documents see |
| :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. |
| |
| For a comparison between K-Means and MiniBatchKMeans refer to example |
| :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. |
| |
| For a comparison between K-Means and BisectingKMeans refer to example |
| :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. |
| """ |
|
|
| _parameter_constraints: dict = { |
| **_BaseKMeans._parameter_constraints, |
| "copy_x": ["boolean"], |
| "algorithm": [StrOptions({"lloyd", "elkan"})], |
| } |
|
|
| def __init__( |
| self, |
| n_clusters=8, |
| *, |
| init="k-means++", |
| n_init="auto", |
| max_iter=300, |
| tol=1e-4, |
| verbose=0, |
| random_state=None, |
| copy_x=True, |
| algorithm="lloyd", |
| ): |
| super().__init__( |
| n_clusters=n_clusters, |
| init=init, |
| n_init=n_init, |
| max_iter=max_iter, |
| tol=tol, |
| verbose=verbose, |
| random_state=random_state, |
| ) |
|
|
| self.copy_x = copy_x |
| self.algorithm = algorithm |
|
|
| def _check_params_vs_input(self, X): |
| super()._check_params_vs_input(X, default_n_init=10) |
|
|
| self._algorithm = self.algorithm |
| if self._algorithm == "elkan" and self.n_clusters == 1: |
| warnings.warn( |
| ( |
| "algorithm='elkan' doesn't make sense for a single " |
| "cluster. Using 'lloyd' instead." |
| ), |
| RuntimeWarning, |
| ) |
| self._algorithm = "lloyd" |
|
|
| def _warn_mkl_vcomp(self, n_active_threads): |
| """Warn when vcomp and mkl are both present""" |
| warnings.warn( |
| "KMeans is known to have a memory leak on Windows " |
| "with MKL, when there are less chunks than available " |
| "threads. You can avoid it by setting the environment" |
| f" variable OMP_NUM_THREADS={n_active_threads}." |
| ) |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit(self, X, y=None, sample_weight=None): |
| """Compute k-means clustering. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Training instances to cluster. It must be noted that the data |
| will be converted to C ordering, which will cause a memory |
| copy if the given data is not C-contiguous. |
| If a sparse matrix is passed, a copy will be made if it's not in |
| CSR format. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. `sample_weight` is not used during |
| initialization if `init` is a callable or a user provided array. |
| |
| .. versionadded:: 0.20 |
| |
| Returns |
| ------- |
| self : object |
| Fitted estimator. |
| """ |
| X = validate_data( |
| self, |
| X, |
| accept_sparse="csr", |
| dtype=[np.float64, np.float32], |
| order="C", |
| copy=self.copy_x, |
| accept_large_sparse=False, |
| ) |
|
|
| self._check_params_vs_input(X) |
|
|
| random_state = check_random_state(self.random_state) |
| sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
| self._n_threads = _openmp_effective_n_threads() |
|
|
| |
| init = self.init |
| init_is_array_like = _is_arraylike_not_scalar(init) |
| if init_is_array_like: |
| init = check_array(init, dtype=X.dtype, copy=True, order="C") |
| self._validate_center_shape(X, init) |
|
|
| |
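| # Subtract the mean of X for more accurate distance computations (the |
| # copy, if requested, was already made by validate_data above). |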
| if not sp.issparse(X): |
| X_mean = X.mean(axis=0) |
| |
| X -= X_mean |
|
|
| if init_is_array_like: |
| init -= X_mean |
|
|
| |
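| # Precompute the squared norms of the data points. |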
| x_squared_norms = row_norms(X, squared=True) |
|
|
| if self._algorithm == "elkan": |
| kmeans_single = _kmeans_single_elkan |
| else: |
| kmeans_single = _kmeans_single_lloyd |
| self._check_mkl_vcomp(X, X.shape[0]) |
|
|
| best_inertia, best_labels = None, None |
|
|
| for i in range(self._n_init): |
| |
| centers_init = self._init_centroids( |
| X, |
| x_squared_norms=x_squared_norms, |
| init=init, |
| random_state=random_state, |
| sample_weight=sample_weight, |
| ) |
| if self.verbose: |
| print("Initialization complete") |
|
|
| |
| labels, inertia, centers, n_iter_ = kmeans_single( |
| X, |
| sample_weight, |
| centers_init, |
| max_iter=self.max_iter, |
| verbose=self.verbose, |
| tol=self._tol, |
| n_threads=self._n_threads, |
| ) |
|
|
| |
| |
| |
| |
| |
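| # Keep a new run only if it has a better inertia and a clustering |
| # different from the best so far: for the same clustering with permuted |
| # labels, the inertia could differ only by rounding errors. |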
| if best_inertia is None or ( |
| inertia < best_inertia |
| and not _is_same_clustering(labels, best_labels, self.n_clusters) |
| ): |
| best_labels = labels |
| best_centers = centers |
| best_inertia = inertia |
| best_n_iter = n_iter_ |
|
|
| if not sp.issparse(X): |
| if not self.copy_x: |
| X += X_mean |
| best_centers += X_mean |
|
|
| distinct_clusters = len(set(best_labels)) |
| if distinct_clusters < self.n_clusters: |
| warnings.warn( |
| "Number of distinct clusters ({}) found smaller than " |
| "n_clusters ({}). Possibly due to duplicate points " |
| "in X.".format(distinct_clusters, self.n_clusters), |
| ConvergenceWarning, |
| stacklevel=2, |
| ) |
|
|
| self.cluster_centers_ = best_centers |
| self._n_features_out = self.cluster_centers_.shape[0] |
| self.labels_ = best_labels |
| self.inertia_ = best_inertia |
| self.n_iter_ = best_n_iter |
| return self |
|
|
|
|
| def _mini_batch_step( |
| X, |
| sample_weight, |
| centers, |
| centers_new, |
| weight_sums, |
| random_state, |
| random_reassign=False, |
| reassignment_ratio=0.01, |
| verbose=False, |
| n_threads=1, |
| ): |
| """Incremental update of the centers for the Minibatch K-Means algorithm. |
| |
| Parameters |
| ---------- |
| |
| X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
| The original data array. If sparse, must be in CSR format. |
| |
| sample_weight : ndarray of shape (n_samples,) |
| The weights for each observation in `X`. |
| |
| centers : ndarray of shape (n_clusters, n_features) |
| The cluster centers before the current iteration |
| |
| centers_new : ndarray of shape (n_clusters, n_features) |
| The cluster centers after the current iteration. Modified in-place. |
| |
| weight_sums : ndarray of shape (n_clusters,) |
| The vector in which we keep track of the numbers of points in a |
| cluster. This array is modified in place. |
| |
| random_state : RandomState instance |
| Determines random number generation for low count centers reassignment. |
| See :term:`Glossary <random_state>`. |
| |
| random_reassign : boolean, default=False |
| If True, centers with very low counts are randomly reassigned |
| to observations. |
| |
| reassignment_ratio : float, default=0.01 |
| Control the fraction of the maximum number of counts for a |
| center to be reassigned. A higher value means that low count |
| centers are more likely to be reassigned, which means that the |
| model will take longer to converge, but should converge to a |
| better clustering. |
| |
| verbose : bool, default=False |
| Controls the verbosity. |
| |
| n_threads : int, default=1 |
| The number of OpenMP threads to use for the computation. |
| |
| Returns |
| ------- |
| inertia : float |
| Sum of squared distances of samples to their closest cluster center. |
| The inertia is computed after finding the labels and before updating |
| the centers. |
| """ |
| |
| |
| |
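| # E-step: assign each sample of the mini-batch to its closest center and |
| # compute the inertia of the batch. |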
| labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads) |
|
|
| |
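| # M-step: incrementally update the centers from the batch labels. |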
| if sp.issparse(X): |
| _minibatch_update_sparse( |
| X, sample_weight, centers, centers_new, weight_sums, labels, n_threads |
| ) |
| else: |
| _minibatch_update_dense( |
| X, |
| sample_weight, |
| centers, |
| centers_new, |
| weight_sums, |
| labels, |
| n_threads, |
| ) |
|
|
| |
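| # Optionally reassign centers whose accumulated weight is very low. |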
| if random_reassign and reassignment_ratio > 0: |
| to_reassign = weight_sums < reassignment_ratio * weight_sums.max() |
|
|
| |
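| # Reassign at most half a batch worth of centers, keeping those with |
| # the lowest weights as the ones to reassign. |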
| if to_reassign.sum() > 0.5 * X.shape[0]: |
| indices_dont_reassign = np.argsort(weight_sums)[int(0.5 * X.shape[0]) :] |
| to_reassign[indices_dont_reassign] = False |
| n_reassigns = to_reassign.sum() |
|
|
| if n_reassigns: |
| |
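| # Pick the replacement centers uniformly at random among the batch samples. |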
| new_centers = random_state.choice( |
| X.shape[0], replace=False, size=n_reassigns |
| ) |
| if verbose: |
| print(f"[MiniBatchKMeans] Reassigning {n_reassigns} cluster centers.") |
|
|
| if sp.issparse(X): |
| assign_rows_csr( |
| X, |
| new_centers.astype(np.intp, copy=False), |
| np.where(to_reassign)[0].astype(np.intp, copy=False), |
| centers_new, |
| ) |
| else: |
| centers_new[to_reassign] = X[new_centers] |
|
|
| |
| |
| |
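| # Reset the counts of the reassigned centers, but not too low, to avoid |
| # instant reassignment (this also alters their learning rates). |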
| weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) |
|
|
| return inertia |
|
|
|
|
| class MiniBatchKMeans(_BaseKMeans): |
| """ |
| Mini-Batch K-Means clustering. |
| |
| Read more in the :ref:`User Guide <mini_batch_kmeans>`. |
| |
| Parameters |
| ---------- |
| |
| n_clusters : int, default=8 |
| The number of clusters to form as well as the number of |
| centroids to generate. |
| |
| init : {'k-means++', 'random'}, callable or array-like of shape \ |
| (n_clusters, n_features), default='k-means++' |
| Method for initialization: |
| |
| 'k-means++' : selects initial cluster centroids using sampling based on |
| an empirical probability distribution of the points' contribution to the |
| overall inertia. This technique speeds up convergence. The algorithm |
| implemented is "greedy k-means++". It differs from the vanilla k-means++ |
| by making several trials at each sampling step and choosing the best centroid |
| among them. |
| |
| 'random': choose `n_clusters` observations (rows) at random from data |
| for the initial centroids. |
| |
| If an array is passed, it should be of shape (n_clusters, n_features) |
| and gives the initial centers. |
| |
| If a callable is passed, it should take arguments X, n_clusters and a |
| random state and return an initialization. |
| |
| For an evaluation of the impact of initialization, see the example |
| :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. |
| |
| max_iter : int, default=100 |
| Maximum number of iterations over the complete dataset before |
| stopping, independently of any early stopping heuristic. |
| |
| batch_size : int, default=1024 |
| Size of the mini batches. |
| For faster computations, you can set the ``batch_size`` greater than |
| 256 * number of cores to enable parallelism on all cores. |
| |
| .. versionchanged:: 1.0 |
| `batch_size` default changed from 100 to 1024. |
| |
| verbose : int, default=0 |
| Verbosity mode. |
| |
| compute_labels : bool, default=True |
| Compute label assignment and inertia for the complete dataset |
| once the minibatch optimization has converged in fit. |
| |
| random_state : int, RandomState instance or None, default=None |
| Determines random number generation for centroid initialization and |
| random reassignment. Use an int to make the randomness deterministic. |
| See :term:`Glossary <random_state>`. |
| |
| tol : float, default=0.0 |
| Control early stopping based on the relative center changes, as |
| measured by a smoothed, variance-normalized mean of the squared |
| center position changes. This early stopping heuristic is |
| closer to the one used for the batch variant of the algorithm, |
| but induces a slight computational and memory overhead over the |
| inertia heuristic. |
| |
| To disable convergence detection based on normalized center |
| change, set tol to 0.0 (default). |
| |
| max_no_improvement : int, default=10 |
| Control early stopping based on the number of consecutive mini |
| batches that do not yield an improvement on the smoothed inertia. |
| |
| To disable convergence detection based on inertia, set |
| max_no_improvement to None. |
| |
| init_size : int, default=None |
| Number of samples to randomly sample for speeding up the |
| initialization (sometimes at the expense of accuracy): the |
| algorithm is initialized by running a batch KMeans on a |
| random subset of the data. This needs to be larger than n_clusters. |
| 
| If `None`, `init_size = 3 * batch_size` unless `3 * batch_size` is |
| smaller than `n_clusters`, in which case `init_size = 3 * n_clusters`. |
| |
| n_init : 'auto' or int, default="auto" |
| Number of random initializations that are tried. |
| In contrast to KMeans, the algorithm is only run once, using the best of |
| the `n_init` initializations as measured by inertia. Several runs are |
| recommended for sparse high-dimensional problems (see |
| :ref:`kmeans_sparse_high_dim`). |
| |
| When `n_init='auto'`, the number of runs depends on the value of init: |
| 3 if using `init='random'` or `init` is a callable; |
| 1 if using `init='k-means++'` or `init` is an array-like. |
| |
| .. versionadded:: 1.2 |
| Added 'auto' option for `n_init`. |
| |
| .. versionchanged:: 1.4 |
| Default value for `n_init` changed to `'auto'`. |
| |
| reassignment_ratio : float, default=0.01 |
| Control the fraction of the maximum number of counts for a center to |
| be reassigned. A higher value means that low count centers are more |
| easily reassigned, which means that the model will take longer to |
| converge, but should converge to a better clustering. However, too high |
| a value may cause convergence issues, especially with a small batch |
| size. |
| |
| Attributes |
| ---------- |
| |
| cluster_centers_ : ndarray of shape (n_clusters, n_features) |
| Coordinates of cluster centers. |
| |
| labels_ : ndarray of shape (n_samples,) |
| Labels of each point (if compute_labels is set to True). |
| |
| inertia_ : float |
| The value of the inertia criterion associated with the chosen |
| partition if compute_labels is set to True. If compute_labels is set to |
| False, it's an approximation of the inertia based on an exponentially |
| weighted average of the batch inertiae. |
| The inertia is defined as the sum of square distances of samples to |
| their cluster center, weighted by the sample weights if provided. |
| |
| n_iter_ : int |
| Number of iterations over the full dataset. |
| |
| n_steps_ : int |
| Number of minibatches processed. |
| |
| .. versionadded:: 1.0 |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| See Also |
| -------- |
| KMeans : The classic implementation of the clustering method based on the |
| Lloyd's algorithm. It consumes the whole set of input data at each |
| iteration. |
| |
| Notes |
| ----- |
| See https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf |
| |
| When there are too few points in the dataset, some centers may be |
| duplicated, so the number of distinct clusters returned may not match |
| the number of clusters requested. One solution is to set |
| `reassignment_ratio=0`, which prevents reassignments of clusters that |
| are too small. |
| |
| See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a |
| comparison with :class:`~sklearn.cluster.BIRCH`. |
| |
| Examples |
| -------- |
| >>> from sklearn.cluster import MiniBatchKMeans |
| >>> import numpy as np |
| >>> X = np.array([[1, 2], [1, 4], [1, 0], |
| ... [4, 2], [4, 0], [4, 4], |
| ... [4, 5], [0, 1], [2, 2], |
| ... [3, 2], [5, 5], [1, -1]]) |
| >>> # manually fit on batches |
| >>> kmeans = MiniBatchKMeans(n_clusters=2, |
| ... random_state=0, |
| ... batch_size=6, |
| ... n_init="auto") |
| >>> kmeans = kmeans.partial_fit(X[0:6,:]) |
| >>> kmeans = kmeans.partial_fit(X[6:12,:]) |
| >>> kmeans.cluster_centers_ |
| array([[3.375, 3. ], |
| [0.75 , 0.5 ]]) |
| >>> kmeans.predict([[0, 0], [4, 4]]) |
| array([1, 0], dtype=int32) |
| >>> # fit on the whole data |
| >>> kmeans = MiniBatchKMeans(n_clusters=2, |
| ... random_state=0, |
| ... batch_size=6, |
| ... max_iter=10, |
| ... n_init="auto").fit(X) |
| >>> kmeans.cluster_centers_ |
| array([[3.55102041, 2.48979592], |
| [1.06896552, 1. ]]) |
| >>> kmeans.predict([[0, 0], [4, 4]]) |
| array([1, 0], dtype=int32) |
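| |
| For a stream of data, `partial_fit` can also be called on successive |
| chunks. A minimal sketch (the chunking via `np.array_split` is |
| illustrative, not part of the estimator API): |
| |
| >>> kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto") |
| >>> for chunk in np.array_split(X, 3): |
| ... kmeans = kmeans.partial_fit(chunk) |
| >>> kmeans.cluster_centers_.shape |
| (2, 2) |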
| """ |
|
|
| _parameter_constraints: dict = { |
| **_BaseKMeans._parameter_constraints, |
| "batch_size": [Interval(Integral, 1, None, closed="left")], |
| "compute_labels": ["boolean"], |
| "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], |
| "init_size": [Interval(Integral, 1, None, closed="left"), None], |
| "reassignment_ratio": [Interval(Real, 0, None, closed="left")], |
| } |
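| |
| # Note (illustrative, relying on scikit-learn's shared parameter |
| # validation): given the intervals above, e.g. batch_size=0 or |
| # reassignment_ratio=-0.5 is rejected at fit time, while |
| # max_no_improvement=None and init_size=None remain valid. |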
|
|
| def __init__( |
| self, |
| n_clusters=8, |
| *, |
| init="k-means++", |
| max_iter=100, |
| batch_size=1024, |
| verbose=0, |
| compute_labels=True, |
| random_state=None, |
| tol=0.0, |
| max_no_improvement=10, |
| init_size=None, |
| n_init="auto", |
| reassignment_ratio=0.01, |
| ): |
| super().__init__( |
| n_clusters=n_clusters, |
| init=init, |
| max_iter=max_iter, |
| verbose=verbose, |
| random_state=random_state, |
| tol=tol, |
| n_init=n_init, |
| ) |
|
|
| self.max_no_improvement = max_no_improvement |
| self.batch_size = batch_size |
| self.compute_labels = compute_labels |
| self.init_size = init_size |
| self.reassignment_ratio = reassignment_ratio |
|
|
| def _check_params_vs_input(self, X): |
| super()._check_params_vs_input(X, default_n_init=3) |
|
|
| self._batch_size = min(self.batch_size, X.shape[0]) |
|
|
| # init_size: size of the random subset used to initialize the centroids. |
| self._init_size = self.init_size |
| if self._init_size is None: |
| self._init_size = 3 * self._batch_size |
| if self._init_size < self.n_clusters: |
| self._init_size = 3 * self.n_clusters |
| elif self._init_size < self.n_clusters: |
| warnings.warn( |
| ( |
| f"init_size={self._init_size} should be larger than " |
| f"n_clusters={self.n_clusters}. Setting it to " |
| "min(3*n_clusters, n_samples)" |
| ), |
| RuntimeWarning, |
| stacklevel=2, |
| ) |
| self._init_size = 3 * self.n_clusters |
| self._init_size = min(self._init_size, X.shape[0]) |
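| # Worked example (illustrative numbers): batch_size=1024 and |
| # n_clusters=8 give init_size = 3 * 1024 = 3072 (then capped at |
| # n_samples), while batch_size=2 gives 3 * 2 = 6 < 8, so init_size |
| # becomes 3 * 8 = 24 instead. |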
|
|
| # reassignment_ratio must be non-negative. |
| if self.reassignment_ratio < 0: |
| raise ValueError( |
| "reassignment_ratio should be >= 0, got " |
| f"{self.reassignment_ratio} instead." |
| ) |
|
|
| def _warn_mkl_vcomp(self, n_active_threads): |
| """Warn when vcomp and mkl are both present""" |
| warnings.warn( |
| "MiniBatchKMeans is known to have a memory leak on " |
| "Windows with MKL, when there are less chunks than " |
| "available threads. You can prevent it by setting " |
| f"batch_size >= {self._n_threads * CHUNK_SIZE} or by " |
| "setting the environment variable " |
| f"OMP_NUM_THREADS={n_active_threads}" |
| ) |
|
|
| def _mini_batch_convergence( |
| self, step, n_steps, n_samples, centers_squared_diff, batch_inertia |
| ): |
| """Helper function to encapsulate the early stopping logic""" |
| |
| # Normalize inertia so values stay comparable when batch_size changes. |
| batch_inertia /= self._batch_size |
|
|
| # Count steps starting from 1 for user-friendly verbose mode. |
| step = step + 1 |
|
|
| # Ignore the first iteration: its inertia reflects the initialization. |
| if step == 1: |
| if self.verbose: |
| print( |
| f"Minibatch step {step}/{n_steps}: mean batch " |
| f"inertia: {batch_inertia}" |
| ) |
| return False |
|
|
| |
| # Compute an exponentially weighted average (EWA) of the inertia to |
| # monitor convergence while smoothing out minibatch-local stochastic |
| # variability: https://en.wikipedia.org/wiki/Moving_average |
| if self._ewa_inertia is None: |
| self._ewa_inertia = batch_inertia |
| else: |
| alpha = self._batch_size * 2.0 / (n_samples + 1) |
| alpha = min(alpha, 1) |
| self._ewa_inertia = self._ewa_inertia * (1 - alpha) + batch_inertia * alpha |
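| # The update above is an exponentially weighted average with smoothing |
| # factor alpha ~ 2 * batch_size / n_samples. Worked example |
| # (illustrative numbers): batch_size=1024 and n_samples=100_000 give |
| # alpha ~ 0.02, so a single noisy batch moves the EWA by only about 2% |
| # of the gap between the batch inertia and the current average. |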
|
|
| # Log progress to be able to monitor convergence. |
| if self.verbose: |
| print( |
| f"Minibatch step {step}/{n_steps}: mean batch inertia: " |
| f"{batch_inertia}, ewa inertia: {self._ewa_inertia}" |
| ) |
|
|
| # Early stopping based on the absolute tolerance on the squared |
| # change of the centers' positions. |
| if self._tol > 0.0 and centers_squared_diff <= self._tol: |
| if self.verbose: |
| print(f"Converged (small centers change) at step {step}/{n_steps}") |
| return True |
|
|
| # Early stopping heuristic based on a lack of improvement of the |
| # smoothed inertia. |
| if self._ewa_inertia_min is None or self._ewa_inertia < self._ewa_inertia_min: |
| self._no_improvement = 0 |
| self._ewa_inertia_min = self._ewa_inertia |
| else: |
| self._no_improvement += 1 |
|
|
| if ( |
| self.max_no_improvement is not None |
| and self._no_improvement >= self.max_no_improvement |
| ): |
| if self.verbose: |
| print( |
| "Converged (lack of improvement in inertia) at step " |
| f"{step}/{n_steps}" |
| ) |
| return True |
|
|
| return False |
|
|
| def _random_reassign(self): |
| """Check if a random reassignment needs to be done. |
| |
| Do random reassignments each time 10 * n_clusters samples have been |
| processed. |
| |
| If there are empty clusters we always want to reassign. |
| """ |
| self._n_since_last_reassign += self._batch_size |
| if (self._counts == 0).any() or self._n_since_last_reassign >= ( |
| 10 * self.n_clusters |
| ): |
| self._n_since_last_reassign = 0 |
| return True |
| return False |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit(self, X, y=None, sample_weight=None): |
| """Compute the centroids on X by chunking it into mini-batches. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Training instances to cluster. It must be noted that the data |
| will be converted to C ordering, which will cause a memory copy |
| if the given data is not C-contiguous. |
| If a sparse matrix is passed, a copy will be made if it's not in |
| CSR format. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. `sample_weight` is not used during |
| initialization if `init` is a callable or a user provided array. |
| |
| .. versionadded:: 0.20 |
| |
| Returns |
| ------- |
| self : object |
| Fitted estimator. |
| """ |
| X = validate_data( |
| self, |
| X, |
| accept_sparse="csr", |
| dtype=[np.float64, np.float32], |
| order="C", |
| accept_large_sparse=False, |
| ) |
|
|
| self._check_params_vs_input(X) |
| random_state = check_random_state(self.random_state) |
| sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
| self._n_threads = _openmp_effective_n_threads() |
| n_samples, n_features = X.shape |
|
|
| # Validate the user-provided init array, if any. |
| init = self.init |
| if _is_arraylike_not_scalar(init): |
| init = check_array(init, dtype=X.dtype, copy=True, order="C") |
| self._validate_center_shape(X, init) |
|
|
| self._check_mkl_vcomp(X, self._batch_size) |
|
|
| # Precompute the squared norms of the data points. |
| x_squared_norms = row_norms(X, squared=True) |
|
|
| # Sample a validation set used to score the candidate initializations. |
| validation_indices = random_state.randint(0, n_samples, self._init_size) |
| X_valid = X[validation_indices] |
| sample_weight_valid = sample_weight[validation_indices] |
|
|
| # Perform several initializations and keep the best one. |
| best_inertia = None |
| for init_idx in range(self._n_init): |
| if self.verbose: |
| print(f"Init {init_idx + 1}/{self._n_init} with method {init}") |
|
|
| # Initialize the centers using only a random fraction of the data, as |
| # n_samples is expected to be very large when using MiniBatchKMeans. |
| cluster_centers = self._init_centroids( |
| X, |
| x_squared_norms=x_squared_norms, |
| init=init, |
| random_state=random_state, |
| init_size=self._init_size, |
| sample_weight=sample_weight, |
| ) |
|
|
| # Compute the inertia of the candidate centers on the validation set. |
| _, inertia = _labels_inertia_threadpool_limit( |
| X_valid, |
| sample_weight_valid, |
| cluster_centers, |
| n_threads=self._n_threads, |
| ) |
|
|
| if self.verbose: |
| print(f"Inertia for init {init_idx + 1}/{self._n_init}: {inertia}") |
| if best_inertia is None or inertia < best_inertia: |
| init_centers = cluster_centers |
| best_inertia = inertia |
|
|
| centers = init_centers |
| centers_new = np.empty_like(centers) |
|
|
| # Initialize the per-center weight sums (counts). |
| self._counts = np.zeros(self.n_clusters, dtype=X.dtype) |
|
|
| # Attributes used to monitor convergence. |
| self._ewa_inertia = None |
| self._ewa_inertia_min = None |
| self._no_improvement = 0 |
|
|
| # Number of samples processed since the last center reassignment. |
| self._n_since_last_reassign = 0 |
|
|
| n_steps = (self.max_iter * n_samples) // self._batch_size |
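| |
| # Worked example (illustrative numbers): max_iter=10, n_samples=12 and |
| # batch_size=6 give n_steps = (10 * 12) // 6 = 20 minibatches, i.e. at |
| # most the equivalent of 10 full passes over the data. |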
|
|
| with _get_threadpool_controller().limit(limits=1, user_api="blas"): |
| # Perform the iterative optimization until convergence or n_steps. |
| for i in range(n_steps): |
| # Sample a minibatch from the full dataset. |
| minibatch_indices = random_state.randint(0, n_samples, self._batch_size) |
|
|
| # Perform the actual update step on the minibatch data. |
| batch_inertia = _mini_batch_step( |
| X=X[minibatch_indices], |
| sample_weight=sample_weight[minibatch_indices], |
| centers=centers, |
| centers_new=centers_new, |
| weight_sums=self._counts, |
| random_state=random_state, |
| random_reassign=self._random_reassign(), |
| reassignment_ratio=self.reassignment_ratio, |
| verbose=self.verbose, |
| n_threads=self._n_threads, |
| ) |
|
|
| if self._tol > 0.0: |
| centers_squared_diff = np.sum((centers_new - centers) ** 2) |
| else: |
| centers_squared_diff = 0 |
|
|
| centers, centers_new = centers_new, centers |
|
|
| # Monitor convergence and stop early if needed. |
| if self._mini_batch_convergence( |
| i, n_steps, n_samples, centers_squared_diff, batch_inertia |
| ): |
| break |
|
|
| self.cluster_centers_ = centers |
| self._n_features_out = self.cluster_centers_.shape[0] |
|
|
| self.n_steps_ = i + 1 |
| self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) |
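| # Worked example (illustrative numbers): stopping after i + 1 = 7 steps |
| # with batch_size=6 and n_samples=12 gives n_iter_ = ceil(42 / 12) = 4 |
| # started passes over the data. |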
|
|
| if self.compute_labels: |
| self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( |
| X, |
| sample_weight, |
| self.cluster_centers_, |
| n_threads=self._n_threads, |
| ) |
| else: |
| self.inertia_ = self._ewa_inertia * n_samples |
|
|
| return self |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def partial_fit(self, X, y=None, sample_weight=None): |
| """Update k means estimate on a single mini-batch X. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Training instances to cluster. It must be noted that the data |
| will be converted to C ordering, which will cause a memory copy |
| if the given data is not C-contiguous. |
| If a sparse matrix is passed, a copy will be made if it's not in |
| CSR format. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| sample_weight : array-like of shape (n_samples,), default=None |
| The weights for each observation in X. If None, all observations |
| are assigned equal weight. `sample_weight` is not used during |
| initialization if `init` is a callable or a user provided array. |
| |
| Returns |
| ------- |
| self : object |
| Return updated estimator. |
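| |
| Examples |
| -------- |
| A minimal sketch of incremental usage (the data here is illustrative): |
| |
| >>> import numpy as np |
| >>> from sklearn.cluster import MiniBatchKMeans |
| >>> X = np.array([[1.0, 1.0], [1.0, 0.0], [10.0, 10.0], [10.0, 9.0]]) |
| >>> kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto") |
| >>> kmeans = kmeans.partial_fit(X) |
| >>> kmeans.cluster_centers_.shape |
| (2, 2) |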
| """ |
| has_centers = hasattr(self, "cluster_centers_") |
|
|
| X = validate_data( |
| self, |
| X, |
| accept_sparse="csr", |
| dtype=[np.float64, np.float32], |
| order="C", |
| accept_large_sparse=False, |
| reset=not has_centers, |
| ) |
|
|
| self._random_state = getattr( |
| self, "_random_state", check_random_state(self.random_state) |
| ) |
| sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
| self.n_steps_ = getattr(self, "n_steps_", 0) |
|
|
| # Precompute the squared norms of the data points. |
| x_squared_norms = row_norms(X, squared=True) |
|
|
| if not has_centers: |
| # The estimator is not fitted yet: validate and initialize everything. |
| self._check_params_vs_input(X) |
| self._n_threads = _openmp_effective_n_threads() |
|
|
| # Validate the user-provided init array, if any. |
| init = self.init |
| if _is_arraylike_not_scalar(init): |
| init = check_array(init, dtype=X.dtype, copy=True, order="C") |
| self._validate_center_shape(X, init) |
|
|
| self._check_mkl_vcomp(X, X.shape[0]) |
|
|
| # Initialize the cluster centers. |
| self.cluster_centers_ = self._init_centroids( |
| X, |
| x_squared_norms=x_squared_norms, |
| init=init, |
| random_state=self._random_state, |
| init_size=self._init_size, |
| sample_weight=sample_weight, |
| ) |
|
|
| # Initialize the per-center weight sums (counts). |
| self._counts = np.zeros(self.n_clusters, dtype=X.dtype) |
|
|
| # Number of samples processed since the last center reassignment. |
| self._n_since_last_reassign = 0 |
|
|
| with _get_threadpool_controller().limit(limits=1, user_api="blas"): |
| _mini_batch_step( |
| X, |
| sample_weight=sample_weight, |
| centers=self.cluster_centers_, |
| centers_new=self.cluster_centers_, |
| weight_sums=self._counts, |
| random_state=self._random_state, |
| random_reassign=self._random_reassign(), |
| reassignment_ratio=self.reassignment_ratio, |
| verbose=self.verbose, |
| n_threads=self._n_threads, |
| ) |
|
|
| if self.compute_labels: |
| self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( |
| X, |
| sample_weight, |
| self.cluster_centers_, |
| n_threads=self._n_threads, |
| ) |
|
|
| self.n_steps_ += 1 |
| self._n_features_out = self.cluster_centers_.shape[0] |
|
|
| return self |
|
|