# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
|
|
# This is the exact and Barnes-Hut t-SNE implementation. There are other
# modifications of the algorithm that are not implemented here (see the
# references in the TSNE docstring).
|
|
| import warnings |
| from numbers import Integral, Real |
| from time import time |
|
|
| import numpy as np |
| from scipy import linalg |
| from scipy.sparse import csr_matrix, issparse |
| from scipy.spatial.distance import pdist, squareform |
|
|
| from ..base import ( |
| BaseEstimator, |
| ClassNamePrefixFeaturesOutMixin, |
| TransformerMixin, |
| _fit_context, |
| ) |
| from ..decomposition import PCA |
| from ..metrics.pairwise import _VALID_METRICS, pairwise_distances |
| from ..neighbors import NearestNeighbors |
| from ..utils import check_random_state |
| from ..utils._openmp_helpers import _openmp_effective_n_threads |
| from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params |
| from ..utils.validation import _num_samples, check_non_negative, validate_data |
|
|
# mypy error: Module 'sklearn.manifold' has no attribute '_utils'
# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne'
from . import _barnes_hut_tsne, _utils  # type: ignore
|
|
| MACHINE_EPSILON = np.finfo(np.double).eps |
|
|
|
|
| def _joint_probabilities(distances, desired_perplexity, verbose): |
| """Compute joint probabilities p_ij from distances. |
| |
| Parameters |
| ---------- |
    distances : ndarray of shape (n_samples, n_samples)
        Square matrix of squared pairwise distances between the samples
        (the diagonal is ignored).
| |
| desired_perplexity : float |
| Desired perplexity of the joint probability distributions. |
| |
| verbose : int |
| Verbosity level. |
| |
| Returns |
| ------- |
| P : ndarray of shape (n_samples * (n_samples-1) / 2,) |
| Condensed joint probability matrix. |
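
    Examples
    --------
    A minimal sketch of how :meth:`TSNE._fit` feeds this helper in the
    'exact' method; the tiny dataset and perplexity are illustrative only:

    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_distances
    >>> X = np.random.RandomState(0).randn(5, 3)
    >>> D = pairwise_distances(X, squared=True)  # squared distances, as in _fit
    >>> P = _joint_probabilities(D, desired_perplexity=2.0, verbose=0)
    >>> P.shape  # condensed form: 5 * 4 / 2 pairwise entries
    (10,)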
| """ |
    # Compute conditional probabilities such that they approximately match
    # the desired perplexity
| distances = distances.astype(np.float32, copy=False) |
| conditional_P = _utils._binary_search_perplexity( |
| distances, desired_perplexity, verbose |
| ) |
| P = conditional_P + conditional_P.T |
| sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) |
| P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) |
| return P |
|
|
|
|
| def _joint_probabilities_nn(distances, desired_perplexity, verbose): |
| """Compute joint probabilities p_ij from distances using just nearest |
| neighbors. |
| |
    This method is approximately equal to _joint_probabilities. The latter
    is O(N^2) in the number of samples, while restricting the joint
    probabilities to each sample's nearest neighbors reduces this
    substantially, to O(uN) with u the number of neighbors.
| |
| Parameters |
| ---------- |
| distances : sparse matrix of shape (n_samples, n_samples) |
        Distances from each sample to its n_neighbors nearest neighbors.
        All other distances are left at zero (and are not materialized in
        memory). The matrix should be in CSR format.
| |
| desired_perplexity : float |
| Desired perplexity of the joint probability distributions. |
| |
| verbose : int |
| Verbosity level. |
| |
| Returns |
| ------- |
| P : sparse matrix of shape (n_samples, n_samples) |
        Sparse joint probability matrix restricted to nearest neighbors.
        The matrix will be in CSR format.
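
    Examples
    --------
    An illustrative sketch mirroring how :meth:`TSNE._fit` builds the input
    for the 'barnes_hut' method (the sizes are chosen only for the example):

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> X = np.random.RandomState(0).randn(10, 3)
    >>> D = NearestNeighbors(n_neighbors=3).fit(X).kneighbors_graph(mode="distance")
    >>> D.data **= 2  # _fit squares the neighbor distances first
    >>> P = _joint_probabilities_nn(D, desired_perplexity=2.0, verbose=0)
    >>> P.shape
    (10, 10)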
| """ |
| t0 = time() |
    # Compute conditional probabilities such that they approximately match
    # the desired perplexity
| distances.sort_indices() |
| n_samples = distances.shape[0] |
| distances_data = distances.data.reshape(n_samples, -1) |
| distances_data = distances_data.astype(np.float32, copy=False) |
| conditional_P = _utils._binary_search_perplexity( |
| distances_data, desired_perplexity, verbose |
| ) |
| assert np.all(np.isfinite(conditional_P)), "All probabilities should be finite" |
|
|
    # Symmetrize the joint probability distribution using sparse operations
| P = csr_matrix( |
| (conditional_P.ravel(), distances.indices, distances.indptr), |
| shape=(n_samples, n_samples), |
| ) |
| P = P + P.T |
|
|
    # Normalize the joint probability distribution
| sum_P = np.maximum(P.sum(), MACHINE_EPSILON) |
| P /= sum_P |
|
|
| assert np.all(np.abs(P.data) <= 1.0) |
| if verbose >= 2: |
| duration = time() - t0 |
| print("[t-SNE] Computed conditional probabilities in {:.3f}s".format(duration)) |
| return P |
|
|
|
|
| def _kl_divergence( |
| params, |
| P, |
| degrees_of_freedom, |
| n_samples, |
| n_components, |
| skip_num_points=0, |
| compute_error=True, |
| ): |
| """t-SNE objective function: gradient of the KL divergence |
| of p_ijs and q_ijs and the absolute error. |
| |
| Parameters |
| ---------- |
| params : ndarray of shape (n_params,) |
| Unraveled embedding. |
| |
| P : ndarray of shape (n_samples * (n_samples-1) / 2,) |
| Condensed joint probability matrix. |
| |
| degrees_of_freedom : int |
        Degrees of freedom of the Student's t-distribution.
| |
| n_samples : int |
| Number of samples. |
| |
| n_components : int |
| Dimension of the embedded space. |
| |
| skip_num_points : int, default=0 |
| This does not compute the gradient for points with indices below |
| `skip_num_points`. This is useful when computing transforms of new |
| data where you'd like to keep the old data fixed. |
| |
    compute_error : bool, default=True
        If False, the KL divergence is not computed and NaN is returned
        instead.
| |
| Returns |
| ------- |
| kl_divergence : float |
| Kullback-Leibler divergence of p_ij and q_ij. |
| |
| grad : ndarray of shape (n_params,) |
| Unraveled gradient of the Kullback-Leibler divergence with respect to |
| the embedding. |
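
    Examples
    --------
    A small self-contained sketch; the uniform affinity vector `P` below is
    only a stand-in for the output of `_joint_probabilities`:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> n_samples, n_components = 6, 2
    >>> params = 1e-4 * rng.randn(n_samples * n_components)
    >>> n_pairs = n_samples * (n_samples - 1) // 2
    >>> P = np.full(n_pairs, 1.0 / n_pairs)
    >>> kl, grad = _kl_divergence(params, P, 1, n_samples, n_components)
    >>> grad.shape
    (12,)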
| """ |
| X_embedded = params.reshape(n_samples, n_components) |
|
|
    # Q is a heavy-tailed distribution: Student's t-distribution
| dist = pdist(X_embedded, "sqeuclidean") |
| dist /= degrees_of_freedom |
| dist += 1.0 |
| dist **= (degrees_of_freedom + 1.0) / -2.0 |
| Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON) |
|
|
    # Optimization trick below: np.dot(x, y) is faster than
    # np.sum(x * y) because it calls BLAS

    # Objective: C (Kullback-Leibler divergence of P and Q)
| if compute_error: |
| kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q)) |
| else: |
| kl_divergence = np.nan |
|
|
    # Gradient: dC/dY
| grad = np.ndarray((n_samples, n_components), dtype=params.dtype) |
| PQd = squareform((P - Q) * dist) |
| for i in range(skip_num_points, n_samples): |
| grad[i] = np.dot(np.ravel(PQd[i], order="K"), X_embedded[i] - X_embedded) |
| grad = grad.ravel() |
| c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom |
| grad *= c |
|
|
| return kl_divergence, grad |
|
|
|
|
| def _kl_divergence_bh( |
| params, |
| P, |
| degrees_of_freedom, |
| n_samples, |
| n_components, |
| angle=0.5, |
| skip_num_points=0, |
| verbose=False, |
| compute_error=True, |
| num_threads=1, |
| ): |
| """t-SNE objective function: KL divergence of p_ijs and q_ijs. |
| |
    Uses Barnes-Hut tree methods to calculate the gradient, which runs
    in O(N log N) time instead of O(N^2).
| |
| Parameters |
| ---------- |
| params : ndarray of shape (n_params,) |
| Unraveled embedding. |
| |
    P : sparse matrix of shape (n_samples, n_samples)
        Sparse approximate joint probability matrix, computed only for the
        k nearest neighbors and symmetrized. Matrix should be of CSR format.
| |
| degrees_of_freedom : int |
        Degrees of freedom of the Student's t-distribution.
| |
| n_samples : int |
| Number of samples. |
| |
| n_components : int |
| Dimension of the embedded space. |
| |
    angle : float, default=0.5
        This is the trade-off between speed and accuracy for Barnes-Hut t-SNE.
        'angle' is the angular size (referred to as theta in [3]) of a distant
        node as measured from a point. If this size is below 'angle' then the
        node is used as a summary of all the points contained within it.
        The method is not very sensitive to changes in this parameter in the
        range 0.2 - 0.8. Angles below 0.2 quickly increase computation time,
        and angles above 0.8 quickly increase the error.
| |
| skip_num_points : int, default=0 |
| This does not compute the gradient for points with indices below |
| `skip_num_points`. This is useful when computing transforms of new |
| data where you'd like to keep the old data fixed. |
| |
| verbose : int, default=False |
| Verbosity level. |
| |
    compute_error : bool, default=True
        If False, the KL divergence is not computed and NaN is returned
        instead.
| |
| num_threads : int, default=1 |
| Number of threads used to compute the gradient. This is set here to |
| avoid calling _openmp_effective_n_threads for each gradient step. |
| |
| Returns |
| ------- |
| kl_divergence : float |
| Kullback-Leibler divergence of p_ij and q_ij. |
| |
| grad : ndarray of shape (n_params,) |
| Unraveled gradient of the Kullback-Leibler divergence with respect to |
| the embedding. |
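
    Examples
    --------
    An illustrative sketch (it requires the compiled `_barnes_hut_tsne`
    extension); `P` is built as in the 'barnes_hut' path of :meth:`TSNE._fit`:

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(10, 3)
    >>> D = NearestNeighbors(n_neighbors=3).fit(X).kneighbors_graph(mode="distance")
    >>> D.data **= 2
    >>> P = _joint_probabilities_nn(D, desired_perplexity=2.0, verbose=0)
    >>> params = (1e-4 * rng.randn(10 * 2)).astype(np.float32)
    >>> kl, grad = _kl_divergence_bh(params, P, 1, 10, 2)
    >>> grad.shape
    (20,)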
| """ |
| params = params.astype(np.float32, copy=False) |
| X_embedded = params.reshape(n_samples, n_components) |
|
|
| val_P = P.data.astype(np.float32, copy=False) |
| neighbors = P.indices.astype(np.int64, copy=False) |
| indptr = P.indptr.astype(np.int64, copy=False) |
|
|
| grad = np.zeros(X_embedded.shape, dtype=np.float32) |
| error = _barnes_hut_tsne.gradient( |
| val_P, |
| X_embedded, |
| neighbors, |
| indptr, |
| grad, |
| angle, |
| n_components, |
| verbose, |
| dof=degrees_of_freedom, |
| compute_error=compute_error, |
| num_threads=num_threads, |
| ) |
| c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom |
| grad = grad.ravel() |
| grad *= c |
|
|
| return error, grad |
|
|
|
|
| def _gradient_descent( |
| objective, |
| p0, |
| it, |
| max_iter, |
| n_iter_check=1, |
| n_iter_without_progress=300, |
| momentum=0.8, |
| learning_rate=200.0, |
| min_gain=0.01, |
| min_grad_norm=1e-7, |
| verbose=0, |
| args=None, |
| kwargs=None, |
| ): |
| """Batch gradient descent with momentum and individual gains. |
| |
| Parameters |
| ---------- |
    objective : callable
        Should return a tuple of cost and gradient for a given parameter
        vector. When expensive to compute, the cost may be returned as NaN
        on most iterations; it only has to be computed when the
        `compute_error` keyword argument is True, i.e. every n_iter_check
        steps and on the final iteration.
| |
| p0 : array-like of shape (n_params,) |
| Initial parameter vector. |
| |
| it : int |
| Current number of iterations (this function will be called more than |
| once during the optimization). |
| |
| max_iter : int |
| Maximum number of gradient descent iterations. |
| |
| n_iter_check : int, default=1 |
| Number of iterations before evaluating the global error. If the error |
| is sufficiently low, we abort the optimization. |
| |
| n_iter_without_progress : int, default=300 |
| Maximum number of iterations without progress before we abort the |
| optimization. |
| |
| momentum : float within (0.0, 1.0), default=0.8 |
| The momentum generates a weight for previous gradients that decays |
| exponentially. |
| |
| learning_rate : float, default=200.0 |
| The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If |
| the learning rate is too high, the data may look like a 'ball' with any |
| point approximately equidistant from its nearest neighbours. If the |
| learning rate is too low, most points may look compressed in a dense |
| cloud with few outliers. |
| |
| min_gain : float, default=0.01 |
| Minimum individual gain for each parameter. |
| |
| min_grad_norm : float, default=1e-7 |
| If the gradient norm is below this threshold, the optimization will |
| be aborted. |
| |
| verbose : int, default=0 |
| Verbosity level. |
| |
| args : sequence, default=None |
| Arguments to pass to objective function. |
| |
| kwargs : dict, default=None |
| Keyword arguments to pass to objective function. |
| |
| Returns |
| ------- |
| p : ndarray of shape (n_params,) |
| Optimum parameters. |
| |
    error : float
        Final value of the objective function.
| |
| i : int |
| Last iteration. |
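
    Examples
    --------
    A minimal sketch on a toy quadratic objective (not a t-SNE objective);
    any callable honoring the (cost, gradient) contract above works:

    >>> import numpy as np
    >>> def objective(p, compute_error=True):
    ...     return (p ** 2).sum(), 2.0 * p
    >>> p, error, n = _gradient_descent(
    ...     objective, np.array([1.0, -1.0]), it=0, max_iter=200, learning_rate=0.1
    ... )
    >>> bool(np.allclose(p, 0.0, atol=1e-2))
    True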
| """ |
| if args is None: |
| args = [] |
| if kwargs is None: |
| kwargs = {} |
|
|
| p = p0.copy().ravel() |
| update = np.zeros_like(p) |
| gains = np.ones_like(p) |
| error = np.finfo(float).max |
| best_error = np.finfo(float).max |
| best_iter = i = it |
|
|
| tic = time() |
| for i in range(it, max_iter): |
| check_convergence = (i + 1) % n_iter_check == 0 |
        # only compute the error when needed
| kwargs["compute_error"] = check_convergence or i == max_iter - 1 |
|
|
| error, grad = objective(p, *args, **kwargs) |
|
|
| inc = update * grad < 0.0 |
| dec = np.invert(inc) |
| gains[inc] += 0.2 |
| gains[dec] *= 0.8 |
| np.clip(gains, min_gain, np.inf, out=gains) |
| grad *= gains |
| update = momentum * update - learning_rate * grad |
| p += update |
|
|
| if check_convergence: |
| toc = time() |
| duration = toc - tic |
| tic = toc |
| grad_norm = linalg.norm(grad) |
|
|
| if verbose >= 2: |
| print( |
| "[t-SNE] Iteration %d: error = %.7f," |
| " gradient norm = %.7f" |
| " (%s iterations in %0.3fs)" |
| % (i + 1, error, grad_norm, n_iter_check, duration) |
| ) |
|
|
| if error < best_error: |
| best_error = error |
| best_iter = i |
| elif i - best_iter > n_iter_without_progress: |
| if verbose >= 2: |
| print( |
| "[t-SNE] Iteration %d: did not make any progress " |
| "during the last %d episodes. Finished." |
| % (i + 1, n_iter_without_progress) |
| ) |
| break |
| if grad_norm <= min_grad_norm: |
| if verbose >= 2: |
| print( |
| "[t-SNE] Iteration %d: gradient norm %f. Finished." |
| % (i + 1, grad_norm) |
| ) |
| break |
|
|
| return p, error, i |
|
|
|
|
| @validate_params( |
| { |
| "X": ["array-like", "sparse matrix"], |
| "X_embedded": ["array-like", "sparse matrix"], |
| "n_neighbors": [Interval(Integral, 1, None, closed="left")], |
| "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], |
| }, |
| prefer_skip_nested_validation=True, |
| ) |
| def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"): |
| r"""Indicate to what extent the local structure is retained. |
| |
| The trustworthiness is within [0, 1]. It is defined as |
| |
| .. math:: |
| |
| T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1} |
| \sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k)) |
| |
| where for each sample i, :math:`\mathcal{N}_{i}^{k}` are its k nearest |
| neighbors in the output space, and every sample j is its :math:`r(i, j)`-th |
| nearest neighbor in the input space. In other words, any unexpected nearest |
| neighbors in the output space are penalised in proportion to their rank in |
| the input space. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
| (n_samples, n_samples) |
| If the metric is 'precomputed' X must be a square distance |
| matrix. Otherwise it contains a sample per row. |
| |
| X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components) |
| Embedding of the training data in low-dimensional space. |
| |
    n_neighbors : int, default=5
        The number of neighbors that will be considered. Should be fewer
        than `n_samples / 2` to ensure that the trustworthiness lies within
        [0, 1], as mentioned in [1]_. An error will be raised otherwise.
| |
| metric : str or callable, default='euclidean' |
| Which metric to use for computing pairwise distances between samples |
| from the original input space. If metric is 'precomputed', X must be a |
        matrix of pairwise distances or squared distances. Otherwise, for a
        list of available metrics, see the documentation of argument metric
        in :func:`sklearn.metrics.pairwise.pairwise_distances` and the
        metrics listed in `sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`.
        Note that the "cosine" metric uses
        :func:`~sklearn.metrics.pairwise.cosine_distances`.
| |
| .. versionadded:: 0.20 |
| |
| Returns |
| ------- |
| trustworthiness : float |
| Trustworthiness of the low-dimensional embedding. |
| |
| References |
| ---------- |
| .. [1] Jarkko Venna and Samuel Kaski. 2001. Neighborhood |
| Preservation in Nonlinear Projection Methods: An Experimental Study. |
| In Proceedings of the International Conference on Artificial Neural Networks |
| (ICANN '01). Springer-Verlag, Berlin, Heidelberg, 485-491. |
| |
| .. [2] Laurens van der Maaten. Learning a Parametric Embedding by Preserving |
| Local Structure. Proceedings of the Twelfth International Conference on |
| Artificial Intelligence and Statistics, PMLR 5:384-391, 2009. |
| |
| Examples |
| -------- |
| >>> from sklearn.datasets import make_blobs |
| >>> from sklearn.decomposition import PCA |
| >>> from sklearn.manifold import trustworthiness |
| >>> X, _ = make_blobs(n_samples=100, n_features=10, centers=3, random_state=42) |
| >>> X_embedded = PCA(n_components=2).fit_transform(X) |
| >>> print(f"{trustworthiness(X, X_embedded, n_neighbors=5):.2f}") |
| 0.92 |
| """ |
| n_samples = _num_samples(X) |
| if n_neighbors >= n_samples / 2: |
| raise ValueError( |
| f"n_neighbors ({n_neighbors}) should be less than n_samples / 2" |
| f" ({n_samples / 2})" |
| ) |
| dist_X = pairwise_distances(X, metric=metric) |
| if metric == "precomputed": |
| dist_X = dist_X.copy() |
    # we set the diagonal to np.inf to exclude the points themselves from
    # their own neighborhood
| np.fill_diagonal(dist_X, np.inf) |
| ind_X = np.argsort(dist_X, axis=1) |
    # `ind_X[i]` holds the indices of all other samples sorted by their
    # distance to sample i in the input space
| ind_X_embedded = ( |
| NearestNeighbors(n_neighbors=n_neighbors) |
| .fit(X_embedded) |
| .kneighbors(return_distance=False) |
| ) |
|
|
    # We build an inverted index of neighbors in the input space: for sample
    # i, `inverted_index[i]` maps each sample index to its (1-based) rank
    # among the neighbors of i, i.e.
    # inverted_index[i][ind_X[i]] = np.arange(1, n_samples + 1)
| inverted_index = np.zeros((n_samples, n_samples), dtype=int) |
| ordered_indices = np.arange(n_samples + 1) |
| inverted_index[ordered_indices[:-1, np.newaxis], ind_X] = ordered_indices[1:] |
| ranks = ( |
| inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] - n_neighbors |
| ) |
| t = np.sum(ranks[ranks > 0]) |
| t = 1.0 - t * ( |
| 2.0 / (n_samples * n_neighbors * (2.0 * n_samples - 3.0 * n_neighbors - 1.0)) |
| ) |
| return t |
|
|
|
|
| class TSNE(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): |
| """T-distributed Stochastic Neighbor Embedding. |
| |
| t-SNE [1] is a tool to visualize high-dimensional data. It converts |
| similarities between data points to joint probabilities and tries |
| to minimize the Kullback-Leibler divergence between the joint |
| probabilities of the low-dimensional embedding and the |
| high-dimensional data. t-SNE has a cost function that is not convex, |
| i.e. with different initializations we can get different results. |
| |
| It is highly recommended to use another dimensionality reduction |
| method (e.g. PCA for dense data or TruncatedSVD for sparse data) |
| to reduce the number of dimensions to a reasonable amount (e.g. 50) |
| if the number of features is very high. This will suppress some |
| noise and speed up the computation of pairwise distances between |
| samples. For more tips see Laurens van der Maaten's FAQ [2]. |
| |
| Read more in the :ref:`User Guide <t_sne>`. |
| |
| Parameters |
| ---------- |
| n_components : int, default=2 |
| Dimension of the embedded space. |
| |
| perplexity : float, default=30.0 |
| The perplexity is related to the number of nearest neighbors that |
| is used in other manifold learning algorithms. Larger datasets |
| usually require a larger perplexity. Consider selecting a value |
| between 5 and 50. Different values can result in significantly |
| different results. The perplexity must be less than the number |
| of samples. |
| |
| early_exaggeration : float, default=12.0 |
| Controls how tight natural clusters in the original space are in |
| the embedded space and how much space will be between them. For |
| larger values, the space between natural clusters will be larger |
| in the embedded space. Again, the choice of this parameter is not |
| very critical. If the cost function increases during initial |
| optimization, the early exaggeration factor or the learning rate |
| might be too high. |
| |
| learning_rate : float or "auto", default="auto" |
| The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If |
| the learning rate is too high, the data may look like a 'ball' with any |
| point approximately equidistant from its nearest neighbours. If the |
| learning rate is too low, most points may look compressed in a dense |
| cloud with few outliers. If the cost function gets stuck in a bad local |
| minimum increasing the learning rate may help. |
| Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE, |
| etc.) use a definition of learning_rate that is 4 times smaller than |
| ours. So our learning_rate=200 corresponds to learning_rate=800 in |
| those other implementations. The 'auto' option sets the learning_rate |
| to `max(N / early_exaggeration / 4, 50)` where N is the sample size, |
| following [4] and [5]. |
| |
| .. versionchanged:: 1.2 |
| The default value changed to `"auto"`. |
| |
| max_iter : int, default=1000 |
| Maximum number of iterations for the optimization. Should be at |
| least 250. |
| |
| .. versionchanged:: 1.5 |
| Parameter name changed from `n_iter` to `max_iter`. |
| |
| n_iter_without_progress : int, default=300 |
| Maximum number of iterations without progress before we abort the |
| optimization, used after 250 initial iterations with early |
| exaggeration. Note that progress is only checked every 50 iterations so |
| this value is rounded to the next multiple of 50. |
| |
| .. versionadded:: 0.17 |
| parameter *n_iter_without_progress* to control stopping criteria. |
| |
| min_grad_norm : float, default=1e-7 |
| If the gradient norm is below this threshold, the optimization will |
| be stopped. |
| |
| metric : str or callable, default='euclidean' |
| The metric to use when calculating distance between instances in a |
| feature array. If metric is a string, it must be one of the options |
| allowed by scipy.spatial.distance.pdist for its metric parameter, or |
| a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. |
| If metric is "precomputed", X is assumed to be a distance matrix. |
| Alternatively, if metric is a callable function, it is called on each |
| pair of instances (rows) and the resulting value recorded. The callable |
| should take two arrays from X as input and return a value indicating |
        the distance between them. The default is "euclidean", which is
        interpreted as squared euclidean distance.
| |
| metric_params : dict, default=None |
| Additional keyword arguments for the metric function. |
| |
| .. versionadded:: 1.1 |
| |
| init : {"random", "pca"} or ndarray of shape (n_samples, n_components), \ |
| default="pca" |
| Initialization of embedding. |
| PCA initialization cannot be used with precomputed distances and is |
| usually more globally stable than random initialization. |
| |
| .. versionchanged:: 1.2 |
| The default value changed to `"pca"`. |
| |
| verbose : int, default=0 |
| Verbosity level. |
| |
| random_state : int, RandomState instance or None, default=None |
| Determines the random number generator. Pass an int for reproducible |
| results across multiple function calls. Note that different |
| initializations might result in different local minima of the cost |
| function. See :term:`Glossary <random_state>`. |
| |
| method : {'barnes_hut', 'exact'}, default='barnes_hut' |
| By default the gradient calculation algorithm uses Barnes-Hut |
| approximation running in O(NlogN) time. method='exact' |
| will run on the slower, but exact, algorithm in O(N^2) time. The |
| exact algorithm should be used when nearest-neighbor errors need |
| to be better than 3%. However, the exact method cannot scale to |
| millions of examples. |
| |
| .. versionadded:: 0.17 |
| Approximate optimization *method* via the Barnes-Hut. |
| |
    angle : float, default=0.5
        Only used if method='barnes_hut'.
        This is the trade-off between speed and accuracy for Barnes-Hut t-SNE.
        'angle' is the angular size (referred to as theta in [3]) of a distant
        node as measured from a point. If this size is below 'angle' then the
        node is used as a summary of all the points contained within it.
        The method is not very sensitive to changes in this parameter in the
        range 0.2 - 0.8. Angles below 0.2 quickly increase computation time,
        and angles above 0.8 quickly increase the error.
| |
| n_jobs : int, default=None |
| The number of parallel jobs to run for neighbors search. This parameter |
| has no impact when ``metric="precomputed"`` or |
| (``metric="euclidean"`` and ``method="exact"``). |
| ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
| ``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
| for more details. |
| |
| .. versionadded:: 0.22 |
| |
| n_iter : int |
| Maximum number of iterations for the optimization. Should be at |
| least 250. |
| |
| .. deprecated:: 1.5 |
| `n_iter` was deprecated in version 1.5 and will be removed in 1.7. |
| Please use `max_iter` instead. |
| |
| Attributes |
| ---------- |
| embedding_ : array-like of shape (n_samples, n_components) |
| Stores the embedding vectors. |
| |
| kl_divergence_ : float |
| Kullback-Leibler divergence after optimization. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| learning_rate_ : float |
| Effective learning rate. |
| |
| .. versionadded:: 1.2 |
| |
| n_iter_ : int |
| Number of iterations run. |
| |
| See Also |
| -------- |
| sklearn.decomposition.PCA : Principal component analysis that is a linear |
| dimensionality reduction method. |
| sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using |
| kernels and PCA. |
| MDS : Manifold learning using multidimensional scaling. |
| Isomap : Manifold learning based on Isometric Mapping. |
| LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. |
| SpectralEmbedding : Spectral embedding for non-linear dimensionality. |
| |
| Notes |
| ----- |
| For an example of using :class:`~sklearn.manifold.TSNE` in combination with |
| :class:`~sklearn.neighbors.KNeighborsTransformer` see |
| :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`. |
| |
| References |
| ---------- |
| |
| [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data |
| Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008. |
| |
| [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding |
| https://lvdmaaten.github.io/tsne/ |
| |
| [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. |
| Journal of Machine Learning Research 15(Oct):3221-3245, 2014. |
| https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf |
| |
| [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J., |
| & Snyder-Cappione, J. E. (2019). Automated optimized parameters for |
| T-distributed stochastic neighbor embedding improve visualization |
| and analysis of large datasets. Nature Communications, 10(1), 1-12. |
| |
| [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell |
| transcriptomics. Nature Communications, 10(1), 1-14. |
| |
| Examples |
| -------- |
| >>> import numpy as np |
| >>> from sklearn.manifold import TSNE |
| >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) |
| >>> X_embedded = TSNE(n_components=2, learning_rate='auto', |
| ... init='random', perplexity=3).fit_transform(X) |
| >>> X_embedded.shape |
| (4, 2) |
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_components": [Interval(Integral, 1, None, closed="left")], |
| "perplexity": [Interval(Real, 0, None, closed="neither")], |
| "early_exaggeration": [Interval(Real, 1, None, closed="left")], |
| "learning_rate": [ |
| StrOptions({"auto"}), |
| Interval(Real, 0, None, closed="neither"), |
| ], |
| "max_iter": [Interval(Integral, 250, None, closed="left"), None], |
| "n_iter_without_progress": [Interval(Integral, -1, None, closed="left")], |
| "min_grad_norm": [Interval(Real, 0, None, closed="left")], |
| "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], |
| "metric_params": [dict, None], |
| "init": [ |
| StrOptions({"pca", "random"}), |
| np.ndarray, |
| ], |
| "verbose": ["verbose"], |
| "random_state": ["random_state"], |
| "method": [StrOptions({"barnes_hut", "exact"})], |
| "angle": [Interval(Real, 0, 1, closed="both")], |
| "n_jobs": [None, Integral], |
| "n_iter": [ |
| Interval(Integral, 250, None, closed="left"), |
| Hidden(StrOptions({"deprecated"})), |
| ], |
| } |
|
|
    # Control the number of exploration iterations with early_exaggeration on
| _EXPLORATION_MAX_ITER = 250 |
|
|
    # Control the number of iterations between progress checks
| _N_ITER_CHECK = 50 |
|
|
| def __init__( |
| self, |
| n_components=2, |
| *, |
| perplexity=30.0, |
| early_exaggeration=12.0, |
| learning_rate="auto", |
| max_iter=None, |
| n_iter_without_progress=300, |
| min_grad_norm=1e-7, |
| metric="euclidean", |
| metric_params=None, |
| init="pca", |
| verbose=0, |
| random_state=None, |
| method="barnes_hut", |
| angle=0.5, |
| n_jobs=None, |
| n_iter="deprecated", |
| ): |
| self.n_components = n_components |
| self.perplexity = perplexity |
| self.early_exaggeration = early_exaggeration |
| self.learning_rate = learning_rate |
| self.max_iter = max_iter |
| self.n_iter_without_progress = n_iter_without_progress |
| self.min_grad_norm = min_grad_norm |
| self.metric = metric |
| self.metric_params = metric_params |
| self.init = init |
| self.verbose = verbose |
| self.random_state = random_state |
| self.method = method |
| self.angle = angle |
| self.n_jobs = n_jobs |
| self.n_iter = n_iter |
|
|
| def _check_params_vs_input(self, X): |
| if self.perplexity >= X.shape[0]: |
| raise ValueError("perplexity must be less than n_samples") |
|
|
| def _fit(self, X, skip_num_points=0): |
| """Private function to fit the model using X as training data.""" |
|
|
| if isinstance(self.init, str) and self.init == "pca" and issparse(X): |
| raise TypeError( |
| "PCA initialization is currently not supported " |
| "with the sparse input matrix. Use " |
| 'init="random" instead.' |
| ) |
|
|
| if self.learning_rate == "auto": |
            # 'auto' heuristic: max(N / early_exaggeration / 4, 50),
            # following Belkina et al. (2019); see the learning_rate docstring
| self.learning_rate_ = X.shape[0] / self.early_exaggeration / 4 |
| self.learning_rate_ = np.maximum(self.learning_rate_, 50) |
| else: |
| self.learning_rate_ = self.learning_rate |
|
|
| if self.method == "barnes_hut": |
| X = validate_data( |
| self, |
| X, |
| accept_sparse=["csr"], |
| ensure_min_samples=2, |
| dtype=[np.float32, np.float64], |
| ) |
| else: |
| X = validate_data( |
| self, |
| X, |
| accept_sparse=["csr", "csc", "coo"], |
| dtype=[np.float32, np.float64], |
| ) |
| if self.metric == "precomputed": |
| if isinstance(self.init, str) and self.init == "pca": |
| raise ValueError( |
| 'The parameter init="pca" cannot be used with metric="precomputed".' |
| ) |
| if X.shape[0] != X.shape[1]: |
| raise ValueError("X should be a square distance matrix") |
|
|
| check_non_negative( |
| X, |
| ( |
| "TSNE.fit(). With metric='precomputed', X " |
| "should contain positive distances." |
| ), |
| ) |
|
|
| if self.method == "exact" and issparse(X): |
| raise TypeError( |
                'TSNE with method="exact" does not accept a sparse '
| 'precomputed distance matrix. Use method="barnes_hut" ' |
| "or provide the dense distance matrix." |
| ) |
|
|
        if self.method == "barnes_hut" and self.n_components > 3:
            raise ValueError(
                "'n_components' should be less than 4 for the "
                "barnes_hut algorithm, as it relies on "
                "quad-tree or oct-tree."
            )
| random_state = check_random_state(self.random_state) |
|
|
| n_samples = X.shape[0] |
|
|
| neighbors_nn = None |
| if self.method == "exact": |
            # Retrieve the distance matrix, either using the precomputed one
            # or computing it
| if self.metric == "precomputed": |
| distances = X |
| else: |
| if self.verbose: |
| print("[t-SNE] Computing pairwise distances...") |
|
|
| if self.metric == "euclidean": |
                    # Euclidean distances are squared here via squared=True,
                    # rather than with **= 2 afterwards, because
                    # euclidean_distances computes squared distances natively
                    # and would otherwise apply a needless square root
| distances = pairwise_distances(X, metric=self.metric, squared=True) |
| else: |
| metric_params_ = self.metric_params or {} |
| distances = pairwise_distances( |
| X, metric=self.metric, n_jobs=self.n_jobs, **metric_params_ |
| ) |
|
|
| if np.any(distances < 0): |
| raise ValueError( |
| "All distances should be positive, the metric given is not correct" |
| ) |
|
|
| if self.metric != "euclidean": |
| distances **= 2 |
|
|
            # compute the joint probability distribution for the input space
| P = _joint_probabilities(distances, self.perplexity, self.verbose) |
| assert np.all(np.isfinite(P)), "All probabilities should be finite" |
| assert np.all(P >= 0), "All probabilities should be non-negative" |
            assert np.all(
                P <= 1
            ), "All probabilities should be less than or equal to one"
|
|
| else: |
            # Compute the number of nearest neighbors to find.
            # LvdM uses 3 * perplexity as the number of neighbors.
            # With very few points, fall back to n_samples - 1 neighbors.
| n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1)) |
|
|
| if self.verbose: |
| print("[t-SNE] Computing {} nearest neighbors...".format(n_neighbors)) |
|
|
            # Find the nearest neighbors for every point
| knn = NearestNeighbors( |
| algorithm="auto", |
| n_jobs=self.n_jobs, |
| n_neighbors=n_neighbors, |
| metric=self.metric, |
| metric_params=self.metric_params, |
| ) |
| t0 = time() |
| knn.fit(X) |
| duration = time() - t0 |
| if self.verbose: |
| print( |
| "[t-SNE] Indexed {} samples in {:.3f}s...".format( |
| n_samples, duration |
| ) |
| ) |
|
|
| t0 = time() |
| distances_nn = knn.kneighbors_graph(mode="distance") |
| duration = time() - t0 |
| if self.verbose: |
| print( |
| "[t-SNE] Computed neighbors for {} samples in {:.3f}s...".format( |
| n_samples, duration |
| ) |
| ) |
|
|
            # Free the memory used by the nearest neighbors index
| del knn |
|
|
            # kneighbors_graph returns euclidean distances, but the method
            # needs squared distances to be consistent with the 'exact'
            # variant and with the derivation of t-SNE, which assumes
            # (squared) euclidean distances in the input space
| distances_nn.data **= 2 |
|
|
            # compute the joint probability distribution for the input space
| P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose) |
|
|
| if isinstance(self.init, np.ndarray): |
| X_embedded = self.init |
| elif self.init == "pca": |
| pca = PCA( |
| n_components=self.n_components, |
| svd_solver="randomized", |
| random_state=random_state, |
| ) |
            # Always output a numpy array, no matter what is configured globally
| pca.set_output(transform="default") |
| X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) |
            # PCA is rescaled so that PC1 has standard deviation 1e-4, which
            # is the default scale of the random initialization
| X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4 |
| elif self.init == "random": |
            # The embedding is initialized with iid samples from Gaussians
            # with standard deviation 1e-4
| X_embedded = 1e-4 * random_state.standard_normal( |
| size=(n_samples, self.n_components) |
| ).astype(np.float32) |
|
|
        # Degrees of freedom of the Student's t-distribution. The suggestion
        # degrees_of_freedom = n_components - 1 comes from
        # "Learning a Parametric Embedding by Preserving Local Structure"
        # Laurens van der Maaten, 2009.
| degrees_of_freedom = max(self.n_components - 1, 1) |
|
|
| return self._tsne( |
| P, |
| degrees_of_freedom, |
| n_samples, |
| X_embedded=X_embedded, |
| neighbors=neighbors_nn, |
| skip_num_points=skip_num_points, |
| ) |
|
|
| def _tsne( |
| self, |
| P, |
| degrees_of_freedom, |
| n_samples, |
| X_embedded, |
| neighbors=None, |
| skip_num_points=0, |
| ): |
| """Runs t-SNE.""" |
        # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
        # and the Student's t-distributions Q. The optimization algorithm
        # that we use is batch gradient descent with two stages:
        # * initial optimization with early exaggeration and momentum at 0.5
        # * final optimization with momentum at 0.8
| params = X_embedded.ravel() |
|
|
| opt_args = { |
| "it": 0, |
| "n_iter_check": self._N_ITER_CHECK, |
| "min_grad_norm": self.min_grad_norm, |
| "learning_rate": self.learning_rate_, |
| "verbose": self.verbose, |
| "kwargs": dict(skip_num_points=skip_num_points), |
| "args": [P, degrees_of_freedom, n_samples, self.n_components], |
| "n_iter_without_progress": self._EXPLORATION_MAX_ITER, |
| "max_iter": self._EXPLORATION_MAX_ITER, |
| "momentum": 0.5, |
| } |
| if self.method == "barnes_hut": |
| obj_func = _kl_divergence_bh |
| opt_args["kwargs"]["angle"] = self.angle |
            # Repeat verbose argument for _kl_divergence_bh
| opt_args["kwargs"]["verbose"] = self.verbose |
            # Get the number of threads for gradient computation here to
            # avoid recomputing it per iteration
| opt_args["kwargs"]["num_threads"] = _openmp_effective_n_threads() |
| else: |
| obj_func = _kl_divergence |
|
|
        # Learning schedule (part 1): do _EXPLORATION_MAX_ITER iterations
        # with lower momentum (0.5) while the P values are scaled up by the
        # early exaggeration factor
| P *= self.early_exaggeration |
| params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) |
| if self.verbose: |
| print( |
| "[t-SNE] KL divergence after %d iterations with early exaggeration: %f" |
| % (it + 1, kl_divergence) |
| ) |
|
|
        # Learning schedule (part 2): disable early exaggeration and finish
        # optimization with a higher momentum at 0.8
| P /= self.early_exaggeration |
| remaining = self._max_iter - self._EXPLORATION_MAX_ITER |
| if it < self._EXPLORATION_MAX_ITER or remaining > 0: |
| opt_args["max_iter"] = self._max_iter |
| opt_args["it"] = it + 1 |
| opt_args["momentum"] = 0.8 |
| opt_args["n_iter_without_progress"] = self.n_iter_without_progress |
| params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) |
|
|
        # Save the final number of iterations
| self.n_iter_ = it |
|
|
| if self.verbose: |
| print( |
| "[t-SNE] KL divergence after %d iterations: %f" |
| % (it + 1, kl_divergence) |
| ) |
|
|
| X_embedded = params.reshape(n_samples, self.n_components) |
| self.kl_divergence_ = kl_divergence |
|
|
| return X_embedded |
|
|
| @_fit_context( |
        # TSNE.metric is not validated yet
| prefer_skip_nested_validation=False |
| ) |
| def fit_transform(self, X, y=None): |
| """Fit X into an embedded space and return that transformed output. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
| (n_samples, n_samples) |
| If the metric is 'precomputed' X must be a square distance |
| matrix. Otherwise it contains a sample per row. If the method |
| is 'exact', X may be a sparse matrix of type 'csr', 'csc' |
| or 'coo'. If the method is 'barnes_hut' and the metric is |
| 'precomputed', X may be a precomputed sparse graph. |
| |
| y : None |
| Ignored. |
| |
| Returns |
| ------- |
| X_new : ndarray of shape (n_samples, n_components) |
| Embedding of the training data in low-dimensional space. |
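
        Examples
        --------
        A small illustrative run; the sizes are chosen only to keep the
        example fast:

        >>> import numpy as np
        >>> from sklearn.manifold import TSNE
        >>> X = np.random.RandomState(0).randn(30, 4)
        >>> X_2d = TSNE(n_components=2, perplexity=5.0, random_state=0).fit_transform(X)
        >>> X_2d.shape
        (30, 2)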
| """ |
        # TODO(1.7): remove the deprecation handling for `n_iter`
| if self.n_iter != "deprecated": |
| if self.max_iter is not None: |
| raise ValueError( |
| "Both 'n_iter' and 'max_iter' attributes were set. Attribute" |
| " 'n_iter' was deprecated in version 1.5 and will be removed in" |
| " 1.7. To avoid this error, only set the 'max_iter' attribute." |
| ) |
| warnings.warn( |
| ( |
| "'n_iter' was renamed to 'max_iter' in version 1.5 and " |
| "will be removed in 1.7." |
| ), |
| FutureWarning, |
| ) |
| self._max_iter = self.n_iter |
| elif self.max_iter is None: |
| self._max_iter = 1000 |
| else: |
| self._max_iter = self.max_iter |
|
|
| self._check_params_vs_input(X) |
| embedding = self._fit(X) |
| self.embedding_ = embedding |
| return self.embedding_ |
|
|
| @_fit_context( |
        # TSNE.metric is not validated yet
| prefer_skip_nested_validation=False |
| ) |
| def fit(self, X, y=None): |
| """Fit X into an embedded space. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
| (n_samples, n_samples) |
| If the metric is 'precomputed' X must be a square distance |
| matrix. Otherwise it contains a sample per row. If the method |
| is 'exact', X may be a sparse matrix of type 'csr', 'csc' |
| or 'coo'. If the method is 'barnes_hut' and the metric is |
| 'precomputed', X may be a precomputed sparse graph. |
| |
| y : None |
| Ignored. |
| |
| Returns |
| ------- |
| self : object |
| Fitted estimator. |
| """ |
| self.fit_transform(X) |
| return self |
|
|
| @property |
| def _n_features_out(self): |
| """Number of transformed output features.""" |
| return self.embedding_.shape[1] |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.input_tags.pairwise = self.metric == "precomputed" |
| return tags |
|
|