| """Truncated SVD for sparse matrices, aka latent semantic analysis (LSA).""" |
|
|
| |
| |
|
|
| from numbers import Integral, Real |
|
|
| import numpy as np |
| import scipy.sparse as sp |
| from scipy.sparse.linalg import svds |
|
|
| from ..base import ( |
| BaseEstimator, |
| ClassNamePrefixFeaturesOutMixin, |
| TransformerMixin, |
| _fit_context, |
| ) |
| from ..utils import check_array, check_random_state |
| from ..utils._arpack import _init_arpack_v0 |
| from ..utils._param_validation import Interval, StrOptions |
| from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip |
| from ..utils.sparsefuncs import mean_variance_axis |
| from ..utils.validation import check_is_fitted, validate_data |
|
|
| __all__ = ["TruncatedSVD"] |
|
|
|
|
| class TruncatedSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): |
| """Dimensionality reduction using truncated SVD (aka LSA). |
| |
| This transformer performs linear dimensionality reduction by means of |
| truncated singular value decomposition (SVD). Contrary to PCA, this |
| estimator does not center the data before computing the singular value |
| decomposition. This means it can work with sparse matrices |
| efficiently. |
| |
| In particular, truncated SVD works on term count/tf-idf matrices as |
| returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In |
| that context, it is known as latent semantic analysis (LSA). |
| |
| This estimator supports two algorithms: a fast randomized SVD solver, and |
| a "naive" algorithm that uses ARPACK as an eigensolver on `X * X.T` or |
| `X.T * X`, whichever is more efficient. |
| |
| Read more in the :ref:`User Guide <LSA>`. |
| |
| Parameters |
| ---------- |
| n_components : int, default=2 |
| Desired dimensionality of output data. |
| If algorithm='arpack', must be strictly less than the number of features. |
| If algorithm='randomized', must be less than or equal to the number of features. |
| The default value is useful for visualisation. For LSA, a value of |
| 100 is recommended. |
| |
| algorithm : {'arpack', 'randomized'}, default='randomized' |
| SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy |
| (scipy.sparse.linalg.svds), or "randomized" for the randomized |
| algorithm due to Halko (2009). |
| |
| n_iter : int, default=5 |
| Number of iterations for randomized SVD solver. Not used by ARPACK. The |
| default is larger than the default in |
| :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse |
| matrices that may have large slowly decaying spectrum. |
| |
| n_oversamples : int, default=10 |
| Number of oversamples for randomized SVD solver. Not used by ARPACK. |
| See :func:`~sklearn.utils.extmath.randomized_svd` for a complete |
| description. |
| |
| .. versionadded:: 1.1 |
| |
| power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' |
| Power iteration normalizer for randomized SVD solver. |
| Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` |
| for more details. |
| |
| .. versionadded:: 1.1 |
| |
| random_state : int, RandomState instance or None, default=None |
| Used during randomized svd. Pass an int for reproducible results across |
| multiple function calls. |
| See :term:`Glossary <random_state>`. |
| |
| tol : float, default=0.0 |
| Tolerance for ARPACK. 0 means machine precision. Ignored by randomized |
| SVD solver. |
| |
| Attributes |
| ---------- |
| components_ : ndarray of shape (n_components, n_features) |
| The right singular vectors of the input data. |
| |
| explained_variance_ : ndarray of shape (n_components,) |
| The variance of the training samples transformed by a projection to |
| each component. |
| |
| explained_variance_ratio_ : ndarray of shape (n_components,) |
| Percentage of variance explained by each of the selected components. |
| |
| singular_values_ : ndarray of shape (n_components,) |
| The singular values corresponding to each of the selected components. |
| The singular values are equal to the 2-norms of the ``n_components`` |
| variables in the lower-dimensional space. |
| |
| n_features_in_ : int |
| Number of features seen during :term:`fit`. |
| |
| .. versionadded:: 0.24 |
| |
| feature_names_in_ : ndarray of shape (`n_features_in_`,) |
| Names of features seen during :term:`fit`. Defined only when `X` |
| has feature names that are all strings. |
| |
| .. versionadded:: 1.0 |
| |
| See Also |
| -------- |
| DictionaryLearning : Find a dictionary that sparsely encodes data. |
| FactorAnalysis : A simple linear generative model with |
| Gaussian latent variables. |
| IncrementalPCA : Incremental principal components analysis. |
| KernelPCA : Kernel Principal component analysis. |
| NMF : Non-Negative Matrix Factorization. |
| PCA : Principal component analysis. |
| |
| Notes |
| ----- |
| SVD suffers from a problem called "sign indeterminacy", which means the |
| sign of the ``components_`` and the output from transform depend on the |
| algorithm and random state. To work around this, fit instances of this |
| class to data once, then keep the instance around to do transformations. |
| |
| References |
| ---------- |
| :arxiv:`Halko, et al. (2009). "Finding structure with randomness: |
| Stochastic algorithms for constructing approximate matrix decompositions" |
| <0909.4061>` |
| |
| Examples |
| -------- |
| >>> from sklearn.decomposition import TruncatedSVD |
| >>> from scipy.sparse import csr_matrix |
| >>> import numpy as np |
| >>> np.random.seed(0) |
| >>> X_dense = np.random.rand(100, 100) |
| >>> X_dense[:, 2 * np.arange(50)] = 0 |
| >>> X = csr_matrix(X_dense) |
| >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) |
| >>> svd.fit(X) |
| TruncatedSVD(n_components=5, n_iter=7, random_state=42) |
| >>> print(svd.explained_variance_ratio_) |
| [0.0157... 0.0512... 0.0499... 0.0479... 0.0453...] |
| >>> print(svd.explained_variance_ratio_.sum()) |
| 0.2102... |
| >>> print(svd.singular_values_) |
| [35.2410... 4.5981... 4.5420... 4.4486... 4.3288...] |
| """ |
|
|
| _parameter_constraints: dict = { |
| "n_components": [Interval(Integral, 1, None, closed="left")], |
| "algorithm": [StrOptions({"arpack", "randomized"})], |
| "n_iter": [Interval(Integral, 0, None, closed="left")], |
| "n_oversamples": [Interval(Integral, 1, None, closed="left")], |
| "power_iteration_normalizer": [StrOptions({"auto", "OR", "LU", "none"})], |
| "random_state": ["random_state"], |
| "tol": [Interval(Real, 0, None, closed="left")], |
| } |
|
|
| def __init__( |
| self, |
| n_components=2, |
| *, |
| algorithm="randomized", |
| n_iter=5, |
| n_oversamples=10, |
| power_iteration_normalizer="auto", |
| random_state=None, |
| tol=0.0, |
| ): |
| self.algorithm = algorithm |
| self.n_components = n_components |
| self.n_iter = n_iter |
| self.n_oversamples = n_oversamples |
| self.power_iteration_normalizer = power_iteration_normalizer |
| self.random_state = random_state |
| self.tol = tol |
|
|
| def fit(self, X, y=None): |
| """Fit model on training data X. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Training data. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| Returns |
| ------- |
| self : object |
| Returns the transformer object. |
| """ |
| self.fit_transform(X) |
| return self |
|
|
| @_fit_context(prefer_skip_nested_validation=True) |
| def fit_transform(self, X, y=None): |
| """Fit model to X and perform dimensionality reduction on X. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| Training data. |
| |
| y : Ignored |
| Not used, present here for API consistency by convention. |
| |
| Returns |
| ------- |
| X_new : ndarray of shape (n_samples, n_components) |
| Reduced version of X. This will always be a dense array. |
| """ |
| X = validate_data(self, X, accept_sparse=["csr", "csc"], ensure_min_features=2) |
| random_state = check_random_state(self.random_state) |
|
|
| if self.algorithm == "arpack": |
| v0 = _init_arpack_v0(min(X.shape), random_state) |
| U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0) |
| |
| |
| Sigma = Sigma[::-1] |
| |
| U, VT = svd_flip(U[:, ::-1], VT[::-1], u_based_decision=False) |
|
|
| elif self.algorithm == "randomized": |
| if self.n_components > X.shape[1]: |
| raise ValueError( |
| f"n_components({self.n_components}) must be <=" |
| f" n_features({X.shape[1]})." |
| ) |
| U, Sigma, VT = randomized_svd( |
| X, |
| self.n_components, |
| n_iter=self.n_iter, |
| n_oversamples=self.n_oversamples, |
| power_iteration_normalizer=self.power_iteration_normalizer, |
| random_state=random_state, |
| flip_sign=False, |
| ) |
| U, VT = svd_flip(U, VT, u_based_decision=False) |
|
|
| self.components_ = VT |
|
|
| |
| |
| if self.algorithm == "randomized" or ( |
| self.algorithm == "arpack" and self.tol > 0 |
| ): |
| X_transformed = safe_sparse_dot(X, self.components_.T) |
| else: |
| X_transformed = U * Sigma |
|
|
| |
| self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) |
| if sp.issparse(X): |
| _, full_var = mean_variance_axis(X, axis=0) |
| full_var = full_var.sum() |
| else: |
| full_var = np.var(X, axis=0).sum() |
| self.explained_variance_ratio_ = exp_var / full_var |
| self.singular_values_ = Sigma |
|
|
| return X_transformed |
|
|
| def transform(self, X): |
| """Perform dimensionality reduction on X. |
| |
| Parameters |
| ---------- |
| X : {array-like, sparse matrix} of shape (n_samples, n_features) |
| New data. |
| |
| Returns |
| ------- |
| X_new : ndarray of shape (n_samples, n_components) |
| Reduced version of X. This will always be a dense array. |
| """ |
| check_is_fitted(self) |
| X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) |
| return safe_sparse_dot(X, self.components_.T) |
|
|
| def inverse_transform(self, X): |
| """Transform X back to its original space. |
| |
| Returns an array X_original whose transform would be X. |
| |
| Parameters |
| ---------- |
| X : array-like of shape (n_samples, n_components) |
| New data. |
| |
| Returns |
| ------- |
| X_original : ndarray of shape (n_samples, n_features) |
| Note that this is always a dense array. |
| """ |
| X = check_array(X) |
| return np.dot(X, self.components_) |
|
|
| def __sklearn_tags__(self): |
| tags = super().__sklearn_tags__() |
| tags.input_tags.sparse = True |
| tags.transformer_tags.preserves_dtype = ["float64", "float32"] |
| return tags |
|
|
| @property |
| def _n_features_out(self): |
| """Number of transformed output features.""" |
| return self.components_.shape[0] |
|
|