| import functools |
| import warnings |
| from typing import Any, List |
|
|
| import numpy as np |
| import pytest |
| import scipy.sparse as sp |
|
|
| from sklearn.exceptions import DataDimensionalityWarning, NotFittedError |
| from sklearn.metrics import euclidean_distances |
| from sklearn.random_projection import ( |
| GaussianRandomProjection, |
| SparseRandomProjection, |
| _gaussian_random_matrix, |
| _sparse_random_matrix, |
| johnson_lindenstrauss_min_dim, |
| ) |
| from sklearn.utils._testing import ( |
| assert_allclose, |
| assert_allclose_dense_sparse, |
| assert_almost_equal, |
| assert_array_almost_equal, |
| assert_array_equal, |
| ) |
| from sklearn.utils.fixes import COO_CONTAINERS |
|
|
# Random-matrix generator functions exercised by the tests, split into the
# sparse and the dense (Gaussian) variant, plus their union.
all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
all_random_matrix = [*all_sparse_random_matrix, *all_dense_random_matrix]

# Random-projection estimator classes, grouped the same way.
all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
all_RandomProjection = [*all_SparseRandomProjection, *all_DenseRandomProjection]
|
|
|
|
def make_sparse_random_data(
    coo_container,
    n_samples,
    n_features,
    n_nonzeros,
    random_state=None,
    sparse_format="csr",
):
    """Build a random matrix with Gaussian values at uniformly drawn positions.

    Parameters
    ----------
    coo_container : callable
        Constructor invoked as
        ``coo_container((values, (rows, cols)), shape=(n_samples, n_features))``.
    n_samples, n_features : int
        Shape of the generated matrix.
    n_nonzeros : int
        Number of (value, row, column) triplets that are drawn.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState``.
    sparse_format : str or None, default="csr"
        Target format passed to ``asformat``. With ``None`` a dense array
        (``toarray()``) is returned instead.

    Returns
    -------
    The generated data, sparse or dense depending on ``sparse_format``.
    """
    rng = np.random.RandomState(random_state)

    # The draw order (values, then row indices, then column indices) is kept
    # fixed so that a given seed always produces the same matrix.
    values = rng.randn(n_nonzeros)
    row_indices = rng.randint(n_samples, size=n_nonzeros)
    col_indices = rng.randint(n_features, size=n_nonzeros)

    coo = coo_container(
        (values, (row_indices, col_indices)),
        shape=(n_samples, n_features),
    )

    if sparse_format is None:
        return coo.toarray()
    return coo.asformat(sparse_format)
|
|
|
|
def densify(matrix):
    """Return a dense view of ``matrix``.

    SciPy sparse inputs are converted with ``toarray()``; anything else is
    returned unchanged (the very same object).
    """
    return matrix.toarray() if sp.issparse(matrix) else matrix
|
|
|
|
# Default problem size shared by the tests in this module.
n_samples = 10
n_features = 1000
# One drawn entry per 100 cells of the (n_samples, n_features) matrix.
n_nonzeros = n_samples * n_features // 100
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.parametrize(
    "n_samples, eps",
    [
        # second eps value is 1.1
        ([100, 110], [0.9, 1.1]),
        # second eps value is 0.0
        ([90, 100], [0.1, 0.0]),
        # second n_samples value is negative
        ([50, -40], [0.1, 0.2]),
    ],
)
def test_invalid_jl_domain(n_samples, eps):
    """`johnson_lindenstrauss_min_dim` must raise ``ValueError`` for each case.

    NOTE(review): the per-case comments point at the entry that is presumably
    the out-of-domain one; confirm against the function's documented domain.
    """
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(n_samples, eps=eps)
|
|
|
|
def test_input_size_jl_min_dim():
    """Check how `johnson_lindenstrauss_min_dim` handles input sizes.

    ``n_samples`` and ``eps`` of different lengths (3 vs. 2) must raise
    ``ValueError``, while two arrays of identical 2D shape must be accepted
    without raising.
    """
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])

    # Use a seeded, local generator rather than the global ``np.random`` state
    # so the test input is reproducible and independent of other tests.
    rng = np.random.RandomState(0)
    johnson_lindenstrauss_min_dim(
        rng.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
    )
|
|
|
|
| |
| |
| |
def check_input_size_random_matrix(random_matrix):
    """Assert that ``random_matrix`` rejects non-positive dimensions.

    Every ``(n_components, n_features)`` pair below contains a zero or a
    negative size and must make the generator raise ``ValueError``.
    """
    invalid_shapes = ((0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0))
    for bad_components, bad_features in invalid_shapes:
        with pytest.raises(ValueError):
            random_matrix(bad_components, bad_features)
|
|
|
|
def check_size_generated(random_matrix):
    """Assert that ``random_matrix`` returns the requested shape.

    The generator is called with a few small ``(n_components, n_features)``
    pairs and the ``shape`` of each result must equal the requested pair.
    """
    requested_shapes = ((1, 5), (5, 1), (5, 5), (1, 1))
    for expected_shape in requested_shapes:
        generated = random_matrix(*expected_shape)
        assert generated.shape == expected_shape
|
|
|
|
def check_zero_mean_and_unit_norm(random_matrix):
    """Assert that a generated column has ~zero mean and ~unit norm.

    A single feature is projected onto 10000 components (a ``(10000, 1)``
    matrix, seeded with ``random_state=0``). Its mean must be 0 to 3 decimals
    and its norm 1 to 1 decimal.
    """
    column = densify(random_matrix(10000, 1, random_state=0))

    observed_mean = np.mean(column)
    observed_norm = np.linalg.norm(column)

    assert_array_almost_equal(0, observed_mean, 3)
    assert_array_almost_equal(1.0, observed_norm, 1)
|
|
|
|
def check_input_with_sparse_random_matrix(random_matrix):
    """Assert that ``random_matrix`` raises ``ValueError`` for bad densities.

    The densities tried are -1.0, 0.0 and 1.1, on an otherwise valid 5 x 10
    request.
    """
    n_components = 5
    n_features = 10

    invalid_densities = (-1.0, 0.0, 1.1)
    for bad_density in invalid_densities:
        with pytest.raises(ValueError):
            random_matrix(n_components, n_features, density=bad_density)
|
|
|
|
@pytest.mark.parametrize("random_matrix", all_random_matrix)
def test_basic_property_of_random_matrix(random_matrix):
    """Run the generic input-validation, shape and moment checks on a generator."""
    generic_checks = (
        check_input_size_random_matrix,
        check_size_generated,
        check_zero_mean_and_unit_norm,
    )
    for run_check in generic_checks:
        run_check(random_matrix)
|
|
|
|
@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
def test_basic_property_of_sparse_random_matrix(random_matrix):
    """Run the checks that only apply to sparse generators.

    The density argument is validated first; then the zero-mean / unit-norm
    check is repeated with the generator pinned to ``density=1.0``.
    """
    check_input_with_sparse_random_matrix(random_matrix)

    fully_dense_generator = functools.partial(random_matrix, density=1.0)
    check_zero_mean_and_unit_norm(fully_dense_generator)
|
|
|
|
def test_gaussian_random_matrix():
    """Check the first two moments of a Gaussian random matrix.

    For a 100 x 1000 matrix the empirical mean must be 0 (2 decimals) and the
    unbiased empirical variance must match ``1 / n_components`` (1 decimal).
    """
    n_components, n_features = 100, 1000
    A = _gaussian_random_matrix(n_components, n_features, random_state=0)

    empirical_mean = np.mean(A)
    empirical_var = np.var(A, ddof=1)

    assert_array_almost_equal(0.0, empirical_mean, 2)
    assert_array_almost_equal(empirical_var, 1 / n_components, 1)
|
|
|
|
def test_sparse_random_matrix():
    """Check the values and their frequencies in a sparse random matrix.

    With ``s = 1 / density`` the matrix may only contain
    ``-sqrt(s) / sqrt(n_components)``, ``0`` and ``+sqrt(s) / sqrt(n_components)``.
    The test asserts that the empirical frequencies of these values are
    ``1 / (2 * s)``, ``1 - 1 / s`` and ``1 / (2 * s)`` respectively, and that
    the variance of each indicator matches the corresponding Bernoulli
    variance, all to 2 decimals.
    """
    n_components = 100
    n_features = 500

    for density in [0.3, 1.0]:
        s = 1 / density
        # Magnitude of the non-zero entries.
        scale = np.sqrt(s) / np.sqrt(n_components)

        A = densify(
            _sparse_random_matrix(
                n_components, n_features, density=density, random_state=0
            )
        )

        # Only the expected values may occur.
        values = np.unique(A)
        assert scale in values
        assert -scale in values

        if density != 1.0:
            assert 0.0 in values
            assert np.size(values) == 3
        else:
            # A fully dense matrix has no zero entries.
            assert np.size(values) == 2

        # (value, expected frequency, expected variance of the indicator)
        expectations = (
            (0.0, 1 - 1 / s, (1 - 1 / s) * 1 / s),
            (scale, 1 / (2 * s), (1 - 1 / (2 * s)) * 1 / (2 * s)),
            (-scale, 1 / (2 * s), (1 - 1 / (2 * s)) * 1 / (2 * s)),
        )

        for value, expected_freq, _ in expectations:
            assert_almost_equal(np.mean(A == value), expected_freq, decimal=2)

        for value, _, expected_var in expectations:
            assert_almost_equal(np.var(A == value, ddof=1), expected_var, decimal=2)
|
|
|
|
| |
| |
| |
|
|
|
|
def test_random_projection_transformer_invalid_input():
    """Fitting with ``n_components="auto"`` on a single 3-feature sample must fail.

    Every random projection class is expected to raise ``ValueError``.
    """
    single_sample = [[0, 1, 2]]
    for projection_cls in all_RandomProjection:
        estimator = projection_cls(n_components="auto")
        with pytest.raises(ValueError):
            estimator.fit(single_sample)
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_try_to_transform_before_fit(coo_container, global_random_seed):
    """Calling ``transform`` on an unfitted estimator must raise ``NotFittedError``."""
    dense_data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )
    for projection_cls in all_RandomProjection:
        unfitted = projection_cls(n_components="auto")
        with pytest.raises(NotFittedError):
            unfitted.transform(dense_data)
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_too_many_samples_to_find_a_safe_embedding(coo_container, global_random_seed):
    """``n_components="auto"`` must fail when the target dimension is too large.

    With 1000 samples, 100 features and ``eps=0.1`` the error message must
    report a target dimension of 5920, larger than ``n_features=100``.
    """
    data = make_sparse_random_data(
        coo_container,
        n_samples=1000,
        n_features=100,
        n_nonzeros=1000,
        random_state=global_random_seed,
        sparse_format=None,
    )

    # The message does not depend on the estimator class, so build it once.
    expected_msg = (
        "eps=0.100000 and n_samples=1000 lead to a target dimension"
        " of 5920 which is larger than the original space with"
        " n_features=100"
    )

    for projection_cls in all_RandomProjection:
        estimator = projection_cls(n_components="auto", eps=0.1)
        with pytest.raises(ValueError, match=expected_msg):
            estimator.fit(data)
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_random_projection_embedding_quality(coo_container):
    """Projected pairwise distances must stay within ``eps`` of the originals.

    For every pair of distinct samples, the ratio of the squared Euclidean
    distance after projection to the one before projection has to lie strictly
    between ``1 - eps`` and ``1 + eps``.
    """
    eps = 0.2
    data = make_sparse_random_data(
        coo_container,
        n_samples=8,
        n_features=5000,
        n_nonzeros=15000,
        random_state=0,
        sparse_format=None,
    )

    squared_dists = euclidean_distances(data, squared=True).ravel()
    # Drop zero distances (e.g. each sample against itself) to avoid dividing
    # by zero in the ratio below.
    nonzero_mask = squared_dists != 0.0
    squared_dists = squared_dists[nonzero_mask]

    for projection_cls in all_RandomProjection:
        estimator = projection_cls(n_components="auto", eps=eps, random_state=0)
        embedded = estimator.fit_transform(data)

        # Same pairs, in the same order, as for the original distances.
        embedded_squared_dists = euclidean_distances(embedded, squared=True).ravel()
        embedded_squared_dists = embedded_squared_dists[nonzero_mask]

        ratio = embedded_squared_dists / squared_dists

        assert ratio.max() < 1 + eps
        assert 1 - eps < ratio.min()
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_SparseRandomProj_output_representation(coo_container):
    """Check the container type returned by sparse random projections.

    * ``dense_output=True``: an ``ndarray`` for dense and sparse input.
    * ``dense_output=False``: an ``ndarray`` for dense input, a sparse matrix
      for sparse input.
    """
    make_data = functools.partial(
        make_sparse_random_data,
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=0,
    )
    dense_data = make_data(sparse_format=None)
    sparse_data = make_data(sparse_format="csr")

    for projection_cls in all_SparseRandomProjection:
        # dense_output=True: the output is always an ndarray.
        forced_dense = projection_cls(
            n_components=10, dense_output=True, random_state=0
        )
        forced_dense.fit(dense_data)
        assert isinstance(forced_dense.transform(dense_data), np.ndarray)
        assert isinstance(forced_dense.transform(sparse_data), np.ndarray)

        # dense_output=False: the output representation follows the input.
        follows_input = projection_cls(
            n_components=10, dense_output=False, random_state=0
        )
        follows_input = follows_input.fit(dense_data)
        assert isinstance(follows_input.transform(dense_data), np.ndarray)
        assert sp.issparse(follows_input.transform(sparse_data))
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_correct_RandomProjection_dimensions_embedding(
    coo_container, global_random_seed
):
    """Check fitted attributes, output shapes and reproducibility.

    Covers the automatically chosen number of components (110 for
    ``eps=0.5``), the automatic density of sparse projections, repeated
    ``transform`` calls, ``fit_transform`` with the same seed, rejection of
    inputs with a different number of features, and the number of stored
    entries of a very sparse projection matrix.
    """
    expected_dim = 110
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )

    for RandomProjection in all_RandomProjection:
        is_sparse_projection = RandomProjection in all_SparseRandomProjection

        rp = RandomProjection(n_components="auto", random_state=0, eps=0.5)
        rp = rp.fit(data)

        # The constructor parameter is left untouched; the resolved value is
        # stored in the fitted attribute.
        assert rp.n_components == "auto"
        assert rp.n_components_ == expected_dim

        if is_sparse_projection:
            assert rp.density == "auto"
            assert_almost_equal(rp.density_, 0.03, 2)

        assert rp.components_.shape == (expected_dim, n_features)

        first_projection = rp.transform(data)
        assert first_projection.shape == (n_samples, expected_dim)

        # Transforming a second time gives exactly the same result.
        second_projection = rp.transform(data)
        assert_array_equal(first_projection, second_projection)

        # A fresh estimator with the same seed (and default n_components)
        # yields the same embedding through fit_transform.
        rp_same_seed = RandomProjection(random_state=0, eps=0.5)
        refit_projection = rp_same_seed.fit_transform(data)
        assert_array_equal(first_projection, refit_projection)

        # Input with a different number of features must be rejected.
        with pytest.raises(ValueError):
            rp.transform(data[:, 1:5])

        # With density=0.001 the 100 x n_features component matrix is
        # expected to store between 85 and 115 entries (exclusive bounds).
        if is_sparse_projection:
            rp = RandomProjection(n_components=100, density=0.001, random_state=0)
            projected = rp.fit_transform(data)
            assert projected.shape == (n_samples, 100)
            assert rp.components_.shape == (100, n_features)
            assert rp.components_.nnz < 115
            assert 85 < rp.components_.nnz
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_warning_n_components_greater_than_n_features(
    coo_container, global_random_seed
):
    """Requesting more components than features must emit a warning.

    Fitting with ``n_components = n_features + 1`` has to raise a
    ``DataDimensionalityWarning`` for every random projection class.
    """
    n_samples = 5
    n_features = 20
    n_nonzeros = int(n_features / 4)

    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )

    too_many_components = n_features + 1
    for projection_cls in all_RandomProjection:
        estimator = projection_cls(n_components=too_many_components)
        with pytest.warns(DataDimensionalityWarning):
            estimator.fit(data)
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_works_with_sparse_data(coo_container, global_random_seed):
    """Fitting on dense or CSR input must give the same components.

    Two estimators with identical parameters and seed are fitted, one on the
    dense data and one on the CSR version of the same data; their
    ``components_`` must be almost equal once densified.
    """
    n_samples = 5
    n_features = 20
    n_nonzeros = int(n_features / 4)

    make_data = functools.partial(
        make_sparse_random_data,
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
    )
    dense_data = make_data(sparse_format=None)
    sparse_data = make_data(sparse_format="csr")

    for projection_cls in all_RandomProjection:
        fitted_on_dense = projection_cls(n_components=3, random_state=1).fit(
            dense_data
        )
        fitted_on_sparse = projection_cls(n_components=3, random_state=1).fit(
            sparse_data
        )
        assert_array_almost_equal(
            densify(fitted_on_dense.components_),
            densify(fitted_on_sparse.components_),
        )
|
|
|
|
def test_johnson_lindenstrauss_min_dim():
    """Check the Johnson-Lindenstrauss bound for a very small ``eps``.

    Regression test for #17111: before #19374 this failed on 32-bit systems.
    The expected value does not fit in a 32-bit integer.
    """
    min_dim = johnson_lindenstrauss_min_dim(100, eps=1e-5)
    assert min_dim == 368416070986
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_feature_names_out(
    coo_container, random_projection_cls, global_random_seed
):
    """Output feature names must be ``<lowercased class name><index>``.

    One name is expected per fitted component, indexed from 0.
    """
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )

    estimator = random_projection_cls(n_components=2)
    estimator.fit(data)
    names_out = estimator.get_feature_names_out()

    class_name_lower = random_projection_cls.__name__.lower()
    expected = []
    for i in range(estimator.n_components_):
        expected.append(f"{class_name_lower}{i}")
    expected_names_out = np.array(expected, dtype=object)

    assert_array_equal(names_out, expected_names_out)
|
|
|
|
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize("compute_inverse_components", [True, False])
def test_inverse_transform(
    coo_container,
    n_samples,
    n_features,
    random_projection_cls,
    compute_inverse_components,
    global_random_seed,
):
    """Check ``inverse_transform`` on dense and CSR input.

    For each input the test verifies that

    * ``inverse_components_`` exists with shape ``(n_features, n_components)``
      when ``compute_inverse_components`` is true,
    * ``inverse_transform`` maps back to the original shape, and
    * transforming the reconstruction again reproduces the projection.
    """
    n_components = 10

    estimator = random_projection_cls(
        n_components=n_components,
        compute_inverse_components=compute_inverse_components,
        random_state=global_random_seed,
    )

    make_X = functools.partial(
        make_sparse_random_data,
        coo_container,
        n_samples,
        n_features,
        n_nonzeros=n_samples * n_features // 100 + 1,
        random_state=global_random_seed,
    )
    X_dense = make_X(sparse_format=None)
    X_csr = make_X(sparse_format="csr")

    for X in (X_dense, X_csr):
        # Some parametrizations request more components than features; the
        # resulting warning is expected here and therefore silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=(
                    "The number of components is higher than the number of features"
                ),
                category=DataDimensionalityWarning,
            )
            projected = estimator.fit_transform(X)

        if compute_inverse_components:
            assert hasattr(estimator, "inverse_components_")
            inverse = estimator.inverse_components_
            assert inverse.shape == (n_features, n_components)

        reconstructed = estimator.inverse_transform(projected)
        assert reconstructed.shape == X.shape

        reprojected = estimator.transform(reconstructed)
        # Densify the first projection if needed so both sides can be compared.
        if hasattr(projected, "toarray"):
            projected = projected.toarray()
        assert_allclose(projected, reprojected, rtol=1e-7, atol=1e-10)
|
|
|
|
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize(
    "input_dtype, expected_dtype",
    (
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.int32, np.float64),
        (np.int64, np.float64),
    ),
)
def test_random_projection_dtype_match(
    random_projection_cls, input_dtype, expected_dtype
):
    """Check the dtype of ``components_`` and of the transformed data.

    Float inputs must keep their dtype; integer inputs must yield float64.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(25, 3000).astype(input_dtype)

    estimator = random_projection_cls(random_state=0)
    transformed = estimator.fit_transform(X)

    assert estimator.components_.dtype == expected_dtype
    assert transformed.dtype == expected_dtype
|
|
|
|
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_numerical_consistency(random_projection_cls):
    """float32 and float64 inputs must give numerically consistent results.

    Both the projections (up to ``atol=1e-5``) and the fitted ``components_``
    of two identically seeded estimators are compared.
    """
    atol = 1e-5
    X = np.random.RandomState(42).rand(25, 3000)

    rp_32 = random_projection_cls(random_state=0)
    rp_64 = random_projection_cls(random_state=0)

    X_32 = X.astype(np.float32)
    X_64 = X.astype(np.float64)
    projection_32 = rp_32.fit_transform(X_32)
    projection_64 = rp_64.fit_transform(X_64)

    assert_allclose(projection_64, projection_32, atol=atol)
    assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)
|
|