| """Testing for Spectral Biclustering methods""" |
|
|
| import numpy as np |
| import pytest |
| from scipy.sparse import issparse |
|
|
| from sklearn.base import BaseEstimator, BiclusterMixin |
| from sklearn.cluster import SpectralBiclustering, SpectralCoclustering |
| from sklearn.cluster._bicluster import ( |
| _bistochastic_normalize, |
| _log_normalize, |
| _scale_normalize, |
| ) |
| from sklearn.datasets import make_biclusters, make_checkerboard |
| from sklearn.metrics import consensus_score, v_measure_score |
| from sklearn.model_selection import ParameterGrid |
| from sklearn.utils._testing import ( |
| assert_almost_equal, |
| assert_array_almost_equal, |
| assert_array_equal, |
| ) |
| from sklearn.utils.fixes import CSR_CONTAINERS |
|
|
|
|
| class MockBiclustering(BiclusterMixin, BaseEstimator): |
| |
| def __init__(self): |
| pass |
|
|
| def get_indices(self, i): |
| |
| return ( |
| np.where([True, True, False, False, True])[0], |
| np.where([False, False, True, True])[0], |
| ) |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_get_submatrix(csr_container): |
| data = np.arange(20).reshape(5, 4) |
| model = MockBiclustering() |
|
|
| for X in (data, csr_container(data), data.tolist()): |
| submatrix = model.get_submatrix(0, X) |
| if issparse(submatrix): |
| submatrix = submatrix.toarray() |
| assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]]) |
| submatrix[:] = -1 |
| if issparse(X): |
| X = X.toarray() |
| assert np.all(X != -1) |
|
|
|
|
| def _test_shape_indices(model): |
| |
| for i in range(model.n_clusters): |
| m, n = model.get_shape(i) |
| i_ind, j_ind = model.get_indices(i) |
| assert len(i_ind) == m |
| assert len(j_ind) == n |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_spectral_coclustering(global_random_seed, csr_container): |
| |
| param_grid = { |
| "svd_method": ["randomized", "arpack"], |
| "n_svd_vecs": [None, 20], |
| "mini_batch": [False, True], |
| "init": ["k-means++"], |
| "n_init": [10], |
| } |
| S, rows, cols = make_biclusters( |
| (30, 30), 3, noise=0.1, random_state=global_random_seed |
| ) |
| S -= S.min() |
| S = np.where(S < 1, 0, S) |
| for mat in (S, csr_container(S)): |
| for kwargs in ParameterGrid(param_grid): |
| model = SpectralCoclustering( |
| n_clusters=3, random_state=global_random_seed, **kwargs |
| ) |
| model.fit(mat) |
|
|
| assert model.rows_.shape == (3, 30) |
| assert_array_equal(model.rows_.sum(axis=0), np.ones(30)) |
| assert_array_equal(model.columns_.sum(axis=0), np.ones(30)) |
| assert consensus_score(model.biclusters_, (rows, cols)) == 1 |
|
|
| _test_shape_indices(model) |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_spectral_biclustering(global_random_seed, csr_container): |
| |
| S, rows, cols = make_checkerboard( |
| (30, 30), 3, noise=0.5, random_state=global_random_seed |
| ) |
|
|
| non_default_params = { |
| "method": ["scale", "log"], |
| "svd_method": ["arpack"], |
| "n_svd_vecs": [20], |
| "mini_batch": [True], |
| } |
|
|
| for mat in (S, csr_container(S)): |
| for param_name, param_values in non_default_params.items(): |
| for param_value in param_values: |
| model = SpectralBiclustering( |
| n_clusters=3, |
| n_init=3, |
| init="k-means++", |
| random_state=global_random_seed, |
| ) |
| model.set_params(**dict([(param_name, param_value)])) |
|
|
| if issparse(mat) and model.get_params().get("method") == "log": |
| |
| with pytest.raises(ValueError): |
| model.fit(mat) |
| continue |
| else: |
| model.fit(mat) |
|
|
| assert model.rows_.shape == (9, 30) |
| assert model.columns_.shape == (9, 30) |
| assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30)) |
| assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30)) |
| assert consensus_score(model.biclusters_, (rows, cols)) == 1 |
|
|
| _test_shape_indices(model) |
|
|
|
|
| def _do_scale_test(scaled): |
| """Check that rows sum to one constant, and columns to another.""" |
| row_sum = scaled.sum(axis=1) |
| col_sum = scaled.sum(axis=0) |
| if issparse(scaled): |
| row_sum = np.asarray(row_sum).squeeze() |
| col_sum = np.asarray(col_sum).squeeze() |
| assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1) |
| assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1) |
|
|
|
|
| def _do_bistochastic_test(scaled): |
| """Check that rows and columns sum to the same constant.""" |
| _do_scale_test(scaled) |
| assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1) |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_scale_normalize(global_random_seed, csr_container): |
| generator = np.random.RandomState(global_random_seed) |
| X = generator.rand(100, 100) |
| for mat in (X, csr_container(X)): |
| scaled, _, _ = _scale_normalize(mat) |
| _do_scale_test(scaled) |
| if issparse(mat): |
| assert issparse(scaled) |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_bistochastic_normalize(global_random_seed, csr_container): |
| generator = np.random.RandomState(global_random_seed) |
| X = generator.rand(100, 100) |
| for mat in (X, csr_container(X)): |
| scaled = _bistochastic_normalize(mat) |
| _do_bistochastic_test(scaled) |
| if issparse(mat): |
| assert issparse(scaled) |
|
|
|
|
| def test_log_normalize(global_random_seed): |
| |
| |
| generator = np.random.RandomState(global_random_seed) |
| mat = generator.rand(100, 100) |
| scaled = _log_normalize(mat) + 1 |
| _do_bistochastic_test(scaled) |
|
|
|
|
| def test_fit_best_piecewise(global_random_seed): |
| model = SpectralBiclustering(random_state=global_random_seed) |
| vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]]) |
| best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2) |
| assert_array_equal(best, vectors[:2]) |
|
|
|
|
| @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
| def test_project_and_cluster(global_random_seed, csr_container): |
| model = SpectralBiclustering(random_state=global_random_seed) |
| data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]]) |
| vectors = np.array([[1, 0], [0, 1], [0, 0]]) |
| for mat in (data, csr_container(data)): |
| labels = model._project_and_cluster(mat, vectors, n_clusters=2) |
| assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0) |
|
|
|
|
| def test_perfect_checkerboard(global_random_seed): |
| |
| model = SpectralBiclustering( |
| 3, svd_method="arpack", random_state=global_random_seed |
| ) |
|
|
| S, rows, cols = make_checkerboard( |
| (30, 30), 3, noise=0, random_state=global_random_seed |
| ) |
| model.fit(S) |
| assert consensus_score(model.biclusters_, (rows, cols)) == 1 |
|
|
| S, rows, cols = make_checkerboard( |
| (40, 30), 3, noise=0, random_state=global_random_seed |
| ) |
| model.fit(S) |
| assert consensus_score(model.biclusters_, (rows, cols)) == 1 |
|
|
| S, rows, cols = make_checkerboard( |
| (30, 40), 3, noise=0, random_state=global_random_seed |
| ) |
| model.fit(S) |
| assert consensus_score(model.biclusters_, (rows, cols)) == 1 |
|
|
|
|
| @pytest.mark.parametrize( |
| "params, type_err, err_msg", |
| [ |
| ( |
| {"n_clusters": 6}, |
| ValueError, |
| "n_clusters should be <= n_samples=5", |
| ), |
| ( |
| {"n_clusters": (3, 3, 3)}, |
| ValueError, |
| "Incorrect parameter n_clusters", |
| ), |
| ( |
| {"n_clusters": (3, 6)}, |
| ValueError, |
| "Incorrect parameter n_clusters", |
| ), |
| ( |
| {"n_components": 3, "n_best": 4}, |
| ValueError, |
| "n_best=4 must be <= n_components=3", |
| ), |
| ], |
| ) |
| def test_spectralbiclustering_parameter_validation(params, type_err, err_msg): |
| """Check parameters validation in `SpectralBiClustering`""" |
| data = np.arange(25).reshape((5, 5)) |
| model = SpectralBiclustering(**params) |
| with pytest.raises(type_err, match=err_msg): |
| model.fit(data) |
|
|
|
|
| @pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) |
| def test_n_features_in_(est): |
| X, _, _ = make_biclusters((3, 3), 3, random_state=0) |
|
|
| assert not hasattr(est, "n_features_in_") |
| est.fit(X) |
| assert est.n_features_in_ == 3 |
|
|