Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified 12 months ago

9.13 kB

	"""Testing for Spectral Biclustering methods"""

	import numpy as np
	import pytest
	from scipy.sparse import issparse

	from sklearn.base import BaseEstimator, BiclusterMixin
	from sklearn.cluster import SpectralBiclustering, SpectralCoclustering
	from sklearn.cluster._bicluster import (
	_bistochastic_normalize,
	_log_normalize,
	_scale_normalize,
	)
	from sklearn.datasets import make_biclusters, make_checkerboard
	from sklearn.metrics import consensus_score, v_measure_score
	from sklearn.model_selection import ParameterGrid
	from sklearn.utils._testing import (
	assert_almost_equal,
	assert_array_almost_equal,
	assert_array_equal,
	)
	from sklearn.utils.fixes import CSR_CONTAINERS


	class MockBiclustering(BiclusterMixin, BaseEstimator):
	    """Mock bicluster estimator for testing ``get_submatrix``.

	    ``get_indices`` returns hard-wired indices, so the estimator can be used
	    without being fitted.
	    """

	    def __init__(self):
	        """Take no parameters."""

	    def get_indices(self, i):
	        """Return fixed row and column indices, ignoring ``i``.

	        Overridden to reproduce the old ``get_submatrix`` test: the row mask
	        selects rows 0, 1 and 4, the column mask selects columns 2 and 3.
	        """
	        row_mask = [True, True, False, False, True]
	        col_mask = [False, False, True, True]
	        return np.flatnonzero(row_mask), np.flatnonzero(col_mask)


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_get_submatrix(csr_container):
	    """Check ``get_submatrix`` on dense, sparse and nested-list inputs.

	    The mock's fixed indices pick rows 0, 1, 4 and columns 2, 3 of a 5x4
	    ``arange`` matrix. The returned submatrix is then overwritten with -1 to
	    check that the input itself was left untouched.
	    """
	    data = np.arange(20).reshape(5, 4)
	    model = MockBiclustering()

	    for X in (data, csr_container(data), data.tolist()):
	        submatrix = model.get_submatrix(0, X)
	        if issparse(submatrix):
	            submatrix = submatrix.toarray()
	        assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]])
	        submatrix[:] = -1
	        if issparse(X):
	            X = X.toarray()
	        # Convert before comparing: for the nested-list input, ``X != -1``
	        # is a plain list-vs-int comparison that is always True, which
	        # would make this check vacuous.
	        assert np.all(np.asarray(X) != -1)


	def _test_shape_indices(model):
	# Test get_shape and get_indices on fitted model.
	for i in range(model.n_clusters):
	m, n = model.get_shape(i)
	i_ind, j_ind = model.get_indices(i)
	assert len(i_ind) == m
	assert len(j_ind) == n


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_spectral_coclustering(global_random_seed, csr_container):
	    """Check Dhillon's spectral co-clustering on a simple problem.

	    Every option combination in the grid below must recover the three
	    generated biclusters exactly, on dense and on sparse input.
	    """
	    search_space = ParameterGrid(
	        {
	            "svd_method": ["randomized", "arpack"],
	            "n_svd_vecs": [None, 20],
	            "mini_batch": [False, True],
	            "init": ["k-means++"],
	            "n_init": [10],
	        }
	    )
	    data, true_rows, true_cols = make_biclusters(
	        (30, 30), 3, noise=0.1, random_state=global_random_seed
	    )
	    # The data must be nonnegative before it is made sparse.
	    data -= data.min()
	    # Threshold some values to zero.
	    data = np.where(data < 1, 0, data)
	    one_per_sample = np.ones(30)

	    for matrix in (data, csr_container(data)):
	        for options in search_space:
	            estimator = SpectralCoclustering(
	                n_clusters=3, random_state=global_random_seed, **options
	            )
	            estimator.fit(matrix)

	            assert estimator.rows_.shape == (3, 30)
	            # Each row and each column belongs to exactly one bicluster.
	            assert_array_equal(estimator.rows_.sum(axis=0), one_per_sample)
	            assert_array_equal(estimator.columns_.sum(axis=0), one_per_sample)
	            assert consensus_score(estimator.biclusters_, (true_rows, true_cols)) == 1

	            _test_shape_indices(estimator)


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_spectral_biclustering(global_random_seed, csr_container):
	    """Check the Kluger methods on a checkerboard dataset.

	    Starting from a fixed base configuration, one non-default parameter is
	    changed at a time and the model is fitted on dense and on sparse input.
	    """
	    S, rows, cols = make_checkerboard(
	        (30, 30), 3, noise=0.5, random_state=global_random_seed
	    )

	    non_default_params = {
	        "method": ["scale", "log"],
	        "svd_method": ["arpack"],
	        "n_svd_vecs": [20],
	        "mini_batch": [True],
	    }

	    for mat in (S, csr_container(S)):
	        for param_name, param_values in non_default_params.items():
	            for param_value in param_values:
	                model = SpectralBiclustering(
	                    n_clusters=3,
	                    n_init=3,
	                    init="k-means++",
	                    random_state=global_random_seed,
	                )
	                model.set_params(**{param_name: param_value})

	                if issparse(mat) and model.get_params().get("method") == "log":
	                    # cannot take log of sparse matrix
	                    with pytest.raises(ValueError):
	                        model.fit(mat)
	                    continue

	                model.fit(mat)

	                # 3 row clusters x 3 column clusters give 9 biclusters, and
	                # every row/column takes part in 3 of them.
	                assert model.rows_.shape == (9, 30)
	                assert model.columns_.shape == (9, 30)
	                assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
	                assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
	                assert consensus_score(model.biclusters_, (rows, cols)) == 1

	                _test_shape_indices(model)


	def _do_scale_test(scaled):
	"""Check that rows sum to one constant, and columns to another."""
	row_sum = scaled.sum(axis=1)
	col_sum = scaled.sum(axis=0)
	if issparse(scaled):
	row_sum = np.asarray(row_sum).squeeze()
	col_sum = np.asarray(col_sum).squeeze()
	assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1)
	assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1)


	def _do_bistochastic_test(scaled):
	    """Check that rows and columns sum to the same constant.

	    Runs the scale check first, then compares the mean column sum with the
	    mean row sum at 1 decimal of precision.
	    """
	    _do_scale_test(scaled)
	    mean_col_sum = scaled.sum(axis=0).mean()
	    mean_row_sum = scaled.sum(axis=1).mean()
	    assert_almost_equal(mean_col_sum, mean_row_sum, decimal=1)


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_scale_normalize(global_random_seed, csr_container):
	    """Check ``_scale_normalize`` on dense and sparse random matrices.

	    The result must pass the scale check, and sparse input must give a
	    sparse result.
	    """
	    rng = np.random.RandomState(global_random_seed)
	    dense = rng.rand(100, 100)
	    for matrix in (dense, csr_container(dense)):
	        normalized, _row_factor, _col_factor = _scale_normalize(matrix)
	        _do_scale_test(normalized)
	        # Sparse in implies sparse out.
	        assert issparse(normalized) or not issparse(matrix)


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_bistochastic_normalize(global_random_seed, csr_container):
	    """Check ``_bistochastic_normalize`` on dense and sparse random matrices.

	    The result must pass the bistochastic check, and sparse input must give
	    a sparse result.
	    """
	    rng = np.random.RandomState(global_random_seed)
	    dense = rng.rand(100, 100)
	    for matrix in (dense, csr_container(dense)):
	        normalized = _bistochastic_normalize(matrix)
	        _do_bistochastic_test(normalized)
	        # Sparse in implies sparse out.
	        assert issparse(normalized) or not issparse(matrix)


	def test_log_normalize(global_random_seed):
	    """Check that a log-normalized matrix plus a constant is bistochastic."""
	    rng = np.random.RandomState(global_random_seed)
	    log_scaled = _log_normalize(rng.rand(100, 100))
	    # Adding any constant to a log-scaled matrix should make it
	    # bistochastic.
	    shifted = log_scaled + 1
	    _do_bistochastic_test(shifted)


	def test_fit_best_piecewise(global_random_seed):
	    """Check that ``_fit_best_piecewise`` returns the first two input vectors.

	    Those two rows take only two distinct values each, while the third row
	    has six distinct values.
	    """
	    candidates = np.array(
	        [
	            [0, 0, 0, 1, 1, 1],
	            [2, 2, 2, 3, 3, 3],
	            [0, 1, 2, 3, 4, 5],
	        ]
	    )
	    estimator = SpectralBiclustering(random_state=global_random_seed)
	    selected = estimator._fit_best_piecewise(candidates, n_best=2, n_clusters=2)
	    assert_array_equal(selected, candidates[:2])


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_project_and_cluster(global_random_seed, csr_container):
	    """Check ``_project_and_cluster`` on dense and sparse input.

	    The first two data rows are identical, as are the last two, so the
	    labels must match the grouping ``[0, 0, 1, 1]`` up to relabelling.
	    """
	    estimator = SpectralBiclustering(random_state=global_random_seed)
	    dense = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
	    projection = np.array([[1, 0], [0, 1], [0, 0]])
	    expected_groups = [0, 0, 1, 1]
	    for matrix in (dense, csr_container(dense)):
	        predicted = estimator._project_and_cluster(matrix, projection, n_clusters=2)
	        agreement = v_measure_score(predicted, expected_groups)
	        assert_almost_equal(agreement, 1.0)


	def test_perfect_checkerboard(global_random_seed):
	    """Check exact recovery of noise-free checkerboards of several shapes.

	    XXX Previously failed on build bot (not reproducible).
	    """
	    model = SpectralBiclustering(
	        n_clusters=3, svd_method="arpack", random_state=global_random_seed
	    )

	    # Square, tall and wide inputs, fitted in turn with the same estimator.
	    for shape in ((30, 30), (40, 30), (30, 40)):
	        data, true_rows, true_cols = make_checkerboard(
	            shape, 3, noise=0, random_state=global_random_seed
	        )
	        model.fit(data)
	        assert consensus_score(model.biclusters_, (true_rows, true_cols)) == 1


	@pytest.mark.parametrize(
	    "params, type_err, err_msg",
	    [
	        ({"n_clusters": 6}, ValueError, "n_clusters should be <= n_samples=5"),
	        ({"n_clusters": (3, 3, 3)}, ValueError, "Incorrect parameter n_clusters"),
	        ({"n_clusters": (3, 6)}, ValueError, "Incorrect parameter n_clusters"),
	        (
	            {"n_components": 3, "n_best": 4},
	            ValueError,
	            "n_best=4 must be <= n_components=3",
	        ),
	    ],
	)
	def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
	    """Check that `SpectralBiclustering.fit` rejects invalid parameters.

	    Each case is fitted on a 5x5 matrix and must raise ``type_err`` with a
	    message matching ``err_msg``.
	    """
	    estimator = SpectralBiclustering(**params)
	    X = np.arange(25).reshape(5, 5)
	    with pytest.raises(type_err, match=err_msg):
	        estimator.fit(X)


	@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
	def test_n_features_in_(est):
	    """Check that ``n_features_in_`` is set by ``fit`` to the number of columns."""
	    data, _rows, _cols = make_biclusters((3, 3), 3, random_state=0)

	    # The attribute must not exist on an unfitted estimator.
	    assert not hasattr(est, "n_features_in_")
	    est.fit(data)
	    assert est.n_features_in_ == 3