Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified 12 months ago

19.6 kB

	import functools
	import warnings
	from typing import Any, List

	import numpy as np
	import pytest
	import scipy.sparse as sp

	from sklearn.exceptions import DataDimensionalityWarning, NotFittedError
	from sklearn.metrics import euclidean_distances
	from sklearn.random_projection import (
	GaussianRandomProjection,
	SparseRandomProjection,
	_gaussian_random_matrix,
	_sparse_random_matrix,
	johnson_lindenstrauss_min_dim,
	)
	from sklearn.utils._testing import (
	assert_allclose,
	assert_allclose_dense_sparse,
	assert_almost_equal,
	assert_array_almost_equal,
	assert_array_equal,
	)
	from sklearn.utils.fixes import COO_CONTAINERS

	# Registries of the private random-matrix generators exercised below, split
	# into a "sparse" and a "dense" list and then concatenated so that tests can
	# be parametrized over either group or over all of them.
	all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
	all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
	all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix

	# Same grouping for the transformer classes under test.
	all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
	all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
	all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection


	def make_sparse_random_data(
	    coo_container,
	    n_samples,
	    n_features,
	    n_nonzeros,
	    random_state=None,
	    sparse_format="csr",
	):
	    """Generate random data with Gaussian values at uniformly drawn positions.

	    ``n_nonzeros`` values are drawn with ``randn`` and placed at row/column
	    indices drawn with ``randint`` inside an ``(n_samples, n_features)``
	    matrix built through ``coo_container``.

	    The result is converted with ``asformat(sparse_format)`` (``"csr"`` by
	    default); when ``sparse_format`` is ``None`` a dense array is returned
	    instead.
	    """
	    generator = np.random.RandomState(random_state)

	    # The draw order (values, rows, columns) determines the generated data
	    # for a given seed, so it must not be rearranged.
	    values = generator.randn(n_nonzeros)
	    row_indices = generator.randint(n_samples, size=n_nonzeros)
	    col_indices = generator.randint(n_features, size=n_nonzeros)

	    coo = coo_container(
	        (values, (row_indices, col_indices)),
	        shape=(n_samples, n_features),
	    )

	    if sparse_format is None:
	        return coo.toarray()
	    return coo.asformat(sparse_format)


	def densify(matrix):
	    """Return ``matrix`` unchanged unless it is sparse, then return ``toarray()``."""
	    return matrix.toarray() if sp.issparse(matrix) else matrix


	# Default problem size shared by the tests below.
	n_samples = 10
	n_features = 1000
	# One non-zero draw per hundred matrix entries.
	n_nonzeros = int((n_samples * n_features) / 100.0)


	###############################################################################
	# test on JL lemma
	###############################################################################


	@pytest.mark.parametrize(
	    "n_samples, eps",
	    [
	        # second eps is 1.1
	        ([100, 110], [0.9, 1.1]),
	        # second eps is 0.0
	        ([90, 100], [0.1, 0.0]),
	        # second n_samples is negative
	        ([50, -40], [0.1, 0.2]),
	    ],
	)
	def test_invalid_jl_domain(n_samples, eps):
	    """Each parametrized ``(n_samples, eps)`` pair must raise ``ValueError``."""
	    with pytest.raises(ValueError):
	        johnson_lindenstrauss_min_dim(n_samples, eps=eps)


	def test_input_size_jl_min_dim():
	    """Check how ``johnson_lindenstrauss_min_dim`` handles array-like shapes.

	    Inputs of different lengths (3 vs 2) must raise ``ValueError``, while two
	    arrays of the same 2D shape must be accepted without error.
	    """
	    with pytest.raises(ValueError):
	        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])

	    # Use a locally seeded generator rather than the global ``np.random``
	    # state so that the input is reproducible and the test does not disturb
	    # the random stream seen by other tests.
	    rng = np.random.RandomState(0)
	    johnson_lindenstrauss_min_dim(
	        rng.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
	    )


	###############################################################################
	# tests random matrix generation
	###############################################################################
	def check_input_size_random_matrix(random_matrix):
	    """Assert ``random_matrix`` raises ``ValueError`` for each invalid shape."""
	    invalid_shapes = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)]
	    for n_rows, n_cols in invalid_shapes:
	        with pytest.raises(ValueError):
	            random_matrix(n_rows, n_cols)


	def check_size_generated(random_matrix):
	    """Assert the generated matrix has shape ``(n_components, n_features)``."""
	    for expected_shape in [(1, 5), (5, 1), (5, 5), (1, 1)]:
	        n_components, n_features = expected_shape
	        generated = random_matrix(n_components, n_features)
	        assert generated.shape == expected_shape


	def check_zero_mean_and_unit_norm(random_matrix):
	    """Check a generated column has roughly zero mean and unit norm.

	    A ``(10000, 1)`` matrix is generated with ``random_state=0`` and densified;
	    its mean is compared to 0 (3 decimals) and its norm to 1 (1 decimal).
	    """
	    column = densify(random_matrix(10000, 1, random_state=0))

	    assert_array_almost_equal(0, np.mean(column), 3)
	    assert_array_almost_equal(1.0, np.linalg.norm(column), 1)


	def check_input_with_sparse_random_matrix(random_matrix):
	    """Assert ``random_matrix`` raises ``ValueError`` for each invalid density."""
	    for invalid_density in (-1.0, 0.0, 1.1):
	        with pytest.raises(ValueError):
	            random_matrix(5, 10, density=invalid_density)


	@pytest.mark.parametrize("random_matrix", all_random_matrix)
	def test_basic_property_of_random_matrix(random_matrix):
	    """Run the shared generation checks on every random matrix generator."""
	    # Input validation first, then output shape, then mean/norm statistics.
	    check_input_size_random_matrix(random_matrix)
	    check_size_generated(random_matrix)
	    check_zero_mean_and_unit_norm(random_matrix)


	@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
	def test_basic_property_of_sparse_random_matrix(random_matrix):
	    """Check density validation, then mean/norm with ``density=1.0``."""
	    check_input_with_sparse_random_matrix(random_matrix)

	    # Re-run the mean/norm check with the density pinned to 1.0.
	    check_zero_mean_and_unit_norm(functools.partial(random_matrix, density=1.0))


	def test_gaussian_random_matrix():
	    """Check some statistical properties of the Gaussian random matrix.

	    Each element a_ij of the matrix is expected to be drawn from
	    N(0.0, 1 / n_components): the sample mean is compared to 0 and the sample
	    variance to ``1 / n_components``.
	    """
	    n_components, n_features = 100, 1000
	    matrix = _gaussian_random_matrix(n_components, n_features, random_state=0)

	    assert_array_almost_equal(0.0, np.mean(matrix), 2)
	    assert_array_almost_equal(np.var(matrix, ddof=1), 1 / n_components, 1)


	def test_sparse_random_matrix():
	    """Check some statistical properties of the sparse random matrix.

	    With ``s = 1 / density`` each element a_ij is expected to take the value

	    - -sqrt(s) / sqrt(n_components) with probability 1 / 2s
	    - 0 with probability 1 - 1 / s
	    - +sqrt(s) / sqrt(n_components) with probability 1 / 2s
	    """
	    n_components = 100
	    n_features = 500

	    for density in [0.3, 1.0]:
	        s = 1 / density
	        magnitude = np.sqrt(s) / np.sqrt(n_components)

	        matrix = densify(
	            _sparse_random_matrix(
	                n_components, n_features, density=density, random_state=0
	            )
	        )

	        # Only +/- magnitude may appear, plus 0 when the density is below 1.
	        values = np.unique(matrix)
	        assert magnitude in values
	        assert -magnitude in values

	        if density != 1.0:
	            assert 0.0 in values
	            assert np.size(values) == 3
	        else:
	            assert np.size(values) == 2

	        # (indicator mask, expected frequency, expected indicator variance)
	        checks = [
	            (matrix == 0.0, 1 - 1 / s, (1 - 1 / s) * 1 / s),
	            (matrix == magnitude, 1 / (2 * s), (1 - 1 / (2 * s)) * 1 / (2 * s)),
	            (matrix == -magnitude, 1 / (2 * s), (1 - 1 / (2 * s)) * 1 / (2 * s)),
	        ]

	        # All frequencies are checked before any variance.
	        for mask, expected_mean, _ in checks:
	            assert_almost_equal(np.mean(mask), expected_mean, decimal=2)

	        for mask, _, expected_var in checks:
	            assert_almost_equal(np.var(mask, ddof=1), expected_var, decimal=2)


	###############################################################################
	# tests on random projection transformer
	###############################################################################


	def test_random_projection_transformer_invalid_input():
	    """Fitting ``n_components="auto"`` on a single 3-feature row must fail."""
	    single_row = [[0, 1, 2]]
	    for projection_cls in all_RandomProjection:
	        with pytest.raises(ValueError):
	            projection_cls(n_components="auto").fit(single_row)


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_try_to_transform_before_fit(coo_container, global_random_seed):
	    """Calling ``transform`` on an unfitted projection raises ``NotFittedError``."""
	    X = make_sparse_random_data(
	        coo_container,
	        n_samples=n_samples,
	        n_features=n_features,
	        n_nonzeros=n_nonzeros,
	        random_state=global_random_seed,
	        sparse_format=None,
	    )
	    for projection_cls in all_RandomProjection:
	        with pytest.raises(NotFittedError):
	            projection_cls(n_components="auto").transform(X)


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_too_many_samples_to_find_a_safe_embedding(coo_container, global_random_seed):
	    """``fit`` must fail when the automatic target dimension exceeds n_features.

	    The data has 1000 samples and 100 features; with ``eps=0.1`` the error
	    message is expected to report a target dimension of 5920.
	    """
	    X = make_sparse_random_data(
	        coo_container,
	        n_samples=1000,
	        n_features=100,
	        n_nonzeros=1000,
	        random_state=global_random_seed,
	        sparse_format=None,
	    )

	    # The message does not depend on the estimator class, so build it once.
	    expected_msg = (
	        "eps=0.100000 and n_samples=1000 lead to a target dimension"
	        " of 5920 which is larger than the original space with"
	        " n_features=100"
	    )

	    for projection_cls in all_RandomProjection:
	        projector = projection_cls(n_components="auto", eps=0.1)
	        with pytest.raises(ValueError, match=expected_msg):
	            projector.fit(X)


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_random_projection_embedding_quality(coo_container):
	    """Check projected pairwise distances stay within ``1 +/- eps`` of the original.

	    The automatically tuned number of components (and density) must respect
	    the contract for ``eps``: squared pairwise distances are preserved
	    according to the Johnson-Lindenstrauss lemma.
	    """
	    eps = 0.2
	    X = make_sparse_random_data(
	        coo_container,
	        n_samples=8,
	        n_features=5000,
	        n_nonzeros=15000,
	        random_state=0,
	        sparse_format=None,
	    )

	    flat_original = euclidean_distances(X, squared=True).ravel()
	    # Zero distances are dropped to avoid dividing by zero in the ratio below.
	    non_identical = flat_original != 0.0
	    original_distances = flat_original[non_identical]

	    for projection_cls in all_RandomProjection:
	        projector = projection_cls(n_components="auto", eps=eps, random_state=0)
	        X_projected = projector.fit_transform(X)

	        flat_projected = euclidean_distances(X_projected, squared=True).ravel()
	        # Keep exactly the pairs retained for the original distances.
	        projected_distances = flat_projected[non_identical]

	        distances_ratio = projected_distances / original_distances

	        assert distances_ratio.max() < 1 + eps
	        assert 1 - eps < distances_ratio.min()


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_SparseRandomProj_output_representation(coo_container):
	    """Check how ``dense_output`` controls the container type of the output."""
	    # Same seed and sizes for both inputs; only the returned format differs.
	    dense_data, sparse_data = (
	        make_sparse_random_data(
	            coo_container,
	            n_samples,
	            n_features,
	            n_nonzeros,
	            random_state=0,
	            sparse_format=sparse_format,
	        )
	        for sparse_format in (None, "csr")
	    )

	    for projection_cls in all_SparseRandomProjection:
	        # With dense_output=True even sparse input is projected to a dense
	        # numpy array.
	        projector = projection_cls(n_components=10, dense_output=True, random_state=0)
	        projector.fit(dense_data)
	        assert isinstance(projector.transform(dense_data), np.ndarray)
	        assert isinstance(projector.transform(sparse_data), np.ndarray)

	        # With dense_output=False the output follows the input container.
	        projector = projection_cls(
	            n_components=10, dense_output=False, random_state=0
	        )
	        projector = projector.fit(dense_data)

	        # dense input stays dense:
	        assert isinstance(projector.transform(dense_data), np.ndarray)

	        # sparse input gives sparse output:
	        assert sp.issparse(projector.transform(sparse_data))


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_correct_RandomProjection_dimensions_embedding(
	    coo_container, global_random_seed
	):
	    """Check fitted dimensions, determinism and shape validation of projections."""
	    X = make_sparse_random_data(
	        coo_container,
	        n_samples,
	        n_features,
	        n_nonzeros,
	        random_state=global_random_seed,
	        sparse_format=None,
	    )
	    for projection_cls in all_RandomProjection:
	        is_sparse_projection = projection_cls in all_SparseRandomProjection

	        projector = projection_cls(n_components="auto", random_state=0, eps=0.5)
	        projector = projector.fit(X)

	        # The constructor parameter is left untouched; the value derived from
	        # the shape of the training set is stored in ``n_components_``.
	        assert projector.n_components == "auto"
	        assert projector.n_components_ == 110

	        if is_sparse_projection:
	            assert projector.density == "auto"
	            assert_almost_equal(projector.density_, 0.03, 2)

	        assert projector.components_.shape == (110, n_features)

	        first_projection = projector.transform(X)
	        assert first_projection.shape == (n_samples, 110)

	        # Once fitted, transforming the same data again gives the same result.
	        second_projection = projector.transform(X)
	        assert_array_equal(first_projection, second_projection)

	        # A fresh estimator with the same random seed gives the same result
	        # through fit_transform.
	        other_projector = projection_cls(random_state=0, eps=0.5)
	        third_projection = other_projector.fit_transform(X)
	        assert_array_equal(first_projection, third_projection)

	        # Transforming data with a different number of features must fail.
	        with pytest.raises(ValueError):
	            projector.transform(X[:, 1:5])

	        # It is also possible to fix the number of components and the density
	        # level explicitly.
	        if is_sparse_projection:
	            projector = projection_cls(
	                n_components=100, density=0.001, random_state=0
	            )
	            projected = projector.fit_transform(X)
	            assert projected.shape == (n_samples, 100)
	            assert projector.components_.shape == (100, n_features)
	            # 100 * n_features entries at density 0.001: the non-zero count
	            # must lie strictly between 85 and 115.
	            assert projector.components_.nnz < 115
	            assert 85 < projector.components_.nnz


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_warning_n_components_greater_than_n_features(
	    coo_container, global_random_seed
	):
	    """Asking for more components than features emits ``DataDimensionalityWarning``."""
	    # Small local problem size (deliberately not the module-level defaults).
	    n_rows, n_cols = 5, 20
	    X = make_sparse_random_data(
	        coo_container,
	        n_rows,
	        n_cols,
	        int(n_cols / 4),
	        random_state=global_random_seed,
	        sparse_format=None,
	    )

	    for projection_cls in all_RandomProjection:
	        with pytest.warns(DataDimensionalityWarning):
	            projection_cls(n_components=n_cols + 1).fit(X)


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	def test_works_with_sparse_data(coo_container, global_random_seed):
	    """Fitting on dense or CSR versions of the same data gives equal components."""
	    n_rows, n_cols = 5, 20
	    nnz = int(n_cols / 4)

	    # Same seed and sizes for both inputs; only the returned format differs.
	    dense_data, sparse_data = (
	        make_sparse_random_data(
	            coo_container,
	            n_rows,
	            n_cols,
	            nnz,
	            random_state=global_random_seed,
	            sparse_format=sparse_format,
	        )
	        for sparse_format in (None, "csr")
	    )

	    for projection_cls in all_RandomProjection:
	        fitted_on_dense = projection_cls(n_components=3, random_state=1).fit(
	            dense_data
	        )
	        fitted_on_sparse = projection_cls(n_components=3, random_state=1).fit(
	            sparse_data
	        )
	        assert_array_almost_equal(
	            densify(fitted_on_dense.components_),
	            densify(fitted_on_sparse.components_),
	        )


	def test_johnson_lindenstrauss_min_dim():
	    """Test Johnson-Lindenstrauss for small eps.

	    Regression test for #17111: before #19374, 32-bit systems would fail.
	    """
	    min_dim = johnson_lindenstrauss_min_dim(100, eps=1e-5)
	    assert min_dim == 368_416_070_986


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
	def test_random_projection_feature_names_out(
	    coo_container, random_projection_cls, global_random_seed
	):
	    """Output feature names are the lowercased class name plus a running index."""
	    X = make_sparse_random_data(
	        coo_container,
	        n_samples,
	        n_features,
	        n_nonzeros,
	        random_state=global_random_seed,
	        sparse_format=None,
	    )
	    projector = random_projection_cls(n_components=2)
	    projector.fit(X)
	    names_out = projector.get_feature_names_out()

	    class_name_lower = random_projection_cls.__name__.lower()
	    expected_names = [
	        f"{class_name_lower}{i}" for i in range(projector.n_components_)
	    ]

	    assert_array_equal(names_out, np.array(expected_names, dtype=object))


	@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
	@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
	@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
	@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
	@pytest.mark.parametrize("compute_inverse_components", [True, False])
	def test_inverse_transform(
	    coo_container,
	    n_samples,
	    n_features,
	    random_projection_cls,
	    compute_inverse_components,
	    global_random_seed,
	):
	    """Check ``inverse_transform`` output shape and round-trip consistency.

	    For dense and CSR input alike, projecting the back-projected data again
	    must reproduce the first projection up to a small tolerance.
	    """
	    n_components = 10

	    projector = random_projection_cls(
	        n_components=n_components,
	        compute_inverse_components=compute_inverse_components,
	        random_state=global_random_seed,
	    )

	    # Same seed and sizes for both inputs; only the returned format differs
	    # (dense first, then CSR).
	    inputs = [
	        make_sparse_random_data(
	            coo_container,
	            n_samples,
	            n_features,
	            n_nonzeros=n_samples * n_features // 100 + 1,
	            random_state=global_random_seed,
	            sparse_format=sparse_format,
	        )
	        for sparse_format in (None, "csr")
	    ]

	    for X in inputs:
	        # n_components is fixed at 10 while n_features goes as low as 2, so
	        # the dimensionality warning is silenced for the fit.
	        with warnings.catch_warnings():
	            warnings.filterwarnings(
	                "ignore",
	                message=(
	                    "The number of components is higher than the number of features"
	                ),
	                category=DataDimensionalityWarning,
	            )
	            projected = projector.fit_transform(X)

	        if compute_inverse_components:
	            assert hasattr(projector, "inverse_components_")
	            inverse_shape = projector.inverse_components_.shape
	            assert inverse_shape == (n_features, n_components)

	        restored = projector.inverse_transform(projected)
	        assert restored.shape == X.shape

	        reprojected = projector.transform(restored)
	        # Densify the first projection when it exposes ``toarray`` so the
	        # comparison below runs on plain arrays.
	        if hasattr(projected, "toarray"):
	            projected = projected.toarray()
	        assert_allclose(projected, reprojected, rtol=1e-7, atol=1e-10)


	@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
	@pytest.mark.parametrize(
	    "input_dtype, expected_dtype",
	    (
	        (np.float32, np.float32),
	        (np.float64, np.float64),
	        (np.int32, np.float64),
	        (np.int64, np.float64),
	    ),
	)
	def test_random_projection_dtype_match(
	    random_projection_cls, input_dtype, expected_dtype
	):
	    """Verify the dtype of both the components and the transformed output."""
	    X = np.random.RandomState(42).rand(25, 3000).astype(input_dtype)

	    projector = random_projection_cls(random_state=0)
	    X_projected = projector.fit_transform(X)

	    assert projector.components_.dtype == expected_dtype
	    assert X_projected.dtype == expected_dtype


	@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
	def test_random_projection_numerical_consistency(random_projection_cls):
	    """Verify numerical consistency between np.float32 and np.float64 inputs."""
	    atol = 1e-5
	    X = np.random.RandomState(42).rand(25, 3000)

	    # Two identically seeded estimators, one per precision.
	    projector_32 = random_projection_cls(random_state=0)
	    projector_64 = random_projection_cls(random_state=0)

	    projected_32 = projector_32.fit_transform(X.astype(np.float32))
	    projected_64 = projector_64.fit_transform(X.astype(np.float64))

	    assert_allclose(projected_64, projected_32, atol=atol)

	    assert_allclose_dense_sparse(projector_32.components_, projector_64.components_)