Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified 12 months ago

5.23 kB

	import numpy as np
	import pytest
	from scipy import stats

	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import LinearSVC
	from sklearn.svm._bounds import l1_min_c
	from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap
	from sklearn.utils.fixes import CSR_CONTAINERS

	# Toy design matrix shared by the l1_min_c tests: four samples, two features.
	dense_X = [
	    [-1, 0],
	    [0, 1],
	    [1, 1],
	    [1, 1],
	]

	# Target vectors for dense_X: Y1 holds two distinct labels, Y2 holds three.
	Y1 = [0, 1, 1, 1]
	Y2 = [2, 1, 0, 0]


	@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array])
	@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
	@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
	@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
	def test_l1_min_c(X_container, loss, Y_label, intercept_label):
	    """Run `check_l1_min_c` for every container/loss/target/intercept combination.

	    The string labels received from the parametrization are translated into
	    the actual target vector and intercept keyword arguments before the
	    shared checker is invoked.
	    """
	    targets_by_label = {"two-classes": Y1, "multi-class": Y2}
	    intercept_kwargs_by_label = {
	        "no-intercept": dict(fit_intercept=False),
	        "fit-intercept": dict(fit_intercept=True, intercept_scaling=10),
	    }

	    check_l1_min_c(
	        X_container(dense_X),
	        targets_by_label[Y_label],
	        loss,
	        **intercept_kwargs_by_label[intercept_label],
	    )


	def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0):
	    """Assert that `l1_min_c` marks the point where an L1 model stops being empty.

	    The bound is computed with `l1_min_c`, then an L1-penalized liblinear
	    model matching `loss` is fitted twice:

	    * with ``C`` equal to the bound, every coefficient and intercept must
	      be exactly zero;
	    * with ``C`` 1% above the bound, at least one coefficient or intercept
	      must be non-zero.

	    Parameters
	    ----------
	    X : data passed unchanged to `l1_min_c` and to the estimator's `fit`.
	    y : targets passed unchanged to `l1_min_c` and to the estimator's `fit`.
	    loss : {"log", "squared_hinge"}
	        Selects `LogisticRegression` or `LinearSVC`; any other value raises
	        ``KeyError`` at the estimator lookup if `l1_min_c` did not already
	        reject it.
	    fit_intercept : forwarded to `l1_min_c` and set on the estimator.
	    intercept_scaling : forwarded to `l1_min_c` and set on the estimator.
	    """
	    min_c = l1_min_c(
	        X,
	        y,
	        loss=loss,
	        fit_intercept=fit_intercept,
	        intercept_scaling=intercept_scaling,
	    )

	    l1_estimators = {
	        "log": LogisticRegression(penalty="l1", solver="liblinear"),
	        "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False),
	    }
	    clf = l1_estimators[loss]

	    # The estimator must use the same intercept settings as the bound.
	    clf.fit_intercept = fit_intercept
	    clf.intercept_scaling = intercept_scaling

	    # Exactly at the bound the fitted model is expected to be all zeros.
	    clf.C = min_c
	    clf.fit(X, y)
	    assert np.all(np.asarray(clf.coef_) == 0)
	    assert np.all(np.asarray(clf.intercept_) == 0)

	    # Slightly above the bound something has to become non-zero.
	    clf.C = min_c * 1.01
	    clf.fit(X, y)
	    assert np.any(np.asarray(clf.coef_) != 0) or np.any(
	        np.asarray(clf.intercept_) != 0
	    )


	def test_ill_posed_min_c():
	    """Check that `l1_min_c` raises ``ValueError`` on an all-zero design matrix."""
	    all_zero_X = [[0, 0], [0, 0]]
	    labels = [0, 1]
	    with pytest.raises(ValueError):
	        l1_min_c(all_zero_X, labels)


	# Largest value representable by an unsigned 32-bit integer (4294967295).
	_MAX_UNSIGNED_INT = 2**32 - 1


	def test_newrand_default():
	    """Check that unseeded `bounded_rand_int_wrap` draws stay in range and vary.

	    Only invariants of the RNG are checked, never specific values, so this
	    test should pass whether it runs alone or together with other tests
	    that explicitly seed the generator, in any order.
	    """
	    upper_bound = 100
	    draws = [bounded_rand_int_wrap(upper_bound) for _ in range(10)]
	    assert all(0 <= value < upper_bound for value in draws)
	    # At least one draw has to differ from the first one.
	    assert any(value != draws[0] for value in draws)


	@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)])
	def test_newrand_set_seed(seed, expected):
	    """Check that the first draw after `set_seed_wrap(seed)` is the expected value."""
	    set_seed_wrap(seed)
	    assert bounded_rand_int_wrap(100) == expected


	@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
	def test_newrand_set_seed_overflow(seed):
	    """Check that seeds just outside the unsigned 32-bit range are rejected.

	    Both a negative seed and one past `_MAX_UNSIGNED_INT` must make
	    `set_seed_wrap` raise ``OverflowError``.
	    """
	    with pytest.raises(OverflowError):
	        set_seed_wrap(seed)


	@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
	def test_newrand_bounded_rand_int(range_, n_pts):
	    """Check that `bounded_rand_int_wrap` draws are consistent with uniformity.

	    After seeding the RNG, 100 samples of `n_pts` draws are each compared
	    with ``stats.uniform(loc=0, scale=range_)`` through a Kolmogorov-Smirnov
	    test. The collected p-values are then checked in two ways: they must
	    themselves look uniform on [0, 1], and their 10th percentile must be
	    above 0.05.
	    """
	    # XXX: this test is very seed sensitive: either it is wrong (too strict?)
	    # or the wrapped RNG is not uniform enough, at least on some platforms.
	    set_seed_wrap(42)
	    n_rounds = 100
	    reference_cdf = stats.uniform(loc=0, scale=range_).cdf

	    def ks_pvalue_of_next_sample():
	        # Deterministic sampling: the RNG was seeded above.
	        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
	        return stats.kstest(sample, reference_cdf).pvalue

	    # Many rounds make the chance of a single outlier sample negligible.
	    ks_pvals = [ks_pvalue_of_next_sample() for _ in range(n_rounds)]

	    # Null hypothesis: the samples come from a uniform distribution.
	    # Under that hypothesis the p-values are themselves uniformly
	    # distributed instead of being concentrated on low values (this may
	    # seem counter-intuitive but is backed by multiple references).
	    # This allows two checks.

	    # (1) The p-values must be uniformly distributed on [0, 1].
	    meta_test = stats.kstest(ks_pvals, stats.uniform(loc=0, scale=1).cdf)
	    assert meta_test.pvalue > 0.05, (
	        "Null hypothesis rejected: generated random numbers are not uniform."
	        " Details: the (meta) p-value of the test of uniform distribution"
	        f" of p-values is {meta_test.pvalue} which is not > 0.05"
	    )

	    # (2) Safety belt: 90% of the p-values must be above 0.05. A 10th
	    # percentile <= 0.05 means the null hypothesis of uniformity is rejected.
	    min_10pct_pval = np.percentile(ks_pvals, q=10)
	    assert min_10pct_pval > 0.05, (
	        "Null hypothesis rejected: generated random numbers are not uniform. "
	        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
	    )


	@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
	def test_newrand_bounded_rand_int_limits(range_):
	    """Check that bounds just outside the unsigned 32-bit range are rejected.

	    Both a negative bound and one past `_MAX_UNSIGNED_INT` must make
	    `bounded_rand_int_wrap` raise ``OverflowError``.
	    """
	    with pytest.raises(OverflowError):
	        bounded_rand_int_wrap(range_)