spam-classifier
/
venv
/lib
/python3.11
/site-packages
/sklearn
/preprocessing
/tests
/test_polynomial.py
| import sys | |
| import numpy as np | |
| import pytest | |
| from numpy.testing import assert_allclose, assert_array_equal | |
| from scipy import sparse | |
| from scipy.interpolate import BSpline | |
| from scipy.sparse import random as sparse_random | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import ( | |
| KBinsDiscretizer, | |
| PolynomialFeatures, | |
| SplineTransformer, | |
| ) | |
| from sklearn.preprocessing._csr_polynomial_expansion import ( | |
| _calc_expanded_nnz, | |
| _calc_total_nnz, | |
| _get_sizeof_LARGEST_INT_t, | |
| ) | |
| from sklearn.utils._testing import assert_array_almost_equal | |
| from sklearn.utils.fixes import ( | |
| CSC_CONTAINERS, | |
| CSR_CONTAINERS, | |
| parse_version, | |
| sp_version, | |
| ) | |
def test_polynomial_and_spline_array_order(est):
    """Check that the transformed array follows the requested memory order.

    Parameters
    ----------
    est : callable
        Estimator factory; called with no arguments, with ``order="C"`` and
        with ``order="F"``. The result must expose ``fit_transform``.
    """
    X = np.arange(10).reshape(5, 2)

    def is_c_contiguous(a):
        # C-ordering is detected as Fortran-ordering of the transpose.
        return np.isfortran(a.T)

    default_output = est().fit_transform(X)
    assert is_c_contiguous(default_output)

    c_output = est(order="C").fit_transform(X)
    assert is_c_contiguous(c_output)

    f_output = est(order="F").fit_transform(X)
    assert np.isfortran(f_output)
def test_spline_transformer_input_validation(params, err_msg):
    """Check that SplineTransformer rejects invalid constructor parameters.

    Fitting a transformer built from ``params`` on a tiny two-sample input
    must raise a ``ValueError`` whose message matches ``err_msg``.
    """
    with pytest.raises(ValueError, match=err_msg):
        SplineTransformer(**params).fit([[1], [2]])
def test_spline_transformer_integer_knots(extrapolation):
    """Check that integer-valued knot positions are accepted.

    The test only verifies that ``fit_transform`` completes without raising;
    the transformed values themselves are not inspected.
    """
    integer_knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
    X = np.arange(20).reshape(10, 2)
    transformer = SplineTransformer(
        degree=3, knots=integer_knots, extrapolation=extrapolation
    )
    transformer.fit_transform(X)
def test_spline_transformer_feature_names():
    """Check the output feature names produced by SplineTransformer.

    Names follow the ``<input name>_sp_<index>`` pattern. With
    ``n_knots=3, degree=3`` the test expects five names per input feature
    when the bias is included and four when it is not.
    """
    X = np.arange(20).reshape(10, 2)

    # Default input names ("x0", "x1"), bias included.
    with_bias = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
    expected_default = [
        f"x{feature}_sp_{spline}" for feature in range(2) for spline in range(5)
    ]
    assert_array_equal(with_bias.get_feature_names_out(), expected_default)

    # User-provided input names, bias excluded.
    without_bias = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
    expected_custom = [
        f"{feature}_sp_{spline}" for feature in ("a", "b") for spline in range(4)
    ]
    assert_array_equal(
        without_bias.get_feature_names_out(["a", "b"]), expected_custom
    )
def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree):
    """Check feature-name count for each extrapolation mode and degree.

    The number of generated names must equal both ``n_features_out_`` and the
    number of columns returned by ``transform``.

    Non-regression test for gh-25292.
    """
    X = np.arange(20).reshape(10, 2)
    transformer = SplineTransformer(degree=degree, extrapolation=extrapolation)
    transformer.fit(X)

    names = transformer.get_feature_names_out(["a", "b"])
    n_names = len(names)
    assert n_names == transformer.n_features_out_

    n_columns = transformer.transform(X).shape[1]
    assert n_columns == n_names
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
    """Check that the B-spline basis is a decomposition of unity.

    Inside the fitted boundaries, the basis functions of each row must sum
    to 1, both on the training points and on unseen points between them.
    """
    grid = np.linspace(0, 1, 100)[:, None]
    # Explicitly add the boundaries 0 and 1 to the training data.
    X_train = np.r_[[[0]], grid[::2, :], [[1]]]
    X_test = grid[1::2, :]

    if extrapolation == "periodic":
        # periodic splines require degree < n_knots
        n_knots = n_knots + degree

    transformer = SplineTransformer(
        n_knots=n_knots,
        degree=degree,
        knots=knots,
        include_bias=True,
        extrapolation=extrapolation,
    )
    transformer.fit(X_train)

    for data in (X_train, X_test):
        row_sums = np.sum(transformer.transform(data), axis=1)
        assert_allclose(row_sums, 1)
def test_spline_transformer_linear_regression(bias, intercept):
    """Check that a spline + OLS pipeline fits a sinusoidal curve closely.

    Parameters
    ----------
    bias : bool
        Forwarded as ``include_bias`` to the SplineTransformer.
    intercept : bool
        Forwarded as ``fit_intercept`` to the LinearRegression.
    """
    X = np.linspace(0, 10, 100)[:, None]
    # Shift by 2 so that the target never hits 0 in assert_allclose.
    y = np.sin(X[:, 0]) + 2

    spline_step = SplineTransformer(
        n_knots=15,
        degree=3,
        include_bias=bias,
        extrapolation="constant",
    )
    ols_step = LinearRegression(fit_intercept=intercept)
    model = Pipeline(steps=[("spline", spline_step), ("ols", ols_step)])

    model.fit(X, y)
    assert_allclose(model.predict(X), y, rtol=1e-3)
def test_spline_transformer_get_base_knot_positions(
    knots, n_knots, sample_weight, expected_knots
):
    """Check the base knot positions computed with or without sample weights.

    Calls ``SplineTransformer._get_base_knot_positions`` on a fixed
    seven-sample, two-feature array and compares against ``expected_knots``.
    """
    X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
    computed_knots = SplineTransformer._get_base_knot_positions(
        X=X,
        knots=knots,
        n_knots=n_knots,
        sample_weight=sample_weight,
    )
    assert_allclose(computed_knots, expected_knots)
def test_spline_transformer_periodic_linear_regression(bias, intercept):
    """Check that periodic B-splines fit a periodic curve closely.

    The pipeline is fitted on [0, 1] and evaluated on the wider range
    [-1, 2] so that the periodic extrapolation is exercised as well.
    """

    def target(x):
        # "+ 3" keeps the values away from 0 for assert_allclose.
        return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3

    spline_step = SplineTransformer(
        n_knots=20,
        degree=3,
        include_bias=bias,
        extrapolation="periodic",
    )
    ols_step = LinearRegression(fit_intercept=intercept)
    model = Pipeline(steps=[("spline", spline_step), ("ols", ols_step)])

    X_fit = np.linspace(0, 1, 101)[:, None]
    model.fit(X_fit, target(X_fit[:, 0]))

    # Predict on a larger array to check the periodic extrapolation.
    X_wide = np.linspace(-1, 2, 301)[:, None]
    predictions = model.predict(X_wide)

    assert_allclose(predictions, target(X_wide[:, 0]), atol=0.01, rtol=0.01)
    # The first two blocks of 100 predictions must coincide.
    assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)
def test_spline_transformer_periodic_spline_backport():
    """Check the periodic extrapolation against scipy's periodic BSpline.

    The SplineTransformer output with ``extrapolation="periodic"`` must match
    a ``BSpline`` evaluated with ``"periodic"`` extrapolation.
    """
    degree = 2
    X = np.linspace(-2, 3.5, 10)[:, None]

    # Periodic extrapolation as implemented in SplineTransformer.
    transformer = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[-1.0], [0.0], [1.0]],
    )
    from_transformer = transformer.fit_transform(X)

    # Periodic extrapolation as implemented in BSpline.
    coefficients = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
    reference_spline = BSpline(np.arange(-3, 4), coefficients, degree, "periodic")
    from_bspline = reference_spline(X[:, 0])

    assert_allclose(from_transformer, from_bspline)
def test_spline_transformer_periodic_splines_periodicity():
    """Check that shifting the knots only permutes the periodic basis.

    Two periodic transformers whose knot sets are shifted relative to each
    other must give the same columns, up to the permutation applied below.
    """
    X = np.linspace(0, 10, 101)[:, None]

    base_knots = [[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]]
    shifted_knots = [[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]]

    base = SplineTransformer(degree=3, extrapolation="periodic", knots=base_knots)
    shifted = SplineTransformer(
        degree=3, extrapolation="periodic", knots=shifted_knots
    )

    Xt_base = base.fit_transform(X)
    Xt_shifted = shifted.fit_transform(X)

    column_permutation = [4, 0, 1, 2, 3]
    assert_allclose(Xt_base, Xt_shifted[:, column_permutation])
def test_spline_transformer_periodic_splines_smoothness(degree):
    """Check that the periodic spline basis is smooth at first / last knot.

    A spline of degree ``degree`` is expected to be ``degree - 1`` times
    continuously differentiable. The d-th derivative is considered
    continuous when the finite differences of its numerical approximation
    stay below ``tol`` in absolute value, so the loop checks derivatives of
    order 0 (the function itself) up to ``degree - 1``.
    """
    X = np.linspace(-2, 10, 10_000)[:, None]
    transformer = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
    )
    derivative = transformer.fit_transform(X)

    step = (X.max() - X.min()) / len(X)
    tol = 10 * step

    for _ in range(degree):
        # Continuity check of the current derivative order.
        jumps = np.diff(derivative, axis=0)
        assert np.abs(jumps).max() < tol
        # Move on to the next numerical derivative.
        derivative = jumps / step

    # Splines of degree `degree` are not `degree` times continuously
    # differentiable at the knots, so the next numerical derivative should
    # show spikes there.
    jumps = np.diff(derivative, axis=0)
    assert np.abs(jumps).max() > 1
def test_spline_transformer_extrapolation(bias, intercept, degree):
    """Check the "constant", "linear" and "error" extrapolation modes.

    A straight line ``y = x`` on [-1, 1] is fitted; predictions far outside
    that range must be clipped to the boundary values for "constant",
    follow the line for "linear", and raise a ``ValueError`` for "error".
    """
    X = np.linspace(-1, 1, 100)[:, None]
    y = X.squeeze()

    def fitted_pipeline(extrapolation):
        # Build and fit a spline + OLS pipeline for one extrapolation mode.
        pipe = Pipeline(
            [
                [
                    "spline",
                    SplineTransformer(
                        n_knots=4,
                        degree=degree,
                        include_bias=bias,
                        extrapolation=extrapolation,
                    ),
                ],
                ["ols", LinearRegression(fit_intercept=intercept)],
            ]
        )
        pipe.fit(X, y)
        return pipe

    outside_points = [[-10], [5]]

    # 'constant'
    assert_allclose(fitted_pipeline("constant").predict(outside_points), [-1, 1])

    # 'linear'
    assert_allclose(fitted_pipeline("linear").predict(outside_points), [-10, 5])

    # 'error'
    splt = SplineTransformer(
        n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
    )
    splt.fit(X)
    msg = "X contains values beyond the limits of the knots"
    for out_of_range in ([[-10]], [[5]]):
        with pytest.raises(ValueError, match=msg):
            splt.transform(out_of_range)
def test_spline_transformer_kbindiscretizer():
    """Check that degree-0 B-splines match a quantile KBinsDiscretizer.

    With quantile knots and ``n_knots = n_bins + 1``, the spline basis must
    equal the dense one-hot encoding produced by KBinsDiscretizer.
    """
    n_bins = 5
    rng = np.random.RandomState(97531)
    X = rng.randn(200).reshape(200, 1)

    spline_encoder = SplineTransformer(
        n_knots=n_bins + 1, degree=0, knots="quantile", include_bias=True
    )
    spline_encoded = spline_encoder.fit_transform(X)

    bin_encoder = KBinsDiscretizer(
        n_bins=n_bins, encode="onehot-dense", strategy="quantile"
    )
    bin_encoded = bin_encoder.fit_transform(X)

    # The results should be exactly equal; compare approximately with a very
    # tight tolerance.
    assert_allclose(spline_encoded, bin_encoded, rtol=1e-13)
def test_spline_transformer_sparse_output(
    degree, knots, extrapolation, include_bias, global_random_seed
):
    """Check that sparse and dense SplineTransformer outputs agree.

    The comparison is made on the training data and on points outside the
    training range. For ``extrapolation="error"`` both variants must raise a
    ``ValueError`` on the out-of-range points, each with its own message.
    """
    rng = np.random.RandomState(global_random_seed)
    X = rng.randn(200).reshape(40, 5)

    shared_params = dict(
        degree=degree,
        knots=knots,
        extrapolation=extrapolation,
        include_bias=include_bias,
    )
    splt_dense = SplineTransformer(sparse_output=False, **shared_params)
    splt_sparse = SplineTransformer(sparse_output=True, **shared_params)

    splt_dense.fit(X)
    splt_sparse.fit(X)

    X_trans_sparse = splt_sparse.transform(X)
    X_trans_dense = splt_dense.transform(X)

    assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr"
    assert_allclose(X_trans_dense, X_trans_sparse.toarray())

    # Extrapolation regime: points below the minimum and above the maximum.
    lower = np.amin(X, axis=0)
    upper = np.amax(X, axis=0)
    X_extra = np.r_[
        np.linspace(lower - 5, lower, 10), np.linspace(upper, upper + 5, 10)
    ]

    if extrapolation != "error":
        assert_allclose(
            splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray()
        )
        return

    msg = "X contains values beyond the limits of the knots"
    with pytest.raises(ValueError, match=msg):
        splt_dense.transform(X_extra)

    msg = "Out of bounds"
    with pytest.raises(ValueError, match=msg):
        splt_sparse.transform(X_extra)
def test_spline_transformer_sparse_output_raise_error_for_old_scipy():
    """Check that ``sparse_output=True`` raises an error mentioning scipy>=1.8.0.

    NOTE(review): judging by the name, this is only expected to hold on
    SciPy versions older than 1.8.0; any skip condition is not visible here.
    """
    with pytest.raises(ValueError, match="scipy>=1.8.0"):
        SplineTransformer(sparse_output=True).fit([[1], [2]])
def test_spline_transformer_n_features_out(
    n_knots, include_bias, degree, extrapolation, sparse_output
):
    """Check that ``transform`` returns ``n_features_out_`` columns.

    The sparse variant is skipped on SciPy versions older than 1.8.0.
    """
    scipy_too_old = sp_version < parse_version("1.8.0")
    if sparse_output and scipy_too_old:
        pytest.skip("The option `sparse_output` is available as of scipy 1.8.0")

    transformer = SplineTransformer(
        n_knots=n_knots,
        degree=degree,
        include_bias=include_bias,
        extrapolation=extrapolation,
        sparse_output=sparse_output,
    )
    X = np.linspace(0, 1, 10)[:, None]
    transformer.fit(X)

    n_columns = transformer.transform(X).shape[1]
    assert n_columns == transformer.n_features_out_
def test_polynomial_features_input_validation(params, err_msg):
    """Check that PolynomialFeatures rejects invalid constructor parameters.

    Fitting an estimator built from ``params`` on a tiny two-sample input
    must raise a ``ValueError`` whose message matches ``err_msg``.
    """
    with pytest.raises(ValueError, match=err_msg):
        PolynomialFeatures(**params).fit([[1], [2]])
def single_feature_degree3():
    """Return a single-feature input and its polynomial expansion up to degree 3.

    Returns
    -------
    X : ndarray of shape (6, 1)
        The column vector ``[0, 1, 2, 3, 4, 5]``.
    P : ndarray of shape (6, 4)
        Columns ``X**0, X**1, X**2, X**3``.
    """
    X = np.arange(6)[:, np.newaxis]
    P = np.hstack([X**power for power in range(4)])
    return X, P
def test_polynomial_features_one_feature(
    single_feature_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    X_container,
):
    """Check PolynomialFeatures on a single feature up to degree 3.

    ``single_feature_degree3`` provides the input and its full expansion;
    ``indices`` selects the expansion columns expected for the given
    configuration. When ``X_container`` is not None, the input is wrapped
    with it and the output is densified before the comparison.
    """
    X, P = single_feature_degree3
    wrap_input = X_container is not None
    if wrap_input:
        X = X_container(X)

    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    ).fit(X)
    transformed = tf.transform(X)
    if wrap_input:
        transformed = transformed.toarray()

    assert_allclose(transformed, P[:, indices])
    if tf.n_output_features_ > 0:
        expected_shape = (tf.n_output_features_, tf.n_features_in_)
        assert tf.powers_.shape == expected_shape
def two_features_degree3():
    """Return a two-feature input and its polynomial expansion up to degree 3.

    Returns
    -------
    X : ndarray of shape (3, 2)
        ``np.arange(6)`` reshaped to three samples of two features.
    P : ndarray of shape (3, 10)
        Monomials ``x1**a * x2**b``, ordered by total degree as listed in
        ``exponents`` below.
    """
    X = np.arange(6).reshape((3, 2))
    x1, x2 = X[:, :1], X[:, 1:]
    # (power of x1, power of x2); the position in the list is the column index.
    exponents = [
        (0, 0),  # 0
        (1, 0),  # 1
        (0, 1),  # 2
        (2, 0),  # 3
        (1, 1),  # 4
        (0, 2),  # 5
        (3, 0),  # 6
        (2, 1),  # 7
        (1, 2),  # 8
        (0, 3),  # 9
    ]
    P = np.hstack([x1**a * x2**b for a, b in exponents])
    return X, P
def test_polynomial_features_two_features(
    two_features_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    X_container,
):
    """Check PolynomialFeatures on two features up to degree 3.

    ``two_features_degree3`` provides the input and its full expansion;
    ``indices`` selects the expansion columns expected for the given
    configuration. When ``X_container`` is not None, the input is wrapped
    with it and the output is densified before the comparison.
    """
    X, P = two_features_degree3
    wrap_input = X_container is not None
    if wrap_input:
        X = X_container(X)

    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    ).fit(X)
    transformed = tf.transform(X)
    if wrap_input:
        transformed = transformed.toarray()

    assert_allclose(transformed, P[:, indices])
    if tf.n_output_features_ > 0:
        expected_shape = (tf.n_output_features_, tf.n_features_in_)
        assert tf.powers_.shape == expected_shape
def test_polynomial_feature_names():
    """Check the feature names generated by PolynomialFeatures.

    Covers default and user-provided input names, a ``(min, max)`` degree
    tuple, ``interaction_only=True`` and non-ASCII input names. Each case
    (except the unicode one) also checks that the number of names equals the
    number of transformed columns.
    """
    X = np.arange(30).reshape(10, 3)

    # Default input names, degree 2 with bias.
    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
    feature_names = poly.get_feature_names_out()
    assert_array_equal(
        ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Custom input names, degree 3 without bias.
    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(
        [
            "a",
            "b",
            "c",
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Degree given as (min_degree, max_degree): first-degree terms are absent.
    poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(
        [
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Only third-degree interaction terms, plus the bias column.
    poly = PolynomialFeatures(
        degree=(3, 3), include_bias=True, interaction_only=True
    ).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(["1", "a b c"], feature_names)
    assert len(feature_names) == poly.transform(X).shape[1]

    # test some unicode
    # "\U0001F40D" needs the 8-digit escape: "\u0001F40D" would be parsed as
    # U+0001 followed by the literal characters "F40D".
    unicode_names = ["\U0001F40D", "\u262e", "\u05d0"]
    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
    feature_names = poly.get_feature_names_out(unicode_names)
    assert_array_equal(["1", *unicode_names], feature_names)
def test_polynomial_features_csc_X(
    deg, include_bias, interaction_only, dtype, csc_container
):
    """Check PolynomialFeatures on CSC input against the dense result.

    The sparse output must stay in CSC format and share dtype and values
    with the output computed from the equivalent dense input.
    """
    rng = np.random.RandomState(0)
    X_dense = rng.randint(0, 2, (100, 2))
    X_sparse = csc_container(X_dense)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_sparse = est.fit_transform(X_sparse.astype(dtype))
    out_dense = est.fit_transform(X_dense.astype(dtype))

    assert sparse.issparse(out_sparse) and out_sparse.format == "csc"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
def test_polynomial_features_csr_X(
    deg, include_bias, interaction_only, dtype, csr_container
):
    """Check PolynomialFeatures on CSR input against the dense result.

    The sparse output must stay in CSR format and share dtype and values
    with the output computed from the equivalent dense input.
    """
    rng = np.random.RandomState(0)
    X_dense = rng.randint(0, 2, (100, 2))
    X_sparse = csr_container(X_dense)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_sparse = est.fit_transform(X_sparse.astype(dtype))
    out_dense = est.fit_transform(X_dense.astype(dtype, copy=False))

    assert sparse.issparse(out_sparse) and out_sparse.format == "csr"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
def test_num_combinations(
    n_features, min_degree, max_degree, interaction_only, include_bias, csr_container
):
    """Check that ``n_output_features_`` matches the enumerated combinations.

    The estimator is fitted on a one-row sparse matrix whose only entry sits
    in column ``n_features - 1``; the reported number of output features
    must equal the number of items yielded by
    ``PolynomialFeatures._combinations``.

    Note: ``min_degree`` is accepted but not used by this test; both the
    estimator and ``_combinations`` start from degree 0.
    """
    X = csr_container(([1], ([0], [n_features - 1])))
    est = PolynomialFeatures(
        degree=max_degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    est.fit(X)

    enumerated = PolynomialFeatures._combinations(
        n_features=n_features,
        min_degree=0,
        max_degree=max_degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    n_enumerated = sum(1 for _ in enumerated)
    assert est.n_output_features_ == n_enumerated
def test_polynomial_features_csr_X_floats(
    deg, include_bias, interaction_only, dtype, csr_container
):
    """Check PolynomialFeatures on random float CSR input against dense.

    Uses a 1000 x 10 random sparse matrix with density 0.5, cast to
    ``dtype`` for both the sparse and the dense run.
    """
    X_sparse = csr_container(sparse_random(1000, 10, 0.5, random_state=0))
    X_dense = X_sparse.toarray()

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_sparse = est.fit_transform(X_sparse.astype(dtype))
    out_dense = est.fit_transform(X_dense.astype(dtype))

    assert sparse.issparse(out_sparse) and out_sparse.format == "csr"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
def test_polynomial_features_csr_X_zero_row(
    zero_row_index, deg, interaction_only, csr_container
):
    """Check PolynomialFeatures on CSR input containing an all-zero row.

    The rows selected by ``zero_row_index`` of a fully dense 3 x 10 random
    matrix are set to zero before comparing sparse and dense results.
    """
    X_sparse = csr_container(sparse_random(3, 10, 1.0, random_state=0))
    X_sparse[zero_row_index, :] = 0.0
    X_dense = X_sparse.toarray()

    est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
    out_sparse = est.fit_transform(X_sparse)
    out_dense = est.fit_transform(X_dense)

    assert sparse.issparse(out_sparse) and out_sparse.format == "csr"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
| # This degree should always be one more than the highest degree supported by | |
| # _csr_expansion. | |
def test_polynomial_features_csr_X_degree_4(
    include_bias, interaction_only, csr_container
):
    """Check a degree-4 PolynomialFeatures on CSR input against dense.

    Uses a 1000 x 10 random sparse matrix with density 0.5.
    """
    X_sparse = csr_container(sparse_random(1000, 10, 0.5, random_state=0))
    X_dense = X_sparse.toarray()

    est = PolynomialFeatures(
        4, include_bias=include_bias, interaction_only=interaction_only
    )
    out_sparse = est.fit_transform(X_sparse)
    out_dense = est.fit_transform(X_dense)

    assert sparse.issparse(out_sparse) and out_sparse.format == "csr"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only, csr_container):
    """Check PolynomialFeatures on CSR input with ``dim`` features.

    Uses a 1000 x ``dim`` random sparse matrix with density 0.5 and compares
    the sparse output with the one obtained from the dense input.
    """
    X_sparse = csr_container(sparse_random(1000, dim, 0.5, random_state=0))
    X_dense = X_sparse.toarray()

    est = PolynomialFeatures(deg, interaction_only=interaction_only)
    out_sparse = est.fit_transform(X_sparse)
    out_dense = est.fit_transform(X_dense)

    assert sparse.issparse(out_sparse) and out_sparse.format == "csr"
    assert out_sparse.dtype == out_dense.dtype
    assert_array_almost_equal(out_sparse.toarray(), out_dense)
def test_csr_polynomial_expansion_index_overflow_non_regression(
    interaction_only, include_bias, csr_container
):
    """Check the automatic index dtype promotion to `np.int64` when needed.

    A sufficiently large input configuration must have its index and indptr
    arrays promoted to `np.int64` while the data stays intact.
    Non-regression test for gh-16803.

    Note that this is only possible for Python runtimes with a 64 bit address
    space. On 32 bit platforms, a `ValueError` is raised instead.
    """

    def degree_2_calc(d, i, j):
        # Column offset of the degree-2 term (i, j) among d features.
        if interaction_only:
            return d * i - (i**2 + 3 * i) // 2 - 1 + j
        return d * i - (i**2 + i) // 2 + j

    n_samples = 13
    n_features = 120001
    data_dtype = np.float32
    second_last_row, last_row = n_samples - 2, n_samples - 1
    second_last_col, last_col = n_features - 2, n_features - 1

    data = np.arange(1, 5, dtype=np.int64)
    row = np.array([second_last_row, second_last_row, last_row, last_row])
    # An int64 dtype is required to avoid overflow error on Windows within the
    # `degree_2_calc` function.
    col = np.array(
        [second_last_col, last_col, second_last_col, last_col], dtype=np.int64
    )
    X = csr_container(
        (data, (row, col)),
        shape=(n_samples, n_features),
        dtype=data_dtype,
    )
    pf = PolynomialFeatures(
        interaction_only=interaction_only, include_bias=include_bias, degree=2
    )

    # Calculate the number of combinations a-priori, and if needed check for
    # the correct ValueError and terminate the test early.
    num_combinations = pf._num_combinations(
        n_features=n_features,
        min_degree=0,
        max_degree=2,
        interaction_only=pf.interaction_only,
        include_bias=pf.include_bias,
    )
    if num_combinations > np.iinfo(np.intp).max:
        msg = (
            r"The output that would result from the current configuration would have"
            r" \d* features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit(X)
        return

    X_trans = pf.fit_transform(X)
    row_nonzero, col_nonzero = X_trans.nonzero()
    n_degree_1_features_out = n_features + include_bias
    max_degree_2_idx = (
        degree_2_calc(n_features, col[int(not interaction_only)], col[1])
        + n_degree_1_features_out
    )

    # Account for bias of all samples except last one which will be handled
    # separately since there are distinct data values before it
    n_bias_only_rows = n_samples - 2 if include_bias else 0
    data_target = [1] * n_bias_only_rows
    col_nonzero_target = [0] * n_bias_only_rows

    for start in (0, 2):
        x, y = data[start], data[start + 1]
        x_idx, y_idx = col[start], col[start + 1]
        if include_bias:
            data_target.append(1)
            col_nonzero_target.append(0)
        data_target.extend([x, y])
        col_nonzero_target.extend(
            [x_idx + int(include_bias), y_idx + int(include_bias)]
        )
        if interaction_only:
            data_target.append(x * y)
            col_nonzero_target.append(
                degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out
            )
        else:
            data_target.extend([x * x, x * y, y * y])
            col_nonzero_target.extend(
                [
                    degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out,
                    degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out,
                    degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out,
                ]
            )

    nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only)
    n_features_out = max_degree_2_idx + 1

    assert pf.n_output_features_ == n_features_out
    assert X_trans.dtype == data_dtype
    assert X_trans.shape == (n_samples, n_features_out)
    assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64
    # Ensure that dtype promotion was actually required:
    assert X_trans.indices.max() > np.iinfo(np.int32).max

    row_nonzero_target = list(range(n_bias_only_rows))
    row_nonzero_target.extend(
        [second_last_row] * nnz_per_row + [last_row] * nnz_per_row
    )

    assert_allclose(X_trans.data, data_target)
    assert_array_equal(row_nonzero, row_nonzero_target)
    assert_array_equal(col_nonzero, col_nonzero_target)
def test_csr_polynomial_expansion_index_overflow(
    degree, n_features, interaction_only, include_bias, csr_container
):
    """Tests known edge-cases to the dtype promotion strategy and custom
    Cython code, including a current bug in the upstream
    `scipy.sparse.hstack`.

    The input is a single-row matrix whose only non-zero entry sits in the
    last column. Depending on the configuration and the SciPy version the
    test expects either a specific ``ValueError`` or a correctly indexed
    output.
    """
    data = [1.0]

    # Use int32 indices as much as we can
    max_int32 = np.iinfo(np.int32).max
    indices_dtype = np.int32 if n_features - 1 <= max_int32 else np.int64
    row = np.array([0], dtype=indices_dtype)
    col = np.array([n_features - 1], dtype=indices_dtype)

    # Expected column of the non-zero term of degree 1, 2 and 3 respectively.
    first_degree_idx = n_features - 1 + int(include_bias)
    second_degree_idx = n_features * (n_features + 1) // 2 + first_degree_idx
    third_degree_idx = (
        n_features * (n_features + 1) * (n_features + 2) // 6 + second_degree_idx
    )
    expected_indices = [first_degree_idx, second_degree_idx, third_degree_idx]

    X = csr_container((data, (row, col)))
    pf = PolynomialFeatures(
        interaction_only=interaction_only, include_bias=include_bias, degree=degree
    )

    # Calculate the number of combinations a-priori, and if needed check for
    # the correct ValueError and terminate the test early.
    num_combinations = pf._num_combinations(
        n_features=n_features,
        min_degree=0,
        max_degree=degree,
        interaction_only=pf.interaction_only,
        include_bias=pf.include_bias,
    )
    if num_combinations > np.iinfo(np.intp).max:
        msg = (
            r"The output that would result from the current configuration would have"
            r" \d* features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit(X)
        return

    # In SciPy < 1.8, a bug occurs when an intermediate matrix in
    # `to_stack` in `hstack` fits within int32 however would require int64 when
    # combined with all previous matrices in `to_stack`.
    if sp_version < parse_version("1.8.0"):
        has_bug = False
        max_int32 = np.iinfo(np.int32).max
        cumulative_size = n_features + include_bias
        for deg in range(2, degree + 1):
            max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
            max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
            cumulative_size += max_indices + 1
            needs_int64 = max(max_indices, max_indptr) > max_int32
            if not needs_int64 and cumulative_size > max_int32:
                has_bug = True
        if has_bug:
            msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
            with pytest.raises(ValueError, match=msg):
                pf.fit_transform(X)
            return

    # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
    # dtype for representing indices and indptr if `n_features` is still
    # small enough so that each block matrix's indices and indptr arrays
    # can be represented with `np.int32`. We test `n_features==65535`
    # since it is guaranteed to run into this bug.
    if (
        sp_version < parse_version("1.9.2")
        and n_features == 65535
        and degree == 2
        and not interaction_only
    ):  # pragma: no cover
        msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
        with pytest.raises(ValueError, match=msg):
            pf.fit_transform(X)
        return

    X_trans = pf.fit_transform(X)

    if num_combinations > np.iinfo(np.int32).max:
        expected_dtype = np.int64
    else:
        expected_dtype = np.int32

    # Terms higher than first degree
    non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
    expected_nnz = int(include_bias) + non_bias_terms

    assert X_trans.dtype == X.dtype
    assert X_trans.shape == (1, pf.n_output_features_)
    assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
    assert X_trans.nnz == expected_nnz

    if include_bias:
        assert X_trans[0, 0] == pytest.approx(1.0)
    for expected_col in expected_indices[:non_bias_terms]:
        assert X_trans[0, expected_col] == pytest.approx(1.0)

    offset = interaction_only * n_features
    if degree == 3:
        offset *= 1 + n_features
    assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset
def test_csr_polynomial_expansion_too_large_to_index(
    interaction_only, include_bias, csr_container
):
    """Check the error raised when the output cannot be indexed.

    With ``n_features`` set to half of the int64 maximum and a pure
    degree-2 expansion, both ``fit`` and ``fit_transform`` must raise a
    ``ValueError`` stating that the output is too large to be indexed.
    """
    n_features = np.iinfo(np.int64).max // 2
    X = csr_container(([1.0], ([0], [n_features - 1])))
    pf = PolynomialFeatures(
        interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2)
    )
    msg = (
        r"The output that would result from the current configuration would have \d*"
        r" features which is too large to be indexed"
    )
    for method in (pf.fit, pf.fit_transform):
        with pytest.raises(ValueError, match=msg):
            method(X)
def test_polynomial_features_behaviour_on_zero_degree(sparse_container):
    """Check PolynomialFeatures when the requested degree is zero.

    Without bias, ``degree=0`` and ``degree=(0, 0)`` must both raise a
    ``ValueError`` because the output would be empty. With bias, the output
    is a single column of ones for dense and sparse input alike.
    """
    X = np.ones((10, 2))

    failing_cases = [
        (
            0,
            (
                "Setting degree to zero and include_bias to False would result in"
                " an empty output array."
            ),
        ),
        (
            (0, 0),
            (
                "Setting both min_degree and max_degree to zero and include_bias to"
                " False would result in an empty output array."
            ),
        ),
    ]
    for zero_degree, err_msg in failing_cases:
        poly = PolynomialFeatures(degree=zero_degree, include_bias=False)
        with pytest.raises(ValueError, match=err_msg):
            poly.fit_transform(X)

    expected = np.ones((X.shape[0], 1))
    for _X in [X, sparse_container(X)]:
        poly = PolynomialFeatures(degree=0, include_bias=True)
        output = poly.fit_transform(_X)
        # convert to dense array if needed
        if sparse.issparse(output):
            output = output.toarray()
        assert_array_equal(output, expected)
def test_sizeof_LARGEST_INT_t():
    """Check the size in bytes reported by ``_get_sizeof_LARGEST_INT_t``.

    8 bytes are expected on Windows and on 32-bit platforms other than
    emscripten; 16 bytes are expected everywhere else.
    """
    # On Windows, scikit-learn is typically compiled with MSVC that
    # does not support int128 arithmetic (at the time of writing):
    # https://stackoverflow.com/a/6761962/163740
    on_windows = sys.platform == "win32"
    on_32bit_non_emscripten = sys.maxsize <= 2**32 and sys.platform != "emscripten"
    expected_size = 8 if (on_windows or on_32bit_non_emscripten) else 16

    assert _get_sizeof_LARGEST_INT_t() == expected_size
def test_csr_polynomial_expansion_windows_fail(csr_container):
    """Check a degree-3 expansion whose indices only just fit in int64.

    On runtimes where ``sys.maxsize <= 2**32`` a ``ValueError`` about the
    output being too large to index is expected; otherwise the non-zero
    terms of degree 1, 2 and 3 must sit at the expected columns.
    """
    # Minimum needed to ensure integer overflow occurs while guaranteeing an
    # int64-indexable output.
    n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3)
    X = csr_container(([1.0], ([0], [n_features - 1])))

    # Expected column of the non-zero term of degree 1, 2 and 3 respectively.
    first_degree_idx = n_features - 1
    second_degree_idx = int(n_features * (n_features + 1) // 2 + first_degree_idx)
    third_degree_idx = int(
        n_features * (n_features + 1) * (n_features + 2) // 6 + second_degree_idx
    )
    expected_indices = [first_degree_idx, second_degree_idx, third_degree_idx]

    pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3)

    if sys.maxsize <= 2**32:
        msg = (
            r"The output that would result from the current configuration would"
            r" have \d*"
            r" features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit_transform(X)
        return

    X_trans = pf.fit_transform(X)
    for expected_col in expected_indices:
        assert X_trans[0, expected_col] == pytest.approx(1.0)