spam-classifier
/
venv
/lib
/python3.11
/site-packages
/sklearn
/preprocessing
/tests
/test_encoders.py
| import re | |
| import warnings | |
| import numpy as np | |
| import pytest | |
| from scipy import sparse | |
| from sklearn.exceptions import NotFittedError | |
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder | |
| from sklearn.utils._missing import is_scalar_nan | |
| from sklearn.utils._testing import ( | |
| _convert_container, | |
| assert_allclose, | |
| assert_array_equal, | |
| ) | |
| from sklearn.utils.fixes import CSR_CONTAINERS | |
def test_one_hot_encoder_sparse_dense():
    """Sparse and dense outputs of `OneHotEncoder` must hold the same values."""
    X = np.array([[3, 2, 1], [0, 1, 1]])
    expected = [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]]

    out_sparse = OneHotEncoder().fit_transform(X)
    out_dense = OneHotEncoder(sparse_output=False).fit_transform(X)

    # Same shape, different container kind.
    assert out_sparse.shape == (2, 5)
    assert out_dense.shape == (2, 5)
    assert sparse.issparse(out_sparse)
    assert not sparse.issparse(out_dense)

    # Same content once the sparse result is densified.
    assert_array_equal(out_sparse.toarray(), expected)
    assert_array_equal(out_sparse.toarray(), out_dense)
def test_one_hot_encoder_handle_unknown(handle_unknown):
    """Check how `OneHotEncoder.transform` treats a value unseen during fit.

    NOTE(review): `handle_unknown` is not defined in this block; presumably it
    is supplied by a parametrization outside this view -- confirm.
    """
    X_train = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X_unseen = np.array([[4, 1, 1]])

    # handle_unknown="error": the unknown value in the first column must raise.
    strict = OneHotEncoder(handle_unknown="error")
    strict.fit(X_train)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.transform(X_unseen)

    # With the supplied option, the unknown value is encoded as all zeros.
    lenient = OneHotEncoder(handle_unknown=handle_unknown)
    lenient.fit(X_train)
    X_given = X_unseen.copy()
    encoded = lenient.transform(X_given).toarray()
    assert_array_equal(encoded, np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]))

    # The array handed to transform must not have been modified in place.
    assert_allclose(X_unseen, X_given)
def test_one_hot_encoder_handle_unknown_strings(handle_unknown):
    """Unknown string categories are encoded as zeros for numpy string input.

    Non-regression test for issue #12470: the categories have a numpy string
    dtype and a known category string is longer than the unknown one.

    NOTE(review): `handle_unknown` is presumably supplied by a parametrization
    that is not visible in this block -- confirm.
    """
    X_train = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1))
    X_query = np.array(["55555", "22"]).reshape((-1, 1))

    encoder = OneHotEncoder(handle_unknown=handle_unknown)
    encoder.fit(X_train)

    X_given = X_query.copy()
    encoded = encoder.transform(X_given).toarray()
    expected = np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
    assert_array_equal(encoded, expected)

    # The array handed to transform must not have been modified in place.
    assert_array_equal(X_query, X_given)
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    """The encoded output must use the requested `dtype`, sparse or dense.

    NOTE(review): `input_dtype` and `output_dtype` are presumably supplied by
    a parametrization outside this view -- confirm.
    """
    column = np.asarray([[0, 1]], dtype=input_dtype).T
    expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    # Sparse output (default): densify before comparing.
    sparse_enc = OneHotEncoder(categories="auto", dtype=output_dtype)
    assert_array_equal(sparse_enc.fit_transform(column).toarray(), expected)
    assert_array_equal(sparse_enc.fit(column).transform(column).toarray(), expected)

    # Dense output.
    dense_enc = OneHotEncoder(
        categories="auto", dtype=output_dtype, sparse_output=False
    )
    assert_array_equal(dense_enc.fit_transform(column), expected)
    assert_array_equal(dense_enc.fit(column).transform(column), expected)
def test_one_hot_encoder_dtype_pandas(output_dtype):
    """Same as the dtype check, but with a mixed-type pandas DataFrame input.

    NOTE(review): `output_dtype` is presumably supplied by a parametrization
    outside this view -- confirm.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    # Sparse output (default): densify before comparing.
    sparse_enc = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(sparse_enc.fit_transform(frame).toarray(), expected)
    assert_array_equal(sparse_enc.fit(frame).transform(frame).toarray(), expected)

    # Dense output.
    dense_enc = OneHotEncoder(dtype=output_dtype, sparse_output=False)
    assert_array_equal(dense_enc.fit_transform(frame), expected)
    assert_array_equal(dense_enc.fit(frame).transform(frame), expected)
def test_one_hot_encoder_feature_names():
    """Check `get_feature_names_out` with default and user-provided names."""
    X = [
        ["Male", 1, "girl", 2, 3],
        ["Female", 41, "girl", 1, 10],
        ["Male", 51, "boy", 12, 3],
        ["Male", 91, "girl", 21, 30],
    ]
    # Expected category labels per input column, in output order.
    labels_per_column = [
        ["Female", "Male"],
        ["1", "41", "51", "91"],
        ["boy", "girl"],
        ["1", "2", "12", "21"],
        ["3", "10", "30"],
    ]

    def expected_names(prefixes):
        # One "<prefix>_<label>" entry per (column, category) pair.
        return [
            f"{prefix}_{label}"
            for prefix, labels in zip(prefixes, labels_per_column)
            for label in labels
        ]

    encoder = OneHotEncoder()
    encoder.fit(X)

    # Default input feature names are x0 ... x4.
    default_names = encoder.get_feature_names_out()
    assert_array_equal(
        expected_names(["x0", "x1", "x2", "x3", "x4"]), default_names
    )

    # User-provided input feature names are used as prefixes.
    custom_inputs = ["one", "two", "three", "four", "five"]
    custom_names = encoder.get_feature_names_out(custom_inputs)
    assert_array_equal(expected_names(custom_inputs), custom_names)

    # A wrong number of input feature names is rejected.
    with pytest.raises(ValueError, match="input_features should have length"):
        encoder.get_feature_names_out(["one", "two"])
def test_one_hot_encoder_feature_names_unicode():
    """Non-ASCII characters survive in both categories and input names."""
    column = np.array([["c❤t1", "dat2"]], dtype=object).T
    encoder = OneHotEncoder()
    encoder.fit(column)

    default_names = encoder.get_feature_names_out()
    assert_array_equal(["x0_c❤t1", "x0_dat2"], default_names)

    custom_names = encoder.get_feature_names_out(input_features=["n👍me"])
    assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], custom_names)
def test_one_hot_encoder_custom_feature_name_combiner():
    """Exercise `feature_name_combiner` when it is given as a callable."""

    def repr_combiner(feature, category):
        # repr() distinguishes the string "None" from the None object.
        return "_".join((feature, repr(category)))

    def non_string_combiner(feature, category):
        # Deliberately wrong: a combiner is expected to return a Python string.
        return 0

    column = np.array([["None", None]], dtype=object).T

    encoder = OneHotEncoder(feature_name_combiner=repr_combiner)
    encoder.fit(column)
    assert_array_equal(["x0_'None'", "x0_None"], encoder.get_feature_names_out())
    assert_array_equal(
        ["a_'None'", "a_None"],
        encoder.get_feature_names_out(input_features=["a"]),
    )

    encoder = OneHotEncoder(feature_name_combiner=non_string_combiner).fit(column)
    err_msg = (
        "When `feature_name_combiner` is a callable, it should return a Python string."
    )
    with pytest.raises(TypeError, match=err_msg):
        encoder.get_feature_names_out()
def test_one_hot_encoder_set_params():
    """`set_params(categories=...)` takes effect before and after fitting."""
    column = np.array([[1, 2]]).T
    encoder = OneHotEncoder()

    # Not fitted yet: the new categories are stored and used by the fit.
    four_cats = [[0, 1, 2, 3]]
    encoder.set_params(categories=four_cats)
    assert encoder.get_params()["categories"] == [[0, 1, 2, 3]]
    assert encoder.fit_transform(column).toarray().shape == (2, 4)

    # Already fitted: refitting picks up the updated categories.
    encoder.set_params(categories=[[0, 1, 2, 3, 4]])
    assert encoder.fit_transform(column).toarray().shape == (2, 5)
def check_categorical_onehot(X):
    """Encode `X` in sparse and dense mode, check they agree, return dense.

    The sparse result must be a scipy sparse matrix in CSR format whose
    densified values match the dense-mode result.
    """
    as_sparse = OneHotEncoder(categories="auto").fit_transform(X)
    as_dense = OneHotEncoder(categories="auto", sparse_output=False).fit_transform(X)

    assert_allclose(as_sparse.toarray(), as_dense)

    assert sparse.issparse(as_sparse)
    assert as_sparse.format == "csr"
    return as_sparse.toarray()
def test_one_hot_encoder(X):
    """Encode one, two and then all columns of `X` and check the result.

    NOTE(review): `X` is presumably supplied by a parametrization outside this
    view -- confirm. The expected arrays imply two rows and three columns.
    """
    X_arr = np.array(X)

    # First column only.
    encoded = check_categorical_onehot(X_arr[:, [0]])
    assert_allclose(encoded, [[0, 1], [1, 0]])

    # First two columns.
    encoded = check_categorical_onehot(X_arr[:, [0, 1]])
    assert_allclose(encoded, [[0, 1, 1, 0], [1, 0, 0, 1]])

    # All columns, passing `X` in its original container.
    encoded = OneHotEncoder(categories="auto").fit_transform(X)
    assert_allclose(encoded.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop):
    """Round-trip data through `transform` and `inverse_transform`.

    NOTE(review): `handle_unknown`, `sparse_` and `drop` are presumably
    supplied by a parametrization outside this view -- confirm.
    """
    mixed_rows = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
    numeric_rows = [[2, 55], [1, 55], [3, 55]]

    # Mixed string/int input comes back as an object array.
    enc = OneHotEncoder(sparse_output=sparse_, drop=drop)
    encoded = enc.fit_transform(mixed_rows)
    expected = np.array(mixed_rows, dtype=object)
    assert_array_equal(enc.inverse_transform(encoded), expected)

    # Purely numeric input.
    enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
    encoded = enc.fit_transform(numeric_rows)
    expected = np.array(numeric_rows)
    assert_array_equal(enc.inverse_transform(encoded), expected)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        enc = OneHotEncoder(
            sparse_output=sparse_,
            handle_unknown=handle_unknown,
            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
        )
        encoded = enc.fit_transform(mixed_rows)
        expected = np.array(mixed_rows, dtype=object)
        # 3 is not among the given categories of the second column
        expected[2, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

        # with an otherwise numerical output, still object if unknown
        enc = OneHotEncoder(
            sparse_output=sparse_,
            categories=[[1, 2], [54, 56]],
            handle_unknown=handle_unknown,
        )
        encoded = enc.fit_transform(numeric_rows)
        expected = np.array(numeric_rows, dtype=object)
        expected[2, 0] = None
        expected[:, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

    # incorrect shape raises
    wrong_shape = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(wrong_shape)
def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
    X, X_trans, sparse_
):
    """Check that `inverse_transform` raises an error for unknown samples when
    no feature is dropped and `handle_unknown="error"`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934

    NOTE(review): `X`, `X_trans` and `sparse_` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    encoder = OneHotEncoder(sparse_output=sparse_).fit(X)

    if sparse_:
        # emulate sparse data transform by a one-hot encoder sparse.
        X_trans = _convert_container(X_trans, "sparse")

    msg = (
        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
        r"handle_unknown='error' because they contain all zeros"
    )
    with pytest.raises(ValueError, match=msg):
        encoder.inverse_transform(X_trans)
def test_one_hot_encoder_inverse_if_binary():
    """`inverse_transform` recovers the input when `drop="if_binary"`."""
    samples = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
    encoded = encoder.fit_transform(samples)
    recovered = encoder.inverse_transform(encoded)
    assert_array_equal(recovered, samples)
def test_one_hot_encoder_drop_reset(drop, reset_drop):
    """Changing `drop` via `set_params` without refitting must be harmless.

    The fitted encoder has to keep transforming, inverting and naming features
    exactly as it did before the parameter was changed.

    NOTE(review): `drop` and `reset_drop` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    samples = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    encoder = OneHotEncoder(drop=drop, sparse_output=False)
    encoder.fit(samples)

    # Snapshot the fitted behaviour before touching the parameter.
    encoded_before = encoder.transform(samples)
    names_before = encoder.get_feature_names_out()

    encoder.set_params(drop=reset_drop)

    assert_array_equal(encoder.inverse_transform(encoded_before), samples)
    assert_allclose(encoder.transform(samples), encoded_before)
    assert_array_equal(encoder.get_feature_names_out(), names_before)
def test_X_is_not_1D(X, method):
    """Calling `method` of a `OneHotEncoder` on 1D input must raise.

    NOTE(review): `X` and `method` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    encoder = OneHotEncoder()
    bound_method = getattr(encoder, method)
    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
        bound_method(X)
def test_X_is_not_1D_pandas(method):
    """Calling `method` of a `OneHotEncoder` on a pandas Series must raise.

    NOTE(review): `method` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    pd = pytest.importorskip("pandas")
    X = pd.Series([6, 3, 4, 6])
    oh = OneHotEncoder()

    # `match` is interpreted as a regular expression and `type(X)` is
    # interpolated into the message, so escape it to match the text literally.
    msg = re.escape(f"Expected a 2-dimensional container but got {type(X)} instead.")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    """Learned `categories_` must not depend on the order of the samples.

    NOTE(review): `X`, `cat_exp` and `cat_dtype` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    for samples in (X, X[::-1]):
        encoder = OneHotEncoder(categories="auto")
        encoder.fit(samples)
        assert isinstance(encoder.categories_, list)

        for learned, expected in zip(encoder.categories_, cat_exp):
            learned_list = learned.tolist()
            if is_scalar_nan(expected[-1]):
                # A trailing nan is checked on its own, the rest by equality.
                assert is_scalar_nan(learned_list[-1])
                assert learned_list[:-1] == expected[:-1]
            else:
                assert learned_list == expected
            assert np.issubdtype(learned.dtype, cat_dtype)
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown):
    """Check `OneHotEncoder` with user-specified `categories`.

    NOTE(review): all five parameters are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    encoder = OneHotEncoder(categories=cats)
    encoded = encoder.fit_transform(X).toarray()
    assert_array_equal(encoded, np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]))

    given_cats = list(cats[0])
    assert list(encoder.categories[0]) == given_cats
    assert encoder.categories_[0].tolist() == given_cats
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert encoder.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    encoder = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        encoder.fit(X2)

    # With the supplied `handle_unknown`, the unknown sample becomes all zeros.
    encoder = OneHotEncoder(categories=cats, handle_unknown=handle_unknown)
    encoded = encoder.fit(X2).transform(X2).toarray()
    assert_array_equal(encoded, np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))
def test_one_hot_encoder_unsorted_categories():
    """Unsorted user categories are kept for strings, rejected for numbers."""
    # Object/string data: the user-given order is preserved.
    str_column = np.array([["a", "b"]], dtype=object).T
    encoder = OneHotEncoder(categories=[["b", "a", "c"]])
    expected = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
    assert_array_equal(encoder.fit(str_column).transform(str_column).toarray(), expected)
    assert_array_equal(encoder.fit_transform(str_column).toarray(), expected)

    learned = encoder.categories_[0]
    assert learned.tolist() == ["b", "a", "c"]
    assert np.issubdtype(learned.dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    num_column = np.array([[1, 2]]).T
    encoder = OneHotEncoder(categories=[[2, 1, 3]])
    with pytest.raises(ValueError, match="Unsorted categories are not supported"):
        encoder.fit_transform(num_column)
def test_encoder_nan_ending_specified_categories(Encoder):
    """User-specified categories with nan anywhere but last must be rejected.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088

    NOTE(review): `Encoder` is presumably an encoder class supplied by a
    parametrization outside this view -- confirm.
    """
    column = np.array([[0, 1]], dtype=object).T
    # nan sits in the middle of the category list, not at the end.
    encoder = Encoder(categories=[np.array([0, np.nan, 1])])

    with pytest.raises(ValueError, match="Nan should be the last element"):
        encoder.fit(column)
def test_one_hot_encoder_specified_categories_mixed_columns():
    """User-specified categories on one string column and one integer column."""
    samples = np.array([["a", "b"], [0, 2]], dtype=object).T
    encoder = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]])

    encoded = encoder.fit_transform(samples).toarray()
    expected = np.array(
        [[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]
    )
    assert_array_equal(encoded, expected)

    str_cats, int_cats = encoder.categories_[0], encoder.categories_[1]
    assert str_cats.tolist() == ["a", "b", "c"]
    assert np.issubdtype(str_cats.dtype, np.object_)
    assert int_cats.tolist() == [0, 1, 2]
    # integer categories but from object dtype data
    assert np.issubdtype(int_cats.dtype, np.object_)
def test_one_hot_encoder_pandas():
    """A mixed-type DataFrame is encoded consistently in sparse and dense mode."""
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    encoded = check_categorical_onehot(frame)
    assert_allclose(encoded, [[1, 0, 1, 0], [0, 1, 0, 1]])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
    """Output feature names must match `expected_names` for the given `drop`.

    NOTE(review): `drop` and `expected_names` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    samples = [["c", 2, "a"], ["b", 2, "b"]]
    encoder = OneHotEncoder(drop=drop)
    encoder.fit(samples)
    assert_array_equal(expected_names, encoder.get_feature_names_out())
def test_one_hot_encoder_drop_equals_if_binary():
    """Check `drop="if_binary"` on binary, multi-valued and constant columns."""
    # Canonical case: three-valued column kept whole, binary column reduced.
    samples = [[10, "yes"], [20, "no"], [30, "yes"]]
    encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
    encoded = encoder.fit_transform(samples)
    assert_array_equal(encoder.drop_idx_, np.array([None, 0]))
    assert_allclose(
        encoded,
        np.array([[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]]),
    )

    # with only one cat, the behaviour is equivalent to drop=None
    samples = [["true", "a"], ["false", "a"], ["false", "a"]]
    encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
    encoded = encoder.fit_transform(samples)
    assert_array_equal(encoder.drop_idx_, np.array([0, None]))
    assert_allclose(encoded, np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]]))
def test_ordinal_encoder(X):
    """`OrdinalEncoder` output for the default dtype and for `dtype="int64"`.

    NOTE(review): `X` is presumably supplied by a parametrization outside this
    view -- confirm.
    """
    codes = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64")

    # Default dtype: compared against the float64 version of the codes.
    default_enc = OrdinalEncoder()
    assert_array_equal(default_enc.fit_transform(X), codes.astype("float64"))

    # Explicit integer dtype.
    int_enc = OrdinalEncoder(dtype="int64")
    assert_array_equal(int_enc.fit_transform(X), codes)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check `OrdinalEncoder` with user-specified `categories`.

    NOTE(review): all four parameters are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    encoder = OrdinalEncoder(categories=cats)
    assert_array_equal(encoder.fit_transform(X), np.array([[0.0], [1.0]]))

    given_cats = list(cats[0])
    assert list(encoder.categories[0]) == given_cats
    assert encoder.categories_[0].tolist() == given_cats
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert encoder.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    encoder = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        encoder.fit(X2)
def test_ordinal_encoder_inverse():
    """`OrdinalEncoder.inverse_transform` round-trips and validates shape."""
    rows = [["abc", 2, 55], ["def", 1, 55]]
    encoder = OrdinalEncoder()
    codes = encoder.fit_transform(rows)
    assert_array_equal(encoder.inverse_transform(codes), np.array(rows, dtype=object))

    # incorrect shape raises
    too_wide = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        encoder.inverse_transform(too_wide)
def test_ordinal_encoder_handle_unknowns_string():
    """Unknown strings map to `unknown_value` and invert to None."""
    X_train = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
    X_query = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
    encoder.fit(X_train)

    # "xy" and "bla" were not seen during fit.
    codes = encoder.transform(X_query)
    assert_array_equal(codes, np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64"))

    recovered = encoder.inverse_transform(codes)
    expected_inv = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object)
    assert_array_equal(recovered, expected_inv)
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
    """Unknown numbers map to `unknown_value` and invert to None.

    NOTE(review): `dtype` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
    X_query = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
    encoder.fit(X_train)

    # 12 and 23 were not seen during fit.
    codes = encoder.transform(X_query)
    assert_array_equal(
        codes, np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
    )

    recovered = encoder.inverse_transform(codes)
    expected_inv = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
    assert_array_equal(recovered, expected_inv)
def test_ordinal_encoder_handle_unknowns_nan():
    """`unknown_value=np.nan` must be usable as the code for unknown values."""
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan
    )
    encoder.fit(np.array([[1], [2], [3]]))

    # 4 was not seen during fit and is therefore encoded as nan.
    codes = encoder.transform([[1], [2], [4]])
    assert_array_equal(codes, [[0], [1], [np.nan]])
def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
    """`unknown_value=np.nan` combined with a non-float `dtype` must raise."""
    X_train = np.array([[1], [2], [3]])
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int
    )

    with pytest.raises(ValueError, match="dtype parameter should be a float dtype"):
        encoder.fit(X_train)
def test_ordinal_encoder_raise_categories_shape():
    """A flat list of categories (not one list per feature) must be rejected."""
    column = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
    # Flat list rather than a list of per-feature lists.
    flat_cats = ["Low", "Medium", "High"]
    encoder = OrdinalEncoder(categories=flat_cats)

    with pytest.raises(ValueError, match="Shape mismatch: if categories is an array,"):
        encoder.fit(column)
def test_encoder_dtypes():
    """Learned categories keep the dtype of the data they were fitted on."""
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories="auto")
    expected = np.array(
        [[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64"
    )

    array_inputs = [
        np.array([[1, 2], [3, 4]], dtype="int64"),
        np.array([[1, 2], [3, 4]], dtype="float64"),
        np.array([["a", "b"], ["c", "d"]]),  # str dtype
        np.array([[b"a", b"b"], [b"c", b"d"]]),  # bytes dtype
        np.array([[1, "a"], [3, "b"]], dtype="object"),
    ]
    for X in array_inputs:
        enc.fit(X)
        assert all(enc.categories_[i].dtype == X.dtype for i in range(2))
        assert_array_equal(enc.transform(X).toarray(), expected)

    # Plain list of integers: categories get an integer dtype.
    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all(np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2))
    assert_array_equal(enc.transform(X).toarray(), expected)

    # Plain list mixing integers and strings: categories get object dtype.
    X = [[1, "a"], [3, "b"]]
    enc.fit(X)
    assert all(enc.categories_[i].dtype == "object" for i in range(2))
    assert_array_equal(enc.transform(X).toarray(), expected)
def test_encoder_dtypes_pandas():
    """Learned categories keep the per-column dtype of a pandas DataFrame."""
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip("pandas")

    enc = OneHotEncoder(categories="auto")
    exp = np.array(
        [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]],
        dtype="float64",
    )

    # Homogeneous int64 frame: every one of the three columns must stay int64.
    # Iterating over `categories_` (instead of a hard-coded `range(2)`) makes
    # sure the last column is verified as well.
    X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64")
    enc.fit(X)
    assert len(enc.categories_) == 3
    assert all(cats.dtype == "int64" for cats in enc.categories_)
    assert_array_equal(enc.transform(X).toarray(), exp)

    # Heterogeneous frame: each column keeps its own dtype.
    X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]})
    X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype]
    enc.fit(X)
    assert len(enc.categories_) == len(X_type)
    assert all(
        cats.dtype == col_dtype for cats, col_dtype in zip(enc.categories_, X_type)
    )
    assert_array_equal(enc.transform(X).toarray(), exp)
def test_one_hot_encoder_warning():
    """`fit_transform` on mixed-type input must not emit any warning."""
    encoder = OneHotEncoder()
    samples = [["Male", 1], ["Female", 3]]

    with warnings.catch_warnings():
        # Turn every warning into an exception so that it fails the test.
        warnings.simplefilter("error")
        encoder.fit_transform(samples)
def test_ohe_handle_unknown_warn(drop):
    """Check handle_unknown='warn' works correctly.

    NOTE(review): `drop` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop=drop,
        sparse_output=False,
        handle_unknown="warn",
        categories=[["b", "a"], [1, 2]],
    )
    encoder.fit(X_train)

    # "c" is not one of the categories given for the first column.
    X_query = [["c", 1]]
    warn_msg = (
        r"Found unknown categories in columns \[0\] during transform. "
        r"These unknown categories will be encoded as all zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        encoded = encoder.transform(X_query)

    assert_allclose(encoded, np.array([[0, 0]]))
def test_one_hot_encoder_drop_manual(missing_value):
    """Check `drop` given as an explicit list with one category per feature.

    NOTE(review): `missing_value` is presumably supplied by a parametrization
    outside this view -- confirm.
    """
    cats_to_drop = ["def", 12, 3, 56, missing_value]
    samples = [
        ["abc", 12, 2, 55, "a"],
        ["def", 12, 1, 55, "a"],
        ["def", 12, 3, 56, missing_value],
    ]

    encoder = OneHotEncoder(drop=cats_to_drop)
    encoded = encoder.fit_transform(samples).toarray()
    assert_array_equal(
        encoded, [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
    )
    # The parameter object itself is stored, not a copy.
    assert encoder.drop is cats_to_drop

    # Look up the dropped category of each feature via its drop index.
    dropped_cats = [
        feature_cats[idx]
        for feature_cats, idx in zip(encoder.categories_, encoder.drop_idx_)
    ]
    recovered = encoder.inverse_transform(encoded)
    original = np.array(samples, dtype=object)

    # last value is np.nan
    if is_scalar_nan(cats_to_drop[-1]):
        assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
        assert is_scalar_nan(dropped_cats[-1])
        assert is_scalar_nan(cats_to_drop[-1])
        # do not include the last column which includes missing values
        assert_array_equal(original[:, :-1], recovered[:, :-1])

        # check last column is the missing value
        assert_array_equal(original[-1, :-1], recovered[-1, :-1])
        assert is_scalar_nan(original[-1, -1])
        assert is_scalar_nan(recovered[-1, -1])
    else:
        assert_array_equal(dropped_cats, cats_to_drop)
        assert_array_equal(original, recovered)
def test_invalid_drop_length(drop):
    """A `drop` whose length differs from the number of features must raise.

    NOTE(review): `drop` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    samples = [["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]]
    encoder = OneHotEncoder(drop=drop)

    with pytest.raises(
        ValueError, match="`drop` should have length equal to the number"
    ):
        encoder.fit(samples)
def test_categories(density, drop):
    """`drop` must not change `categories_`; `drop_idx_` must point at `drop`.

    NOTE(review): `density` and `drop` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    samples = [["c", 1, "a"], ["a", 2, "b"]]

    reference = OneHotEncoder(sparse_output=density)
    with_drop = OneHotEncoder(sparse_output=density, drop=drop)
    reference.fit(samples)
    with_drop.fit(samples)

    assert_array_equal(reference.categories_, with_drop.categories_)

    if drop == "first":
        assert_array_equal(with_drop.drop_idx_, 0)
    else:
        # Each drop index must select the requested category of its feature.
        per_feature = zip(drop, with_drop.drop_idx_, with_drop.categories_)
        for wanted, idx, feature_cats in per_feature:
            assert feature_cats[int(idx)] == wanted

    assert isinstance(with_drop.drop_idx_, np.ndarray)
    assert with_drop.drop_idx_.dtype == object
def test_encoders_has_categorical_tags(Encoder):
    """The encoder's input tags must have `categorical` set.

    NOTE(review): `Encoder` is presumably an encoder class supplied by a
    parametrization outside this view -- confirm.
    """
    tags = Encoder().__sklearn_tags__()
    assert tags.input_tags.categorical
def test_ohe_infrequent_two_levels(kwargs, categories):
    """Check that the given parameters group 'a', 'c' and 'd' as infrequent.

    NOTE(review): `kwargs` and `categories` are presumably supplied by a
    parametrization outside this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        categories=categories,
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        **kwargs,
    ).fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "c", "d"]])

    # "e" was never seen; it is expected in the infrequent column as well.
    X_query = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_query)
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    # The infrequent group inverts to a single placeholder name.
    expected_inv = [["b"]] + [["infrequent_sklearn"]] * 4
    assert_array_equal(expected_inv, encoder.inverse_transform(encoded))

    assert_array_equal(
        ["x0_b", "x0_infrequent_sklearn"], encoder.get_feature_names_out()
    )
def test_ohe_infrequent_two_levels_drop_frequent(drop):
    """Test two levels and dropping the frequent category.

    NOTE(review): `drop` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=2,
        drop=drop,
    ).fit(X_train)

    # The dropped category must be "b".
    dropped = encoder.categories_[0][encoder.drop_idx_[0]]
    assert dropped == "b"

    # Only the infrequent column remains in the output.
    encoded = encoder.transform(np.array([["b"], ["c"]]))
    assert_allclose([[0], [1]], encoded)
    assert_array_equal(["x0_infrequent_sklearn"], encoder.get_feature_names_out())

    recovered = encoder.inverse_transform(encoded)
    assert_array_equal([["b"], ["infrequent_sklearn"]], recovered)
def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop):
    """With `max_categories=2`, dropping an infrequent category must raise.

    NOTE(review): `drop` is presumably a sequence supplied by a
    parametrization outside this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=2,
        drop=drop,
    )

    msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
    with pytest.raises(ValueError, match=msg):
        encoder.fit(X_train)
def test_ohe_infrequent_three_levels(kwargs):
    """Check that the given parameters group 'a' and 'd' as infrequent.

    NOTE(review): `kwargs` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
    ).fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # "e" was never seen; it is expected in the infrequent column as well.
    X_query = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_query)
    expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]])
    assert_allclose(expected, encoded)

    # The infrequent group inverts to a single placeholder name.
    placeholder = "infrequent_sklearn"
    expected_inv = [["b"], [placeholder], ["c"], [placeholder], [placeholder]]
    assert_array_equal(expected_inv, encoder.inverse_transform(encoded))

    assert_array_equal(
        ["x0_b", "x0_c", "x0_infrequent_sklearn"], encoder.get_feature_names_out()
    )
def test_ohe_infrequent_three_levels_drop_frequent(drop):
    """Test three levels and dropping the frequent category.

    NOTE(review): `drop` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=3,
        drop=drop,
    ).fit(X_train)

    X_query = np.array([["b"], ["c"], ["d"]])
    assert_allclose([[0, 0], [1, 0], [0, 1]], encoder.transform(X_query))

    # Check handle_unknown="ignore"
    encoder.set_params(handle_unknown="ignore").fit(X_train)
    with pytest.warns(UserWarning, match="Found unknown categories"):
        encoded = encoder.transform([["b"], ["e"]])

    assert_allclose([[0, 0], [0, 0]], encoded)
def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop):
    """With `max_categories=3`, dropping an infrequent category must raise.

    NOTE(review): `drop` is presumably a sequence supplied by a
    parametrization outside this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=3,
        drop=drop,
    )

    msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
    with pytest.raises(ValueError, match=msg):
        encoder.fit(X_train)
def test_ohe_infrequent_handle_unknown_error():
    """Infrequent grouping combined with `handle_unknown="error"`.

    Known categories (frequent or infrequent) are encoded, while a category
    never seen during fit makes `transform` raise.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="error", sparse_output=False, max_categories=3
    ).fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # all categories are known
    X_known = [["b"], ["a"], ["c"], ["d"]]
    encoded = encoder.transform(X_known)
    assert_allclose(
        np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]), encoded
    )

    # 'bad' is not known and will error
    X_unknown = [["bad"]]
    msg = r"Found unknown categories \['bad'\] in column 0"
    with pytest.raises(ValueError, match=msg):
        encoder.transform(X_unknown)
def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs):
    """'a' is the only frequent category, all other categories are infrequent.

    NOTE(review): `kwargs` is presumably supplied by a parametrization outside
    this view -- confirm.
    """
    X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        **kwargs,
    ).fit(X_train)

    X_query = [["a"], ["b"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_query)
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    # 'a' is dropped
    X_query = [["a"], ["c"]]
    for drop in ("first", "if_binary", ["a"]):
        encoder.set_params(drop=drop).fit(X_train)
        assert_allclose([[0], [1]], encoder.transform(X_query))
def test_ohe_infrequent_two_levels_user_cats():
    """Test that the order of the categories provided by a user is respected."""
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        max_categories=2,
    ).fit(X_train)

    # Infrequent categories are listed in the user-provided order.
    assert_array_equal(encoder.infrequent_categories_, [["c", "d", "a"]])

    X_query = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_query)
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    # 'infrequent' is used to denote the infrequent categories for
    # `inverse_transform`
    expected_inv = [["b"]] + [["infrequent_sklearn"]] * 4
    assert_array_equal(expected_inv, encoder.inverse_transform(encoded))
def test_ohe_infrequent_three_levels_user_cats():
    """Check that user-provided category order is kept with three levels.

    'c' is expected in the first output column, 'b' in the second, and the
    infrequent group ('d', 'a') in the third.
    """
    train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "b", "a"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        max_categories=3,
    )
    encoder.fit(train)
    assert_array_equal(encoder.infrequent_categories_, [["d", "a"]])

    samples = [["b"], ["a"], ["c"], ["d"], ["e"]]
    want = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]])
    encoded = encoder.transform(samples)
    assert_allclose(want, encoded)

    # `inverse_transform` reports grouped categories with the
    # 'infrequent_sklearn' placeholder.
    placeholder = "infrequent_sklearn"
    want_inverse = [["b"], [placeholder], ["c"], [placeholder], [placeholder]]
    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(want_inverse, decoded)
def test_ohe_infrequent_mixed():
    """Check a matrix where only feature 0 has infrequent categories."""
    # Column 0: values 1 and 2 are the infrequent ones.
    # Column 1: binary, nothing is infrequent.
    first = [0, 1, 3, 3, 3, 3, 2, 0, 3]
    second = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    X = np.c_[first, second]

    encoder = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False)
    encoder.fit(X)

    encoded = encoder.transform([[3, 0], [1, 1]])

    # Column 1 is binary, so `drop="if_binary"` removes its category 0 and
    # it contributes a single output column.
    assert_allclose(encoded, [[0, 1, 0, 0], [0, 0, 1, 1]])
def test_ohe_infrequent_multiple_categories():
    """Exercise infrequent grouping on a three-feature integer matrix.

    Covers fit/transform output, feature names, unknown handling with
    `infrequent_if_exist` and `error`, and `inverse_transform`.
    """
    X = np.c_[
        [0, 1, 3, 3, 3, 3, 2, 0, 3],
        [0, 0, 5, 1, 1, 10, 5, 5, 0],
        [1, 0, 1, 0, 1, 0, 1, 0, 1],
    ]
    encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
    )

    # Column 0: 1 and 2 are infrequent.
    # Column 1: 1 and 10 are infrequent.
    # Column 2: nothing is infrequent.
    encoded_train = encoder.fit_transform(X).toarray()
    assert_array_equal(encoder.infrequent_categories_[0], [1, 2])
    assert_array_equal(encoder.infrequent_categories_[1], [1, 10])
    assert_array_equal(encoder.infrequent_categories_[2], None)

    # Grouped categories share a single 'infrequent_sklearn' output column
    # per feature.
    want_names = [
        "x0_0",
        "x0_3",
        "x0_infrequent_sklearn",
        "x1_0",
        "x1_5",
        "x1_infrequent_sklearn",
        "x2_0",
        "x2_1",
    ]
    assert_array_equal(want_names, encoder.get_feature_names_out())

    want_train = [
        [1, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0, 0, 1, 0],
        [0, 1, 0, 0, 1, 0, 0, 1],
        [0, 1, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 1],
        [0, 1, 0, 0, 0, 1, 1, 0],
        [0, 0, 1, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 0, 0, 1],
    ]
    assert_allclose(want_train, encoded_train)

    samples = [[3, 1, 2], [4, 0, 3]]
    encoded = encoder.transform(samples)
    # Column 2 has no infrequent group, so its unknown values are encoded
    # as all zeros.
    want = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]]
    assert_allclose(want, encoded.toarray())

    decoded = encoder.inverse_transform(encoded)
    want_inverse = np.array(
        [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]],
        dtype=object,
    )
    assert_array_equal(want_inverse, decoded)

    # With handle_unknown="error" the same samples must raise.
    strict_encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="error"
    )
    strict_encoder.fit(X)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict_encoder.transform(samples)

    # Samples made only of known or infrequent values still transform.
    samples = [[1, 1, 1], [3, 10, 0]]
    encoded = strict_encoder.transform(samples)
    want = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]]
    assert_allclose(want, encoded.toarray())

    decoded = strict_encoder.inverse_transform(encoded)
    want_inverse = np.array(
        [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]],
        dtype=object,
    )
    assert_array_equal(want_inverse, decoded)
def test_ohe_infrequent_multiple_categories_dtypes():
    """Exercise infrequent grouping on a DataFrame with str and int columns."""
    pd = pytest.importorskip("pandas")
    columns = ["str", "int"]
    X = pd.DataFrame(
        {
            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
        },
        columns=columns,
    )
    encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
    )

    # "str": 'a', 'b', 'c' share the same frequency; the assertions below
    # expect 'a' and 'b' to end up in the infrequent group.
    # "int": 0, 3, 5, 10 each appear twice and 12 appears once; the
    # assertions below expect 0, 3 and 12 to be infrequent.
    encoded_train = encoder.fit_transform(X).toarray()
    assert_array_equal(encoder.infrequent_categories_[0], ["a", "b"])
    assert_array_equal(encoder.infrequent_categories_[1], [0, 3, 12])

    want_train = [
        [0, 0, 1, 1, 0, 0],
        [0, 1, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 1, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0, 1],
        [0, 0, 1, 1, 0, 0],
    ]
    assert_allclose(want_train, encoded_train)

    # 14 was never seen for "int"; it is expected in the infrequent column.
    samples = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=columns)
    want = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]]
    encoded = encoder.transform(samples)
    assert_allclose(want, encoded.toarray())

    decoded = encoder.inverse_transform(encoded)
    want_inverse = np.array(
        [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]],
        dtype=object,
    )
    assert_array_equal(want_inverse, decoded)

    # Samples made only of known or infrequent values.
    samples = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=columns)
    encoded = encoder.transform(samples).toarray()
    want = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]]
    assert_allclose(want, encoded)

    decoded = encoder.inverse_transform(encoded)
    want_inverse = np.array(
        [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object
    )
    assert_array_equal(want_inverse, decoded)
def test_ohe_infrequent_one_level_errors(kwargs):
    """Check that the encoder yields a single output column for `kwargs`.

    NOTE(review): `kwargs` is presumably supplied by a parametrization that
    is not visible in this chunk; the assertion expects one column only.
    """
    train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
    )
    encoder.fit(train)

    encoded = encoder.transform([["a"]])
    assert_allclose(encoded, [[1]])
def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs):
    """Check fitting on data that contains none of the user categories.

    The training column holds only 'e', which is absent from the
    user-provided categories. Both a listed category ('a') and 'e' are
    expected to map to the same single output column.

    NOTE(review): `kwargs` is presumably supplied by a parametrization that
    is not visible in this chunk.
    """
    train = np.array([["e"] * 3], dtype=object).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        **kwargs,
    )
    encoder.fit(train)

    encoded = encoder.transform([["a"], ["e"]])
    assert_allclose(encoded, [[1], [1]])
| # deliberately omit 'OS' as an invalid combo | |
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check that both encoders handle object, unicode and byte string dtypes.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677

    NOTE(review): the three arguments are presumably supplied by
    parametrizations that are not visible in this chunk.
    """
    X = np.array([["b"], ["a"]], dtype=input_dtype)
    categories = [np.array(["b", "a"], dtype=category_dtype)]
    samples = _convert_container(
        [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
    )

    # One-hot: 'b' is the first predefined category, 'a' the second.
    one_hot = OneHotEncoder(categories=categories, sparse_output=False)
    one_hot.fit(X)
    want_one_hot = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
    assert_allclose(one_hot.transform(samples), want_one_hot)

    # Ordinal: codes follow the predefined category order.
    ordinal = OrdinalEncoder(categories=categories)
    ordinal.fit(X)
    want_ordinal = np.array([[1], [1], [0], [1]])
    assert_array_equal(ordinal.transform(samples), want_ordinal)
def test_mixed_string_bytes_categoricals():
    """Check that bytes categories combined with str data raise on fit.

    Predefined categories given as bytes cannot easily be compared to data
    stored as unicode strings, so fitting is expected to fail.
    """
    # Data is unicode, predefined categories are bytes.
    unicode_data = np.array([["b"], ["a"]], dtype="U")
    bytes_categories = [np.array(["b", "a"], dtype="S")]
    encoder = OneHotEncoder(categories=bytes_categories, sparse_output=False)

    pattern = re.escape(
        "In column 0, the predefined categories have type 'bytes' which is incompatible"
        " with values of type 'str_'."
    )
    with pytest.raises(ValueError, match=pattern):
        encoder.fit(unicode_data)
def test_ohe_missing_values_get_feature_names(missing_value):
    """Check feature names when an object column contains a missing value.

    NOTE(review): `missing_value` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    column = ["a", "b", missing_value, "a", missing_value]
    X = np.array([column], dtype=object).T
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoder.fit(X)

    # The missing value is expected to get its own, last, feature name.
    want_names = ["x0_a", "x0_b", f"x0_{missing_value}"]
    assert_array_equal(encoder.get_feature_names_out(), want_names)
def test_ohe_missing_value_support_pandas():
    """Check a DataFrame with mixed dtypes and missing values in both columns."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "col1": ["dog", "cat", None, "cat"],
            "col2": np.array([3, 0, 4, np.nan], dtype=float),
        },
        columns=["col1", "col2"],
    )
    # The first three columns belong to "col1", the last four to "col2".
    want = np.array(
        [
            [0, 1, 0, 0, 1, 0, 0],
            [1, 0, 0, 1, 0, 0, 0],
            [0, 0, 1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0, 0, 1],
        ]
    )

    # `check_categorical_onehot` is a helper defined elsewhere in this file.
    encoded = check_categorical_onehot(frame)
    assert_allclose(encoded, want)
def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown):
    """Check a pandas categorical column that contains a missing value.

    NOTE(review): both arguments are presumably supplied by parametrizations
    that are not visible in this chunk.
    """
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        missing = pd.NA
    else:
        missing = np.nan

    frame = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", missing, "b", "a"], dtype="category"),
        }
    )
    want = np.array(
        [
            [0, 0, 1, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 1, 0, 0],
            [1, 0, 0, 0],
        ]
    )

    encoder = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown)
    encoded = encoder.fit_transform(frame)
    assert_allclose(want, encoded)

    # A single feature, with the missing value as its last category.
    assert len(encoder.categories_) == 1
    learned = encoder.categories_[0]
    assert_array_equal(learned[:-1], ["a", "b", "c"])
    assert np.isnan(learned[-1])
def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown):
    """Check `drop="first"` with a lenient `handle_unknown` during transform.

    Unknown values are expected to trigger a `UserWarning` and to be
    encoded as all zeros.

    NOTE(review): `handle_unknown` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    train = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="first", sparse_output=False, handle_unknown=handle_unknown
    )
    encoded_train = encoder.fit_transform(train)
    want_train = np.array(
        [
            [0, 0, 0],
            [1, 0, 1],
            [1, 1, 0],
        ]
    )
    assert_allclose(encoded_train, want_train)

    # Neither 'c' nor 3 was seen during fit.
    unknown = [["c", 3]]
    all_zeros = np.array([[0, 0, 0]])
    warn_pattern = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_pattern):
        encoded = encoder.transform(unknown)
    assert_allclose(encoded, all_zeros)

    # An all-zero row is expected to invert to ['a', 0].
    decoded = encoder.inverse_transform(all_zeros)
    assert_array_equal(decoded, np.array([["a", 0]], dtype=object))
def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown):
    """Check `drop="if_binary"` with a lenient `handle_unknown` during transform.

    Unknown values are expected to trigger a `UserWarning` and to be
    encoded as all zeros.

    NOTE(review): `handle_unknown` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    train = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="if_binary", sparse_output=False, handle_unknown=handle_unknown
    )
    encoded_train = encoder.fit_transform(train)
    want_train = np.array(
        [
            [0, 1, 0, 0],
            [1, 0, 0, 1],
            [1, 0, 1, 0],
        ]
    )
    assert_allclose(encoded_train, want_train)

    # Neither 'c' nor 3 was seen during fit.
    unknown = [["c", 3]]
    all_zeros = np.array([[0, 0, 0, 0]])
    warn_pattern = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_pattern):
        encoded = encoder.transform(unknown)
    assert_allclose(encoded, all_zeros)

    # An all-zero row is expected to invert to ['a', None].
    decoded = encoder.inverse_transform(all_zeros)
    assert_array_equal(decoded, np.array([["a", None]], dtype=object))
def test_ohe_drop_first_explicit_categories(handle_unknown):
    """Check `drop="first"` with a lenient `handle_unknown` and given categories.

    NOTE(review): `handle_unknown` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    train = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="first",
        sparse_output=False,
        handle_unknown=handle_unknown,
        categories=[["b", "a"], [1, 2]],
    )
    encoder.fit(train)

    # 'c' is unknown in column 0; 1 is a known category in column 1.
    samples = [["c", 1]]
    want = np.array([[0, 0]])
    warn_pattern = (
        r"Found unknown categories in columns \[0\] during transform. "
        r"These unknown categories will be encoded as all zeros"
    )
    with pytest.warns(UserWarning, match=warn_pattern):
        encoded = encoder.transform(samples)
    assert_allclose(encoded, want)
def test_ohe_more_informative_error_message():
    """Check the error raised for pandas output with `sparse_output=True`.

    Both `fit_transform` and `transform` must raise a `ValueError` that
    explains pandas output does not support sparse data.
    """
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"])

    ohe = OneHotEncoder(sparse_output=True)
    ohe.set_output(transform="pandas")

    # `match` is applied as a regular expression, so escape the literal
    # text; otherwise the '.' would match any character.
    msg = re.escape(
        "Pandas output does not support sparse data. Set "
        "sparse_output=False to output pandas dataframes or disable Pandas output"
    )
    with pytest.raises(ValueError, match=msg):
        ohe.fit_transform(df)

    # Fitting alone produces no output container; the error is expected
    # from the later `transform` call.
    ohe.fit(df)
    with pytest.raises(ValueError, match=msg):
        ohe.transform(df)
def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
    """Check that fitting data with nan fails when `dtype=np.int32`."""
    column_with_nan = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
    encoder = OrdinalEncoder(dtype=np.int32)

    pattern = (
        r"There are missing values in features \[0\]. For OrdinalEncoder "
        f"to encode missing values with dtype: {np.int32}"
    )
    with pytest.raises(ValueError, match=pattern):
        encoder.fit(column_with_nan)
def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
    """Check ordinal encoding of a float column that contains nan.

    NOTE(review): `encoded_missing_value` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
    encoder = OrdinalEncoder(encoded_missing_value=encoded_missing_value)
    encoder.fit(X)

    # One feature; nan is expected as the last learned category.
    assert len(encoder.categories_) == 1
    assert_allclose(encoder.categories_[0], [1.0, 3.0, np.nan])

    encoded = encoder.transform(X)
    assert_allclose(encoded, [[encoded_missing_value], [1.0], [0.0], [1.0]])

    # The round trip must recover the original column.
    decoded = encoder.inverse_transform(encoded)
    assert_allclose(decoded, X)
def test_ordinal_encoder_missing_value_support_pandas_categorical(
    pd_nan_type, encoded_missing_value
):
    """Check OrdinalEncoder on a pandas categorical column with a missing value.

    NOTE(review): both arguments are presumably supplied by parametrizations
    that are not visible in this chunk.
    """
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        missing = pd.NA
    else:
        missing = np.nan

    frame = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", missing, "b", "a"], dtype="category"),
        }
    )

    encoder = OrdinalEncoder(encoded_missing_value=encoded_missing_value)
    encoder.fit(frame)

    # A single feature, with the missing value as its last category.
    assert len(encoder.categories_) == 1
    learned = encoder.categories_[0]
    assert_array_equal(learned[:3], ["a", "b", "c"])
    assert np.isnan(learned[-1])

    encoded = encoder.transform(frame)
    assert_allclose(encoded, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])

    # Row 2 held the missing value and is expected to come back as nan.
    decoded = encoder.inverse_transform(encoded)
    assert decoded.shape == (5, 1)
    assert_array_equal(decoded[:2, 0], ["c", "a"])
    assert_array_equal(decoded[3:, 0], ["b", "a"])
    assert np.isnan(decoded[2, 0])
def test_ordinal_encoder_specified_categories_missing_passthrough(
    X, X2, cats, cat_dtype
):
    """Check OrdinalEncoder with user-specified categories and a missing value.

    NOTE(review): all arguments are presumably supplied by a parametrization
    that is not visible in this chunk.
    """
    encoder = OrdinalEncoder(categories=cats)
    want = np.array([[0.0], [np.nan]])
    assert_array_equal(encoder.fit_transform(X), want)

    # Categories given as lists are expected to be coerced to the dtype
    # of the data.
    assert encoder.categories_[0].dtype == cat_dtype

    # With manually specified categories, unknown values must already
    # raise at fit time.
    fresh_encoder = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        fresh_encoder.fit(X2)
def test_encoder_duplicate_specified_categories(Encoder):
    """Check that duplicated user-specified categories are rejected on fit.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088

    NOTE(review): `Encoder` is presumably supplied by a parametrization that
    is not visible in this chunk.
    """
    # 'a' appears twice in the predefined categories.
    duplicated = [np.array(["a", "b", "a"], dtype=object)]
    encoder = Encoder(categories=duplicated)
    data = np.array([["a", "b"]], dtype=object).T

    pattern = "the predefined categories contain duplicate elements."
    with pytest.raises(ValueError, match=pattern):
        encoder.fit(data)
def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
    """Check how missing values interact with `handle_unknown`.

    NOTE(review): all arguments are presumably supplied by a parametrization
    that is not visible in this chunk.
    """
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    encoded_train = encoder.fit_transform(X)
    assert_allclose(encoded_train, expected_X_trans)

    # `X_test` is expected to be encoded with the unknown value.
    encoded_test = encoder.transform(X_test)
    assert_allclose(encoded_test, [[-1.0]])
def test_ordinal_encoder_sparse(csr_container):
    """Check that OrdinalEncoder rejects sparse input with a clear error.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878

    NOTE(review): `csr_container` is presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    dense = np.array([[3, 2, 1], [0, 1, 1]])
    sparse_input = csr_container(dense)
    encoder = OrdinalEncoder()
    pattern = "Sparse data was passed, but dense data is required"

    # Both fitting entry points must refuse sparse data.
    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(TypeError, match=pattern):
            fit_method(sparse_input)

    # So must `inverse_transform`, once the encoder is fitted on dense data.
    encoded = encoder.fit_transform(dense)
    sparse_encoded = csr_container(encoded)
    with pytest.raises(TypeError, match=pattern):
        encoder.inverse_transform(sparse_encoded)
def test_ordinal_encoder_fit_with_unseen_category():
    """Check fitting data that holds values outside the given categories.

    With `handle_unknown="use_encoded_value"` the fit must succeed; with
    `handle_unknown="error"` it must raise.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    """
    # 2 and 5 are not part of the predefined categories.
    predefined = [[-1, 0, 1]]
    X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis]

    lenient = OrdinalEncoder(
        categories=predefined, handle_unknown="use_encoded_value", unknown_value=-999
    )
    lenient.fit(X)

    strict = OrdinalEncoder(categories=predefined, handle_unknown="error")
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.fit(X)
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Check that `OrdinalEncoder` transforms string dtypes with unknowns.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872

    NOTE(review): both arguments are presumably supplied by a
    parametrization that is not visible in this chunk.
    """
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9)
    encoder.fit(X_train)

    # The first column is expected to be unknown (-9), the second known (0).
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[-9, 0]])
def test_ordinal_encoder_python_integer():
    """Check that `OrdinalEncoder` accepts very large Python integers.

    The values below may exceed 64 bits.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    """
    big_values = [
        44253463435747313673,
        9867966753463435747313673,
        44253462342215747313673,
        442534634357764313673,
    ]
    X = np.array(big_values).reshape(-1, 1)

    encoder = OrdinalEncoder()
    encoder.fit(X)

    # Learned categories are expected in ascending order.
    sorted_row = np.sort(X, axis=0).T
    assert_array_equal(encoder.categories_, sorted_row)

    encoded = encoder.transform(X)
    assert_array_equal(encoded, [[0], [3], [2], [1]])
def test_ordinal_encoder_features_names_out_pandas():
    """Check that output feature names equal the input column names."""
    pd = pytest.importorskip("pandas")

    # Deliberately not in alphabetical order.
    column_names = ["b", "c", "a"]
    frame = pd.DataFrame([[1, 2, 3]], columns=column_names)

    encoder = OrdinalEncoder()
    encoder.fit(frame)

    assert_array_equal(column_names, encoder.get_feature_names_out())
def test_ordinal_encoder_unknown_missing_interaction():
    """Check how `unknown_value` and `encoded_missing_value` interact.

    Unknown values are encoded as nan here, while missing values are
    encoded as -3.
    """
    X = np.array([["a"], ["b"], [np.nan]], dtype=object)
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
        encoded_missing_value=-3,
    )
    encoder.fit(X)

    assert_allclose(encoder.transform(X), [[0], [1], [-3]])

    # "c" is unknown and maps to nan; the nan input is a missing value and
    # maps to -3.
    samples = np.array([["c"], [np.nan]], dtype=object)
    encoded = encoder.transform(samples)
    assert_allclose(encoded, [[np.nan], [-3]])

    # Non-regression test for #24082
    decoded = encoder.inverse_transform(encoded)
    unknown_back, missing_back = decoded[0][0], decoded[1][0]

    # An encoded nan denotes "unknown", which inverts to None.
    assert unknown_back is None
    # -3 is the encoded missing value, which inverts to nan.
    assert np.isnan(missing_back)
def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
    """Check the error when `encoded_missing_value` clashes with a known code.

    NOTE(review): `with_pandas` is presumably supplied by a parametrization
    that is not visible in this chunk.
    """
    X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)

    # Feature 0 has no missing values, so only feature 1 is expected in
    # the reported list.
    prefix = (
        r"encoded_missing_value \(1\) is already used to encode a known category "
        r"in features: "
    )

    if with_pandas:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X, columns=["letter", "pet"])
        # DataFrame input is reported by column name.
        suffix = r"\['pet'\]"
    else:
        # Array input is reported by column index.
        suffix = r"\[1\]"
    pattern = prefix + suffix

    encoder = OrdinalEncoder(encoded_missing_value=1)
    with pytest.raises(ValueError, match=pattern):
        encoder.fit(X)
def test_ordinal_encoder_unknown_missing_interaction_both_nan(
    X_train, X_test_trans_expected, X_roundtrip_expected
):
    """Check transform when unknown and missing values both encode to nan.

    Non-regression test for #24082.

    NOTE(review): all arguments are presumably supplied by a parametrization
    that is not visible in this chunk.
    """
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
        encoded_missing_value=np.nan,
    )
    encoder.fit(X_train)

    samples = np.array([["1"], [np.nan], ["b"]])
    encoded = encoder.transform(samples)

    # Missing and unknown entries are both encoded as nan.
    assert_allclose(encoded, X_test_trans_expected)

    decoded = encoder.inverse_transform(encoded)

    # Compare element by element: None needs an identity check and nan
    # needs `np.isnan`, so a plain array comparison is not enough.
    for row in range(X_roundtrip_expected.shape[0]):
        want = X_roundtrip_expected[row, 0]
        got = decoded[row, 0]

        if want is None:
            assert got is None
        elif is_scalar_nan(want):
            assert np.isnan(got)
        else:
            assert got == want
def test_one_hot_encoder_set_output():
    """Check OneHotEncoder works with set_output.

    Sparse output combined with pandas output must raise; with
    `sparse_output=False`, the pandas output must match the default output
    and use the encoder's feature names as columns.
    """
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    ohe = OneHotEncoder()

    ohe.set_output(transform="pandas")

    # `match` is applied as a regular expression, so escape the literal
    # text; otherwise the '.' would match any character.
    match = re.escape(
        "Pandas output does not support sparse data. Set sparse_output=False"
    )
    with pytest.raises(ValueError, match=match):
        ohe.fit_transform(X_df)

    ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default")
    ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

    X_default = ohe_default.fit_transform(X_df)
    X_pandas = ohe_pandas.fit_transform(X_df)

    assert_allclose(X_pandas.to_numpy(), X_default)
    assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns)
def test_ordinal_set_output():
    """Check that OrdinalEncoder pandas output matches its default output."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

    default_encoder = OrdinalEncoder().set_output(transform="default")
    pandas_encoder = OrdinalEncoder().set_output(transform="pandas")

    default_out = default_encoder.fit_transform(frame)
    pandas_out = pandas_encoder.fit_transform(frame)

    # Same values, and DataFrame columns named after the output features.
    assert_allclose(pandas_out.to_numpy(), default_out)
    assert_array_equal(pandas_encoder.get_feature_names_out(), pandas_out.columns)
def test_predefined_categories_dtype():
    """Check that `categories_` has dtype `object` for string categories.

    Regression test for gh-25171.
    """
    predefined = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]]

    encoder = OneHotEncoder(categories=predefined)
    encoder.fit([["as", "1"]])

    learned = encoder.categories_
    assert len(predefined) == len(learned)

    # Lengths are equal (asserted above), so pairing with zip is safe.
    for given, fitted in zip(predefined, learned):
        assert fitted.dtype == object
        assert_array_equal(given, fitted)
def test_ordinal_encoder_missing_unknown_encoding_max():
    """Check that the missing/unknown code may equal the cardinality.

    The column has two non-missing categories, so 2 is the first code not
    used by a known category.
    """
    X = np.array([["dog"], ["cat"], [np.nan]], dtype=object)

    # 2 as the encoded missing value.
    missing_encoder = OrdinalEncoder(encoded_missing_value=2)
    encoded = missing_encoder.fit_transform(X)
    assert_allclose(encoded, [[1], [0], [2]])

    # 2 as the encoded unknown value.
    unknown_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=2
    )
    unknown_encoder.fit(X)
    encoded = unknown_encoder.transform(np.array([["snake"]]))
    assert_allclose(encoded, [[2]])
def test_drop_idx_infrequent_categories():
    """Check that `drop_idx_` is correct when infrequent categories exist.

    Non-regression test for gh-25550.
    """
    five_levels = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T

    # drop="first": 'b' is expected to be the dropped category.
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first")
    encoder.fit(five_levels)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "b"

    # drop="if_binary": only 'c' is frequent, so the feature reduces to 'c'
    # versus the infrequent group, and 'c' is expected to be dropped.
    three_levels = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary")
    encoder.fit(three_levels)
    assert_array_equal(encoder.get_feature_names_out(), ["x0_infrequent_sklearn"])
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "c"

    # Explicit drop of 'd'.
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"])
    encoder.fit(five_levels)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "d"

    # drop=None: nothing is dropped and `drop_idx_` stays None.
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None)
    encoder.fit(five_levels)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.drop_idx_ is None
def test_ordinal_encoder_infrequent_three_levels(kwargs):
    """Check settings that group 'a' and 'd' into the infrequent category.

    NOTE(review): `kwargs` is presumably supplied by a parametrization that
    is not visible in this chunk.
    """
    train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1, **kwargs
    )
    encoder.fit(train)
    assert_array_equal(encoder.categories_, [["a", "b", "c", "d"]])
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # 'z' is unknown and is expected to receive the unknown value -1.
    samples = [["a"], ["b"], ["c"], ["d"], ["z"]]
    want = [[2], [0], [1], [2], [-1]]
    encoded = encoder.transform(samples)
    assert_allclose(encoded, want)

    # Infrequent codes invert to the placeholder, unknown codes to None.
    placeholder = "infrequent_sklearn"
    want_inverse = [[placeholder], ["b"], ["c"], [placeholder], [None]]
    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(decoded, want_inverse)
def test_ordinal_encoder_infrequent_three_levels_user_cats():
    """Check that user-provided category order drives the ordinal codes.

    'c' is expected to get code 0 and 'b' code 1; the infrequent group
    ('d', 'a') is expected to share code 2.
    """
    train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    user_order = [["c", "d", "b", "a"]]
    encoder = OrdinalEncoder(
        categories=user_order,
        max_categories=3,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    encoder.fit(train)
    assert_array_equal(encoder.categories_, [["c", "d", "b", "a"]])
    assert_array_equal(encoder.infrequent_categories_, [["d", "a"]])

    # 'z' is unknown and is expected to receive the unknown value -1.
    samples = [["a"], ["b"], ["c"], ["d"], ["z"]]
    want = [[2], [1], [0], [2], [-1]]
    encoded = encoder.transform(samples)
    assert_allclose(encoded, want)

    # Infrequent codes invert to the placeholder, unknown codes to None.
    placeholder = "infrequent_sklearn"
    want_inverse = [[placeholder], ["b"], ["c"], [placeholder], [None]]
    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(decoded, want_inverse)
def test_ordinal_encoder_infrequent_mixed():
    """Check a matrix where only feature 0 has infrequent categories."""
    first = [0, 1, 3, 3, 3, 3, 2, 0, 3]
    second = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    X = np.column_stack((first, second))

    encoder = OrdinalEncoder(max_categories=3)
    encoder.fit(X)

    assert_array_equal(encoder.infrequent_categories_[0], [1, 2])
    assert encoder.infrequent_categories_[1] is None

    samples = [[3, 0], [1, 1]]
    want = [[1, 0], [2, 1]]
    encoded = encoder.transform(samples)
    assert_allclose(encoded, want)

    # The infrequent code of feature 0 inverts to the placeholder string.
    decoded = encoder.inverse_transform(encoded)
    want_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object)
    assert_array_equal(decoded, want_inverse)
def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
    """Check infrequent grouping on a DataFrame mixing str, int and categorical."""
    pd = pytest.importorskip("pandas")
    animal_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
    columns = ["str", "int", "categorical"]

    train = pd.DataFrame(
        {
            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
            "categorical": pd.Series(
                ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"],
                dtype=animal_dtype,
            ),
        },
        columns=columns,
    )

    encoder = OrdinalEncoder(max_categories=3)
    encoder.fit(train)

    # "str": 'a', 'b', 'c' share the same frequency; 'a' and 'b' are
    #   expected to be the infrequent ones.
    # "int": 0, 3, 5, 10 each appear twice and 12 appears once; 0, 3 and 12
    #   are expected to be infrequent.
    # "categorical": "snake" and "bird" are infrequent.
    infrequent = encoder.infrequent_categories_
    assert_array_equal(infrequent[0], ["a", "b"])
    assert_array_equal(infrequent[1], [0, 3, 12])
    assert_array_equal(infrequent[2], ["bird", "snake"])

    samples = pd.DataFrame(
        {
            "str": ["a", "b", "f", "c"],
            "int": [12, 0, 10, 5],
            "categorical": pd.Series(
                ["cat"] + ["snake"] + ["bird"] + ["dog"],
                dtype=animal_dtype,
            ),
        },
        columns=columns,
    )
    # Code 2 is the shared infrequent code in every column.
    want = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]]
    encoded = encoder.transform(samples)
    assert_allclose(encoded, want)
def test_ordinal_encoder_infrequent_custom_mapping():
    """Check `unknown_value` and `encoded_missing_value` together with infrequent."""
    train_values = ["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]
    X_train = np.array([train_values], dtype=object).T

    encoder_params = {
        "handle_unknown": "use_encoded_value",
        "unknown_value": 2,
        "max_categories": 2,
        "encoded_missing_value": 3,
    }
    ordinal = OrdinalEncoder(**encoder_params)
    ordinal.fit(X_train)

    # Only 'b' stays frequent; every other seen category is grouped.
    assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]])

    X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
    # 'e' was never seen -> unknown_value (2); nan -> encoded_missing_value (3).
    assert_allclose(ordinal.transform(X_test), [[1], [0], [1], [1], [2], [3]])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 6},
        {"min_frequency": 2},
    ],
)
def test_ordinal_encoder_all_frequent(kwargs):
    """Check that an encoder with no infrequent category matches the default one.

    Parameters
    ----------
    kwargs : dict
        Infrequent-category settings forwarded to `OrdinalEncoder`. Each
        parametrized value keeps all four training categories frequent: the
        data has fewer than 6 categories and the rarest one ('d') occurs
        3 times, which is above `min_frequency=2`.
    """
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    # Both encoders share the unknown handling so that 'e' is comparable.
    unknown_handling = {"handle_unknown": "use_encoded_value", "unknown_value": -1}

    adjusted_encoder = OrdinalEncoder(**kwargs, **unknown_handling).fit(X_train)
    default_encoder = OrdinalEncoder(**unknown_handling).fit(X_train)

    X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
    assert_allclose(
        adjusted_encoder.transform(X_test), default_encoder.transform(X_test)
    )
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 1},
        {"min_frequency": 100},
    ],
)
def test_ordinal_encoder_all_infrequent(kwargs):
    """Check that all categories encode to zero when every one is infrequent.

    Parameters
    ----------
    kwargs : dict
        Infrequent-category settings forwarded to `OrdinalEncoder`. Each
        parametrized value makes every training category infrequent:
        `max_categories=1` leaves room for the infrequent group only, and
        no category reaches `min_frequency=100` (the most common, 'b',
        occurs 20 times).
    """
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OrdinalEncoder(
        **kwargs, handle_unknown="use_encoded_value", unknown_value=-1
    ).fit(X_train)

    X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
    # Seen categories collapse to the single infrequent code 0; the unseen
    # 'e' still maps to unknown_value.
    assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]])
def test_ordinal_encoder_missing_appears_frequent():
    """Check the encoding when the missing value is the most common entry."""
    train_values = (
        [np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]
    )
    X = np.array([train_values], dtype=object).T

    ordinal = OrdinalEncoder(max_categories=3)
    ordinal.fit(X)

    X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T
    # 'snake' falls in the infrequent group (code 2); nan is passed through
    # as nan even though it is the most frequent training entry.
    expected_trans = [[2], [0], [1], [np.nan]]
    assert_allclose(ordinal.transform(X_test), expected_trans)
def test_ordinal_encoder_missing_appears_infrequent():
    """Check the encoding when the missing value occurs only once."""
    # Feature 0 has categories below min_frequency; feature 1 does not.
    animals = [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]
    colors = ["red"] * 9 + ["green"] * 9
    X = np.array([animals, colors], dtype=object).T

    ordinal = OrdinalEncoder(min_frequency=4)
    ordinal.fit(X)

    test_rows = [
        ["snake", "red"],
        ["deer", "green"],
        [np.nan, "green"],
        ["dog", "green"],
        ["cat", "red"],
    ]
    X_test = np.array(test_rows, dtype=object)

    # 'snake' and 'deer' share the infrequent code 2; nan is passed through
    # as nan rather than being folded into the infrequent group.
    expected_trans = [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]
    assert_allclose(ordinal.transform(X_test), expected_trans)
@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoder_not_fitted(Encoder):
    """Check that calling `transform` before `fit` raises `NotFittedError`.

    One could expect that passing the `categories` argument to the encoder
    would make it stateless. However, `fit` performs a couple of checks, such
    as the position of `np.nan`.

    Parameters
    ----------
    Encoder : type
        Encoder class under test; it is instantiated with explicit
        `categories` and never fitted.
    """
    X = np.array([["A"], ["B"], ["C"]], dtype=object)
    encoder = Encoder(categories=[["A", "B", "C"]])
    with pytest.raises(NotFittedError):
        encoder.transform(X)