| import numpy as np |
| import pytest |
|
|
| from sklearn.experimental import enable_iterative_imputer |
| from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer |
| from sklearn.utils._testing import ( |
| assert_allclose, |
| assert_allclose_dense_sparse, |
| assert_array_equal, |
| ) |
| from sklearn.utils.fixes import CSR_CONTAINERS |
|
|
|
|
def imputers():
    """Return new instances of the imputers used to parametrize the tests.

    Every call builds a new list holding one ``IterativeImputer`` (created
    with ``tol=0.1``), one ``KNNImputer`` and one ``SimpleImputer``.
    """
    iterative = IterativeImputer(tol=0.1)
    knn = KNNImputer()
    simple = SimpleImputer()
    return [iterative, knn, simple]
|
|
|
|
def sparse_imputers():
    """Return new instances of the imputers used for sparse-input tests.

    Every call builds a new list holding a single ``SimpleImputer``.
    """
    simple = SimpleImputer()
    return [simple]
|
|
|
|
| |
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
def test_imputation_missing_value_in_test_array(imputer):
    """Transform data containing NaN after fitting on data without any.

    The imputer is fitted with ``add_indicator=True`` on a single-feature
    training set with no missing entry, then asked to transform a set
    holding a NaN. There is no explicit assertion: the test passes as long
    as neither call raises.
    """
    X_fit = [[1], [2]]
    X_new = [[3], [np.nan]]

    imputer.set_params(add_indicator=True)
    fitted = imputer.fit(X_fit)
    fitted.transform(X_new)
|
|
|
|
| |
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1, 0])
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
def test_imputers_add_indicator(marker, imputer):
    """Check the indicator columns appended when ``add_indicator=True``.

    The input has ``marker`` in columns 0 to 3 (column 3 holds nothing but
    ``marker``) and none in the last column. Three things are verified:

    * the trailing columns of the transformed output equal the expected
      missingness mask;
    * ``imputer.indicator_.features_`` is ``[0, 1, 2, 3]``;
    * the leading columns equal the output obtained after refitting with
      ``add_indicator=False``.
    """
    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = np.array(rows)
    X_true_indicator = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    # Number of indicator columns expected at the end of the output (4).
    n_flagged = X_true_indicator.shape[1]

    # With the indicator switched on.
    imputer.set_params(missing_values=marker, add_indicator=True)
    X_with_indicator = imputer.fit_transform(X)
    indicator_part = X_with_indicator[:, -n_flagged:]
    assert_allclose(indicator_part, X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.arange(n_flagged))

    # With the indicator switched off, only the imputed part must remain.
    imputer.set_params(add_indicator=False)
    X_without_indicator = imputer.fit_transform(X)
    imputed_part = X_with_indicator[:, :-n_flagged]
    assert_allclose(imputed_part, X_without_indicator)
|
|
|
|
| |
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1])
@pytest.mark.parametrize(
    "imputer", sparse_imputers(), ids=lambda est: est.__class__.__name__
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_imputers_add_indicator_sparse(imputer, marker, csr_container):
    """Check the indicator columns appended for CSR input.

    Same scenario as the dense variant, with both the input and the
    expected mask built through ``csr_container``. Three things are
    verified:

    * the trailing columns of the transformed output equal the expected
      missingness mask;
    * ``imputer.indicator_.features_`` is ``[0, 1, 2, 3]``;
    * the leading columns equal the output obtained after refitting with
      ``add_indicator=False``.
    """
    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = csr_container(rows)
    X_true_indicator = csr_container(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    # Number of indicator columns expected at the end of the output (4).
    n_flagged = X_true_indicator.shape[1]

    # With the indicator switched on.
    imputer.set_params(missing_values=marker, add_indicator=True)
    X_with_indicator = imputer.fit_transform(X)
    indicator_part = X_with_indicator[:, -n_flagged:]
    assert_allclose_dense_sparse(indicator_part, X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.arange(n_flagged))

    # With the indicator switched off, only the imputed part must remain.
    imputer.set_params(add_indicator=False)
    X_without_indicator = imputer.fit_transform(X)
    imputed_part = X_with_indicator[:, :-n_flagged]
    assert_allclose_dense_sparse(imputed_part, X_without_indicator)
|
|
|
|
| |
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
    """Compare imputation of an ``Int16`` DataFrame with its ndarray source.

    The reference result comes from ``fit_transform`` on a NumPy array with
    NaN as the missing marker. The same array is then wrapped in a pandas
    DataFrame created with ``dtype="Int16"`` (presumably turning the NaN
    entries into ``pd.NA`` -- confirm against pandas) and ``fit_transform``
    must give a numerically close result. Skipped when pandas is missing.
    """
    pd = pytest.importorskip("pandas")
    marker = np.nan
    # set_params returns the estimator itself, so no rebinding is required.
    imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = np.array(rows)

    # Reference output computed from the plain ndarray.
    reference = imputer.fit_transform(X)

    # Same data held in a DataFrame using the "Int16" dtype.
    frame = pd.DataFrame(X, dtype="Int16", columns=list("abcde"))

    # Output computed from the DataFrame.
    from_frame = imputer.fit_transform(frame)

    assert_allclose(reference, from_frame)
|
|
|
|
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
    """Check ``get_feature_names_out`` after fitting on a DataFrame.

    Column "e" of the frame contains only NaN. The expected names are the
    remaining columns "a", "b", "c", "d", "f", followed -- when
    ``add_indicator`` is true -- by ``missingindicator_*`` names for
    columns "a", "b", "d" and "e". Skipped when pandas is missing.
    """
    pd = pytest.importorskip("pandas")
    marker = np.nan
    # set_params returns the estimator itself, so no rebinding is required.
    imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    rows = [
        [marker, 1, 5, 3, marker, 1],
        [2, marker, 1, 4, marker, 2],
        [6, 3, 7, marker, marker, 3],
        [1, 2, 9, 8, marker, 4],
    ]
    frame = pd.DataFrame(np.array(rows), columns=["a", "b", "c", "d", "e", "f"])
    imputer.fit(frame)

    names = imputer.get_feature_names_out()

    # Names expected in both configurations.
    expected_names = ["a", "b", "c", "d", "f"]
    if add_indicator:
        # Additional names expected for the indicator columns.
        expected_names.extend(
            [
                "missingindicator_a",
                "missingindicator_b",
                "missingindicator_d",
                "missingindicator_e",
            ]
        )
    assert_array_equal(expected_names, names)
|
|
|
|
@pytest.mark.parametrize("keep_empty_features", [True, False])
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
def test_keep_empty_features(imputer, keep_empty_features):
    """Check the output shape when a feature has only missing values.

    The first column of ``X`` is entirely NaN. With
    ``keep_empty_features=True`` the output must have the shape of ``X``;
    otherwise it must have one column fewer. Both ``fit_transform`` and a
    subsequent ``transform`` are checked.
    """
    X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]])
    # set_params returns the estimator itself, so no rebinding is required.
    imputer.set_params(
        add_indicator=False, keep_empty_features=keep_empty_features
    )

    n_samples, n_features = X.shape
    if keep_empty_features:
        expected_shape = (n_samples, n_features)
    else:
        expected_shape = (n_samples, n_features - 1)

    # "fit_transform" runs first so that "transform" sees a fitted imputer.
    for method_name in ("fit_transform", "transform"):
        X_out = getattr(imputer, method_name)(X)
        assert X_out.shape == expected_shape
|
|
|
|
@pytest.mark.parametrize(
    "imputer", imputers(), ids=lambda est: est.__class__.__name__
)
@pytest.mark.parametrize("missing_value_test", [np.nan, 1])
def test_imputation_adds_missing_indicator_if_add_indicator_is_true(
    imputer, missing_value_test
):
    """Check that an indicator column is output whenever add_indicator=True.

    The training data has a NaN in its second feature; the test data has
    either a NaN or the value 1 at that position. In both cases the
    transformed test data must gain exactly one indicator column, and the
    remaining columns must match the output obtained with
    ``add_indicator=False``.

    Non-regression test for gh-26590.
    """
    X_train = np.array([[0, np.nan], [1, 2]])

    # The test data may or may not contain a missing value.
    X_test = np.array([[0, missing_value_test], [1, 2]])

    # Pass 1: indicator enabled -> two features plus one indicator column.
    imputer.set_params(add_indicator=True)
    imputer.fit(X_train)
    with_indicator = imputer.transform(X_test)
    assert with_indicator.shape == (2, 3)

    # Pass 2: indicator disabled -> only the two imputed features.
    imputer.set_params(add_indicator=False)
    imputer.fit(X_train)
    without_indicator = imputer.transform(X_test)
    assert without_indicator.shape == (2, 2)

    # The imputed features must not depend on the indicator setting.
    assert_allclose(with_indicator[:, :-1], without_indicator)

    # Only the first test row can be flagged, and only when it holds a NaN.
    expected_missing_indicator = [1, 0] if np.isnan(missing_value_test) else [0, 0]
    assert_allclose(with_indicator[:, -1], expected_missing_indicator)
|
|