"""Functions for encoding categorical data with additive smoothing techniques applied."""

import itertools

import numpy as np
import pandas as pd
from sklearn.preprocessing import TargetEncoder


def get_target_encodings(
    *,
    train_data,
    cols_to_encode,
    target_col,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
):
    """
    Retrieve target encodings of input data for later use (performs no encoding).

    The complete train data set is used to target encode the holdout test data set. This
    function is used to obtain encodings for storage and later use on separate data.

    Smoothing addresses overfitting caused by sparse data by relying on the global mean
    (mean of target across all rows) rather than the local mean (mean of target across a
    specific category) when there are a small number of observations in a category. The
    degree of smoothing is controlled by the parameter 'smooth'. Higher values of 'smooth'
    increase the influence of the global mean on the target encoding. A 'smooth' value of
    100 can be interpreted as: there must be at least 100 observations in a category for
    the sample mean to outweigh the global mean.

    There is also an option to keep nan's as a category for cases of data missing not at
    random. Missing values in categorical columns must be recorded as the string 'nan'.

    Use kfold_target_encode to perform kfold encoding, and apply_target_encodings to use the
    output of this function on the test data.

    Parameters
    ----------
    train_data : dataframe
        data to be used for target encoding at a later stage. This is likely the full
        train data set.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target_col : str
        name of the target variable column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random, by
        default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to state
        names of columns if keep_nans_as_category is True.

    Returns
    -------
    encodings_all : dict
        encodings used for each column.

    Raises
    ------
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    ValueError
        error raised if keep_nans_as_category is True but cols_to_keep_nan_category is
        not provided.

    """
    train_data_to_encode = train_data[cols_to_encode]
    train_target = train_data[target_col]

    # Raise an error if there are multiple types of nan's
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in train_data_to_encode:
        cat_present = train_data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in "
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )

    encoder = TargetEncoder(smooth=smooth)
    encoder = encoder.fit(train_data_to_encode, train_target)

    # Get dictionary with encodings
    paired_dicts = []
    paired_arrays = zip(encoder.categories_, encoder.encodings_)
    for category_array, value_array in paired_arrays:
        paired_dict = dict(zip(category_array, value_array))
        paired_dicts.append(paired_dict)
    encodings_all = dict(zip(encoder.feature_names_in_, paired_dicts))

    # Sklearn treats nans as a category. The default in this function is to convert nan
    # categories back to np.nan unless stated otherwise.
    if keep_nans_as_category is False:
        for col in encodings_all:
            encodings_all[col].update({"nan": np.nan})
    # If nan categories are kept only for specific features, the features not listed in
    # cols_to_keep_nan_category have their nan category converted back to np.nan.
    if (keep_nans_as_category is True) and (cols_to_keep_nan_category is None):
        raise ValueError(
            "Parameter keep_nans_as_category is True but cols_to_keep_nan_category not provided."
        )
    if (keep_nans_as_category is True) and (cols_to_keep_nan_category is not None):
        cols_to_remove_nan_cat = set(cols_to_encode) - set(cols_to_keep_nan_category)
        for col_to_remove_nan in cols_to_remove_nan_cat:
            encodings_all[col_to_remove_nan].update({"nan": np.nan})

    return encodings_all


def apply_target_encodings(*, data, cols_to_encode, encodings, drop_categorical_cols=False):
    """Target encode input data with supplied encodings.

    Parameters
    ----------
    data : dataframe
        data with columns to be target encoded.
    cols_to_encode : list of strings
        list of columns to target encode.
    encodings : dict
        target encodings to use on input data (from training data).
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    data : dataframe
        target encoded version of the input data.

    Raises
    ------
    AssertionError
        error raised if a column to be encoded has no entry in the supplied encodings.
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.

    """
    data_to_encode = data[cols_to_encode]
    # Raise an error if there are multiple types of nan's
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in data_to_encode:
        cat_present = data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in "
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )

    encoded_data = data.copy()
    for col in cols_to_encode:
        assert col in encodings, f"No target encodings found for {col} column"
        # Copy so that adding unseen categories below does not mutate the caller's
        # encodings dict
        encodings_col = dict(encodings[col])

        # Account for the case where the new data includes a category not present
        # in the train data encodings and set that category encoding nan
        data_unique = data[col].unique().tolist()
        encodings_unique = list(set(encodings_col.keys()))
        diffs = np.setdiff1d(data_unique, encodings_unique)
        encodings_col.update(zip(diffs, itertools.repeat(np.nan)))

        # Use the lookup table to place each category in the current fold with its
        # encoded value from the train data (in the new _te column)
        filtered = encoded_data.filter(items=[col])
        filtered_encodings = filtered.replace(encodings_col)
        filtered_encodings = filtered_encodings.rename(columns={col: col + "_te"})
        encoded_data = pd.concat([encoded_data, filtered_encodings], axis=1)

    if drop_categorical_cols is True:
        encoded_data = encoded_data.drop(columns=cols_to_encode)
    return encoded_data
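

# The unseen-category handling above can be sketched in isolation. The encodings
# dict and its values here are hypothetical, and Series.map stands in for the
# filter/replace steps used in the function.

```python
import numpy as np
import pandas as pd

# Hypothetical encodings learned on train data (the values are illustrative).
encodings = {"colour": {"red": 0.62, "blue": 0.41}}

test = pd.DataFrame({"colour": ["red", "green", "blue"]})  # 'green' was never trained on

# Mirror the unseen-category handling above: categories absent from the
# lookup are mapped to NaN before the replacement step.
lookup = dict(encodings["colour"])
unseen = np.setdiff1d(test["colour"].unique(), list(lookup))
lookup.update({cat: np.nan for cat in unseen})

test["colour_te"] = test["colour"].map(lookup)
```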


def kfold_target_encode(
    *,
    df,
    fold_ids,
    cols_to_encode,
    id_col,
    target,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
    drop_categorical_cols=False,
):
    """Perform K-fold target encoding.

    Fold by fold target encoding of train data is used to prevent data leakage in cross-
    validation (the same folds are used for encoding and CV). For example, in 5-fold
    target encoding, each fold is encoded using the other 4 folds and that fold is
    then used as the validation fold in CV. Smoothing is applied within each fold.

    Parameters
    ----------
    df : dataframe
        data with columns to be target encoded. Will generally be the train data.
    fold_ids : list of arrays
        each array contains the validation patient IDs for each fold.
    cols_to_encode : list of strings
        columns to target encode.
    id_col : str
        name of patient ID column.
    target : str
        name of target column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random, by
        default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to state
        names of columns if keep_nans_as_category is True.
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    encoded_df_cv : dataframe
        k-fold target encoded version of the input data.
    fold_encodings_all : dataframe
        contains target encodings for each fold.

    """
    # Loop over CV folds and perform K-fold target encoding
    encoded_data_cv = []
    fold_encodings_all = []
    for fold in fold_ids:
        # Divide data into train folds and validation fold
        validation_fold = df[df[id_col].isin(fold)]
        train_folds = df[~df[id_col].isin(fold)]

        # Obtain target encodings from train folds
        fold_encodings = get_target_encodings(
            train_data=train_folds,
            cols_to_encode=cols_to_encode,
            target_col=target,
            smooth=smooth,
            keep_nans_as_category=keep_nans_as_category,
            cols_to_keep_nan_category=cols_to_keep_nan_category,
        )
        fold_encodings_all.append(fold_encodings)

        # Apply to validation fold
        encoded_data_fold = apply_target_encodings(
            data=validation_fold,
            cols_to_encode=cols_to_encode,
            encodings=fold_encodings,
            drop_categorical_cols=drop_categorical_cols,
        )
        encoded_data_cv.append(encoded_data_fold)

    # Place the encoded validation fold data into a single df
    encoded_df_cv = pd.concat(encoded_data_cv)

    # Place the encodings for all folds into a df
    fold_encodings_all = pd.json_normalize(fold_encodings_all).T
    return encoded_df_cv, fold_encodings_all
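

# A sketch of how the fold_ids argument might be constructed with sklearn's KFold.
# The patient_id values and the frame are hypothetical; each array holds the
# validation IDs for one fold, and the same splits should be reused for CV.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Hypothetical train frame with a patient ID column, matching the expected inputs.
df = pd.DataFrame(
    {
        "patient_id": np.arange(10),
        "colour": ["red", "blue"] * 5,
        "y": [1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
    }
)

# fold_ids: one array of validation patient IDs per fold. Using the same splitter
# for encoding and cross-validation keeps the folds aligned, as the docstring notes.
ids = df["patient_id"].to_numpy()
splitter = KFold(n_splits=5, shuffle=True, random_state=0)
fold_ids = [ids[val_idx] for _, val_idx in splitter.split(ids)]
```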