| | """Functions for encoding categorical data with additive smoothing techniques applied.""" |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import itertools |
| | from sklearn.preprocessing import TargetEncoder |
| |
|
| |
|
def get_target_encodings(
    *,
    train_data,
    cols_to_encode,
    target_col,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
):
    """
    Retrieve target encodings of input data for later use (performs no encoding).

    The complete train data set is used to target encode the holdout test data set. This
    function is used to obtain encodings for storage and later use on separate data.

    Smoothing addresses overfitting caused by sparse data by relying on the global mean
    (mean of target across all rows) rather than the local mean (mean of target across a
    specific category) when there are a small number of observations in a category. The
    degree of smoothing is controlled by the parameter 'smooth'. Higher values of 'smooth'
    increase the influence of the global mean on the target encoding. A 'smooth' value of
    100 can be interpreted as: there must be at least 100 values in the category for the
    sample mean to overtake the global mean.

    There is also an option to keep nan's as a category for cases of data missing not at
    random. The format of nan for categorical columns required for the function is 'nan'.

    Use kfold_target_encode to perform kfold encoding, and apply_target_encodings to use
    the output of this function on the test data.

    Parameters
    ----------
    train_data : dataframe
        data to be used for target encoding at a later stage. This is likely the full
        train data set.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target_col : str
        name of the target variable column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random, by
        default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to state
        names of columns if keep_nans_as_category is True.

    Returns
    -------
    encodings_all : dict
        encodings used for each column, as {column: {category: encoded value}}.

    Raises
    -------
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    ValueError
        error raised if keep_nans_as_category is True but columns not provided.

    """
    # Fail fast on inconsistent arguments before the (potentially expensive) fit.
    if keep_nans_as_category and cols_to_keep_nan_category is None:
        raise ValueError(
            "Parameter keep_nans_as_category is True but cols_to_keep_nan_category not provided."
        )

    train_data_to_encode = train_data[cols_to_encode]
    train_target = train_data[target_col]

    # Validate that missing values are recorded uniformly as the string 'nan'.
    # np.nan (not np.NaN, which was removed in NumPy 2.0) keeps this forward-compatible.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in train_data_to_encode:
        cat_present = train_data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in "
                "categorical columns are all recorded as 'nan'."
            )

        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )

    encoder = TargetEncoder(smooth=smooth).fit(train_data_to_encode, train_target)

    # Re-shape the fitted encoder's parallel arrays into a nested mapping:
    # {column name: {category: encoded value}}.
    encodings_all = {
        feature: dict(zip(categories, values))
        for feature, categories, values in zip(
            encoder.feature_names_in_, encoder.categories_, encoder.encodings_
        )
    }

    # Columns that should NOT keep 'nan' as a category get it mapped to np.nan,
    # so that missing values remain missing after encoding is applied.
    if keep_nans_as_category:
        cols_to_remove_nan_cat = set(cols_to_encode) - set(cols_to_keep_nan_category)
    else:
        cols_to_remove_nan_cat = set(cols_to_encode)
    for col_to_remove_nan in cols_to_remove_nan_cat:
        encodings_all[col_to_remove_nan]["nan"] = np.nan

    return encodings_all
| |
|
| |
|
def apply_target_encodings(*, data, cols_to_encode, encodings, drop_categorical_cols=False):
    """Target encode input data with supplied encodings.

    Each encoded column is appended to the output as '<col>_te'. Categories present in
    the data but absent from the supplied encodings are encoded as np.nan.

    Parameters
    ----------
    data : dataframe
        data with columns to be target encoded.
    cols_to_encode : list of strings
        list of columns to target encode.
    encodings : dict
        target encodings to use on input data (from training data), as
        {column: {category: encoded value}}. Not modified by this function.
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    encoded_data : dataframe
        target encoded version of the input data.

    Raises
    ------
    AssertionError
        raises an error if the column to be encoded is not in the passed encodings.
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.

    """
    data_to_encode = data[cols_to_encode]

    # Validate that missing values are recorded uniformly as the string 'nan'.
    # np.nan (not np.NaN, which was removed in NumPy 2.0) keeps this forward-compatible.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in data_to_encode:
        cat_present = data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in "
                "categorical columns are all recorded as 'nan'."
            )

        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )

    encoded_data = data.copy()
    for col in cols_to_encode:
        assert col in encodings, "No target encodings found for {} column".format(col)

        # Copy the per-column mapping so that categories unseen at fit time can be
        # mapped to NaN without mutating the caller's encodings dict (the previous
        # implementation updated it in place).
        encodings_col = dict(encodings[col])

        # Categories present in the data but missing from the encodings are
        # encoded as NaN rather than left as raw strings.
        unseen = np.setdiff1d(data[col].unique().tolist(), list(encodings_col))
        encodings_col.update({category: np.nan for category in unseen})

        # Build the encoded column alongside the original, suffixed with '_te'.
        filtered = encoded_data.filter(items=[col])
        filtered_encodings = filtered.replace(encodings_col)
        filtered_encodings = filtered_encodings.rename(columns={col: col + "_te"})
        encoded_data = pd.concat([encoded_data, filtered_encodings], axis=1)

    if drop_categorical_cols:
        encoded_data = encoded_data.drop(columns=cols_to_encode)
    return encoded_data
| |
|
| |
|
def kfold_target_encode(
    *,
    df,
    fold_ids,
    cols_to_encode,
    id_col,
    target,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
    drop_categorical_cols=False,
):
    """Perform K-fold target encoding.

    Fold by fold target encoding of train data is used to prevent data leakage in
    cross-validation (the same folds are used for encoding and CV). For example, in
    5-fold target encoding, each fold is encoded using the other 4 folds and that
    fold is then used as the validation fold in CV. Smoothing is performed on each
    K-fold.

    Parameters
    ----------
    df : dataframe
        data with columns to be target encoded. Will generally be the train data.
    fold_ids : list of arrays
        each array contains the validation patient IDs for each fold.
    cols_to_encode : list of strings
        columns to target encode.
    id_col : str
        name of patient ID column.
    target : str
        name of target column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random,
        by default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to
        state names of columns if keep_nans_as_category is True.
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    encoded_df_cv : dataframe
        k-fold target encoded version of the input data.
    fold_encodings_all : dataframe
        contains target encodings for each fold.

    """
    encoded_folds = []
    fold_encodings_all = []
    for fold in fold_ids:
        # Split rows by membership of the fold's validation IDs.
        in_fold = df[id_col].isin(fold)
        validation_fold = df[in_fold]
        train_folds = df[~in_fold]

        # Fit encodings on all rows outside the fold to avoid leakage.
        fold_encodings = get_target_encodings(
            train_data=train_folds,
            cols_to_encode=cols_to_encode,
            target_col=target,
            smooth=smooth,
            keep_nans_as_category=keep_nans_as_category,
            cols_to_keep_nan_category=cols_to_keep_nan_category,
        )
        fold_encodings_all.append(fold_encodings)

        # Encode only this fold's validation rows with those encodings.
        encoded_folds.append(
            apply_target_encodings(
                data=validation_fold,
                cols_to_encode=cols_to_encode,
                encodings=fold_encodings,
                drop_categorical_cols=drop_categorical_cols,
            )
        )

    # Reassemble the encoded validation folds into one frame, and flatten the
    # per-fold encoding dicts into a dataframe (one column per fold).
    encoded_df_cv = pd.concat(encoded_folds)
    fold_encodings_all = pd.json_normalize(fold_encodings_all).T
    return encoded_df_cv, fold_encodings_all