# copd-model-h / training / encoding.py — initial upload (commit 000de75)
"""Functions for encoding categorical data with additive smoothing techniques applied."""
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import TargetEncoder
def get_target_encodings(
    *,
    train_data,
    cols_to_encode,
    target_col,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
):
    """
    Retrieve target encodings of input data for later use (performs no encoding).

    The complete train data set is used to target encode the holdout test data set. This
    function is used to obtain encodings for storage and later use on separate data.

    Smoothing addresses overfitting caused by sparse data by relying on the global mean
    (mean of target across all rows) rather than the local mean (mean of target across a
    specific category) when there are a small number of observations in a category. The
    degree of smoothing is controlled by the parameter 'smooth'. Higher values of 'smooth'
    increases the influence of the global mean on the target encoding. A 'smooth' value of
    100 can be interpreted as: there must be at least 100 values in the category for the
    sample mean to overtake the global mean.

    There is also an option to keep nan's as a category for cases on data missing not at
    random. The format of nan for categorical columns required for the function is 'nan'.

    Use kfold_target_encode to perform kfold encoding, and apply_target_encodings to use the
    output of this function on the test data.

    Parameters
    ----------
    train_data : dataframe
        data to be used to for target encoding at a later stage. This is likely the full
        train data set.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target_col : str
        name of the target variable column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random, by
        default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to state
        names of columns if keep_nans_as_category is True.

    Returns
    -------
    encodings_all : dict
        encodings used for each column.

    Raises
    -------
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    ValueError
        error raised if keep_nans_as_category is True but columns not provided.
    """
    # Fail fast on inconsistent arguments (previously this was only checked
    # after the potentially expensive encoder fit).
    if keep_nans_as_category and cols_to_keep_nan_category is None:
        raise ValueError(
            "Parameter keep_nans_as_category is True but cols_to_keep_nan_category not provided."
        )
    train_data_to_encode = train_data[cols_to_encode]
    train_target = train_data[target_col]
    # Raise an error if there are multiple types of nan's. Note: np.nan is used
    # rather than the np.NaN alias, which was removed in NumPy 2.0.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in train_data_to_encode:
        cat_present = train_data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in"
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )
    encoder = TargetEncoder(smooth=smooth).fit(train_data_to_encode, train_target)
    # Build a {column: {category: encoded value}} lookup from the fitted encoder.
    encodings_all = {
        feature: dict(zip(categories, values))
        for feature, categories, values in zip(
            encoder.feature_names_in_, encoder.categories_, encoder.encodings_
        )
    }
    # Sklearn treats nans as a category. The default in this function is to convert nan
    # categories back to np.nan unless stated otherwise. If specific columns are chosen
    # to keep their nan category, only the remaining columns are converted.
    if keep_nans_as_category:
        cols_to_remove_nan_cat = set(cols_to_encode) - set(cols_to_keep_nan_category)
    else:
        cols_to_remove_nan_cat = set(cols_to_encode)
    for col_to_remove_nan in cols_to_remove_nan_cat:
        encodings_all[col_to_remove_nan]["nan"] = np.nan
    return encodings_all
def apply_target_encodings(*, data, cols_to_encode, encodings, drop_categorical_cols=False):
    """Target encode input data with supplied encodings.

    For each column in cols_to_encode a new column named '<col>_te' is appended
    containing the encoded values. Categories present in the data but absent
    from the supplied encodings are encoded as np.nan. The input 'encodings'
    mapping is not modified.

    Parameters
    ----------
    data : dataframe
        data with columns to be target encoded.
    cols_to_encode : list of strings
        list of columns to target encode.
    encodings : dict
        target encodings to use on input data (from training data).
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    data : dataframe
        target encoded version of the input data.

    Raises
    -------
    AssertionError
        raises an error if the column to be encoded is not in the passed encodings.
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    """
    data_to_encode = data[cols_to_encode]
    # Raise an error if there are multiple types of nan's. Note: np.nan is used
    # rather than the np.NaN alias, which was removed in NumPy 2.0.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in data_to_encode:
        cat_present = data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in"
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )
    encoded_data = data.copy()
    for col in cols_to_encode:
        assert col in encodings, "No target encodings found for {} column".format(col)
        # Copy the column's lookup so the caller's encodings dict is not mutated
        # when unseen categories are added below (the original updated in place).
        encodings_col = dict(encodings[col])
        # Account for the case where the new data includes a category not present
        # in the train data encodings and set that category encoding nan
        data_unique = data[col].unique().tolist()
        diffs = np.setdiff1d(data_unique, list(encodings_col))
        encodings_col.update(zip(diffs, itertools.repeat(np.nan)))
        # Use the lookup table to place each category in the current fold with its
        # encoded value from the train data (in the new _te column)
        filtered_encodings = (
            encoded_data.filter(items=[col])
            .replace(encodings_col)
            .rename(columns={col: col + "_te"})
        )
        encoded_data = pd.concat([encoded_data, filtered_encodings], axis=1)
    if drop_categorical_cols is True:
        encoded_data = encoded_data.drop(columns=cols_to_encode)
    return encoded_data
def kfold_target_encode(
    *,
    df,
    fold_ids,
    cols_to_encode,
    id_col,
    target,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
    drop_categorical_cols=False,
):
    """Perform K-fold target encoding.

    Each fold is encoded with encodings fitted on the remaining folds, so the
    same folds can safely be reused as validation folds in cross-validation
    without leaking target information. For example, with 5 folds, each fold's
    encodings come from the other 4 folds. Smoothing is performed per fold.

    Parameters
    ----------
    df : dataframe
        data with columns to be target encoded. Will generally be the train data.
    fold_ids : list of arrays
        each array contains the validation patient IDs for each fold.
    cols_to_encode : list of strings
        columns to target encode.
    id_col : str
        name of patient ID column.
    target : str
        name of target column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put
        more weight on the global target mean. If "auto", then smooth is set to
        an empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at
        random, by default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need
        to state names of columns if keep_nans_as_category is True.
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    encoded_df_cv : dataframe
        k-fold target encoded version of the input data.
    fold_encodings_all : dataframe
        contains target encodings for each fold.
    """
    encoded_folds = []
    per_fold_encodings = []
    for validation_ids in fold_ids:
        # Split into the validation fold and the remaining (train) folds.
        in_validation = df[id_col].isin(validation_ids)
        validation_fold = df[in_validation]
        train_folds = df[~in_validation]
        # Fit encodings on the train folds only.
        encodings = get_target_encodings(
            train_data=train_folds,
            cols_to_encode=cols_to_encode,
            target_col=target,
            smooth=smooth,
            keep_nans_as_category=keep_nans_as_category,
            cols_to_keep_nan_category=cols_to_keep_nan_category,
        )
        per_fold_encodings.append(encodings)
        # Encode the held-out validation fold with those encodings.
        encoded_folds.append(
            apply_target_encodings(
                data=validation_fold,
                cols_to_encode=cols_to_encode,
                encodings=encodings,
                drop_categorical_cols=drop_categorical_cols,
            )
        )
    # Stack the encoded validation folds back into a single dataframe and
    # flatten the per-fold encoding dicts into one dataframe.
    encoded_df_cv = pd.concat(encoded_folds)
    fold_encodings_all = pd.json_normalize(per_fold_encodings).T
    return encoded_df_cv, fold_encodings_all