# copd-model-h / training / encoding.py — initial upload (commit 000de75)
"""Functions for encoding categorical data with additive smoothing techniques applied."""
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import TargetEncoder
def get_target_encodings(
    *,
    train_data,
    cols_to_encode,
    target_col,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
):
    """
    Retrieve target encodings of input data for later use (performs no encoding).

    The complete train data set is used to target encode the holdout test data set. This
    function is used to obtain encodings for storage and later use on separate data.

    Smoothing addresses overfitting caused by sparse data by relying on the global mean
    (mean of target across all rows) rather than the local mean (mean of target across a
    specific category) when there are a small number of observations in a category. The
    degree of smoothing is controlled by the parameter 'smooth'. Higher values of 'smooth'
    increases the influence of the global mean on the target encoding. A 'smooth' value of
    100 can be interpreted as: there must be at least 100 values in the category for the
    sample mean to overtake the global mean.

    There is also an option to keep nan's as a category for cases on data missing not at
    random. The format of nan for categorical columns required for the function is 'nan'.

    Use kfold_target_encode to perform kfold encoding, and apply_target_encodings to use the
    output of this function on the test data.

    Parameters
    ----------
    train_data : dataframe
        data to be used to for target encoding at a later stage. This is likely the full
        train data set.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target_col : str
        name of the target variable column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put more
        weight on the global target mean. If "auto", then smooth is set to an
        empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at random, by
        default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need to state
        names of columns if keep_nans_as_category is True.

    Returns
    -------
    encodings_all : dict
        encodings used for each column.

    Raises
    -------
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    ValueError
        error raised if keep_nans_as_category is True but columns not provided.
    """
    # Fail fast on inconsistent arguments (previously this was only checked
    # after the potentially expensive encoder fit).
    if keep_nans_as_category and cols_to_keep_nan_category is None:
        raise ValueError(
            "Parameter keep_nans_as_category is True but cols_to_keep_nan_category not provided."
        )
    train_data_to_encode = train_data[cols_to_encode]
    train_target = train_data[target_col]
    # Raise an error if there are multiple types of nan's. Note: np.nan is used
    # rather than the np.NaN alias, which was removed in NumPy 2.0.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in train_data_to_encode:
        cat_present = train_data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in"
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )
    encoder = TargetEncoder(smooth=smooth).fit(train_data_to_encode, train_target)
    # Build a {column: {category: encoded value}} lookup from the fitted encoder.
    encodings_all = {
        feature: dict(zip(categories, values))
        for feature, categories, values in zip(
            encoder.feature_names_in_, encoder.categories_, encoder.encodings_
        )
    }
    # Sklearn treats nans as a category. The default in this function is to convert nan
    # categories back to np.nan unless stated otherwise. If specific columns are chosen
    # to keep their nan category, only the remaining columns are converted.
    if keep_nans_as_category:
        cols_to_remove_nan_cat = set(cols_to_encode) - set(cols_to_keep_nan_category)
    else:
        cols_to_remove_nan_cat = set(cols_to_encode)
    for col_to_remove_nan in cols_to_remove_nan_cat:
        encodings_all[col_to_remove_nan]["nan"] = np.nan
    return encodings_all
def apply_target_encodings(*, data, cols_to_encode, encodings, drop_categorical_cols=False):
    """Target encode input data with supplied encodings.

    For each column in cols_to_encode a new column named '<col>_te' is appended
    containing the encoded values. Categories present in the data but absent
    from the supplied encodings are encoded as np.nan. The input 'encodings'
    mapping is not modified.

    Parameters
    ----------
    data : dataframe
        data with columns to be target encoded.
    cols_to_encode : list of strings
        list of columns to target encode.
    encodings : dict
        target encodings to use on input data (from training data).
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    data : dataframe
        target encoded version of the input data.

    Raises
    -------
    AssertionError
        raises an error if the column to be encoded is not in the passed encodings.
    ValueError
        error raised if there are multiple types of nan's in columns to be encoded.
    ValueError
        error raised if nans are not in the correct format: 'nan'.
    """
    data_to_encode = data[cols_to_encode]
    # Raise an error if there are multiple types of nan's. Note: np.nan is used
    # rather than the np.NaN alias, which was removed in NumPy 2.0.
    all_nan_types = [None, "None", np.nan, "nan", "NAN", "N/A"]
    for col in data_to_encode:
        cat_present = data_to_encode[col].unique().tolist()
        if len(set(all_nan_types) & set(cat_present)) > 1:
            raise ValueError(
                "Multiple types of nans present in data. Make sure that missing values in"
                "categorical columns are all recorded as 'nan'."
            )
        # Raise an error if nan not in correct format for function
        if any(element in all_nan_types for element in cat_present):
            if "nan" not in cat_present:
                raise ValueError(
                    "Missing values in categorical columns are not recorded as 'nan'."
                )
    encoded_data = data.copy()
    for col in cols_to_encode:
        assert col in encodings, "No target encodings found for {} column".format(col)
        # Copy the column's lookup so the caller's encodings dict is not mutated
        # when unseen categories are added below (the original updated in place).
        encodings_col = dict(encodings[col])
        # Account for the case where the new data includes a category not present
        # in the train data encodings and set that category encoding nan
        data_unique = data[col].unique().tolist()
        diffs = np.setdiff1d(data_unique, list(encodings_col))
        encodings_col.update(zip(diffs, itertools.repeat(np.nan)))
        # Use the lookup table to place each category in the current fold with its
        # encoded value from the train data (in the new _te column)
        filtered_encodings = (
            encoded_data.filter(items=[col])
            .replace(encodings_col)
            .rename(columns={col: col + "_te"})
        )
        encoded_data = pd.concat([encoded_data, filtered_encodings], axis=1)
    if drop_categorical_cols is True:
        encoded_data = encoded_data.drop(columns=cols_to_encode)
    return encoded_data
def kfold_target_encode(
    *,
    df,
    fold_ids,
    cols_to_encode,
    id_col,
    target,
    smooth="auto",
    keep_nans_as_category=False,
    cols_to_keep_nan_category=None,
    drop_categorical_cols=False,
):
    """Perform K-fold target encoding.

    Each fold is encoded with encodings fitted on the remaining folds, so the
    same folds can safely be reused as validation folds in cross-validation
    without leaking target information. For example, with 5 folds, each fold's
    encodings come from the other 4 folds. Smoothing is performed per fold.

    Parameters
    ----------
    df : dataframe
        data with columns to be target encoded. Will generally be the train data.
    fold_ids : list of arrays
        each array contains the validation patient IDs for each fold.
    cols_to_encode : list of strings
        columns to target encode.
    id_col : str
        name of patient ID column.
    target : str
        name of target column.
    smooth : str or float, optional
        controls the amount of smoothing applied. A larger smooth value will put
        more weight on the global target mean. If "auto", then smooth is set to
        an empirical Bayes estimate, defaults to "auto".
    keep_nans_as_category : bool, optional
        option to retain nans as a category for cases of data missing not at
        random, by default False.
    cols_to_keep_nan_category : list of strings, optional
        names of columns to keep the encoded nan category, by default None. Need
        to state names of columns if keep_nans_as_category is True.
    drop_categorical_cols: bool, optional
        option to drop categorical columns after encoding, defaults to False.

    Returns
    -------
    encoded_df_cv : dataframe
        k-fold target encoded version of the input data.
    fold_encodings_all : dataframe
        contains target encodings for each fold.
    """
    encoded_folds = []
    per_fold_encodings = []
    for validation_ids in fold_ids:
        # Split into the validation fold and the remaining (train) folds.
        in_validation = df[id_col].isin(validation_ids)
        validation_fold = df[in_validation]
        train_folds = df[~in_validation]
        # Fit encodings on the train folds only.
        encodings = get_target_encodings(
            train_data=train_folds,
            cols_to_encode=cols_to_encode,
            target_col=target,
            smooth=smooth,
            keep_nans_as_category=keep_nans_as_category,
            cols_to_keep_nan_category=cols_to_keep_nan_category,
        )
        per_fold_encodings.append(encodings)
        # Encode the held-out validation fold with those encodings.
        encoded_folds.append(
            apply_target_encodings(
                data=validation_fold,
                cols_to_encode=cols_to_encode,
                encodings=encodings,
                drop_categorical_cols=drop_categorical_cols,
            )
        )
    # Stack the encoded validation folds back into a single dataframe and
    # flatten the per-fold encoding dicts into one dataframe.
    encoded_df_cv = pd.concat(encoded_folds)
    fold_encodings_all = pd.json_normalize(per_fold_encodings).T
    return encoded_df_cv, fold_encodings_all