|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
class MeanEncoding:
    """Replace each category label with the mean of the target for that label.

    Parameters
    ----------
    mapping : list of dict, optional
        Pre-computed encoding. Each entry is a dict with the keys ``'col'``
        (column name) and ``'mapping'`` (a ``pandas.Series`` or dict going
        from label to encoded value). When given, ``fit`` keeps it as-is
        instead of learning a new one.
    cols : list, optional
        Names of the columns to encode. Required when ``mapping`` is not
        supplied; ignored when a mapping is available.
    """

    def __init__(self, mapping=None, cols=None):
        self.cols = cols
        self.mapping = mapping
        # Number of columns seen by ``fit``; ``None`` means "not fitted yet".
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        If a mapping is already stored on the encoder (passed to the
        constructor or learned by an earlier ``fit``), it is reused rather
        than recomputed.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data containing the columns listed in ``cols``.
        y : pandas.Series or array-like, shape = [n_samples]
            Target values. Needed whenever no mapping is stored yet.
        **kwargs
            Accepted for interface compatibility; not used.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If no mapping is stored and ``y`` or ``cols`` is missing.
        """
        _, categories = self.mean_encoding(
            X,
            y,
            mapping=self.mapping,
            cols=self.cols,
        )
        # Record fitted state only after the encoding succeeded, so a failed
        # fit does not leave the encoder looking fitted.
        self._dim = X.shape[1]
        self.mapping = categories
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Uses the stored mapping to replace labels by their encoded value.
        Labels absent from the mapping become NaN.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Must have the same number of columns as the data passed to
            ``fit``.

        Returns
        -------
        X : pandas.DataFrame
            Copy of the input with the encoding applied.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if the number of columns
            differs from the one seen during ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.mean_encoding(
            X,
            mapping=self.mapping,
            cols=self.cols,
        )
        return X

    @staticmethod
    def _resolve_target(X, y):
        """Return the target as a Series usable as groupby values for ``X``."""
        if y is None:
            raise ValueError('y is required to learn a mean encoding when no mapping is supplied.')

        # Keep the historical behaviour: when the target column is also
        # present in X, read it from there.
        name = getattr(y, 'name', None)
        if name is not None and name in X.columns:
            return X[name]

        if isinstance(y, pd.Series):
            # pandas aligns the grouping key with the target on the index.
            return y

        # Plain array-like: pair it positionally with the rows of X.
        return pd.Series(y, index=X.index)

    def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
        """Apply an existing mean encoding, or learn one from ``X_in`` and ``y``.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is never modified, a deep copy is used instead.
        y : pandas.Series or array-like, optional
            Target values. Only used when ``mapping`` is ``None``.
        mapping : list of dict, optional
            Encoding to apply. When given, every listed column is mapped
            through its ``'mapping'`` entry and ``y``/``cols`` are ignored.
        cols : list, optional
            Columns to learn an encoding for when ``mapping`` is ``None``.

        Returns
        -------
        X : pandas.DataFrame
            Deep copy of ``X_in``. It is encoded when ``mapping`` was
            supplied and left unchanged when the mapping is being learned.
        mapping_out : list of dict
            The supplied mapping, or the newly learned one: one dict per
            column with the keys ``'col'``, ``'mapping'`` (Series of target
            means indexed by label) and ``'data_type'``.

        Raises
        ------
        ValueError
            If ``mapping`` is ``None`` and ``y`` or ``cols`` is missing.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:
            for entry in mapping:
                column = entry.get('col')
                X[column] = X[column].map(entry['mapping'])
            return X, mapping

        if cols is None:
            raise ValueError('cols is required to learn a mean encoding when no mapping is supplied.')

        target = self._resolve_target(X, y)

        mapping_out = []
        for col in cols:
            # Round-trip through a dict so the stored Series has a plain
            # label index, as in earlier versions of this encoder.
            means = pd.Series(target.groupby(X[col]).mean().to_dict())
            mapping_out.append({'col': col, 'mapping': means, 'data_type': X[col].dtype})

        return X, mapping_out