|
|
|
|
|
|
|
|
from sklearn.base import BaseEstimator, TransformerMixin |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.preprocessing import OneHotEncoder |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from typing import Optional, Iterable, Any |
|
|
|
|
|
|
|
|
class ManualProductTypeMapper(BaseEstimator, TransformerMixin):
    """Collapse a product-type column onto a controlled set of categories.

    Values of ``product_col`` that belong to ``keep_set`` pass through
    unchanged; every other value (unwanted, rare, unknown or missing) is
    replaced by the label ``'Others'``.

    Parameters
    ----------
    product_col : str, default='Product_Type'
        Name of the DataFrame column to map.
    keep_set : iterable of str, optional
        Categories to preserve. It is required by ``fit``. A single bare
        string is treated as one category rather than being split into
        characters.

    Attributes
    ----------
    keep_set_ : set
        Materialised copy of ``keep_set``, created by ``fit``.
    """

    # Label assigned to every value that is not in ``keep_set_``.
    _OTHER_LABEL = 'Others'

    def __init__(self, product_col: str = 'Product_Type', keep_set: Optional[Iterable[str]] = None):
        # Parameters are stored untouched; validation happens in ``fit``.
        self.product_col = product_col
        self.keep_set = keep_set

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """Validate the inputs and store the set of categories to keep.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame that must contain ``product_col``.
        y : any, optional
            Ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame, if ``product_col`` is missing from
            ``X``, or if ``keep_set`` is ``None`` or empty.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("fit expects X to be a pandas DataFrame")
        if self.product_col not in X.columns:
            raise ValueError(f"product_col '{self.product_col}' not found in X during fit")

        if self.keep_set is None:
            raise ValueError("ManualProductTypeMapper requires a non-empty keep_set (pass an iterable of values)")

        # A bare string is itself iterable: set("Dairy") would silently yield
        # the single characters, so treat it as one category instead.
        if isinstance(self.keep_set, str):
            kept = {self.keep_set}
        else:
            kept = set(self.keep_set)

        # The message above promises a non-empty set; enforce that here too.
        if not kept:
            raise ValueError("ManualProductTypeMapper requires a non-empty keep_set (pass an iterable of values)")

        self.keep_set_ = kept
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with non-kept product types set to 'Others'.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame that must contain ``product_col``. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which every ``product_col`` value that is not a
            member of ``keep_set_`` has been replaced by ``'Others'``.

        Raises
        ------
        ValueError
            If called before ``fit``, if ``X`` is not a DataFrame, or if
            ``product_col`` is missing from ``X``.
        """
        if not hasattr(self, 'keep_set_'):
            raise ValueError("transform called before fit(). Call fit(X) first.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("transform expects a pandas DataFrame")
        if self.product_col not in X.columns:
            raise ValueError(f"product_col '{self.product_col}' not found in X during transform")

        kept = self.keep_set_
        other = self._OTHER_LABEL

        # Work on a copy so the caller's frame is left untouched.
        mapped = X.copy()
        mapped[self.product_col] = mapped[self.product_col].apply(
            lambda value: value if value in kept else other
        )
        return mapped

    def fit_transform(self, X: pd.DataFrame, y: Optional[Any] = None, **fit_params) -> pd.DataFrame:
        """Fit the transformer and transform ``X`` in one step.

        If the transformed data contains no ``'Others'`` row, one synthetic
        row with ``product_col == 'Others'`` is appended so that the training
        data exposes that category (intended for downstream OneHotEncoder
        compatibility).

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame containing ``product_col``.
        y : any, optional
            Forwarded to ``fit``, which ignores it.
        **fit_params
            Accepted for signature compatibility and ignored.

        Returns
        -------
        pandas.DataFrame
            The transformed frame. When a synthetic row is appended the
            result has one more row than ``X`` and its index is reset to a
            fresh ``RangeIndex``; otherwise the index of ``X`` is preserved.

        Notes
        -----
        ``y`` is never extended, so when the synthetic row is added the
        result is one row longer than any ``y`` supplied alongside ``X``.
        NOTE(review): callers pairing the output with ``y`` must account for
        that extra row - confirm against the calling pipeline.
        """
        X_trans = self.fit(X, y).transform(X)

        # Nothing to add when the catch-all category is already present.
        if X_trans[self.product_col].eq(self._OTHER_LABEL).any():
            return X_trans

        synthetic_df = pd.DataFrame([self._synthetic_row(X_trans)], columns=X_trans.columns)
        return pd.concat([X_trans, synthetic_df], ignore_index=True)

    def _synthetic_row(self, frame: pd.DataFrame) -> dict:
        """Build one filler row whose product type is the 'Others' label.

        Every other column receives a representative value taken from its
        non-null entries: the mode for object/categorical/string columns,
        the median (as ``float``) for numeric columns, the first value for
        any other dtype, and ``NaN`` when the column has no non-null entries.
        Because the median is a ``float``, appending the row upcasts integer
        columns.
        """
        row: dict = {}
        for col in frame.columns:
            if col == self.product_col:
                row[col] = self._OTHER_LABEL
                continue

            observed = frame[col].dropna()
            if observed.empty:
                row[col] = np.nan
            elif (
                pd.api.types.is_object_dtype(observed)
                # isinstance check replaces the deprecated is_categorical_dtype.
                or isinstance(observed.dtype, pd.CategoricalDtype)
                or pd.api.types.is_string_dtype(observed)
            ):
                row[col] = observed.mode().iloc[0]
            elif pd.api.types.is_numeric_dtype(observed):
                row[col] = float(observed.median())
            else:
                row[col] = observed.iloc[0]
        return row
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Product types that are kept as categories in their own right.
# NOTE(review): presumably passed as ``keep_set`` to ManualProductTypeMapper -
# confirm at the call site.
KEEP_PRODUCT_TYPES = {
    'Fruits and Vegetables',
    'Snack Foods',
    'Dairy',
    'Frozen Foods',
    'Household',
    'Baking Goods',
    'Canned',
    'Health and Hygiene',
    'Meat',
    'Soft Drinks',
}
|
|
|