|
|
|
|
|
import pandas as pd |
|
|
from datetime import datetime |
|
|
from sklearn.base import BaseEstimator, TransformerMixin |
|
|
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.compose import ColumnTransformer |
|
|
|
|
|
|
|
|
|
|
|
class ProductPrefixExtractor(BaseEstimator, TransformerMixin):
    """Add a column holding the leading characters of a string column.

    Parameters
    ----------
    input_column : str, default='Product_Id'
        Name of the column whose values are sliced.
    output_column : str, default='Product_Prefix'
        Name of the column that receives the sliced prefix.
    prefix_length : int, default=2
        Number of leading characters to keep.
    """

    def __init__(self, input_column='Product_Id', output_column='Product_Prefix',
                 prefix_length=2):
        # Constructor arguments are stored unmodified under their own names so
        # that BaseEstimator.get_params / set_params can discover them.
        self.input_column = input_column
        self.output_column = output_column
        self.prefix_length = prefix_length

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_column`` added.

        The input frame is not modified. ``input_column`` is accessed through
        the pandas ``.str`` accessor.
        """
        X_copy = X.copy()
        X_copy[self.output_column] = (
            X_copy[self.input_column].str[:self.prefix_length]
        )
        return X_copy
|
|
|
|
|
|
|
|
|
|
|
def convert_year_to_age(df, reference_year=None):
    """Replace ``Store_Establishment_Year`` with a ``Store_Age`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Store_Establishment_Year`` column.
    reference_year : int, optional
        Year the age is measured against. When omitted, the current calendar
        year is used, which makes the output depend on when the function runs;
        pass a fixed year to get reproducible ages.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Store_Age`` appended and
        ``Store_Establishment_Year`` removed. ``df`` itself is not modified.
    """
    if reference_year is None:
        reference_year = datetime.now().year

    # Work on a copy so the caller's frame is left untouched.
    aged = df.copy()
    aged['Store_Age'] = reference_year - aged['Store_Establishment_Year']
    return aged.drop(columns=['Store_Establishment_Year'])
|
|
|
|
|
|
|
|
# Wrap convert_year_to_age so it can be used as a step in a scikit-learn Pipeline.
store_age_transformer = FunctionTransformer(convert_year_to_age)
|
|
|
|
|
|
|
|
|
|
|
class StoreAgeBinner(BaseEstimator, TransformerMixin):
    """Bucket a numeric age column into labelled intervals with ``pandas.cut``.

    Parameters
    ----------
    bins : list, optional
        Bin edges passed to ``pandas.cut``. Defaults to ``[0, 10, 30, 100]``.
    labels : list, optional
        One label per interval. Defaults to ``['New', 'Mid', 'Old']``.
    input_column : str, default='Store_Age'
        Name of the column that is binned.
    output_column : str, default='Store_Age_Bin'
        Name of the column that receives the bin label.
    """

    def __init__(self, bins=None, labels=None,
                 input_column='Store_Age', output_column='Store_Age_Bin'):
        # ``None`` stands in for the default lists so that no mutable object is
        # shared between instances through the signature.
        # NOTE(review): scikit-learn's estimator guidelines prefer storing
        # constructor arguments unmodified; the substitution is kept here so
        # ``.bins`` / ``.labels`` keep exposing the concrete lists.
        self.bins = bins if bins is not None else [0, 10, 30, 100]
        self.labels = labels if labels is not None else ['New', 'Mid', 'Old']
        self.input_column = input_column
        self.output_column = output_column

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_column`` added.

        ``right=False`` makes every interval closed on the left and open on
        the right, e.g. ``[0, 10)``, ``[10, 30)``, ``[30, 100)`` for the
        default edges. Values outside the outermost edges become NaN.
        """
        X_copy = X.copy()
        X_copy[self.output_column] = pd.cut(
            X_copy[self.input_column],
            bins=self.bins,
            labels=self.labels,
            right=False,
        )
        return X_copy
|
|
|
|
|
|
|
|
# Binner instance built with the class defaults for edges and labels.
store_age_binner = StoreAgeBinner()
|
|
|
|
|
|
|
|
|
|
|
# Columns with a natural order. ``ordinal_categories`` is positional: entry i
# lists the levels of ``ordinal_features[i]`` from lowest to highest.
ordinal_features = ['Store_Size', 'Store_Location_City_Type', 'Store_Age_Bin']
ordinal_categories = [
    ['Small', 'Medium', 'High'],
    ['Tier 3', 'Tier 2', 'Tier 1'],
    ['New', 'Mid', 'Old'],
]

# Unordered categorical columns.
nominal_features = [
    'Store_Id',
    'Store_Type',
    'Product_Prefix',
    'Product_Sugar_Content',
    'Product_Type',
]

# Continuous numeric columns.
numerical_features = ['Product_Weight', 'Product_Allocated_Area', 'Product_MRP']
|
|
|
|
|
|
|
|
|
|
|
# Ordinal encoder with an explicit level order per column (see ordinal_categories).
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

# One-hot encoder; handle_unknown='ignore' means categories unseen during fit
# do not raise at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Standardizes numeric columns.
scaler = StandardScaler()
|
|
|
|
|
|
|
|
|
|
|
# Numeric branch: a single scaling step.
numeric_pipeline = Pipeline([
    ('scaler', scaler),
])
|
|
|
|
|
# Categorical branch: ordinal columns go to the ordinal encoder, nominal
# columns to the one-hot encoder; any other column it receives is dropped.
categorical_pipeline = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_features),
        ('onehot', onehot_encoder, nominal_features),
    ],
    remainder='drop',
)
|
|
|
|
|
|
|
|
|
|
|
# End-to-end preprocessing. The first three steps add the derived columns
# (Product_Prefix, Store_Age, Store_Age_Bin); the final ColumnTransformer then
# routes the numeric and categorical columns to their branches. Columns not
# listed in numerical_features, ordinal_features or nominal_features are
# dropped by remainder='drop'.
preprocessor = Pipeline([
    ('extract_prefix', ProductPrefixExtractor(input_column='Product_Id')),
    ('convert_age', store_age_transformer),
    ('bin_age', store_age_binner),
    ('preprocess', ColumnTransformer([
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, ordinal_features + nominal_features),
    ], remainder='drop')),
])
|
|
|