File size: 2,967 Bytes
d633e66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import pandas as pd
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Custom transformer to extract Product Prefix
class ProductPrefixExtractor(BaseEstimator, TransformerMixin):
    """Add a column holding the leading characters of a string column.

    Parameters
    ----------
    input_column : str, default='Product_Id'
        Name of the column whose values are sliced.
    output_column : str, default='Product_Prefix'
        Name of the column that receives the extracted prefix.
    prefix_length : int, default=2
        Number of leading characters kept from each value.
    """

    def __init__(self, input_column='Product_Id', output_column='Product_Prefix',
                 prefix_length=2):
        # Constructor arguments are stored untouched so that
        # get_params / set_params / clone behave as scikit-learn expects.
        self.input_column = input_column
        self.output_column = output_column
        self.prefix_length = prefix_length

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the prefix column added.

        The input frame is not modified. ``input_column`` is accessed through
        the pandas ``.str`` accessor, so it is expected to hold strings.
        """
        X_copy = X.copy()
        X_copy[self.output_column] = X_copy[self.input_column].str[:self.prefix_length]
        return X_copy
# Convert Store_Establishment_Year to Store_Age
def convert_year_to_age(df, current_year=None):
    """Replace ``Store_Establishment_Year`` with a ``Store_Age`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Store_Establishment_Year`` column.
    current_year : int, optional
        Reference year the age is measured against. When omitted, the year of
        ``datetime.now()`` is used. Pass a fixed value (for example through
        ``FunctionTransformer(..., kw_args={'current_year': 2024})``) to keep
        the feature identical between training and later inference runs.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Store_Age`` set to
        ``current_year - Store_Establishment_Year`` and the
        ``Store_Establishment_Year`` column removed. ``df`` is not modified.
    """
    if current_year is None:
        current_year = datetime.now().year
    df = df.copy()
    df['Store_Age'] = current_year - df['Store_Establishment_Year']
    return df.drop(columns=['Store_Establishment_Year'])
# Wrap the year-to-age conversion so it can be used as a pipeline step.
store_age_transformer = FunctionTransformer(func=convert_year_to_age)
# Custom transformer to bin Store_Age into categories
class StoreAgeBinner(BaseEstimator, TransformerMixin):
    """Bucket a numeric store-age column into labelled categories.

    Parameters
    ----------
    bins : sequence of numbers, optional
        Bin edges handed to ``pandas.cut``. ``None`` means
        ``[0, 10, 30, 100]``.
    labels : sequence of str, optional
        One label per bin. ``None`` means ``['New', 'Mid', 'Old']``.
    input_column : str, default='Store_Age'
        Column holding the numeric ages.
    output_column : str, default='Store_Age_Bin'
        Column that receives the categorical bin label.
    """

    _DEFAULT_BINS = (0, 10, 30, 100)
    _DEFAULT_LABELS = ('New', 'Mid', 'Old')

    def __init__(self, bins=None, labels=None, input_column='Store_Age',
                 output_column='Store_Age_Bin'):
        # Store the arguments exactly as given (no defaulting here): the
        # scikit-learn estimator contract requires __init__ to be logic-free
        # so that get_params / set_params / clone round-trip correctly.
        self.bins = bins
        self.labels = labels
        self.input_column = input_column
        self.output_column = output_column

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the binned-age column added.

        Bins are left-closed / right-open (``right=False``), so with the
        default edges an age of 10 falls in ``'Mid'``. Per the ``pandas.cut``
        documentation, values outside every bin become NA.
        """
        # Defaults are resolved at use time so that set_params(bins=None)
        # keeps working after construction.
        bins = list(self._DEFAULT_BINS) if self.bins is None else self.bins
        labels = list(self._DEFAULT_LABELS) if self.labels is None else self.labels
        X_copy = X.copy()
        X_copy[self.output_column] = pd.cut(
            X_copy[self.input_column], bins=bins, labels=labels, right=False
        )
        return X_copy
# Shared binner instance built with the class defaults (no custom bins or labels).
store_age_binner = StoreAgeBinner(bins=None, labels=None)
# Feature groups
# Each ordinal column mapped to its category levels, listed in encoding order.
# Keeping column and levels side by side prevents the two lists below from
# drifting out of alignment.
_ordinal_levels = {
    'Store_Size': ['Small', 'Medium', 'High'],
    'Store_Location_City_Type': ['Tier 3', 'Tier 2', 'Tier 1'],
    'Store_Age_Bin': ['New', 'Mid', 'Old'],
}
ordinal_features = list(_ordinal_levels)
ordinal_categories = list(_ordinal_levels.values())

# Unordered categorical columns.
nominal_features = [
    'Store_Id',
    'Store_Type',
    'Product_Prefix',
    'Product_Sugar_Content',
    'Product_Type',
]

# Continuous columns.
numerical_features = [
    'Product_Weight',
    'Product_Allocated_Area',
    'Product_MRP',
]
# Encoders and scaler
# Standardises the numeric columns.
scaler = StandardScaler()
# One-hot encodes the nominal columns; categories not seen during fit are
# ignored at transform time instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
# Encodes the ordinal columns using the explicit level order given in
# ordinal_categories.
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
# Numeric pipeline and categorical column transformer
# Numeric branch: a single scaling step.
numeric_pipeline = Pipeline(steps=[('scaler', scaler)])

# Categorical branch: ordinal columns go to the ordinal encoder, nominal
# columns to the one-hot encoder; any other column is dropped.
categorical_pipeline = ColumnTransformer(
    [
        ('ord', ordinal_encoder, ordinal_features),
        ('onehot', onehot_encoder, nominal_features),
    ],
    remainder='drop',
)
# Full preprocessing pipeline
# Final stage: route numeric and categorical columns to their branches.
# Columns in neither group (including the raw Store_Age) are dropped.
_column_stage = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, ordinal_features + nominal_features),
    ],
    remainder='drop',
)

# Steps run in order: derive Product_Prefix, derive Store_Age, derive
# Store_Age_Bin, then scale/encode the selected columns.
preprocessor = Pipeline(steps=[
    ('extract_prefix', ProductPrefixExtractor(input_column='Product_Id')),
    ('convert_age', store_age_transformer),
    ('bin_age', store_age_binner),
    ('preprocess', _column_stage),
])
|