File size: 2,967 Bytes
d633e66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

import pandas as pd
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Custom transformer to extract Product Prefix
class ProductPrefixExtractor(BaseEstimator, TransformerMixin):
    """Add a column holding the leading characters of an identifier column.

    Parameters
    ----------
    input_column : str, default='Product_Id'
        Name of the string column the prefix is read from.
    output_column : str, default='Product_Prefix'
        Name of the column that receives the extracted prefix. An existing
        column with this name is overwritten in the returned copy.
    prefix_length : int, default=2
        Number of leading characters to keep.
    """

    def __init__(self, input_column='Product_Id', output_column='Product_Prefix',
                 prefix_length=2):
        # Parameters are stored verbatim (no logic here) so that
        # get_params / set_params / clone keep working.
        self.input_column = input_column
        self.output_column = output_column
        self.prefix_length = prefix_length

    def fit(self, X, y=None):
        """Return self unchanged; this transformer learns nothing from X."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the prefix column added.

        The caller's frame is not modified. The ``.str`` accessor is used, so
        ``input_column`` must hold string-like values; missing values stay
        missing in the output column.
        """
        X_copy = X.copy()
        X_copy[self.output_column] = X_copy[self.input_column].str[:self.prefix_length]
        return X_copy


# Convert Store_Establishment_Year to Store_Age
def convert_year_to_age(df, current_year=None):
    """Replace ``Store_Establishment_Year`` with a derived ``Store_Age`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Store_Establishment_Year`` column.
    current_year : int, optional
        Reference year the age is measured against. When omitted, the
        calendar year at call time is used. Pass a fixed year to get the same
        ages at training and inference time regardless of when the code runs.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Store_Age`` added and
        ``Store_Establishment_Year`` removed. ``df`` itself is not modified.
    """
    if current_year is None:
        current_year = datetime.now().year
    result = df.copy()
    result['Store_Age'] = current_year - result['Store_Establishment_Year']
    return result.drop(columns=['Store_Establishment_Year'])


# Pipeline-compatible wrapper that applies convert_year_to_age to the frame
# it receives.
store_age_transformer = FunctionTransformer(func=convert_year_to_age)


# Custom transformer to bin Store_Age into categories
class StoreAgeBinner(BaseEstimator, TransformerMixin):
    """Bucket a numeric store-age column into labelled categories.

    Parameters
    ----------
    bins : sequence of numbers, optional
        Bin edges handed to ``pandas.cut``. ``None`` selects
        ``[0, 10, 30, 100]``.
    labels : sequence of str, optional
        One label per bin. ``None`` selects ``['New', 'Mid', 'Old']``.
    input_column : str, default='Store_Age'
        Column holding the age to bin.
    output_column : str, default='Store_Age_Bin'
        Column that receives the bin label.
    """

    # Defaults live on the class and are resolved in ``transform`` rather than
    # in ``__init__``: scikit-learn expects the constructor to store its
    # arguments untouched, otherwise ``set_params(bins=None)`` would bypass
    # the fallback and fail inside ``pd.cut``.
    DEFAULT_BINS = (0, 10, 30, 100)
    DEFAULT_LABELS = ('New', 'Mid', 'Old')

    def __init__(self, bins=None, labels=None, input_column='Store_Age',
                 output_column='Store_Age_Bin'):
        self.bins = bins
        self.labels = labels
        self.input_column = input_column
        self.output_column = output_column

    def fit(self, X, y=None):
        """Return self unchanged; this transformer learns nothing from X."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the binned-age column added.

        Bins are closed on the left and open on the right (``right=False``),
        so with the default edges an age of 10 is labelled ``'Mid'``. Ages
        outside the outermost edges become missing values.
        """
        bins = list(self.DEFAULT_BINS) if self.bins is None else self.bins
        labels = list(self.DEFAULT_LABELS) if self.labels is None else self.labels
        X_copy = X.copy()
        X_copy[self.output_column] = pd.cut(
            X_copy[self.input_column], bins=bins, labels=labels, right=False
        )
        return X_copy


store_age_binner = StoreAgeBinner()


# Feature groups
# Ordered categorical columns mapped to their levels, lowest rank first.
# Keeping each column next to its levels guarantees that the two lists
# derived below stay aligned position by position.
_ORDINAL_LEVELS = {
    'Store_Size': ['Small', 'Medium', 'High'],
    'Store_Location_City_Type': ['Tier 3', 'Tier 2', 'Tier 1'],
    'Store_Age_Bin': ['New', 'Mid', 'Old'],
}
ordinal_features = list(_ORDINAL_LEVELS)
ordinal_categories = list(_ORDINAL_LEVELS.values())

# Categorical columns with no inherent order.
nominal_features = [
    'Store_Id',
    'Store_Type',
    'Product_Prefix',
    'Product_Sugar_Content',
    'Product_Type',
]

# Continuous numeric columns.
numerical_features = [
    'Product_Weight',
    'Product_Allocated_Area',
    'Product_MRP',
]


# Encoders and scaler
# Numeric columns: standardize with StandardScaler's default settings.
scaler = StandardScaler()
# Nominal columns: one-hot encode; categories not seen during fit are ignored
# at transform time instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
# Ordinal columns: integer codes that follow the explicit level order.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
)


# Pipelines for numeric and categorical features
# Numeric branch: a single scaling step.
numeric_pipeline = Pipeline(steps=[('scaler', scaler)])

# Categorical branch: ordinal and nominal columns get their own encoder;
# any other column reaching this transformer is discarded.
categorical_pipeline = ColumnTransformer(
    [
        ('ord', ordinal_encoder, ordinal_features),
        ('onehot', onehot_encoder, nominal_features),
    ],
    remainder='drop',
)


# Full preprocessing pipeline
preprocessor = Pipeline(steps=[
    # 1. Derive the product prefix column from Product_Id.
    ('extract_prefix', ProductPrefixExtractor(input_column='Product_Id')),
    # 2. Swap Store_Establishment_Year for Store_Age.
    ('convert_age', store_age_transformer),
    # 3. Add the binned store-age column.
    ('bin_age', store_age_binner),
    # 4. Scale numeric columns and encode categorical ones; every column not
    #    listed in a feature group is dropped from the output.
    ('preprocess', ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numerical_features),
            ('cat', categorical_pipeline, ordinal_features + nominal_features),
        ],
        remainder='drop',
    )),
])