"""Feature-engineering and preprocessing pipeline built from scikit-learn parts."""

from datetime import datetime

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)


# Custom transformer to extract Product Prefix
class ProductPrefixExtractor(BaseEstimator, TransformerMixin):
    """Add a column holding the first two characters of another column.

    Parameters
    ----------
    input_column : str
        Name of the column to read; it is accessed through the pandas
        ``.str`` accessor, so it must hold string values.
    output_column : str
        Name of the column that receives the two-character prefix.
    """

    def __init__(self, input_column='Product_Id', output_column='Product_Prefix'):
        self.input_column = input_column
        self.output_column = output_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_column`` added."""
        # Work on a copy so the caller's frame is never mutated.
        X_copy = X.copy()
        X_copy[self.output_column] = X_copy[self.input_column].str[:2]
        return X_copy


# Convert Store_Establishment_Year to Store_Age
def convert_year_to_age(df, reference_year=None):
    """Replace ``Store_Establishment_Year`` with a ``Store_Age`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Store_Establishment_Year`` column.
    reference_year : int, optional
        Year the age is measured against. When omitted, the current calendar
        year is used, which makes the result depend on when the code runs.
        Pass a fixed year (for example through
        ``FunctionTransformer(convert_year_to_age,
        kw_args={'reference_year': ...})``) to get the same ages at training
        time and at prediction time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Store_Age`` added and
        ``Store_Establishment_Year`` dropped.
    """
    if reference_year is None:
        reference_year = datetime.now().year
    df = df.copy()
    df['Store_Age'] = reference_year - df['Store_Establishment_Year']
    return df.drop(columns=['Store_Establishment_Year'])


store_age_transformer = FunctionTransformer(convert_year_to_age)


# Custom transformer to bin Store_Age into categories
class StoreAgeBinner(BaseEstimator, TransformerMixin):
    """Bucket ``Store_Age`` into labelled intervals in ``Store_Age_Bin``.

    Parameters
    ----------
    bins : list, optional
        Bin edges handed to ``pandas.cut``. Defaults to ``[0, 10, 30, 100]``.
    labels : list, optional
        One label per interval. Defaults to ``['New', 'Mid', 'Old']``.
    """

    def __init__(self, bins=None, labels=None):
        self.bins = bins if bins is not None else [0, 10, 30, 100]
        self.labels = labels if labels is not None else ['New', 'Mid', 'Old']

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a categorical ``Store_Age_Bin`` column."""
        X_copy = X.copy()
        # right=False makes every interval left-closed / right-open, e.g.
        # [0, 10), [10, 30), [30, 100) with the default edges. Ages outside
        # the outermost edges come back from pandas.cut as NaN.
        # NOTE(review): NaN is not one of the categories declared for the
        # ordinal encoder below - confirm out-of-range ages cannot occur.
        X_copy['Store_Age_Bin'] = pd.cut(
            X_copy['Store_Age'],
            bins=self.bins,
            labels=self.labels,
            right=False,
        )
        return X_copy


store_age_binner = StoreAgeBinner()

# Feature groups
ordinal_features = ['Store_Size', 'Store_Location_City_Type', 'Store_Age_Bin']
# One category list per ordinal feature, in the same order as
# ordinal_features; a value's position in its list is the code it receives.
ordinal_categories = [
    ['Small', 'Medium', 'High'],
    ['Tier 3', 'Tier 2', 'Tier 1'],
    # Taken from the binner so the encoder and the binner cannot drift apart.
    list(store_age_binner.labels),
]
nominal_features = [
    'Store_Id',
    'Store_Type',
    'Product_Prefix',
    'Product_Sugar_Content',
    'Product_Type',
]
numerical_features = ['Product_Weight', 'Product_Allocated_Area', 'Product_MRP']

# Encoders and scaler
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
# handle_unknown='ignore': categories unseen during fit do not raise at
# transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# Pipelines for numeric and categorical features
numeric_pipeline = Pipeline([
    ('scaler', scaler),
])

categorical_pipeline = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_features),
        ('onehot', onehot_encoder, nominal_features),
    ],
    remainder='drop',
)

# Full preprocessing pipeline
# Step order matters: the derived columns (Product_Prefix, Store_Age,
# Store_Age_Bin) must exist before the final ColumnTransformer selects them.
preprocessor = Pipeline([
    ('extract_prefix', ProductPrefixExtractor(input_column='Product_Id')),
    ('convert_age', store_age_transformer),
    ('bin_age', store_age_binner),
    ('preprocess', ColumnTransformer([
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, ordinal_features + nominal_features),
    ], remainder='drop')),
])