# RetailSlesPredictionBackend / custom_transformers.py
# Quantum9999's picture
# Upload folder using huggingface_hub
# d633e66 verified
import pandas as pd
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Custom transformer to extract Product Prefix
class ProductPrefixExtractor(BaseEstimator, TransformerMixin):
    """Add a column holding the first two characters of another column.

    Parameters
    ----------
    input_column : default='Product_Id'
        Name of the column whose values are sliced through the ``.str``
        accessor.
    output_column : default='Product_Prefix'
        Name of the column that receives the two-character prefix.
    """

    def __init__(self, input_column='Product_Id', output_column='Product_Prefix'):
        self.output_column = output_column
        self.input_column = input_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the prefix column added.

        ``X`` itself is left untouched; the new column is written on the copy.
        """
        frame = X.copy()
        prefixes = frame[self.input_column].str[:2]
        frame[self.output_column] = prefixes
        return frame
# Convert Store_Establishment_Year to Store_Age
def convert_year_to_age(df, current_year=None):
    """Replace ``Store_Establishment_Year`` with a derived ``Store_Age`` column.

    Args:
        df: DataFrame containing a ``Store_Establishment_Year`` column. It is
            not modified; the work is done on a copy.
        current_year: Reference year the ages are measured from. When left as
            ``None`` the year of ``datetime.now()`` at call time is used.
            Pin it (for example through ``FunctionTransformer``'s ``kw_args``)
            so that training and later inference compute identical ages.

    Returns:
        A new DataFrame with ``Store_Age`` added and
        ``Store_Establishment_Year`` dropped.
    """
    if current_year is None:
        # Wall-clock fallback keeps the historical behaviour, but the result
        # then depends on the year in which the function is called.
        current_year = datetime.now().year
    result = df.copy()
    result['Store_Age'] = current_year - result['Store_Establishment_Year']
    return result.drop(columns=['Store_Establishment_Year'])
# Wrap convert_year_to_age so it can be used as a pipeline step
store_age_transformer = FunctionTransformer(func=convert_year_to_age)
# Custom transformer to bin Store_Age into categories
class StoreAgeBinner(BaseEstimator, TransformerMixin):
    """Label each ``Store_Age`` value with the bin it falls into.

    Parameters
    ----------
    bins : optional
        Passed to ``pd.cut`` as ``bins``. Falls back to ``[0, 10, 30, 100]``
        when ``None``.
    labels : optional
        Passed to ``pd.cut`` as ``labels``. Falls back to
        ``['New', 'Mid', 'Old']`` when ``None``.
    """

    def __init__(self, bins=None, labels=None):
        if bins is None:
            bins = [0, 10, 30, 100]
        if labels is None:
            labels = ['New', 'Mid', 'Old']
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``Store_Age_Bin`` column added.

        ``pd.cut`` is called with ``right=False``, so each bin includes its
        left edge and excludes its right edge; with the fallback edges an age
        of 10 is labelled 'Mid'.
        """
        binned = X.copy()
        age_labels = pd.cut(
            binned['Store_Age'],
            bins=self.bins,
            labels=self.labels,
            right=False,
        )
        binned['Store_Age_Bin'] = age_labels
        return binned
store_age_binner = StoreAgeBinner()

# Feature groups
# Each ordinal column is paired with its categories in encoding order.
_ORDINAL_LEVELS = {
    'Store_Size': ['Small', 'Medium', 'High'],
    'Store_Location_City_Type': ['Tier 3', 'Tier 2', 'Tier 1'],
    'Store_Age_Bin': ['New', 'Mid', 'Old'],
}
ordinal_features = list(_ORDINAL_LEVELS)
ordinal_categories = list(_ORDINAL_LEVELS.values())

nominal_features = [
    'Store_Id',
    'Store_Type',
    'Product_Prefix',
    'Product_Sugar_Content',
    'Product_Type',
]
numerical_features = [
    'Product_Weight',
    'Product_Allocated_Area',
    'Product_MRP',
]

# Encoders and scaler
scaler = StandardScaler()
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

# Pipelines for numeric and categorical features
numeric_pipeline = Pipeline(steps=[('scaler', scaler)])

categorical_pipeline = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_features),
        ('onehot', onehot_encoder, nominal_features),
    ],
    remainder='drop',
)

# Full preprocessing pipeline
# Steps run in order: the derived columns (Product_Prefix, Store_Age,
# Store_Age_Bin) must exist before the final column-wise step selects them.
preprocessor = Pipeline(
    steps=[
        ('extract_prefix', ProductPrefixExtractor(input_column='Product_Id')),
        ('convert_age', store_age_transformer),
        ('bin_age', store_age_binner),
        (
            'preprocess',
            ColumnTransformer(
                transformers=[
                    ('num', numeric_pipeline, numerical_features),
                    ('cat', categorical_pipeline, [*ordinal_features, *nominal_features]),
                ],
                remainder='drop',
            ),
        ),
    ]
)