from datetime import datetime, timezone
from typing import Self

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Derive model features from the raw product/store columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    builds every feature from the frame it is given.

    Columns read by ``transform``: ``Product_Id``, ``Product_Sugar_Content``,
    ``Store_Establishment_Year``, ``Store_Size``, ``Store_Location_City_Type``,
    ``Product_MRP``, ``Product_Weight``, ``Store_Id``,
    ``Product_Allocated_Area``, ``Store_Type`` and ``Product_Type``.

    NOTE: ``Store_Avg_MRP``, ``Area_log`` (its 1%/99% clip bounds),
    ``Area_Share``, ``Area_Rank`` and ``Area_bin`` are computed from the rows
    passed to ``transform`` itself, so the value produced for a given row
    depends on the other rows in the same batch.

    Parameters
    ----------
    reference_year : int or None, default=None
        Year that ``Store_Establishment_Year`` is subtracted from to obtain
        ``Store_Age``. ``None`` uses the current UTC year at transform time;
        pass a fixed year to make the output reproducible across calendar
        years.
    """

    # Ordinal encodings; values missing from a mapping become NaN via ``map``.
    _STORE_SIZE_ORDER = {"Small": 0, "Medium": 1, "High": 2}
    _STORE_TIER_ORDER = {"Tier 3": 0, "Tier 2": 1, "Tier 1": 2}

    # Raw or intermediate columns removed from the returned frame.
    _DROPPED_COLUMNS = (
        "Product_Id",
        "Item_Type_Combined",
        "Product_Allocated_Area",
        "Store_Type",
        "Store_Size",
        "Store_Location_City_Type",
        "Store_Establishment_Year",
        "Product_Sugar_Content",
    )

    def __init__(self, *, reference_year: int | None = None) -> None:
        # Stored unmodified under the same name, as BaseEstimator's
        # get_params/set_params/clone machinery requires.
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y=None) -> Self:
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with engineered features added.

        The input frame is not modified. The columns listed in
        ``_DROPPED_COLUMNS`` are removed from the result.
        """
        df = X.copy()

        # Item type: the first two characters of the product id.
        df["Item_Type_Combined"] = df["Product_Id"].str[:2].astype("category")
        df["Is_Food"] = (df["Item_Type_Combined"] == "FD").astype(int)
        df["Is_Non_Consumable"] = (df["Item_Type_Combined"] == "NC").astype(int)
        df["Is_Drink"] = (df["Item_Type_Combined"] == "DR").astype(int)

        # Sugar content cleanup: "NC" items are forced to "No Sugar", and the
        # "reg" spelling is folded into "Regular".
        df["Product_Sugar_Content"] = np.where(
            df["Item_Type_Combined"] == "NC",
            "No Sugar",
            df["Product_Sugar_Content"],
        )
        df["Product_Sugar_Content"] = df["Product_Sugar_Content"].replace(
            {"reg": "Regular"}
        )

        # Store age. getattr keeps instances that were pickled before
        # ``reference_year`` existed working.
        reference_year = getattr(self, "reference_year", None)
        if reference_year is None:
            reference_year = datetime.now(tz=timezone.utc).year
        df["Store_Age"] = reference_year - df["Store_Establishment_Year"]

        # Ordinal encodings.
        df["Store_Size_Encoded"] = df["Store_Size"].map(self._STORE_SIZE_ORDER)
        df["Store_Tier_Encoded"] = df["Store_Location_City_Type"].map(
            self._STORE_TIER_ORDER
        )

        # Price-derived features.
        df["MRP_per_kg"] = df["Product_MRP"] / df["Product_Weight"]
        df["MRP_x_StoreTier"] = df["Product_MRP"] * df["Store_Tier_Encoded"]
        df["MRP_x_StoreSize"] = df["Product_MRP"] * df["Store_Size_Encoded"]
        df["Store_Avg_MRP"] = df.groupby("Store_Id")["Product_MRP"].transform("mean")

        # Allocated-area features (all computed from this batch, see class NOTE).
        area = df["Product_Allocated_Area"]
        area_by_store = df.groupby("Store_Id")["Product_Allocated_Area"]
        store_area_total = area_by_store.transform("sum")
        area_rank = area_by_store.rank(ascending=False)

        # Clip to the batch's 1st/99th percentiles before the log transform.
        df["Area_log"] = np.log1p(
            area.clip(lower=area.quantile(0.01), upper=area.quantile(0.99))
        )
        df["Area_Share"] = area / store_area_total
        df["Area_Rank"] = area_rank
        # Integer decile codes; duplicate bin edges are dropped, so fewer than
        # ten distinct codes may be produced.
        df["Area_bin"] = pd.qcut(area, q=10, labels=False, duplicates="drop")

        # Binary flags.
        df["Is_Sugar_Free"] = (df["Product_Sugar_Content"] == "No Sugar").astype(int)
        df["Is_Medium_Store_Size"] = (df["Store_Size"] == "Medium").astype(int)
        # na=False: a missing Store_Type yields 0, like the equality-based
        # flags above, instead of failing in astype(int).
        df["Is_Supermarket"] = (
            df["Store_Type"].str.contains("Supermarket", na=False).astype(int)
        )
        df["Is_FV_or_Snack"] = (
            df["Product_Type"].isin(["Fruits and Vegetables", "Snack Foods"]).astype(int)
        )

        # Drop raw/intermediate columns that are not part of the feature set.
        return df.drop(columns=list(self._DROPPED_COLUMNS))