|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from typing import Self |
|
|
from datetime import datetime, timezone |
|
|
from sklearn.base import BaseEstimator, TransformerMixin |
|
|
|
|
|
|
|
|
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Derive model features from raw product/store columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    works on a copy of its input, so the caller's frame is never mutated.

    Required input columns: ``Product_Id``, ``Product_Sugar_Content``,
    ``Product_Weight``, ``Product_MRP``, ``Product_Allocated_Area``,
    ``Product_Type``, ``Store_Id``, ``Store_Type``, ``Store_Size``,
    ``Store_Location_City_Type`` and ``Store_Establishment_Year``.

    Parameters
    ----------
    reference_year:
        Year that ``Store_Age`` is measured against. ``None`` (the default)
        uses the current UTC year at ``transform`` time; pass a fixed year to
        make the output reproducible across calendar years.

    NOTE(review): ``Store_Avg_MRP``, ``Area_log`` (quantile clipping),
    ``Area_Share``, ``Area_Rank`` and ``Area_bin`` are computed from the rows
    handed to ``transform`` rather than learned in ``fit``. The same row can
    therefore get different values depending on the batch it arrives in;
    confirm this is intended before scoring small or single-row batches.
    """

    # Ordinal encodings. Labels absent from a mapping become NaN (Series.map).
    _STORE_SIZE_CODES = {"Small": 0, "Medium": 1, "High": 2}
    _STORE_TIER_CODES = {"Tier 3": 0, "Tier 2": 1, "Tier 1": 2}

    # Raw / intermediate columns removed once the derived features exist.
    _DROPPED_COLUMNS = [
        "Product_Id",
        "Item_Type_Combined",
        "Product_Allocated_Area",
        "Store_Type",
        "Store_Size",
        "Store_Location_City_Type",
        "Store_Establishment_Year",
        "Product_Sugar_Content",
    ]

    def __init__(self, reference_year: int | None = None):
        # Stored verbatim and unvalidated so that BaseEstimator.get_params /
        # clone can round-trip the constructor argument.
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y=None) -> Self:
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with engineered features added.

        Columns listed in ``_DROPPED_COLUMNS`` are removed from the result;
        every other input column is kept and the new features are appended in
        a fixed order.

        Raises
        ------
        KeyError
            If a required input column is missing.
        """
        df = X.copy()

        # --- Product category flags from the two-character Product_Id prefix.
        df["Item_Type_Combined"] = df["Product_Id"].str[:2].astype("category")
        item_type = df["Item_Type_Combined"]
        df["Is_Food"] = (item_type == "FD").astype(int)
        df["Is_Non_Consumable"] = (item_type == "NC").astype(int)
        df["Is_Drink"] = (item_type == "DR").astype(int)

        # --- Sugar content: "NC"-prefixed items are forced to "No Sugar" and
        # the "reg" spelling is normalised. The column is only used for
        # Is_Sugar_Free below and is dropped before returning.
        df["Product_Sugar_Content"] = np.where(
            item_type == "NC", "No Sugar", df["Product_Sugar_Content"]
        )
        df["Product_Sugar_Content"] = df["Product_Sugar_Content"].replace(
            {"reg": "Regular"}
        )

        # --- Store age relative to the reference year. getattr keeps
        # instances pickled before `reference_year` existed working.
        reference_year = getattr(self, "reference_year", None)
        if reference_year is None:
            reference_year = datetime.now(tz=timezone.utc).year
        df["Store_Age"] = reference_year - df["Store_Establishment_Year"]

        # --- Ordinal encodings of store size and city tier.
        df["Store_Size_Encoded"] = df["Store_Size"].map(self._STORE_SIZE_CODES)
        df["Store_Tier_Encoded"] = df["Store_Location_City_Type"].map(
            self._STORE_TIER_CODES
        )

        # --- Price features.
        # NOTE(review): a zero Product_Weight yields inf here — confirm that
        # upstream data guarantees positive weights.
        df["MRP_per_kg"] = df["Product_MRP"] / df["Product_Weight"]
        df["MRP_x_StoreTier"] = df["Product_MRP"] * df["Store_Tier_Encoded"]
        df["MRP_x_StoreSize"] = df["Product_MRP"] * df["Store_Size_Encoded"]

        df["Store_Avg_MRP"] = df.groupby("Store_Id")["Product_MRP"].transform("mean")

        # --- Allocated-area features.
        area = df["Product_Allocated_Area"]
        # Clip to the 1st-99th percentile of this batch before the log so
        # extreme values do not dominate.
        df["Area_log"] = np.log1p(
            area.clip(lower=area.quantile(0.01), upper=area.quantile(0.99))
        )
        area_by_store = df.groupby("Store_Id")["Product_Allocated_Area"]
        df["Area_Share"] = area / area_by_store.transform("sum")
        df["Area_Rank"] = area_by_store.rank(ascending=False)
        # duplicates="drop" merges tied bin edges, so fewer than 10 bins may
        # come back.
        df["Area_bin"] = pd.qcut(area, q=10, labels=False, duplicates="drop")

        # --- Binary indicators.
        df["Is_Sugar_Free"] = (df["Product_Sugar_Content"] == "No Sugar").astype(int)
        df["Is_Medium_Store_Size"] = (df["Store_Size"] == "Medium").astype(int)
        # na=False: a missing Store_Type counts as "not a supermarket" instead
        # of producing NaN, which would make astype(int) raise.
        df["Is_Supermarket"] = (
            df["Store_Type"]
            .str.contains("Supermarket", regex=False, na=False)
            .astype(int)
        )
        df["Is_FV_or_Snack"] = (
            df["Product_Type"]
            .isin(["Fruits and Vegetables", "Snack Foods"])
            .astype(int)
        )

        return df.drop(columns=self._DROPPED_COLUMNS)
|
|
|