utaustin-model-deployment-project-api / feature_engineering.py
sebastiangv's picture
Upload folder using huggingface_hub
5d0ee50 verified
import numpy as np
import pandas as pd
from typing import Self
from datetime import datetime, timezone
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Derive model features from raw product/store rows.

    The transformer learns nothing in ``fit``; every feature is computed
    inside ``transform`` from the frame it is given.

    NOTE: ``Store_Avg_MRP``, ``Area_log`` (percentile clipping),
    ``Area_Share``, ``Area_Rank`` and ``Area_bin`` are statistics of the
    batch passed to ``transform``, so the values produced for a row depend
    on the other rows in the same call.

    Args:
        reference_year: Year from which ``Store_Establishment_Year`` is
            subtracted to obtain ``Store_Age``. ``None`` (the default) uses
            the current UTC year at transform time. Pinning it keeps the
            feature identical between training and later inference runs.
    """

    # Columns read by ``transform``; checked up front so that all missing
    # names are reported together instead of one pandas KeyError at a time.
    _REQUIRED_COLUMNS = (
        "Product_Id",
        "Product_Sugar_Content",
        "Store_Establishment_Year",
        "Store_Size",
        "Store_Location_City_Type",
        "Product_MRP",
        "Product_Weight",
        "Store_Id",
        "Product_Allocated_Area",
        "Store_Type",
        "Product_Type",
    )

    # Class-level default so that instances pickled before this parameter
    # existed still resolve ``self.reference_year`` after loading.
    reference_year: int | None = None

    def __init__(self, reference_year: int | None = None):
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y=None) -> Self:
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with engineered features added.

        The raw columns listed in the final ``drop`` call are removed from
        the result; ``X`` itself is not modified.

        Raises:
            KeyError: If any column in ``_REQUIRED_COLUMNS`` is absent.
        """
        df = X.copy()

        missing = [col for col in self._REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"Input is missing required columns: {missing}")

        # Item type: the first two characters of Product_Id are compared
        # against the codes "FD", "NC" and "DR".
        df["Item_Type_Combined"] = df["Product_Id"].str[:2].astype("category")
        df["Is_Food"] = (df["Item_Type_Combined"] == "FD").astype(int)
        df["Is_Non_Consumable"] = (df["Item_Type_Combined"] == "NC").astype(int)
        df["Is_Drink"] = (df["Item_Type_Combined"] == "DR").astype(int)

        # Sugar content cleanup: "NC" items are forced to "No Sugar", and the
        # abbreviation "reg" is normalised to "Regular".
        df["Product_Sugar_Content"] = np.where(
            df["Item_Type_Combined"] == "NC",
            "No Sugar",
            df["Product_Sugar_Content"],
        )
        df["Product_Sugar_Content"] = df["Product_Sugar_Content"].replace(
            {"reg": "Regular"}
        )

        # Store age relative to a pinned year when one was supplied,
        # otherwise relative to the current UTC year (original behaviour).
        if self.reference_year is not None:
            current_year = self.reference_year
        else:
            current_year = datetime.now(tz=timezone.utc).year
        df["Store_Age"] = current_year - df["Store_Establishment_Year"]

        # Ordinal encodings; labels outside the mapping become NaN.
        df["Store_Size_Encoded"] = df["Store_Size"].map(
            {"Small": 0, "Medium": 1, "High": 2}
        )
        df["Store_Tier_Encoded"] = df["Store_Location_City_Type"].map(
            {"Tier 3": 0, "Tier 2": 1, "Tier 1": 2}
        )

        # Derived price features.
        df["MRP_per_kg"] = df["Product_MRP"] / df["Product_Weight"]
        df["MRP_x_StoreTier"] = df["Product_MRP"] * df["Store_Tier_Encoded"]
        df["MRP_x_StoreSize"] = df["Product_MRP"] * df["Store_Size_Encoded"]
        df["Store_Avg_MRP"] = df.groupby("Store_Id")["Product_MRP"].transform("mean")

        # Allocated-area features. The clip bounds are the 1st and 99th
        # percentiles of the current batch.
        area = df["Product_Allocated_Area"]
        df["Area_log"] = np.log1p(area.clip(area.quantile(0.01), area.quantile(0.99)))
        area_by_store = df.groupby("Store_Id")["Product_Allocated_Area"]
        df["Area_Share"] = area / area_by_store.transform("sum")
        df["Area_Rank"] = area_by_store.rank(ascending=False)
        df["Area_bin"] = pd.qcut(area, q=10, labels=False, duplicates="drop")

        # Binary flags.
        df["Is_Sugar_Free"] = (df["Product_Sugar_Content"] == "No Sugar").astype(int)
        df["Is_Medium_Store_Size"] = (df["Store_Size"] == "Medium").astype(int)
        # na=False: a missing Store_Type counts as "not a supermarket" rather
        # than yielding NaN, which cannot be cast to int.
        df["Is_Supermarket"] = (
            df["Store_Type"]
            .str.contains("Supermarket", na=False, regex=False)
            .astype(int)
        )
        df["Is_FV_or_Snack"] = (
            df["Product_Type"]
            .isin(["Fruits and Vegetables", "Snack Foods"])
            .astype(int)
        )

        # Drop raw columns that have been replaced by engineered features.
        df = df.drop(
            columns=[
                "Product_Id",
                "Item_Type_Combined",
                "Product_Allocated_Area",
                "Store_Type",
                "Store_Size",
                "Store_Location_City_Type",
                "Store_Establishment_Year",
                "Product_Sugar_Content",
            ]
        )
        return df