File size: 3,040 Bytes
5d0ee50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import pandas as pd
from typing import Self
from datetime import datetime, timezone
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Derive model features from raw product/store columns of a DataFrame.

    ``fit`` learns nothing. Every statistic used by ``transform`` (per-store
    mean/sum/rank, the 1%/99% clip bounds and the decile bins of
    ``Product_Allocated_Area``) is computed from the frame passed to
    ``transform`` itself.

    NOTE(review): because of that, the value of ``Store_Avg_MRP``,
    ``Area_log``, ``Area_Share``, ``Area_Rank`` and ``Area_bin`` for a given
    row depends on which other rows are in the same batch. If this is used
    for single-row or small-batch inference, consider learning these
    statistics in ``fit`` instead -- confirm against how callers use it.

    Parameters
    ----------
    reference_year : int or None, default=None
        Year from which ``Store_Establishment_Year`` is subtracted to obtain
        ``Store_Age``. ``None`` keeps the original behaviour of using the
        current UTC year at transform time; pass a fixed year to make the
        feature reproducible across runs.
    """

    def __init__(self, reference_year: int | None = None):
        # scikit-learn convention: store constructor arguments unmodified
        # under the same name so get_params()/clone() can round-trip them.
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y=None) -> Self:
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with engineered columns added and raw ones dropped.

        Columns read: ``Product_Id``, ``Product_Sugar_Content``,
        ``Store_Establishment_Year``, ``Store_Size``,
        ``Store_Location_City_Type``, ``Product_MRP``, ``Product_Weight``,
        ``Store_Id``, ``Product_Allocated_Area``, ``Store_Type``,
        ``Product_Type``. A missing column surfaces as the ``KeyError`` raised
        by pandas indexing.

        Parameters
        ----------
        X : pd.DataFrame
            Input frame; it is copied, never modified in place.

        Returns
        -------
        pd.DataFrame
            The copy with the derived columns appended and the columns listed
            in the final ``drop`` removed.
        """
        df = X.copy()

        # Extract item type: the first two characters of the product id.
        df["Item_Type_Combined"] = df["Product_Id"].str[:2].astype("category")
        df["Is_Food"]           = (df["Item_Type_Combined"] == "FD").astype(int)
        df["Is_Non_Consumable"] = (df["Item_Type_Combined"] == "NC").astype(int)
        df["Is_Drink"]          = (df["Item_Type_Combined"] == "DR").astype(int)

        # Sugar content cleanup: "NC" items are forced to "No Sugar" and the
        # "reg" spelling is normalised to "Regular".
        df["Product_Sugar_Content"] = np.where(df["Item_Type_Combined"] == "NC", "No Sugar", df["Product_Sugar_Content"])
        df["Product_Sugar_Content"] = df["Product_Sugar_Content"].replace({"reg": "Regular"})

        # Store age. getattr keeps instances that were pickled before the
        # reference_year parameter existed working with the old behaviour.
        reference_year = getattr(self, "reference_year", None)
        if reference_year is None:
            reference_year = datetime.now(tz=timezone.utc).year
        df["Store_Age"] = reference_year - df["Store_Establishment_Year"]

        # Ordinal encodings; labels not present in the mapping become NaN.
        df["Store_Size_Encoded"] = df["Store_Size"].map({"Small": 0, "Medium": 1, "High": 2})
        df["Store_Tier_Encoded"] = df["Store_Location_City_Type"].map({"Tier 3": 0, "Tier 2": 1, "Tier 1": 2})

        # Derived features
        df["MRP_per_kg"] = df["Product_MRP"] / df["Product_Weight"]
        df["MRP_x_StoreTier"] = df["Product_MRP"] * df["Store_Tier_Encoded"]
        df["MRP_x_StoreSize"] = df["Product_MRP"] * df["Store_Size_Encoded"]

        # Mean MRP of the rows sharing a Store_Id within this batch.
        df["Store_Avg_MRP"] = df.groupby("Store_Id")["Product_MRP"].transform("mean")

        # Allocated-area features: clip to this batch's 1st/99th percentiles
        # before log1p, then per-store share, per-store descending rank and a
        # decile bin index (duplicate bin edges are dropped).
        area = df["Product_Allocated_Area"]
        df["Area_log"] = np.log1p(area.clip(area.quantile(0.01), area.quantile(0.99)))
        df["Area_Share"] = area / df.groupby("Store_Id")["Product_Allocated_Area"].transform("sum")
        df["Area_Rank"]  = df.groupby("Store_Id")["Product_Allocated_Area"].rank(ascending=False)
        df["Area_bin"]   = pd.qcut(area, q=10, labels=False, duplicates="drop")

        # Binary flags
        df["Is_Sugar_Free"] = (df["Product_Sugar_Content"] == "No Sugar").astype(int)
        df["Is_Medium_Store_Size"] = (df["Store_Size"] == "Medium").astype(int)
        # na=False: a missing Store_Type counts as "not a supermarket" (0),
        # matching how the other flags treat missing values, instead of
        # producing NaN that makes astype(int) raise.
        df["Is_Supermarket"] = df["Store_Type"].str.contains("Supermarket", na=False).astype(int)
        df["Is_FV_or_Snack"] = df["Product_Type"].isin(["Fruits and Vegetables", "Snack Foods"]).astype(int)

        # Drop the raw columns that have been replaced by engineered ones.
        df = df.drop(columns=[
            "Product_Id",
            "Item_Type_Combined",
            "Product_Allocated_Area",
            "Store_Type",
            "Store_Size",
            "Store_Location_City_Type",
            "Store_Establishment_Year",
            "Product_Sugar_Content"
        ])

        return df