Rossmann-Store-Sales / src /features.py
ymlin105's picture
feat: standardize feature engineering and push new production model
aa92081
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from src.core import setup_logger
logger = setup_logger(__name__)
class FeatureEngineeringStrategy(ABC):
@abstractmethod
def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
pass
class DateTransformation(FeatureEngineeringStrategy):
def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Applying date transformation.")
df_transformed = df.copy()
date_col = 'date' if 'date' in df.columns else 'Date'
if date_col not in df.columns:
return df
df_transformed[date_col] = pd.to_datetime(df[date_col])
df_transformed['Year'] = df_transformed[date_col].dt.year
df_transformed['Month'] = df_transformed[date_col].dt.month
df_transformed['Day'] = df_transformed[date_col].dt.day
df_transformed['DayOfWeek'] = df_transformed[date_col].dt.dayofweek + 1
df_transformed['IsWeekend'] = (df_transformed[date_col].dt.dayofweek >= 5).astype(int)
df_transformed['DayOfMonth'] = df_transformed[date_col].dt.day
return df_transformed
class FourierSeriesSeasonality(FeatureEngineeringStrategy):
def __init__(self, period: float = 365.25, order: int = 3):
self.period = period
self.order = order
def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info(f"Applying Fourier terms (order={self.order})")
df_transformed = df.copy()
date_col = 'date' if 'date' in df.columns else 'Date'
times = pd.to_datetime(df_transformed[date_col]).values.view(np.int64) / 10**9 / (60 * 60 * 24)
for i in range(1, self.order + 1):
df_transformed[f'fourier_sin_{i}'] = np.sin(2 * np.pi * i * times / self.period)
df_transformed[f'fourier_cos_{i}'] = np.cos(2 * np.pi * i * times / self.period)
return df_transformed
class EasterFeature(FeatureEngineeringStrategy):
def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Applying Easter feature.")
df_transformed = df.copy()
date_col = 'date' if 'date' in df.columns else 'Date'
dates = pd.to_datetime(df_transformed[date_col])
easter_dates = {2013: '2013-03-31', 2014: '2014-04-20', 2015: '2015-04-05', 2016: '2016-03-27'}
df_transformed['days_to_easter'] = 999
for year, date_str in easter_dates.items():
mask = dates.dt.year == year
df_transformed.loc[mask, 'days_to_easter'] = (dates[mask] - pd.to_datetime(date_str)).dt.days
df_transformed['easter_effect'] = ((df_transformed['days_to_easter'] >= -7) & (df_transformed['days_to_easter'] <= 7)).astype(int)
return df_transformed
class RossmannFeatureEngineering(FeatureEngineeringStrategy):
def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Applying Rossmann retail features.")
df_transformed = df.copy()
if 'StateHoliday' in df_transformed.columns:
df_transformed['StateHoliday'] = df_transformed['StateHoliday'].astype(str).map({'0': 0, 'a': 1, 'b': 2, 'c': 3}).fillna(0)
if 'CompetitionDistance' in df_transformed.columns:
df_transformed['CompetitionDistance'] = df_transformed['CompetitionDistance'].fillna(100000)
if 'CompetitionOpenSinceYear' in df_transformed.columns and 'Year' in df_transformed.columns:
df_transformed['CompetitionOpenTime'] = 12 * (df_transformed['Year'] - df_transformed['CompetitionOpenSinceYear']) + (df_transformed['Month'] - df_transformed['CompetitionOpenSinceMonth'])
df_transformed['CompetitionOpenTime'] = df_transformed['CompetitionOpenTime'].apply(lambda x: x if x > 0 else 0)
return df_transformed
class FeatureEngineer:
def __init__(self, strategy: FeatureEngineeringStrategy):
self._strategy = strategy
def set_strategy(self, strategy: FeatureEngineeringStrategy):
self._strategy = strategy
def apply_feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
return self._strategy.apply_transformation(df)