Spaces:
Build error
Build error
File size: 4,246 Bytes
ea6f215 aa92081 ea6f215 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from src.core import setup_logger
# Module-level logger used by every strategy class in this file; created by the
# project helper ``setup_logger`` imported from ``src.core``.
logger = setup_logger(__name__)
class FeatureEngineeringStrategy(ABC):
    """Abstract interface for a single feature-engineering step.

    Concrete strategies implement :meth:`apply_transformation`; the class
    itself cannot be instantiated because that method is abstract.
    """

    @abstractmethod
    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform *df* and return the resulting frame.

        The base implementation does nothing and returns ``None``; subclasses
        must override it.
        """
class DateTransformation(FeatureEngineeringStrategy):
    """Derive calendar features from the frame's date column.

    The date column is ``'date'`` when that column exists, otherwise ``'Date'``.
    """

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with the date column parsed and calendar columns added.

        Added columns:
            * ``Year``, ``Month``, ``Day`` - components of the parsed date.
            * ``DayOfWeek`` - pandas ``dayofweek`` shifted by one, so
              Monday is 1 and Sunday is 7.
            * ``IsWeekend`` - 1 when pandas ``dayofweek`` is 5 or 6, else 0.
            * ``DayOfMonth`` - same value as ``Day`` (kept for callers that
              read either name).

        When neither ``'date'`` nor ``'Date'`` exists, a warning is logged and
        an unmodified copy is returned. The input frame is never mutated and
        never returned itself, so callers can safely modify the result.
        """
        logger.info("Applying date transformation.")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'
        if date_col not in df.columns:
            # Return the copy, not `df`: every other path hands back a fresh
            # frame, and aliasing the input only here would let downstream
            # in-place edits leak into the caller's data.
            logger.warning("No 'date' or 'Date' column found; skipping date transformation.")
            return df_transformed

        parsed = pd.to_datetime(df[date_col])
        weekday = parsed.dt.dayofweek  # pandas convention: Monday == 0

        df_transformed[date_col] = parsed
        df_transformed['Year'] = parsed.dt.year
        df_transformed['Month'] = parsed.dt.month
        df_transformed['Day'] = parsed.dt.day
        df_transformed['DayOfWeek'] = weekday + 1
        df_transformed['IsWeekend'] = (weekday >= 5).astype(int)
        df_transformed['DayOfMonth'] = parsed.dt.day
        return df_transformed
class FourierSeriesSeasonality(FeatureEngineeringStrategy):
    """Add sine/cosine Fourier terms of the date to model seasonality.

    Args:
        period: Length of one seasonal cycle, in days.
        order: Number of harmonics; harmonics ``1..order`` are generated.
    """

    def __init__(self, period: float = 365.25, order: int = 3):
        self.period = period
        self.order = order

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with ``fourier_sin_i`` / ``fourier_cos_i`` columns.

        Time is measured in (fractional) days since 1970-01-01. The date
        column is ``'date'`` when present, otherwise ``'Date'``.

        Rows whose date is missing (NaT) get NaN Fourier terms.

        Raises:
            KeyError: If the frame has neither a ``'date'`` nor a ``'Date'`` column.
        """
        logger.info(f"Applying Fourier terms (order={self.order})")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'
        dates = pd.to_datetime(df_transformed[date_col])

        # Timezone-aware values are expressed in UTC before measuring from the
        # epoch, which matches what viewing the raw int64 values produced.
        if dates.dt.tz is not None:
            dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)

        # Timestamp/Timedelta arithmetic instead of reinterpreting the raw
        # int64 buffer: the result no longer depends on the datetime64 unit
        # (ns, us, ms, s) and NaT becomes NaN rather than a huge negative number.
        elapsed = dates - pd.Timestamp('1970-01-01')
        times = (elapsed / pd.Timedelta(days=1)).to_numpy()

        for i in range(1, self.order + 1):
            df_transformed[f'fourier_sin_{i}'] = np.sin(2 * np.pi * i * times / self.period)
            df_transformed[f'fourier_cos_{i}'] = np.cos(2 * np.pi * i * times / self.period)
        return df_transformed
class EasterFeature(FeatureEngineeringStrategy):
    """Add the signed distance to Easter Sunday and a +/-7 day Easter flag."""

    @staticmethod
    def _easter_sunday(year: int) -> pd.Timestamp:
        """Return Easter Sunday (Gregorian calendar) of *year*.

        Uses the "anonymous Gregorian" computus; it yields 2013-03-31,
        2014-04-20, 2015-04-05 and 2016-03-27, the dates this class
        previously hard-coded.
        """
        a = year % 19
        b, c = divmod(year, 100)
        d, e = divmod(b, 4)
        f = (b + 8) // 25
        g = (b - f + 1) // 3
        h = (19 * a + b - d - g + 15) % 30
        i, k = divmod(c, 4)
        l = (32 + 2 * e + 2 * i - h - k) % 7
        m = (a + 11 * h + 22 * l) // 451
        month, day_index = divmod(h + l - 7 * m + 114, 31)
        return pd.Timestamp(year=year, month=month, day=day_index + 1)

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with ``days_to_easter`` and ``easter_effect`` added.

        ``days_to_easter`` is the row's date minus that year's Easter Sunday,
        in whole days (negative before Easter, positive after). Rows with a
        missing date keep the placeholder value 999.

        ``easter_effect`` is 1 when ``days_to_easter`` lies in ``[-7, 7]``,
        otherwise 0.

        The date column is ``'date'`` when present, otherwise ``'Date'``.

        Raises:
            KeyError: If the frame has neither a ``'date'`` nor a ``'Date'`` column.
        """
        logger.info("Applying Easter feature.")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'
        dates = pd.to_datetime(df_transformed[date_col])
        years = dates.dt.year

        # 999 stays only on rows without a usable date; it falls outside the
        # +/-7 window so those rows never get the Easter flag.
        df_transformed['days_to_easter'] = 999

        # Compute Easter for every year present in the data instead of looking
        # it up in a fixed 2013-2016 table, so other years are handled too.
        for year in years.dropna().unique():
            mask = years == year
            easter_day = self._easter_sunday(int(year))
            df_transformed.loc[mask, 'days_to_easter'] = (dates[mask] - easter_day).dt.days

        df_transformed['easter_effect'] = (
            (df_transformed['days_to_easter'] >= -7) & (df_transformed['days_to_easter'] <= 7)
        ).astype(int)
        return df_transformed
class RossmannFeatureEngineering(FeatureEngineeringStrategy):
    """Clean and derive store/competition columns for the Rossmann dataset."""

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with the Rossmann-specific columns normalised.

        Each step runs only when the columns it needs are present:

        * ``StateHoliday`` - values are stringified and mapped
          ``'0'->0, 'a'->1, 'b'->2, 'c'->3``; anything else becomes 0.
        * ``CompetitionDistance`` - missing values are filled with 100000
          (presumably standing in for "no nearby competitor" - confirm with
          the modelling owner).
        * ``CompetitionOpenTime`` - months between the competitor's opening
          (``CompetitionOpenSinceYear`` / ``CompetitionOpenSinceMonth``) and
          the row's ``Year`` / ``Month``; non-positive or missing results
          are set to 0. Requires ``Year`` and ``Month`` to exist already.
        """
        logger.info("Applying Rossmann retail features.")
        df_transformed = df.copy()

        if 'StateHoliday' in df_transformed.columns:
            holiday_codes = {'0': 0, 'a': 1, 'b': 2, 'c': 3}
            df_transformed['StateHoliday'] = (
                df_transformed['StateHoliday'].astype(str).map(holiday_codes).fillna(0)
            )

        if 'CompetitionDistance' in df_transformed.columns:
            df_transformed['CompetitionDistance'] = df_transformed['CompetitionDistance'].fillna(100000)

        # All four columns are read below, so all four must be checked; the
        # earlier two-column guard raised KeyError when 'Month' or
        # 'CompetitionOpenSinceMonth' was absent.
        needed = ('Year', 'Month', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth')
        if all(col in df_transformed.columns for col in needed):
            months_open = (
                12 * (df_transformed['Year'] - df_transformed['CompetitionOpenSinceYear'])
                + (df_transformed['Month'] - df_transformed['CompetitionOpenSinceMonth'])
            )
            # Vectorised equivalent of `x if x > 0 else 0`: NaN fails the
            # comparison and is therefore also replaced by 0.
            df_transformed['CompetitionOpenTime'] = months_open.where(months_open > 0, 0)

        return df_transformed
class FeatureEngineer:
    """Runs feature engineering through a swappable strategy object."""

    def __init__(self, strategy: FeatureEngineeringStrategy):
        """Store *strategy* as the one used by :meth:`apply_feature_engineering`."""
        self._strategy = strategy

    def set_strategy(self, strategy: FeatureEngineeringStrategy):
        """Replace the current strategy with *strategy*."""
        self._strategy = strategy

    def apply_feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pass *df* to the current strategy and return whatever it returns."""
        active_strategy = self._strategy
        return active_strategy.apply_transformation(df)
|