File size: 4,246 Bytes
ea6f215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa92081
ea6f215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from src.core import setup_logger

# Module-level logger; the strategy classes in this module use it to
# announce which transformation is being applied.
logger = setup_logger(__name__)

class FeatureEngineeringStrategy(ABC):
    """Abstract interface for one feature-engineering step.

    Subclasses must override :meth:`apply_transformation`; the class cannot
    be instantiated until they do.
    """

    @abstractmethod
    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Derive features from ``df`` and return the resulting frame.

        The base implementation has no behavior of its own and returns
        ``None`` if invoked through ``super()``.
        """

class DateTransformation(FeatureEngineeringStrategy):
    """Expand a date column into calendar features.

    The date column is ``'date'`` when that column exists, otherwise
    ``'Date'``.
    """

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with calendar columns added.

        The date column is converted with ``pd.to_datetime`` and the
        following columns are added, in this order: ``Year``, ``Month``,
        ``Day``, ``DayOfWeek`` (pandas ``dayofweek`` shifted by one, so
        1 = Monday ... 7 = Sunday), ``IsWeekend`` (1 when ``dayofweek >= 5``)
        and ``DayOfMonth`` (same values as ``Day``).

        Args:
            df: Input frame; it is never modified.

        Returns:
            A new frame. When neither ``'date'`` nor ``'Date'`` is present
            the copy is returned without any added columns.
        """
        logger.info("Applying date transformation.")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'
        if date_col not in df.columns:
            # Return the copy rather than the caller's own object so every
            # path has the same "result is independent of the input"
            # contract; in-place edits downstream must not leak back.
            logger.warning("No 'date' or 'Date' column found; skipping date transformation.")
            return df_transformed

        dates = pd.to_datetime(df[date_col])
        weekday = dates.dt.dayofweek  # 0 = Monday ... 6 = Sunday

        df_transformed[date_col] = dates
        df_transformed['Year'] = dates.dt.year
        df_transformed['Month'] = dates.dt.month
        df_transformed['Day'] = dates.dt.day
        df_transformed['DayOfWeek'] = weekday + 1
        df_transformed['IsWeekend'] = (weekday >= 5).astype(int)
        # Duplicate of 'Day'; kept because existing consumers may read it.
        df_transformed['DayOfMonth'] = dates.dt.day
        return df_transformed

class FourierSeriesSeasonality(FeatureEngineeringStrategy):
    """Add sine/cosine seasonality terms computed from the date column.

    The date column is ``'date'`` when that column exists, otherwise
    ``'Date'``.
    """

    def __init__(self, period: float = 365.25, order: int = 3):
        """Configure the Fourier expansion.

        Args:
            period: Length of one cycle, in days (timestamps are converted
                to days before being divided by this value).
            order: Number of harmonics; one sine and one cosine column is
                produced for each ``i`` in ``1..order``.
        """
        self.period = period
        self.order = order

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``fourier_sin_i`` / ``fourier_cos_i`` columns.

        Args:
            df: Input frame containing a ``'date'`` or ``'Date'`` column.

        Returns:
            A new frame with ``2 * order`` added columns. Rows whose date is
            missing (NaT) get NaN in every Fourier column.

        Raises:
            KeyError: Propagated from pandas when neither date column exists.
        """
        logger.info(f"Applying Fourier terms (order={self.order})")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'

        # Normalise to nanosecond resolution before reinterpreting the bits
        # as integers: pandas may hand back datetime64[s/ms/us] values, and
        # the 10**9 divisor below is only correct for nanoseconds.
        stamps = pd.to_datetime(df_transformed[date_col]).values.astype('datetime64[ns]')
        times = stamps.view(np.int64) / 10**9 / (60 * 60 * 24)  # days since epoch
        # NaT is stored as the minimum int64; without this it would turn
        # into a large negative day count and produce meaningless features.
        times[np.isnat(stamps)] = np.nan

        for i in range(1, self.order + 1):
            angle = 2 * np.pi * i * times / self.period
            df_transformed[f'fourier_sin_{i}'] = np.sin(angle)
            df_transformed[f'fourier_cos_{i}'] = np.cos(angle)
        return df_transformed

class EasterFeature(FeatureEngineeringStrategy):
    """Add the signed day offset from Easter and a near-Easter flag.

    The date column is ``'date'`` when that column exists, otherwise
    ``'Date'``.
    """

    # Easter date per calendar year, used when no table is passed in.
    DEFAULT_EASTER_DATES = {2013: '2013-03-31', 2014: '2014-04-20', 2015: '2015-04-05', 2016: '2016-03-27'}
    # Placeholder written to 'days_to_easter' for rows whose year is not in
    # the table.
    MISSING_DAYS = 999

    def __init__(self, easter_dates: "dict | None" = None, window_days: int = 7):
        """Configure the Easter lookup.

        Args:
            easter_dates: Mapping ``year -> Easter date`` (any value accepted
                by ``pd.to_datetime``). Defaults to the built-in 2013-2016
                table, so ``EasterFeature()`` behaves as before.
            window_days: Half-width of the window, in days, in which
                ``easter_effect`` is 1. Keep it below ``MISSING_DAYS`` so
                rows without a known Easter date stay at 0.
        """
        source = self.DEFAULT_EASTER_DATES if easter_dates is None else easter_dates
        # Copy so later changes to the caller's mapping do not affect us.
        self.easter_dates = dict(source)
        self.window_days = window_days

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``days_to_easter`` and ``easter_effect``.

        ``days_to_easter`` is ``date - easter`` in whole days for the
        Easter of the row's own year: negative before Easter, positive
        after. Rows whose year is not in the table keep ``MISSING_DAYS``.
        ``easter_effect`` is 1 when the offset lies within
        ``[-window_days, window_days]``, else 0.

        Args:
            df: Input frame containing a ``'date'`` or ``'Date'`` column.

        Returns:
            A new frame with the two columns added.

        Raises:
            KeyError: Propagated from pandas when neither date column exists.
        """
        logger.info("Applying Easter feature.")
        df_transformed = df.copy()
        date_col = 'date' if 'date' in df.columns else 'Date'
        dates = pd.to_datetime(df_transformed[date_col])
        years = dates.dt.year

        df_transformed['days_to_easter'] = self.MISSING_DAYS
        for year, easter in self.easter_dates.items():
            mask = years == year
            df_transformed.loc[mask, 'days_to_easter'] = (dates[mask] - pd.to_datetime(easter)).dt.days

        offset = df_transformed['days_to_easter']
        in_window = (offset >= -self.window_days) & (offset <= self.window_days)
        df_transformed['easter_effect'] = in_window.astype(int)
        return df_transformed

class RossmannFeatureEngineering(FeatureEngineeringStrategy):
    """Clean and derive store/competition columns when they are present."""

    def apply_transformation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the retail columns normalised.

        Each step runs only when its input columns exist:

        * ``StateHoliday``: values are stringified and mapped
          ``'0' -> 0, 'a' -> 1, 'b' -> 2, 'c' -> 3``; anything else becomes 0.
        * ``CompetitionDistance``: missing values are replaced by 100000.
        * ``CompetitionOpenTime``: months between the row's ``Year``/``Month``
          and ``CompetitionOpenSinceYear``/``CompetitionOpenSinceMonth``;
          non-positive or missing results are set to 0.

        Args:
            df: Input frame; it is never modified.

        Returns:
            A new frame with the applicable columns updated or added.
        """
        logger.info("Applying Rossmann retail features.")
        df_transformed = df.copy()
        columns = df_transformed.columns

        if 'StateHoliday' in columns:
            df_transformed['StateHoliday'] = df_transformed['StateHoliday'].astype(str).map({'0': 0, 'a': 1, 'b': 2, 'c': 3}).fillna(0)

        if 'CompetitionDistance' in columns:
            # NOTE(review): 100000 presumably stands for "no competitor
            # nearby" - confirm with the modelling code.
            df_transformed['CompetitionDistance'] = df_transformed['CompetitionDistance'].fillna(100000)

        # All four inputs are read below, so all four must be present;
        # checking only the two year columns raised KeyError when a month
        # column was missing.
        required = ('Year', 'Month', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth')
        missing = [name for name in required if name not in columns]
        if not missing:
            months_open = (
                12 * (df_transformed['Year'] - df_transformed['CompetitionOpenSinceYear'])
                + (df_transformed['Month'] - df_transformed['CompetitionOpenSinceMonth'])
            )
            # Keep strictly positive values; everything else, including NaN
            # (the comparison is False), becomes 0 - same result as the
            # former per-row lambda, without the Python-level loop.
            df_transformed['CompetitionOpenTime'] = months_open.where(months_open > 0, 0)
        else:
            logger.warning(f"Skipping CompetitionOpenTime; missing columns: {missing}")
        return df_transformed

class FeatureEngineer:
    """Holds one strategy and delegates feature engineering to it."""

    def __init__(self, strategy: FeatureEngineeringStrategy):
        """Remember ``strategy`` for later calls."""
        self._strategy = strategy

    def set_strategy(self, strategy: FeatureEngineeringStrategy):
        """Replace the stored strategy with ``strategy``."""
        self._strategy = strategy

    def apply_feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the stored strategy on ``df`` and return whatever it returns."""
        transform = self._strategy.apply_transformation
        return transform(df)