File size: 3,756 Bytes
52cc99a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
import logging
from typing import Iterable

logger = logging.getLogger(__name__)

def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with calendar columns derived from its date column.

    The date column is ``"Date"`` when present, otherwise ``"date"``; in the
    returned copy it is converted with ``pd.to_datetime``.  The input frame
    is not modified.

    Added columns: ``Year``, ``Month``, ``Day``, ``DayOfWeek`` (pandas
    ``dayofweek`` plus one, so Monday is 1 and Sunday is 7), ``IsWeekend``
    (1 when ``dayofweek >= 5``, else 0) and ``DayOfMonth`` (same values as
    ``Day``).
    """
    frame = df.copy()
    key = "date" if "Date" not in frame.columns else "Date"
    frame[key] = pd.to_datetime(frame[key])

    stamp = frame[key].dt
    weekday = stamp.dayofweek  # 0 = Monday ... 6 = Sunday

    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        "Year": stamp.year,
        "Month": stamp.month,
        "Day": stamp.day,
        "DayOfWeek": weekday + 1,
        "IsWeekend": (weekday >= 5).astype(int),
        "DayOfMonth": stamp.day,
    }
    for name, values in derived.items():
        frame[name] = values
    return frame

def apply_fourier_seasonality(df: pd.DataFrame, period: float = 365.25, order: int = 5) -> pd.DataFrame:
    """Return a copy of *df* with Fourier sine/cosine terms of the date column.

    The date column is ``"Date"`` when present, otherwise ``"date"``.  Each
    date is turned into fractional days since the Unix epoch ``t`` and, for
    every ``i`` in ``1..order``, two columns are added::

        fourier_sin_i = sin(2 * pi * i * t / period)
        fourier_cos_i = cos(2 * pi * i * t / period)

    Args:
        df: Input frame; it is not modified.
        period: Length of one seasonal cycle in days (default one year).
        order: Number of harmonics; ``order <= 0`` adds no columns.

    Returns:
        A copy of *df* with ``2 * order`` additional float columns.  Rows
        whose date is NaT get NaN in every Fourier column.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"

    # Force nanosecond resolution before reinterpreting the timestamps as
    # integers.  Datetime columns are not guaranteed to be datetime64[ns]
    # (pandas >= 2 keeps s/ms/us units), and viewing those as nanoseconds
    # would shrink the day count by orders of magnitude.
    stamps = pd.to_datetime(df[date_col]).values.astype("datetime64[ns]")

    # Days since 1970-01-01 as floats.
    times = stamps.view(np.int64) / 10**9 / (60 * 60 * 24)
    # NaT is stored as the minimum int64; without this mask it would turn into
    # a finite but meaningless day count and produce plausible-looking garbage.
    times = np.where(np.isnat(stamps), np.nan, times)

    for i in range(1, order + 1):
        df[f"fourier_sin_{i}"] = np.sin(2 * np.pi * i * times / period)
        df[f"fourier_cos_{i}"] = np.cos(2 * np.pi * i * times / period)
    return df

def _gregorian_easter(year: int) -> pd.Timestamp:
    """Return Western (Gregorian) Easter Sunday of *year* (anonymous Gregorian algorithm)."""
    a = year % 19
    b, c = divmod(year, 100)
    d, e = divmod(b, 4)
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i, k = divmod(c, 4)
    l = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * l) // 451
    month, day_index = divmod(h + l - 7 * m + 114, 31)
    return pd.Timestamp(year=year, month=month, day=day_index + 1)


def add_holiday_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with Easter offset and Easter-window columns.

    The date column is ``"Date"`` when present, otherwise ``"date"``.

    Added columns:
        days_to_easter: Signed whole days between each date and Easter Sunday
            of the same calendar year (``date - easter``: negative before
            Easter, zero on Easter Sunday, positive after).  Rows whose date
            is NaT keep the sentinel value 999.
        easter_effect: 1 when ``-7 <= days_to_easter <= 7``, else 0.

    Easter is computed rather than looked up, so years outside the original
    2013-2016 dataset window (e.g. at serving time) get a real offset instead
    of the sentinel.  For 2013-2016 the computed dates are 2013-03-31,
    2014-04-20, 2015-04-05 and 2016-03-27.

    The input frame is not modified.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"
    dates = pd.to_datetime(df[date_col])
    years = dates.dt.year

    df["days_to_easter"] = 999
    # Only visit years that actually occur; NaT rows have no year and keep 999.
    for year in years.dropna().unique():
        in_year = years == year
        easter = _gregorian_easter(int(year))
        df.loc[in_year, "days_to_easter"] = (dates[in_year] - easter).dt.days

    df["easter_effect"] = df["days_to_easter"].between(-7, 7).astype(int)
    return df

def apply_rossmann_store_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with store-level columns encoded numerically.

    Each step is applied only when its source column exists:

    * ``StoreType``: cast to ``str`` and mapped ``a, b, c, d -> 1, 2, 3, 4``;
      any other value becomes 0.
    * ``Assortment``: cast to ``str`` and mapped ``a, b, c -> 1, 2, 3``;
      any other value becomes 0.
    * ``CompetitionDistance``: a new ``LogCompetitionDistance`` column holds
      ``log1p`` of it; the source column is left unchanged.

    The input frame is not modified.
    """
    encoded = df.copy()

    # Letter-code lookup per categorical column; unmapped values fall back to 0.
    category_codes = {
        "StoreType": {"a": 1, "b": 2, "c": 3, "d": 4},
        "Assortment": {"a": 1, "b": 2, "c": 3},
    }
    for column, codes in category_codes.items():
        if column not in encoded.columns:
            continue
        as_text = encoded[column].astype(str)
        encoded[column] = as_text.map(codes).fillna(0)

    # Log-scaled competition distance
    if "CompetitionDistance" in encoded.columns:
        distance = encoded["CompetitionDistance"]
        encoded["LogCompetitionDistance"] = np.log1p(distance)

    return encoded

def apply_feature_pipeline(
    df: pd.DataFrame,
    *,
    fourier_period: float = 365.25,
    fourier_order: int = 5,
) -> pd.DataFrame:
    """Run the complete feature-engineering chain shared by training and serving.

    Steps, in order: ``extract_date_features``, ``apply_fourier_seasonality``
    (with ``period=fourier_period`` and ``order=fourier_order``),
    ``add_holiday_features`` and ``apply_rossmann_store_features``.  Each
    step receives the previous step's output; the last step's result is
    returned.
    """
    with_calendar = extract_date_features(df)
    with_seasonality = apply_fourier_seasonality(
        with_calendar,
        period=fourier_period,
        order=fourier_order,
    )
    with_holidays = add_holiday_features(with_seasonality)
    with_store_info = apply_rossmann_store_features(with_holidays)
    return with_store_info

def build_feature_matrix(
    df: pd.DataFrame,
    feature_cols: Iterable[str],
    *,
    max_year: int = 2015,
) -> pd.DataFrame:
    """Build the numeric feature matrix with columns in ``feature_cols`` order.

    Args:
        df: Frame holding the engineered features.
        feature_cols: Column names to emit, in the desired order.  A name
            that is missing from *df* yields a column of zeros.
        max_year: Upper bound applied to the ``Year`` column.  The default
            matches the last year of the dataset, so later years do not reach
            the model as unseen values.

    Returns:
        A new DataFrame indexed like *df*.  Every column is passed through
        ``pd.to_numeric(errors="coerce")`` and remaining NaNs are replaced
        with 0.
    """
    X = pd.DataFrame(index=df.index)
    for col in feature_cols:
        if col in df.columns:
            val = df[col]
            if col == "Year":
                # Coerce before clipping: clip() raises TypeError on a
                # string/object Year column, while every other column is
                # tolerated in that form by the coercion at the end.
                val = pd.to_numeric(val, errors="coerce").clip(upper=max_year)
            X[col] = val
        else:
            X[col] = 0

    # Ensure numeric and handle any remaining NaNs
    X = X.apply(pd.to_numeric, errors="coerce").fillna(0)
    return X