ymlin105's picture
init: simplify rossmann forecasting project baseline
52cc99a
import pandas as pd
import numpy as np
import logging
from typing import Iterable
logger = logging.getLogger(__name__)
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with calendar columns derived from its date column.

    The date column is ``"Date"`` when present, otherwise ``"date"``; it is
    converted to datetime in the returned copy. Added columns: ``Year``,
    ``Month``, ``Day``, ``DayOfWeek`` (1 = Monday ... 7 = Sunday),
    ``IsWeekend`` (1 for Saturday/Sunday) and ``DayOfMonth`` (same values as
    ``Day``). The input frame is not modified.
    """
    out = df.copy()
    col = "Date" if "Date" in out.columns else "date"
    out[col] = pd.to_datetime(out[col])

    when = out[col].dt
    weekday = when.dayofweek  # pandas convention: 0 = Monday ... 6 = Sunday

    calendar = {
        "Year": when.year,
        "Month": when.month,
        "Day": when.day,
        "DayOfWeek": weekday + 1,
        "IsWeekend": (weekday >= 5).astype(int),
        "DayOfMonth": when.day,
    }
    # Insert in a fixed order so the resulting column layout is stable.
    for name, values in calendar.items():
        out[name] = values
    return out
def apply_fourier_seasonality(df: pd.DataFrame, period: float = 365.25, order: int = 5) -> pd.DataFrame:
    """Return a copy of *df* with Fourier seasonality terms added.

    For each harmonic ``i`` in ``1..order`` the columns ``fourier_sin_{i}``
    and ``fourier_cos_{i}`` are added, evaluated at
    ``2 * pi * i * t / period`` where ``t`` is the number of days (fractional)
    between the Unix epoch and the row's date.

    Args:
        df: Frame containing a ``"Date"`` (preferred) or ``"date"`` column.
        period: Length of one seasonal cycle, in days.
        order: Number of harmonics; ``order <= 0`` adds no columns.

    Returns:
        A new frame; the input is not modified. Rows whose date is missing
        (NaT) get NaN in every Fourier column.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"

    # Days since 1970-01-01. Use datetime arithmetic instead of viewing the
    # raw int64 buffer: the raw integers are only nanoseconds when the column
    # has ns resolution, and pandas 2.x can carry s/ms/us-resolution columns.
    stamps = pd.to_datetime(df[date_col]).values
    times = (stamps - np.datetime64("1970-01-01")) / np.timedelta64(1, "D")

    for i in range(1, order + 1):
        angle = 2 * np.pi * i * times / period
        df[f"fourier_sin_{i}"] = np.sin(angle)
        df[f"fourier_cos_{i}"] = np.cos(angle)
    return df
def _easter_sunday(year: int) -> pd.Timestamp:
    """Return the date of Western (Gregorian) Easter Sunday for *year*."""
    # Anonymous Gregorian computus (Meeus/Jones/Butcher algorithm).
    golden = year % 19
    century, year_in_century = divmod(year, 100)
    leap_centuries, century_rem = divmod(century, 4)
    moon_corr = (century - (century + 8) // 25 + 1) // 3
    epact = (19 * golden + century - leap_centuries - moon_corr + 15) % 30
    leap_years, year_rem = divmod(year_in_century, 4)
    weekday_shift = (32 + 2 * century_rem + 2 * leap_years - epact - year_rem) % 7
    late_corr = (golden + 11 * epact + 22 * weekday_shift) // 451
    month, day = divmod(epact + weekday_shift - 7 * late_corr + 114, 31)
    return pd.Timestamp(year=year, month=month, day=day + 1)


def add_holiday_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with Easter-related features added.

    Adds two columns:

    * ``days_to_easter`` - signed day offset of each row's date from Easter
      Sunday of the same calendar year (negative before Easter, positive
      after). Rows with a missing date keep the sentinel value 999.
    * ``easter_effect`` - 1 when the date lies within 7 days of Easter
      Sunday (inclusive on both sides), otherwise 0.

    Easter is computed for whatever years occur in the data, so dates outside
    the original 2013-2016 range are handled as well.

    Args:
        df: Frame containing a ``"Date"`` (preferred) or ``"date"`` column.

    Returns:
        A new frame; the input is not modified.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"
    dates = pd.to_datetime(df[date_col])
    years = dates.dt.year

    # Fill positionally through a numpy buffer so the result does not depend
    # on label alignment (the frame's index may contain duplicates).
    days_to_easter = np.full(len(df), 999, dtype="int64")
    for year in years.dropna().unique():
        in_year = (years == year).to_numpy()
        offset = dates[in_year] - _easter_sunday(int(year))
        days_to_easter[in_year] = offset.dt.days.to_numpy()

    df["days_to_easter"] = days_to_easter
    df["easter_effect"] = df["days_to_easter"].between(-7, 7).astype(int)
    return df
def apply_rossmann_store_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with store-level columns encoded numerically.

    * ``StoreType`` (if present): letters ``a``-``d`` become 1-4.
    * ``Assortment`` (if present): letters ``a``-``c`` become 1-3.
    * ``CompetitionDistance`` (if present): a ``LogCompetitionDistance``
      column holding ``log(1 + distance)`` is added.

    Any StoreType/Assortment value outside the known letters (including
    missing values) is encoded as 0. Columns that are absent are skipped.
    """
    encoded = df.copy()

    # Ordinal codes for the categorical store descriptors.
    ordinal_codes = {
        "StoreType": {"a": 1, "b": 2, "c": 3, "d": 4},
        "Assortment": {"a": 1, "b": 2, "c": 3},
    }
    for column, codes in ordinal_codes.items():
        if column not in encoded.columns:
            continue
        as_text = encoded[column].astype(str)
        encoded[column] = as_text.map(codes).fillna(0)

    # Log-scaled competition distance.
    if "CompetitionDistance" in encoded.columns:
        encoded["LogCompetitionDistance"] = np.log1p(encoded["CompetitionDistance"])
    return encoded
def apply_feature_pipeline(
    df: pd.DataFrame,
    *,
    fourier_period: float = 365.25,
    fourier_order: int = 5,
) -> pd.DataFrame:
    """Run every feature-engineering stage, in order, and return the result.

    Stages: calendar features, Fourier seasonality terms (configured by
    ``fourier_period`` and ``fourier_order``), holiday features, then
    store-level encodings. Training and serving both go through this single
    entry point so they see the same transformations.
    """
    return (
        df.pipe(extract_date_features)
        .pipe(apply_fourier_seasonality, period=fourier_period, order=fourier_order)
        .pipe(add_holiday_features)
        .pipe(apply_rossmann_store_features)
    )
def build_feature_matrix(df: pd.DataFrame, feature_cols: Iterable[str]) -> pd.DataFrame:
    """Build the numeric model input with columns in exactly the requested order.

    Args:
        df: Frame holding the engineered features.
        feature_cols: Names of the columns the model expects, in order. Any
            iterable is accepted; it is consumed once.

    Returns:
        A new frame indexed like *df* with one column per requested feature.
        Features missing from *df* are filled with 0 (and reported via a
        warning log record). ``Year`` is capped at 2015. All values are
        coerced to numeric; anything unparseable or missing becomes 0.
    """
    wanted = list(feature_cols)
    X = pd.DataFrame(index=df.index)
    absent = []
    for col in wanted:
        if col in df.columns:
            val = df[col]
            # Dataset ends in 2015; clip year to prevent extrapolation on unseen year values.
            if col == "Year":
                # Coerce first so a string-typed Year does not break the comparison in clip().
                val = pd.to_numeric(val, errors="coerce").clip(upper=2015)
            X[col] = val
        else:
            absent.append(col)
            X[col] = 0

    if absent:
        # Zero-filling keeps the matrix shape stable, but a silently missing
        # feature is a common cause of training/serving skew - make it visible.
        logging.getLogger(__name__).warning(
            "Feature columns missing from input, filled with 0: %s", absent
        )

    # Ensure numeric and handle any remaining NaNs.
    X = X.apply(pd.to_numeric, errors="coerce").fillna(0)
    return X