# Spaces: Sleeping
# File size: 3,756 Bytes
# 52cc99a
import pandas as pd
import numpy as np
import logging
from typing import Iterable
logger = logging.getLogger(__name__)
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with calendar columns.

    The date column is ``"Date"`` when present, otherwise ``"date"``; it is
    converted to datetime in the returned copy. Added columns: ``Year``,
    ``Month``, ``Day``, ``DayOfWeek`` (1 = Monday ... 7 = Sunday),
    ``IsWeekend`` (1 for Saturday/Sunday, else 0) and ``DayOfMonth`` (same
    values as ``Day``). The input frame is left untouched.
    """
    out = df.copy()
    date_col = "date" if "Date" not in out.columns else "Date"

    stamps = pd.to_datetime(out[date_col])
    weekday = stamps.dt.dayofweek  # pandas convention: 0 = Monday
    out[date_col] = stamps

    calendar = {
        "Year": stamps.dt.year,
        "Month": stamps.dt.month,
        "Day": stamps.dt.day,
        "DayOfWeek": weekday + 1,
        "IsWeekend": (weekday >= 5).astype(int),
        "DayOfMonth": stamps.dt.day,
    }
    for name, values in calendar.items():
        out[name] = values
    return out
def apply_fourier_seasonality(df: pd.DataFrame, period: float = 365.25, order: int = 5) -> pd.DataFrame:
    """Return a copy of ``df`` with Fourier seasonality terms appended.

    For each harmonic ``i`` in ``1..order`` two columns are added,
    ``fourier_sin_{i}`` and ``fourier_cos_{i}``, evaluated at
    ``2 * pi * i * t / period`` where ``t`` is the number of (fractional)
    days elapsed since 1970-01-01.

    Args:
        df: Frame holding a ``"Date"`` column (or ``"date"`` when ``"Date"``
            is absent).
        period: Length of one seasonal cycle, in days.
        order: Number of harmonics; ``order < 1`` adds no columns.

    Returns:
        A new DataFrame; the input is not modified. Rows whose date is
        missing (NaT) get NaN in every Fourier column.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"
    dates = pd.to_datetime(df[date_col])

    # Timezone-aware values are measured on the UTC timeline, matching what
    # reading the raw epoch integers would give.
    if dates.dt.tz is not None:
        dates = dates.dt.tz_convert("UTC").dt.tz_localize(None)

    # Timedelta arithmetic is independent of the column's storage resolution
    # (s / ms / us / ns). Reinterpreting the raw int64 buffer and dividing by
    # 10**9 is only correct for nanosecond-resolution data.
    days = ((dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(days=1)).to_numpy()

    for i in range(1, order + 1):
        angle = 2 * np.pi * i * days / period
        df[f"fourier_sin_{i}"] = np.sin(angle)
        df[f"fourier_cos_{i}"] = np.cos(angle)
    return df
def add_holiday_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with Easter proximity features.

    Added columns:

    * ``days_to_easter`` - signed day offset ``date - Easter Sunday`` of the
      same calendar year (negative before Easter, positive after). Rows whose
      year is not in the built-in table (2013-2016), or whose date is
      missing, keep the sentinel value 999.
    * ``easter_effect`` - 1 when the date lies within 7 days of Easter
      (inclusive on both sides), otherwise 0.

    The date column is ``"Date"`` when present, otherwise ``"date"``. The
    input frame is not modified.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"
    dates = pd.to_datetime(df[date_col])

    # Known Easter dates for the dataset period
    easter_dates = {
        2013: "2013-03-31",
        2014: "2014-04-20",
        2015: "2015-04-05",
        2016: "2016-03-27",
    }

    # The year extraction does not depend on the loop variable; do it once.
    years = dates.dt.year

    df["days_to_easter"] = 999
    for year, date_str in easter_dates.items():
        in_year = years == year
        # Vectorized reduction; the builtin any() would walk the Series
        # element by element in Python.
        if in_year.any():
            offset = dates[in_year] - pd.to_datetime(date_str)
            df.loc[in_year, "days_to_easter"] = offset.dt.days

    df["easter_effect"] = df["days_to_easter"].between(-7, 7).astype(int)
    return df
def apply_rossmann_store_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with store-level columns made numeric.

    * ``StoreType`` (if present): ``a/b/c/d`` -> ``1/2/3/4``, anything else 0.
    * ``Assortment`` (if present): ``a/b/c`` -> ``1/2/3``, anything else 0.
    * ``CompetitionDistance`` (if present): adds ``LogCompetitionDistance``
      as ``log1p`` of the distance.

    Columns that are absent are skipped; the input frame is not modified.
    """
    out = df.copy()

    # Ordinal codes for the categorical store descriptors
    store_type_codes = {"a": 1, "b": 2, "c": 3, "d": 4}
    assortment_codes = {"a": 1, "b": 2, "c": 3}
    encodings = (("StoreType", store_type_codes), ("Assortment", assortment_codes))

    for column, codes in encodings:
        if column not in out.columns:
            continue
        labels = out[column].astype(str)
        out[column] = labels.map(codes).fillna(0)

    # Log-transformed competition distance
    if "CompetitionDistance" in out.columns:
        distance = out["CompetitionDistance"]
        out["LogCompetitionDistance"] = np.log1p(distance)

    return out
def apply_feature_pipeline(
    df: pd.DataFrame,
    *,
    fourier_period: float = 365.25,
    fourier_order: int = 5,
) -> pd.DataFrame:
    """Run every feature-engineering step, in order, on ``df``.

    Steps: calendar features, Fourier seasonality terms (configured through
    ``fourier_period`` and ``fourier_order``), holiday features, then the
    store-specific transformations. The result of the last step is returned.
    """
    with_calendar = extract_date_features(df)
    with_seasonality = apply_fourier_seasonality(
        with_calendar,
        period=fourier_period,
        order=fourier_order,
    )
    with_holidays = add_holiday_features(with_seasonality)
    features = apply_rossmann_store_features(with_holidays)
    return features
def build_feature_matrix(df: pd.DataFrame, feature_cols: Iterable[str]) -> pd.DataFrame:
    """Construct the numeric feature matrix in the order of ``feature_cols``.

    Args:
        df: Frame holding the engineered features.
        feature_cols: Column names to emit, in the desired output order.

    Returns:
        A DataFrame indexed like ``df`` with one column per requested
        feature. Columns absent from ``df`` are filled with 0 (and reported
        in a warning log record), ``Year`` is capped at 2015, non-numeric
        values are coerced to NaN, and all remaining NaNs become 0.
    """
    columns = {}
    missing = []
    for col in feature_cols:
        if col not in df.columns:
            missing.append(col)
            columns[col] = 0
            continue
        values = df[col]
        # Dataset ends in 2015; clip year to prevent extrapolation on unseen
        # year values. Coerce first so a string-typed Year is capped rather
        # than raising on the str/int comparison.
        if col == "Year":
            values = pd.to_numeric(values, errors="coerce").clip(upper=2015)
        columns[col] = values

    if missing:
        # Zero-filling keeps the matrix shape stable, but a missing feature
        # usually signals a training/serving mismatch, so make it visible.
        logging.getLogger(__name__).warning(
            "Feature columns missing from input; filled with 0: %s", missing
        )

    # Build the frame in one construction instead of inserting column by
    # column, which fragments the underlying blocks for wide feature sets.
    X = pd.DataFrame(columns, index=df.index)

    # Ensure numeric and handle any remaining NaNs
    return X.apply(pd.to_numeric, errors="coerce").fillna(0)
|