Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| from typing import Iterable | |
| logger = logging.getLogger(__name__) | |
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive basic calendar columns from the frame's date column.

    The input is not modified; a copy is returned. The date column is
    ``"Date"`` when that column exists, otherwise ``"date"``. It is parsed
    with ``pd.to_datetime`` and stored back in parsed form.

    Columns written:
        Year, Month, Day: calendar components of the date.
        DayOfWeek: 1 (Monday) through 7 (Sunday).
        IsWeekend: 1 for Saturday/Sunday, otherwise 0.
        DayOfMonth: same value as ``Day``.
    """
    date_col = "date" if "Date" not in df.columns else "Date"

    out = df.copy()
    stamps = pd.to_datetime(out[date_col])
    weekday = stamps.dt.dayofweek  # pandas convention: Monday == 0
    day_of_month = stamps.dt.day

    out[date_col] = stamps
    out["Year"] = stamps.dt.year
    out["Month"] = stamps.dt.month
    out["Day"] = day_of_month
    out["DayOfWeek"] = weekday + 1
    out["IsWeekend"] = (weekday >= 5).astype(int)
    out["DayOfMonth"] = day_of_month
    return out
def apply_fourier_seasonality(df: pd.DataFrame, period: float = 365.25, order: int = 5) -> pd.DataFrame:
    """Add Fourier sine/cosine terms of the date to capture seasonality.

    The input is not modified; a copy is returned. The date column is
    ``"Date"`` when that column exists, otherwise ``"date"``.

    Args:
        df: Frame containing the date column.
        period: Length of one seasonal cycle in days (365.25 = one year).
        order: Number of harmonics. For each ``i`` in ``1..order`` the
            columns ``fourier_sin_{i}`` and ``fourier_cos_{i}`` are written.

    Returns:
        The copy with the Fourier columns added. Rows whose date is missing
        (NaT) get NaN in every Fourier column.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"

    stamps = pd.to_datetime(df[date_col]).values
    # Fractional days since the Unix epoch. Using datetime64/timedelta64
    # arithmetic keeps the result correct whatever resolution the column is
    # stored in (s, ms, us or ns); reinterpreting the raw integers as
    # nanoseconds would be wrong for every unit except ns.
    days_since_epoch = (stamps - np.datetime64("1970-01-01")) / np.timedelta64(1, "D")

    for i in range(1, order + 1):
        angle = 2 * np.pi * i * days_since_epoch / period
        df[f"fourier_sin_{i}"] = np.sin(angle)
        df[f"fourier_cos_{i}"] = np.cos(angle)
    return df
def _easter_sunday(year: int) -> pd.Timestamp:
    """Return Western (Gregorian) Easter Sunday for ``year`` (Meeus/Jones/Butcher)."""
    a = year % 19
    b, c = divmod(year, 100)
    d, e = divmod(b, 4)
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i, k = divmod(c, 4)
    l = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * l) // 451
    month, day_index = divmod(h + l - 7 * m + 114, 31)
    return pd.Timestamp(year=year, month=month, day=day_index + 1)


def add_holiday_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add Easter offset and Easter-window indicator columns.

    The input is not modified; a copy is returned. The date column is
    ``"Date"`` when that column exists, otherwise ``"date"``.

    Columns written:
        days_to_easter: Signed day offset ``date - Easter Sunday`` of the
            same calendar year (negative before Easter, 0 on Easter Sunday,
            positive after). Rows whose date is missing (NaT) keep the
            sentinel value 999.
        easter_effect: 1 when the date lies within 7 days of Easter Sunday
            (inclusive on both sides), otherwise 0.

    Easter is computed for whatever years occur in the data rather than read
    from a fixed table, so dates outside 2013-2016 get a real offset instead
    of the sentinel. For 2013-2016 the computed dates are 2013-03-31,
    2014-04-20, 2015-04-05 and 2016-03-27.
    """
    df = df.copy()
    date_col = "Date" if "Date" in df.columns else "date"
    dates = pd.to_datetime(df[date_col])

    years = dates.dt.year
    # Key the lookup by the exact scalars found in `years` (int, or float
    # when NaT rows are present) so the mapping below matches them directly.
    easter_by_year = {year: _easter_sunday(int(year)) for year in years.dropna().unique()}
    # pd.to_datetime normalises the all-missing / empty case to datetime dtype.
    easter = pd.to_datetime(years.map(easter_by_year))

    offset_days = (dates - easter).dt.days
    df["days_to_easter"] = offset_days.fillna(999).astype("int64")
    df["easter_effect"] = ((df["days_to_easter"] >= -7) & (df["days_to_easter"] <= 7)).astype(int)
    return df
def apply_rossmann_store_features(df: pd.DataFrame) -> pd.DataFrame:
    """Encode store attributes and log-scale the competition distance.

    The input is not modified; a copy is returned. Each step only runs when
    its source column is present:

    * ``StoreType``: values are cast to ``str`` and mapped a/b/c/d -> 1/2/3/4;
      anything else becomes 0.
    * ``Assortment``: values are cast to ``str`` and mapped a/b/c -> 1/2/3;
      anything else becomes 0.
    * ``CompetitionDistance``: ``LogCompetitionDistance`` is added as
      ``log1p`` of the distance.
    """
    out = df.copy()

    # Ordinal code for each categorical letter, per column.
    letter_codes = {
        "StoreType": {"a": 1, "b": 2, "c": 3, "d": 4},
        "Assortment": {"a": 1, "b": 2, "c": 3},
    }
    for column, codes in letter_codes.items():
        if column not in out.columns:
            continue
        as_text = out[column].astype(str)
        out[column] = as_text.map(codes).fillna(0)

    # Log-transformed competition distance.
    if "CompetitionDistance" in out.columns:
        distance = out["CompetitionDistance"]
        out["LogCompetitionDistance"] = np.log1p(distance)

    return out
def apply_feature_pipeline(
    df: pd.DataFrame,
    *,
    fourier_period: float = 365.25,
    fourier_order: int = 5,
) -> pd.DataFrame:
    """Run the complete feature-engineering sequence shared by training and serving.

    Stages run in this order: calendar features, Fourier seasonality terms
    (using ``fourier_period`` and ``fourier_order``), holiday features, then
    store features. The output of the last stage is returned.
    """
    with_calendar = extract_date_features(df)
    with_seasonality = apply_fourier_seasonality(
        with_calendar,
        period=fourier_period,
        order=fourier_order,
    )
    with_holidays = add_holiday_features(with_seasonality)
    enriched = apply_rossmann_store_features(with_holidays)
    return enriched
def build_feature_matrix(
    df: pd.DataFrame,
    feature_cols: Iterable[str],
    *,
    max_year: int = 2015,
) -> pd.DataFrame:
    """Construct the final numeric feature matrix in the requested column order.

    Args:
        df: Frame holding the engineered features.
        feature_cols: Column names to emit, in output order. A name that is
            not a column of ``df`` is emitted as an all-zero column and
            reported in a warning log record.
        max_year: Upper bound applied to the ``Year`` column. The default
            (2015) is the last year of the dataset; capping keeps the model
            from extrapolating on year values it never saw.

    Returns:
        A frame indexed like ``df`` whose columns follow ``feature_cols``.
        Every column is coerced to numeric; values that cannot be converted
        and any remaining NaNs are replaced by 0.
    """
    X = pd.DataFrame(index=df.index)
    missing = []
    for col in feature_cols:
        if col not in df.columns:
            missing.append(col)
            X[col] = 0
            continue
        val = df[col]
        if col == "Year":
            # Coerce first so a string-typed Year can still be capped.
            val = pd.to_numeric(val, errors="coerce").clip(upper=max_year)
        X[col] = val

    if missing:
        # A silently zero-filled feature is a training/serving skew hazard,
        # so surface it. getLogger(__name__) is the module's own logger.
        logging.getLogger(__name__).warning(
            "Feature columns missing from input and filled with 0: %s", missing
        )

    # Ensure numeric and handle any remaining NaNs.
    X = X.apply(pd.to_numeric, errors="coerce").fillna(0)
    return X