Spaces:
Sleeping
Sleeping
File size: 5,809 Bytes
dab71f6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | """
Feature engineering for LightGBM-based time series forecasting.
Key features:
- Calendar: day-of-week, month, year, week-of-year, is_weekend, quarter
- Lag features: sales at t-7, t-14, t-21, t-28, t-35, t-42, t-56, t-364 (same weekday one year earlier)
- Rolling statistics: 7-day and 28-day rolling mean/std
- Trend: linear trend index
- External: sell_price, snap flag, event indicators
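The input dataframe is never mutated: each step adds its columns to a copy.
Rolling statistics are computed on sales shifted by HORIZON days, so they only
use values ≥ HORIZON days back from the forecast date.
NOTE: LAG_DAYS contains 7, 14 and 21; if HORIZON is 28 those lags reach inside
the forecast window, so the "≥ HORIZON" guarantee holds only for the rolling
features and the longer lags.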
"""
from __future__ import annotations
import numpy as np
import pandas as pd
from src.config import TARGET_COL, DATE_COL, ID_COL, HORIZON
# ── Calendar features ─────────────────────────────────────────────────────
def add_calendar_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive calendar columns from the DATE_COL timestamps.

    Args:
        df : DataFrame whose DATE_COL column supports the pandas ``.dt``
             accessor (i.e. is datetime-like).
    Returns:
        A copy of ``df`` with dayofweek, month, year, weekofyear, dayofyear,
        is_weekend, quarter, is_month_start and is_month_end appended.
    """
    out = df.copy()
    when = out[DATE_COL].dt

    derived = {
        "dayofweek": when.dayofweek,  # 0=Mon … 6=Sun
        "month": when.month,
        "year": when.year,
        "weekofyear": when.isocalendar().week.astype(int),  # ISO week number
        "dayofyear": when.dayofyear,
        "is_weekend": (when.dayofweek >= 5).astype(int),  # Sat/Sun -> 1
        "quarter": when.quarter,
        # Month-start / month-end flags: high spending days
        "is_month_start": when.is_month_start.astype(int),
        "is_month_end": when.is_month_end.astype(int),
    }
    # Assign one by one so the new columns keep this exact order.
    for name, values in derived.items():
        out[name] = values
    return out
# ── Lag features ──────────────────────────────────────────────────────────
# Row offsets used to build the lag_<n> columns (one column per entry).
# NOTE(review): 7, 14 and 21 are shorter than a 28-day HORIZON (the value the
# previous comment assumed), so only the entries >= 28 are guaranteed to lie
# outside the forecast window - confirm how the short lags are supplied at
# prediction time.
LAG_DAYS = [7, 14, 21, 28, 35, 42, 56, 364]
def add_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach one ``lag_<n>`` column per entry of LAG_DAYS.

    The frame is sorted by (ID_COL, DATE_COL) here, so callers do not have to
    pre-sort. Each lag is a per-series *row* shift of TARGET_COL; it equals an
    n-day lag only if every series has one row per day (assumption - not
    checked here).

    Returns:
        A sorted copy of ``df`` with the lag columns appended. The first n
        rows of each series are NaN in ``lag_<n>``.
    """
    ordered = df.sort_values([ID_COL, DATE_COL]).copy()
    # Group once; shifting never crosses a series boundary.
    target_by_series = ordered.groupby(ID_COL)[TARGET_COL]
    for n_rows in LAG_DAYS:
        ordered[f"lag_{n_rows}"] = target_by_series.shift(n_rows)
    return ordered
# ── Rolling statistics ────────────────────────────────────────────────────
# Window lengths (in rows) behind the rolling_mean_<w> / rolling_std_<w> columns.
ROLL_WINDOWS = [7, 28]
# Sales are shifted by this many rows before any rolling window is applied,
# so every window ends HORIZON rows before the current row (no leakage).
ROLL_LAG = HORIZON
def add_rolling_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add rolling mean and standard deviation of lagged sales.

    TARGET_COL is first shifted by ROLL_LAG rows within each series; the
    rolling windows in ROLL_WINDOWS are then applied to the shifted values,
    so a window of W rows ends ROLL_LAG rows before the current row.

    Returns:
        A copy of ``df`` sorted by (ID_COL, DATE_COL) with
        ``rolling_mean_<w>`` and ``rolling_std_<w>`` columns. Undefined
        standard deviations (fewer than two observations in the window) are
        stored as 0; undefined means stay NaN.
    """
    ordered = df.sort_values([ID_COL, DATE_COL]).copy()
    shifted_sales = ordered.groupby(ID_COL)[TARGET_COL].shift(ROLL_LAG)
    # Re-group the shifted values so windows never span two series.
    shifted_by_series = shifted_sales.groupby(ordered[ID_COL])

    for window in ROLL_WINDOWS:

        def trailing(series: pd.Series, _size: int = window):
            # min_periods=1: emit a value as soon as one observation exists.
            return series.rolling(_size, min_periods=1)

        window_mean = shifted_by_series.transform(lambda s: trailing(s).mean())
        ordered[f"rolling_mean_{window}"] = window_mean

        window_std = shifted_by_series.transform(lambda s: trailing(s).std())
        ordered[f"rolling_std_{window}"] = window_std.fillna(0)
    return ordered
# ── Price features ────────────────────────────────────────────────────────
def add_price_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-series price features derived from ``sell_price``.

    * ``price_norm``   - sell_price divided by the mean sell_price of its series.
    * ``price_change`` - row-over-row percentage change within the series,
      with missing values replaced by 0.

    ``price_change`` follows the row order of ``df`` as given; this function
    does not sort, so rows should already be chronological within each series.

    Returns:
        ``df`` itself, untouched, when it has no ``sell_price`` column;
        otherwise a copy with the two columns added.
    """
    if "sell_price" in df.columns:
        out = df.copy()
        price_by_series = out.groupby(ID_COL)["sell_price"]
        # Compute both features before touching the frame.
        relative_price = out["sell_price"] / price_by_series.transform("mean")
        step_change = price_by_series.pct_change().fillna(0)
        out["price_norm"] = relative_price
        out["price_change"] = step_change
        return out
    return df
# ── Event / snap features ─────────────────────────────────────────────────
def add_event_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add 0/1 indicator columns for events and SNAP days.

    * ``has_event`` - 1 where ``event_name_1`` is not null (only added when
      that column exists).
    * ``is_snap``   - row-wise maximum over every ``snap_*`` column, cast to
      int (only added when a ``snap_CA`` column exists).

    Returns:
        A copy of ``df`` with whichever indicators apply.
    """
    out = df.copy()

    if "event_name_1" in out.columns:
        event_present = out["event_name_1"].notna()
        out["has_event"] = event_present.astype(int)

    if "snap_CA" in out.columns:
        # snap_CA gates the feature, but every snap_* column contributes.
        snap_names = [name for name in out.columns if name.startswith("snap_")]
        any_snap = out[snap_names].max(axis=1)
        out["is_snap"] = any_snap.astype(int)

    return out
# ── Master feature builder ────────────────────────────────────────────────
# Feature column names cached by build_features(fit=True); read them through
# get_feature_cols(). Empty until the first fit-mode call.
FEATURE_COLS: list[str] = []
def build_features(df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
    """
    Run the full feature pipeline: calendar, lag, rolling, price, event.

    Args:
        df  : DataFrame in long format with (unique_id, ds, y, optional exog).
        fit : If True, also refresh the module-level FEATURE_COLS cache with
              the feature columns found in the result.
    Returns:
        DataFrame with all features added. Rows whose shortest lag
        (``lag_<LAG_DAYS[0]>``) is NaN are removed; longer lags may still be
        NaN in the rows that remain.
    """
    global FEATURE_COLS

    pipeline = (
        add_calendar_features,
        add_lag_features,
        add_rolling_features,
        add_price_features,
        add_event_features,
    )
    for step in pipeline:
        df = step(df)

    # Only the shortest lag decides which rows survive.
    shortest_lag_col = f"lag_{LAG_DAYS[0]}"
    df = df.dropna(subset=[shortest_lag_col])

    if not fit:
        return df

    # Identifier, date, target and raw event text columns are never features.
    non_features = {ID_COL, DATE_COL, TARGET_COL,
                    "event_name_1", "event_name_2",
                    "event_type_1", "event_type_2"}
    # Only columns with exactly these dtypes are kept as features.
    accepted_dtypes = (np.float64, np.float32,
                       np.int64, np.int32, int, float)

    selected = []
    for name in df.columns:
        if name in non_features:
            continue
        if df[name].dtype in accepted_dtypes:
            selected.append(name)
    FEATURE_COLS = selected
    return df
def get_feature_cols() -> list[str]:
    """
    Return the feature columns cached by the last build_features(fit=True) call.

    The result is a fresh list, so callers may modify it without affecting
    the cached FEATURE_COLS.
    """
    return [*FEATURE_COLS]
|