File size: 5,809 Bytes
dab71f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Feature engineering for LightGBM-based time series forecasting.

Key features:
  - Calendar: day-of-week, month, year, week-of-year, is_weekend, quarter
  - Lag features: sales at t-7, t-14, t-21, t-28, t-35, t-42, t-56, t-364 (same weekday last year)
  - Rolling statistics: 7-day and 28-day rolling mean/std of sales lagged by HORIZON
  - Trend: linear trend index (not yet implemented in this module)
  - External: sell_price, snap flag, event indicators

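Feature columns are written to a copy of the input dataframe rather than in place.
Rolling statistics are lag-safe: sales are shifted by HORIZON days before the
window is applied. NOTE: LAG_DAYS also contains lags shorter than HORIZON, which
are not >= HORIZON days back from the forecast date; confirm how they are
populated at prediction time before relying on a no-leakage guarantee.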
"""
from __future__ import annotations

import numpy as np
import pandas as pd

from src.config import TARGET_COL, DATE_COL, ID_COL, HORIZON


# ── Calendar features ──────────────────────────────────────────────────────

def add_calendar_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with calendar columns derived from ``DATE_COL``.

    The date column is read through the ``.dt`` accessor, so it has to be a
    datetime-like column. Columns written: dayofweek, month, year, weekofyear,
    dayofyear, is_weekend, quarter, is_month_start, is_month_end.
    """
    out = df.copy()
    stamps = out[DATE_COL].dt
    weekday = stamps.dayofweek  # 0=Mon … 6=Sun

    calendar = {
        "dayofweek": weekday,
        "month": stamps.month,
        "year": stamps.year,
        "weekofyear": stamps.isocalendar().week.astype(int),
        "dayofyear": stamps.dayofyear,
        # Saturday (5) and Sunday (6) count as weekend.
        "is_weekend": (weekday >= 5).astype(int),
        "quarter": stamps.quarter,
        # Month-end / month-start: high spending days
        "is_month_start": stamps.is_month_start.astype(int),
        "is_month_end": stamps.is_month_end.astype(int),
    }
    # Assign one column at a time so the column order matches the dict order.
    for name, values in calendar.items():
        out[name] = values
    return out


# ── Lag features ───────────────────────────────────────────────────────────

# Row offsets used by add_lag_features to build the lag_<n> columns.
# NOTE(review): an earlier comment here claimed every entry is >= HORIZON (=28),
# but 7, 14 and 21 are shorter than 28 — confirm how these short lags are
# populated at forecast time.
LAG_DAYS = [7, 14, 21, 28, 35, 42, 56, 364]

def add_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a sorted copy of ``df`` with one ``lag_<n>`` column per LAG_DAYS entry.

    The frame is sorted by (ID_COL, DATE_COL) here, so callers do not need to
    pre-sort. Each lag is a shift of TARGET_COL by ``n`` rows within a series;
    this equals ``n`` days only if every series has one row per day
    (presumably true for this dataset — verify against the loader).
    """
    ordered = df.sort_values([ID_COL, DATE_COL]).copy()
    # Build the per-series view of the target once and reuse it for every lag.
    sales_by_series = ordered.groupby(ID_COL)[TARGET_COL]
    for n_back in LAG_DAYS:
        ordered[f"lag_{n_back}"] = sales_by_series.shift(n_back)
    return ordered


# ── Rolling statistics ─────────────────────────────────────────────────────

# Window lengths (in rows per series) used by add_rolling_features.
ROLL_WINDOWS  = [7, 28]
# Number of rows the target is shifted by before the rolling window is applied,
# so each window ends at t-HORIZON.
ROLL_LAG      = HORIZON    # shift before rolling so no leakage

def add_rolling_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a sorted copy of ``df`` with rolling mean/std columns per series.

    TARGET_COL is first shifted by ROLL_LAG rows within each series, then a
    window of W rows (for each W in ROLL_WINDOWS) is rolled over the shifted
    values, so every window ends at t-ROLL_LAG. Columns written:
    ``rolling_mean_<W>`` and ``rolling_std_<W>``. Missing std values (for
    example windows holding fewer than two observations) are set to 0; the
    mean is left as NaN where no shifted value is available.
    """
    frame = df.sort_values([ID_COL, DATE_COL]).copy()
    shifted_sales = frame.groupby(ID_COL)[TARGET_COL].shift(ROLL_LAG)
    # Regroup the shifted values so windows never span two different series.
    per_series = shifted_sales.groupby(frame[ID_COL])

    for window in ROLL_WINDOWS:
        frame[f"rolling_mean_{window}"] = per_series.transform(
            lambda values: values.rolling(window, min_periods=1).mean()
        )
        spread = per_series.transform(
            lambda values: values.rolling(window, min_periods=1).std()
        )
        frame[f"rolling_std_{window}"] = spread.fillna(0)

    return frame


# ── Price features ─────────────────────────────────────────────────────────

def add_price_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-series price features when a ``sell_price`` column is present.

    Columns written on a copy of ``df``:
      - ``price_norm``  : sell_price divided by the series' mean sell_price
      - ``price_change``: row-over-row percentage change within the series,
                          with missing values (e.g. the first row) set to 0

    If ``sell_price`` is absent the input frame is returned unchanged
    (the same object, not a copy).
    """
    if "sell_price" in df.columns:
        priced = df.copy()
        price_by_series = priced.groupby(ID_COL)["sell_price"]
        priced["price_norm"] = priced["sell_price"] / price_by_series.transform("mean")
        priced["price_change"] = price_by_series.pct_change().fillna(0)
        return priced
    return df


# ── Event / snap features ──────────────────────────────────────────────────

def add_event_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with binary event / SNAP indicator columns.

    - ``has_event``: 1 where ``event_name_1`` is not null (only added when
      that column exists).
    - ``is_snap``  : row-wise maximum over every column whose name starts
      with ``snap_`` (only added when ``snap_CA`` exists).
    """
    flagged = df.copy()

    if "event_name_1" in flagged.columns:
        flagged["has_event"] = (~flagged["event_name_1"].isna()).astype(int)

    # snap_CA acts as the marker that SNAP columns are available at all.
    if "snap_CA" not in flagged.columns:
        return flagged

    snap_names = [name for name in flagged.columns if name.startswith("snap_")]
    flagged["is_snap"] = flagged[snap_names].max(axis=1).astype(int)
    return flagged


# ── Master feature builder ─────────────────────────────────────────────────

# Module-level cache of feature column names; reassigned by
# build_features(fit=True) and read through get_feature_cols().
FEATURE_COLS: list[str] = []  # filled dynamically

def build_features(df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
    """
    Apply all feature engineering steps.

    Runs, in order: calendar, lag, rolling, price and event features, then
    drops the rows whose first lag column is undefined.

    Args:
        df  : DataFrame in long format with (unique_id, ds, y, optional exog).
        fit : If True, also cache the final feature column list in the
              module-level FEATURE_COLS (read it back with get_feature_cols()).

    Returns:
        DataFrame with all features added.
    """
    global FEATURE_COLS

    df = add_calendar_features(df)
    df = add_lag_features(df)
    df = add_rolling_features(df)
    df = add_price_features(df)
    df = add_event_features(df)

    # Drop rows where the first lag in LAG_DAYS is undefined (the first
    # LAG_DAYS[0] rows of each series). Longer lags may still be NaN in the
    # rows that remain.
    df = df.dropna(subset=[f"lag_{LAG_DAYS[0]}"])

    if fit:
        # Collect all numeric feature columns (exclude id/date/target and the
        # raw event text columns).
        exclude = {ID_COL, DATE_COL, TARGET_COL,
                   "event_name_1", "event_name_2",
                   "event_type_1", "event_type_2"}
        # Check numeric-ness by dtype kind instead of a fixed whitelist, so
        # columns stored as int8/int16/uint8/float16 (e.g. after memory
        # downcasting) are not silently left out. Bool columns stay excluded,
        # as they were before.
        is_numeric = pd.api.types.is_numeric_dtype
        is_bool = pd.api.types.is_bool_dtype
        FEATURE_COLS = [
            col for col, dtype in df.dtypes.items()
            if col not in exclude and is_numeric(dtype) and not is_bool(dtype)
        ]

    return df


def get_feature_cols() -> list[str]:
    """
    Return the feature names cached by the last build_features(fit=True) call.

    A fresh list is returned, so callers can modify it without touching the
    module-level cache.
    """
    return [*FEATURE_COLS]