|
|
import pandas as pd, numpy as np |
|
|
|
|
|
def add_time_features(df, date_col):
    """Attach simple calendar features derived from a date column.

    Parses ``date_col`` with ``errors="coerce"`` (unparseable values become
    NaT and their rows are dropped), sorts the frame by date, and appends:

    - ``year`` / ``month`` / ``day``: calendar components
    - ``dow``: day of week (Monday=0 ... Sunday=6)
    - ``week``: ISO calendar week number (as plain int)
    - ``is_weekend``: 1 for Saturday/Sunday, else 0

    These numeric columns let a model learn seasonal/weekday patterns that
    a raw date string cannot express.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; the original is not modified (a copy is returned).
    date_col : str
        Name of the column holding dates (e.g. ``"date"``).

    Returns
    -------
    pandas.DataFrame
        New frame, sorted by date, with the calendar columns appended.
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    # Rows whose date failed to parse carry no usable calendar info.
    out = out.dropna(subset=[date_col]).sort_values(date_col)

    when = out[date_col].dt
    out = out.assign(
        year=when.year,
        month=when.month,
        day=when.day,
        dow=when.dayofweek,
        week=when.isocalendar().week.astype(int),
        is_weekend=(when.dayofweek >= 5).astype(int),
    )
    return out
|
|
|
|
|
|
|
|
def add_lag_features(df, date_col, target_col, group_keys=None, lags=(1,7,14), rolls=(7,14)):
    """Append lag and rolling-window features of ``target_col``.

    For each lag ``l`` a column ``lag{l}`` holds the target value from ``l``
    rows earlier (per group, in date order). For each window ``w`` the columns
    ``rmean{w}`` / ``rstd{w}`` hold the rolling mean / standard deviation of
    the target over the previous ``w`` observations.

    BUG FIX: the rolling statistics are now computed over ``shift(1)`` of the
    target, i.e. strictly *past* values. Previously the window included the
    current row's target, leaking the label (the same value used as ``y`` in
    ``make_matrix``) into its own features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; the original is not modified (a copy is returned).
    date_col : str
        Name of the date column used to order rows.
    target_col : str
        Numeric column (e.g. sales) to derive lags/rolling stats from.
    group_keys : list[str] or None
        Columns to group by (e.g. ``["region", "item"]``) so that each
        store/item series gets its own history; names not present in ``df``
        are silently ignored. Without grouping, unrelated series would be
        mixed into one history.
    lags : iterable[int]
        Lag offsets to create (default 1, 7, 14).
    rolls : iterable[int]
        Rolling window sizes (default 7, 14). Each window requires at least
        ``max(2, w // 2)`` past observations before producing a value.

    Returns
    -------
    pandas.DataFrame
        Frame sorted by date with ``lag*`` / ``rmean*`` / ``rstd*`` columns
        added (NaN where insufficient history exists).
    """
    df = df.copy()

    # Keep only grouping columns that actually exist in the frame.
    group_keys = [c for c in (group_keys or []) if c in df.columns]

    def _featurize(part):
        # Build lag/rolling columns for one (date-ordered) series.
        part = part.sort_values(date_col).copy()
        for lag in lags:
            part[f"lag{lag}"] = part[target_col].shift(lag)
        # Only past values feed the rolling stats -- no same-row leakage.
        past = part[target_col].shift(1)
        for w in rolls:
            min_p = max(2, w // 2)
            part[f"rmean{w}"] = past.rolling(w, min_periods=min_p).mean()
            part[f"rstd{w}"] = past.rolling(w, min_periods=min_p).std()
        return part

    if group_keys:
        parts = [_featurize(part) for _, part in df.groupby(group_keys, group_keys=False)]
    else:
        parts = [_featurize(df)]

    return pd.concat(parts, axis=0).sort_values(date_col)
|
|
|
|
|
|
|
|
def make_matrix(df, mapping):
    """Build the model-ready design matrix ``X`` and target vector ``y``.

    Pipeline:

    1. Read the date/target (and optional region/brand/item) column names
       from ``mapping``.
    2. Coerce the target to numeric (missing -> 0) and the categorical
       columns to ``str``.
    3. Add calendar features via ``add_time_features`` and lag/rolling
       features via ``add_lag_features`` (grouped by the categorical
       columns when present).
    4. Drop the leading rows whose lag/rolling features are NaN.
    5. One-hot encode the categorical columns and stack them next to the
       numeric features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table; the original is not modified.
    mapping : dict
        ``{"date": ..., "target": ..., "region": ..., "brand": ..., "item": ...}``;
        region/brand/item entries are optional.

    Returns
    -------
    tuple
        ``(df, X, y, feat_names)`` where ``df`` is the featurized frame,
        ``X`` a 2-D numeric array, ``y`` the 1-D target vector and
        ``feat_names`` the column names of ``X`` (for interpretation).

    Raises
    ------
    ValueError
        If ``mapping`` lacks a date or target column name.
    """
    data = df.copy()

    date_col = mapping.get("date")
    target_col = mapping.get("target")
    if not date_col or not target_col:
        raise ValueError("date/target 컬럼 매핑이 필요합니다.")

    # Optional categorical columns, in a fixed order.
    cat_candidates = [mapping.get(k) for k in ("region", "brand", "item")]

    # Non-numeric target entries become NaN, then 0.
    data[target_col] = pd.to_numeric(data[target_col], errors="coerce").fillna(0)

    for col in cat_candidates:
        if col and col in data:
            data[col] = data[col].astype(str)

    data = add_time_features(data, date_col)
    data = add_lag_features(
        data, date_col, target_col,
        [c for c in cat_candidates if c]
    )

    # History-based features are NaN at the start of each series; drop those rows.
    lag_like = [c for c in data.columns if c.startswith(("lag", "rmean", "rstd"))]
    data = data.dropna(subset=lag_like)

    numeric_order = ["year", "month", "day", "dow", "week", "is_weekend"] + lag_like
    num_cols = [c for c in numeric_order if c in data.columns]

    X = data[num_cols].values
    feat_names = list(num_cols)

    cat_cols = [c for c in cat_candidates if c and c in data.columns]
    if cat_cols:
        # One-hot encode; hstack upcasts the dummy block alongside the numerics.
        dummies = pd.get_dummies(data[cat_cols].astype(str), dummy_na=False)
        X = np.hstack([X, dummies.values])
        feat_names = feat_names + list(dummies.columns)

    y = data[target_col].values
    return data, X, y, feat_names
|
|
|