# Source: convinient_store_2 / preprocess.py
# Uploaded by: leedami
# Commit message: "Upload 17 files"
# Commit: e1f7d51 (verified)
import pandas as pd, numpy as np
def add_time_features(df, date_col):
    """Return a copy of ``df`` with calendar features derived from ``date_col``.

    The date column is parsed with ``pd.to_datetime(..., errors="coerce")``;
    rows whose date cannot be parsed are dropped and the remaining rows are
    sorted by date. The caller's frame is never modified.

    Added columns (in this order):
        year        calendar year
        month       month number, 1-12
        day         day of month, 1-31
        dow         day of week, Monday=0 ... Sunday=6
        week        ISO-8601 week number (early January can be week 52/53)
        is_weekend  1 for Saturday/Sunday, otherwise 0

    Args:
        df: input DataFrame.
        date_col: name of the column holding the dates.

    Returns:
        A new, date-sorted DataFrame with the six columns above appended.
    """
    out = df.copy()  # work on a copy so the caller's frame stays untouched

    # Unparseable values become NaT, then those rows are discarded.
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out = out.dropna(subset=[date_col])
    out = out.sort_values(date_col)

    # Plain datetime attributes, expressed as (new column, .dt attribute).
    simple_parts = (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("dow", "dayofweek"),
    )
    for column, attribute in simple_parts:
        # Re-read the date column on each step rather than caching the accessor.
        out[column] = getattr(out[date_col].dt, attribute)

    # ISO week number; the frame has no NaT left, so the int cast is safe.
    iso = out[date_col].dt.isocalendar()
    out["week"] = iso["week"].astype(int)

    # Saturday (5) and Sunday (6) count as weekend.
    out["is_weekend"] = out["dow"].ge(5).astype(int)
    return out
def add_lag_features(df, date_col, target_col, group_keys=None, lags=(1, 7, 14), rolls=(7, 14)):
    """Return a copy of ``df`` with lag and rolling statistics of ``target_col``.

    For every group (or the whole frame when no usable group key is given) the
    rows are sorted by ``date_col`` and the following columns are added:

        lag{l}    target value ``l`` rows earlier, for each ``l`` in ``lags``
        rmean{w}  mean of the ``w`` rows *preceding* the current row
        rstd{w}   standard deviation of those same ``w`` preceding rows

    Notes:
        * Lags and windows are counted in rows, not calendar days. ``lag7``
          means "seven days ago" only when each group has exactly one row per
          consecutive day.
        * Rolling statistics are computed on the target shifted by one row, so
          a row's own target never contributes to its own features. Without
          that shift the features would leak the value being predicted.
        * ``min_periods`` is ``max(2, w // 2)`` capped at ``w``, so early rows
          still get a value once roughly half a window of history exists and a
          window of 1 does not raise.
        * Names in ``group_keys`` that are not columns of ``df`` are ignored.
          pandas' groupby skips rows whose group key is NaN, so such rows are
          absent from the result.

    Args:
        df: input DataFrame (not modified).
        date_col: name of the date column used for ordering.
        target_col: name of the numeric column to derive features from.
        group_keys: optional list of column names; history is kept separate
            per combination of these columns.
        lags: iterable of row offsets for the ``lag*`` columns.
        rolls: iterable of window sizes for the ``rmean*`` / ``rstd*`` columns.

    Returns:
        A new DataFrame, sorted by ``date_col``, with the feature columns
        appended. An empty frame (with the feature columns present) is
        returned when there are no rows to process.
    """
    df = df.copy()

    # Keep only the group keys that actually exist in the frame.
    group_keys = [c for c in (group_keys or []) if c in df.columns]

    if group_keys:
        # group_keys=False: do not move the keys into the index.
        parts = [part for _, part in df.groupby(group_keys, group_keys=False)]
    else:
        parts = [df]

    # No groups at all (empty frame, or every key is NaN): process an empty
    # slice so the feature columns still exist and pd.concat has input.
    if not parts:
        parts = [df.iloc[0:0]]

    out = []
    for part in parts:
        part = part.sort_values(date_col).copy()
        target = part[target_col]

        # (1) Lag columns: the value `lag` rows back within this group.
        for lag in lags:
            part[f"lag{lag}"] = target.shift(lag)

        # (2) Rolling statistics over strictly earlier rows only.
        history = target.shift(1)
        for w in rolls:
            window = history.rolling(w, min_periods=min(w, max(2, w // 2)))
            part[f"rmean{w}"] = window.mean()
            part[f"rstd{w}"] = window.std()

        out.append(part)

    # Stack the per-group frames and restore overall date order.
    return pd.concat(out, axis=0).sort_values(date_col)
def make_matrix(df, mapping):
    """Build the model input matrix ``X`` and target vector ``y`` from ``df``.

    Steps:
        1. Read the column names from ``mapping``.
        2. Coerce the target to numeric; unparseable values become 0.
        3. Cast the optional region/brand/item columns to ``str``.
        4. Add calendar features via ``add_time_features``.
        5. Add lag / rolling features via ``add_lag_features``, grouped by the
           mapped category columns.
        6. Drop rows where any generated lag/rolling feature is NaN (the
           leading rows of each group, which lack enough history).
        7. Assemble numeric features and one-hot encoded category columns.

    Args:
        df: input DataFrame (not modified).
        mapping: dict with the keys ``'date'`` and ``'target'`` (required) and
            ``'region'``, ``'brand'``, ``'item'`` (optional), each naming a
            column of ``df``.

    Returns:
        A tuple ``(df, X, y, feat_names)``:
            df          the feature-augmented frame after row filtering
            X           2-D array: numeric features, then one-hot columns
            y           1-D array of target values aligned with ``X``
            feat_names  column names of ``X`` in order

    Raises:
        ValueError: if ``mapping`` lacks a non-empty ``'date'`` or ``'target'``.
    """
    df = df.copy()

    # Column names supplied by the caller.
    date_col = mapping.get("date")
    target_col = mapping.get("target")
    region_col = mapping.get("region")
    brand_col = mapping.get("brand")
    item_col = mapping.get("item")

    # Date and target are mandatory. The message is restored to readable
    # Korean ("date/target column mapping is required").
    if not date_col or not target_col:
        raise ValueError("date/target 컬럼 매핑이 필요합니다.")

    # Optional category columns that were actually mapped (order preserved).
    mapped_categories = [c for c in (region_col, brand_col, item_col) if c]

    # --- (1) dtype clean-up ---
    # Non-numeric target values are coerced to NaN and then filled with 0.
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce").fillna(0)

    # Categories are normalised to str so one-hot encoding treats them uniformly.
    for col in mapped_categories:
        if col in df:
            df[col] = df[col].astype(str)

    # --- (2) calendar features ---
    df = add_time_features(df, date_col)

    # --- (3) lag / rolling features (per category group) ---
    df = add_lag_features(df, date_col, target_col, mapped_categories)

    # --- (4) drop rows lacking history ---
    # Match only generated names (prefix followed by digits, e.g. "lag7",
    # "rmean14"). A bare startswith() would also catch unrelated user columns
    # such as "lagoon" and would crash on non-string column labels.
    prefixes = ("lag", "rmean", "rstd")
    drop_cols = [
        c for c in df.columns
        if isinstance(c, str)
        and any(c.startswith(p) and c[len(p):].isdigit() for p in prefixes)
    ]
    df = df.dropna(subset=drop_cols)

    # --- (5) numeric feature block ---
    num_cols = ["year", "month", "day", "dow", "week", "is_weekend"] + drop_cols
    num_cols = [c for c in num_cols if c in df.columns]  # skip anything absent
    X_num = df[num_cols].values
    feat_names = list(num_cols)

    # --- (6) one-hot encode the category columns ---
    cat_cols = [c for c in mapped_categories if c in df.columns]
    if cat_cols:
        dummies = pd.get_dummies(df[cat_cols].astype(str), dummy_na=False)
        # Append the indicator columns to the right of the numeric block.
        X = np.hstack([X_num, dummies.values])
        feat_names += list(dummies.columns)
    else:
        X = X_num

    # --- (7) target vector ---
    y = df[target_col].values
    return df, X, y, feat_names