File size: 8,908 Bytes
5841e58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import pandas as pd, numpy as np

def add_time_features(df, date_col):
    """
    [What it does]
    - Parses the column `date_col` as datetimes and derives calendar columns
      from it: year, month, day, dow, week and is_weekend.

    [Why]
    - A raw date value is hard for a model to use directly; separate numeric
      calendar fields expose seasonal and day-of-week patterns.

    [Inputs]
    - df: source table (DataFrame); it is copied, never modified in place.
    - date_col: name of the column holding the dates (e.g. 'date').

    [Output]
    - A new DataFrame sorted by `date_col`, without the rows whose date could
      not be parsed, and with these columns added:
        year, month, day : calendar fields
        dow              : day of week, Monday=0 ... Sunday=6
        week             : ISO week number (early-January dates can belong
                           to week 52/53 of the previous ISO year)
        is_weekend       : 1 when dow >= 5 (Saturday/Sunday), otherwise 0
    """
    out = df.copy()

    # Unparseable values become NaT instead of raising.
    parsed = pd.to_datetime(out[date_col], errors="coerce")
    out[date_col] = parsed

    # Rows without a usable date cannot get calendar features: drop them,
    # then order the remaining rows chronologically.
    out = out.dropna(subset=[date_col])
    out = out.sort_values(by=date_col)

    # (output column, attribute of the .dt accessor) pairs.
    simple_fields = (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("dow", "dayofweek"),
    )
    for col_name, accessor_attr in simple_fields:
        out[col_name] = getattr(out[date_col].dt, accessor_attr)

    # isocalendar() yields a nullable unsigned dtype; cast to a plain int.
    iso_week = out[date_col].dt.isocalendar().week
    out["week"] = iso_week.astype(int)

    # Saturday (5) and Sunday (6) are flagged as weekend.
    weekend_mask = out["dow"] >= 5
    out["is_weekend"] = weekend_mask.astype(int)

    return out


def add_lag_features(df, date_col, target_col, group_keys=None, lags=(1,7,14), rolls=(7,14), *, roll_shift=0):
    """
    [What it does]
    - Adds lagged copies of the target (lag{l}) plus rolling mean / rolling
      standard deviation columns (rmean{w}, rstd{w}), computed separately for
      each group when group columns are given.

    [Why]
    - Recent history of the target is usually a strong hint for its next value:
      lag7 is the value 7 rows earlier, rmean7 the level over the last 7 rows,
      rstd7 the variability over the last 7 rows.
    - Lags and windows are counted in ROWS of the date-sorted group, not in
      calendar days; they equal "days" only when every group has exactly one
      row per day with no gaps.

    [Inputs]
    - df: table (DataFrame); it is copied, never modified in place.
    - date_col: name of the date column used for ordering.
    - target_col: name of the numeric column to lag / aggregate.
    - group_keys: column name or list of column names to group by (optional).
      Names that are not columns of `df` are ignored; if none remain, the
      whole table is treated as a single series. Rows whose group key is NaN
      are left out by pandas' groupby.
    - lags: row offsets for the lag columns (default 1, 7, 14).
    - rolls: rolling window sizes (default 7, 14). A window produces a value
      once it holds at least max(2, w // 2) observations (capped at w).
    - roll_shift (keyword-only): number of rows the target is shifted before
      the rolling statistics are computed. With the default 0 every window
      ends at, and includes, the current row. Pass 1 to use only earlier rows,
      which avoids leaking the current target into its own features.

    [Output]
    - A new DataFrame sorted by `date_col` with the lag/rmean/rstd columns
      added. When there is nothing to process (empty table, or no group left),
      an empty table that still carries the feature columns is returned.
    """
    df = df.copy()

    # Accept a single column name as well as a list of names.
    if isinstance(group_keys, str):
        group_keys = [group_keys]

    # Keep only the group columns that really exist in the table.
    group_keys = [c for c in (group_keys or []) if c in df.columns]

    if group_keys:
        parts = [part for _, part in df.groupby(group_keys, group_keys=False)]
    else:
        parts = [df]  # no grouping: the whole table is one series

    # groupby yields nothing for an empty table (or when every key is NaN).
    # pd.concat([]) would raise, so process one empty slice instead; the result
    # then still has the expected feature columns.
    if not parts:
        parts = [df.iloc[:0]]

    out = []
    for part in parts:
        part = part.sort_values(date_col).copy()
        target = part[target_col]

        # (1) lag columns: the row at position i receives the value from i - l.
        for l in lags:
            part[f"lag{l}"] = target.shift(l)

        # (2) rolling statistics, optionally on a shifted target so that the
        # current row is excluded from its own window.
        base = target.shift(roll_shift) if roll_shift else target
        for w in rolls:
            # Capped at w: pandas rejects min_periods larger than the window,
            # which the uncapped formula produces for w == 1.
            min_obs = min(w, max(2, w // 2))
            window = base.rolling(w, min_periods=min_obs)
            part[f"rmean{w}"] = window.mean()
            part[f"rstd{w}"] = window.std()

        out.append(part)

    # Stack the per-group tables and restore chronological order.
    return pd.concat(out, axis=0).sort_values(date_col)


def make_matrix(df, mapping):
    """
    [What it does]
    - Builds the model input X and the label vector y from a raw table:
      1) reads the column names from `mapping`,
      2) adds calendar features (add_time_features) and lag / rolling features
         (add_lag_features, grouped by the optional region/brand/item columns),
      3) drops the rows whose lag / rolling features are still NaN,
      4) one-hot encodes the optional region/brand/item columns and appends
         them to the right of the numeric features,
      5) uses the target column as y.

    [Inputs]
    - df: raw table (DataFrame); it is copied, never modified in place.
    - mapping: {'date':..., 'target':..., 'region':..., 'brand':..., 'item':...}
               'date' and 'target' are required; the other three are optional.

    [Output]
    - df: the table with all features attached, after the NaN rows were removed
    - X: 2-D feature array (numeric features, then the one-hot columns)
    - y: 1-D array with the target values
    - feat_names: names of the columns of X, in order

    [Raises]
    - ValueError when the 'date' or 'target' mapping is missing or empty.
    """
    frame = df.copy()

    date_col = mapping.get("date")
    target_col = mapping.get("target")
    optional_cols = [mapping.get("region"), mapping.get("brand"), mapping.get("item")]

    # Both required names must be present and non-empty.
    if not (date_col and target_col):
        raise ValueError("date/target ์ปฌ๋Ÿผ ๋งคํ•‘์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.")

    # --- (1) type clean-up ---
    # Non-numeric target entries turn into NaN and are then replaced by 0.
    numeric_target = pd.to_numeric(frame[target_col], errors="coerce")
    frame[target_col] = numeric_target.fillna(0)

    # Categorical columns are normalised to strings before they are used as
    # group keys and one-hot encoded.
    for col in optional_cols:
        if col and col in frame:
            frame[col] = frame[col].astype(str)

    # --- (2) calendar features ---
    frame = add_time_features(frame, date_col)

    # --- (3) lag / rolling features ---
    # Unset mapping entries are skipped here; add_lag_features itself ignores
    # names that are not columns of the table.
    requested_groups = [col for col in optional_cols if col]
    frame = add_lag_features(frame, date_col, target_col, requested_groups)

    # --- (4) remove rows whose history features are incomplete ---
    # Selection is by name prefix, so a pre-existing column whose name starts
    # with one of these prefixes is picked up as well.
    history_cols = [
        col for col in frame.columns
        if col.startswith(("lag", "rmean", "rstd"))
    ]
    frame = frame.dropna(subset=history_cols)

    # --- (5) numeric feature block ---
    calendar_cols = ["year", "month", "day", "dow", "week", "is_weekend"]
    num_cols = [col for col in calendar_cols + history_cols if col in frame.columns]
    X_num = frame[num_cols].values
    feat_names = [*num_cols]

    # --- (6) one-hot encode the categorical columns, if any ---
    X = X_num
    cat_cols = [col for col in optional_cols if col and col in frame.columns]
    if cat_cols:
        dummies = pd.get_dummies(frame[cat_cols].astype(str), dummy_na=False)
        X = np.concatenate((X_num, dummies.values), axis=1)
        feat_names.extend(dummies.columns)

    # --- (7) labels ---
    y = frame[target_col].values

    return frame, X, y, feat_names