leedami committed on
Commit
5841e58
·
verified ·
1 Parent(s): 5a821d4

Upload 7 files

Browse files
Files changed (7) hide show
  1. app.py +964 -0
  2. preprocess.py +188 -0
  3. quick_train_runner.py +116 -0
  4. requirements.txt +10 -3
  5. starter.ipynb +1 -0
  6. train_core.py +438 -0
  7. utils_io.py +203 -0
app.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # ============================================================
5
+ # ํŽธ์˜์  ์ˆ˜์š”์˜ˆ์ธก & ๋ฐœ์ฃผ ์ถ”์ฒœ โ€” Pro Suite (ํŒจ์น˜ ๋ฒ„์ „, ๋ฉ€ํ‹ฐ CSV + ์›”๋ณ„ ๊ทธ๋ž˜ํ”„)
6
+ # - โ‘  ์—ฌ๋Ÿฌ CSV ์—…๋กœ๋“œ/์„ ํƒ โ†’ ์ž๋™ ๊ฒฐํ•ฉ(์˜ต์…˜: source ์—ด ์ถ”๊ฐ€)
7
+ # - โ‘ก ์ปฌ๋Ÿผ ๋งคํ•‘: "์ปฌ๋Ÿผ๋ช…"์ด ์•„๋‹ˆ๋ผ "์˜ˆ์‹œ ๊ฐ’" ๊ธฐ๋ฐ˜ ์„ ํƒ
8
+ # - โ‘ข ์˜ˆ์ธกยท๋ฐœ์ฃผ: ์žฌ๊ณ  ์ปฌ๋Ÿผ ์ž๋™ ์ธ์‹ โ†’ ์˜ˆ์ธก ๊ธฐ๊ฐ„/๋ฐœ์ฃผ๋Ÿ‰ ์ž๋™ ๊ณ„์‚ฐ
9
+ # ยท ๋ฆฌ๋“œํƒ€์ž„ / ์„œ๋น„์Šค๋ ˆ๋ฒจ / ์•ˆ์ „์žฌ๊ณ  / MOQ / ํŒฉ๋‹จ์œ„ ์ž…๋ ฅ ์ œ๊ฑฐ
10
+ # - โ‘ฃ ๋ถ„์„(๊ทธ๋ž˜ํ”„):
11
+ # ยท ์šฐ์‚ฐ: ์›”๋ณ„ ๊ฐ•์ˆ˜๋Ÿ‰ โ†” ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰ (์‚ฐ์ ๋„ + ํšŒ๊ท€์„  + ์ผ๋ณ„ ์„ ํ˜• ๊ทธ๋ž˜ํ”„)
12
+ # ยท ๊ตฐ๊ณ ๊ตฌ๋งˆ: ์›”๋ณ„ ๊ธฐ์˜จ โ†” ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰ (์‚ฐ์ ๋„ + ํšŒ๊ท€์„  + ์ผ๋ณ„ ์„ ํ˜• ๊ทธ๋ž˜ํ”„)
13
+ # ยท ์ „์ฒด: ์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ ์ œ์™ธ ์ „์ฒด ์ƒํ’ˆ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰ ์„ ํ˜• ๊ทธ๋ž˜ํ”„
14
+ # - ์‚ฌ์ด๋“œ๋ฐ”: ์‹คํ–‰ ํŒŒ์ผ ํ‘œ์‹œ + ์บ์‹œ ์ดˆ๊ธฐํ™”
15
+ # ============================================================
16
+
17
+ import os, io, pickle, time, subprocess, sys
18
+ from datetime import timedelta
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+ import numpy as np
23
+ import streamlit as st
24
+ import altair as alt
25
+
26
+ from utils_io import read_csv_flexible, save_utf8sig, ensure_dirs, auto_map_columns
27
+ from preprocess import make_matrix
28
+ from train_core import train_and_score, save_artifacts
29
+
30
+ # Altair ๋Œ€์šฉ๋Ÿ‰ ๋ Œ๋”๋ง ์•ˆ์ „์žฅ์น˜ (ํ–‰ ์ˆ˜ ์ œํ•œ ํ•ด์ œ)
31
+ alt.data_transformers.disable_max_rows()
32
+
33
+ # ------------------------------------------------------------
34
+ # ํŽ˜์ด์ง€/์‚ฌ์ด๋“œ๋ฐ”
35
+ # ------------------------------------------------------------
36
+ st.set_page_config(page_title="ํŽธ์˜์  ์ˆ˜์š”์˜ˆ์ธก & ๋ฐœ์ฃผ ์ถ”์ฒœ โ€” Pro Suite (ํŒจ์น˜)", layout="wide")
37
+
38
+ # __file__ ์ด ์—†๋Š” Colab ๊ฐ™์€ ํ™˜๊ฒฝ ๋ฐฉ์–ด์šฉ
39
try:
    script_name = Path(__file__).resolve().name
except NameError:
    script_name = "app_streamlit_pro.py"

# Sidebar: show which script is running and offer a cache reset + rerun.
st.sidebar.write("๐Ÿงญ ์‹คํ–‰ ํŒŒ์ผ:", script_name)
if st.sidebar.button("์บ์‹œ ์ดˆ๊ธฐํ™” ํ›„ ๋‹ค์‹œ ์‹คํ–‰"):
    # Clearing is deliberately best-effort: a missing cache API or a failed
    # clear must not prevent the rerun below.
    for cache_attr in ("cache_data", "cache_resource"):
        try:
            getattr(st, cache_attr).clear()
        except Exception:
            pass
    # st.experimental_rerun was deprecated in favour of st.rerun and is gone
    # from newer Streamlit releases; fall back to it only on old versions.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()
55
+
56
+ # ------------------------------------------------------------
57
+ # ๊ธฐ๋ณธ ํ™˜๊ฒฝ/๊ฒฝ๋กœ ์„ค์ •
58
+ # ------------------------------------------------------------
59
# Project layout: every folder is resolved relative to the current working
# directory, which is treated as the app root.
PROJ = os.getcwd()
DATA_DIR = os.path.join(PROJ, "data")  # CSV data folder
ARTI_DIR = os.path.join(PROJ, "artifacts")  # training by-products (logs, scores, ...)
MODELS_DIR = os.path.join(PROJ, "models")  # pickled trained models
ensure_dirs(DATA_DIR, ARTI_DIR, MODELS_DIR)  # create the folders if missing
64
+
65
+ # ------------------------------------------------------------
66
+ # ์œ ํ‹ธ: data ํด๋”์˜ CSV ํŒŒ์ผ ๋ฆฌ์ŠคํŠธ ์บ์‹œ
67
+ # ------------------------------------------------------------
68
@st.cache_data(show_spinner=False)
def list_data_files():
    """Return the names of the CSV files found directly inside ``DATA_DIR``.

    The result is cached by Streamlit. The ``.csv`` suffix is matched
    case-insensitively, and the names are returned sorted so the UI order is
    deterministic (``os.listdir`` gives no ordering guarantee).

    Returns:
        list[str]: File names (not full paths); an empty list when the
        folder does not exist.
    """
    try:
        entries = os.listdir(DATA_DIR)
    except FileNotFoundError:
        return []
    return sorted(name for name in entries if name.lower().endswith(".csv"))
74
+
75
+ # ------------------------------------------------------------
76
+ # ํผ๋ธ”๋ฆญ URL: cloudflared ์‹œ์ž‘ ํ•จ์ˆ˜
77
+ # ------------------------------------------------------------
78
def start_cloudflared(port=8501):
    """Start a ``cloudflared`` tunnel to the local app and show its log output.

    Runs ``cloudflared tunnel --url http://localhost:<port>``, stores the
    process handle in ``st.session_state["_cfd_proc"]`` and echoes up to 120
    lines of its output inside an expander. Reading stops early at the first
    line containing ``trycloudflare.com`` (the line with the public URL) or
    when the process output ends.

    A tunnel process left running by an earlier call in the same session is
    terminated first, so repeated calls do not accumulate processes.

    Args:
        port: Local port the tunnel forwards to.
    """
    previous = st.session_state.get("_cfd_proc")
    if previous is not None and previous.poll() is None:
        # Best-effort cleanup; a failure here must not block the new tunnel.
        try:
            previous.terminate()
        except OSError:
            pass

    try:
        proc = subprocess.Popen(
            ["cloudflared", "tunnel", "--url", f"http://localhost:{port}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
    except FileNotFoundError:
        # The cloudflared executable is not on PATH.
        st.error("cloudflared ๋ฐ”์ด๋„ˆ๋ฆฌ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. `pip install cloudflared` ๋˜๋Š” ๋ฐ”์ด๋„ˆ๋ฆฌ ์„ค์น˜ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•˜์„ธ์š”.")
        return

    st.session_state["_cfd_proc"] = proc  # kept so the process can be stopped later
    with st.expander("cloudflared logs"):
        for _ in range(120):  # only the first ~120 lines are displayed
            line = proc.stdout.readline()
            if not line:
                break
            st.text(line.strip())
            if "trycloudflare.com" in line:
                st.success(line.strip())  # log line that carries the public URL
                break
96
+
97
+ # ------------------------------------------------------------
98
+ # ํผ๋ธ”๋ฆญ URL: ngrok ์‹œ์ž‘ ํ•จ์ˆ˜
99
+ # ------------------------------------------------------------
100
def start_ngrok(port=8501, token: str | None = None):
    """Open an ngrok HTTP tunnel to the local app and display its public URL.

    Any existing ngrok session is killed first to avoid clashes on re-run.
    The auth token comes from ``token`` or, when that is empty, from the
    ``NGROK_AUTHTOKEN`` environment variable; a warning is shown when neither
    is set. The connection is attempted twice (1.5 s pause in between) and the
    second failure is reported with a message chosen from the error text.

    On success the tunnel object is stored in
    ``st.session_state["_ngrok_tunnel"]``.

    Args:
        port: Local port the tunnel forwards to.
        token: Optional ngrok auth token.
    """
    try:
        from pyngrok import ngrok, conf
    except ImportError:
        st.error("pyngrok๊ฐ€ ์„ค์น˜๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค. `pip install pyngrok` ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•˜์„ธ์š”.")
        return

    # Tear down a previous ngrok session (best-effort) to avoid conflicts.
    try:
        ngrok.kill()
        time.sleep(1.0)
    except Exception:
        pass

    token = (token or os.environ.get("NGROK_AUTHTOKEN", "")).strip()
    if token:
        conf.get_default().auth_token = token
    else:
        st.warning("NGROK_AUTHTOKEN์ด ๋น„์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ์ธ์ฆ ์—†์ด ์—ด๋ฉด ์ œํ•œ/์—๋Ÿฌ(4018) ๊ฐ€๋Šฅ.")

    for attempt in range(2):
        try:
            tunnel = ngrok.connect(addr=f"http://localhost:{port}", proto="http")
        except Exception as e:
            if attempt == 0:
                # First failure: wait briefly and retry once.
                time.sleep(1.5)
                continue
            msg = str(e)
            if "4018" in msg:
                st.error("ngrok ์ธ์ฆ ์‹คํŒจ(4018). ํ† ํฐ์„ ๋‹ค์‹œ ํ™•์ธํ•˜์„ธ์š”.")
            elif "already online" in msg or "334" in msg:
                st.error("๋™์ผ ์—”๋“œํฌ์ธํŠธ๊ฐ€ ์ด๋ฏธ ์—ด๋ ค ์žˆ์Šต๋‹ˆ๋‹ค. ์„ธ์…˜ ์žฌ์‹œ์ž‘ ๋˜๋Š” ๊ธฐ์กด ํ„ฐ๋„ ์ข…๋ฃŒ ํ›„ ์žฌ์‹œ๋„.")
            else:
                st.error(f"ngrok ์—ฐ๊ฒฐ ์‹คํŒจ: {e}")
        else:
            url = tunnel.public_url
            st.session_state["_ngrok_tunnel"] = tunnel
            st.success(f"๐ŸŒ Public URL: {url}")
            st.caption("๋Ÿฐํƒ€์ž„/ํ”„๋กœ์„ธ์Šค๋ฅผ ์ข…๋ฃŒํ•˜๋ฉด ํ„ฐ๋„๋„ ๋‹ซํž™๋‹ˆ๋‹ค.")
            break
139
+
140
+ # ------------------------------------------------------------
141
+ # ์•ฑ ํƒ€์ดํ‹€/ํƒญ ๊ตฌ์„ฑ
142
+ # ------------------------------------------------------------
143
+ st.title("ํŽธ์˜์  ์ˆ˜์š”์˜ˆ์ธก & ๋ฐœ์ฃผ ์ถ”์ฒœ โ€” Pro Suite")
144
+ tabs = st.tabs(["โ‘  ๋ฐ์ดํ„ฐ", "โ‘ก ํ•™์Šต/๋ชจ๋ธ", "โ‘ข ์˜ˆ์ธกยท๋ฐœ์ฃผ", "โ‘ฃ ๋ถ„์„(๊ทธ๋ž˜ํ”„)", "โ‘ค ์ง„๋‹จ/๋กœ๊ทธ"])
145
+
146
+ # ============================================================
147
+ # โ‘  ๋ฐ์ดํ„ฐ: CSV ์—…๋กœ๋“œ/์„ ํƒ + ์ž๋™ ์ปฌ๋Ÿผ ๋งคํ•‘ ์ €์žฅ (๋ฉ€ํ‹ฐ CSV ์ง€์›)
148
+ # ============================================================
149
+ with tabs[0]:
150
+ st.subheader("CSV ์—…๋กœ๋“œ ๋˜๋Š” ์„ ํƒ")
151
+ cols_top = st.columns([2,1])
152
+ with cols_top[0]:
153
+ add_source = st.checkbox("ํŒŒ์ผ๋ช…(source) ์—ด ์ถ”๊ฐ€", value=True, help="์—ฌ๋Ÿฌ CSV๋ฅผ ํ•ฉ์น  ๋•Œ ์›๋ณธ ํŒŒ์ผ๋ช…์„ ๋‚จ๊น๋‹ˆ๋‹ค.")
154
+ with cols_top[1]:
155
+ st.caption("โ€ป ์—…๋กœ๋“œ/์„ ํƒ ํ›„ ์•„๋ž˜์—์„œ ์ปฌ๋Ÿผ ๋งคํ•‘ ์ €์žฅ")
156
+
157
+ cols = st.columns(2)
158
+
159
+ # --- ๋‹ค์ค‘ ํŒŒ์ผ ์—…๋กœ๋“œ ---
160
+ with cols[0]:
161
+ up_multi = st.file_uploader("CSV ํŒŒ์ผ ์—…๋กœ๋“œ(์—ฌ๋Ÿฌ ๊ฐœ ๊ฐ€๋Šฅ)", type=["csv"], accept_multiple_files=True, key="multi_up")
162
+ if up_multi:
163
+ dfs = []
164
+ for f in up_multi:
165
+ raw = f.read()
166
+ df_i = read_csv_flexible(io.BytesIO(raw))
167
+ if add_source:
168
+ df_i["source"] = f.name
169
+ dfs.append(df_i)
170
+ # data/์— ์ €์žฅ
171
+ save_path = os.path.join(DATA_DIR, f.name)
172
+ try:
173
+ with open(save_path, "wb") as fp:
174
+ fp.write(raw)
175
+ except Exception as e:
176
+ st.warning(f"ํŒŒ์ผ ์ €์žฅ ๊ฒฝ๊ณ ({f.name}): {e}")
177
+ try:
178
+ list_data_files.clear() # ์บ์‹œ ๋ฌดํšจํ™”
179
+ except Exception:
180
+ pass
181
+ df = pd.concat(dfs, axis=0, ignore_index=True, sort=True)
182
+ st.session_state["df"] = df
183
+ st.success(f"์—…๋กœ๋“œ/๊ฒฐํ•ฉ ์™„๋ฃŒ: {df.shape} (ํŒŒ์ผ {len(dfs)}๊ฐœ)")
184
+ st.dataframe(df.head(20), use_container_width=True)
185
+
186
+ # --- data ํด๋”์—์„œ ๋‹ค์ค‘ ์„ ํƒ ---
187
+ with cols[1]:
188
+ files = list_data_files()
189
+ picks = st.multiselect("data ํด๋”์—์„œ ์„ ํƒ(์—ฌ๋Ÿฌ ๊ฐœ)", files)
190
+ if st.button("์„ ํƒ ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ", disabled=(len(picks)==0)):
191
+ dfs = []
192
+ for name in picks:
193
+ path = os.path.join(DATA_DIR, name)
194
+ df_i = read_csv_flexible(path)
195
+ if add_source:
196
+ df_i["source"] = name
197
+ dfs.append(df_i)
198
+ df = pd.concat(dfs, axis=0, ignore_index=True, sort=True)
199
+ st.session_state["df"] = df
200
+ st.success(f"๋ถˆ๋Ÿฌ์˜ค๊ธฐ/๊ฒฐํ•ฉ ์™„๋ฃŒ: {df.shape} (ํŒŒ์ผ {len(dfs)}๊ฐœ)")
201
+ st.dataframe(df.head(20), use_container_width=True)
202
+
203
+ # --- ์ž๋™ ์ปฌ๋Ÿผ ๋งคํ•‘ + ๋ณด์ • ---
204
+ if "df" in st.session_state:
205
+ st.divider()
206
+ st.caption("์ž๋™ ์ปฌ๋Ÿผ ๋งคํ•‘ โ€” ์„ ํƒ ์—†์ด ์ž๋™ ์ ์šฉ๋ฉ๋‹ˆ๋‹ค.")
207
+
208
+ df = st.session_state["df"]
209
+
210
+ # auto_map_columns ๊ฒฐ๊ณผ ์‚ฌ์šฉ
211
+ auto = auto_map_columns(df)
212
+ mapping = {
213
+ "date": auto.get("date"),
214
+ "target": auto.get("target"),
215
+ "region": auto.get("region"),
216
+ "brand": auto.get("brand"),
217
+ "item": auto.get("item"),
218
+ }
219
+ st.session_state["mapping"] = mapping
220
+
221
+ # โ˜… data ํด๋”์šฉ ๋ณด์ •:
222
+ # seoul_gyeonggi_with_demand.csv / usan.csv / gungoguma.csv ๋Š”
223
+ # auto_map_columns๊ฐ€ ํƒ€๊นƒ์„ '๊ฐ•์ˆ˜๋Ÿ‰'์œผ๋กœ ์žก๋Š” ์ผ€์ด์Šค๊ฐ€ ์žˆ์–ด์„œ,
224
+ # '์ผ์ผํŒ๋งค๋Ÿ‰' ์ปฌ๋Ÿผ์ด ์žˆ์œผ๋ฉด ๊ทธ๊ฑธ target์œผ๋กœ ๊ฐ•์ œ ๊ต์ฒด
225
+ if mapping.get("target") == "๊ฐ•์ˆ˜๋Ÿ‰" and "์ผ์ผํŒ๋งค๋Ÿ‰" in df.columns:
226
+ mapping["target"] = "์ผ์ผํŒ๋งค๋Ÿ‰"
227
+
228
+ # ํ™•์ธ์šฉ์œผ๋กœ๋งŒ ์ฝ๊ธฐ ์ „์šฉ ํ…Œ์ด๋ธ” ํ‘œ์‹œ
229
+ mapping_view = pd.DataFrame(
230
+ {
231
+ "์—ญํ• ": ["๋‚ ์งœ(date)", "์ˆ˜์š”/ํŒ๋งค๋Ÿ‰(target)", "์ง€์—ญ/์ ํฌ(region)", "๋ธŒ๋žœ๋“œ(์„ ํƒ)", "์ƒํ’ˆ/ํ’ˆ๋ชฉ(์„ ํƒ)"],
232
+ "์ปฌ๋Ÿผ": [
233
+ mapping.get("date"),
234
+ mapping.get("target"),
235
+ mapping.get("region"),
236
+ mapping.get("brand"),
237
+ mapping.get("item"),
238
+ ],
239
+ }
240
+ )
241
+
242
+ st.write("ํ˜„์žฌ ์ž๋™ ๋งคํ•‘ ๊ฒฐ๊ณผ:")
243
+ st.dataframe(mapping_view, use_container_width=True)
244
+
245
+ # ============================================================
246
+ # โ‘ก ํ•™์Šต/๋ชจ๋ธ
247
+ # ============================================================
248
+ with tabs[1]:
249
+ st.subheader("๋ชจ๋ธ ํ•™์Šต")
250
+
251
+ use_optuna = st.checkbox("Optuna ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹ ์‚ฌ์šฉ", value=False)
252
+ trials = st.slider("Optuna ์‹œ๋„ ํšŸ์ˆ˜", 5, 60, 15, 5)
253
+
254
+ if "df" not in st.session_state or "mapping" not in st.session_state:
255
+ st.info("๋จผ์ € โ‘  ํƒญ์—์„œ ๋ฐ์ดํ„ฐ์™€ ์ปฌ๋Ÿผ ๋งคํ•‘์„ ์ง€์ •ํ•˜์„ธ์š”.")
256
+ else:
257
+ v = st.slider("๊ฒ€์ฆ ๋น„์œจ(valid_ratio)", 0.05, 0.4, 0.2, 0.05)
258
+
259
+ if st.button("ํ•™์Šต ์‹œ์ž‘"):
260
+ # โžœ ์—ฌ๊ธฐ์„œ ์˜ˆ์™ธ๊ฐ€ ๋‚˜๋„ ์•ฑ์ด ์ฃฝ์ง€ ์•Š๋„๋ก ๋ฐฉ์–ด
261
+ try:
262
+ df, X, y, feat_names = make_matrix(
263
+ st.session_state["df"],
264
+ st.session_state["mapping"],
265
+ )
266
+ except Exception as e:
267
+ st.error(f"ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ ๊ตฌ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}")
268
+ else:
269
+ try:
270
+ best_model, lb = train_and_score(
271
+ X,
272
+ y,
273
+ valid_ratio=v,
274
+ use_optuna=use_optuna,
275
+ optuna_trials=trials,
276
+ )
277
+ save_artifacts(
278
+ [ARTI_DIR, MODELS_DIR],
279
+ best_model,
280
+ feat_names,
281
+ st.session_state["mapping"],
282
+ lb,
283
+ )
284
+ except Exception as e:
285
+ st.error(f"๋ชจ๋ธ ํ•™์Šต/์ €์žฅ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}")
286
+ else:
287
+ st.session_state["leaderboard"] = lb
288
+ st.session_state["feat_names"] = feat_names
289
+ st.success("ํ•™์Šต ์™„๋ฃŒ")
290
+
291
+ if "leaderboard" in st.session_state:
292
+ st.dataframe(st.session_state["leaderboard"], use_container_width=True)
293
+
294
+ # ============================================================
295
+ # โ‘ข ์˜ˆ์ธกยท๋ฐœ์ฃผ: ๋ฐ˜๋ณต(AR) ์˜ˆ์ธก + ์žฌ๊ณ  ๊ธฐ๋ฐ˜ ์ž๋™ ๋ฐœ์ฃผ ๊ณ„์‚ฐ
296
+ # ============================================================
297
+ with tabs[2]:
298
+ st.subheader("์˜ˆ์ธก(๋ฐ˜๋ณต AR) & ๋ฐœ์ฃผ๋Ÿ‰ ์ถ”์ฒœ")
299
+ st.caption("ํ•™์Šต๋œ ๋ชจ๋ธ๋กœ ๋ฏธ๋ž˜ ํ”ผ์ฒ˜๋ฅผ ์ƒ์„ฑํ•˜๊ณ , ์žฌ๊ณ ๋ฅผ ๊ณ ๋ คํ•ด ์ž๋™์œผ๋กœ ๋ฐœ์ฃผ ๊ธฐ๊ฐ„๊ณผ ์ˆ˜๋Ÿ‰์„ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค.")
300
+
301
+ if "df" not in st.session_state or "mapping" not in st.session_state:
302
+ st.info("๋จผ์ € โ‘  ํƒญ์—์„œ ๋ฐ์ดํ„ฐ์™€ ์ปฌ๋Ÿผ ๋งคํ•‘์„ ์ง€์ •ํ•˜๊ณ  โ‘ก์—์„œ ํ•™์Šต์„ ์™„๋ฃŒํ•˜์„ธ์š”.")
303
+ else:
304
+ horizon_days = 14 # ๊ณ ์ • ๊ธฐ๊ฐ„
305
+
306
+ # ์ •ํ™•๋„(๋ณด์ • ๊ณ„์ˆ˜)
307
+ accuracy = st.slider(
308
+ "์ •ํ™•๋„(์˜ˆ์ธก ๋ณด์ • ๊ณ„์ˆ˜)",
309
+ min_value=0.5,
310
+ max_value=2.0,
311
+ value=1.0,
312
+ step=0.05,
313
+ )
314
+
315
+ # ==============================
316
+ # ์„ธ๊ทธ๋จผํŠธ ์„ ํƒ
317
+ # ==============================
318
+ seg_cols = [
319
+ c for c in [
320
+ st.session_state["mapping"].get("region"),
321
+ st.session_state["mapping"].get("brand"),
322
+ st.session_state["mapping"].get("item"),
323
+ ] if c
324
+ ]
325
+ seg_vals = {}
326
+ if seg_cols:
327
+ col_objs = st.columns(len(seg_cols))
328
+ for i, ccol in enumerate(seg_cols):
329
+ opts = ["<์ „์ฒด>"] + sorted(
330
+ list(map(str, st.session_state["df"][ccol].dropna().astype(str).unique()))
331
+ )
332
+ seg_vals[ccol] = col_objs[i].selectbox(f"{ccol} ์„ ํƒ", opts, index=0)
333
+
334
+ # ==============================
335
+ # ๋ฐ˜๋ณต ์˜ˆ์ธก ํ•จ์ˆ˜
336
+ # ==============================
337
def iterative_forecast(df, mapping, model, feat_names, horizon, seg_vals):
    """Forecast the target one day at a time, feeding predictions back as history.

    The data is restricted to the selected segment, ordered by date, and its
    target values form the initial history. For each of the ``horizon`` days
    after the last date a feature row is built (calendar fields, lags 1/7/14,
    rolling mean/std over 7 and 14 values), the model predicts one value, and
    that value is appended to the history for the next step.

    Args:
        df: Source data; not modified (a copy is used).
        mapping: Column mapping; ``mapping["date"]`` and ``mapping["target"]``
            are read.
        model: Object whose ``predict(rows)`` returns one value per row.
        feat_names: Ordered feature names expected by the model. Names not
            produced here are filled with ``0.0``.
        horizon: Number of days to forecast (converted with ``int``).
        seg_vals: ``{column: selected value}``; empty values, the "all"
            marker, and columns missing from ``df`` are ignored.

    Returns:
        DataFrame with the date column and the forecast column, one row per
        forecast day. A frame with those two columns and no rows is returned
        (after ``st.error``) when the mapped columns are missing or no rows
        remain after filtering.
    """
    forecast_col = "์˜ˆ์ธก์ˆ˜๋Ÿ‰"
    all_marker = "<์ „์ฒด>"

    df = df.copy()
    dtc = mapping["date"]
    tgt = mapping["target"]

    if dtc not in df.columns or tgt not in df.columns:
        st.error(f"์˜ˆ์ธก์— ํ•„์š”ํ•œ ์ปฌ๋Ÿผ์ด ์—†์Šต๋‹ˆ๋‹ค. (date='{dtc}', target='{tgt}')")
        return pd.DataFrame(columns=[dtc, forecast_col])

    df[dtc] = pd.to_datetime(df[dtc], errors="coerce")
    df = df.dropna(subset=[dtc]).sort_values(dtc)

    for seg_col, seg_val in seg_vals.items():
        if seg_val and seg_val != all_marker and seg_col in df.columns:
            df = df[df[seg_col].astype(str) == str(seg_val)]

    if df.empty:
        st.error("์„ ํƒํ•œ ์„ธ๊ทธ๋จผํŠธ์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
        return pd.DataFrame(columns=[dtc, forecast_col])

    if len(df) < 30:
        st.warning("ํ•ด๋‹น ์„ธ๊ทธ๋จผํŠธ ๋ฐ์ดํ„ฐ๊ฐ€ ์ ์–ด ์˜ˆ์ธก ํ’ˆ์งˆ์ด ๋‚ฎ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")

    # NOTE(review): history is the raw row sequence, not a per-day aggregate,
    # so with several rows per date the lags are row-based. Confirm this
    # matches how the training features were built.
    hist = pd.to_numeric(df[tgt], errors="coerce").fillna(0).astype(float).tolist()

    def lag(values, k):
        """k-th most recent value; mean of the last (up to 7) values if too short."""
        if len(values) >= k:
            return float(values[-k])
        return float(np.mean(values[-min(len(values), 7):])) if values else 0.0

    def rolling_mean(values, w):
        """Mean of the last ``w`` values (of all values when fewer exist)."""
        window = values[-w:]
        return float(np.mean(window)) if window else 0.0

    def rolling_std(values, w):
        """Population std of the last ``w`` values; 0.0 with fewer than two values."""
        return float(np.std(values[-w:])) if len(values) >= 2 else 0.0

    def build_row_features(current_date, hist_vals):
        """Build the feature vector for ``current_date`` in ``feat_names`` order."""
        dow = current_date.weekday()
        try:
            week = int(pd.Timestamp(current_date).isocalendar().week)
        except Exception:
            # Best-effort: fall back to 0 when the ISO week is unavailable.
            week = 0

        feats = {
            "year": current_date.year,
            "month": current_date.month,
            "day": current_date.day,
            "dow": dow,
            "week": week,
            "is_weekend": 1 if dow >= 5 else 0,
            "lag1": lag(hist_vals, 1),
            "lag7": lag(hist_vals, 7),
            "lag14": lag(hist_vals, 14),
            "rmean7": rolling_mean(hist_vals, 7),
            "rmean14": rolling_mean(hist_vals, 14),
            "rstd7": rolling_std(hist_vals, 7),
            "rstd14": rolling_std(hist_vals, 14),
        }
        # Features the model expects but that are not computed here become 0.0.
        return np.array([feats.get(fn, 0.0) for fn in feat_names], dtype=float)

    preds, dates = [], []
    cur = df[dtc].max()
    for _ in range(int(horizon)):
        cur = cur + timedelta(days=1)
        val = float(model.predict([build_row_features(cur, hist)])[0])
        preds.append(val)
        dates.append(cur)
        hist.append(val)  # the prediction becomes history for the next step

    return pd.DataFrame({dtc: dates, forecast_col: preds})
437
+
438
+ # ==============================
439
+ # ์žฌ๊ณ  ์ž๋™ ์ธ์‹
440
+ # ==============================
441
def guess_inventory_onhand(df_seg: pd.DataFrame, mapping):
    """Find an inventory-like column and return its last numeric value.

    The first column whose lower-cased name contains one of the known
    inventory keywords is used. Labels are converted with ``str`` first so
    non-string column labels (e.g. integers) do not raise.

    Args:
        df_seg: Segment data; the caller is expected to have ordered it so
            the last row is the most recent one.
        mapping: Column mapping (accepted for interface compatibility; not
            used here).

    Returns:
        ``(column, value)`` with ``value`` as ``float``, or ``(None, None)``
        when no column matches or the matching column has no numeric values.
    """
    candidates = (
        "์žฌ๊ณ ", "์žฌ๊ณ ์ˆ˜", "์žฌ๊ณ ์ˆ˜๋Ÿ‰",
        "ํ˜„์žฌ์žฌ๊ณ ", "onhand", "on_hand",
        "stock", "inventory",
    )

    inv_col = None
    for col in df_seg.columns:
        low = str(col).lower()
        if any(key in low for key in candidates):
            inv_col = col
            break
    if inv_col is None:
        return None, None

    series = pd.to_numeric(df_seg[inv_col], errors="coerce").dropna()
    if series.empty:
        return None, None

    return inv_col, float(series.iloc[-1])
461
+
462
+ # ==============================
463
+ # ๊ฐ€๊ฒฉ ์ž๋™ ์ธ์‹
464
+ # ==============================
465
def guess_price_column(df_seg):
    """Return the first column whose name looks like a price, or ``None``.

    A column matches when its lower-cased name contains one of the price
    keywords. Labels are converted with ``str`` first so non-string column
    labels do not raise.

    Args:
        df_seg: DataFrame whose columns are inspected.

    Returns:
        The matching column label, or ``None`` when nothing matches.
    """
    keys = ["price", "๊ฐ€๊ฒฉ", "๋‹จ๊ฐ€", "ํŒ๋งค๊ฐ€", "amount", "๊ธˆ์•ก"]
    for col in df_seg.columns:
        low = str(col).lower()
        if any(k in low for k in keys):
            return col
    return None
472
+
473
+ # ==============================
474
+ # ๋ชจ๋ธ ๋กœ๋“œ
475
+ # ==============================
476
+ pkl_path = os.path.join(MODELS_DIR, "best_model.pkl")
477
+ if os.path.exists(pkl_path):
478
+ try:
479
+ with open(pkl_path, "rb") as f:
480
+ payload = pickle.load(f)
481
+ model = payload["model"]
482
+ feat_names = payload["feature_names"]
483
+ mapping = payload["mapping"]
484
+ except Exception as e:
485
+ st.error(f"์ €์žฅ๋œ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘ ์˜ค๋ฅ˜: {e}")
486
+ else:
487
+ dtc = mapping["date"]
488
+
489
+ # ======================================
490
+ # 1) ์˜ˆ์ธก ์ˆ˜ํ–‰
491
+ # ======================================
492
+ fc_df = iterative_forecast(
493
+ st.session_state["df"],
494
+ mapping,
495
+ model,
496
+ feat_names,
497
+ horizon_days,
498
+ seg_vals,
499
+ )
500
+ if fc_df.empty:
501
+ st.stop()
502
+
503
+ # ======================================
504
+ # 2) ๊ฐ€๊ฒฉ ์ž๋™ ์ธ์‹ + ๊ธˆ์•ก์˜ˆ์ธก
505
+ # ======================================
506
+ df_seg_price = st.session_state["df"].copy()
507
+ for k, v in seg_vals.items():
508
+ if v and v != "<์ „์ฒด>" and k in df_seg_price.columns:
509
+ df_seg_price = df_seg_price[df_seg_price[k].astype(str) == str(v)]
510
+ df_seg_price = df_seg_price.sort_values(dtc)
511
+
512
+ price_col = guess_price_column(df_seg_price)
513
+
514
+ if price_col:
515
+ price_val = float(
516
+ pd.to_numeric(df_seg_price[price_col], errors="coerce").dropna().iloc[-1]
517
+ )
518
+ st.info(f"CSV '{price_col}' ์ปฌ๋Ÿผ์—์„œ ๊ฐ€๊ฒฉ {price_val:,.0f}์› ์ž๋™ ์ธ์‹.")
519
+ else:
520
+ price_val = st.number_input(
521
+ "๊ฐ€๊ฒฉ(์›) โ€“ CSV์—์„œ ๊ฐ€๊ฒฉ ์ปฌ๋Ÿผ์„ ์ฐพ์ง€ ๋ชปํ•ด ์ง์ ‘ ์ž…๋ ฅ",
522
+ min_value=0,
523
+ max_value=100000000,
524
+ value=0,
525
+ )
526
+
527
+ # **์ˆ˜๋Ÿ‰ ์ดํ•ฉ**
528
+ total_qty_demand = float(fc_df["์˜ˆ์ธก์ˆ˜๋Ÿ‰"].sum())
529
+
530
+ # **๊ธˆ์•ก ์ดํ•ฉ**
531
+ fc_df["๊ธˆ์•ก์˜ˆ์ธก"] = (fc_df["์˜ˆ์ธก์ˆ˜๋Ÿ‰"] * price_val * float(accuracy)).clip(lower=0.0)
532
+ total_amt_demand = float(fc_df["๊ธˆ์•ก์˜ˆ์ธก"].sum())
533
+
534
+ # ======================================
535
+ # 3) ์žฌ๊ณ  ์ž๋™ ์ธ์‹
536
+ # ======================================
537
+ df_seg = st.session_state["df"].copy()
538
+ df_seg[dtc] = pd.to_datetime(df_seg[dtc], errors="coerce")
539
+ for k, v in seg_vals.items():
540
+ if v and v != "<์ „์ฒด>" and k in df_seg.columns:
541
+ df_seg = df_seg[df_seg[k].astype(str) == str(v)]
542
+ df_seg = df_seg.sort_values(dtc)
543
+
544
+ inv_col, onhand_auto = guess_inventory_onhand(df_seg, mapping)
545
+ if onhand_auto is None:
546
+ onhand = st.number_input(
547
+ "ํ˜„์žฌ ์žฌ๊ณ (์ง์ ‘ ์ž…๋ ฅ)",
548
+ min_value=0,
549
+ max_value=100000,
550
+ value=0,
551
+ )
552
+ else:
553
+ onhand = onhand_auto
554
+ st.info(f"์žฌ๊ณ  '{inv_col}' ์ž๋™ ์ธ์‹ โ†’ {onhand:,.0f}๊ฐœ")
555
+
556
+ # ======================================
557
+ # 4) ๋ฐœ์ฃผ๋Ÿ‰/์†Œ์ง„์ผ ๊ณ„์‚ฐ (์ˆ˜๋Ÿ‰ ๊ธฐ์ค€)
558
+ # ======================================
559
+ avg_daily_qty = total_qty_demand / horizon_days if horizon_days > 0 else 0.0
560
+ days_to_out = (onhand / avg_daily_qty) if avg_daily_qty > 0 else float("inf")
561
+ rec_qty = max(0.0, total_qty_demand - onhand)
562
+
563
+ c1, c2, c3 = st.columns(3)
564
+ c1.metric("์˜ˆ์ธก ๊ธฐ๊ฐ„(์ผ)", f"{horizon_days}")
565
+ c2.metric("์žฌ๊ณ  ์†Œ์ง„ ์˜ˆ์ƒ์ผ์ˆ˜", "โˆž" if np.isinf(days_to_out) else f"{days_to_out:,.1f}")
566
+ c3.metric("2์ฃผ ์ด ์˜ˆ์ƒ ๋งค์ถœ", f"{total_amt_demand:,.0f}์›")
567
+
568
+ # ======================================
569
+ # 5) ํ‘œ ์ถœ๋ ฅ
570
+ # ======================================
571
+ st.dataframe(fc_df.set_index(dtc), use_container_width=True)
572
+ st.caption("โ€ป ์˜ˆ์ธก์ˆ˜๋Ÿ‰ ร— ๊ฐ€๊ฒฉ ร— ์ •ํ™•๋„ ๋ณด์ • = ๊ธˆ์•ก์˜ˆ์ธก")
573
+
574
+ else:
575
+ st.warning("best_model.pkl ์ด ์—†์Šต๋‹ˆ๋‹ค. โ‘ก ํƒญ์—์„œ ํ•™์Šต์„ ๋จผ์ € ์ˆ˜ํ–‰ํ•˜์„ธ์š”.")
576
+
577
+ # ============================================================
578
+ # โ‘ฃ ๋ถ„์„(๊ทธ๋ž˜ํ”„):
579
+ # - ์šฐ์‚ฐ: ํ•œ ๋‹ฌ ๊ฐ•์ˆ˜๋Ÿ‰ vs ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰ (์‚ฐ์ ๋„ + ํšŒ๊ท€์„  + ์ผ๋ณ„ ์„ ํ˜• ๊ทธ๋ž˜ํ”„)
580
+ # - ๊ตฐ๊ณ ๊ตฌ๋งˆ: ํ•œ ๋‹ฌ ๊ธฐ์˜จ vs ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰ (์‚ฐ์ ๋„ + ํšŒ๊ท€์„  + ์ผ๋ณ„ ์„ ํ˜• ๊ทธ๋ž˜ํ”„)
581
+ # - ์ „์ฒด: ์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ ์ œ์™ธ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰ ์„ ํ˜• ๊ทธ๋ž˜ํ”„
582
+ # ============================================================
583
+ with tabs[3]:
584
+ st.subheader("๋ถ„์„(๊ทธ๋ž˜ํ”„) โ€” ํ•œ ๋‹ฌ ๋‹จ์œ„ ์ƒ๊ด€ ๋ถ„์„")
585
+
586
+ if "df" not in st.session_state or "mapping" not in st.session_state or not st.session_state["mapping"].get("date"):
587
+ st.info("๋จผ์ € โ‘  ํƒญ์—์„œ ๋ฐ์ดํ„ฐ์™€ ์ปฌ๋Ÿผ ๋งคํ•‘(ํŠนํžˆ '๋‚ ์งœ'์™€ 'ํƒ€๊นƒ')์„ ์ง€์ •ํ•˜์„ธ์š”.")
588
+ else:
589
+ mapping = st.session_state["mapping"]
590
+ date_col = mapping["date"]
591
+ target_col = mapping.get("target")
592
+
593
def guess(colnames, cands):
    """Pick the column that best matches a prioritised list of keywords.

    Candidates are tried in order; for each one the columns are scanned in
    order and the first column whose lower-cased name contains the
    lower-cased candidate is returned. Earlier candidates therefore win over
    earlier columns.

    Args:
        colnames: Sequence of column labels (converted with ``str`` for
            matching; the original label is returned).
        cands: Keywords in priority order.

    Returns:
        The matching column label, or ``None`` when nothing matches.
    """
    lowered = [str(name).lower() for name in colnames]
    for cand in cands:
        needle = str(cand).lower()
        for label, low_name in zip(colnames, lowered):
            if needle in low_name:
                return label
    return None
601
+
602
+ # ๊ณตํ†ต: ์—ฐ-์›” ์„ ํƒ์šฉ ์˜ต์…˜ ๋งŒ๋“œ๋Š” ํ•จ์ˆ˜
603
def build_year_month_options(df, date_col):
    """Parse the date column and list the year-months present in the data.

    Works on a copy of ``df``: the date column is converted with
    ``pd.to_datetime(errors="coerce")``, rows with unparseable dates are
    dropped, and a ``year_month`` column of monthly periods is added.

    Args:
        df: Input data (left unmodified).
        date_col: Name of the date column.

    Returns:
        ``(frame, options)`` where ``frame`` is the cleaned copy and
        ``options`` is a chronologically sorted list of ``(label, period)``
        pairs, the label being the period's string form (e.g. ``'2024-10'``).
        When no valid dates remain, ``options`` is ``[]`` and ``frame`` has
        no ``year_month`` column.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
    frame = frame.dropna(subset=[date_col])
    if frame.empty:
        return frame, []

    frame["year_month"] = frame[date_col].dt.to_period("M")
    periods = sorted(frame["year_month"].unique())
    return frame, [(str(period), period) for period in periods]
613
+
614
+ tab_u, tab_g, tab_all = st.tabs([
615
+ "โ˜” ์šฐ์‚ฐ: ํ•œ ๋‹ฌ ๊ฐ•์ˆ˜๋Ÿ‰ vs ํŒ๋งค๋Ÿ‰",
616
+ "๐Ÿ  ๊ตฐ๊ณ ๊ตฌ๋งˆ: ํ•œ ๋‹ฌ ๊ธฐ์˜จ vs ํŒ๋งค๋Ÿ‰",
617
+ "๐Ÿ“ˆ ์ „์ฒด: ์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ ์ œ์™ธ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰(์„ ํ˜•)"
618
+ ])
619
+
620
+ # ------------------------------
621
+ # 1) ์šฐ์‚ฐ: ์„ ํƒํ•œ ํ•œ ๋‹ฌ์˜ ๊ฐ•์ˆ˜๋Ÿ‰ โ†” ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰
622
+ # ------------------------------
623
+ with tab_u:
624
+ st.caption("์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰๊ณผ ๊ฐ•์ˆ˜๋Ÿ‰์˜ ๊ด€๊ณ„๋ฅผ 'ํ•œ ๋‹ฌ' ๋‹จ์œ„๋กœ ๋ด…๋‹ˆ๋‹ค.")
625
+
626
+ up_u = st.file_uploader("์šฐ์‚ฐ/๋‚ ์”จ ๋ฐ์ดํ„ฐ CSV (์„ ํƒ)", type=["csv"], key="umbrella_month_up")
627
+ if up_u is not None:
628
+ df_u_raw = read_csv_flexible(io.BytesIO(up_u.read()))
629
+ else:
630
+ df_u_raw = st.session_state["df"].copy()
631
+
632
+ if date_col not in df_u_raw.columns:
633
+ st.warning(f"๋‚ ์งœ ์ปฌ๋Ÿผ '{date_col}' ์„(๋ฅผ) ๋ฐ์ดํ„ฐ์—์„œ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
634
+ else:
635
+ # item์—์„œ ์šฐ์‚ฐ๋งŒ ํ•„ํ„ฐ (์žˆ์œผ๋ฉด)
636
+ item_col = mapping.get("item")
637
+ if item_col and item_col in df_u_raw.columns:
638
+ mask = df_u_raw[item_col].astype(str).str.contains("์šฐ์‚ฐ|umbrella", case=False, na=False)
639
+ if mask.any():
640
+ df_u_raw = df_u_raw[mask]
641
+
642
+ cols_all = list(df_u_raw.columns)
643
+
644
+ # ํŒ๋งค๋Ÿ‰ ์ปฌ๋Ÿผ: ๋งคํ•‘ target ์šฐ์„ , ์—†์œผ๋ฉด ์ถ”์ •
645
+ sales_col = target_col if target_col in cols_all else guess(
646
+ cols_all,
647
+ ["umbrella", "์šฐ์‚ฐ", "์ผ์ผํŒ๋งค๋Ÿ‰", "ํŒ๋งค๋Ÿ‰", "sales", "qty", "quantity", "target"],
648
+ )
649
+
650
+ # ๊ฐ•์ˆ˜๋Ÿ‰ ์ปฌ๋Ÿผ ์ถ”์ •
651
+ rain_col = guess(
652
+ cols_all,
653
+ ["rain", "precip", "precipitation", "๊ฐ•์ˆ˜", "๊ฐ•์ˆ˜๋Ÿ‰", "์ผ๊ฐ•์ˆ˜๋Ÿ‰", "๊ฐ•์šฐ", "๊ฐ•์šฐ๋Ÿ‰"],
654
+ )
655
+
656
+ if not sales_col or not rain_col:
657
+ st.warning(
658
+ "์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰ ๋˜๋Š” ๊ฐ•์ˆ˜๋Ÿ‰ ์ปฌ๋Ÿผ์„ ์ž๋™์œผ๋กœ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.\n"
659
+ "ํŒ๋งค๋Ÿ‰: '์šฐ์‚ฐ/umbrella/ํŒ๋งค๋Ÿ‰/sales', ๊ฐ•์ˆ˜๋Ÿ‰: '๊ฐ•์ˆ˜๋Ÿ‰/rain' ๋“ฑ์˜ ์ด๋ฆ„์„ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”."
660
+ )
661
+ else:
662
+ # ๋‚ ์งœ/์ˆซ์ž ํ˜•์‹ ์ •๋ฆฌ + ์—ฐ-์›” ์˜ต์…˜ ์ƒ์„ฑ
663
+ df_u_raw[sales_col] = pd.to_numeric(df_u_raw[sales_col], errors="coerce")
664
+ df_u_raw[rain_col] = pd.to_numeric(df_u_raw[rain_col], errors="coerce")
665
+
666
+ df_u_raw, ym_options = build_year_month_options(df_u_raw, date_col)
667
+
668
+ if not ym_options:
669
+ st.info("์œ ํšจํ•œ ๋‚ ์งœ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
670
+ else:
671
+ # ์—ฐ-์›” ์„ ํƒ (YYYY-MM ํ˜•์‹๋งŒ ๋ณด์—ฌ์คŒ)
672
+ labels = [lab for lab, _ in ym_options]
673
+ default_idx = len(labels) - 1 # ๊ธฐ๋ณธ๊ฐ’: ๊ฐ€์žฅ ์ตœ๊ทผ ์›”
674
+ sel_label = st.selectbox("๋ถ„์„ํ•  ์—ฐ์›”(YYYY-MM)", labels, index=default_idx, key="ym_umbrella")
675
+ sel_period = dict(ym_options)[sel_label]
676
+
677
+ # ์„ ํƒํ•œ ํ•œ ๋‹ฌ๋งŒ ํ•„ํ„ฐ
678
+ df_month = df_u_raw[df_u_raw["year_month"] == sel_period].copy()
679
+ if df_month.empty:
680
+ st.info(f"{sel_label} ์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
681
+ else:
682
+ # ์ผ ๋‹จ์œ„ ์ง‘๊ณ„
683
+ df_month["date_only"] = df_month[date_col].dt.date
684
+ daily = (
685
+ df_month.groupby("date_only", as_index=False)
686
+ .agg({sales_col: "sum", rain_col: "mean"})
687
+ .dropna(subset=[sales_col, rain_col])
688
+ )
689
+ daily = daily.rename(
690
+ columns={"date_only": "date", sales_col: "sales", rain_col: "rain"}
691
+ )
692
+
693
+ if daily.empty:
694
+ st.info("ํ•ด๋‹น ์—ฐ์›”์—์„œ ์ผ๋ณ„๋กœ ์ง‘๊ณ„ํ•  ์ˆ˜ ์žˆ๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
695
+ else:
696
+ st.markdown(f"**{sel_label} ํ•œ ๋‹ฌ ๊ธฐ์ค€ ยท ๊ฐ•์ˆ˜๋Ÿ‰์— ๋”ฐ๋ฅธ ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰**")
697
+
698
+ base = alt.Chart(daily).encode(
699
+ x=alt.X("rain:Q", title="์ผ ๊ฐ•์ˆ˜๋Ÿ‰"),
700
+ y=alt.Y("sales:Q", title="์ผ ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰"),
701
+ )
702
+
703
+ # ๋ถ‰์€์ƒ‰ ์‚ฐ์ ๋„ + ์„ ํ˜• ํšŒ๊ท€์„ 
704
+ points = base.mark_circle(size=70, color="#d62728").encode(
705
+ tooltip=[
706
+ alt.Tooltip("date:T", title="๋‚ ์งœ"),
707
+ alt.Tooltip("rain:Q", title="๊ฐ•์ˆ˜๋Ÿ‰"),
708
+ alt.Tooltip("sales:Q", title="์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰"),
709
+ ]
710
+ )
711
+ reg_line = base.transform_regression("rain", "sales").mark_line(color="#b22222")
712
+
713
+ st.altair_chart((points + reg_line).interactive(), use_container_width=True)
714
+
715
+ # โ˜… ์ถ”๊ฐ€: ์ผ๋ณ„ ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰ ์„ ํ˜• ๊ทธ๋ž˜ํ”„
716
+ st.markdown("**์ผ๋ณ„ ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰ ์ถ”์„ธ(์„ ํ˜• ๊ทธ๋ž˜ํ”„)**")
717
+ line_umbrella = (
718
+ alt.Chart(daily)
719
+ .mark_line()
720
+ .encode(
721
+ x=alt.X("date:T", title="๋‚ ์งœ"),
722
+ y=alt.Y("sales:Q", title="์ผ ์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰"),
723
+ tooltip=[
724
+ alt.Tooltip("date:T", title="๋‚ ์งœ"),
725
+ alt.Tooltip("sales:Q", title="์šฐ์‚ฐ ํŒ๋งค๋Ÿ‰"),
726
+ alt.Tooltip("rain:Q", title="๊ฐ•์ˆ˜๋Ÿ‰"),
727
+ ],
728
+ )
729
+ )
730
+ st.altair_chart(line_umbrella.interactive(), use_container_width=True)
731
+
732
+ # ์ฐธ๊ณ ์šฉ ํ…Œ์ด๋ธ”
733
+ st.dataframe(daily, use_container_width=True)
734
+
735
+ # ------------------------------
736
+ # 2) ๊ตฐ๊ณ ๊ตฌ๋งˆ: ์„ ํƒํ•œ ํ•œ ๋‹ฌ์˜ ๊ธฐ์˜จ โ†” ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰
737
+ # ------------------------------
738
+ with tab_g:
739
+ st.caption("๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰๊ณผ ๊ธฐ์˜จ(์ถ”์œ„)์˜ ๊ด€๊ณ„๋ฅผ 'ํ•œ ๋‹ฌ' ๋‹จ์œ„๋กœ ๋ด…๋‹ˆ๋‹ค.")
740
+
741
+ up_g = st.file_uploader("๊ตฐ๊ณ ๊ตฌ๋งˆ/๋‚ ์”จ ๋ฐ์ดํ„ฐ CSV (์„ ํƒ)", type=["csv"], key="goguma_month_up")
742
+ if up_g is not None:
743
+ df_g_raw = read_csv_flexible(io.BytesIO(up_g.read()))
744
+ else:
745
+ df_g_raw = st.session_state["df"].copy()
746
+
747
+ if date_col not in df_g_raw.columns:
748
+ st.warning(f"๋‚ ์งœ ์ปฌ๋Ÿผ '{date_col}' ์„(๋ฅผ) ๋ฐ์ดํ„ฐ์—์„œ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
749
+ else:
750
+ # item์—์„œ ๊ตฐ๊ณ ๊ตฌ๋งˆ๋งŒ ํ•„ํ„ฐ (์žˆ์œผ๋ฉด)
751
+ item_col_g = mapping.get("item")
752
+ if item_col_g and item_col_g in df_g_raw.columns:
753
+ mask_g = df_g_raw[item_col_g].astype(str).str.contains(
754
+ "๊ณ ๊ตฌ๋งˆ|๊ตฐ๊ณ ๊ตฌ๋งˆ|sweet|goguma", case=False, na=False
755
+ )
756
+ if mask_g.any():
757
+ df_g_raw = df_g_raw[mask_g]
758
+
759
+ cols_all_g = list(df_g_raw.columns)
760
+
761
+ goguma_col = target_col if target_col in cols_all_g else guess(
762
+ cols_all_g,
763
+ ["๊ณ ๊ตฌ๋งˆ", "๊ตฐ๊ณ ๊ตฌ๋งˆ", "sweetpotato", "goguma", "ํŒ๋งค๋Ÿ‰", "sales", "qty", "quantity", "target"],
764
+ )
765
+ temp_col = guess(
766
+ cols_all_g,
767
+ ["์˜จ๋„", "tmin", "temp_min", "min_temp", "์ตœ์ €", "์ตœ์ €๊ธฐ์˜จ", "์ผ์ตœ์ €๊ธฐ์˜จ", "temperature", "temp"],
768
+ )
769
+
770
+ if not goguma_col or not temp_col:
771
+ st.warning(
772
+ "๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰ ๋˜๋Š” ๊ธฐ์˜จ ์ปฌ๋Ÿผ์„ ์ž๋™์œผ๋กœ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.\n"
773
+ "ํŒ๋งค๋Ÿ‰: '๊ตฐ๊ณ ๊ตฌ๋งˆ/๊ณ ๊ตฌ๋งˆ/sales/target', ๊ธฐ์˜จ: 'tmin/์ตœ์ €๊ธฐ์˜จ/temperature' ๋“ฑ์˜ ์ด๋ฆ„์„ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”."
774
+ )
775
+ else:
776
+ df_g_raw[goguma_col] = pd.to_numeric(df_g_raw[goguma_col], errors="coerce")
777
+ df_g_raw[temp_col] = pd.to_numeric(df_g_raw[temp_col], errors="coerce")
778
+
779
+ df_g_raw, ym_options_g = build_year_month_options(df_g_raw, date_col)
780
+
781
+ if not ym_options_g:
782
+ st.info("์œ ํšจํ•œ ๋‚ ์งœ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
783
+ else:
784
+ labels_g = [lab for lab, _ in ym_options_g]
785
+ default_idx_g = len(labels_g) - 1
786
+ sel_label_g = st.selectbox("๋ถ„์„ํ•  ์—ฐ์›”(YYYY-MM)", labels_g, index=default_idx_g, key="ym_goguma")
787
+ sel_period_g = dict(ym_options_g)[sel_label_g]
788
+
789
+ df_month_g = df_g_raw[df_g_raw["year_month"] == sel_period_g].copy()
790
+ if df_month_g.empty:
791
+ st.info(f"{sel_label_g} ์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
792
+ else:
793
+ df_month_g["date_only"] = df_month_g[date_col].dt.date
794
+ daily_g = (
795
+ df_month_g.groupby("date_only", as_index=False)
796
+ .agg({goguma_col: "sum", temp_col: "mean"})
797
+ .dropna(subset=[goguma_col, temp_col])
798
+ )
799
+ daily_g = daily_g.rename(
800
+ columns={"date_only": "date", goguma_col: "sales", temp_col: "temp"}
801
+ )
802
+
803
+ if daily_g.empty:
804
+ st.info("ํ•ด๋‹น ์—ฐ์›”์—์„œ ์ผ๋ณ„๋กœ ์ง‘๊ณ„ํ•  ์ˆ˜ ์žˆ๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
805
+ else:
806
+ st.markdown(f"**{sel_label_g} ํ•œ ๋‹ฌ ๊ธฐ์ค€ ยท ๊ธฐ์˜จ์— ๋”ฐ๋ฅธ ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰**")
807
+
808
+ base_g = alt.Chart(daily_g).encode(
809
+ x=alt.X("temp:Q", title="์ผ ํ‰๊ท  ๊ธฐ์˜จ"),
810
+ y=alt.Y("sales:Q", title="์ผ ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰"),
811
+ )
812
+
813
+ points_g = base_g.mark_circle(size=70, color="#ff7f0e").encode(
814
+ tooltip=[
815
+ alt.Tooltip("date:T", title="๋‚ ์งœ"),
816
+ alt.Tooltip("temp:Q", title="๊ธฐ์˜จ"),
817
+ alt.Tooltip("sales:Q", title="๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰"),
818
+ ]
819
+ )
820
+ reg_g = base_g.transform_regression("temp", "sales").mark_line(color="#d35400")
821
+
822
+ st.altair_chart((points_g + reg_g).interactive(), use_container_width=True)
823
+
824
+ # โ˜… ์ถ”๊ฐ€: ์ผ๋ณ„ ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰ ์„ ํ˜• ๊ทธ๋ž˜ํ”„
825
+ st.markdown("**์ผ๋ณ„ ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰ ์ถ”์„ธ(์„ ํ˜• ๊ทธ๋ž˜ํ”„)**")
826
+ line_goguma = (
827
+ alt.Chart(daily_g)
828
+ .mark_line()
829
+ .encode(
830
+ x=alt.X("date:T", title="๋‚ ์งœ"),
831
+ y=alt.Y("sales:Q", title="์ผ ๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰"),
832
+ tooltip=[
833
+ alt.Tooltip("date:T", title="๋‚ ์งœ"),
834
+ alt.Tooltip("temp:Q", title="๊ธฐ์˜จ"),
835
+ alt.Tooltip("sales:Q", title="๊ตฐ๊ณ ๊ตฌ๋งˆ ํŒ๋งค๋Ÿ‰"),
836
+ ],
837
+ )
838
+ )
839
+ st.altair_chart(line_goguma.interactive(), use_container_width=True)
840
+
841
+ st.dataframe(daily_g, use_container_width=True)
842
+
843
+ # ------------------------------
844
+ # 3) ์ „์ฒด: ์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ ์ œ์™ธ ์ „์ฒด ์ƒํ’ˆ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰ ์„ ํ˜• ๊ทธ๋ž˜ํ”„
845
+ # ------------------------------
846
+ with tab_all:
847
+ st.caption("์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ๋ฅผ ์ œ์™ธํ•œ ๋ชจ๋“  ์ƒํ’ˆ์˜ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰ ์ถ”์„ธ๋ฅผ ํ•œ ๋ฒˆ์— ๋ด…๋‹ˆ๋‹ค.")
848
+
849
+ df_all = st.session_state["df"].copy()
850
+
851
+ if date_col not in df_all.columns or not target_col or target_col not in df_all.columns:
852
+ st.warning(f"๋‚ ์งœ('{date_col}') ๋˜๋Š” ํƒ€๊นƒ('{target_col}') ์ปฌ๋Ÿผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
853
+ else:
854
+ # item ์ปฌ๋Ÿผ์ด ์žˆ์œผ๋ฉด ์šฐ์‚ฐ/๊ตฐ๊ณ ๊ตฌ๋งˆ ๊ด€๋ จ ์ƒํ’ˆ ์ œ์™ธ
855
+ item_col_all = mapping.get("item")
856
+ if item_col_all and item_col_all in df_all.columns:
857
+ ex_mask = df_all[item_col_all].astype(str).str.contains(
858
+ "์šฐ์‚ฐ|umbrella|๊ณ ๊ตฌ๋งˆ|๊ตฐ๊ณ ๊ตฌ๋งˆ|sweet|goguma", case=False, na=False
859
+ )
860
+ df_all = df_all[~ex_mask]
861
+
862
+ df_all[target_col] = pd.to_numeric(df_all[target_col], errors="coerce")
863
+
864
+ df_all, ym_options_all = build_year_month_options(df_all, date_col)
865
+
866
+ if not ym_options_all:
867
+ st.info("์œ ํšจํ•œ ๋‚ ์งœ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
868
+ else:
869
+ labels_all = [lab for lab, _ in ym_options_all]
870
+ default_idx_all = len(labels_all) - 1
871
+ sel_label_all = st.selectbox(
872
+ "๋ถ„์„ํ•  ์—ฐ์›”(YYYY-MM)",
873
+ labels_all,
874
+ index=default_idx_all,
875
+ key="ym_all",
876
+ )
877
+ sel_period_all = dict(ym_options_all)[sel_label_all]
878
+
879
+ df_month_all = df_all[df_all["year_month"] == sel_period_all].copy()
880
+ if df_month_all.empty:
881
+ st.info(f"{sel_label_all} ์— ํ•ด๋‹นํ•˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
882
+ else:
883
+ df_month_all["date_only"] = df_month_all[date_col].dt.date
884
+ daily_all = (
885
+ df_month_all.groupby("date_only", as_index=False)
886
+ .agg({target_col: "sum"})
887
+ .dropna(subset=[target_col])
888
+ )
889
+ daily_all = daily_all.rename(
890
+ columns={"date_only": "date", target_col: "sales"}
891
+ )
892
+
893
+ if daily_all.empty:
894
+ st.info("ํ•ด๋‹น ์—ฐ์›”์—์„œ ์ผ๋ณ„๋กœ ์ง‘๊ณ„ํ•  ์ˆ˜ ์žˆ๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
895
+ else:
896
+ st.markdown(f"**{sel_label_all} ํ•œ ๋‹ฌ ๊ธฐ์ค€ ยท ์šฐ์‚ฐยท๊ตฐ๊ณ ๊ตฌ๋งˆ ์ œ์™ธ ์ „์ฒด ์ƒํ’ˆ ์ผ๋ณ„ ํŒ๋งค๋Ÿ‰(์„ ํ˜•)**")
897
+
898
+ line_all = (
899
+ alt.Chart(daily_all)
900
+ .mark_line()
901
+ .encode(
902
+ x=alt.X("date:T", title="๋‚ ์งœ"),
903
+ y=alt.Y("sales:Q", title="์ผ ํŒ๋งค๋Ÿ‰(์ „์ฒด ์ƒํ’ˆ ํ•ฉ๊ณ„)"),
904
+ tooltip=[
905
+ alt.Tooltip("date:T", title="๋‚ ์งœ"),
906
+ alt.Tooltip("sales:Q", title="์ผ ํŒ๋งค๋Ÿ‰ ํ•ฉ๊ณ„"),
907
+ ],
908
+ )
909
+ )
910
+ st.altair_chart(line_all.interactive(), use_container_width=True)
911
+ st.dataframe(daily_all, use_container_width=True)
912
+
913
+ # ============================================================
914
+ # โ‘ค ์ง„๋‹จ/๋กœ๊ทธ: ๊ฒฝ๋กœ/ํŒŒ์ผ ํ™•์ธ + ํผ๋ธ”๋ฆญ URL ์—ด๊ธฐ/๋‹ซ๊ธฐ
915
+ # ============================================================
916
+ with tabs[4]:
917
+ st.subheader("๊ฒฝ๋กœ/ํŒŒ์ผ ์ƒํƒœ")
918
+
919
+ cols = st.columns(2)
920
+ with cols[0]:
921
+ st.write("**data**", DATA_DIR)
922
+ st.write(os.listdir(DATA_DIR) if os.path.exists(DATA_DIR) else [])
923
+ st.write("**artifacts**", ARTI_DIR)
924
+ st.write(os.listdir(ARTI_DIR) if os.path.exists(ARTI_DIR) else [])
925
+ with cols[1]:
926
+ st.write("**models**", MODELS_DIR)
927
+ st.write(os.listdir(MODELS_DIR) if os.path.exists(MODELS_DIR) else [])
928
+
929
+ st.caption("ํ•„์š” ์‹œ ํผ๋ธ”๋ฆญ URL์„ ์—ด์–ด ์™ธ๋ถ€์—์„œ ์ ‘์†ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
930
+ mode = st.radio("ํผ๋ธ”๋ฆญ URL ํ„ฐ๋„๋Ÿฌ", ["ngrok", "cloudflared"], horizontal=True, index=0)
931
+
932
+ ngk = None
933
+ if mode == "ngrok":
934
+ ngk = st.text_input(
935
+ "NGROK_AUTHTOKEN",
936
+ value=os.environ.get("NGROK_AUTHTOKEN", ""),
937
+ type="password",
938
+ help="ํ™˜๊ฒฝ๋ณ€์ˆ˜์— ๋„ฃ์–ด๋‘๋ฉด ๋‹ค์Œ๋ถ€ํ„ฐ ์ž๋™ ์ธ์‹ํ•ฉ๋‹ˆ๋‹ค.",
939
+ )
940
+
941
+ c_open, c_close = st.columns(2)
942
+ if c_open.button("ํผ๋ธ”๋ฆญ URL ์—ด๊ธฐ", use_container_width=True):
943
+ if mode == "ngrok":
944
+ if ngk:
945
+ os.environ["NGROK_AUTHTOKEN"] = ngk
946
+ start_ngrok()
947
+ else:
948
+ start_cloudflared()
949
+
950
+ if c_close.button("ํผ๋ธ”๋ฆญ URL ๋‹ซ๊ธฐ", use_container_width=True):
951
+ if mode == "ngrok":
952
+ try:
953
+ from pyngrok import ngrok
954
+ ngrok.kill()
955
+ st.info("ngrok ํ„ฐ๋„์„ ์ข…๋ฃŒํ–ˆ์Šต๋‹ˆ๋‹ค.")
956
+ except Exception as e:
957
+ st.warning(f"ngrok ์ข…๋ฃŒ ์ค‘ ๊ฒฝ๊ณ : {e}")
958
+ else:
959
+ proc = st.session_state.get("_cfd_proc")
960
+ if proc:
961
+ proc.terminate()
962
+ st.info("cloudflared ํ„ฐ๋„์„ ์ข…๋ฃŒํ–ˆ์Šต๋‹ˆ๋‹ค.")
963
+ else:
964
+ st.info("cloudflared ํ™œ์„ฑ ํ”„๋กœ์„ธ์Šค๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
preprocess.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd, numpy as np
2
+
3
def add_time_features(df, date_col):
    """Return a copy of *df* with calendar features derived from *date_col*.

    The date column is parsed with ``pd.to_datetime(..., errors="coerce")``;
    rows whose date cannot be parsed are dropped and the remaining rows are
    sorted by date. The input frame itself is never modified.

    Args:
        df: Source DataFrame.
        date_col: Name of the column holding the dates.

    Returns:
        A new DataFrame, sorted by ``date_col``, with these integer columns
        added: ``year``, ``month``, ``day``, ``dow`` (Monday=0 ... Sunday=6),
        ``week`` (ISO-8601 week number, so early-January dates can belong to
        week 52/53 of the previous ISO year) and ``is_weekend`` (1 for
        Saturday/Sunday, otherwise 0).
    """
    out = df.copy()  # work on a copy so the caller's frame stays untouched

    # Unparseable values become NaT and are removed below.
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    # A stable sort keeps rows that share a date in their original relative
    # order, so anything computed row-by-row afterwards is reproducible.
    out = out.dropna(subset=[date_col]).sort_values(date_col, kind="stable")

    stamps = out[date_col].dt
    out["year"] = stamps.year
    out["month"] = stamps.month
    out["day"] = stamps.day
    out["dow"] = stamps.dayofweek
    # isocalendar() yields a nullable UInt32 column; cast to a plain int.
    out["week"] = stamps.isocalendar().week.astype(int)
    out["is_weekend"] = (out["dow"] >= 5).astype(int)

    return out
40
+
41
+
42
def add_lag_features(df, date_col, target_col, group_keys=None, lags=(1, 7, 14), rolls=(7, 14), *, include_current=False):
    """Return a copy of *df* with lag and rolling-window features of the target.

    For every lag ``l`` a column ``lag{l}`` is added, and for every window
    ``w`` the columns ``rmean{w}`` (rolling mean) and ``rstd{w}`` (rolling
    standard deviation). Features are computed separately for each group and
    the result is returned sorted by ``date_col``.

    Lags and windows count *rows* inside a group after sorting by date, so
    "lag7" means "seven days ago" only when each group has exactly one row
    per day.

    Args:
        df: Source DataFrame (not modified).
        date_col: Name of the date column used for ordering.
        target_col: Name of the numeric column to derive features from.
        group_keys: Optional column names to group by; names that are not
            present in ``df`` are ignored. With no usable key the whole frame
            is treated as one series.
        lags: Row offsets for the ``lag{l}`` columns.
        rolls: Window sizes for the ``rmean{w}`` / ``rstd{w}`` columns.
        include_current: When False (default) the rolling statistics only
            see rows strictly before the current one, so the current target
            never leaks into its own features. Pass True to reproduce the
            previous behaviour, where the window ended on the current row.
            Models fitted on the previous feature values should be retrained
            or called with ``include_current=True``.

    Returns:
        A new DataFrame sorted by ``date_col`` with the feature columns added.
    """
    df = df.copy()

    # Keep only the grouping columns that really exist in the frame.
    keys = [c for c in (group_keys or []) if c in df.columns]

    def _featurize(part):
        """Add lag/rolling columns to one date-ordered series."""
        part = part.sort_values(date_col, kind="stable").copy()
        series = part[target_col]

        for lag in lags:
            part[f"lag{lag}"] = series.shift(lag)

        # Shift by one row so the window covers past rows only.
        base = series if include_current else series.shift(1)
        for w in rolls:
            # Require at least half the window (minimum 2 rows), but never
            # more than the window itself, which pandas rejects.
            window = base.rolling(w, min_periods=min(w, max(2, w // 2)))
            part[f"rmean{w}"] = window.mean()
            part[f"rstd{w}"] = window.std()
        return part

    if keys and not df.empty:
        # dropna=False keeps rows whose group key is missing instead of
        # silently discarding them.
        parts = [_featurize(part) for _, part in df.groupby(keys, dropna=False)]
    else:
        # No grouping (or nothing to group): one pass over the whole frame,
        # which also guarantees the feature columns exist on empty input.
        parts = [_featurize(df)]

    return pd.concat(parts, axis=0).sort_values(date_col, kind="stable")
105
+
106
+
107
def make_matrix(df, mapping):
    """Build the model input matrix ``X`` and target vector ``y`` from raw data.

    Steps:
      1. Read the column names for the date/target (required) and
         region/brand/item (optional) roles from ``mapping``.
      2. Coerce the target to numeric; unparseable or missing values become 0.
      3. Add calendar features (``add_time_features``) and lag/rolling
         features (``add_lag_features``), grouped by the categorical columns.
      4. Drop rows where any lag/rolling feature is still NaN (the warm-up
         rows at the start of each series).
      5. One-hot encode the categorical columns and append them to ``X``.

    Args:
        df: Source DataFrame (not modified).
        mapping: Dict with keys ``'date'`` and ``'target'`` and optionally
            ``'region'``, ``'brand'``, ``'item'``, each naming a column.

    Returns:
        Tuple ``(df, X, y, feat_names)``: the featurized frame, the 2-D float
        feature array, the 1-D target array, and the column names of ``X`` in
        order.

    Raises:
        ValueError: If the date or target mapping is missing.
    """
    df = df.copy()

    date_col = mapping.get("date")
    target_col = mapping.get("target")

    # Date and target are mandatory; nothing can be built without them.
    if not date_col or not target_col:
        raise ValueError("date/target 컬럼 매핑이 필요합니다.")

    # Optional categorical columns: keep those that exist, and keep each
    # column once even if two roles were mapped to the same name (otherwise
    # the one-hot block would contain duplicated columns).
    cat_cols = []
    for role in ("region", "brand", "item"):
        col = mapping.get(role)
        if col and col in df.columns and col not in cat_cols:
            cat_cols.append(col)

    # The target must be numeric; anything unparseable is treated as 0.
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce").fillna(0)

    # Categoricals are normalised to strings so one-hot encoding is uniform.
    for col in cat_cols:
        df[col] = df[col].astype(str)

    df = add_time_features(df, date_col)
    df = add_lag_features(df, date_col, target_col, cat_cols)

    def _is_generated(name):
        """True for lag<N> / rmean<N> / rstd<N> feature column names."""
        if not isinstance(name, str):
            return False
        for prefix in ("lag", "rmean", "rstd"):
            if name.startswith(prefix) and name[len(prefix):].isdigit():
                return True
        return False

    # Match the generated names exactly, so unrelated input columns that
    # merely start with "lag"/"rmean"/"rstd" are not pulled into the features.
    lag_cols = [c for c in df.columns if _is_generated(c)]

    # Warm-up rows cannot have complete lag/rolling values; drop them.
    df = df.dropna(subset=lag_cols)

    calendar_cols = ["year", "month", "day", "dow", "week", "is_weekend"]
    num_cols = [c for c in calendar_cols if c in df.columns] + lag_cols

    X_num = df[num_cols].to_numpy(dtype=float)
    feat_names = list(num_cols)

    if cat_cols:
        dummies = pd.get_dummies(df[cat_cols].astype(str), dummy_na=False)
        # Append the one-hot block to the right of the numeric block.
        X = np.hstack([X_num, dummies.to_numpy(dtype=float)])
        feat_names += list(dummies.columns)
    else:
        X = X_num

    y = df[target_col].to_numpy()

    return df, X, y, feat_names
quick_train_runner.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ ============================================================
5
+ ์ž๋™ ํ•™์Šต ๋Ÿฐ์ฒ˜ (train_cli.py ์˜ˆ์‹œ)
6
+ ------------------------------------------------------------
7
+ ์ด ์Šคํฌ๋ฆฝํŠธ๋Š” CSV๋ฅผ ์ฝ์–ด ์ž๋™์œผ๋กœ ์ปฌ๋Ÿผ ๋งคํ•‘ โ†’ ํ”ผ์ฒ˜ ์ƒ์„ฑ โ†’
8
+ ๋ชจ๋ธ ํ›„๋ณด ํ•™์Šต(์˜ต์…˜: Optuna ํŠœ๋‹) โ†’ ์•„ํ‹ฐํŒฉํŠธ/๋ชจ๋ธ ์ €์žฅ์„
9
+ ํ•œ ๋ฒˆ์— ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
10
+
11
+ [์‚ฌ์šฉ ์˜ˆ]
12
+ python train_cli.py --data ./data/sample_sales.csv \
13
+ --project . \
14
+ --valid_ratio 0.2 \
15
+ --use_optuna --optuna_trials 20
16
+
17
+ ํ•„์ˆ˜:
18
+ --data ํ•™์Šต์— ์‚ฌ์šฉํ•  CSV ํŒŒ์ผ ๊ฒฝ๋กœ
19
+
20
+ ์„ ํƒ:
21
+ --project ์ž‘์—… ๋ฃจํŠธ ํด๋”(๊ธฐ๋ณธ: ํ˜„์žฌ ํด๋” ".")
22
+ --valid_ratio ๊ฒ€์ฆ ๋น„์œจ(0.05~0.4 ๊ถŒ์žฅ, ๊ธฐ๋ณธ 0.2)
23
+ --use_optuna Optuna ํŠœ๋‹ ์‚ฌ์šฉ ํ”Œ๋ž˜๊ทธ(์ง€์ • ์‹œ on)
24
+ --optuna_trials Optuna ์‹œ๋„ ํšŸ์ˆ˜(๊ธฐ๋ณธ 15)
25
+
26
+ ์ถœ๋ ฅ:
27
+ ํ”„๋กœ์ ํŠธ ํด๋” ์•„๋ž˜์—
28
+ artifacts/ (๋กœ๊ทธ/๋ฆฌ๋”๋ณด๋“œ ๋“ฑ ์ค‘๊ฐ„ ์‚ฐ์ถœ๋ฌผ)
29
+ models/ (best_model.pkl ๋“ฑ ๋ชจ๋ธ ํŒŒ์ผ)
30
+ ์ด ์ƒ์„ฑ๋ฉ๋‹ˆ๋‹ค.
31
+ ============================================================
32
+ """
33
+
34
+ import os
35
+ import argparse
36
+ import pandas as pd # (ํ•„์š”ํ•˜๋ฉด ์ถ”ํ›„ ์‚ฌ์šฉ, ์ง€๊ธˆ์€ ์ž„ํฌํŠธ๋งŒ)
37
+
38
+ from utils_io import read_csv_flexible, save_utf8sig, ensure_dirs, auto_map_columns
39
+ from preprocess import make_matrix
40
+ from train_core import train_and_score, save_artifacts
41
+
42
+
43
def main():
    """Run the whole training pipeline from the command line.

    Steps: parse arguments, switch to the project directory, load the CSV,
    auto-map its columns, build the feature matrix, train the candidate
    models (optionally tuned with Optuna) and save the model and artifacts
    under ``<project>/artifacts`` and ``<project>/models``.
    """
    # 1) Command-line options
    ap = argparse.ArgumentParser()
    ap.add_argument("--data", required=True, help="학습에 사용할 CSV 경로 (예: ./data/sales.csv)")
    ap.add_argument("--project", default=".", help="작업 루트 폴더(artifacts/models 생성 위치). 기본값='.'")
    ap.add_argument("--valid_ratio", type=float, default=0.2, help="검증 데이터 비율(기본 0.2)")
    ap.add_argument("--use_optuna", action="store_true", help="Optuna 튜닝 사용 여부(플래그 지정 시 사용)")
    ap.add_argument("--optuna_trials", type=int, default=15, help="Optuna 시도 횟수(기본 15)")
    args = ap.parse_args()

    # Remember where --data points from the invocation directory *before*
    # changing directory, so a path typed relative to the shell still works.
    data_from_cwd = os.path.abspath(args.data)

    # 2) Work from the project root; create it if needed, since the output
    #    folders are created inside it anyway.
    proj = os.path.abspath(args.project)
    os.makedirs(proj, exist_ok=True)
    os.chdir(proj)

    # 3) Load the CSV and map its columns. A path that resolves inside the
    #    project keeps priority (previous behaviour); otherwise fall back to
    #    the path as seen from the invocation directory.
    data_path = args.data
    if not os.path.exists(data_path) and os.path.exists(data_from_cwd):
        data_path = data_from_cwd
    data = read_csv_flexible(data_path)
    mapping = auto_map_columns(data)

    # 4) Feature matrix
    df, X, y, feat_names = make_matrix(data, mapping)

    # 5) Output folders
    artifacts = os.path.join(proj, "artifacts")
    models_dir = os.path.join(proj, "models")
    ensure_dirs(artifacts, models_dir)

    # 6) Train and collect the leaderboard
    best_model, lb = train_and_score(
        X, y,
        valid_ratio=args.valid_ratio,
        use_optuna=args.use_optuna,
        optuna_trials=args.optuna_trials,
    )

    # 7) Persist model, metadata and leaderboard
    save_artifacts([artifacts, models_dir], best_model, feat_names, mapping, lb)

    # 8) Console summary
    print("✅ training done.")
    print(" - artifacts:", artifacts)
    print(" - models :", models_dir)
    # The leaderboard is printed via head() when it offers one, else as-is.
    print(lb.head() if hasattr(lb, "head") else lb)
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
requirements.txt CHANGED
@@ -1,3 +1,10 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
1
+ pandas==2.2.2
2
+ numpy==1.26.4
3
+ streamlit==1.39.0
4
+ altair>=5,<6
5
+ scikit-learn
6
+ pyngrok
7
+ cloudflared
8
+ xgboost
9
+ lightgbm
10
+ optuna
starter.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","id":"GTG-zSlVYDJg","metadata":{"id":"GTG-zSlVYDJg"},"source":["# cstore_suite Colab ๋Ÿฐ์ฒ˜\n","\n","์ด ๋…ธํŠธ๋ถ์€ `cstore_suite` ํ”„๋กœ์ ํŠธ๋ฅผ Colab์—์„œ ํ•œ ๋ฒˆ์— ์‹คํ–‰ํ•˜๊ธฐ ์œ„ํ•œ ๋Ÿฐ์ฒ˜์ž…๋‹ˆ๋‹ค.\n","\n","์ˆœ์„œ๋Œ€๋กœ ์œ„์—์„œ ์•„๋ž˜๋กœ ์…€๋งŒ ์‹คํ–‰ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"]},{"cell_type":"code","execution_count":null,"id":"DBPgdiZp0aP0","metadata":{"colab":{"background_save":true},"executionInfo":{"elapsed":4,"status":"aborted","timestamp":1763530193256,"user":{"displayName":"์ดํ˜œ์ง€","userId":"11960777193249666496"},"user_tz":-540},"id":"DBPgdiZp0aP0"},"outputs":[{"name":"stderr","output_type":"stream","text":["ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n","ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n","ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n"]},{"name":"stdout","output_type":"stream","text":["Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," ^^^^^^^^^^^\n","OSError: [Errno 107] Transport 
endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in 
abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," ^^^^^^^^^^^\n","OSError: [Errno 107] Transport endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3473, in run_ast_nodes\n"," if (await self.run_code(code, result, async_=asy)):\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3575, in run_code\n"," self.showtraceback(running_compiled_code=True)\n"," File 
\"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1124, in structured_traceback\n"," formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return 
_fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in 
cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," ^^^^^^^^^^^\n","OSError: [Errno 107] Transport endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3473, in run_ast_nodes\n"," if (await self.run_code(code, result, async_=asy)):\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3575, in run_code\n"," self.showtraceback(running_compiled_code=True)\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1124, in structured_traceback\n"," 
formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3030, in _run_cell\n"," return runner(coro)\n"," ^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py\", line 78, in _pseudo_sync_runner\n"," coro.send(None)\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3257, in run_cell_async\n"," has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3492, in run_ast_nodes\n"," self.showtraceback()\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = 
self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1142, in structured_traceback\n"," formatted_exceptions += self.format_exception_as_a_whole(etype, evalue, etb, lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n"]}],"source":["%cd /content/cstore_suite"]},{"cell_type":"code","execution_count":7,"id":"setup_project","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":256},"executionInfo":{"elapsed":2520,"status":"error","timestamp":1763530504418,"user":{"displayName":"์ดํ˜œ์ง€","userId":"11960777193249666496"},"user_tz":-540},"id":"setup_project","outputId":"a33dfd95-9a46-477e-f6e3-c0121bd4a9e7"},"outputs":[{"name":"stdout","output_type":"stream","text":["โœ… Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]\n","Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", 
force_remount=True).\n"]},{"ename":"OSError","evalue":"[Errno 107] Transport endpoint is not connected","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipython-input-1340184868.py\u001b[0m in \u001b[0;36m\u003ccell line: 0\u003e\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/cstore_suite\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Colab์—์„œ ZIP ํ’€์–ด์„œ ์“ด ๊ฒฝ์šฐ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/cstore_suite\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Drive์— ํด๋”๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---\u003e 24\u001b[0;31m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# ํ˜„์žฌ ํด๋”\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mproj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mOSError\u001b[0m: [Errno 107] Transport endpoint is not connected"]}],"source":["# 1๏ธโƒฃ Colab / Jupyter ํ™˜๊ฒฝ์—์„œ cstore_suite ํ”„๋กœ์ ํŠธ ํด๋” ์ž๋™ ์„ค์ •\n","import os, sys\n","\n","print(\"โœ… Python version:\", sys.version)\n","\n","# (์„ ํƒ) Google Colab ์—์„œ ์‹คํ–‰ ์ค‘์ด๋ฉด Drive ๋งˆ์šดํŠธ ์‹œ๋„\n","IN_COLAB = False\n","try:\n"," from google.colab import 
drive # type: ignore\n"," IN_COLAB = True\n","except Exception:\n"," IN_COLAB = False\n","\n","if IN_COLAB:\n"," try:\n"," drive.mount(\"/content/drive\")\n"," except Exception as e:\n"," print(\"โš  Drive ๋งˆ์šดํŠธ ์ค‘ ๊ฒฝ๊ณ :\", e)\n","\n","# ํ”„๋กœ์ ํŠธ ์œ„์น˜ ํ›„๋ณด๋“ค\n","candidates = []\n","candidates.append(\"/content/cstore_suite\") # Colab์—์„œ ZIP ํ’€์–ด์„œ ์“ด ๊ฒฝ์šฐ\n","candidates.append(\"/content/drive/MyDrive/cstore_suite\") # Drive์— ํด๋”๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ\n","candidates.append(os.getcwd()) # ํ˜„์žฌ ํด๋”\n","\n","proj = None\n","for p in candidates:\n"," if os.path.isdir(p):\n"," proj = p\n"," break\n","\n","if proj is None:\n"," raise FileNotFoundError(\n"," \"cstore_suite ํ”„๋กœ์ ํŠธ ํด๋”๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.\\n\"\n"," \"- /content/cstore_suite ๋˜๋Š” /content/drive/MyDrive/cstore_suite ์œ„์น˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.\"\n"," )\n","\n","os.chdir(proj)\n","print(\"๐Ÿ“ ์ž‘์—… ํด๋”:\", os.getcwd())\n","print(\"๐Ÿ“‚ ํฌํ•จ ํŒŒ์ผ:\", os.listdir())\n"]},{"cell_type":"code","execution_count":null,"id":"install_requirements","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":220},"executionInfo":{"elapsed":46,"status":"error","timestamp":1763530163826,"user":{"displayName":"์ดํ˜œ์ง€","userId":"11960777193249666496"},"user_tz":-540},"id":"install_requirements","outputId":"f007b1e7-b1aa-4c08-cdc6-b65584b90950"},"outputs":[{"ename":"OSError","evalue":"[Errno 107] Transport endpoint is not connected","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipython-input-3246530550.py\u001b[0m in \u001b[0;36m\u003ccell line: 0\u003e\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubprocess\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----\u003e 4\u001b[0;31m \u001b[0mreq_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"requirements.txt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"โš  requirements.txt ๊ฐ€ ์—†์–ด ๊ธฐ๋ณธ ํ…œํ”Œ๋ฆฟ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค: {req_path}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mOSError\u001b[0m: [Errno 107] Transport endpoint is not connected"]}],"source":["# 2๏ธโƒฃ requirements.txt ๊ธฐ๋ฐ˜ ์˜์กด์„ฑ ์„ค์น˜\n","import os, subprocess, sys\n","\n","req_path = os.path.join(os.getcwd(), \"requirements.txt\")\n","if not os.path.exists(req_path):\n"," print(f\"โš  requirements.txt ๊ฐ€ ์—†์–ด ๊ธฐ๋ณธ ํ…œํ”Œ๋ฆฟ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค: {req_path}\")\n"," with open(req_path, \"w\", encoding=\"utf-8\") as f:\n"," f.write(\"pandas==2.2.2\\n\")\n"," f.write(\"numpy==1.26.4\\n\")\n"," f.write(\"streamlit==1.39.0\\n\")\n"," f.write(\"altair\u003e=5,\u003c6\\n\")\n"," f.write(\"scikit-learn\\n\")\n"," f.write(\"pyngrok\\n\")\n"," f.write(\"cloudflared\\n\")\n"," f.write(\"xgboost\\n\")\n"," f.write(\"lightgbm\\n\")\n"," f.write(\"optuna\\n\")\n","else:\n"," print(\"๐Ÿ“‘ 
requirements.txt ์œ„์น˜:\", req_path)\n","\n","print(\"๐Ÿ“ฆ ์˜์กด์„ฑ ์„ค์น˜ ์ค‘... (๋‹ค์†Œ ์‹œ๊ฐ„์ด ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค)\")\n","subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"-r\", req_path], check=True)\n","print(\"โœ… ์˜์กด์„ฑ ์„ค์น˜ ์™„๋ฃŒ\")\n"]},{"cell_type":"code","execution_count":null,"id":"auto_train","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":16,"status":"ok","timestamp":1763097971961,"user":{"displayName":"๋ฐ•์ข…ํ›ˆ","userId":"17966453500002937995"},"user_tz":-540},"id":"auto_train","outputId":"c104b1cb-1aff-4ef8-99eb-e55d8477e50b"},"outputs":[{"name":"stdout","output_type":"stream","text":["โœ… ์ด๋ฏธ ํ•™์Šต๋œ ๋ชจ๋ธ์ด ์žˆ์Šต๋‹ˆ๋‹ค: /content/drive/MyDrive/cstore_suite/models/best_model.pkl\n"]}],"source":["# 3๏ธโƒฃ ๋ชจ๋ธ ์ž๋™ ํ•™์Šต (models/best_model.pkl ์ด ์—†์„ ๋•Œ๋งŒ ์ˆ˜ํ–‰)\n","import os, sys, glob, subprocess\n","\n","proj = os.getcwd()\n","models_dir = os.path.join(proj, \"models\")\n","os.makedirs(models_dir, exist_ok=True)\n","best_model_path = os.path.join(models_dir, \"best_model.pkl\")\n","\n","if os.path.exists(best_model_path):\n"," print(\"โœ… ์ด๋ฏธ ํ•™์Šต๋œ ๋ชจ๋ธ์ด ์žˆ์Šต๋‹ˆ๋‹ค:\", best_model_path)\n","else:\n"," data_dir = os.path.join(proj, \"data\")\n"," if not os.path.isdir(data_dir):\n"," raise FileNotFoundError(f\"data ํด๋”๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค: {data_dir}\")\n","\n"," candidates = [p for p in glob.glob(os.path.join(data_dir, \"*.csv\")) if os.path.isfile(p)]\n"," if not candidates:\n"," raise FileNotFoundError(f\"data ํด๋”์—์„œ CSV ํŒŒ์ผ์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค: {data_dir}\")\n","\n"," data_path = None\n"," # ์šฐ์„ ์ˆœ์œ„: sample_sales.csv โ†’ seoul_gyeonggi_with_demand.csv โ†’ ๊ทธ ์™ธ ์ฒซ ๋ฒˆ์งธ\n"," for name in [\"sample_sales.csv\", \"seoul_gyeonggi_with_demand.csv\"]:\n"," cand = os.path.join(data_dir, name)\n"," if os.path.exists(cand):\n"," data_path = cand\n"," break\n"," if data_path is None:\n"," data_path = candidates[0]\n","\n"," 
print(\"๐Ÿ“Š ํ•™์Šต์— ์‚ฌ์šฉํ•  ๋ฐ์ดํ„ฐ:\", data_path)\n","\n"," cmd = [\n"," sys.executable,\n"," \"quick_train_runner.py\",\n"," \"--data\", data_path,\n"," \"--project\", proj,\n"," \"--valid_ratio\", \"0.2\",\n"," \"--use_optuna\",\n"," \"--optuna_trials\", \"10\",\n"," ]\n"," print(\"๐Ÿš€ ํ•™์Šต ๋ช…๋ น:\", \" \".join(cmd))\n"," subprocess.run(cmd, check=True)\n"," if os.path.exists(best_model_path):\n"," print(\"โœ… ํ•™์Šต ๋ฐ ๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ:\", best_model_path)\n"," else:\n"," print(\"โš  quick_train_runner.py ์‹คํ–‰์€ ๋๋‚ฌ์ง€๋งŒ best_model.pkl ์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.\")\n"]},{"cell_type":"code","execution_count":null,"id":"run_streamlit","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"run_streamlit","outputId":"f98365b7-2f7a-4847-f8c3-5b2ae49a7ef7"},"outputs":[{"name":"stdout","output_type":"stream","text":["NGROK_AUTHTOKEN ์„ ์ž…๋ ฅํ•˜์„ธ์š”(์—†์œผ๋ฉด ์—”ํ„ฐ): 34Ug4uB0dodqJJVE41prai7dVdp_48R3SS6GGeSabkZFP89Xw\n","๐ŸŒ ngrok ํ„ฐ๋„์„ ์—ฌ๋Š” ์ค‘...\n","๐ŸŒ Public URL: https://debra-didactic-preculturally.ngrok-free.dev\n","โžก ์œ„ URL ์„ ์ƒˆ ํƒญ์—์„œ ์—ด๋ฉด Streamlit ์•ฑ์— ์ ‘์†ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n","(์…€์„ ๋ฉˆ์ถ”๋ฉด ํ„ฐ๋„๋„ ์ข…๋ฃŒ๋ฉ๋‹ˆ๋‹ค.)\n","๐Ÿš€ Streamlit ์‹คํ–‰: /usr/bin/python3 -m streamlit run app_streamlit_pro.py --server.port 8501 --server.address 0.0.0.0\n"]}],"source":["# 4๏ธโƒฃ Streamlit ์•ฑ + ngrok ํ„ฐ๋„ ์‹คํ–‰\n","# - ์…€ ์‹คํ–‰ ํ›„ ์ถœ๋ ฅ๋˜๋Š” Public URL ๋กœ ์ ‘์†ํ•˜๋ฉด ์•ฑ ํ™”๋ฉด์ด ๋ณด์ž…๋‹ˆ๋‹ค.\n","\n","import os, sys, subprocess, time\n","\n","from pyngrok import ngrok, conf\n","\n","# ๊ธฐ์กด ํ„ฐ๋„ ์ •๋ฆฌ\n","try:\n"," ngrok.kill()\n","except Exception:\n"," pass\n","\n","port = 8501\n","\n","# ํ† ํฐ ์ฝ๊ธฐ (ํ™˜๊ฒฝ๋ณ€์ˆ˜ ์šฐ์„ , ์—†์œผ๋ฉด ์ž…๋ ฅ ๋ฐ›๊ธฐ)\n","token = os.environ.get(\"NGROK_AUTHTOKEN\", \"\").strip()\n","if not token:\n"," try:\n"," token = input(\"NGROK_AUTHTOKEN ์„ ์ž…๋ ฅํ•˜์„ธ์š”(์—†์œผ๋ฉด ์—”ํ„ฐ): \").strip()\n"," except EOFError:\n"," token = \"\"\n","\n","if 
token:\n"," conf.get_default().auth_token = token\n","else:\n"," print(\"โš  NGROK_AUTHTOKEN ์ด ๋น„์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ๋น„์ธ์ฆ ๋ชจ๋“œ๋Š” ์ œํ•œ/์—๋Ÿฌ๊ฐ€ ๋‚  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\")\n","\n","# ngrok ํ„ฐ๋„ ์˜คํ”ˆ\n","print(\"๐ŸŒ ngrok ํ„ฐ๋„์„ ์—ฌ๋Š” ์ค‘...\")\n","tunnel = ngrok.connect(addr=f\"http://localhost:{port}\", proto=\"http\")\n","public_url = tunnel.public_url\n","print(\"๐ŸŒ Public URL:\", public_url)\n","print(\"โžก ์œ„ URL ์„ ์ƒˆ ํƒญ์—์„œ ์—ด๋ฉด Streamlit ์•ฑ์— ์ ‘์†ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\")\n","print(\"(์…€์„ ๋ฉˆ์ถ”๋ฉด ํ„ฐ๋„๋„ ์ข…๋ฃŒ๋ฉ๋‹ˆ๋‹ค.)\")\n","\n","# Streamlit ์•ฑ ์‹คํ–‰ (๋กœ๊ทธ๋Š” ์•„๋ž˜์— ์ถœ๋ ฅ)\n","cmd = [\n"," sys.executable,\n"," \"-m\", \"streamlit\",\n"," \"run\", \"app_streamlit_pro.py\",\n"," \"--server.port\", str(port),\n"," \"--server.address\", \"0.0.0.0\",\n","]\n","print(\"๐Ÿš€ Streamlit ์‹คํ–‰:\", \" \".join(cmd))\n","subprocess.run(cmd, check=True)\n"]}],"metadata":{"colab":{"name":"","version":""},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.11.8"}},"nbformat":4,"nbformat_minor":5}
train_core.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ ============================================================
5
+ train_core.py — core training logic (very thoroughly commented)
6
+ ------------------------------------------------------------
7
+ This file does the following:
8
+ 1) Defines the evaluation metric functions (RMSE/MAE/MAPE)
9
+ 2) Provides a function that gathers the candidate models (get_candidates)
10
+ 3) Time-series split (separating training and validation data)
11
+ 4) A simple ensemble (SimpleEnsemble)
12
+ 5) (Optional) Hyperparameter tuning with Optuna
13
+ 6) train_and_score: train the models -> compare validation performance -> pick the best
14
+ 7) save_artifacts: save the best model and the leaderboard
15
+
16
+ Note: XGBoost/LightGBM/Optuna are skipped automatically
17
+ when they are not installed.
18
+ ============================================================
19
+ """
20
+
21
+ import os
22
+ import pickle
23
+ import numpy as np
24
+ import pandas as pd
25
+
26
+ # Use scikit-learn functions to compute the evaluation metrics
27
+ from sklearn.metrics import mean_squared_error, mean_absolute_error
28
+
29
+ # Baseline linear regression / random forest
30
+ from sklearn.linear_model import LinearRegression
31
+ from sklearn.ensemble import RandomForestRegressor
32
+
33
+ # XGBoost / LightGBM may or may not be installed. (try/except)
34
+ try:
35
+ from xgboost import XGBRegressor
36
+ except Exception:
37
+ XGBRegressor = None
38
+
39
+ try:
40
+ from lightgbm import LGBMRegressor
41
+ except Exception:
42
+ LGBMRegressor = None
43
+
44
+ # Optuna (automatic hyperparameter search) is optional as well
45
+ try:
46
+ import optuna
47
+ except Exception:
48
+ optuna = None
49
+
50
+
51
+ # ------------------------------------------------------------
52
+ # 1) Evaluation metrics: RMSE / MAE / MAPE
53
+ # ------------------------------------------------------------
54
def rmse(a, b):
    """
    RMSE (Root Mean Squared Error).

    Both arguments are copied into NumPy arrays and passed to
    ``mean_squared_error`` with ``a`` first; the square root of that value
    is returned as a plain ``float``. Smaller values are better.

    Returns ``nan`` when ``a`` is empty, without calling the metric.
    """
    actual = np.array(a)
    predicted = np.array(b)

    # Nothing to score: report NaN instead of calling the metric.
    if not len(actual):
        return float("nan")

    squared_error_mean = mean_squared_error(actual, predicted)
    return float(np.sqrt(squared_error_mean))
62
+
63
+
64
def mae(a, b):
    """
    Mean Absolute Error between actual values ``a`` and predictions ``b``.

    The average absolute difference, i.e. "how many units off on average".
    Returns NaN when ``a`` is empty.
    """
    actual = np.array(a)
    predicted = np.array(b)
    if not len(actual):
        # Nothing to score: report NaN rather than calling the metric.
        return float("nan")
    return float(mean_absolute_error(actual, predicted))
72
+
73
+
74
def mape(a, b):
    """
    Mean Absolute Percentage Error (in percent) of predictions ``b``
    against actual values ``a``.

    A result of 10.0 means "off by 10% on average". Where the actual value
    is 0 the denominator is replaced by 1 so the division stays defined.

    Both inputs are converted to float arrays first. Without this,
    unsigned-integer inputs wrap around in ``a - b`` (e.g. uint8 1 - 2 ==
    255) and produce a wildly wrong percentage.

    Returns NaN when ``a`` has no elements.
    """
    actual = np.asarray(a, dtype=float)
    predicted = np.asarray(b, dtype=float)
    if actual.size == 0:
        return float("nan")
    # Safe denominator: zeros in the actual values are replaced by 1.
    denom = np.where(actual == 0, 1.0, actual)
    return float(np.mean(np.abs((actual - predicted) / denom)) * 100.0)
85
+
86
+
87
+ # ------------------------------------------------------------
88
+ # 2) ๋ชจ๋ธ ํ›„๋ณด๋ฅผ ๋งŒ๋“ค์–ด ์ฃผ๋Š” ํ•จ์ˆ˜
89
+ # ------------------------------------------------------------
90
def get_candidates():
    """
    Build the list of candidate regressors.

    Returns a list of ``(name, estimator, fit_kwargs)`` tuples, where
    ``fit_kwargs`` is a dict of extra keyword arguments for ``fit``:

    - LinearRegression: plain linear baseline
    - RandomForest: tree ensemble
    - XGBoost / LightGBM: boosting models, included only when the optional
      package was importable (the module-level name is not None)
    """
    linear = ("LinearRegression", LinearRegression(), {})

    forest = (
        "RandomForest",
        RandomForestRegressor(
            n_estimators=300,   # number of trees
            max_depth=None,     # no depth limit
            random_state=42,
            n_jobs=-1,          # use all CPU cores
        ),
        {},
    )

    candidates = [linear, forest]

    if XGBRegressor is not None:
        xgb_settings = dict(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            tree_method="hist",  # histogram-based split finding
            n_jobs=-1,
        )
        # "verbose" is forwarded to fit(), not to the constructor.
        candidates.append(("XGBoost", XGBRegressor(**xgb_settings), {"verbose": False}))

    if LGBMRegressor is not None:
        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when subsample_freq > 0 -- confirm whether subsample=0.9 is
        # meant to be active here.
        lgbm_settings = dict(
            n_estimators=600,
            max_depth=-1,        # -1 = no explicit depth limit
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
        )
        candidates.append(("LightGBM", LGBMRegressor(**lgbm_settings), {}))

    return candidates
140
+
141
+
142
+ # ------------------------------------------------------------
143
+ # 3) ์‹œ๊ณ„์—ด ๋ถ„ํ• : ์•ž๋ถ€๋ถ„(ํ•™์Šต) / ๋’ท๋ถ€๋ถ„(๊ฒ€์ฆ)
144
+ # ------------------------------------------------------------
145
def time_split(X, y, valid_ratio=0.2):
    """
    Split sequentially ordered data into a leading training part and a
    trailing validation part (no shuffling, so time order is preserved).

    Parameters
    ----------
    X, y : sliceable sequences of equal length (lists, arrays, ...)
    valid_ratio : fraction of samples used for validation (0.2 = last 20%).

    Returns
    -------
    (X_train, y_train, X_valid, y_valid)

    The validation part always gets at least one sample when data exists.
    When there are two or more samples the validation size is also capped
    at ``n - 1`` so the training part is never empty. Without the cap, a
    ``valid_ratio >= 1`` makes the split index negative and the slices
    silently overlap or mis-partition the data.
    """
    n = len(X)
    n_valid = max(1, int(n * valid_ratio))  # at least one validation sample
    if n >= 2:
        n_valid = min(n_valid, n - 1)       # keep at least one training sample
    else:
        n_valid = n                         # 0 or 1 sample: nothing left to train on
    split = n - n_valid
    return (X[:split], y[:split], X[split:], y[split:])
156
+
157
+
158
+ # ------------------------------------------------------------
159
+ # 4) ๊ฐ„๋‹จํ•œ ์•™์ƒ๋ธ”: ์—ฌ๋Ÿฌ ๋ชจ๋ธ ์˜ˆ์ธก์„ '๊ฐ€์ค‘ ํ‰๊ท '
160
+ # ------------------------------------------------------------
161
class SimpleEnsemble:
    """
    Weighted-average ensemble over already fitted models.

    - ``models``: objects exposing ``predict(X)``
    - ``weights``: one non-negative weight per model; a larger weight means
      the model is trusted more (the caller in this file passes the inverse
      of each model's validation RMSE)

    Weights are normalised to sum to 1. If their sum is not a positive
    finite number, the ensemble falls back to a uniform average instead of
    producing meaningless, huge weights.
    """

    def __init__(self, models, weights):
        self.models = models
        raw = np.asarray(weights, dtype=float).ravel()
        total = float(raw.sum()) if raw.size else 0.0
        if raw.size and (not np.isfinite(total) or total <= 0.0):
            # Degenerate weights: use a plain mean of the models.
            self.weights = np.full(raw.shape, 1.0 / raw.size)
        elif raw.size:
            self.weights = raw / total
        else:
            self.weights = raw

    def predict(self, X):
        """Return the weighted average of every model's prediction, shape (n_samples,)."""
        # Flatten each prediction so column-shaped (n, 1) outputs combine
        # correctly with flat (n,) outputs.
        columns = [np.asarray(m.predict(X), dtype=float).ravel() for m in self.models]
        stacked = np.column_stack(columns)  # (n_samples, n_models)
        if stacked.shape[1] != self.weights.shape[0]:
            raise ValueError(
                "number of models (%d) does not match number of weights (%d)"
                % (stacked.shape[1], self.weights.shape[0])
            )
        return stacked @ self.weights  # (n_samples, n_models) . (n_models,) -> (n_samples,)
176
+
177
+
178
+ # ------------------------------------------------------------
179
+ # 5) Optuna ๋กœ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹(์„ ํƒ)
180
+ # ------------------------------------------------------------
181
def _optuna_model_class(name):
    """Return the estimator class tunable under ``name``; None if unknown or its package is missing."""
    available = {
        "RandomForest": RandomForestRegressor,
        "XGBoost": XGBRegressor,     # None when xgboost is not installed
        "LightGBM": LGBMRegressor,   # None when lightgbm is not installed
    }
    return available.get(name)


def _optuna_suggest_params(name, trial):
    """Sample one hyperparameter set for ``name`` from ``trial`` (keys are constructor kwargs)."""
    if name == "RandomForest":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 200, 800, step=100),
            "max_depth": trial.suggest_int("max_depth", 6, 24, step=2),
        }
    if name == "XGBoost":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 300, 900, step=100),
            "max_depth": trial.suggest_int("max_depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        }
    # LightGBM
    return {
        "n_estimators": trial.suggest_int("n_estimators", 400, 1400, step=200),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255, step=16),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
    }


def _optuna_build_model(name, params):
    """Instantiate the estimator for ``name`` from tuned ``params`` plus the fixed settings."""
    fixed = {"random_state": 42, "n_jobs": -1}
    if name == "XGBoost":
        fixed["tree_method"] = "hist"
    return _optuna_model_class(name)(**params, **fixed)


def _tune_with_optuna(name, base_model, X_tr, y_tr, X_va, y_va, n_trials=20):
    """
    Search hyperparameters for one model with Optuna, minimising validation RMSE.

    Parameters
    ----------
    name : model name; "RandomForest", "XGBoost" and "LightGBM" are supported
    base_model : unused (a fresh estimator is always built); kept so the
        signature stays stable for callers
    X_tr, y_tr : training set
    X_va, y_va : validation set used as the tuning objective
    n_trials : number of Optuna trials (more is more thorough but slower)

    Returns
    -------
    An estimator built from the best parameters and fitted on
    ``(X_tr, y_tr)``, or None when Optuna is not installed, the model name
    is not supported, or the model's package is not installed.

    Unsupported names return immediately rather than running ``n_trials``
    dummy trials first.
    """
    if optuna is None or _optuna_model_class(name) is None:
        return None

    def objective(trial):
        # Fit one sampled configuration and score it on the validation set.
        candidate = _optuna_build_model(name, _optuna_suggest_params(name, trial))
        candidate.fit(X_tr, y_tr)
        return rmse(y_va, candidate.predict(X_va))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # Rebuild with the best parameters and fit on the training set.
    best_model = _optuna_build_model(name, study.best_params)
    best_model.fit(X_tr, y_tr)
    return best_model
294
+
295
+
296
+ # ------------------------------------------------------------
297
+ # 6) ํ•™์Šต & ์„ฑ๋Šฅ ๋น„๊ต โ†’ ๋ฒ ์ŠคํŠธ ๋ชจ๋ธ ์„ ํƒ
298
+ # ------------------------------------------------------------
299
def train_and_score(X, y, valid_ratio=0.2, use_optuna=False, optuna_trials=15, build_ensemble=True):
    """
    Fit every candidate model, compare validation RMSE/MAE/MAPE and return
    the best one.

    Parameters
    ----------
    X, y : training data, already in time order
    valid_ratio : fraction of trailing samples held out for validation
    use_optuna : if True, try Optuna tuning for each model first
    optuna_trials : number of Optuna trials per model
    build_ensemble : if True, also evaluate a weighted-average ensemble of
        all successfully fitted models

    Returns
    -------
    (best_model, leaderboard)
        ``best_model`` is the estimator (or SimpleEnsemble) with the lowest
        validation RMSE, or None if every candidate failed.
        ``leaderboard`` is a DataFrame with columns model/rmse/mae/mape,
        sorted by ascending rmse with failed models (NaN) last.

    Notes
    -----
    - A failing model does not abort the run. It is recorded with NaN
      metrics and a RuntimeWarning names the model and the error.
    - With ``use_optuna=True`` the same validation split is used both for
      tuning and for the leaderboard, so tuned scores are optimistic.
    """
    import warnings  # local import so this block needs no file-level change

    # Sequential split: leading part for training, trailing part for validation.
    X_tr, y_tr, X_va, y_va = time_split(X, y, valid_ratio=valid_ratio)

    rows = []                            # one leaderboard row per model
    best = (None, None, float("inf"))    # (name, model, lowest RMSE so far)
    fitted = []                          # (name, model, validation RMSE)

    for name, mdl, fit_params in get_candidates():
        try:
            tuned = None
            if use_optuna:
                tuned = _tune_with_optuna(name, mdl, X_tr, y_tr, X_va, y_va, n_trials=optuna_trials)

            if tuned is not None:
                # The tuner returns a model already fitted on (X_tr, y_tr);
                # fitting it again would only repeat the same work.
                mdl = tuned
            else:
                mdl.fit(X_tr, y_tr, **fit_params)

            pred = mdl.predict(X_va)
            row = {
                "model": name,
                "rmse": rmse(y_va, pred),
                "mae": mae(y_va, pred),
                "mape": mape(y_va, pred),
            }
        except Exception as exc:
            # Best effort: keep going, but make the failure visible.
            warnings.warn(
                f"train_and_score: model {name!r} failed and was skipped: {exc!r}",
                RuntimeWarning,
            )
            rows.append({"model": name, "rmse": np.nan, "mae": np.nan, "mape": np.nan})
            continue

        rows.append(row)
        fitted.append((name, mdl, row["rmse"]))

        # Strictly smaller RMSE replaces the current best (NaN never wins).
        if row["rmse"] < best[2]:
            best = (name, mdl, row["rmse"])

    # ---- optional weighted-average ensemble ----
    # Only models with a finite RMSE take part; a NaN score would otherwise
    # turn every ensemble weight into NaN.
    members = [(m, score) for _, m, score in fitted if np.isfinite(score)]
    if build_ensemble and len(members) >= 2:
        # Inverse-RMSE weights: better models weigh more. The floor avoids
        # division by zero for a perfect fit.
        weights = [1.0 / max(score, 1e-6) for _, score in members]

        ens = SimpleEnsemble([m for m, _ in members], weights)
        ens_pred = ens.predict(X_va)

        row = {
            "model": "Ensemble",
            "rmse": rmse(y_va, ens_pred),
            "mae": mae(y_va, ens_pred),
            "mape": mape(y_va, ens_pred),
        }
        rows.append(row)

        if row["rmse"] < best[2]:
            best = ("Ensemble", ens, row["rmse"])

    # Explicit columns keep the sort valid even if no row was produced.
    lb = (
        pd.DataFrame(rows, columns=["model", "rmse", "mae", "mape"])
        .sort_values("rmse", na_position="last")
        .reset_index(drop=True)
    )

    return best[1], lb
386
+
387
+
388
+ # ------------------------------------------------------------
389
+ # 7) ์‚ฐ์ถœ๋ฌผ ์ €์žฅ(๋ฒ ์ŠคํŠธ ๋ชจ๋ธ/ํ”ผ์ฒ˜๋ช…/๋งคํ•‘/๋ฆฌ๋”๋ณด๋“œ)
390
+ # ------------------------------------------------------------
391
def save_artifacts(out_dirs, best_model, feature_names, mapping, leaderboard_df):
    """
    Write the training results to disk.

    Parameters
    ----------
    out_dirs : directory path, or iterable of directory paths
        (e.g. ['artifacts', 'models']); the same files are written into
        every directory. Missing directories are created.
    best_model : model (or ensemble) selected by train_and_score
    feature_names : list of model input column names
    mapping : column-mapping dict needed to reproduce/predict later
    leaderboard_df : score table (DataFrame)

    Files created in each directory
    -------------------------------
    - best_model.pkl      : pickle of {"model", "feature_names", "mapping"}
    - leaderboard.csv     : score table, UTF-8 with BOM (opens cleanly in Excel)
    - leaderboard.parquet : score table, written only if a parquet engine works

    NOTE: pickle files must only be loaded from trusted locations;
    unpickling can execute arbitrary code.
    """
    payload = {
        "model": best_model,
        "feature_names": feature_names,
        "mapping": mapping,
    }

    # A bare string would otherwise be iterated character by character.
    if isinstance(out_dirs, (str, os.PathLike)):
        out_dirs = [out_dirs]

    for d in out_dirs:
        os.makedirs(d, exist_ok=True)

        # 1) best-model bundle (opened once; the former extra open/truncate
        #    with an empty body was redundant)
        with open(os.path.join(d, "best_model.pkl"), "wb") as f:
            pickle.dump(payload, f)

        # 2) leaderboard as CSV
        leaderboard_df.to_csv(
            os.path.join(d, "leaderboard.csv"),
            index=False,
            encoding="utf-8-sig",
        )

        # 3) leaderboard as Parquet (optional)
        try:
            leaderboard_df.to_parquet(
                os.path.join(d, "leaderboard.parquet"),
                index=False,
            )
        except Exception:
            # Deliberately best-effort: a parquet engine such as pyarrow may
            # not be installed, and the CSV above already holds the data.
            pass
utils_io.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
"""
utils_io.py - I/O and automatic column-mapping utilities (with detailed comments)

This file provides the following helpers:
1) read_csv_flexible: read a CSV "safely" by trying several candidate encodings
2) save_utf8sig     : save a CSV as UTF-8-SIG (Excel compatible)
3) ensure_dirs      : create folders when they do not exist
4) auto_map_columns : guess the date/target/region/brand/item columns automatically

Note: auto_map_columns() below replaces the original code's locals()-based
      conflict resolution with a safe dictionary-based one. (In Python,
      modifying locals() is not guaranteed to take effect in function scope.)
"""
16
+
17
+ import os
18
+ import re
19
+ import glob
20
+ import pandas as pd
21
+ from typing import Optional, Dict, List, Union, IO
22
+
23
+ # 1) CSV ์ฝ๊ธฐ ์‹œ๋„ํ•  ์ธ์ฝ”๋”ฉ ํ›„๋ณด๋“ค
24
+ # - utf-8-sig: ์—‘์…€์—์„œ ์ž˜ ์—ด๋ฆฌ๋Š” UTF-8 with BOM
25
+ # - utf-8 : ๋ฒ”์šฉ
26
+ # - cp949/euc-kr: ์œˆ๋„์šฐ/๊ตญ๋‚ด ํ™˜๊ฒฝ์—์„œ ์ž์ฃผ ์“ฐ๋Š” ํ•œ๊ธ€ ์ธ์ฝ”๋”ฉ
27
+ # - latin1 : ๋งˆ์ง€๋ง‰ ์•ˆ์ „๋ง(์†์‹ค ์—†์ด ์ฝํžˆ๋‚˜ ๊ธ€์ž๊ฐ€ ๊นจ์งˆ ์ˆ˜ ์žˆ์Œ)
28
+ ENCODINGS: List[str] = ["utf-8-sig", "utf-8", "cp949", "euc-kr", "latin1"]
29
+
30
+
31
def read_csv_flexible(
    path_or_buf: Union[str, os.PathLike, IO[bytes], IO[str]],
    encodings: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Read a CSV by trying several text encodings in order.

    The first encoding that decodes the data wins. Only decoding failures
    (``UnicodeError``) trigger a retry with the next encoding. Any other
    error (missing file, malformed CSV, empty data, ...) is unrelated to
    the encoding and is raised immediately instead of being retried.

    Parameters
    ----------
    path_or_buf : str, path-like or file object
        CSV path or a file object/buffer (e.g. BytesIO or an uploaded file).
        Seekable buffers are rewound to the start before every attempt.
    encodings : list of str, optional
        Encodings to try, in order. Defaults to the module-level ENCODINGS.

    Returns
    -------
    pd.DataFrame
        The parsed data.

    Raises
    ------
    UnicodeError
        The last decoding error, when no encoding could decode the data.
    """
    candidates = ENCODINGS if encodings is None else list(encodings)

    last_error: Optional[UnicodeError] = None
    for enc in candidates:
        # File objects must be re-read from the beginning on every attempt.
        if hasattr(path_or_buf, "seek"):
            try:
                path_or_buf.seek(0)
            except Exception:
                # Non-seekable stream: nothing to rewind, just try to read.
                pass
        try:
            return pd.read_csv(path_or_buf, encoding=enc)
        except UnicodeError as exc:
            # Wrong encoding: remember the error and try the next one.
            last_error = exc

    if last_error is not None:
        raise last_error

    # Only reachable with an empty encoding list: use pandas' default.
    return pd.read_csv(path_or_buf)
67
+
68
+
69
def save_utf8sig(df: pd.DataFrame, path: str) -> None:
    """
    Save a DataFrame as CSV encoded in UTF-8 with BOM (UTF-8-SIG).

    The BOM lets Excel detect the encoding, so non-ASCII text is displayed
    correctly. The parent directory is created when it does not exist.

    Parameters
    ----------
    df : pd.DataFrame
        Data to save (the index is not written).
    path : str
        Target file path, including the file name. A bare file name such as
        "out.csv" is written to the current directory.
    """
    parent = os.path.dirname(path)
    # dirname() is "" for a bare file name, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8-sig")
84
+
85
+
86
def ensure_dirs(*dirs: str) -> None:
    """
    Create every given directory that does not exist yet.

    Any number of paths may be passed; existing directories are left alone.

    Example
    -------
    ensure_dirs("data", "artifacts", "models")
    """
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)
97
+
98
+
99
+ # --- Column auto-mapping helpers --------------------------------------------
100
+ # ํ•œ๊ตญ์–ด/์˜์–ด๋กœ ์ž์ฃผ ์“ฐ์ด๋Š” ์—ด ์ด๋ฆ„ ํ›„๋ณด ๋ฆฌ์ŠคํŠธ
101
+ _CAND_DATE = ["date", "์ผ์ž", "๋‚ ์งœ", "dt", "๊ธฐ์ค€์ผ"]
102
+ _CAND_TARGET = ["qty", "sales_qty", "sales", "ํŒ๋งค์ˆ˜๋Ÿ‰", "์ˆ˜๋Ÿ‰", "demand", "target", "y"]
103
+ _CAND_REGION = ["region", "์ง€์ ", "์ ํฌ", "๋งค์žฅ", "์ง€์—ญ", "์‹œ๋„", "๊ด‘์—ญ", "๊ตฌ๋ถ„"]
104
+ _CAND_BRAND = ["brand", "๋ธŒ๋žœ๋“œ", "ํšŒ์‚ฌ", "์ œ์กฐ์‚ฌ"]
105
+ _CAND_ITEM = ["item", "์ƒํ’ˆ", "ํ’ˆ๋ชฉ", "sku", "์ƒํ’ˆ๋ช…", "์ œํ’ˆ๋ช…"]
106
+
107
+
108
+ def _guess_col(cols: List[str], candidates: List[str]) -> Optional[str]:
109
+ """
110
+ ์ปฌ๋Ÿผ ์ด๋ฆ„ ๋ชฉ๋ก(cols)์—์„œ ํ›„๋ณด(candidates)์™€ '๊ฐ€์žฅ ์ž˜ ๋งž๋Š”' ์ปฌ๋Ÿผ์„ ์ถ”์ •ํ•ฉ๋‹ˆ๋‹ค.
111
+ 1) ์ „๋ถ€ ์†Œ๋ฌธ์ž๋กœ ๋ฐ”๊พผ ๋’ค '์ •ํ™•ํžˆ ๊ฐ™์€ ์ด๋ฆ„' ์šฐ์„  ๋งค์นญ
112
+ 2) ์—†์œผ๋ฉด 'ํฌํ•จ(contains)' ๋งค์นญ์œผ๋กœ ์™„ํ™” ํƒ์ƒ‰
113
+
114
+ Parameters
115
+ ----------
116
+ cols : List[str]
117
+ ์‹ค์ œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ์ปฌ๋Ÿผ๋ช… ๋ฆฌ์ŠคํŠธ
118
+ candidates : List[str]
119
+ ์šฐ๋ฆฌ๊ฐ€ ์ฐพ๊ณ  ์‹ถ์€ ์˜๋ฏธ์˜ ํ›„๋ณด๋ช…๋“ค
120
+
121
+ Returns
122
+ -------
123
+ Optional[str]
124
+ ๋งค์นญ๋œ ์ปฌ๋Ÿผ๋ช…(์—†์œผ๋ฉด None)
125
+ """
126
+ lower = {c.lower(): c for c in cols} # ์†Œ๋ฌธ์ž โ†’ ์›๋ž˜ ์ปฌ๋Ÿผ๋ช… ๋งคํ•‘
127
+
128
+ # (1) ์ •ํ™• ์ผ์น˜ ์šฐ์„ 
129
+ for c in candidates:
130
+ if c in lower:
131
+ return lower[c]
132
+
133
+ # (2) ๋ถ€๋ถ„ ํฌํ•จ(์™„ํ™” ๋งค์นญ)
134
+ for c in candidates:
135
+ for col in cols:
136
+ if c in col.lower():
137
+ return col
138
+
139
+ return None
140
+
141
+
142
def auto_map_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """
    Guess which columns hold the date, target, region, brand and item.

    Each role is first guessed independently by exact/partial name match.
    If the same column ends up in more than one role, the date and target
    picks are kept as they are. Each lower-priority role (region, brand,
    item, in that order) whose column is already taken is moved to the
    first column that no role is using, or set to None when none is left.

    A replacement never takes a column that another role picked in the
    first pass; otherwise resolving one conflict would create a new one
    for a role that was mapped correctly.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.

    Returns
    -------
    Dict[str, Optional[str]]
        {'date': ..., 'target': ..., 'region': ..., 'brand': ..., 'item': ...}
        Any value may be None.
    """
    cols = list(df.columns)

    # 1) first-pass guess, one role at a time
    picks: Dict[str, Optional[str]] = {
        "date": _guess_col(cols, _CAND_DATE),
        "target": _guess_col(cols, _CAND_TARGET),
        "region": _guess_col(cols, _CAND_REGION),
        "brand": _guess_col(cols, _CAND_BRAND),
        "item": _guess_col(cols, _CAND_ITEM),
    }

    # 2) nothing to resolve when every picked column is unique
    chosen = [col for col in picks.values() if col is not None]
    if len(set(chosen)) == len(chosen):
        return picks

    # Columns already claimed by a higher-priority role (date/target first).
    used = {picks[key] for key in ("date", "target") if picks[key] is not None}
    # Columns any role picked in the first pass; never handed out as a replacement.
    reserved = set(chosen)

    for key in ("region", "brand", "item"):
        current = picks[key]
        if current is None:
            continue
        if current not in used:
            used.add(current)
            continue

        # Conflict: move this role to the first completely free column.
        replacement = next((col for col in cols if col not in reserved), None)
        picks[key] = replacement  # None when no free column is left
        if replacement is not None:
            used.add(replacement)
            reserved.add(replacement)

    return picks