Upload 7 files
Browse files- app.py +964 -0
- preprocess.py +188 -0
- quick_train_runner.py +116 -0
- requirements.txt +10 -3
- starter.ipynb +1 -0
- train_core.py +438 -0
- utils_io.py +203 -0
app.py
ADDED
|
@@ -0,0 +1,964 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
# ============================================================
|
| 5 |
+
# ํธ์์ ์์์์ธก & ๋ฐ์ฃผ ์ถ์ฒ โ Pro Suite (ํจ์น ๋ฒ์ , ๋ฉํฐ CSV + ์๋ณ ๊ทธ๋ํ)
|
| 6 |
+
# - โ ์ฌ๋ฌ CSV ์
๋ก๋/์ ํ โ ์๋ ๊ฒฐํฉ(์ต์
: source ์ด ์ถ๊ฐ)
|
| 7 |
+
# - โก ์ปฌ๋ผ ๋งคํ: "์ปฌ๋ผ๋ช
"์ด ์๋๋ผ "์์ ๊ฐ" ๊ธฐ๋ฐ ์ ํ
|
| 8 |
+
# - โข ์์ธกยท๋ฐ์ฃผ: ์ฌ๊ณ ์ปฌ๋ผ ์๋ ์ธ์ โ ์์ธก ๊ธฐ๊ฐ/๋ฐ์ฃผ๋ ์๋ ๊ณ์ฐ
|
| 9 |
+
# ยท ๋ฆฌ๋ํ์ / ์๋น์ค๋ ๋ฒจ / ์์ ์ฌ๊ณ / MOQ / ํฉ๋จ์ ์
๋ ฅ ์ ๊ฑฐ
|
| 10 |
+
# - โฃ ๋ถ์(๊ทธ๋ํ):
|
| 11 |
+
# ยท ์ฐ์ฐ: ์๋ณ ๊ฐ์๋ โ ์ฐ์ฐ ํ๋งค๋ (์ฐ์ ๋ + ํ๊ท์ + ์ผ๋ณ ์ ํ ๊ทธ๋ํ)
|
| 12 |
+
# ยท ๊ตฐ๊ณ ๊ตฌ๋ง: ์๋ณ ๊ธฐ์จ โ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋ (์ฐ์ ๋ + ํ๊ท์ + ์ผ๋ณ ์ ํ ๊ทธ๋ํ)
|
| 13 |
+
# ยท ์ ์ฒด: ์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง ์ ์ธ ์ ์ฒด ์ํ ์ผ๋ณ ํ๋งค๋ ์ ํ ๊ทธ๋ํ
|
| 14 |
+
# - ์ฌ์ด๋๋ฐ: ์คํ ํ์ผ ํ์ + ์บ์ ์ด๊ธฐํ
|
| 15 |
+
# ============================================================
|
| 16 |
+
|
| 17 |
+
import os, io, pickle, time, subprocess, sys
|
| 18 |
+
from datetime import timedelta
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import pandas as pd
|
| 22 |
+
import numpy as np
|
| 23 |
+
import streamlit as st
|
| 24 |
+
import altair as alt
|
| 25 |
+
|
| 26 |
+
from utils_io import read_csv_flexible, save_utf8sig, ensure_dirs, auto_map_columns
|
| 27 |
+
from preprocess import make_matrix
|
| 28 |
+
from train_core import train_and_score, save_artifacts
|
| 29 |
+
|
| 30 |
+
# Altair ๋์ฉ๋ ๋ ๋๋ง ์์ ์ฅ์น (ํ ์ ์ ํ ํด์ )
|
| 31 |
+
# Altair safety valve for large datasets: lift the default max-row limit so
# daily-level charts render without silently truncating rows.
alt.data_transformers.disable_max_rows()

# ------------------------------------------------------------
# Page / sidebar configuration
# ------------------------------------------------------------
# Must be the first Streamlit call in the script.
st.set_page_config(page_title="ํธ์์  ์์์์ธก & ๋ฐ์ฃผ ์ถ์ฒ โ Pro Suite (ํจ์น)", layout="wide")
|
| 37 |
+
|
| 38 |
+
# __file__ ์ด ์๋ Colab ๊ฐ์ ํ๊ฒฝ ๋ฐฉ์ด์ฉ
|
| 39 |
+
# `__file__` is undefined in notebook-like environments (e.g. Colab),
# so fall back to a fixed name; this value is used for display only.
try:
    script_name = Path(__file__).resolve().name
except NameError:
    script_name = "app_streamlit_pro.py"

st.sidebar.write("๐งญ ์คํ ํ์ผ:", script_name)
if st.sidebar.button("์บ์ ์ด๊ธฐํ ํ ๋ค์ ์คํ"):
    # Clear both Streamlit cache layers. Each clear is best-effort because
    # older Streamlit versions may not expose one of the APIs.
    for cache_attr in ("cache_data", "cache_resource"):
        try:
            getattr(st, cache_attr).clear()
        except Exception:
            pass
    # FIX: st.experimental_rerun() was deprecated and removed in
    # Streamlit >= 1.27/1.37; use st.rerun() and fall back for old versions.
    try:
        st.rerun()
    except AttributeError:
        st.experimental_rerun()
|
| 55 |
+
|
| 56 |
+
# ------------------------------------------------------------
|
| 57 |
+
# ๊ธฐ๋ณธ ํ๊ฒฝ/๊ฒฝ๋ก ์ค์
|
| 58 |
+
# ------------------------------------------------------------
|
| 59 |
+
# ------------------------------------------------------------
# Base environment / path configuration
# ------------------------------------------------------------
PROJ = os.getcwd()                          # current working directory (app root)
DATA_DIR = os.path.join(PROJ, "data")       # CSV data folder
ARTI_DIR = os.path.join(PROJ, "artifacts")  # training artifacts (logs/metrics)
MODELS_DIR = os.path.join(PROJ, "models")   # trained model pickles
ensure_dirs(DATA_DIR, ARTI_DIR, MODELS_DIR) # create the folders if missing
|
| 64 |
+
|
| 65 |
+
# ------------------------------------------------------------
|
| 66 |
+
# ์ ํธ: data ํด๋์ CSV ํ์ผ ๋ฆฌ์คํธ ์บ์
|
| 67 |
+
# ------------------------------------------------------------
|
| 68 |
+
@st.cache_data(show_spinner=False)
def list_data_files():
    """Return the names of all CSV files inside DATA_DIR (result is cached)."""
    try:
        entries = os.listdir(DATA_DIR)
    except FileNotFoundError:
        # Folder may not exist yet on a fresh checkout.
        return []
    return [name for name in entries if name.lower().endswith(".csv")]
|
| 74 |
+
|
| 75 |
+
# ------------------------------------------------------------
|
| 76 |
+
# ํผ๋ธ๋ฆญ URL: cloudflared ์์ ํจ์
|
| 77 |
+
# ------------------------------------------------------------
|
| 78 |
+
def start_cloudflared(port=8501):
    """Launch a cloudflared quick tunnel to the local port and surface its public URL."""
    command = ["cloudflared", "tunnel", "--url", f"http://localhost:{port}"]
    try:
        tunnel_proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        )
        # Keep a handle in session state so the tunnel can be terminated later.
        st.session_state["_cfd_proc"] = tunnel_proc
        with st.expander("cloudflared logs"):
            # Scan at most the first 120 log lines looking for the public URL.
            remaining = 120
            while remaining > 0:
                remaining -= 1
                log_line = tunnel_proc.stdout.readline()
                if not log_line:
                    break
                st.text(log_line.strip())
                if "trycloudflare.com" in log_line:
                    st.success(log_line.strip())  # log line containing the public URL
                    break
    except FileNotFoundError:
        st.error("cloudflared ๋ฐ์ด๋๋ฆฌ๊ฐ ์์ต๋๋ค. `pip install cloudflared` ๋๋ ๋ฐ์ด๋๋ฆฌ ์ค์น ํ ๋ค์ ์๋ํ์ธ์.")
|
| 96 |
+
|
| 97 |
+
# ------------------------------------------------------------
|
| 98 |
+
# ํผ๋ธ๋ฆญ URL: ngrok ์์ ํจ์
|
| 99 |
+
# ------------------------------------------------------------
|
| 100 |
+
def start_ngrok(port=8501, token: str | None = None):
    """Open an ngrok HTTP tunnel to the local Streamlit port.

    Uses `token` or the NGROK_AUTHTOKEN environment variable for auth.
    Retries the connection once before reporting a classified error.
    """
    try:
        from pyngrok import ngrok, conf
    except Exception:
        st.error("pyngrok๊ฐ ์ค์น๋์ด ์์ง ์์ต๋๋ค. `pip install pyngrok` ํ ๋ค์ ์๋ํ์ธ์.")
        return

    # Kill any existing ngrok session to avoid collisions on re-run.
    try:
        ngrok.kill()
        time.sleep(1.0)
    except Exception:
        pass

    token = (token or os.environ.get("NGROK_AUTHTOKEN", "")).strip()
    if token:
        conf.get_default().auth_token = token
    else:
        st.warning("NGROK_AUTHTOKEN์ด ๋น์ด ์์ต๋๋ค. ์ธ์ฆ ์์ด ์ด๋ฉด ์ ํ/์๋ฌ(4018) ๊ฐ๋ฅ.")

    # Two attempts: the first failure sleeps and retries, the second reports.
    for attempt in range(2):
        try:
            tunnel = ngrok.connect(addr=f"http://localhost:{port}", proto="http")
            url = tunnel.public_url
            st.session_state["_ngrok_tunnel"] = tunnel  # handle for later teardown
            st.success(f"๐ Public URL: {url}")
            st.caption("๋ฐํ์/ํ๋ก์ธ์ค๋ฅผ ์ข๋ฃํ๋ฉด ํฐ๋๋ ๋ซํ๋๋ค.")
            break
        except Exception as e:
            if attempt == 0:
                time.sleep(1.5)
            else:
                # Classify the failure by known ngrok error codes/messages.
                msg = str(e)
                if "4018" in msg:
                    st.error("ngrok ์ธ์ฆ ์คํจ(4018). ํ ํฐ์ ๋ค์ ํ์ธํ์ธ์.")
                elif "already online" in msg or "334" in msg:
                    st.error("๋์ผ ์๋ํฌ์ธํธ๊ฐ ์ด๋ฏธ ์ด๋ ค ์์ต๋๋ค. ์ธ์์ฌ์์ ๋๋ ๊ธฐ์กด ํฐ๋ ์ข๋ฃ ํ ์ฌ์๋.")
                else:
                    st.error(f"ngrok ์ฐ๊ฒฐ ์คํจ: {e}")
|
| 139 |
+
|
| 140 |
+
# ------------------------------------------------------------
|
| 141 |
+
# ์ฑ ํ์ดํ/ํญ ๊ตฌ์ฑ
|
| 142 |
+
# ------------------------------------------------------------
|
| 143 |
+
# ------------------------------------------------------------
# App title / tab layout
# ------------------------------------------------------------
st.title("ํธ์์  ์์์์ธก & ๋ฐ์ฃผ ์ถ์ฒ โ Pro Suite")
tabs = st.tabs(["โ  ๋ฐ์ดํฐ", "โก ํ์ต/๋ชจ๋ธ", "โข ์์ธกยท๋ฐ์ฃผ", "โฃ ๋ถ์(๊ทธ๋ํ)", "โค ์ง๋จ/๋ก๊ทธ"])
|
| 145 |
+
|
| 146 |
+
# ============================================================
|
| 147 |
+
# โ ๋ฐ์ดํฐ: CSV ์
๋ก๋/์ ํ + ์๋ ์ปฌ๋ผ ๋งคํ ์ ์ฅ (๋ฉํฐ CSV ์ง์)
|
| 148 |
+
# ============================================================
|
| 149 |
+
# ============================================================
# Tab 1 (data): CSV upload/selection + automatic column mapping (multi-CSV)
# ============================================================
with tabs[0]:
    st.subheader("CSV ์๋ก๋ ๋๋ ์ ํ")
    cols_top = st.columns([2,1])
    with cols_top[0]:
        add_source = st.checkbox("ํ์ผ๋ช(source) ์ด ์ถ๊ฐ", value=True, help="์ฌ๋ฌ CSV๋ฅผ ํฉ์น  ๋ ์๋ณธ ํ์ผ๋ช์ ๋จ๊น๋๋ค.")
    with cols_top[1]:
        st.caption("โป ์๋ก๋/์ ํ ํ ์๋์์ ์ปฌ๋ผ ๋งคํ ์ ์ฅ")

    cols = st.columns(2)

    # --- multi-file upload ---
    with cols[0]:
        up_multi = st.file_uploader("CSV ํ์ผ ์๋ก๋(์ฌ๋ฌ ๊ฐ ๊ฐ๋ฅ)", type=["csv"], accept_multiple_files=True, key="multi_up")
        if up_multi:
            dfs = []
            for f in up_multi:
                raw = f.read()
                df_i = read_csv_flexible(io.BytesIO(raw))
                if add_source:
                    # Tag each row with its origin file so merged data stays traceable.
                    df_i["source"] = f.name
                dfs.append(df_i)
                # Persist a copy under data/ so the file can be re-selected later.
                save_path = os.path.join(DATA_DIR, f.name)
                try:
                    with open(save_path, "wb") as fp:
                        fp.write(raw)
                except Exception as e:
                    st.warning(f"ํ์ผ ์ ์ฅ ๊ฒฝ๊ณ ({f.name}): {e}")
            # Invalidate the cached folder listing so new files show up.
            try:
                list_data_files.clear()
            except Exception:
                pass
            # sort=True aligns/orders columns across files with differing schemas.
            df = pd.concat(dfs, axis=0, ignore_index=True, sort=True)
            st.session_state["df"] = df
            st.success(f"์๋ก๋/๊ฒฐํฉ ์๋ฃ: {df.shape} (ํ์ผ {len(dfs)}๊ฐ)")
            st.dataframe(df.head(20), use_container_width=True)

    # --- multi-select from the data folder ---
    with cols[1]:
        files = list_data_files()
        picks = st.multiselect("data ํด๋์์ ์ ํ(์ฌ๋ฌ ๊ฐ)", files)
        if st.button("์ ํ ํ์ผ ๋ถ๋ฌ์ค๊ธฐ", disabled=(len(picks)==0)):
            dfs = []
            for name in picks:
                path = os.path.join(DATA_DIR, name)
                df_i = read_csv_flexible(path)
                if add_source:
                    df_i["source"] = name
                dfs.append(df_i)
            df = pd.concat(dfs, axis=0, ignore_index=True, sort=True)
            st.session_state["df"] = df
            st.success(f"๋ถ๋ฌ์ค๊ธฐ/๊ฒฐํฉ ์๋ฃ: {df.shape} (ํ์ผ {len(dfs)}๊ฐ)")
            st.dataframe(df.head(20), use_container_width=True)

    # --- automatic column mapping + correction ---
    if "df" in st.session_state:
        st.divider()
        st.caption("์๋ ์ปฌ๋ผ ๋งคํ โ ์ ํ ์์ด ์๋ ์ ์ฉ๋ฉ๋๋ค.")

        df = st.session_state["df"]

        # Use the auto_map_columns result as the working mapping.
        auto = auto_map_columns(df)
        mapping = {
            "date": auto.get("date"),
            "target": auto.get("target"),
            "region": auto.get("region"),
            "brand": auto.get("brand"),
            "item": auto.get("item"),
        }
        st.session_state["mapping"] = mapping

        # Correction for the bundled data files: auto_map_columns can pick the
        # wrong target column for some CSVs, so if the daily-sales column exists
        # prefer it. NOTE: `mapping` is stored by reference above, so this
        # in-place fix is also reflected in session_state.
        if mapping.get("target") == "๊ฐ์๋" and "์ผ์ผํ๋งค๋" in df.columns:
            mapping["target"] = "์ผ์ผํ๋งค๋"

        # Read-only confirmation table of the resolved mapping.
        mapping_view = pd.DataFrame(
            {
                "์ญํ ": ["๋ ์ง(date)", "์์/ํ๋งค๋(target)", "์ง์ญ/์ ํฌ(region)", "๋ธ๋๋(์ ํ)", "์ํ/ํ๋ชฉ(์ ํ)"],
                "์ปฌ๋ผ": [
                    mapping.get("date"),
                    mapping.get("target"),
                    mapping.get("region"),
                    mapping.get("brand"),
                    mapping.get("item"),
                ],
            }
        )

        st.write("ํ์ฌ ์๋ ๋งคํ ๊ฒฐ๊ณผ:")
        st.dataframe(mapping_view, use_container_width=True)
|
| 244 |
+
|
| 245 |
+
# ============================================================
|
| 246 |
+
# โก ํ์ต/๋ชจ๋ธ
|
| 247 |
+
# ============================================================
|
| 248 |
+
# ============================================================
# Tab 2: model training
# ============================================================
with tabs[1]:
    st.subheader("๋ชจ๋ธ ํ์ต")

    use_optuna = st.checkbox("Optuna ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ ์ฌ์ฉ", value=False)
    trials = st.slider("Optuna ์๋ ํ์", 5, 60, 15, 5)

    if "df" not in st.session_state or "mapping" not in st.session_state:
        st.info("๋จผ์  โ  ํญ์์ ๋ฐ์ดํฐ์ ์ปฌ๋ผ ๋งคํ์ ์ง์ ํ์ธ์.")
    else:
        v = st.slider("๊ฒ์ฆ ๋น์จ(valid_ratio)", 0.05, 0.4, 0.2, 0.05)

        if st.button("ํ์ต ์์"):
            # Guard each stage so an exception here does not kill the app.
            try:
                # Build the feature matrix from the mapped data.
                df, X, y, feat_names = make_matrix(
                    st.session_state["df"],
                    st.session_state["mapping"],
                )
            except Exception as e:
                st.error(f"ํ์ต์ฉ ๋ฐ์ดํฐ ๊ตฌ์ฑ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {e}")
            else:
                try:
                    # Train candidate models and keep the best + leaderboard.
                    best_model, lb = train_and_score(
                        X,
                        y,
                        valid_ratio=v,
                        use_optuna=use_optuna,
                        optuna_trials=trials,
                    )
                    # Persist model + metadata under artifacts/ and models/.
                    save_artifacts(
                        [ARTI_DIR, MODELS_DIR],
                        best_model,
                        feat_names,
                        st.session_state["mapping"],
                        lb,
                    )
                except Exception as e:
                    st.error(f"๋ชจ๋ธ ํ์ต/์ ์ฅ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {e}")
                else:
                    # Only record results when training AND saving both succeeded.
                    st.session_state["leaderboard"] = lb
                    st.session_state["feat_names"] = feat_names
                    st.success("ํ์ต ์๋ฃ")

    # Show the last leaderboard even across reruns.
    if "leaderboard" in st.session_state:
        st.dataframe(st.session_state["leaderboard"], use_container_width=True)
|
| 293 |
+
|
| 294 |
+
# ============================================================
|
| 295 |
+
# โข ์์ธกยท๋ฐ์ฃผ: ๋ฐ๋ณต(AR) ์์ธก + ์ฌ๊ณ ๊ธฐ๋ฐ ์๋ ๋ฐ์ฃผ ๊ณ์ฐ
|
| 296 |
+
# ============================================================
|
| 297 |
+
# ============================================================
# Tab 3: iterative (AR) forecast + inventory-aware order recommendation
# ============================================================
with tabs[2]:
    st.subheader("์์ธก(๋ฐ๋ณต AR) & ๋ฐ์ฃผ๋ ์ถ์ฒ")
    st.caption("ํ์ต๋ ๋ชจ๋ธ๋ก ๋ฏธ๋ ํผ์ฒ๋ฅผ ์์ฑํ๊ณ , ์ฌ๊ณ ๋ฅผ ๊ณ ๋ คํด ์๋์ผ๋ก ๋ฐ์ฃผ ๊ธฐ๊ฐ๊ณผ ์๋์ ๊ณ์ฐํฉ๋๋ค.")

    if "df" not in st.session_state or "mapping" not in st.session_state:
        st.info("๋จผ์  โ  ํญ์์ ๋ฐ์ดํฐ์ ์ปฌ๋ผ ๋งคํ์ ์ง์ ํ๊ณ  โก์์ ํ์ต์ ์๋ฃํ์ธ์.")
    else:
        horizon_days = 14  # fixed forecast horizon (days)

        # Manual correction factor; applied to the amount forecast below.
        accuracy = st.slider(
            "์ ํ๋(์์ธก ๋ณด์  ๊ณ์)",
            min_value=0.5,
            max_value=2.0,
            value=1.0,
            step=0.05,
        )

        # ==============================
        # segment selection (region / brand / item, whichever are mapped)
        # ==============================
        seg_cols = [
            c for c in [
                st.session_state["mapping"].get("region"),
                st.session_state["mapping"].get("brand"),
                st.session_state["mapping"].get("item"),
            ] if c
        ]
        seg_vals = {}
        if seg_cols:
            col_objs = st.columns(len(seg_cols))
            for i, ccol in enumerate(seg_cols):
                # First option means "all values" (no filtering on this column).
                opts = ["<์ ์ฒด>"] + sorted(
                    list(map(str, st.session_state["df"][ccol].dropna().astype(str).unique()))
                )
                seg_vals[ccol] = col_objs[i].selectbox(f"{ccol} ์ ํ", opts, index=0)
|
| 333 |
+
|
| 334 |
+
# ==============================
|
| 335 |
+
# ๋ฐ๋ณต ์์ธก ํจ์
|
| 336 |
+
# ==============================
|
| 337 |
+
def iterative_forecast(df, mapping, model, feat_names, horizon, seg_vals):
    """Roll the trained model forward `horizon` days autoregressively.

    Each predicted value is appended to the history so the next day's
    lag/rolling features are built from it (AR feedback). Returns a
    DataFrame with the date column and a prediction column; empty on error.
    """
    df = df.copy()
    dtc = mapping["date"]
    tgt = mapping["target"]

    if dtc not in df.columns or tgt not in df.columns:
        st.error(f"์์ธก์ ํ์ํ ์ปฌ๋ผ์ด ์์ต๋๋ค. (date='{dtc}', target='{tgt}')")
        return pd.DataFrame(columns=[dtc, "์์ธก์๋"])

    df[dtc] = pd.to_datetime(df[dtc], errors="coerce")
    df = df.dropna(subset=[dtc]).sort_values(dtc)

    # Filter down to the selected segment values.
    for k, v in seg_vals.items():
        if v and v != "<์ ์ฒด>" and k in df.columns:
            df = df[df[k].astype(str) == str(v)]

    if df.empty:
        st.error("์ ํํ ์ธ๊ทธ๋จผํธ์ ํด๋นํ๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
        return pd.DataFrame(columns=[dtc, "์์ธก์๋"])

    if len(df) < 30:
        st.warning("ํด๋น ์ธ๊ทธ๋จผํธ ๋ฐ์ดํฐ๊ฐ ์ ์ด ์์ธก ํ์ง์ด ๋ฎ์ ์ ์์ต๋๋ค.")

    last_date = df[dtc].max()

    # Numeric target history; NaNs become 0 so lags stay well-defined.
    hist = list(
        pd.to_numeric(df[tgt], errors="coerce")
        .fillna(0)
        .astype(float)
        .values
    )

    def build_row_features(current_date, hist_vals):
        # Build one feature vector (ordered by `feat_names`) for `current_date`.
        if pd.isna(current_date):
            current_date = df[dtc].max()

        year = current_date.year
        month = current_date.month
        day = current_date.day
        dow = current_date.weekday()
        is_weekend = 1 if dow >= 5 else 0

        try:
            week = int(pd.Timestamp(current_date).isocalendar().week)
        except Exception:
            week = 0

        def get_lag(k):
            # Exact lag when enough history exists; otherwise fall back to
            # the mean of the most recent (up to 7) observations.
            if len(hist_vals) >= k:
                return float(hist_vals[-k])
            return float(np.mean(hist_vals[-min(len(hist_vals), 7):])) if hist_vals else 0.0

        lag1 = get_lag(1)
        lag7 = get_lag(7)
        lag14 = get_lag(14)

        def rmean(w):
            # Rolling mean over window w, with a shorter-window fallback
            # when there is not enough history.
            arr = np.array(hist_vals[-w:]) if len(hist_vals) >= 1 else np.array([0.0])
            if len(arr) < max(2, w // 2):
                arr = np.array(hist_vals[-max(2, w // 2):]) if len(hist_vals) else np.array([0.0])
            return float(np.mean(arr))

        def rstd(w):
            # Rolling std over window w; zero when history is too short.
            arr = np.array(hist_vals[-w:]) if len(hist_vals) >= 2 else np.array([0.0, 0.0])
            return float(np.std(arr))

        feats = {
            "year": year,
            "month": month,
            "day": day,
            "dow": dow,
            "week": week,
            "is_weekend": is_weekend,
            "lag1": lag1,
            "lag7": lag7,
            "lag14": lag14,
            "rmean7": rmean(7),
            "rmean14": rmean(14),
            "rstd7": rstd(7),
            "rstd14": rstd(14),
        }

        # Any model feature we did not compute here defaults to 0.0
        # (e.g. one-hot/categorical columns created at training time).
        for fn in feat_names:
            if fn not in feats:
                feats[fn] = 0.0

        x = [feats.get(fn, 0.0) for fn in feat_names]
        return np.array(x, dtype=float)

    preds, dates = [], []
    cur = last_date
    for _ in range(int(horizon)):
        cur = cur + timedelta(days=1)
        x = build_row_features(cur, hist)
        val = float(model.predict([x])[0])
        preds.append(val)
        dates.append(cur)
        hist.append(val)  # AR feedback: next day's lags use this prediction

    return pd.DataFrame({dtc: dates, "์์ธก์๋": preds})
|
| 437 |
+
|
| 438 |
+
# ==============================
|
| 439 |
+
# ์ฌ๊ณ ์๋ ์ธ์
|
| 440 |
+
# ==============================
|
| 441 |
+
def guess_inventory_onhand(df_seg: pd.DataFrame, mapping):
    """Find an inventory-like column in df_seg and return (column, last numeric value).

    Returns (None, None) when no matching column exists or it holds no numbers.
    `mapping` is accepted for interface compatibility but not consulted here.
    """
    candidates = [
        "์ฌ๊ณ ", "์ฌ๊ณ ์", "์ฌ๊ณ ์๋",
        "ํ์ฌ์ฌ๊ณ ", "onhand", "on_hand",
        "stock", "inventory",
    ]

    def looks_like_inventory(name):
        lowered = name.lower()
        return any(token in lowered for token in candidates)

    inv_col = next((c for c in df_seg.columns if looks_like_inventory(c)), None)
    if inv_col is None:
        return None, None

    numeric = pd.to_numeric(df_seg[inv_col], errors="coerce").dropna()
    if numeric.empty:
        return None, None

    # Most recent (last row) on-hand quantity.
    return inv_col, float(numeric.iloc[-1])
|
| 461 |
+
|
| 462 |
+
# ==============================
|
| 463 |
+
# ๊ฐ๊ฒฉ ์๋ ์ธ์
|
| 464 |
+
# ==============================
|
| 465 |
+
def guess_price_column(df_seg):
    """Return the first column whose name looks like a price/amount field, else None."""
    keys = ["price", "๊ฐ๊ฒฉ", "๋จ๊ฐ", "ํ๋งค๊ฐ", "amount", "๊ธ์ก"]
    return next(
        (col for col in df_seg.columns if any(k in col.lower() for k in keys)),
        None,
    )
|
| 472 |
+
|
| 473 |
+
# ==============================
|
| 474 |
+
# ๋ชจ๋ธ ๋ก๋
|
| 475 |
+
# ==============================
|
| 476 |
+
pkl_path = os.path.join(MODELS_DIR, "best_model.pkl")
|
| 477 |
+
if os.path.exists(pkl_path):
|
| 478 |
+
try:
|
| 479 |
+
with open(pkl_path, "rb") as f:
|
| 480 |
+
payload = pickle.load(f)
|
| 481 |
+
model = payload["model"]
|
| 482 |
+
feat_names = payload["feature_names"]
|
| 483 |
+
mapping = payload["mapping"]
|
| 484 |
+
except Exception as e:
|
| 485 |
+
st.error(f"์ ์ฅ๋ ๋ชจ๋ธ ๋ก๋ฉ ์ค ์ค๋ฅ: {e}")
|
| 486 |
+
else:
|
| 487 |
+
dtc = mapping["date"]
|
| 488 |
+
|
| 489 |
+
# ======================================
|
| 490 |
+
# 1) ์์ธก ์ํ
|
| 491 |
+
# ======================================
|
| 492 |
+
fc_df = iterative_forecast(
|
| 493 |
+
st.session_state["df"],
|
| 494 |
+
mapping,
|
| 495 |
+
model,
|
| 496 |
+
feat_names,
|
| 497 |
+
horizon_days,
|
| 498 |
+
seg_vals,
|
| 499 |
+
)
|
| 500 |
+
if fc_df.empty:
|
| 501 |
+
st.stop()
|
| 502 |
+
|
| 503 |
+
# ======================================
|
| 504 |
+
# 2) ๊ฐ๊ฒฉ ์๋ ์ธ์ + ๊ธ์ก์์ธก
|
| 505 |
+
# ======================================
|
| 506 |
+
df_seg_price = st.session_state["df"].copy()
|
| 507 |
+
for k, v in seg_vals.items():
|
| 508 |
+
if v and v != "<์ ์ฒด>" and k in df_seg_price.columns:
|
| 509 |
+
df_seg_price = df_seg_price[df_seg_price[k].astype(str) == str(v)]
|
| 510 |
+
df_seg_price = df_seg_price.sort_values(dtc)
|
| 511 |
+
|
| 512 |
+
price_col = guess_price_column(df_seg_price)
|
| 513 |
+
|
| 514 |
+
if price_col:
|
| 515 |
+
price_val = float(
|
| 516 |
+
pd.to_numeric(df_seg_price[price_col], errors="coerce").dropna().iloc[-1]
|
| 517 |
+
)
|
| 518 |
+
st.info(f"CSV '{price_col}' ์ปฌ๋ผ์์ ๊ฐ๊ฒฉ {price_val:,.0f}์ ์๋ ์ธ์.")
|
| 519 |
+
else:
|
| 520 |
+
price_val = st.number_input(
|
| 521 |
+
"๊ฐ๊ฒฉ(์) โ CSV์์ ๊ฐ๊ฒฉ ์ปฌ๋ผ์ ์ฐพ์ง ๋ชปํด ์ง์ ์
๋ ฅ",
|
| 522 |
+
min_value=0,
|
| 523 |
+
max_value=100000000,
|
| 524 |
+
value=0,
|
| 525 |
+
)
|
| 526 |
+
|
| 527 |
+
# **์๋ ์ดํฉ**
|
| 528 |
+
total_qty_demand = float(fc_df["์์ธก์๋"].sum())
|
| 529 |
+
|
| 530 |
+
# **๊ธ์ก ์ดํฉ**
|
| 531 |
+
fc_df["๊ธ์ก์์ธก"] = (fc_df["์์ธก์๋"] * price_val * float(accuracy)).clip(lower=0.0)
|
| 532 |
+
total_amt_demand = float(fc_df["๊ธ์ก์์ธก"].sum())
|
| 533 |
+
|
| 534 |
+
# ======================================
|
| 535 |
+
# 3) ์ฌ๊ณ ์๋ ์ธ์
|
| 536 |
+
# ======================================
|
| 537 |
+
df_seg = st.session_state["df"].copy()
|
| 538 |
+
df_seg[dtc] = pd.to_datetime(df_seg[dtc], errors="coerce")
|
| 539 |
+
for k, v in seg_vals.items():
|
| 540 |
+
if v and v != "<์ ์ฒด>" and k in df_seg.columns:
|
| 541 |
+
df_seg = df_seg[df_seg[k].astype(str) == str(v)]
|
| 542 |
+
df_seg = df_seg.sort_values(dtc)
|
| 543 |
+
|
| 544 |
+
inv_col, onhand_auto = guess_inventory_onhand(df_seg, mapping)
|
| 545 |
+
if onhand_auto is None:
|
| 546 |
+
onhand = st.number_input(
|
| 547 |
+
"ํ์ฌ ์ฌ๊ณ (์ง์ ์
๋ ฅ)",
|
| 548 |
+
min_value=0,
|
| 549 |
+
max_value=100000,
|
| 550 |
+
value=0,
|
| 551 |
+
)
|
| 552 |
+
else:
|
| 553 |
+
onhand = onhand_auto
|
| 554 |
+
st.info(f"์ฌ๊ณ '{inv_col}' ์๋ ์ธ์ โ {onhand:,.0f}๊ฐ")
|
| 555 |
+
|
| 556 |
+
# ======================================
|
| 557 |
+
# 4) ๋ฐ์ฃผ๋/์์ง์ผ ๊ณ์ฐ (์๋ ๊ธฐ์ค)
|
| 558 |
+
# ======================================
|
| 559 |
+
avg_daily_qty = total_qty_demand / horizon_days if horizon_days > 0 else 0.0
|
| 560 |
+
days_to_out = (onhand / avg_daily_qty) if avg_daily_qty > 0 else float("inf")
|
| 561 |
+
rec_qty = max(0.0, total_qty_demand - onhand)
|
| 562 |
+
|
| 563 |
+
c1, c2, c3 = st.columns(3)
|
| 564 |
+
c1.metric("์์ธก ๊ธฐ๊ฐ(์ผ)", f"{horizon_days}")
|
| 565 |
+
c2.metric("์ฌ๊ณ ์์ง ์์์ผ์", "โ" if np.isinf(days_to_out) else f"{days_to_out:,.1f}")
|
| 566 |
+
c3.metric("2์ฃผ ์ด ์์ ๋งค์ถ", f"{total_amt_demand:,.0f}์")
|
| 567 |
+
|
| 568 |
+
# ======================================
|
| 569 |
+
# 5) ํ ์ถ๋ ฅ
|
| 570 |
+
# ======================================
|
| 571 |
+
st.dataframe(fc_df.set_index(dtc), use_container_width=True)
|
| 572 |
+
st.caption("โป ์์ธก์๋ ร ๊ฐ๊ฒฉ ร ์ ํ๋ ๋ณด์ = ๊ธ์ก์์ธก")
|
| 573 |
+
|
| 574 |
+
else:
|
| 575 |
+
st.warning("best_model.pkl ์ด ์์ต๋๋ค. โก ํญ์์ ํ์ต์ ๋จผ์ ์ํํ์ธ์.")
|
| 576 |
+
|
| 577 |
+
# ============================================================
|
| 578 |
+
# โฃ ๋ถ์(๊ทธ๋ํ):
|
| 579 |
+
# - ์ฐ์ฐ: ํ ๋ฌ ๊ฐ์๋ vs ์ฐ์ฐ ํ๋งค๋ (์ฐ์ ๋ + ํ๊ท์ + ์ผ๋ณ ์ ํ ๊ทธ๋ํ)
|
| 580 |
+
# - ๊ตฐ๊ณ ๊ตฌ๋ง: ํ ๋ฌ ๊ธฐ์จ vs ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋ (์ฐ์ ๋ + ํ๊ท์ + ์ผ๋ณ ์ ํ ๊ทธ๋ํ)
|
| 581 |
+
# - ์ ์ฒด: ์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง ์ ์ธ ์ผ๋ณ ํ๋งค๋ ์ ํ ๊ทธ๋ํ
|
| 582 |
+
# ============================================================
|
| 583 |
+
with tabs[3]:
|
| 584 |
+
st.subheader("๋ถ์(๊ทธ๋ํ) โ ํ ๋ฌ ๋จ์ ์๊ด ๋ถ์")
|
| 585 |
+
|
| 586 |
+
if "df" not in st.session_state or "mapping" not in st.session_state or not st.session_state["mapping"].get("date"):
|
| 587 |
+
st.info("๋จผ์ โ ํญ์์ ๋ฐ์ดํฐ์ ์ปฌ๋ผ ๋งคํ(ํนํ '๋ ์ง'์ 'ํ๊น')์ ์ง์ ํ์ธ์.")
|
| 588 |
+
else:
|
| 589 |
+
mapping = st.session_state["mapping"]
|
| 590 |
+
date_col = mapping["date"]
|
| 591 |
+
target_col = mapping.get("target")
|
| 592 |
+
|
| 593 |
+
def guess(colnames, cands):
    """Return the first column containing any candidate substring.

    Candidates are tried in priority order (earlier entries win over later
    ones); matching is case-insensitive. Returns None when nothing matches.
    """
    pairs = [(name, str(name).lower()) for name in colnames]
    for cand in cands:
        needle = str(cand).lower()
        for original, lowered in pairs:
            if needle in lowered:
                return original
    return None
|
| 601 |
+
|
| 602 |
+
# ๊ณตํต: ์ฐ-์ ์ ํ์ฉ ์ต์
๋ง๋๋ ํจ์
|
| 603 |
+
def build_year_month_options(df, date_col):
    """Coerce the date column to datetime, attach a 'year_month' Period column,
    and return (cleaned_df, [(label, Period), ...]) sorted ascending.

    Rows whose date cannot be parsed are dropped; an empty result yields
    (empty_df, []).
    """
    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors="coerce")
    work = work.dropna(subset=[date_col])
    if work.empty:
        return work, []
    work["year_month"] = work[date_col].dt.to_period("M")
    periods = sorted(work["year_month"].unique())
    labels = [str(p) for p in periods]  # rendered like '2024-10'
    return work, list(zip(labels, periods))
|
| 613 |
+
|
| 614 |
+
tab_u, tab_g, tab_all = st.tabs([
|
| 615 |
+
"โ ์ฐ์ฐ: ํ ๋ฌ ๊ฐ์๋ vs ํ๋งค๋",
|
| 616 |
+
"๐ ๊ตฐ๊ณ ๊ตฌ๋ง: ํ ๋ฌ ๊ธฐ์จ vs ํ๋งค๋",
|
| 617 |
+
"๐ ์ ์ฒด: ์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง ์ ์ธ ์ผ๋ณ ํ๋งค๋(์ ํ)"
|
| 618 |
+
])
|
| 619 |
+
|
| 620 |
+
# ------------------------------
|
| 621 |
+
# 1) ์ฐ์ฐ: ์ ํํ ํ ๋ฌ์ ๊ฐ์๋ โ ์ฐ์ฐ ํ๋งค๋
|
| 622 |
+
# ------------------------------
|
| 623 |
+
with tab_u:
|
| 624 |
+
st.caption("์ฐ์ฐ ํ๋งค๋๊ณผ ๊ฐ์๋์ ๊ด๊ณ๋ฅผ 'ํ ๋ฌ' ๋จ์๋ก ๋ด
๋๋ค.")
|
| 625 |
+
|
| 626 |
+
up_u = st.file_uploader("์ฐ์ฐ/๋ ์จ ๋ฐ์ดํฐ CSV (์ ํ)", type=["csv"], key="umbrella_month_up")
|
| 627 |
+
if up_u is not None:
|
| 628 |
+
df_u_raw = read_csv_flexible(io.BytesIO(up_u.read()))
|
| 629 |
+
else:
|
| 630 |
+
df_u_raw = st.session_state["df"].copy()
|
| 631 |
+
|
| 632 |
+
if date_col not in df_u_raw.columns:
|
| 633 |
+
st.warning(f"๋ ์ง ์ปฌ๋ผ '{date_col}' ์(๋ฅผ) ๋ฐ์ดํฐ์์ ์ฐพ์ง ๋ชปํ์ต๋๋ค.")
|
| 634 |
+
else:
|
| 635 |
+
# item์์ ์ฐ์ฐ๋ง ํํฐ (์์ผ๋ฉด)
|
| 636 |
+
item_col = mapping.get("item")
|
| 637 |
+
if item_col and item_col in df_u_raw.columns:
|
| 638 |
+
mask = df_u_raw[item_col].astype(str).str.contains("์ฐ์ฐ|umbrella", case=False, na=False)
|
| 639 |
+
if mask.any():
|
| 640 |
+
df_u_raw = df_u_raw[mask]
|
| 641 |
+
|
| 642 |
+
cols_all = list(df_u_raw.columns)
|
| 643 |
+
|
| 644 |
+
# ํ๋งค๋ ์ปฌ๋ผ: ๋งคํ target ์ฐ์ , ์์ผ๋ฉด ์ถ์
|
| 645 |
+
sales_col = target_col if target_col in cols_all else guess(
|
| 646 |
+
cols_all,
|
| 647 |
+
["umbrella", "์ฐ์ฐ", "์ผ์ผํ๋งค๋", "ํ๋งค๋", "sales", "qty", "quantity", "target"],
|
| 648 |
+
)
|
| 649 |
+
|
| 650 |
+
# ๊ฐ์๋ ์ปฌ๋ผ ์ถ์
|
| 651 |
+
rain_col = guess(
|
| 652 |
+
cols_all,
|
| 653 |
+
["rain", "precip", "precipitation", "๊ฐ์", "๊ฐ์๋", "์ผ๊ฐ์๋", "๊ฐ์ฐ", "๊ฐ์ฐ๋"],
|
| 654 |
+
)
|
| 655 |
+
|
| 656 |
+
if not sales_col or not rain_col:
|
| 657 |
+
st.warning(
|
| 658 |
+
"์ฐ์ฐ ํ๋งค๋ ๋๋ ๊ฐ์๋ ์ปฌ๋ผ์ ์๋์ผ๋ก ์ฐพ์ง ๋ชปํ์ต๋๋ค.\n"
|
| 659 |
+
"ํ๋งค๋: '์ฐ์ฐ/umbrella/ํ๋งค๋/sales', ๊ฐ์๋: '๊ฐ์๋/rain' ๋ฑ์ ์ด๋ฆ์ ์ฌ์ฉํด ์ฃผ์ธ์."
|
| 660 |
+
)
|
| 661 |
+
else:
|
| 662 |
+
# ๋ ์ง/์ซ์ ํ์ ์ ๋ฆฌ + ์ฐ-์ ์ต์
์์ฑ
|
| 663 |
+
df_u_raw[sales_col] = pd.to_numeric(df_u_raw[sales_col], errors="coerce")
|
| 664 |
+
df_u_raw[rain_col] = pd.to_numeric(df_u_raw[rain_col], errors="coerce")
|
| 665 |
+
|
| 666 |
+
df_u_raw, ym_options = build_year_month_options(df_u_raw, date_col)
|
| 667 |
+
|
| 668 |
+
if not ym_options:
|
| 669 |
+
st.info("์ ํจํ ๋ ์ง ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 670 |
+
else:
|
| 671 |
+
# ์ฐ-์ ์ ํ (YYYY-MM ํ์๋ง ๋ณด์ฌ์ค)
|
| 672 |
+
labels = [lab for lab, _ in ym_options]
|
| 673 |
+
default_idx = len(labels) - 1 # ๊ธฐ๋ณธ๊ฐ: ๊ฐ์ฅ ์ต๊ทผ ์
|
| 674 |
+
sel_label = st.selectbox("๋ถ์ํ ์ฐ์(YYYY-MM)", labels, index=default_idx, key="ym_umbrella")
|
| 675 |
+
sel_period = dict(ym_options)[sel_label]
|
| 676 |
+
|
| 677 |
+
# ์ ํํ ํ ๋ฌ๋ง ํํฐ
|
| 678 |
+
df_month = df_u_raw[df_u_raw["year_month"] == sel_period].copy()
|
| 679 |
+
if df_month.empty:
|
| 680 |
+
st.info(f"{sel_label} ์ ํด๋นํ๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 681 |
+
else:
|
| 682 |
+
# ์ผ ๋จ์ ์ง๊ณ
|
| 683 |
+
df_month["date_only"] = df_month[date_col].dt.date
|
| 684 |
+
daily = (
|
| 685 |
+
df_month.groupby("date_only", as_index=False)
|
| 686 |
+
.agg({sales_col: "sum", rain_col: "mean"})
|
| 687 |
+
.dropna(subset=[sales_col, rain_col])
|
| 688 |
+
)
|
| 689 |
+
daily = daily.rename(
|
| 690 |
+
columns={"date_only": "date", sales_col: "sales", rain_col: "rain"}
|
| 691 |
+
)
|
| 692 |
+
|
| 693 |
+
if daily.empty:
|
| 694 |
+
st.info("ํด๋น ์ฐ์์์ ์ผ๋ณ๋ก ์ง๊ณํ ์ ์๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 695 |
+
else:
|
| 696 |
+
st.markdown(f"**{sel_label} ํ ๋ฌ ๊ธฐ์ค ยท ๊ฐ์๋์ ๋ฐ๋ฅธ ์ฐ์ฐ ํ๋งค๋**")
|
| 697 |
+
|
| 698 |
+
base = alt.Chart(daily).encode(
|
| 699 |
+
x=alt.X("rain:Q", title="์ผ ๊ฐ์๋"),
|
| 700 |
+
y=alt.Y("sales:Q", title="์ผ ์ฐ์ฐ ํ๋งค๋"),
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
+
# ๋ถ์์ ์ฐ์ ๋ + ์ ํ ํ๊ท์
|
| 704 |
+
points = base.mark_circle(size=70, color="#d62728").encode(
|
| 705 |
+
tooltip=[
|
| 706 |
+
alt.Tooltip("date:T", title="๋ ์ง"),
|
| 707 |
+
alt.Tooltip("rain:Q", title="๊ฐ์๋"),
|
| 708 |
+
alt.Tooltip("sales:Q", title="์ฐ์ฐ ํ๋งค๋"),
|
| 709 |
+
]
|
| 710 |
+
)
|
| 711 |
+
reg_line = base.transform_regression("rain", "sales").mark_line(color="#b22222")
|
| 712 |
+
|
| 713 |
+
st.altair_chart((points + reg_line).interactive(), use_container_width=True)
|
| 714 |
+
|
| 715 |
+
# โ
์ถ๊ฐ: ์ผ๋ณ ์ฐ์ฐ ํ๋งค๋ ์ ํ ๊ทธ๋ํ
|
| 716 |
+
st.markdown("**์ผ๋ณ ์ฐ์ฐ ํ๋งค๋ ์ถ์ธ(์ ํ ๊ทธ๋ํ)**")
|
| 717 |
+
line_umbrella = (
|
| 718 |
+
alt.Chart(daily)
|
| 719 |
+
.mark_line()
|
| 720 |
+
.encode(
|
| 721 |
+
x=alt.X("date:T", title="๋ ์ง"),
|
| 722 |
+
y=alt.Y("sales:Q", title="์ผ ์ฐ์ฐ ํ๋งค๋"),
|
| 723 |
+
tooltip=[
|
| 724 |
+
alt.Tooltip("date:T", title="๋ ์ง"),
|
| 725 |
+
alt.Tooltip("sales:Q", title="์ฐ์ฐ ํ๋งค๋"),
|
| 726 |
+
alt.Tooltip("rain:Q", title="๊ฐ์๋"),
|
| 727 |
+
],
|
| 728 |
+
)
|
| 729 |
+
)
|
| 730 |
+
st.altair_chart(line_umbrella.interactive(), use_container_width=True)
|
| 731 |
+
|
| 732 |
+
# ์ฐธ๊ณ ์ฉ ํ
์ด๋ธ
|
| 733 |
+
st.dataframe(daily, use_container_width=True)
|
| 734 |
+
|
| 735 |
+
# ------------------------------
|
| 736 |
+
# 2) ๊ตฐ๊ณ ๊ตฌ๋ง: ์ ํํ ํ ๋ฌ์ ๊ธฐ์จ โ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋
|
| 737 |
+
# ------------------------------
|
| 738 |
+
with tab_g:
|
| 739 |
+
st.caption("๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋๊ณผ ๊ธฐ์จ(์ถ์)์ ๊ด๊ณ๋ฅผ 'ํ ๋ฌ' ๋จ์๋ก ๋ด
๋๋ค.")
|
| 740 |
+
|
| 741 |
+
up_g = st.file_uploader("๊ตฐ๊ณ ๊ตฌ๋ง/๋ ์จ ๋ฐ์ดํฐ CSV (์ ํ)", type=["csv"], key="goguma_month_up")
|
| 742 |
+
if up_g is not None:
|
| 743 |
+
df_g_raw = read_csv_flexible(io.BytesIO(up_g.read()))
|
| 744 |
+
else:
|
| 745 |
+
df_g_raw = st.session_state["df"].copy()
|
| 746 |
+
|
| 747 |
+
if date_col not in df_g_raw.columns:
|
| 748 |
+
st.warning(f"๋ ์ง ์ปฌ๋ผ '{date_col}' ์(๋ฅผ) ๋ฐ์ดํฐ์์ ์ฐพ์ง ๋ชปํ์ต๋๋ค.")
|
| 749 |
+
else:
|
| 750 |
+
# item์์ ๊ตฐ๊ณ ๊ตฌ๋ง๋ง ํํฐ (์์ผ๋ฉด)
|
| 751 |
+
item_col_g = mapping.get("item")
|
| 752 |
+
if item_col_g and item_col_g in df_g_raw.columns:
|
| 753 |
+
mask_g = df_g_raw[item_col_g].astype(str).str.contains(
|
| 754 |
+
"๊ณ ๊ตฌ๋ง|๊ตฐ๊ณ ๊ตฌ๋ง|sweet|goguma", case=False, na=False
|
| 755 |
+
)
|
| 756 |
+
if mask_g.any():
|
| 757 |
+
df_g_raw = df_g_raw[mask_g]
|
| 758 |
+
|
| 759 |
+
cols_all_g = list(df_g_raw.columns)
|
| 760 |
+
|
| 761 |
+
goguma_col = target_col if target_col in cols_all_g else guess(
|
| 762 |
+
cols_all_g,
|
| 763 |
+
["๊ณ ๊ตฌ๋ง", "๊ตฐ๊ณ ๊ตฌ๋ง", "sweetpotato", "goguma", "ํ๋งค๋", "sales", "qty", "quantity", "target"],
|
| 764 |
+
)
|
| 765 |
+
temp_col = guess(
|
| 766 |
+
cols_all_g,
|
| 767 |
+
["์จ๋", "tmin", "temp_min", "min_temp", "์ต์ ", "์ต์ ๊ธฐ์จ", "์ผ์ต์ ๊ธฐ์จ", "temperature", "temp"],
|
| 768 |
+
)
|
| 769 |
+
|
| 770 |
+
if not goguma_col or not temp_col:
|
| 771 |
+
st.warning(
|
| 772 |
+
"๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋ ๋๋ ๊ธฐ์จ ์ปฌ๋ผ์ ์๋์ผ๋ก ์ฐพ์ง ๋ชปํ์ต๋๋ค.\n"
|
| 773 |
+
"ํ๋งค๋: '๊ตฐ๊ณ ๊ตฌ๋ง/๊ณ ๊ตฌ๋ง/sales/target', ๊ธฐ์จ: 'tmin/์ต์ ๊ธฐ์จ/temperature' ๋ฑ์ ์ด๋ฆ์ ์ฌ์ฉํด ์ฃผ์ธ์."
|
| 774 |
+
)
|
| 775 |
+
else:
|
| 776 |
+
df_g_raw[goguma_col] = pd.to_numeric(df_g_raw[goguma_col], errors="coerce")
|
| 777 |
+
df_g_raw[temp_col] = pd.to_numeric(df_g_raw[temp_col], errors="coerce")
|
| 778 |
+
|
| 779 |
+
df_g_raw, ym_options_g = build_year_month_options(df_g_raw, date_col)
|
| 780 |
+
|
| 781 |
+
if not ym_options_g:
|
| 782 |
+
st.info("์ ํจํ ๋ ์ง ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 783 |
+
else:
|
| 784 |
+
labels_g = [lab for lab, _ in ym_options_g]
|
| 785 |
+
default_idx_g = len(labels_g) - 1
|
| 786 |
+
sel_label_g = st.selectbox("๋ถ์ํ ์ฐ์(YYYY-MM)", labels_g, index=default_idx_g, key="ym_goguma")
|
| 787 |
+
sel_period_g = dict(ym_options_g)[sel_label_g]
|
| 788 |
+
|
| 789 |
+
df_month_g = df_g_raw[df_g_raw["year_month"] == sel_period_g].copy()
|
| 790 |
+
if df_month_g.empty:
|
| 791 |
+
st.info(f"{sel_label_g} ์ ํด๋นํ๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 792 |
+
else:
|
| 793 |
+
df_month_g["date_only"] = df_month_g[date_col].dt.date
|
| 794 |
+
daily_g = (
|
| 795 |
+
df_month_g.groupby("date_only", as_index=False)
|
| 796 |
+
.agg({goguma_col: "sum", temp_col: "mean"})
|
| 797 |
+
.dropna(subset=[goguma_col, temp_col])
|
| 798 |
+
)
|
| 799 |
+
daily_g = daily_g.rename(
|
| 800 |
+
columns={"date_only": "date", goguma_col: "sales", temp_col: "temp"}
|
| 801 |
+
)
|
| 802 |
+
|
| 803 |
+
if daily_g.empty:
|
| 804 |
+
st.info("ํด๋น ์ฐ์์์ ์ผ๋ณ๋ก ์ง๊ณํ ์ ์๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 805 |
+
else:
|
| 806 |
+
st.markdown(f"**{sel_label_g} ํ ๋ฌ ๊ธฐ์ค ยท ๊ธฐ์จ์ ๋ฐ๋ฅธ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋**")
|
| 807 |
+
|
| 808 |
+
base_g = alt.Chart(daily_g).encode(
|
| 809 |
+
x=alt.X("temp:Q", title="์ผ ํ๊ท ๊ธฐ์จ"),
|
| 810 |
+
y=alt.Y("sales:Q", title="์ผ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋"),
|
| 811 |
+
)
|
| 812 |
+
|
| 813 |
+
points_g = base_g.mark_circle(size=70, color="#ff7f0e").encode(
|
| 814 |
+
tooltip=[
|
| 815 |
+
alt.Tooltip("date:T", title="๋ ์ง"),
|
| 816 |
+
alt.Tooltip("temp:Q", title="๊ธฐ์จ"),
|
| 817 |
+
alt.Tooltip("sales:Q", title="๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋"),
|
| 818 |
+
]
|
| 819 |
+
)
|
| 820 |
+
reg_g = base_g.transform_regression("temp", "sales").mark_line(color="#d35400")
|
| 821 |
+
|
| 822 |
+
st.altair_chart((points_g + reg_g).interactive(), use_container_width=True)
|
| 823 |
+
|
| 824 |
+
# โ
์ถ๊ฐ: ์ผ๋ณ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋ ์ ํ ๊ทธ๋ํ
|
| 825 |
+
st.markdown("**์ผ๋ณ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋ ์ถ์ธ(์ ํ ๊ทธ๋ํ)**")
|
| 826 |
+
line_goguma = (
|
| 827 |
+
alt.Chart(daily_g)
|
| 828 |
+
.mark_line()
|
| 829 |
+
.encode(
|
| 830 |
+
x=alt.X("date:T", title="๋ ์ง"),
|
| 831 |
+
y=alt.Y("sales:Q", title="์ผ ๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋"),
|
| 832 |
+
tooltip=[
|
| 833 |
+
alt.Tooltip("date:T", title="๋ ์ง"),
|
| 834 |
+
alt.Tooltip("temp:Q", title="๊ธฐ์จ"),
|
| 835 |
+
alt.Tooltip("sales:Q", title="๊ตฐ๊ณ ๊ตฌ๋ง ํ๋งค๋"),
|
| 836 |
+
],
|
| 837 |
+
)
|
| 838 |
+
)
|
| 839 |
+
st.altair_chart(line_goguma.interactive(), use_container_width=True)
|
| 840 |
+
|
| 841 |
+
st.dataframe(daily_g, use_container_width=True)
|
| 842 |
+
|
| 843 |
+
# ------------------------------
|
| 844 |
+
# 3) ์ ์ฒด: ์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง ์ ์ธ ์ ์ฒด ์ํ ์ผ๋ณ ํ๋งค๋ ์ ํ ๊ทธ๋ํ
|
| 845 |
+
# ------------------------------
|
| 846 |
+
with tab_all:
|
| 847 |
+
st.caption("์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง๋ฅผ ์ ์ธํ ๋ชจ๋ ์ํ์ ์ผ๋ณ ํ๋งค๋ ์ถ์ธ๋ฅผ ํ ๋ฒ์ ๋ด
๋๋ค.")
|
| 848 |
+
|
| 849 |
+
df_all = st.session_state["df"].copy()
|
| 850 |
+
|
| 851 |
+
if date_col not in df_all.columns or not target_col or target_col not in df_all.columns:
|
| 852 |
+
st.warning(f"๋ ์ง('{date_col}') ๋๋ ํ๊น('{target_col}') ์ปฌ๋ผ์ ์ฐพ์ ์ ์์ต๋๋ค.")
|
| 853 |
+
else:
|
| 854 |
+
# item ์ปฌ๋ผ์ด ์์ผ๋ฉด ์ฐ์ฐ/๊ตฐ๊ณ ๊ตฌ๋ง ๊ด๋ จ ์ํ ์ ์ธ
|
| 855 |
+
item_col_all = mapping.get("item")
|
| 856 |
+
if item_col_all and item_col_all in df_all.columns:
|
| 857 |
+
ex_mask = df_all[item_col_all].astype(str).str.contains(
|
| 858 |
+
"์ฐ์ฐ|umbrella|๊ณ ๊ตฌ๋ง|๊ตฐ๊ณ ๊ตฌ๋ง|sweet|goguma", case=False, na=False
|
| 859 |
+
)
|
| 860 |
+
df_all = df_all[~ex_mask]
|
| 861 |
+
|
| 862 |
+
df_all[target_col] = pd.to_numeric(df_all[target_col], errors="coerce")
|
| 863 |
+
|
| 864 |
+
df_all, ym_options_all = build_year_month_options(df_all, date_col)
|
| 865 |
+
|
| 866 |
+
if not ym_options_all:
|
| 867 |
+
st.info("์ ํจํ ๋ ์ง ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 868 |
+
else:
|
| 869 |
+
labels_all = [lab for lab, _ in ym_options_all]
|
| 870 |
+
default_idx_all = len(labels_all) - 1
|
| 871 |
+
sel_label_all = st.selectbox(
|
| 872 |
+
"๋ถ์ํ ์ฐ์(YYYY-MM)",
|
| 873 |
+
labels_all,
|
| 874 |
+
index=default_idx_all,
|
| 875 |
+
key="ym_all",
|
| 876 |
+
)
|
| 877 |
+
sel_period_all = dict(ym_options_all)[sel_label_all]
|
| 878 |
+
|
| 879 |
+
df_month_all = df_all[df_all["year_month"] == sel_period_all].copy()
|
| 880 |
+
if df_month_all.empty:
|
| 881 |
+
st.info(f"{sel_label_all} ์ ํด๋นํ๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 882 |
+
else:
|
| 883 |
+
df_month_all["date_only"] = df_month_all[date_col].dt.date
|
| 884 |
+
daily_all = (
|
| 885 |
+
df_month_all.groupby("date_only", as_index=False)
|
| 886 |
+
.agg({target_col: "sum"})
|
| 887 |
+
.dropna(subset=[target_col])
|
| 888 |
+
)
|
| 889 |
+
daily_all = daily_all.rename(
|
| 890 |
+
columns={"date_only": "date", target_col: "sales"}
|
| 891 |
+
)
|
| 892 |
+
|
| 893 |
+
if daily_all.empty:
|
| 894 |
+
st.info("ํด๋น ์ฐ์์์ ์ผ๋ณ๋ก ์ง๊ณํ ์ ์๋ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
|
| 895 |
+
else:
|
| 896 |
+
st.markdown(f"**{sel_label_all} ํ ๋ฌ ๊ธฐ์ค ยท ์ฐ์ฐยท๊ตฐ๊ณ ๊ตฌ๋ง ์ ์ธ ์ ์ฒด ์ํ ์ผ๋ณ ํ๋งค๋(์ ํ)**")
|
| 897 |
+
|
| 898 |
+
line_all = (
|
| 899 |
+
alt.Chart(daily_all)
|
| 900 |
+
.mark_line()
|
| 901 |
+
.encode(
|
| 902 |
+
x=alt.X("date:T", title="๋ ์ง"),
|
| 903 |
+
y=alt.Y("sales:Q", title="์ผ ํ๋งค๋(์ ์ฒด ์ํ ํฉ๊ณ)"),
|
| 904 |
+
tooltip=[
|
| 905 |
+
alt.Tooltip("date:T", title="๋ ์ง"),
|
| 906 |
+
alt.Tooltip("sales:Q", title="์ผ ํ๋งค๋ ํฉ๊ณ"),
|
| 907 |
+
],
|
| 908 |
+
)
|
| 909 |
+
)
|
| 910 |
+
st.altair_chart(line_all.interactive(), use_container_width=True)
|
| 911 |
+
st.dataframe(daily_all, use_container_width=True)
|
| 912 |
+
|
| 913 |
+
# ============================================================
|
| 914 |
+
# โค ์ง๋จ/๋ก๊ทธ: ๊ฒฝ๋ก/ํ์ผ ํ์ธ + ํผ๋ธ๋ฆญ URL ์ด๊ธฐ/๋ซ๊ธฐ
|
| 915 |
+
# ============================================================
|
| 916 |
+
with tabs[4]:
    # Diagnostics tab: show what is on disk and optionally open/close a public tunnel URL.
    st.subheader("๊ฒฝ๋ก/ํ์ผ ์ํ")

    cols = st.columns(2)
    with cols[0]:
        # Directory listings are guarded so a missing folder on first run does not crash the app.
        st.write("**data**", DATA_DIR)
        st.write(os.listdir(DATA_DIR) if os.path.exists(DATA_DIR) else [])
        st.write("**artifacts**", ARTI_DIR)
        st.write(os.listdir(ARTI_DIR) if os.path.exists(ARTI_DIR) else [])
    with cols[1]:
        st.write("**models**", MODELS_DIR)
        st.write(os.listdir(MODELS_DIR) if os.path.exists(MODELS_DIR) else [])

    st.caption("ํ์ ์ ํผ๋ธ๋ฆญ URL์ ์ด์ด ์ธ๋ถ์์ ์ ์ํ ์ ์์ต๋๋ค.")
    mode = st.radio("ํผ๋ธ๋ฆญ URL ํฐ๋๋ฌ", ["ngrok", "cloudflared"], horizontal=True, index=0)

    # The auth token input is shown only when the ngrok tunneler is selected.
    ngk = None
    if mode == "ngrok":
        ngk = st.text_input(
            "NGROK_AUTHTOKEN",
            value=os.environ.get("NGROK_AUTHTOKEN", ""),
            type="password",
            help="ํ๊ฒฝ๋ณ์์ ๋ฃ์ด๋๋ฉด ๋ค์๋ถํฐ ์๋ ์ธ์ํฉ๋๋ค.",
        )

    c_open, c_close = st.columns(2)
    if c_open.button("ํผ๋ธ๋ฆญ URL ์ด๊ธฐ", use_container_width=True):
        if mode == "ngrok":
            if ngk:
                # Export the token so the helper can read it from the environment.
                os.environ["NGROK_AUTHTOKEN"] = ngk
            start_ngrok()  # helper defined elsewhere in this file
        else:
            start_cloudflared()  # helper defined elsewhere in this file

    if c_close.button("ํผ๋ธ๋ฆญ URL ๋ซ๊ธฐ", use_container_width=True):
        if mode == "ngrok":
            try:
                # Imported lazily so the rest of the app works without pyngrok installed.
                from pyngrok import ngrok
                ngrok.kill()
                st.info("ngrok ํฐ๋์ ์ข๋ฃํ์ต๋๋ค.")
            except Exception as e:
                st.warning(f"ngrok ์ข๋ฃ ์ค ๊ฒฝ๊ณ : {e}")
        else:
            # cloudflared runs as a subprocess; presumably its handle is stored in
            # session state under "_cfd_proc" by start_cloudflared() — verify there.
            proc = st.session_state.get("_cfd_proc")
            if proc:
                proc.terminate()
                st.info("cloudflared ํฐ๋์ ์ข๋ฃํ์ต๋๋ค.")
            else:
                st.info("cloudflared ํ์ฑ ํ๋ก์ธ์ค๊ฐ ์์ต๋๋ค.")
|
preprocess.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd, numpy as np
|
| 2 |
+
|
| 3 |
+
def add_time_features(df, date_col):
    """Attach calendar-derived feature columns to a copy of *df*.

    The date column is coerced to datetime (unparseable values become NaT and
    their rows are dropped), rows are sorted by date, and the following
    integer columns are added:

    - year / month / day : calendar components
    - dow                : day of week (Mon=0 ... Sun=6)
    - week               : ISO calendar week number
    - is_weekend         : 1 for Sat/Sun, else 0

    The original frame is left untouched.
    """
    out = df.copy()  # never mutate the caller's frame

    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out = out.dropna(subset=[date_col]).sort_values(date_col)

    stamps = out[date_col].dt
    out["year"] = stamps.year
    out["month"] = stamps.month
    out["day"] = stamps.day
    out["dow"] = stamps.dayofweek
    # ISO week: early January can belong to week 52/53 of the previous ISO year.
    out["week"] = stamps.isocalendar().week.astype(int)
    out["is_weekend"] = (out["dow"] >= 5).astype(int)

    return out
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def add_lag_features(df, date_col, target_col, group_keys=None, lags=(1, 7, 14), rolls=(7, 14)):
    """Attach lag and rolling-window features of *target_col* to a copy of *df*.

    For every ``l`` in *lags* a column ``lag{l}`` holds the target value from
    ``l`` rows earlier (per group, date-ordered). For every window ``w`` in
    *rolls*, ``rmean{w}`` / ``rstd{w}`` hold the rolling mean / standard
    deviation over the last ``w`` rows, with ``min_periods = max(2, w // 2)``
    so early rows still get values once a few observations exist.

    *group_keys* lists columns (e.g. region/item) whose combinations get
    independent histories; keys missing from the frame are ignored, and with
    no keys the whole frame is treated as one series. Returns a new frame
    sorted by *date_col*.
    """
    df = df.copy()

    # Only keep grouping columns that actually exist in the frame.
    keys = [c for c in (group_keys or []) if c in df.columns]

    def _augment(chunk):
        # Compute lags/rollings for one (date-sorted) group.
        chunk = chunk.sort_values(date_col).copy()
        for lag in lags:
            # shift(lag) pushes values down, so each row sees the value from lag rows back.
            chunk[f"lag{lag}"] = chunk[target_col].shift(lag)
        for win in rolls:
            floor = max(2, win // 2)  # allow partial windows near the start
            roller = chunk[target_col].rolling(win, min_periods=floor)
            chunk[f"rmean{win}"] = roller.mean()
            chunk[f"rstd{win}"] = roller.std()
        return chunk

    if keys:
        # group_keys=False keeps the group labels out of the index.
        pieces = [_augment(sub) for _, sub in df.groupby(keys, group_keys=False)]
    else:
        pieces = [_augment(df)]

    # Re-assemble the per-group frames and restore global date order.
    return pd.concat(pieces, axis=0).sort_values(date_col)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def make_matrix(df, mapping):
    """Build a model-ready design matrix from a raw frame.

    Reads column names from *mapping* (keys: 'date', 'target' required;
    'region', 'brand', 'item' optional), coerces the target to numeric
    (unparseable -> 0), appends calendar features (add_time_features) and
    per-group lag/rolling features (add_lag_features), drops leading rows
    whose lag/rolling values are NaN, and one-hot encodes the categorical
    columns.

    Returns
    -------
    (df, X, y, feat_names)
        df         : feature-augmented frame (date-sorted, NaN-lag rows removed)
        X          : 2-D numeric array for the model
        y          : 1-D target vector
        feat_names : column names of X, in order

    Raises
    ------
    ValueError if the date or target mapping is missing.
    """
    df = df.copy()

    date_col = mapping.get("date")
    target_col = mapping.get("target")
    region_col = mapping.get("region")
    brand_col = mapping.get("brand")
    item_col = mapping.get("item")

    # date/target are mandatory — nothing can be built without them.
    if not date_col or not target_col:
        raise ValueError("date/target ์ปฌ๋ผ ๋งคํ์ด ํ์ํฉ๋๋ค.")

    # (1) Target must be numeric; stray text becomes NaN and is filled with 0.
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce").fillna(0)

    # (2) Normalize optional categorical columns to strings so one-hot
    #     encoding behaves consistently.
    for cat in (region_col, brand_col, item_col):
        if cat and cat in df:
            df[cat] = df[cat].astype(str)

    # (3) Calendar features.
    df = add_time_features(df, date_col)

    # (4) Lag/rolling features, grouped by whichever keys were mapped.
    df = add_lag_features(
        df, date_col, target_col,
        [c for c in [region_col, brand_col, item_col] if c],
    )

    # (5) The first rows cannot have lag/rolling values — drop them.
    lag_like = [c for c in df.columns if c.startswith(("lag", "rmean", "rstd"))]
    df = df.dropna(subset=lag_like)

    # (6) Numeric feature skeleton: calendar columns + lag/rolling columns.
    num_cols = [c for c in ["year", "month", "day", "dow", "week", "is_weekend"] + lag_like
                if c in df.columns]
    X = df[num_cols].values
    feat_names = list(num_cols)  # kept for later interpretation/replay

    # (7) One-hot encode categoricals (e.g. region_Seoul, region_Gyeonggi ... 0/1).
    cat_cols = [c for c in [region_col, brand_col, item_col] if c and c in df.columns]
    if cat_cols:
        dummies = pd.get_dummies(df[cat_cols].astype(str), dummy_na=False)
        X = np.hstack([X, dummies.values])  # append one-hot block to the right
        feat_names += list(dummies.columns)

    # (8) Target vector.
    y = df[target_col].values

    return df, X, y, feat_names
|
quick_train_runner.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
============================================================
|
| 5 |
+
์๋ ํ์ต ๋ฐ์ฒ (train_cli.py ์์)
|
| 6 |
+
------------------------------------------------------------
|
| 7 |
+
์ด ์คํฌ๋ฆฝํธ๋ CSV๋ฅผ ์ฝ์ด ์๋์ผ๋ก ์ปฌ๋ผ ๋งคํ โ ํผ์ฒ ์์ฑ โ
|
| 8 |
+
๋ชจ๋ธ ํ๋ณด ํ์ต(์ต์
: Optuna ํ๋) โ ์ํฐํฉํธ/๋ชจ๋ธ ์ ์ฅ์
|
| 9 |
+
ํ ๋ฒ์ ์ํํฉ๋๋ค.
|
| 10 |
+
|
| 11 |
+
[์ฌ์ฉ ์]
|
| 12 |
+
python train_cli.py --data ./data/sample_sales.csv \
|
| 13 |
+
--project . \
|
| 14 |
+
--valid_ratio 0.2 \
|
| 15 |
+
--use_optuna --optuna_trials 20
|
| 16 |
+
|
| 17 |
+
ํ์:
|
| 18 |
+
--data ํ์ต์ ์ฌ์ฉํ CSV ํ์ผ ๊ฒฝ๋ก
|
| 19 |
+
|
| 20 |
+
์ ํ:
|
| 21 |
+
--project ์์
๋ฃจํธ ํด๋(๊ธฐ๋ณธ: ํ์ฌ ํด๋ ".")
|
| 22 |
+
--valid_ratio ๊ฒ์ฆ ๋น์จ(0.05~0.4 ๊ถ์ฅ, ๊ธฐ๋ณธ 0.2)
|
| 23 |
+
--use_optuna Optuna ํ๋ ์ฌ์ฉ ํ๋๊ทธ(์ง์ ์ on)
|
| 24 |
+
--optuna_trials Optuna ์๋ ํ์(๊ธฐ๋ณธ 15)
|
| 25 |
+
|
| 26 |
+
์ถ๋ ฅ:
|
| 27 |
+
ํ๋ก์ ํธ ํด๋ ์๋์
|
| 28 |
+
artifacts/ (๋ก๊ทธ/๋ฆฌ๋๋ณด๋ ๋ฑ ์ค๊ฐ ์ฐ์ถ๋ฌผ)
|
| 29 |
+
models/ (best_model.pkl ๋ฑ ๋ชจ๋ธ ํ์ผ)
|
| 30 |
+
์ด ์์ฑ๋ฉ๋๋ค.
|
| 31 |
+
============================================================
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
import os
|
| 35 |
+
import argparse
|
| 36 |
+
import pandas as pd # (ํ์ํ๋ฉด ์ถํ ์ฌ์ฉ, ์ง๊ธ์ ์ํฌํธ๋ง)
|
| 37 |
+
|
| 38 |
+
from utils_io import read_csv_flexible, save_utf8sig, ensure_dirs, auto_map_columns
|
| 39 |
+
from preprocess import make_matrix
|
| 40 |
+
from train_core import train_and_score, save_artifacts
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def main():
    """
    End-to-end automatic training entry point.

    Steps:
      1) parse CLI options
      2) resolve paths and move to the project root
      3) load the CSV and auto-map its columns
      4) build the training matrix (X, y)
      5) train candidate models (optionally tuned with Optuna)
      6) save results under artifacts/ and models/
    """
    # --------------------------------------------------------
    # 1) CLI options
    # --------------------------------------------------------
    ap = argparse.ArgumentParser()
    ap.add_argument("--data", required=True, help="ํ์ต์ ์ฌ์ฉํ CSV ๊ฒฝ๋ก (์: ./data/sales.csv)")
    ap.add_argument("--project", default=".", help="์์ ๋ฃจํธ ํด๋(artifacts/models ์์ฑ ์์น). ๊ธฐ๋ณธ๊ฐ='.'")
    ap.add_argument("--valid_ratio", type=float, default=0.2, help="๊ฒ์ฆ ๋ฐ์ดํฐ ๋น์จ(๊ธฐ๋ณธ 0.2)")
    ap.add_argument("--use_optuna", action="store_true", help="Optuna ํ๋ ์ฌ์ฉ ์ฌ๋ถ(ํ๋๊ทธ ์ง์ ์ ์ฌ์ฉ)")
    ap.add_argument("--optuna_trials", type=int, default=15, help="Optuna ์๋ ํ์(๊ธฐ๋ณธ 15)")
    args = ap.parse_args()

    # --------------------------------------------------------
    # 2) Resolve paths, then move to the project root.
    # BUGFIX: --data must be made absolute *before* os.chdir(); otherwise a
    # path relative to the invocation directory stops resolving whenever
    # --project points somewhere other than the current directory.
    # --------------------------------------------------------
    data_path = os.path.abspath(args.data)
    proj = os.path.abspath(args.project)
    os.chdir(proj)  # all relative reads/writes below are project-rooted

    # --------------------------------------------------------
    # 3) Load CSV + automatic column mapping
    # --------------------------------------------------------
    data = read_csv_flexible(data_path)
    mapping = auto_map_columns(data)

    # --------------------------------------------------------
    # 4) Feature construction (X, y, feat_names)
    # --------------------------------------------------------
    df, X, y, feat_names = make_matrix(data, mapping)

    # --------------------------------------------------------
    # 5) Output folders (created if missing)
    # --------------------------------------------------------
    artifacts = os.path.join(proj, "artifacts")   # leaderboard/logs
    models_dir = os.path.join(proj, "models")     # best_model.pkl etc.
    ensure_dirs(artifacts, models_dir)

    # --------------------------------------------------------
    # 6) Train candidates (+ optional Optuna tuning) and build leaderboard
    # --------------------------------------------------------
    best_model, lb = train_and_score(
        X, y,
        valid_ratio=args.valid_ratio,
        use_optuna=args.use_optuna,
        optuna_trials=args.optuna_trials,
    )

    # --------------------------------------------------------
    # 7) Persist model / metadata / leaderboard
    # --------------------------------------------------------
    save_artifacts([artifacts, models_dir], best_model, feat_names, mapping, lb)

    # --------------------------------------------------------
    # 8) Console summary
    # --------------------------------------------------------
    print("โ training done.")
    print(" - artifacts:", artifacts)
    print(" - models :", models_dir)
    try:
        # lb is expected to be a DataFrame; fall back to plain print otherwise.
        print(lb.head())
    except Exception:
        print(lb)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
|
| 116 |
+
main()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,10 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
streamlit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.2
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
streamlit==1.39.0
|
| 4 |
+
altair>=5,<6
|
| 5 |
+
scikit-learn
|
| 6 |
+
pyngrok
|
| 7 |
+
# NOTE(review): 'cloudflared' is a standalone Go binary, not a PyPI package —
# `pip install cloudflared` will fail. Install the binary separately (or use
# the 'pycloudflared' wrapper) and remove/adjust this line. TODO confirm.
cloudflared
|
| 8 |
+
xgboost
|
| 9 |
+
lightgbm
|
| 10 |
+
optuna
|
starter.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","id":"GTG-zSlVYDJg","metadata":{"id":"GTG-zSlVYDJg"},"source":["# cstore_suite Colab ๋ฐ์ฒ\n","\n","์ด ๋
ธํธ๋ถ์ `cstore_suite` ํ๋ก์ ํธ๋ฅผ Colab์์ ํ ๋ฒ์ ์คํํ๊ธฐ ์ํ ๋ฐ์ฒ์
๋๋ค.\n","\n","์์๋๋ก ์์์ ์๋๋ก ์
๋ง ์คํํ๋ฉด ๋ฉ๋๋ค.\n"]},{"cell_type":"code","execution_count":null,"id":"DBPgdiZp0aP0","metadata":{"colab":{"background_save":true},"executionInfo":{"elapsed":4,"status":"aborted","timestamp":1763530193256,"user":{"displayName":"์ดํ์ง","userId":"11960777193249666496"},"user_tz":-540},"id":"DBPgdiZp0aP0"},"outputs":[{"name":"stderr","output_type":"stream","text":["ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n","ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n","ERROR:root:Internal Python error in the inspect module.\n","Below is the traceback from this internal error.\n","\n"]},{"name":"stdout","output_type":"stream","text":["Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," ^^^^^^^^^^^\n","OSError: [Errno 107] Transport endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = 
value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," 
File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," ^^^^^^^^^^^\n","OSError: [Errno 107] Transport endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3473, in run_ast_nodes\n"," if (await self.run_code(code, result, async_=asy)):\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3575, in run_code\n"," self.showtraceback(running_compiled_code=True)\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return 
FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1124, in structured_traceback\n"," formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File 
\"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3553, in run_code\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"," File \"/tmp/ipython-input-693870770.py\", line 1, in \u003ccell line: 0\u003e\n"," get_ipython().run_line_magic('cd', '/content/cstore_suite')\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2418, in run_line_magic\n"," result = fn(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cdecorator-gen-85\u003e\", line 2, in cd\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magic.py\", line 187, in \u003clambda\u003e\n"," call = lambda f, *a, **k: f(*a, **k)\n"," ^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/magics/osm.py\", line 342, in cd\n"," oldcwd = os.getcwd()\n"," 
^^^^^^^^^^^\n","OSError: [Errno 107] Transport endpoint is not connected\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'OSError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3473, in run_ast_nodes\n"," if (await self.run_code(code, result, async_=asy)):\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3575, in run_code\n"," self.showtraceback(running_compiled_code=True)\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1124, in structured_traceback\n"," formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in 
format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3030, in _run_cell\n"," return runner(coro)\n"," ^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py\", line 78, in _pseudo_sync_runner\n"," coro.send(None)\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3257, in run_cell_async\n"," has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 3492, in run_ast_nodes\n"," self.showtraceback()\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2101, in showtraceback\n"," stb = self.InteractiveTB.structured_traceback(etype,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1367, in structured_traceback\n"," return FormattedTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File 
\"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1267, in structured_traceback\n"," return VerboseTB.structured_traceback(\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1142, in structured_traceback\n"," formatted_exceptions += self.format_exception_as_a_whole(etype, evalue, etb, lines_of_context,\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1082, in format_exception_as_a_whole\n"," last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 382, in find_recursion\n"," return len(records), 0\n"," ^^^^^^^^^^^^\n","TypeError: object of type 'NoneType' has no len()\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py\", line 2099, in showtraceback\n"," stb = value._render_traceback_()\n"," ^^^^^^^^^^^^^^^^^^^^^^^^\n","AttributeError: 'TypeError' object has no attribute '_render_traceback_'\n","\n","During handling of the above exception, another exception occurred:\n","\n","Traceback (most recent call last):\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n"," return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n"," return f(*args, **kwargs)\n"," ^^^^^^^^^^^^^^^^^^\n"," File \"/usr/local/lib/python3.12/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n"," records = 
fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1769, in getinnerframes\n"," traceback_info = getframeinfo(tb, context)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 1714, in getframeinfo\n"," filename = getsourcefile(frame) or getfile(frame)\n"," ^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 970, in getsourcefile\n"," module = getmodule(object, filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 999, in getmodule\n"," file = getabsfile(object, _filename)\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"/usr/lib/python3.12/inspect.py\", line 983, in getabsfile\n"," return os.path.normcase(os.path.abspath(_filename))\n"," ^^^^^^^^^^^^^^^^^^^^^^^^^^\n"," File \"\u003cfrozen posixpath\u003e\", line 415, in abspath\n","OSError: [Errno 107] Transport endpoint is not connected\n"]}],"source":["%cd /content/cstore_suite"]},{"cell_type":"code","execution_count":7,"id":"setup_project","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":256},"executionInfo":{"elapsed":2520,"status":"error","timestamp":1763530504418,"user":{"displayName":"์ดํ์ง","userId":"11960777193249666496"},"user_tz":-540},"id":"setup_project","outputId":"a33dfd95-9a46-477e-f6e3-c0121bd4a9e7"},"outputs":[{"name":"stdout","output_type":"stream","text":["โ
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]\n","Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]},{"ename":"OSError","evalue":"[Errno 107] Transport endpoint is not connected","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipython-input-1340184868.py\u001b[0m in \u001b[0;36m\u003ccell line: 0\u003e\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/cstore_suite\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Colab์์ ZIP ํ์ด์ ์ด ๊ฒฝ์ฐ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/cstore_suite\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Drive์ ํด๋๊ฐ ์๋ ๊ฒฝ์ฐ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---\u003e 24\u001b[0;31m \u001b[0mcandidates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# ํ์ฌ ํด๋\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mproj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mOSError\u001b[0m: [Errno 107] Transport endpoint is not connected"]}],"source":["# 1๏ธโฃ Colab / Jupyter ํ๊ฒฝ์์ cstore_suite ํ๋ก์ ํธ ํด๋ ์๋ ์ค์ \n","import os, sys\n","\n","print(\"โ
Python version:\", sys.version)\n","\n","# (์ ํ) Google Colab ์์ ์คํ ์ค์ด๋ฉด Drive ๋ง์ดํธ ์๋\n","IN_COLAB = False\n","try:\n"," from google.colab import drive # type: ignore\n"," IN_COLAB = True\n","except Exception:\n"," IN_COLAB = False\n","\n","if IN_COLAB:\n"," try:\n"," drive.mount(\"/content/drive\")\n"," except Exception as e:\n"," print(\"โ Drive ๋ง์ดํธ ์ค ๊ฒฝ๊ณ :\", e)\n","\n","# ํ๋ก์ ํธ ์์น ํ๋ณด๋ค\n","candidates = []\n","candidates.append(\"/content/cstore_suite\") # Colab์์ ZIP ํ์ด์ ์ด ๊ฒฝ์ฐ\n","candidates.append(\"/content/drive/MyDrive/cstore_suite\") # Drive์ ํด๋๊ฐ ์๋ ๊ฒฝ์ฐ\n","candidates.append(os.getcwd()) # ํ์ฌ ํด๋\n","\n","proj = None\n","for p in candidates:\n"," if os.path.isdir(p):\n"," proj = p\n"," break\n","\n","if proj is None:\n"," raise FileNotFoundError(\n"," \"cstore_suite ํ๋ก์ ํธ ํด๋๋ฅผ ์ฐพ์ง ๋ชปํ์ต๋๋ค.\\n\"\n"," \"- /content/cstore_suite ๋๋ /content/drive/MyDrive/cstore_suite ์์น๋ฅผ ํ์ธํ์ธ์.\"\n"," )\n","\n","os.chdir(proj)\n","print(\"๐ ์์
ํด๋:\", os.getcwd())\n","print(\"๐ ํฌํจ ํ์ผ:\", os.listdir())\n"]},{"cell_type":"code","execution_count":null,"id":"install_requirements","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":220},"executionInfo":{"elapsed":46,"status":"error","timestamp":1763530163826,"user":{"displayName":"์ดํ์ง","userId":"11960777193249666496"},"user_tz":-540},"id":"install_requirements","outputId":"f007b1e7-b1aa-4c08-cdc6-b65584b90950"},"outputs":[{"ename":"OSError","evalue":"[Errno 107] Transport endpoint is not connected","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipython-input-3246530550.py\u001b[0m in \u001b[0;36m\u003ccell line: 0\u003e\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubprocess\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----\u003e 4\u001b[0;31m \u001b[0mreq_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"requirements.txt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"โ requirements.txt ๊ฐ ์์ด ๊ธฐ๋ณธ ํ
ํ๋ฆฟ์ ์์ฑํฉ๋๋ค: {req_path}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mOSError\u001b[0m: [Errno 107] Transport endpoint is not connected"]}],"source":["# 2๏ธโฃ requirements.txt ๊ธฐ๋ฐ ์์กด์ฑ ์ค์น\n","import os, subprocess, sys\n","\n","req_path = os.path.join(os.getcwd(), \"requirements.txt\")\n","if not os.path.exists(req_path):\n"," print(f\"โ requirements.txt ๊ฐ ์์ด ๊ธฐ๋ณธ ํ
ํ๋ฆฟ์ ์์ฑํฉ๋๋ค: {req_path}\")\n"," with open(req_path, \"w\", encoding=\"utf-8\") as f:\n"," f.write(\"pandas==2.2.2\\n\")\n"," f.write(\"numpy==1.26.4\\n\")\n"," f.write(\"streamlit==1.39.0\\n\")\n"," f.write(\"altair\u003e=5,\u003c6\\n\")\n"," f.write(\"scikit-learn\\n\")\n"," f.write(\"pyngrok\\n\")\n"," f.write(\"cloudflared\\n\")\n"," f.write(\"xgboost\\n\")\n"," f.write(\"lightgbm\\n\")\n"," f.write(\"optuna\\n\")\n","else:\n"," print(\"๐ requirements.txt ์์น:\", req_path)\n","\n","print(\"๐ฆ ์์กด์ฑ ์ค์น ์ค... (๋ค์ ์๊ฐ์ด ๊ฑธ๋ฆด ์ ์์ต๋๋ค)\")\n","subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"-r\", req_path], check=True)\n","print(\"โ
์์กด์ฑ ์ค์น ์๋ฃ\")\n"]},{"cell_type":"code","execution_count":null,"id":"auto_train","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":16,"status":"ok","timestamp":1763097971961,"user":{"displayName":"๋ฐ์ข
ํ","userId":"17966453500002937995"},"user_tz":-540},"id":"auto_train","outputId":"c104b1cb-1aff-4ef8-99eb-e55d8477e50b"},"outputs":[{"name":"stdout","output_type":"stream","text":["โ
์ด๋ฏธ ํ์ต๋ ๋ชจ๋ธ์ด ์์ต๋๋ค: /content/drive/MyDrive/cstore_suite/models/best_model.pkl\n"]}],"source":["# 3๏ธโฃ ๋ชจ๋ธ ์๋ ํ์ต (models/best_model.pkl ์ด ์์ ๋๋ง ์ํ)\n","import os, sys, glob, subprocess\n","\n","proj = os.getcwd()\n","models_dir = os.path.join(proj, \"models\")\n","os.makedirs(models_dir, exist_ok=True)\n","best_model_path = os.path.join(models_dir, \"best_model.pkl\")\n","\n","if os.path.exists(best_model_path):\n"," print(\"โ
์ด๋ฏธ ํ์ต๋ ๋ชจ๋ธ์ด ์์ต๋๋ค:\", best_model_path)\n","else:\n"," data_dir = os.path.join(proj, \"data\")\n"," if not os.path.isdir(data_dir):\n"," raise FileNotFoundError(f\"data ํด๋๋ฅผ ์ฐพ์ง ๋ชปํ์ต๋๋ค: {data_dir}\")\n","\n"," candidates = [p for p in glob.glob(os.path.join(data_dir, \"*.csv\")) if os.path.isfile(p)]\n"," if not candidates:\n"," raise FileNotFoundError(f\"data ํด๋์์ CSV ํ์ผ์ ์ฐพ์ง ๋ชปํ์ต๋๋ค: {data_dir}\")\n","\n"," data_path = None\n"," # ์ฐ์ ์์: sample_sales.csv โ seoul_gyeonggi_with_demand.csv โ ๊ทธ ์ธ ์ฒซ ๋ฒ์งธ\n"," for name in [\"sample_sales.csv\", \"seoul_gyeonggi_with_demand.csv\"]:\n"," cand = os.path.join(data_dir, name)\n"," if os.path.exists(cand):\n"," data_path = cand\n"," break\n"," if data_path is None:\n"," data_path = candidates[0]\n","\n"," print(\"๐ ํ์ต์ ์ฌ์ฉํ ๋ฐ์ดํฐ:\", data_path)\n","\n"," cmd = [\n"," sys.executable,\n"," \"quick_train_runner.py\",\n"," \"--data\", data_path,\n"," \"--project\", proj,\n"," \"--valid_ratio\", \"0.2\",\n"," \"--use_optuna\",\n"," \"--optuna_trials\", \"10\",\n"," ]\n"," print(\"๐ ํ์ต ๋ช
๋ น:\", \" \".join(cmd))\n"," subprocess.run(cmd, check=True)\n"," if os.path.exists(best_model_path):\n"," print(\"โ
ํ์ต ๋ฐ ๋ชจ๋ธ ์ ์ฅ ์๋ฃ:\", best_model_path)\n"," else:\n"," print(\"โ quick_train_runner.py ์คํ์ ๋๋ฌ์ง๋ง best_model.pkl ์ ์ฐพ์ง ๋ชปํ์ต๋๋ค.\")\n"]},{"cell_type":"code","execution_count":null,"id":"run_streamlit","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"run_streamlit","outputId":"f98365b7-2f7a-4847-f8c3-5b2ae49a7ef7"},"outputs":[{"name":"stdout","output_type":"stream","text":["NGROK_AUTHTOKEN ์ ์
๋ ฅํ์ธ์(์์ผ๋ฉด ์ํฐ): 34Ug4uB0dodqJJVE41prai7dVdp_48R3SS6GGeSabkZFP89Xw\n","๐ ngrok ํฐ๋์ ์ฌ๋ ์ค...\n","๐ Public URL: https://debra-didactic-preculturally.ngrok-free.dev\n","โก ์ URL ์ ์ ํญ์์ ์ด๋ฉด Streamlit ์ฑ์ ์ ์ํ ์ ์์ต๋๋ค.\n","(์
์ ๋ฉ์ถ๋ฉด ํฐ๋๋ ์ข
๋ฃ๋ฉ๋๋ค.)\n","๐ Streamlit ์คํ: /usr/bin/python3 -m streamlit run app_streamlit_pro.py --server.port 8501 --server.address 0.0.0.0\n"]}],"source":["# 4๏ธโฃ Streamlit ์ฑ + ngrok ํฐ๋ ์คํ\n","# - ์
์คํ ํ ์ถ๋ ฅ๋๋ Public URL ๋ก ์ ์ํ๋ฉด ์ฑ ํ๋ฉด์ด ๋ณด์
๋๋ค.\n","\n","import os, sys, subprocess, time\n","\n","from pyngrok import ngrok, conf\n","\n","# ๊ธฐ์กด ํฐ๋ ์ ๋ฆฌ\n","try:\n"," ngrok.kill()\n","except Exception:\n"," pass\n","\n","port = 8501\n","\n","# ํ ํฐ ์ฝ๊ธฐ (ํ๊ฒฝ๋ณ์ ์ฐ์ , ์์ผ๋ฉด ์
๋ ฅ ๋ฐ๊ธฐ)\n","token = os.environ.get(\"NGROK_AUTHTOKEN\", \"\").strip()\n","if not token:\n"," try:\n"," token = input(\"NGROK_AUTHTOKEN ์ ์
๋ ฅํ์ธ์(์์ผ๋ฉด ์ํฐ): \").strip()\n"," except EOFError:\n"," token = \"\"\n","\n","if token:\n"," conf.get_default().auth_token = token\n","else:\n"," print(\"โ NGROK_AUTHTOKEN ์ด ๋น์ด ์์ต๋๋ค. ๋น์ธ์ฆ ๋ชจ๋๋ ์ ํ/์๋ฌ๊ฐ ๋ ์ ์์ต๋๋ค.\")\n","\n","# ngrok ํฐ๋ ์คํ\n","print(\"๐ ngrok ํฐ๋์ ์ฌ๋ ์ค...\")\n","tunnel = ngrok.connect(addr=f\"http://localhost:{port}\", proto=\"http\")\n","public_url = tunnel.public_url\n","print(\"๐ Public URL:\", public_url)\n","print(\"โก ์ URL ์ ์ ํญ์์ ์ด๋ฉด Streamlit ์ฑ์ ์ ์ํ ์ ์์ต๋๋ค.\")\n","print(\"(์
์ ๋ฉ์ถ๋ฉด ํฐ๋๋ ์ข
๋ฃ๋ฉ๋๋ค.)\")\n","\n","# Streamlit ์ฑ ์คํ (๋ก๊ทธ๋ ์๋์ ์ถ๋ ฅ)\n","cmd = [\n"," sys.executable,\n"," \"-m\", \"streamlit\",\n"," \"run\", \"app_streamlit_pro.py\",\n"," \"--server.port\", str(port),\n"," \"--server.address\", \"0.0.0.0\",\n","]\n","print(\"๐ Streamlit ์คํ:\", \" \".join(cmd))\n","subprocess.run(cmd, check=True)\n"]}],"metadata":{"colab":{"name":"","version":""},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.11.8"}},"nbformat":4,"nbformat_minor":5}
|
train_core.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
============================================================
|
| 5 |
+
train_core.py โ ํ์ต ํต์ฌ ๋ก์ง(์ฃผ์ ์์ฃผ ์์ธํ)
|
| 6 |
+
------------------------------------------------------------
|
| 7 |
+
์ด ํ์ผ์ ๋ค์ ์ผ์ ํด์:
|
| 8 |
+
1) ํ๊ฐ ์งํ ํจ์ ์ ์(RMSE/MAE/MAPE)
|
| 9 |
+
2) ์ฌ์ฉํ ๋ชจ๋ธ ํ๋ณด๋ค์ ๋ชจ์์ฃผ๋ ํจ์(get_candidates)
|
| 10 |
+
3) ์๊ณ์ด ๋ถํ (ํ์ต/๊ฒ์ฆ ๋๋๊ธฐ)
|
| 11 |
+
4) ๊ฐ๋จํ ์์๋ธ(SimpleEnsemble)
|
| 12 |
+
5) (์ต์
) Optuna ๋ก ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋
|
| 13 |
+
6) train_and_score: ๋ชจ๋ธ๋ค ํ์ต โ ๊ฒ์ฆ ์ฑ๋ฅ ๋น๊ต โ ๋ฒ ์คํธ ์ ํ
|
| 14 |
+
7) save_artifacts: ๋ฒ ์คํธ ๋ชจ๋ธ/๋ฆฌ๋๋ณด๋ ์ ์ฅ
|
| 15 |
+
|
| 16 |
+
โป XGBoost/LightGBM/Optuna ๋ ์ค์น๋์ด ์์ง ์์ผ๋ฉด
|
| 17 |
+
์๋์ผ๋ก ๊ฑด๋๋ฐ๋๋ก ๋ง๋ค์ด์ก์ต๋๋ค.
|
| 18 |
+
============================================================
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import pickle
|
| 23 |
+
import numpy as np
|
| 24 |
+
import pandas as pd
|
| 25 |
+
|
| 26 |
+
# ํ๊ฐ ์งํ ๊ณ์ฐ์ ์ํด scikit-learn ํจ์ ์ฌ์ฉ
|
| 27 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
| 28 |
+
|
| 29 |
+
# ๊ธฐ๋ณธ ์ ํํ๊ท/๋๋คํฌ๋ ์คํธ
|
| 30 |
+
from sklearn.linear_model import LinearRegression
|
| 31 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 32 |
+
|
| 33 |
+
# XGBoost / LightGBM ์ ์์ ์๋, ์์ ์๋ ์์ด์. (try/except)
|
| 34 |
+
try:
|
| 35 |
+
from xgboost import XGBRegressor
|
| 36 |
+
except Exception:
|
| 37 |
+
XGBRegressor = None
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from lightgbm import LGBMRegressor
|
| 41 |
+
except Exception:
|
| 42 |
+
LGBMRegressor = None
|
| 43 |
+
|
| 44 |
+
# Optuna(ํ์ดํผํ๋ผ๋ฏธํฐ ์๋ ํ์๊ธฐ)๋ ์ ํ์ฌํญ
|
| 45 |
+
try:
|
| 46 |
+
import optuna
|
| 47 |
+
except Exception:
|
| 48 |
+
optuna = None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ------------------------------------------------------------
|
| 52 |
+
# 1) ํ๊ฐ ์งํ: RMSE / MAE / MAPE
|
| 53 |
+
# ------------------------------------------------------------
|
| 54 |
+
def rmse(a, b):
    """Root Mean Squared Error between actuals *a* and predictions *b*.

    Lower is better; returns NaN for empty input.
    """
    actual, predicted = np.array(a), np.array(b)
    if not len(actual):
        return float("nan")
    return float(np.sqrt(mean_squared_error(actual, predicted)))
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def mae(a, b):
    """Mean Absolute Error between actuals *a* and predictions *b*.

    Intuition: "on average, how many units off were we?"
    Returns NaN for empty input.
    """
    actual, predicted = np.array(a), np.array(b)
    if not len(actual):
        return float("nan")
    return float(mean_absolute_error(actual, predicted))
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def mape(a, b):
    """Mean Absolute Percentage Error (in %) between actuals *a* and predictions *b*.

    A value of 10 means "off by 10% on average". Actuals equal to zero are
    replaced by 1 in the denominator to avoid division by zero.
    Returns NaN for empty input.
    """
    actual, predicted = np.array(a), np.array(b)
    if len(actual) == 0:
        return float("nan")
    # Guard the denominator: substitute 1 wherever the actual value is 0.
    safe_denom = np.where(actual == 0, 1, actual)
    relative_errors = np.abs((actual - predicted) / safe_denom)
    return float(relative_errors.mean() * 100.0)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ------------------------------------------------------------
|
| 88 |
+
# 2) ๋ชจ๋ธ ํ๋ณด๋ฅผ ๋ง๋ค์ด ์ฃผ๋ ํจ์
|
| 89 |
+
# ------------------------------------------------------------
|
| 90 |
+
def get_candidates():
    """Assemble the candidate models to try.

    Returns a list of tuples ``(name, estimator, fit_kwargs)`` where
    *fit_kwargs* is an extra keyword dict forwarded to ``fit``.

    - LinearRegression : simplest linear baseline (nothing to configure)
    - RandomForest     : tree ensemble that captures non-linear patterns
    - XGBoost/LightGBM : fast gradient boosting, included only when installed
    """
    candidates = [
        ("LinearRegression", LinearRegression(), {}),
        ("RandomForest", RandomForestRegressor(
            n_estimators=300,   # number of trees
            max_depth=None,     # unrestricted depth (less underfitting)
            random_state=42,
            n_jobs=-1,          # use every CPU core
        ), {}),
    ]

    # Optional: XGBoost, only when the import at module top succeeded.
    if XGBRegressor is not None:
        xgb = XGBRegressor(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            tree_method="hist",  # fast histogram-based split finding
            n_jobs=-1,
        )
        # {"verbose": False} is passed to fit() to silence training output.
        candidates.append(("XGBoost", xgb, {"verbose": False}))

    # Optional: LightGBM, only when available.
    if LGBMRegressor is not None:
        lgbm = LGBMRegressor(
            n_estimators=600,
            max_depth=-1,        # -1 = automatic / unrestricted
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
        )
        candidates.append(("LightGBM", lgbm, {}))

    return candidates
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ------------------------------------------------------------
|
| 143 |
+
# 3) ์๊ณ์ด ๋ถํ : ์๋ถ๋ถ(ํ์ต) / ๋ท๋ถ๋ถ(๊ฒ์ฆ)
|
| 144 |
+
# ------------------------------------------------------------
|
| 145 |
+
def time_split(X, y, valid_ratio=0.2):
    """Chronological train/validation split for time-series data.

    The earliest rows become the training set and the latest rows the
    validation set (random shuffling would leak the future into training).

    Parameters
    ----------
    X, y : sliceable sequences/arrays of equal length
    valid_ratio : float
        Fraction reserved for validation (0.2 = last 20%); at least one
        sample always goes to validation.

    Returns
    -------
    tuple
        (X_train, y_train, X_valid, y_valid)
    """
    total = len(X)
    n_valid = max(1, int(total * valid_ratio))  # validation size, minimum 1
    cut = total - n_valid                       # training size / split index
    return (X[:cut], y[:cut], X[cut:], y[cut:])
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ------------------------------------------------------------
|
| 159 |
+
# 4) ๊ฐ๋จํ ์์๋ธ: ์ฌ๋ฌ ๋ชจ๋ธ ์์ธก์ '๊ฐ์ค ํ๊ท '
|
| 160 |
+
# ------------------------------------------------------------
|
| 161 |
+
class SimpleEnsemble:
    """Weighted average of several already-fitted models' predictions.

    *weights* express relative trust (bigger = more trusted); they are
    normalized to sum to 1 at construction time. The caller passes the
    inverse validation RMSE of each model so better models weigh more.
    """

    def __init__(self, models, weights):
        self.models = models
        # Normalize so the weights sum to 1; a tiny floor on the sum
        # guards against division by zero when all weights are 0.
        total = max(np.sum(weights), 1e-9)
        self.weights = np.array(weights, dtype=float) / total

    def predict(self, X):
        """Return the weighted average of every member model's prediction."""
        stacked = np.array([mdl.predict(X) for mdl in self.models])  # (n_models, n_samples)
        # (samples, models) dot-style broadcast with (models,) -> (samples,)
        return np.sum(stacked.T * self.weights, axis=1)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ------------------------------------------------------------
|
| 179 |
+
# 5) Optuna ๋ก ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋(์ ํ)
|
| 180 |
+
# ------------------------------------------------------------
|
| 181 |
+
def _tune_with_optuna(name, base_model, X_tr, y_tr, X_va, y_va, n_trials=20):
    """Tune one model family's hyperparameters with Optuna (minimize validation RMSE).

    Parameters
    ----------
    name : str
        Model family name: "RandomForest", "XGBoost" or "LightGBM".
    base_model : object
        The original estimator. Effectively ignored: a fresh estimator is
        constructed from the best trial's parameters instead.
    X_tr, y_tr : array-like
        Training split.
    X_va, y_va : array-like
        Validation split; its RMSE is the tuning objective.
    n_trials : int
        Number of Optuna trials (more = more thorough, but slower).

    Returns
    -------
    object or None
        The best estimator refit on the training split, or None when Optuna
        is not installed or *name* is not one of the supported families.
    """
    if optuna is None:
        return None  # Optuna not installed -> silently skip tuning

    # Objective for the search: build a model from the trial's sampled
    # parameters, fit on the training split, return validation RMSE.
    def objective(trial):
        if name == "RandomForest":
            # Search ranges: rough but reasonable intervals
            n_estimators = trial.suggest_int("n_estimators", 200, 800, step=100)
            max_depth = trial.suggest_int("max_depth", 6, 24, step=2)
            m = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=-1
            )

        elif name == "XGBoost" and XGBRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 300, 900, step=100)
            max_depth = trial.suggest_int("max_depth", 4, 10)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            lam = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
            m = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=lr,
                subsample=subsample,
                colsample_bytree=colsample,
                reg_lambda=lam,
                random_state=42,
                tree_method="hist",
                n_jobs=-1
            )

        elif name == "LightGBM" and LGBMRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 400, 1400, step=200)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            num_leaves = trial.suggest_int("num_leaves", 31, 255, step=16)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            m = LGBMRegressor(
                n_estimators=n_estimators,
                learning_rate=lr,
                num_leaves=num_leaves,
                subsample=subsample,
                colsample_bytree=colsample,
                random_state=42,
                n_jobs=-1
            )
        else:
            # Unsupported model family: return a huge score so the trial
            # never wins the (minimizing) study.
            return 1e9

        # Fit, predict on validation, return RMSE (the minimized quantity).
        m.fit(X_tr, y_tr)
        p = m.predict(X_va)
        return rmse(y_va, p)

    # Run the study in minimization mode.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # Rebuild a fresh estimator from the best parameters found.
    best_params = study.best_params
    if name == "RandomForest":
        m = RandomForestRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            random_state=42,
            n_jobs=-1
        )
    elif name == "XGBoost" and XGBRegressor is not None:
        m = XGBRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            learning_rate=best_params["learning_rate"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            reg_lambda=best_params["reg_lambda"],
            random_state=42,
            tree_method="hist",
            n_jobs=-1
        )
    elif name == "LightGBM" and LGBMRegressor is not None:
        m = LGBMRegressor(
            n_estimators=best_params["n_estimators"],
            learning_rate=best_params["learning_rate"],
            num_leaves=best_params["num_leaves"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            random_state=42,
            n_jobs=-1
        )
    else:
        return None

    # Refit the best model before returning.
    # NOTE(review): only the training split (X_tr) is used here, even though
    # the original comment claimed a "full" refit -- confirm intent.
    m.fit(X_tr, y_tr)
    return m
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# ------------------------------------------------------------
|
| 297 |
+
# 6) ํ์ต & ์ฑ๋ฅ ๋น๊ต โ ๋ฒ ์คํธ ๋ชจ๋ธ ์ ํ
|
| 298 |
+
# ------------------------------------------------------------
|
| 299 |
+
def train_and_score(X, y, valid_ratio=0.2, use_optuna=False, optuna_trials=15, build_ensemble=True):
    """Train every candidate model, compare validation metrics, pick the best.

    Parameters
    ----------
    X, y : array-like
        Training data (features / target), ordered chronologically.
    valid_ratio : float
        Fraction of the tail reserved for validation (0.2 = last 20%).
    use_optuna : bool
        When True, try per-model hyperparameter tuning first.
    optuna_trials : int
        Number of tuning trials per model.
    build_ensemble : bool
        When True, also evaluate a weighted-average ensemble candidate.

    Returns
    -------
    tuple
        (best_model, leaderboard) where *best_model* is the single model or
        ensemble with the lowest validation RMSE and *leaderboard* is a
        DataFrame of per-model rmse/mae/mape sorted ascending by rmse.
    """
    # Chronological split: head = train, tail = validation.
    X_tr, y_tr, X_va, y_va = time_split(X, y, valid_ratio=valid_ratio)

    rows = []                         # per-model score rows (becomes the leaderboard)
    best = (None, None, float("inf")) # (name, model, lowest RMSE so far)
    fitted = []                       # successfully fitted (name, model) pairs
    va_preds = []                     # their validation predictions (for the ensemble)

    # Train and evaluate each candidate in turn.
    for name, mdl, fit_params in get_candidates():
        try:
            # Optionally tune first; on success the tuned model replaces
            # the default one (it will be fit again below -- redundant but
            # harmless since fitting is deterministic with fixed seeds).
            if use_optuna:
                tuned = _tune_with_optuna(name, mdl, X_tr, y_tr, X_va, y_va, n_trials=optuna_trials)
                if tuned is not None:
                    mdl = tuned

            # Fit on the training split (fit_params may silence verbosity).
            mdl.fit(X_tr, y_tr, **fit_params)

            # Predict on the validation split.
            pred = mdl.predict(X_va)

            # One leaderboard row per model.
            row = {
                "model": name,
                "rmse": rmse(y_va, pred),
                "mae": mae(y_va, pred),
                "mape": mape(y_va, pred)
            }
            rows.append(row)

            # Keep for the ensemble candidate. NOTE: fitted and va_preds
            # stay index-aligned; both are appended only after success.
            fitted.append((name, mdl))
            va_preds.append(pred)

            # Track the running best by RMSE.
            if row["rmse"] < best[2]:
                best = (name, mdl, row["rmse"])

        except Exception:
            # A single failing model must not kill the whole pipeline;
            # record NaN scores and move on.
            rows.append({"model": name, "rmse": np.nan, "mae": np.nan, "mape": np.nan})

    # ---- Optional ensemble candidate ----
    # Only attempted when at least two models trained successfully.
    if build_ensemble and len(va_preds) >= 2:
        # Weight each model by the inverse of its validation RMSE
        # (better model -> larger weight); floor guards against /0.
        rmses = [rmse(y_va, p) for p in va_preds]
        weights = [1.0 / max(r, 1e-6) for r in rmses]

        ens = SimpleEnsemble([m for _, m in fitted], weights)
        ens_pred = ens.predict(X_va)

        row = {
            "model": "Ensemble",
            "rmse": rmse(y_va, ens_pred),
            "mae": mae(y_va, ens_pred),
            "mape": mape(y_va, ens_pred)
        }
        rows.append(row)

        # The ensemble may itself become the best candidate.
        if row["rmse"] < best[2]:
            best = ("Ensemble", ens, row["rmse"])

    # Leaderboard sorted by ascending RMSE; failed (NaN) models sink to the bottom.
    lb = pd.DataFrame(rows).sort_values("rmse", na_position="last").reset_index(drop=True)

    # best[1] is the winning model object (None when every candidate failed).
    return best[1], lb
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# ------------------------------------------------------------
|
| 389 |
+
# 7) ์ฐ์ถ๋ฌผ ์ ์ฅ(๋ฒ ์คํธ ๋ชจ๋ธ/ํผ์ฒ๋ช
/๋งคํ/๋ฆฌ๋๋ณด๋)
|
| 390 |
+
# ------------------------------------------------------------
|
| 391 |
+
def save_artifacts(out_dirs, best_model, feature_names, mapping, leaderboard_df):
    """Persist the training outputs to disk, mirrored into every directory.

    Parameters
    ----------
    out_dirs : list[str]
        Target directories (e.g. ['artifacts', 'models']); each receives
        identical copies for easy recovery/sharing.
    best_model : object
        Winning model (or ensemble) chosen by ``train_and_score``.
    feature_names : list[str]
        Names of the model's input columns.
    mapping : dict
        Date/target/category column mapping needed again at prediction time.
    leaderboard_df : pd.DataFrame
        Per-model validation scores.

    Files written per directory
    ---------------------------
    - best_model.pkl      : pickled {"model", "feature_names", "mapping"}
    - leaderboard.csv     : UTF-8-SIG so Excel renders Korean text correctly
    - leaderboard.parquet : only when a parquet engine (pyarrow) is available
    """
    payload = {
        "model": best_model,
        "feature_names": feature_names,
        "mapping": mapping,
    }

    for d in out_dirs:
        os.makedirs(d, exist_ok=True)

        # 1) Model bundle. (Fix: the original opened best_model.pkl twice --
        #    a dead `with open(...): pass` that only truncated the file --
        #    before the real dump; the redundant open has been removed.)
        with open(os.path.join(d, "best_model.pkl"), "wb") as f:
            pickle.dump(payload, f)

        # 2) Leaderboard as CSV (utf-8-sig keeps Hangul intact in Excel).
        leaderboard_df.to_csv(
            os.path.join(d, "leaderboard.csv"),
            index=False,
            encoding="utf-8-sig"
        )

        # 3) Leaderboard as Parquet (optional).
        try:
            leaderboard_df.to_parquet(
                os.path.join(d, "leaderboard.parquet"),
                index=False
            )
        except Exception:
            # A missing parquet engine (pyarrow etc.) is fine;
            # the CSV above remains the canonical copy.
            pass
|
utils_io.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
utils_io.py โ ์
์ถ๋ ฅ/์ปฌ๋ผ ์๋ ๋งคํ ์ ํธ ๋ชจ์ (์์ธ ์ฃผ์)
|
| 5 |
+
|
| 6 |
+
์ด ํ์ผ์ ๋ค์ ๊ธฐ๋ฅ์ ์ ๊ณตํฉ๋๋ค.
|
| 7 |
+
1) read_csv_flexible: ์ฌ๋ฌ ์ธ์ฝ๋ฉ ํ๋ณด๋ก CSV๋ฅผ '์์ ํ๊ฒ' ์ฝ๊ธฐ
|
| 8 |
+
2) save_utf8sig : UTF-8-SIG(์์
ํธํ)๋ก CSV ์ ์ฅ
|
| 9 |
+
3) ensure_dirs : ํด๋๊ฐ ์์ผ๋ฉด ๋ง๋ค์ด ์ฃผ๊ธฐ
|
| 10 |
+
4) auto_map_columns : ๋ ์ง/ํ๊น/์ง์ญ/๋ธ๋๋/์ํ ์ปฌ๋ผ ์๋ ์ถ์
|
| 11 |
+
|
| 12 |
+
โป ์ฃผ์: ์๋ auto_map_columns()๋ ์๋ณธ ์ฝ๋์ locals() ๊ธฐ๋ฐ ์ถฉ๋ ํด๊ฒฐ์
|
| 13 |
+
'์์ ํ ๋์
๋๋ฆฌ ๊ธฐ๋ฐ'์ผ๋ก ๊ณ ์ณค์ต๋๋ค. (Python์์ locals() ์์ ์
|
| 14 |
+
ํจ์ ์ค์ฝํ์์ ๋ณด์ฅ์ด ๋์ง ์์ต๋๋ค.)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import glob
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from typing import Optional, Dict, List, Union, IO
|
| 22 |
+
|
| 23 |
+
# Candidate encodings attempted, in order, when reading a CSV:
# - utf-8-sig : UTF-8 with BOM, as Excel saves it
# - utf-8     : the general-purpose default
# - cp949/euc-kr : Korean encodings common on Windows / domestic systems
# - latin1    : last resort (decodes anything, but may mangle characters)
ENCODINGS: List[str] = ["utf-8-sig", "utf-8", "cp949", "euc-kr", "latin1"]


def read_csv_flexible(path_or_buf: Union[str, os.PathLike, IO[bytes], IO[str]]) -> pd.DataFrame:
    """Read a CSV robustly by trying several encodings in sequence.

    The first encoding that parses successfully wins. When every candidate
    fails, the last exception is re-raised (useful for debugging). Besides
    string paths, file-like objects (BytesIO, upload buffers, ...) are
    supported and rewound before each attempt when seekable.

    Parameters
    ----------
    path_or_buf : str or file-like
        CSV path, or an open file object / buffer.

    Returns
    -------
    pd.DataFrame
        The parsed data.
    """
    failure: Optional[Exception] = None
    for encoding in ENCODINGS:
        try:
            # Rewind file-like inputs so each retry starts from byte 0.
            if hasattr(path_or_buf, "seek"):
                try:
                    path_or_buf.seek(0)
                except Exception:
                    pass  # non-seekable stream: just proceed
            return pd.read_csv(path_or_buf, encoding=encoding)
        except Exception as exc:
            # Remember the failure and move on to the next encoding.
            failure = exc
    if failure is not None:
        # Every encoding failed -> surface the last error as-is.
        raise failure
    # Theoretically unreachable (ENCODINGS is non-empty); defensive retry.
    return pd.read_csv(path_or_buf)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def save_utf8sig(df: pd.DataFrame, path: str) -> None:
    """Save *df* as CSV encoded UTF-8-SIG (BOM keeps Hangul readable in Excel).

    Creates the parent directory first when one is needed.
    Fix: the original called ``os.makedirs(os.path.dirname(path))``
    unconditionally, which raises ``FileNotFoundError`` when *path* is a
    bare filename (empty dirname); the makedirs is now guarded.

    Parameters
    ----------
    df : pd.DataFrame
        Data to save.
    path : str
        Destination path, filename included.
    """
    parent = os.path.dirname(path)
    if parent:  # empty for bare filenames; os.makedirs("") would raise
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8-sig")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def ensure_dirs(*dirs: str) -> None:
    """Create every listed directory that does not exist yet.

    Accepts any number of paths; existing directories are left untouched.

    Example
    -------
    ensure_dirs("data", "artifacts", "models")
    """
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# --- Column auto-mapping helpers --------------------------------------------
|
| 100 |
+
# ํ๊ตญ์ด/์์ด๋ก ์์ฃผ ์ฐ์ด๋ ์ด ์ด๋ฆ ํ๋ณด ๋ฆฌ์คํธ
|
| 101 |
+
_CAND_DATE = ["date", "์ผ์", "๋ ์ง", "dt", "๊ธฐ์ค์ผ"]
|
| 102 |
+
_CAND_TARGET = ["qty", "sales_qty", "sales", "ํ๋งค์๋", "์๋", "demand", "target", "y"]
|
| 103 |
+
_CAND_REGION = ["region", "์ง์ ", "์ ํฌ", "๋งค์ฅ", "์ง์ญ", "์๋", "๊ด์ญ", "๊ตฌ๋ถ"]
|
| 104 |
+
_CAND_BRAND = ["brand", "๋ธ๋๋", "ํ์ฌ", "์ ์กฐ์ฌ"]
|
| 105 |
+
_CAND_ITEM = ["item", "์ํ", "ํ๋ชฉ", "sku", "์ํ๋ช
", "์ ํ๋ช
"]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _guess_col(cols: List[str], candidates: List[str]) -> Optional[str]:
|
| 109 |
+
"""
|
| 110 |
+
์ปฌ๋ผ ์ด๋ฆ ๋ชฉ๋ก(cols)์์ ํ๋ณด(candidates)์ '๊ฐ์ฅ ์ ๋ง๋' ์ปฌ๋ผ์ ์ถ์ ํฉ๋๋ค.
|
| 111 |
+
1) ์ ๋ถ ์๋ฌธ์๋ก ๋ฐ๊พผ ๋ค '์ ํํ ๊ฐ์ ์ด๋ฆ' ์ฐ์ ๋งค์นญ
|
| 112 |
+
2) ์์ผ๋ฉด 'ํฌํจ(contains)' ๋งค์นญ์ผ๋ก ์ํ ํ์
|
| 113 |
+
|
| 114 |
+
Parameters
|
| 115 |
+
----------
|
| 116 |
+
cols : List[str]
|
| 117 |
+
์ค์ ๋ฐ์ดํฐํ๋ ์์ ์ปฌ๋ผ๋ช
๋ฆฌ์คํธ
|
| 118 |
+
candidates : List[str]
|
| 119 |
+
์ฐ๋ฆฌ๊ฐ ์ฐพ๊ณ ์ถ์ ์๋ฏธ์ ํ๋ณด๋ช
๋ค
|
| 120 |
+
|
| 121 |
+
Returns
|
| 122 |
+
-------
|
| 123 |
+
Optional[str]
|
| 124 |
+
๋งค์นญ๋ ์ปฌ๋ผ๋ช
(์์ผ๋ฉด None)
|
| 125 |
+
"""
|
| 126 |
+
lower = {c.lower(): c for c in cols} # ์๋ฌธ์ โ ์๋ ์ปฌ๋ผ๋ช
๋งคํ
|
| 127 |
+
|
| 128 |
+
# (1) ์ ํ ์ผ์น ์ฐ์
|
| 129 |
+
for c in candidates:
|
| 130 |
+
if c in lower:
|
| 131 |
+
return lower[c]
|
| 132 |
+
|
| 133 |
+
# (2) ๋ถ๋ถ ํฌํจ(์ํ ๋งค์นญ)
|
| 134 |
+
for c in candidates:
|
| 135 |
+
for col in cols:
|
| 136 |
+
if c in col.lower():
|
| 137 |
+
return col
|
| 138 |
+
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def auto_map_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Auto-detect the date/target/region/brand/item columns of *df*.

    Each role is guessed independently via ``_guess_col`` (exact, then
    substring matching). When two roles land on the same column, date and
    target keep their picks and the lower-priority roles (region, brand,
    item) are re-assigned to an arbitrary not-yet-used column -- or None
    when no spare column exists. (This replaces the original ``locals()``
    mutation trick, which is not reliable inside a function scope.)

    Parameters
    ----------
    df : pd.DataFrame
        Input data.

    Returns
    -------
    Dict[str, Optional[str]]
        {'date': ..., 'target': ..., 'region': ..., 'brand': ..., 'item': ...}
        Any value may be None.
    """
    cols = list(df.columns)

    # 1) First-pass guesses, one per role.
    date = _guess_col(cols, _CAND_DATE)
    target = _guess_col(cols, _CAND_TARGET)
    region = _guess_col(cols, _CAND_REGION)
    brand = _guess_col(cols, _CAND_BRAND)
    item = _guess_col(cols, _CAND_ITEM)

    # 2) Conflict (duplicate) resolution via an explicit dict.
    picks = {
        "date": date,
        "target": target,
        "region": region,
        "brand": brand,
        "item": item,
    }

    # Check whether any column was chosen for more than one role
    # (only non-None picks can collide).
    chosen_non_null = [p for p in picks.values() if p]
    has_dup = len(set(chosen_non_null)) != len(chosen_non_null)

    if has_dup:
        # date/target are protected: they always keep their columns.
        used = set([p for p in (date, target) if p])
        # Lower-priority roles are processed in fixed order; order matters
        # because `used` grows as picks are confirmed or replaced.
        for key in ["region", "brand", "item"]:
            val = picks.get(key)
            # Collides with an already-claimed column -> find a substitute.
            if val and val in used:
                # Take the first column not yet claimed by any role.
                # NOTE(review): "first unused column" is arbitrary -- it may
                # have nothing to do with this role semantically; confirm
                # this fallback is acceptable for downstream consumers.
                replace = None
                for c in cols:
                    if c not in used and c != val:
                        replace = c
                        break
                picks[key] = replace  # stays None when nothing is free
                if replace:
                    used.add(replace)
            elif val:
                used.add(val)

    return picks
|