|
|
|
|
|
|
|
|
""" |
|
|
utils_io.py — I/O and automatic column-mapping utilities (detailed comments)

This file provides the following features:

  1) read_csv_flexible : read a CSV "safely" by trying several candidate encodings
  2) save_utf8sig      : save a CSV as UTF-8-SIG (Excel-compatible)
  3) ensure_dirs       : create folders when they do not exist
  4) auto_map_columns  : automatically guess the date/target/region/brand/item columns

Note: auto_map_columns() below replaces the original code's locals()-based
conflict resolution with a safe dictionary-based one. (In Python, modifying
locals() is not guaranteed to take effect inside a function scope.)
|
|
""" |
|
|
|
|
|
import os |
|
|
import re |
|
|
import glob |
|
|
import pandas as pd |
|
|
from typing import Optional, Dict, List, Union, IO |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Encodings attempted, in this order, when a CSV is read.
ENCODINGS: List[str] = [
    "utf-8-sig",
    "utf-8",
    "cp949",
    "euc-kr",
    "latin1",
]
|
|
|
|
|
|
|
|
def read_csv_flexible(
    path_or_buf: Union[str, os.PathLike, IO[bytes], IO[str]],
    encodings: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Read a CSV by trying several encodings in order.

    The result of the first encoding that parses successfully is returned.
    If every encoding fails, the exception raised by the last attempt is
    re-raised. Both filesystem paths and file-like objects are accepted;
    a file-like object is rewound before each attempt when it can be.

    Parameters
    ----------
    path_or_buf : str, os.PathLike or file object
        Path of the CSV file, or a file object / buffer (e.g. BytesIO).
    encodings : list of str, optional
        Encodings to try, in order. Defaults to the module-level
        ``ENCODINGS`` list when omitted.

    Returns
    -------
    pd.DataFrame
        The parsed data frame.

    Raises
    ------
    Exception
        Whatever the last attempted ``pd.read_csv`` call raised, when no
        encoding succeeded.
    """
    candidates = list(ENCODINGS if encodings is None else encodings)
    if not candidates:
        # Nothing to try explicitly: defer to pandas' default encoding.
        return pd.read_csv(path_or_buf)

    last_error: Optional[Exception] = None
    for enc in candidates:
        try:
            if hasattr(path_or_buf, "seek"):
                # Rewind so a failed earlier attempt does not leave the
                # buffer positioned mid-stream. Rewinding is deliberately
                # best-effort: a stream that cannot seek is still read.
                try:
                    path_or_buf.seek(0)
                except Exception:
                    pass
            return pd.read_csv(path_or_buf, encoding=enc)
        except Exception as exc:
            # Remember the failure and move on to the next encoding.
            last_error = exc

    # candidates is non-empty, so at least one failure was recorded.
    raise last_error  # type: ignore[misc]
|
|
|
|
|
|
|
|
def save_utf8sig(df: pd.DataFrame, path: str) -> None:
    """Save a DataFrame as a UTF-8-SIG encoded CSV, without the index.

    The parent directory of ``path`` is created first when it is missing.
    UTF-8 with a BOM is used so that Excel displays Korean text correctly.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame to save.
    path : str
        Destination path, including the file name. A bare file name
        (no directory part) is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty directory part, and os.makedirs("")
    # raises FileNotFoundError, so only create the parent when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8-sig")
|
|
|
|
|
|
|
|
def ensure_dirs(*dirs: str) -> None:
    """Create each of the given directories if it does not exist yet.

    Any number of paths can be handled in a single call; directories
    that already exist are left as they are.

    Example
    -------
    ensure_dirs("data", "artifacts", "models")
    """
    for directory in dirs:
        os.makedirs(name=directory, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_CAND_DATE = ["date", "์ผ์", "๋ ์ง", "dt", "๊ธฐ์ค์ผ"] |
|
|
_CAND_TARGET = ["qty", "sales_qty", "sales", "ํ๋งค์๋", "์๋", "demand", "target", "y"] |
|
|
_CAND_REGION = ["region", "์ง์ ", "์ ํฌ", "๋งค์ฅ", "์ง์ญ", "์๋", "๊ด์ญ", "๊ตฌ๋ถ"] |
|
|
_CAND_BRAND = ["brand", "๋ธ๋๋", "ํ์ฌ", "์ ์กฐ์ฌ"] |
|
|
_CAND_ITEM = ["item", "์ํ", "ํ๋ชฉ", "sku", "์ํ๋ช
", "์ ํ๋ช
"] |
|
|
|
|
|
|
|
|
def _guess_col(cols: List[str], candidates: List[str]) -> Optional[str]: |
|
|
""" |
|
|
์ปฌ๋ผ ์ด๋ฆ ๋ชฉ๋ก(cols)์์ ํ๋ณด(candidates)์ '๊ฐ์ฅ ์ ๋ง๋' ์ปฌ๋ผ์ ์ถ์ ํฉ๋๋ค. |
|
|
1) ์ ๋ถ ์๋ฌธ์๋ก ๋ฐ๊พผ ๋ค '์ ํํ ๊ฐ์ ์ด๋ฆ' ์ฐ์ ๋งค์นญ |
|
|
2) ์์ผ๋ฉด 'ํฌํจ(contains)' ๋งค์นญ์ผ๋ก ์ํ ํ์ |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
cols : List[str] |
|
|
์ค์ ๋ฐ์ดํฐํ๋ ์์ ์ปฌ๋ผ๋ช
๋ฆฌ์คํธ |
|
|
candidates : List[str] |
|
|
์ฐ๋ฆฌ๊ฐ ์ฐพ๊ณ ์ถ์ ์๋ฏธ์ ํ๋ณด๋ช
๋ค |
|
|
|
|
|
Returns |
|
|
------- |
|
|
Optional[str] |
|
|
๋งค์นญ๋ ์ปฌ๋ผ๋ช
(์์ผ๋ฉด None) |
|
|
""" |
|
|
lower = {c.lower(): c for c in cols} |
|
|
|
|
|
|
|
|
for c in candidates: |
|
|
if c in lower: |
|
|
return lower[c] |
|
|
|
|
|
|
|
|
for c in candidates: |
|
|
for col in cols: |
|
|
if c in col.lower(): |
|
|
return col |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def auto_map_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Guess the date/target/region/brand/item column names of ``df``.

    Each role is first guessed independently with ``_guess_col`` (exact
    match, then substring match). When the same column ends up selected
    for more than one role, conflicts are resolved as follows:

    - ``date`` and ``target`` always keep their pick;
    - ``region``, ``brand`` and ``item`` are processed in that order and a
      role keeps its pick if no higher-priority role already holds it;
    - a role that lost its column is re-guessed among the columns nobody
      holds, falling back to the first such column, or None when no
      column is left.

    Columns legitimately matched by another role are never handed out as
    a replacement, so one conflict cannot cascade into other roles.

    Parameters
    ----------
    df : pd.DataFrame
        Input data frame.

    Returns
    -------
    Dict[str, Optional[str]]
        ``{'date': ..., 'target': ..., 'region': ..., 'brand': ..., 'item': ...}``;
        any value may be None.
    """
    cols = list(df.columns)

    role_candidates = {
        "date": _CAND_DATE,
        "target": _CAND_TARGET,
        "region": _CAND_REGION,
        "brand": _CAND_BRAND,
        "item": _CAND_ITEM,
    }
    picks: Dict[str, Optional[str]] = {
        role: _guess_col(cols, cands) for role, cands in role_candidates.items()
    }

    # date/target have priority and are never reassigned.
    used = {picks[role] for role in ("date", "target") if picks[role] is not None}

    # Pass 1: let every non-conflicting secondary role claim its column
    # before any replacement is chosen, so replacements cannot steal a
    # column that a later role matched legitimately.
    conflicted: List[str] = []
    for role in ("region", "brand", "item"):
        col = picks[role]
        if col is None:
            continue
        if col in used:
            conflicted.append(role)
        else:
            used.add(col)

    # Pass 2: find a substitute for each role that lost its column.
    for role in conflicted:
        free = [c for c in cols if c not in used]
        # Prefer a free column that still matches this role's candidates.
        replacement = _guess_col(free, role_candidates[role])
        if replacement is None and free:
            # Documented fallback: take the first column nobody uses yet.
            replacement = free[0]
        picks[role] = replacement
        if replacement is not None:
            used.add(replacement)

    return picks
|
|
|