Spaces:
Paused
Paused
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
utils_io.py — I/O and automatic column-mapping utilities (with detailed comments)

This file provides the following:
1) read_csv_flexible: read a CSV "safely" by trying several candidate encodings
2) save_utf8sig     : save a CSV as UTF-8-SIG (Excel-compatible)
3) ensure_dirs      : create folders if they do not exist
4) auto_map_columns : automatically guess the date/target/region/brand/item columns

Note: auto_map_columns() below replaces the original code's locals()-based
conflict resolution with a safe dictionary-based approach. (In Python,
modifying locals() is not guaranteed to take effect inside a function scope.)
| """ | |
| import os | |
| import re | |
| import glob | |
| import pandas as pd | |
| from typing import Optional, Dict, List, Union, IO | |
# 1) Encodings to try, in this order, when reading a CSV.
#    - utf-8-sig     : UTF-8 with a BOM (opens cleanly in Excel)
#    - utf-8         : general-purpose default
#    - cp949 / euc-kr: Korean encodings common on Windows / Korean environments
#    - latin1        : last-resort safety net; every byte decodes, but
#                      non-Latin text may come out garbled
ENCODINGS: List[str] = ["utf-8-sig", "utf-8", "cp949", "euc-kr", "latin1"]
def read_csv_flexible(
    path_or_buf: Union[str, os.PathLike, IO[bytes], IO[str]],
    encodings: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Read a CSV by trying several text encodings in turn.

    The result of the first encoding that decodes the input is returned.
    Only decoding failures (``UnicodeError``) trigger a retry with the next
    encoding; any other error (missing file, malformed CSV, ...) does not
    depend on the encoding and is raised immediately instead of being
    retried and possibly masked by a later, less relevant error.

    Parameters
    ----------
    path_or_buf : str, os.PathLike or file-like object
        CSV path, or a file object / buffer (e.g. ``BytesIO``, an uploaded
        file). Seekable buffers are rewound before every attempt.
    encodings : list of str, optional
        Encodings to try, in order. Defaults to the module-level
        ``ENCODINGS`` list.

    Returns
    -------
    pd.DataFrame
        The parsed data frame.

    Raises
    ------
    UnicodeError
        The last decoding error, when no encoding could decode the input.
    """
    if encodings is None:
        encodings = ENCODINGS

    last_error: Optional[UnicodeError] = None
    for enc in encodings:
        # File-like input: rewind so each attempt starts at the beginning.
        if hasattr(path_or_buf, "seek"):
            try:
                path_or_buf.seek(0)
            except Exception:
                # Deliberate best effort: an object may expose seek() yet not
                # support it; carry on and let read_csv report any problem.
                pass
        try:
            return pd.read_csv(path_or_buf, encoding=enc)
        except UnicodeError as exc:
            # Wrong encoding guess: remember the error and try the next one.
            last_error = exc

    if last_error is not None:
        # Every encoding failed to decode: surface the last decoding error.
        raise last_error
    # Only reached when the encoding list is empty: use pandas' default.
    return pd.read_csv(path_or_buf)
def save_utf8sig(df: pd.DataFrame, path: str) -> None:
    """
    Save a DataFrame as a UTF-8-SIG (UTF-8 with BOM) CSV, without the index.

    - The parent directory is created first if it does not exist.
    - The BOM lets Excel open the file without garbling non-ASCII text.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame to save.
    path : str
        Destination path, including the file name. A bare file name
        (no directory part) is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8-sig")
def ensure_dirs(*dirs: str) -> None:
    """
    Create every given directory (including parents) if it does not exist.

    - Several paths can be handled in one call.
    - Existing directories are left untouched.
    - Empty entries are skipped.

    Example
    -------
    ensure_dirs("data", "artifacts", "models")
    """
    for d in dirs:
        # An empty path (e.g. os.path.dirname("file.csv") == "") would make
        # os.makedirs raise FileNotFoundError; there is nothing to create.
        if not d:
            continue
        os.makedirs(d, exist_ok=True)
# --- Column auto-mapping helpers --------------------------------------------
# Column-name candidates (English and Korean) commonly used for each role.
# Entries are lowercase; within a list, earlier entries are tried first.
# NOTE(review): the Korean literals were restored from a mis-decoded (mojibake)
# copy of this file, where they could never match a real header and the same
# garbled string appeared in both the target and region lists. Confirm the
# restored names against the original source / real data headers.
_CAND_DATE: List[str] = ["date", "일자", "날짜", "dt", "기준일"]
_CAND_TARGET: List[str] = ["qty", "sales_qty", "sales", "판매수량", "수량", "demand", "target", "y"]
_CAND_REGION: List[str] = ["region", "지점", "점포", "매장", "지역", "시도", "광역", "구분"]
_CAND_BRAND: List[str] = ["brand", "브랜드", "회사", "제조사"]
_CAND_ITEM: List[str] = ["item", "상품", "품목", "sku", "상품명", "제품명"]
def _guess_col(cols: List[str], candidates: List[str]) -> Optional[str]:
    """
    Guess which column in ``cols`` best matches one of ``candidates``.

    Matching is case-insensitive and done in two passes:
    1) exact name match, trying candidates in order;
    2) if nothing matched exactly, substring ("contains") match, again in
       candidate order, scanning columns in their original order.

    Column labels are compared through ``str()``, so non-string labels
    (e.g. integer headers) are tolerated instead of raising.

    Parameters
    ----------
    cols : List[str]
        Column labels of the actual data frame.
    candidates : List[str]
        Candidate names for the meaning being looked up. Empty candidates
        are ignored (an empty string would "match" every column).

    Returns
    -------
    Optional[str]
        The matching column label exactly as it appears in ``cols``,
        or None when nothing matches.
    """
    # (lowercased text, original label) pairs, kept in column order.
    lowered = [(str(col).lower(), col) for col in cols]
    # Lowercased text -> original label; on duplicates the last label wins.
    exact = {low: col for low, col in lowered}
    wanted = [str(cand).lower() for cand in candidates if cand]

    # (1) Exact match takes priority.
    for cand in wanted:
        if cand in exact:
            return exact[cand]

    # (2) Looser substring match.
    for cand in wanted:
        for low, col in lowered:
            if cand in low:
                return col
    return None
def auto_map_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """
    Guess which columns hold the date, target, region, brand and item.

    Each role is first guessed independently with ``_guess_col`` (exact
    match, then substring match). If the same column ends up in more than
    one role:

    - ``date`` and ``target`` always keep their pick.
    - ``region``, ``brand`` and ``item`` are visited in that order. A role
      whose pick is already taken is reassigned: first to another column
      matching that role's own candidate names, otherwise to the first
      column no role has claimed, otherwise to None.

    A replacement is never taken from a column that another role currently
    holds, so resolving one collision cannot knock out a different role's
    valid pick.

    NOTE: if ``date`` and ``target`` themselves resolve to the same column,
    both keep it; that overlap is not resolved here.

    Parameters
    ----------
    df : pd.DataFrame
        Input data frame.

    Returns
    -------
    Dict[str, Optional[str]]
        {'date': ..., 'target': ..., 'region': ..., 'brand': ..., 'item': ...}
        Any value may be None.
    """
    cols = list(df.columns)

    # Lower-priority roles, in the order their collisions are resolved.
    secondary = {
        "region": _CAND_REGION,
        "brand": _CAND_BRAND,
        "item": _CAND_ITEM,
    }

    # 1) Independent first-pass guess for every role.
    picks: Dict[str, Optional[str]] = {
        "date": _guess_col(cols, _CAND_DATE),
        "target": _guess_col(cols, _CAND_TARGET),
    }
    for role, candidates in secondary.items():
        picks[role] = _guess_col(cols, candidates)

    chosen = [p for p in picks.values() if p is not None]
    if len(set(chosen)) == len(chosen):
        return picks  # No column is shared between roles.

    # 2) Collision handling: date/target are protected first.
    used = {p for p in (picks["date"], picks["target"]) if p is not None}
    for role, candidates in secondary.items():
        current = picks[role]
        if current is None:
            continue
        if current not in used:
            used.add(current)
            continue

        # Columns held by any other role are off limits, so a replacement
        # never steals a column that a later role legitimately picked.
        held = {p for r, p in picks.items() if r != role and p is not None}
        free = [c for c in cols if c not in used and c not in held]

        # Prefer a column that still looks like this role; otherwise fall
        # back to the first unclaimed column; None if nothing is left.
        replacement = _guess_col(free, candidates)
        if replacement is None and free:
            replacement = free[0]

        picks[role] = replacement
        if replacement is not None:
            used.add(replacement)
    return picks