cstore / utils_io.py
leedami's picture
Upload 7 files
5841e58 verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
utils_io.py — I/O and automatic column-mapping utilities (with detailed comments)
This file provides the following features.
1) read_csv_flexible: read a CSV 'safely' by trying several candidate encodings
2) save_utf8sig : save a CSV as UTF-8-SIG (Excel-compatible)
3) ensure_dirs : create folders when they do not exist
4) auto_map_columns : automatically guess the date/target/region/brand/item columns
Note: auto_map_columns() below replaces the original code's locals()-based
conflict resolution with a 'safe dictionary-based' one. (In Python, modifying
locals() is not guaranteed to take effect inside function scope.)
"""
import os
import re
import glob
import pandas as pd
from typing import Optional, Dict, List, Union, IO
# 1) Candidate encodings to try, in this order, when reading a CSV file.
ENCODINGS: List[str] = [
    "utf-8-sig",  # UTF-8 with BOM; the variant Excel opens cleanly
    "utf-8",      # general-purpose default
    "cp949",      # Korean encodings common on Windows / in Korean environments
    "euc-kr",
    "latin1",     # last safety net: always decodes, but the text may be garbled
]
def read_csv_flexible(
    path_or_buf: Union[str, os.PathLike, IO[bytes], IO[str]],
    encodings: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Read a CSV file, trying several text encodings in turn.

    - The result of the first encoding that parses successfully is returned.
    - File-like inputs are rewound with ``seek(0)`` before every attempt so
      that each try starts from the beginning of the data.
    - If a stream cannot be rewound after a failed attempt, no further
      encodings are tried: the stream is already (partly) consumed, so a
      retry would parse truncated data or fail with a misleading error.

    Parameters
    ----------
    path_or_buf : str, os.PathLike or file-like object
        Path to the CSV file, or an open file object / buffer
        (e.g. ``io.BytesIO`` or an uploaded-file object).
    encodings : list of str, optional
        Encodings to try, in order. Defaults to the module-level
        ``ENCODINGS`` list.

    Returns
    -------
    pd.DataFrame
        The parsed data.

    Raises
    ------
    Exception
        The error raised by the last attempt when no encoding succeeds
        (re-raised unchanged, which keeps the original traceback useful
        for debugging).
    """
    candidates = ENCODINGS if encodings is None else encodings
    can_seek = hasattr(path_or_buf, "seek")
    is_stream = can_seek or hasattr(path_or_buf, "read")
    last_error: Optional[Exception] = None

    for attempt, enc in enumerate(candidates):
        if is_stream:
            rewound = False
            if can_seek:
                try:
                    path_or_buf.seek(0)
                    rewound = True
                except Exception:
                    # Deliberate best effort: arbitrary file-like objects may
                    # expose seek() yet refuse to perform it.
                    rewound = False
            if attempt > 0 and not rewound:
                # A previous attempt consumed the stream and it cannot be
                # rewound; surface that attempt's error instead of retrying.
                break
        try:
            return pd.read_csv(path_or_buf, encoding=enc)
        except Exception as exc:
            # Remember the failure and fall through to the next encoding.
            last_error = exc

    if last_error is not None:
        raise last_error
    # Only reached when the candidate list is empty: let pandas use its default.
    return pd.read_csv(path_or_buf)
def save_utf8sig(df: pd.DataFrame, path: str) -> None:
    """
    Save a DataFrame as a UTF-8-SIG encoded CSV file (no index column).

    - The parent directory is created first when it does not exist.
    - UTF-8 with BOM keeps Korean text from being garbled when the file is
      opened in Excel.

    Parameters
    ----------
    df : pd.DataFrame
        The data to save.
    path : str
        Destination path, including the file name. A bare file name
        (no directory part) is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8-sig")
def ensure_dirs(*dirs: str) -> None:
    """
    Create each of the given directories unless it already exists.

    Any number of paths may be passed in a single call; missing parent
    directories are created along the way.

    Example
    -------
    ensure_dirs("data", "artifacts", "models")
    """
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)
# --- Column auto-mapping helpers --------------------------------------------
# Frequently used column names (Korean and English) for each role.
# All entries are lowercase because they are compared against lowercased
# column names. The Korean entries were restored from mis-decoded text.
_CAND_DATE = ["date", "일자", "날짜", "dt", "기준일"]
_CAND_TARGET = ["qty", "sales_qty", "sales", "판매수량", "수량", "demand", "target", "y"]
_CAND_REGION = ["region", "지점", "점포", "매장", "지역", "시도", "광역", "구분"]
_CAND_BRAND = ["brand", "브랜드", "회사", "제조사"]
_CAND_ITEM = ["item", "상품", "품목", "sku", "상품명", "제품명"]
def _guess_col(cols: List[str], candidates: List[str]) -> Optional[str]:
    """
    Guess which column in ``cols`` best matches one of ``candidates``.

    1) Exact match first: column names are lowercased and compared for
       equality with each candidate, in candidate order.
    2) Otherwise a relaxed 'contains' match: the first candidate that occurs
       as a substring of a lowercased column name wins.

    Candidates are compared as given, so they should already be lowercase.
    Column labels that are not strings (e.g. integer labels) are compared
    through their ``str()`` form instead of raising.

    Parameters
    ----------
    cols : List[str]
        Column labels of the actual DataFrame.
    candidates : List[str]
        Candidate names for the meaning being looked up, in priority order.

    Returns
    -------
    Optional[str]
        The matching column label exactly as it appears in ``cols``,
        or None when nothing matches.
    """
    # Normalise every label once instead of once per candidate.
    lowered = [(str(col).lower(), col) for col in cols]
    # lowercase name -> original label; on a clash the later column wins.
    exact = dict(lowered)

    # (1) Exact match has priority.
    for cand in candidates:
        if cand in exact:
            return exact[cand]

    # (2) Relaxed substring match.
    for cand in candidates:
        for low, col in lowered:
            if cand in low:
                return col
    return None
def auto_map_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """
    Guess which columns hold the date, target, region, brand and item.

    - Each role is first guessed independently (exact match, then substring
      match) against its list of candidate names.
    - When the same column ends up chosen for more than one role, the date
      and target guesses are always kept as they are. Among region, brand
      and item, every guess that does not clash keeps its column; only the
      clashing roles are re-assigned.
    - A clashing role gets, in order of preference: an unused column that
      matches its candidate names, else the first unused column, else None.
      Columns legitimately guessed for another role are never taken.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; only its column labels are inspected.

    Returns
    -------
    Dict[str, Optional[str]]
        {'date': ..., 'target': ..., 'region': ..., 'brand': ..., 'item': ...}
        Any value may be None.
    """
    cols = list(df.columns)
    role_candidates = {
        "date": _CAND_DATE,
        "target": _CAND_TARGET,
        "region": _CAND_REGION,
        "brand": _CAND_BRAND,
        "item": _CAND_ITEM,
    }

    # 1) First-pass guess for every role.
    picks: Dict[str, Optional[str]] = {
        role: _guess_col(cols, cands) for role, cands in role_candidates.items()
    }

    # 2) Nothing to resolve unless one column was chosen more than once.
    chosen = [col for col in picks.values() if col is not None]
    if len(set(chosen)) == len(chosen):
        return picks

    # Date and target have top priority and always keep their guess.
    used = {picks[role] for role in ("date", "target") if picks[role] is not None}

    # Reserve every non-clashing lower-priority guess before re-assigning
    # anything, so a replacement can never take a column that a later role
    # guessed legitimately.
    contested = []
    for role in ("region", "brand", "item"):
        col = picks[role]
        if col is None:
            continue
        if col in used:
            contested.append(role)
        else:
            used.add(col)

    for role in contested:
        free = [col for col in cols if col not in used]
        # Prefer a free column that actually looks like this role ...
        replacement = _guess_col(free, role_candidates[role])
        # ... otherwise fall back to the first free column, if any.
        if replacement is None and free:
            replacement = free[0]
        picks[role] = replacement  # stays None when no column is left
        if replacement is not None:
            used.add(replacement)

    return picks