Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import json | |
| import os | |
| import re | |
| import sys | |
| # β μΈλΆμμ νμΌ κ²½λ‘λ₯Ό μΈμλ‘ λ°μ | |
| if len(sys.argv) < 2: | |
| raise ValueError("β Excel νμΌ κ²½λ‘λ₯Ό μΈμλ‘ μ λ¬ν΄μΌ ν©λλ€. μ: python convert_excel_to_json_auto.py data/raw_excels/μνμΌ.xlsx") | |
| EXCEL_PATH = sys.argv[1] | |
| JSON_PATH = "data/deposit_docs.json" | |
| # β οΈ μμ μμνμΌ(~$) 무μ | |
| if os.path.basename(EXCEL_PATH).startswith("~$"): | |
| print(f"βοΈ μμνμΌ κ°μ§λ¨, μ²λ¦¬ μλ΅: {EXCEL_PATH}") | |
| sys.exit(0) | |
| # ============================================ | |
| # 1οΈβ£ μμ ν€λ μλ κ°μ§ | |
| # ============================================ | |
| def read_excel_auto(path): | |
| """μμ ν€λ μ 무 μλ κ°μ§ ν DataFrameμΌλ‘ λ‘λ""" | |
| try: | |
| preview = pd.read_excel(path, nrows=1, header=None) | |
| first_row = preview.iloc[0].tolist() | |
| str_ratio = sum(isinstance(x, str) for x in first_row) / len(first_row) | |
| if str_ratio > 0.5: | |
| print("β ν€λ κ°μ§λ¨ β 첫 νμ 컬λΌλͺ μΌλ‘ μ¬μ©ν©λλ€.") | |
| df = pd.read_excel(path, header=0) | |
| else: | |
| print("β οΈ ν€λ μμ β μμ 컬λΌλͺ (컬λΌ1, 컬λΌ2...) λΆμ¬ν©λλ€.") | |
| df = pd.read_excel(path, header=None) | |
| df.columns = [f"컬λΌ{i+1}" for i in range(len(df.columns))] | |
| except Exception as e: | |
| raise RuntimeError(f"μμ λ‘λ μ€ν¨: {e}") | |
| return df.fillna("") | |
| df = read_excel_auto(EXCEL_PATH) | |
| # ============================================ | |
| # 2οΈβ£ μ£Όμ μ»¬λΌ μλ νμ§ (κΈλ¦¬ / μνλͺ / μνλͺ / κΈ°κ°) | |
| # ============================================ | |
| def detect_column(columns, keywords): | |
| for col in columns: | |
| if any(kw in str(col) for kw in keywords): | |
| return col | |
| return None | |
| col_bank = detect_column(df.columns, ["κΈμ΅νμ¬", "μν", "κΈ°κ΄"]) | |
| col_product = detect_column(df.columns, ["μν", "μκΈ", "νλ", "λμΆ"]) | |
| col_rate = detect_column(df.columns, ["κΈλ¦¬", "μ΄μ¨", "μμ΅λ₯ "]) | |
| col_period = detect_column(df.columns, ["κΈ°κ°", "λ§κΈ°", "κ°μ "]) | |
| # ============================================ | |
| # 3οΈβ£ κΈλ¦¬ μ«μ λ³ν ν¨μ (λ μ§ λ± μ€μΈ λ°©μ§) | |
| # ============================================ | |
| def parse_rate(value): | |
| if pd.isna(value): | |
| return None | |
| s = str(value).strip() | |
| # μ«μ μΆμΆ | |
| match = re.search(r"\d+(\.\d+)?", s) | |
| if not match: | |
| return None | |
| num = float(match.group()) | |
| # λ μ§λ‘ μΈμλλ μ«μ(100 μ΄μ or μ°λ νν) μ μΈ | |
| if num > 50 or "202" in s or "λ " in s: | |
| return None | |
| return num | |
| # ============================================ | |
| # 4οΈβ£ κ° ν(row)μ λ¬Έμ₯ ννλ‘ λ³ν | |
| # ============================================ | |
| records = [] | |
| for _, row in df.iterrows(): | |
| text_parts = [f"{col}: {row[col]}" for col in df.columns] | |
| combined_text = " | ".join(text_parts) | |
| rate_val = parse_rate(row[col_rate]) if col_rate else None | |
| meta = { | |
| "bank": str(row[col_bank]) if col_bank else None, | |
| "product": str(row[col_product]) if col_product else None, | |
| "rate": rate_val, | |
| "period": str(row[col_period]) if col_period else None, | |
| } | |
| records.append({ | |
| "source": os.path.basename(EXCEL_PATH), | |
| "content": combined_text, | |
| "meta": {k: v for k, v in meta.items() if v not in [None, ""]} | |
| }) | |
| # ============================================ | |
| # 5οΈβ£ κΈ°μ‘΄ JSON λ³ν© λ° μ μ₯ | |
| # ============================================ | |
| if os.path.exists(JSON_PATH): | |
| with open(JSON_PATH, "r", encoding="utf-8") as f: | |
| old_data = json.load(f) | |
| else: | |
| old_data = [] | |
| source_name = os.path.basename(EXCEL_PATH) | |
| filtered_old = [item for item in old_data if item["source"] != source_name] | |
| new_data = filtered_old + records | |
| os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True) | |
| with open(JSON_PATH, "w", encoding="utf-8") as f: | |
| json.dump(new_data, f, ensure_ascii=False, indent=2) | |
| print(f"\nβ μ΄ {len(records)}κ°μ νμ μ²λ¦¬νμ΅λλ€.") | |
| print(f"π μ μ₯ μμΉ: {JSON_PATH}") | |
| if col_rate is None: | |
| print("β οΈ κΈλ¦¬ 컬λΌμ μλμΌλ‘ μ°Ύμ§ λͺ»νμ΅λλ€. rate νλλ NoneμΌλ‘ μ²λ¦¬λ©λλ€.") | |