# convert_csv_to_json_auto.py — convert a deposit-product CSV into merged JSON records.
| import pandas as pd | |
| import json | |
| import os | |
| import re | |
| import sys | |
| import chardet # ์ธ์ฝ๋ฉ ๊ฐ์ง๋ฅผ ์ํด ์ฌ์ฉ (pip install chardet) | |
# Take the input CSV path from the command line; output path is fixed.
if len(sys.argv) <= 1:
    raise ValueError(
        "โ CSV ํ์ผ ๊ฒฝ๋ก๋ฅผ ์ธ์๋ก ์ ๋ฌํด์ผ ํฉ๋๋ค. ์: python convert_csv_to_json_auto.py data/raw_data/์ํ์ผ.csv"
    )

CSV_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"
# ============================================
# 1) Auto-detect CSV encoding and header row
# ============================================
def read_csv_auto(path):
    """Load ``path`` into a DataFrame, auto-detecting encoding and header.

    The character encoding is sniffed with ``chardet`` from the first
    ~50 KB of the file (falling back to UTF-8). If more than half of the
    first row's cells parse as strings, that row is treated as the header;
    otherwise synthetic column names are assigned.

    Returns:
        pandas.DataFrame with NaN cells replaced by "".

    Raises:
        RuntimeError: if the CSV cannot be read for any reason.
    """
    try:
        # Sniff the encoding from a bounded sample; detect() may return None.
        with open(path, "rb") as f:
            detected = chardet.detect(f.read(50000))
        encoding = detected["encoding"] or "utf-8"

        # Peek at the first row only, to decide whether it is a header.
        preview = pd.read_csv(path, nrows=1, header=None, encoding=encoding)
        first_row = preview.iloc[0].tolist()
        # Guard against a zero-cell row: avoids ZeroDivisionError and falls
        # through to the "no header" branch.
        if first_row:
            str_ratio = sum(isinstance(x, str) for x in first_row) / len(first_row)
        else:
            str_ratio = 0.0

        if str_ratio > 0.5:
            print(f"โ ํค๋ ๊ฐ์ง๋จ โ ์ฒซ ํ์ ์ปฌ๋ผ๋ช ์ผ๋ก ์ฌ์ฉํฉ๋๋ค. (์ธ์ฝ๋ฉ: {encoding})")
            df = pd.read_csv(path, header=0, encoding=encoding)
        else:
            print(f"โ ๏ธ ํค๋ ์์ โ ์์ ์ปฌ๋ผ๋ช ๋ถ์ฌํฉ๋๋ค. (์ธ์ฝ๋ฉ: {encoding})")
            df = pd.read_csv(path, header=None, encoding=encoding)
            df.columns = [f"์ปฌ๋ผ{i+1}" for i in range(len(df.columns))]
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"CSV ๋ก๋ ์คํจ: {e}") from e
    return df.fillna("")
# Load the input CSV once; encoding and header handling are automatic.
df = read_csv_auto(CSV_PATH)
# ============================================
# 2) Auto-detect the key columns (rate / bank / product / period)
# ============================================
def detect_column(columns, keywords):
    """Return the first column whose name contains any keyword, else None."""
    matches = (
        col for col in columns
        if any(kw in str(col) for kw in keywords)
    )
    return next(matches, None)
# Heuristically locate the key columns by keyword substring match.
# Each result is the first matching column name, or None when absent.
col_bank = detect_column(df.columns, ["๊ธ์ตํ์ฌ", "์ํ", "๊ธฐ๊ด"])
col_product = detect_column(df.columns, ["์ํ", "์๊ธ", "ํ๋", "๋์ถ"])
col_rate = detect_column(df.columns, ["๊ธ๋ฆฌ", "์ด์จ", "์์ต๋ฅ "])
col_period = detect_column(df.columns, ["๊ธฐ๊ฐ", "๋ง๊ธฐ", "๊ฐ์ "])
# ============================================
# 3) Convert each row into a sentence-style record
# ============================================
records = []
for _, row in df.iterrows():
    # One "column: value" fragment per column, joined into a single string.
    text_parts = [f"{col}: {row[col]}" for col in df.columns]
    combined_text = " | ".join(text_parts)

    # Extract a numeric interest rate when the rate column is present and
    # non-empty. The pattern requires a leading digit: the previous
    # r"[\d.]+" could match a lone "." (or "1.2.3"), making float() raise.
    rate_val = None
    if col_rate and str(row[col_rate]).strip() != "":
        match = re.search(r"\d+(?:\.\d+)?", str(row[col_rate]))
        rate_val = float(match.group()) if match else None

    meta = {
        "bank": str(row[col_bank]) if col_bank else None,
        "product": str(row[col_product]) if col_product else None,
        "rate": rate_val,
        "period": str(row[col_period]) if col_period else None,
    }
    records.append({
        "source": os.path.basename(CSV_PATH),
        "content": combined_text,
        # Drop empty / missing metadata fields from the saved record.
        "meta": {k: v for k, v in meta.items() if v not in [None, ""]},
    })
# ============================================
# 4) Merge with any existing JSON and save
# ============================================
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        old_data = json.load(f)
else:
    old_data = []

# Replace any previously-imported records that came from this same CSV.
# .get() tolerates legacy entries that lack a "source" key (a plain
# item["source"] lookup would raise KeyError and abort the merge).
source_name = os.path.basename(CSV_PATH)
filtered_old = [item for item in old_data if item.get("source") != source_name]
new_data = filtered_old + records

# Ensure the output directory exists; dirname may be "" for a bare filename,
# which os.makedirs rejects.
out_dir = os.path.dirname(JSON_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"\nโ ์ด {len(records)}๊ฐ์ ํ์ ์ฒ๋ฆฌํ์ต๋๋ค.")
print(f"๐ ์ ์ฅ ์์น: {JSON_PATH}")
if col_rate is None:
    print("โ ๏ธ ๊ธ๋ฆฌ ์ปฌ๋ผ์ ์๋์ผ๋ก ์ฐพ์ง ๋ชปํ์ต๋๋ค. rate ํ๋๋ None์ผ๋ก ์ฒ�๋ฆฌ๋ฉ๋๋ค.")
| print("โ ๏ธ ๊ธ๋ฆฌ ์ปฌ๋ผ์ ์๋์ผ๋ก ์ฐพ์ง ๋ชปํ์ต๋๋ค. rate ํ๋๋ None์ผ๋ก ์ฒ๋ฆฌ๋ฉ๋๋ค.") | |