Spaces:
Sleeping
Sleeping
File size: 4,104 Bytes
c9ace58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import pandas as pd
import json
import os
import re
import sys
import chardet # ์ธ์ฝ๋ฉ ๊ฐ์ง๋ฅผ ์ํด ์ฌ์ฉ (pip install chardet)
# The CSV path is supplied from outside as the first CLI argument.
# (Fails fast with a usage message when the argument is missing.)
if len(sys.argv) < 2:
    raise ValueError(
        "โ CSV ํ์ผ ๊ฒฝ๋ก๋ฅผ ์ธ์๋ก ์ ๋ฌํด์ผ ํฉ๋๋ค. ์: python convert_csv_to_json_auto.py data/raw_data/์ํ์ผ.csv"
    )

CSV_PATH = sys.argv[1]                 # input CSV with the raw product rows
JSON_PATH = "data/deposit_docs.json"   # merged output document store
# ============================================
# 1) Auto-detect CSV encoding and header row
# ============================================
def read_csv_auto(path):
    """Load a CSV into a DataFrame, auto-detecting its encoding and header.

    The encoding is guessed with chardet from the first 50 KB of raw
    bytes.  The first row is then inspected: when more than half of its
    cells are strings it is treated as a header row, otherwise synthetic
    column names are generated.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        DataFrame with every NaN replaced by an empty string.

    Raises:
        RuntimeError: If the file cannot be read or parsed; the original
            exception is attached as ``__cause__``.
    """
    try:
        # Guess the encoding from a 50 KB sample of the raw bytes.
        with open(path, "rb") as f:
            encoding_info = chardet.detect(f.read(50000))
        encoding = encoding_info["encoding"] or "utf-8"

        # Peek at only the first row to decide whether it is a header.
        preview = pd.read_csv(path, nrows=1, header=None, encoding=encoding)
        first_row = preview.iloc[0].tolist()
        str_ratio = sum(isinstance(x, str) for x in first_row) / len(first_row)

        if str_ratio > 0.5:
            print(f"โ ํค๋ ๊ฐ์ง๋จ โ ์ฒซ ํ์ ์ปฌ๋ผ๋ช์ผ๋ก ์ฌ์ฉํฉ๋๋ค. (์ธ์ฝ๋ฉ: {encoding})")
            df = pd.read_csv(path, header=0, encoding=encoding)
        else:
            print(f"โ ๏ธ ํค๋ ์์ โ ์์ ์ปฌ๋ผ๋ช๋ถ์ฌํฉ๋๋ค. (์ธ์ฝ๋ฉ: {encoding})")
            df = pd.read_csv(path, header=None, encoding=encoding)
            df.columns = [f"์ปฌ๋ผ{i+1}" for i in range(len(df.columns))]
    except Exception as e:
        # Chain the original error so the root cause stays visible.
        raise RuntimeError(f"CSV ๋ก๋ ์คํจ: {e}") from e
    return df.fillna("")
# Load the input CSV once at module import; all later steps read this frame.
df = read_csv_auto(CSV_PATH)
# ============================================
# 2) Auto-detect the key columns (bank / product / rate / period)
# ============================================
def detect_column(columns, keywords):
    """Return the first column whose name contains any keyword, else None."""
    hits = (col for col in columns if any(kw in str(col) for kw in keywords))
    return next(hits, None)
# Best-effort lookup of the key columns; each stays None when nothing matches.
col_bank = detect_column(df.columns, ["๊ธ์ตํ์ฌ", "์ํ", "๊ธฐ๊ด"])
col_product = detect_column(df.columns, ["์ํ", "์๊ธ", "ํ๋", "๋์ถ"])
col_rate = detect_column(df.columns, ["๊ธ๋ฆฌ", "์ด์จ", "์์ต๋ฅ "])
col_period = detect_column(df.columns, ["๊ธฐ๊ฐ", "๋ง๊ธฐ", "๊ฐ์"])
# ============================================
# 3) Convert each row into a text record
# ============================================
records = []
for _, row in df.iterrows():
    # Flatten the whole row into one searchable "col: value | col: value" string.
    combined_text = " | ".join(f"{col}: {row[col]}" for col in df.columns)

    # Extract the first numeric token from the rate column, if present.
    # NOTE: the old pattern r"[\d.]+" could match "." or "1.2.3", making
    # float() raise ValueError and crash the script; this pattern only
    # matches tokens float() accepts (e.g. "3.5", "4", ".5").
    rate_val = None
    if col_rate and str(row[col_rate]).strip() != "":
        match = re.search(r"\d+(?:\.\d+)?|\.\d+", str(row[col_rate]))
        if match:
            rate_val = float(match.group())

    meta = {
        "bank": str(row[col_bank]) if col_bank else None,
        "product": str(row[col_product]) if col_product else None,
        "rate": rate_val,
        "period": str(row[col_period]) if col_period else None,
    }
    records.append({
        "source": os.path.basename(CSV_PATH),
        "content": combined_text,
        # Drop empty/missing metadata so consumers only see real values.
        "meta": {k: v for k, v in meta.items() if v not in [None, ""]},
    })
# ============================================
# 4) Merge with the existing JSON store and save
# ============================================
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        old_data = json.load(f)
else:
    old_data = []

# Replace any rows previously imported from this same CSV, keep the rest.
# .get() avoids a KeyError on legacy entries that lack a "source" field.
source_name = os.path.basename(CSV_PATH)
filtered_old = [item for item in old_data if item.get("source") != source_name]
new_data = filtered_old + records

# os.makedirs("") raises, so only create the directory when there is one.
json_dir = os.path.dirname(JSON_PATH)
if json_dir:
    os.makedirs(json_dir, exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"\nโ ์ด {len(records)}๊ฐ์ ํ์ ์ฒ๋ฆฌํ์ต๋๋ค.")
print(f"๐ ์ ์ฅ ์์น: {JSON_PATH}")
if col_rate is None:
    print("โ ๏ธ ๊ธ๋ฆฌ ์ปฌ๋ผ์ ์๋์ผ๋ก ์ฐพ์ง ๋ชปํ์ต๋๋ค. rate ํ๋๋ None์ผ๋ก ์ฒ๋ฆฌ๋ฉ๋๋ค.")
|