# capstone_backend_v2/scripts/convert_csv_to_json.py
# (GitHub page residue converted to a comment: author "dongchan21",
#  commit c9ace58 "Fixed LFS tracking for index file and removed unnecessary excels")
import pandas as pd
import json
import os
import re
import sys
import chardet # ์ธ์ฝ”๋”ฉ ๊ฐ์ง€๋ฅผ ์œ„ํ•ด ์‚ฌ์šฉ (pip install chardet)
# โœ… ์™ธ๋ถ€์—์„œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ๋ฐ›์Œ
if len(sys.argv) < 2:
raise ValueError("โŒ CSV ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ์ „๋‹ฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์˜ˆ: python convert_csv_to_json_auto.py data/raw_data/์ƒˆํŒŒ์ผ.csv")
CSV_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"
# ============================================
# 1๏ธโƒฃ CSV ์ธ์ฝ”๋”ฉ ๋ฐ ํ—ค๋” ์ž๋™ ๊ฐ์ง€
# ============================================
def read_csv_auto(path):
    """Load *path* into a DataFrame, auto-detecting encoding and header row.

    The encoding is sniffed with chardet from the first 50 KB of raw bytes.
    If the first row is mostly strings it is used as the header; otherwise
    synthetic column names ("์ปฌ๋Ÿผ1", "์ปฌ๋Ÿผ2", ...) are assigned.

    Raises:
        RuntimeError: wrapping any failure while sniffing or parsing the CSV.
    """
    try:
        # Sniff the encoding from a raw-byte sample.
        with open(path, "rb") as raw:
            detected = chardet.detect(raw.read(50000))
        encoding = detected["encoding"] or "utf-8"

        # Peek at the first row only; a mostly-string row is treated as a header.
        head = pd.read_csv(path, nrows=1, header=None, encoding=encoding)
        cells = head.iloc[0].tolist()
        str_ratio = sum(isinstance(cell, str) for cell in cells) / len(cells)

        if str_ratio > 0.5:
            print(f"โœ… ํ—ค๋” ๊ฐ์ง€๋จ โ†’ ์ฒซ ํ–‰์„ ์ปฌ๋Ÿผ๋ช…์œผ๋กœ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. (์ธ์ฝ”๋”ฉ: {encoding})")
            frame = pd.read_csv(path, header=0, encoding=encoding)
        else:
            print(f"โš ๏ธ ํ—ค๋” ์—†์Œ โ†’ ์ž„์˜ ์ปฌ๋Ÿผ๋ช… ๋ถ€์—ฌํ•ฉ๋‹ˆ๋‹ค. (์ธ์ฝ”๋”ฉ: {encoding})")
            frame = pd.read_csv(path, header=None, encoding=encoding)
            frame.columns = [f"์ปฌ๋Ÿผ{i+1}" for i in range(len(frame.columns))]
    except Exception as e:
        raise RuntimeError(f"CSV ๋กœ๋“œ ์‹คํŒจ: {e}")
    # Downstream code builds strings from every cell, so blank out NaNs.
    return frame.fillna("")
df = read_csv_auto(CSV_PATH)  # load the input CSV once, at module level
# ============================================
# 2๏ธโƒฃ Auto-detect the key columns (rate / bank / product / period)
# ============================================
def detect_column(columns, keywords):
    """Return the first column whose name contains any of *keywords*, else None."""
    hits = (col for col in columns
            if any(kw in str(col) for kw in keywords))
    return next(hits, None)
# Columns of interest, located by substring match against the header names.
col_bank = detect_column(df.columns, ["๊ธˆ์œตํšŒ์‚ฌ", "์€ํ–‰", "๊ธฐ๊ด€"])
col_product = detect_column(df.columns, ["์ƒํ’ˆ", "์˜ˆ๊ธˆ", "ํŽ€๋“œ", "๋Œ€์ถœ"])
col_rate = detect_column(df.columns, ["๊ธˆ๋ฆฌ", "์ด์œจ", "์ˆ˜์ต๋ฅ "])
col_period = detect_column(df.columns, ["๊ธฐ๊ฐ„", "๋งŒ๊ธฐ", "๊ฐ€์ž…"])

# ============================================
# 3๏ธโƒฃ Convert each row into a sentence-style record
# ============================================
# Number with an optional fractional part. The previous pattern [\d.]+ could
# match a lone "." (e.g. a rate cell of "๋ณ€๋™."), making float() raise.
# Compiled once, outside the per-row loop.
_RATE_RE = re.compile(r"\d+(?:\.\d+)?")

records = []
for _, row in df.iterrows():
    # Flatten the whole row into one "col: value | col: value" string.
    combined_text = " | ".join(f"{col}: {row[col]}" for col in df.columns)

    # Extract the first numeric token of the rate cell, if any.
    rate_val = None
    if col_rate and str(row[col_rate]).strip() != "":
        match = _RATE_RE.search(str(row[col_rate]))
        rate_val = float(match.group()) if match else None

    meta = {
        "bank": str(row[col_bank]) if col_bank else None,
        "product": str(row[col_product]) if col_product else None,
        "rate": rate_val,
        "period": str(row[col_period]) if col_period else None,
    }
    records.append({
        "source": os.path.basename(CSV_PATH),
        "content": combined_text,
        # Drop empty/missing meta fields so the JSON stays compact.
        "meta": {k: v for k, v in meta.items() if v not in [None, ""]},
    })

# ============================================
# 4๏ธโƒฃ Merge with the existing JSON file and save
# ============================================
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        old_data = json.load(f)
else:
    old_data = []

# Replace any previously imported records from the same source file.
# .get() tolerates legacy entries that lack a "source" key (plain ["source"]
# would raise KeyError and abort the whole conversion).
source_name = os.path.basename(CSV_PATH)
filtered_old = [item for item in old_data if item.get("source") != source_name]
new_data = filtered_old + records

os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"\nโœ… ์ด {len(records)}๊ฐœ์˜ ํ–‰์„ ์ฒ˜๋ฆฌํ–ˆ์Šต๋‹ˆ๋‹ค.")
print(f"๐Ÿ“ ์ €์žฅ ์œ„์น˜: {JSON_PATH}")
if col_rate is None:
    print("โš ๏ธ ๊ธˆ๋ฆฌ ์ปฌ๋Ÿผ์„ ์ž๋™์œผ๋กœ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. rate ํ•„๋“œ๋Š” None์œผ๋กœ ์ฒ˜๋ฆฌ๋ฉ๋‹ˆ๋‹ค.")