capstone_backend_v2 / scripts /convert_excel_to_json.py
dongchan21
Fixed LFS tracking for index file and removed unnecessary excels
c9ace58
import pandas as pd
import json
import os
import re
import sys
# βœ… μ™ΈλΆ€μ—μ„œ 파일 경둜λ₯Ό 인자둜 λ°›μŒ
if len(sys.argv) < 2:
raise ValueError("❌ Excel 파일 경둜λ₯Ό 인자둜 전달해야 ν•©λ‹ˆλ‹€. 예: python convert_excel_to_json_auto.py data/raw_excels/μƒˆνŒŒμΌ.xlsx")
EXCEL_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"
# ⚠️ μ—‘μ…€ μž„μ‹œνŒŒμΌ(~$) λ¬΄μ‹œ
if os.path.basename(EXCEL_PATH).startswith("~$"):
print(f"⏭️ μž„μ‹œνŒŒμΌ 감지됨, 처리 μƒλž΅: {EXCEL_PATH}")
sys.exit(0)
# ============================================
# 1️⃣ μ—‘μ…€ 헀더 μžλ™ 감지
# ============================================
def read_excel_auto(path):
"""μ—‘μ…€ 헀더 유무 μžλ™ 감지 ν›„ DataFrame으둜 λ‘œλ“œ"""
try:
preview = pd.read_excel(path, nrows=1, header=None)
first_row = preview.iloc[0].tolist()
str_ratio = sum(isinstance(x, str) for x in first_row) / len(first_row)
if str_ratio > 0.5:
print("βœ… 헀더 감지됨 β†’ 첫 행을 컬럼λͺ…μœΌλ‘œ μ‚¬μš©ν•©λ‹ˆλ‹€.")
df = pd.read_excel(path, header=0)
else:
print("⚠️ 헀더 μ—†μŒ β†’ μž„μ˜ 컬럼λͺ…(컬럼1, 컬럼2...) λΆ€μ—¬ν•©λ‹ˆλ‹€.")
df = pd.read_excel(path, header=None)
df.columns = [f"컬럼{i+1}" for i in range(len(df.columns))]
except Exception as e:
raise RuntimeError(f"μ—‘μ…€ λ‘œλ“œ μ‹€νŒ¨: {e}")
return df.fillna("")
df = read_excel_auto(EXCEL_PATH)
# ============================================
# 2️⃣ μ£Όμš” 컬럼 μžλ™ 탐지 (금리 / 은행λͺ… / μƒν’ˆλͺ… / κΈ°κ°„)
# ============================================
def detect_column(columns, keywords):
for col in columns:
if any(kw in str(col) for kw in keywords):
return col
return None
col_bank = detect_column(df.columns, ["κΈˆμœ΅νšŒμ‚¬", "은행", "κΈ°κ΄€"])
col_product = detect_column(df.columns, ["μƒν’ˆ", "예금", "νŽ€λ“œ", "λŒ€μΆœ"])
col_rate = detect_column(df.columns, ["금리", "이율", "수읡λ₯ "])
col_period = detect_column(df.columns, ["κΈ°κ°„", "만기", "κ°€μž…"])
# ============================================
# 3️⃣ 금리 숫자 λ³€ν™˜ ν•¨μˆ˜ (λ‚ μ§œ λ“± 였인 λ°©μ§€)
# ============================================
def parse_rate(value):
if pd.isna(value):
return None
s = str(value).strip()
# 숫자 μΆ”μΆœ
match = re.search(r"\d+(\.\d+)?", s)
if not match:
return None
num = float(match.group())
# λ‚ μ§œλ‘œ μΈμ‹λ˜λŠ” 숫자(100 이상 or 연도 ν˜•νƒœ) μ œμ™Έ
if num > 50 or "202" in s or "λ…„" in s:
return None
return num
# ============================================
# 4️⃣ 각 ν–‰(row)을 λ¬Έμž₯ ν˜•νƒœλ‘œ λ³€ν™˜
# ============================================
records = []
for _, row in df.iterrows():
text_parts = [f"{col}: {row[col]}" for col in df.columns]
combined_text = " | ".join(text_parts)
rate_val = parse_rate(row[col_rate]) if col_rate else None
meta = {
"bank": str(row[col_bank]) if col_bank else None,
"product": str(row[col_product]) if col_product else None,
"rate": rate_val,
"period": str(row[col_period]) if col_period else None,
}
records.append({
"source": os.path.basename(EXCEL_PATH),
"content": combined_text,
"meta": {k: v for k, v in meta.items() if v not in [None, ""]}
})
# ============================================
# 5️⃣ κΈ°μ‘΄ JSON 병합 및 μ €μž₯
# ============================================
if os.path.exists(JSON_PATH):
with open(JSON_PATH, "r", encoding="utf-8") as f:
old_data = json.load(f)
else:
old_data = []
source_name = os.path.basename(EXCEL_PATH)
filtered_old = [item for item in old_data if item["source"] != source_name]
new_data = filtered_old + records
os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
json.dump(new_data, f, ensure_ascii=False, indent=2)
print(f"\nβœ… 총 {len(records)}개의 행을 μ²˜λ¦¬ν–ˆμŠ΅λ‹ˆλ‹€.")
print(f"πŸ“ μ €μž₯ μœ„μΉ˜: {JSON_PATH}")
if col_rate is None:
print("⚠️ 금리 μ»¬λŸΌμ„ μžλ™μœΌλ‘œ μ°Ύμ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€. rate ν•„λ“œλŠ” None으둜 μ²˜λ¦¬λ©λ‹ˆλ‹€.")