File size: 4,296 Bytes
c9ace58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import json
import os
import re
import sys

# โœ… ์™ธ๋ถ€์—์„œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ๋ฐ›์Œ
if len(sys.argv) < 2:
    raise ValueError("โŒ Excel ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ์ „๋‹ฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์˜ˆ: python convert_excel_to_json_auto.py data/raw_excels/์ƒˆํŒŒ์ผ.xlsx")

EXCEL_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"

# โš ๏ธ ์—‘์…€ ์ž„์‹œํŒŒ์ผ(~$) ๋ฌด์‹œ
if os.path.basename(EXCEL_PATH).startswith("~$"):
    print(f"โญ๏ธ ์ž„์‹œํŒŒ์ผ ๊ฐ์ง€๋จ, ์ฒ˜๋ฆฌ ์ƒ๋žต: {EXCEL_PATH}")
    sys.exit(0)

# ============================================
# 1๏ธโƒฃ ์—‘์…€ ํ—ค๋” ์ž๋™ ๊ฐ์ง€
# ============================================

def read_excel_auto(path):
    """์—‘์…€ ํ—ค๋” ์œ ๋ฌด ์ž๋™ ๊ฐ์ง€ ํ›„ DataFrame์œผ๋กœ ๋กœ๋“œ"""
    try:
        preview = pd.read_excel(path, nrows=1, header=None)
        first_row = preview.iloc[0].tolist()
        str_ratio = sum(isinstance(x, str) for x in first_row) / len(first_row)

        if str_ratio > 0.5:
            print("โœ… ํ—ค๋” ๊ฐ์ง€๋จ โ†’ ์ฒซ ํ–‰์„ ์ปฌ๋Ÿผ๋ช…์œผ๋กœ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
            df = pd.read_excel(path, header=0)
        else:
            print("โš ๏ธ ํ—ค๋” ์—†์Œ โ†’ ์ž„์˜ ์ปฌ๋Ÿผ๋ช…(์ปฌ๋Ÿผ1, ์ปฌ๋Ÿผ2...) ๋ถ€์—ฌํ•ฉ๋‹ˆ๋‹ค.")
            df = pd.read_excel(path, header=None)
            df.columns = [f"์ปฌ๋Ÿผ{i+1}" for i in range(len(df.columns))]
    except Exception as e:
        raise RuntimeError(f"์—‘์…€ ๋กœ๋“œ ์‹คํŒจ: {e}")
    return df.fillna("")

df = read_excel_auto(EXCEL_PATH)

# ============================================
# 2๏ธโƒฃ ์ฃผ์š” ์ปฌ๋Ÿผ ์ž๋™ ํƒ์ง€ (๊ธˆ๋ฆฌ / ์€ํ–‰๋ช… / ์ƒํ’ˆ๋ช… / ๊ธฐ๊ฐ„)
# ============================================

def detect_column(columns, keywords):
    for col in columns:
        if any(kw in str(col) for kw in keywords):
            return col
    return None

col_bank = detect_column(df.columns, ["๊ธˆ์œตํšŒ์‚ฌ", "์€ํ–‰", "๊ธฐ๊ด€"])
col_product = detect_column(df.columns, ["์ƒํ’ˆ", "์˜ˆ๊ธˆ", "ํŽ€๋“œ", "๋Œ€์ถœ"])
col_rate = detect_column(df.columns, ["๊ธˆ๋ฆฌ", "์ด์œจ", "์ˆ˜์ต๋ฅ "])
col_period = detect_column(df.columns, ["๊ธฐ๊ฐ„", "๋งŒ๊ธฐ", "๊ฐ€์ž…"])

# ============================================
# 3๏ธโƒฃ ๊ธˆ๋ฆฌ ์ˆซ์ž ๋ณ€ํ™˜ ํ•จ์ˆ˜ (๋‚ ์งœ ๋“ฑ ์˜ค์ธ ๋ฐฉ์ง€)
# ============================================

def parse_rate(value):
    if pd.isna(value):
        return None
    s = str(value).strip()
    # ์ˆซ์ž ์ถ”์ถœ
    match = re.search(r"\d+(\.\d+)?", s)
    if not match:
        return None
    num = float(match.group())
    # ๋‚ ์งœ๋กœ ์ธ์‹๋˜๋Š” ์ˆซ์ž(100 ์ด์ƒ or ์—ฐ๋„ ํ˜•ํƒœ) ์ œ์™ธ
    if num > 50 or "202" in s or "๋…„" in s:
        return None
    return num

# ============================================
# 4๏ธโƒฃ ๊ฐ ํ–‰(row)์„ ๋ฌธ์žฅ ํ˜•ํƒœ๋กœ ๋ณ€ํ™˜
# ============================================

records = []
for _, row in df.iterrows():
    text_parts = [f"{col}: {row[col]}" for col in df.columns]
    combined_text = " | ".join(text_parts)

    rate_val = parse_rate(row[col_rate]) if col_rate else None

    meta = {
        "bank": str(row[col_bank]) if col_bank else None,
        "product": str(row[col_product]) if col_product else None,
        "rate": rate_val,
        "period": str(row[col_period]) if col_period else None,
    }

    records.append({
        "source": os.path.basename(EXCEL_PATH),
        "content": combined_text,
        "meta": {k: v for k, v in meta.items() if v not in [None, ""]}
    })

# ============================================
# 5๏ธโƒฃ ๊ธฐ์กด JSON ๋ณ‘ํ•ฉ ๋ฐ ์ €์žฅ
# ============================================

if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        old_data = json.load(f)
else:
    old_data = []

source_name = os.path.basename(EXCEL_PATH)
filtered_old = [item for item in old_data if item["source"] != source_name]
new_data = filtered_old + records

os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"\nโœ… ์ด {len(records)}๊ฐœ์˜ ํ–‰์„ ์ฒ˜๋ฆฌํ–ˆ์Šต๋‹ˆ๋‹ค.")
print(f"๐Ÿ“ ์ €์žฅ ์œ„์น˜: {JSON_PATH}")
if col_rate is None:
    print("โš ๏ธ ๊ธˆ๋ฆฌ ์ปฌ๋Ÿผ์„ ์ž๋™์œผ๋กœ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. rate ํ•„๋“œ๋Š” None์œผ๋กœ ์ฒ˜๋ฆฌ๋ฉ๋‹ˆ๋‹ค.")