File size: 4,104 Bytes
c9ace58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import json
import os
import re
import sys
import chardet  # used to auto-detect the CSV file encoding (pip install chardet)

# โœ… ์™ธ๋ถ€์—์„œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ๋ฐ›์Œ
if len(sys.argv) < 2:
    raise ValueError("โŒ CSV ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ธ์ž๋กœ ์ „๋‹ฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์˜ˆ: python convert_csv_to_json_auto.py data/raw_data/์ƒˆํŒŒ์ผ.csv")

CSV_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"

# ============================================
# 1️⃣ Auto-detect CSV encoding and header row
# ============================================

def read_csv_auto(path):
    """Load a CSV into a DataFrame, auto-detecting encoding and header row.

    The encoding is guessed with chardet from the first 50 KB of raw bytes
    (falling back to UTF-8 when detection fails). The first row is treated
    as a header when more than half of its cells parse as strings;
    otherwise synthetic column names ("컬럼1", "컬럼2", ...) are assigned.

    Args:
        path: Path to the CSV file.

    Returns:
        pandas.DataFrame with NaN values replaced by empty strings.

    Raises:
        RuntimeError: If the file cannot be read or parsed.
    """
    try:
        # 🔍 Detect the encoding from a 50 KB sample of the raw bytes.
        with open(path, "rb") as f:
            encoding_info = chardet.detect(f.read(50000))
        encoding = encoding_info["encoding"] or "utf-8"

        # Peek at the first row only, to decide whether it is a header.
        preview = pd.read_csv(path, nrows=1, header=None, encoding=encoding)
        first_row = preview.iloc[0].tolist()
        # max(..., 1) guards against a zero-width row dividing by zero.
        str_ratio = sum(isinstance(x, str) for x in first_row) / max(len(first_row), 1)

        if str_ratio > 0.5:
            print(f"✅ 헤더 감지됨 → 첫 행을 컬럼명으로 사용합니다. (인코딩: {encoding})")
            df = pd.read_csv(path, header=0, encoding=encoding)
        else:
            print(f"⚠️ 헤더 없음 → 임의 컬럼명 부여합니다. (인코딩: {encoding})")
            df = pd.read_csv(path, header=None, encoding=encoding)
            df.columns = [f"컬럼{i+1}" for i in range(len(df.columns))]
    except Exception as e:
        # Chain the original exception so the failing step stays visible.
        raise RuntimeError(f"CSV 로드 실패: {e}") from e
    return df.fillna("")

# Load the input CSV once, at module import time.
df = read_csv_auto(CSV_PATH)

# ============================================
# 2️⃣ Auto-detect key columns (bank / product / rate / period)
# ============================================

def detect_column(columns, keywords):
    """Return the first column whose name contains any keyword, else None."""
    hits = (
        col for col in columns
        if any(keyword in str(col) for keyword in keywords)
    )
    return next(hits, None)

col_bank = detect_column(df.columns, ["๊ธˆ์œตํšŒ์‚ฌ", "์€ํ–‰", "๊ธฐ๊ด€"])
col_product = detect_column(df.columns, ["์ƒํ’ˆ", "์˜ˆ๊ธˆ", "ํŽ€๋“œ", "๋Œ€์ถœ"])
col_rate = detect_column(df.columns, ["๊ธˆ๋ฆฌ", "์ด์œจ", "์ˆ˜์ต๋ฅ "])
col_period = detect_column(df.columns, ["๊ธฐ๊ฐ„", "๋งŒ๊ธฐ", "๊ฐ€์ž…"])

# ============================================
# 3️⃣ Convert each row into a sentence-style document
# ============================================

# Loop-invariant: every record carries the same source file name.
source_basename = os.path.basename(CSV_PATH)

records = []
for _, row in df.iterrows():
    # Join every cell as "column: value" so no information is lost.
    text_parts = [f"{col}: {row[col]}" for col in df.columns]
    combined_text = " | ".join(text_parts)

    # Extract the numeric interest rate. The pattern accepts "3.5", ".5",
    # or the leading number of junk like "1.2.3"; the old pattern [\d.]+
    # could match "." or "1.2.3" whole and crash float() with ValueError.
    rate_val = None
    if col_rate and str(row[col_rate]).strip() != "":
        match = re.search(r"\d+(?:\.\d+)?|\.\d+", str(row[col_rate]))
        rate_val = float(match.group()) if match else None

    meta = {
        "bank": str(row[col_bank]) if col_bank else None,
        "product": str(row[col_product]) if col_product else None,
        "rate": rate_val,
        "period": str(row[col_period]) if col_period else None,
    }

    records.append({
        "source": source_basename,
        "content": combined_text,
        # Drop empty/missing fields so consumers only see real metadata.
        "meta": {k: v for k, v in meta.items() if v not in [None, ""]},
    })

# ============================================
# 4️⃣ Merge with the existing JSON store and save
# ============================================

# Load previously converted documents, if the store already exists.
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        old_data = json.load(f)
else:
    old_data = []

# Drop earlier records that came from this same CSV so re-running the
# script on an updated file replaces rather than duplicates its rows.
# .get() tolerates legacy entries that lack a "source" key (the old
# item["source"] lookup raised KeyError on such entries).
source_name = os.path.basename(CSV_PATH)
filtered_old = [item for item in old_data if item.get("source") != source_name]
new_data = filtered_old + records

os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"\n✅ 총 {len(records)}개의 행을 처리했습니다.")
print(f"📁 저장 위치: {JSON_PATH}")
if col_rate is None:
    print("⚠️ 금리 컬럼을 자동으로 찾지 못했습니다. rate 필드는 None으로 처리됩니다.")