import json
import os
import sys
import chardet
# Take the path of the input text file from the command line.
if len(sys.argv) < 2:
    # NOTE(review): this message was restored from a mis-encoded copy of the
    # file (UTF-8 rendered as ISO-8859-7); the sample file name at the end is
    # a best-effort reconstruction -- confirm against the upstream source.
    raise ValueError(
        "❌ 텍스트 파일 경로를 인자로 전달해야 합니다. "
        "예: python convert_txt_to_json.py data/raw_txt/새파일.txt"
    )

# Input text file (first CLI argument) and the JSON store that records are
# merged into.
TXT_PATH = sys.argv[1]
JSON_PATH = "data/deposit_docs.json"
def read_txt_auto(path: str) -> str:
    """Read a text file after auto-detecting its encoding.

    The file is read as raw bytes, ``chardet.detect`` guesses the encoding,
    and the bytes are decoded with that guess. When chardet reports no
    encoding, UTF-8 is used instead.

    Args:
        path: Location of the text file to load.

    Returns:
        The decoded contents of the file.

    Raises:
        RuntimeError: If reading, detection, or decoding fails. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        with open(path, "rb") as f:
            raw_data = f.read()
        # chardet can report ``None`` for the encoding; fall back to UTF-8.
        encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
        print(f"✅ 인코딩 감지됨: {encoding}")
        return raw_data.decode(encoding)
    except Exception as e:
        # Chain explicitly so the original failure stays visible to callers.
        raise RuntimeError(f"TXT 로드 실패: {e}") from e
# Load the text file's contents.
content = read_txt_auto(TXT_PATH)

# ============================================
# Build the JSON record
# ============================================
# The text could be split into paragraphs if needed; here the whole file is
# stored as a single document under the 'content' key.
record = dict(
    source=os.path.basename(TXT_PATH),
    content=content,
    meta=dict(type="text_file", original_path=TXT_PATH),
)
# ============================================
# Merge with the existing JSON store and save
# ============================================
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        try:
            old_data = json.load(f)
        except json.JSONDecodeError:
            # Best-effort: an unparsable store is treated as empty and rebuilt.
            old_data = []
else:
    old_data = []

source_name = os.path.basename(TXT_PATH)

# Drop any earlier record produced from a file with the same name, so that
# re-running the script replaces the entry instead of duplicating it.
# NOTE(review): assumes the store is a list of dicts -- confirm no other
# writer produces a different shape.
filtered_old = [item for item in old_data if item.get("source") != source_name]

# Append the new record.
filtered_old.append(record)

# dirname is empty when JSON_PATH has no directory part, and makedirs("")
# raises, so only create the directory when there is one.
out_dir = os.path.dirname(JSON_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write to a sibling temp file and swap it in, so an interrupted run cannot
# leave a truncated store that the next run would silently reset to empty.
tmp_path = f"{JSON_PATH}.tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(filtered_old, f, ensure_ascii=False, indent=2)
os.replace(tmp_path, JSON_PATH)

# NOTE(review): the messages below were restored from a mis-encoded copy of
# the file (UTF-8 rendered as ISO-8859-7); the leading emoji on the second
# message is a best guess -- confirm against the upstream source.
print(f"\n✅ 텍스트 파일 처리 완료: {source_name}")
print(f"📁 저장 위치: {JSON_PATH}")