Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import sys | |
| import chardet | |
# The text file to convert is supplied by the caller as the first
# command-line argument; refuse to continue when it is missing.
_cli_args = sys.argv[1:]
if not _cli_args:
    raise ValueError("β ν μ€νΈ νμΌ κ²½λ‘λ₯Ό μΈμλ‘ μ λ¬ν΄μΌ ν©λλ€. μ: python convert_txt_to_json.py data/raw_txt/μνμΌ.txt")

# Input text file (first CLI argument) and the JSON document store path.
TXT_PATH = _cli_args[0]
JSON_PATH = "data/deposit_docs.json"
def read_txt_auto(path):
    """Read a text file, auto-detecting its encoding, and return its content.

    The file is read as raw bytes, the encoding is guessed with
    ``chardet.detect`` and the bytes are decoded with that encoding.

    Args:
        path: Path of the text file to read.

    Returns:
        The decoded file content as a ``str``.

    Raises:
        RuntimeError: If the file cannot be read, detected or decoded. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # Read bytes so that detection sees the data exactly as stored.
        with open(path, "rb") as f:
            raw_data = f.read()

        encoding_info = chardet.detect(raw_data)
        # Fall back to UTF-8 when the detector reports no encoding.
        encoding = encoding_info["encoding"] or "utf-8"
        print(f"β μΈμ½λ© κ°μ§λ¨: {encoding}")
        return raw_data.decode(encoding)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks
        # and is available to callers via __cause__.
        raise RuntimeError(f"TXT λ‘λ μ€ν¨: {e}") from e
# Read the input text file (its encoding is detected automatically).
content = read_txt_auto(TXT_PATH)

# ============================================
# Build the JSON record
# ============================================
# The whole file is stored as one document under "content"; it could be
# split into paragraphs instead if that is ever needed.
source_name = os.path.basename(TXT_PATH)
record = {
    "source": source_name,
    "content": content,
    "meta": {
        "type": "text_file",
        "original_path": TXT_PATH,
    },
}

# ============================================
# Merge with the existing JSON and save
# ============================================
old_data = []
if os.path.exists(JSON_PATH):
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        try:
            old_data = json.load(f)
        except json.JSONDecodeError:
            # Best effort: an unparsable store is treated as empty.
            old_data = []

# Valid JSON that is not a list (e.g. a single object) would break the
# merge below; keep it as a single entry instead of crashing or dropping it.
if not isinstance(old_data, list):
    old_data = [old_data]

# Drop any earlier record produced from the same file name (avoids
# duplicates). Entries that are not dicts cannot match and are kept as-is.
filtered_old = [
    item
    for item in old_data
    if not (isinstance(item, dict) and item.get("source") == source_name)
]

# Append the new record.
filtered_old.append(record)

# os.makedirs("") raises, so only create the directory when the path has one.
json_dir = os.path.dirname(JSON_PATH)
if json_dir:
    os.makedirs(json_dir, exist_ok=True)

with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(filtered_old, f, ensure_ascii=False, indent=2)

print(f"\nβ ν μ€νΈ νμΌ μ²λ¦¬ μλ£: {source_name}")
print(f"π μ μ₯ μμΉ: {JSON_PATH}")