Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
-
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
-
st.set_page_config(page_title="π¦ λ°μ€λΌλ²¨(HWPX) β
|
| 7 |
-
st.title("π¦ λ°μ€λΌλ²¨ μλ μμ±κΈ° β HWPX νλ μμ μΉν
|
| 8 |
|
| 9 |
# ================== λ°μ΄ν° μ νΈ ==================
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
|
@@ -47,20 +47,28 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 47 |
return merged
|
| 48 |
|
| 49 |
# ================== μΉν μ νΈ ==================
|
| 50 |
-
# μ λμ΄ μμΌλμΉ΄λ: <hp:..> λΏ μλλΌ <hwp:..>, <h:..> λ± λͺ¨λ νμ©
|
| 51 |
FIELD_PAIR_RE_TMPL = (
|
| 52 |
r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
|
| 53 |
r'(.*?)'
|
| 54 |
r'<(?P=prefix):fieldEnd\b[^>]*/>'
|
| 55 |
)
|
| 56 |
|
| 57 |
-
# ν ν°(λ°±μ
κ²½λ‘)
|
| 58 |
TOKEN_FMT = "{{{{{key}}}}}"
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
| 62 |
|
| 63 |
-
def
|
| 64 |
if text is None: return ""
|
| 65 |
lines = str(text).replace("\r\n","\n").split("\n")
|
| 66 |
parts = []
|
|
@@ -72,11 +80,10 @@ def _run_for_list(text: str) -> str:
|
|
| 72 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 73 |
changed_any = False
|
| 74 |
|
| 75 |
-
# 1) νλμ μμ μΉν
|
| 76 |
for k, v in mapping.items():
|
| 77 |
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 78 |
-
replacement =
|
| 79 |
-
|
| 80 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 81 |
xml_new, n = pat.subn(replacement, xml)
|
| 82 |
if n:
|
|
@@ -84,21 +91,54 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
|
| 84 |
xml = xml_new
|
| 85 |
changed_any = True
|
| 86 |
|
| 87 |
-
# 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
for k, v in mapping.items():
|
| 89 |
tok = TOKEN_FMT.format(key=k)
|
| 90 |
if tok in xml:
|
| 91 |
-
rep =
|
| 92 |
xml = xml.replace(tok, rep)
|
| 93 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 94 |
changed_any = True
|
| 95 |
|
| 96 |
if changed_any:
|
| 97 |
-
dbg["
|
| 98 |
return xml
|
| 99 |
|
| 100 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 101 |
-
dbg = {"field_hits":{}, "token_hits":{}, "touched_files": []}
|
| 102 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 103 |
out_buf = io.BytesIO()
|
| 104 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
@@ -117,7 +157,8 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 117 |
try:
|
| 118 |
s = data.decode("utf-8", errors="ignore")
|
| 119 |
before = s
|
| 120 |
-
|
|
|
|
| 121 |
if s != before:
|
| 122 |
dbg["touched_files"].append(e.filename)
|
| 123 |
data = s.encode("utf-8")
|
|
@@ -132,10 +173,11 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 132 |
# ================== UI ==================
|
| 133 |
with st.expander("μ¬μ©λ²", expanded=True):
|
| 134 |
st.markdown("""
|
| 135 |
-
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
| 139 |
""")
|
| 140 |
|
| 141 |
tpl = st.file_uploader("π HWPX ν
νλ¦Ώ μ
λ‘λ", type=["hwpx"])
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
+
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
+
st.set_page_config(page_title="π¦ λ°μ€λΌλ²¨(HWPX) β μμ μΉν(νλ/ν ν°/ν
μ€νΈ)", layout="wide")
|
| 7 |
+
st.title("π¦ λ°μ€λΌλ²¨ μλ μμ±κΈ° β HWPX νλΒ·ν ν°Β·ν
μ€νΈ μ리νμμ **μμ μΉν**")
|
| 8 |
|
| 9 |
# ================== λ°μ΄ν° μ νΈ ==================
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
|
|
|
| 47 |
return merged
|
| 48 |
|
| 49 |
# ================== μΉν μ νΈ ==================
|
| 50 |
+
# 1) μ λμ΄ μμΌλμΉ΄λ: <hp:..> λΏ μλλΌ <hwp:..>, <h:..> λ± λͺ¨λ νμ©
|
| 51 |
# Regex template for one fieldBegin/fieldEnd pair whose name attribute equals
# the substituted {name}. The XML namespace prefix is captured as (?P<prefix>)
# so <hp:...>, <hwp:...>, <h:...> all match, and (?P=prefix) forces the
# closing fieldEnd tag to use the same prefix as the opening fieldBegin.
# Intended use: re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
FIELD_PAIR_RE_TMPL = (
    r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b'
    r'[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'                                  # field body (group 2), replaced wholesale
    r'<(?P=prefix):fieldEnd\b[^>]*/>'
)
|
| 56 |
|
| 57 |
+
# 2) ν ν°(λ°±μ
κ²½λ‘)
|
| 58 |
# Token template: TOKEN_FMT.format(key=k) yields the literal "{{k}}" marker
# (the quadrupled braces survive str.format as doubled braces in the output).
TOKEN_FMT = "{{{{" + "{key}" + "}}}}"
|
| 59 |
|
| 60 |
+
# 3) μμ ν
μ€νΈ μ리νμμ: <hp:run>β¦<hp:t>ν€</hp:t>β¦</hp:run> μ 체λ₯Ό κ° runλ€λ‘ κ΅μ²΄
|
| 61 |
+
# Regex template matching a whole run element whose text node is exactly the
# substituted {name} (modulo surrounding whitespace):
#   <*:run ...> <*:t ...> {name} </*:t> </*:run>
# Group 1 captures the opening run/t tags and group 3 the closing tags, with
# (?P=prefix) keeping every tag in the same namespace prefix. The entire match
# is meant to be replaced by freshly generated run elements.
# Intended use: re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
TEXT_RUN_RE_TMPL = (
    r'(<(?P<prefix>[a-zA-Z0-9_]+):run\b[^>]*>\s*(?:<(?P=prefix):t[^>]*>)\s*)'
    r'{name}'
    r'(\s*(?:</(?P=prefix):t>)\s*</(?P=prefix):run>)'
)
|
| 67 |
+
|
| 68 |
+
def _runs_plain(text: str) -> str:
|
| 69 |
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
| 70 |
|
| 71 |
+
def _runs_list(text: str) -> str:
|
| 72 |
if text is None: return ""
|
| 73 |
lines = str(text).replace("\r\n","\n").split("\n")
|
| 74 |
parts = []
|
|
|
|
| 80 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 81 |
changed_any = False
|
| 82 |
|
| 83 |
+
# 1) νλμ μμ μΉν
|
| 84 |
for k, v in mapping.items():
|
| 85 |
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 86 |
+
replacement = _runs_list(v) if is_list else _runs_plain(v)
|
|
|
|
| 87 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 88 |
xml_new, n = pat.subn(replacement, xml)
|
| 89 |
if n:
|
|
|
|
| 91 |
xml = xml_new
|
| 92 |
changed_any = True
|
| 93 |
|
| 94 |
+
# 2) μμ ν
μ€νΈ μ리νμμ μΉν (<*:t>ν€</*:t>λ₯Ό κ° runλ€λ‘)
|
| 95 |
+
for k, v in mapping.items():
|
| 96 |
+
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 97 |
+
replacement = _runs_list(v) if is_list else _runs_plain(v)
|
| 98 |
+
# μ ν μΌμΉ μΉν (곡백·κ°ν μμ΄ λ± κ·Έ ν
μ€νΈμΌ λ)
|
| 99 |
+
pat_text = re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 100 |
+
xml_new, n = pat_text.subn(replacement, xml)
|
| 101 |
+
if n:
|
| 102 |
+
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + n
|
| 103 |
+
xml = xml_new
|
| 104 |
+
changed_any = True
|
| 105 |
+
else:
|
| 106 |
+
# λΆλΆ μΌμΉ(ν run μμ λ€λ₯Έ λ¬Έμμ μμ¬ μμ λ) β <t>λ΄μ©</t>λ§ κ°μΌλ‘ κ°μλΌμ(μ€λ°κΏμ \n κ·Έλλ‘)
|
| 107 |
+
# μμ μ μν΄ 'ν run λ΄'μμλ§ μ²λ¦¬
|
| 108 |
+
pat_tnode = re.compile(
|
| 109 |
+
r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
|
| 110 |
+
re.DOTALL
|
| 111 |
+
)
|
| 112 |
+
def repl_tnode(m):
|
| 113 |
+
text_node = m.group(3)
|
| 114 |
+
if k not in text_node:
|
| 115 |
+
return m.group(0)
|
| 116 |
+
# λ¨μ ν
μ€νΈ μΉν (리μ€νΈλ©΄ μ€λ°κΏμ μ€μ lineBreakλ‘ λ°κΎΈκΈ° μ΄λ €μμ μ¬κΈ°μ plain ν
μ€νΈλ‘ μ°μ λμ
)
|
| 117 |
+
val = "" if v is None else str(v)
|
| 118 |
+
new_text = html.escape(text_node.replace(k, val))
|
| 119 |
+
return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
|
| 120 |
+
|
| 121 |
+
xml2 = pat_tnode.sub(repl_tnode, xml)
|
| 122 |
+
if xml2 != xml:
|
| 123 |
+
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
|
| 124 |
+
xml = xml2
|
| 125 |
+
changed_any = True
|
| 126 |
+
|
| 127 |
+
# 3) ν ν° μΉν
|
| 128 |
for k, v in mapping.items():
|
| 129 |
tok = TOKEN_FMT.format(key=k)
|
| 130 |
if tok in xml:
|
| 131 |
+
rep = _runs_list(v) if re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE) else html.escape("" if v is None else str(v))
|
| 132 |
xml = xml.replace(tok, rep)
|
| 133 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 134 |
changed_any = True
|
| 135 |
|
| 136 |
if changed_any:
|
| 137 |
+
dbg["touched"] = True
|
| 138 |
return xml
|
| 139 |
|
| 140 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 141 |
+
dbg = {"field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
|
| 142 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 143 |
out_buf = io.BytesIO()
|
| 144 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
|
|
| 157 |
try:
|
| 158 |
s = data.decode("utf-8", errors="ignore")
|
| 159 |
before = s
|
| 160 |
+
local_dbg = {"field_hits": dbg["field_hits"], "text_hits": dbg["text_hits"], "token_hits": dbg["token_hits"], "touched": False}
|
| 161 |
+
s = _apply_to_xml(s, mapping, local_dbg)
|
| 162 |
if s != before:
|
| 163 |
dbg["touched_files"].append(e.filename)
|
| 164 |
data = s.encode("utf-8")
|
|
|
|
| 173 |
# ================== UI ==================
|
| 174 |
with st.expander("μ¬μ©λ²", expanded=True):
|
| 175 |
st.markdown("""
|
| 176 |
+
- μ΄ μ±μ HWPXμ **λͺ¨λ XML**μμ λ€μ 3κ°μ§λ₯Ό μμλλ‘ μΉνν©λλ€.
|
| 177 |
+
1) `fieldBegin(name=ν€)` ~ `fieldEnd` **νλμ** μ 체λ₯Ό κ°μΌλ‘ κ΅μ²΄
|
| 178 |
+
2) `<*:t>ν€</*:t>` κ°μ **μμ ν
μ€νΈ μ리νμμ**λ₯Ό κ°(runλ€)μΌλ‘ κ΅μ²΄
|
| 179 |
+
3) `{{ν€}}` **ν ν°**μ κ°μΌλ‘ κ΅μ²΄
|
| 180 |
+
- λλ²κ·Έ JSONμ `field_hits`/`text_hits`/`token_hits`κ° 0μ΄λ©΄ ν
νλ¦Ώμ ν΄λΉ νμ
μ μ리νμμκ° μλ κ²μ
λλ€.
|
| 181 |
""")
|
| 182 |
|
| 183 |
tpl = st.file_uploader("π HWPX ν
νλ¦Ώ μ
λ‘λ", type=["hwpx"])
|