Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
-
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
-
st.set_page_config(page_title="π¦ λ°μ€λΌλ²¨(HWPX) β
|
| 7 |
-
st.title("π¦ λ°μ€λΌλ²¨ μλ μμ±κΈ° β HWPX νλ μμ μΉν
|
| 8 |
|
| 9 |
# ================== λ°μ΄ν° μ νΈ ==================
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
|
@@ -47,20 +47,28 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 47 |
return merged
|
| 48 |
|
| 49 |
# ================== μΉν μ νΈ ==================
|
| 50 |
-
# μ λμ΄ μμΌλμΉ΄λ: <hp:..> λΏ μλλΌ <hwp:..>, <h:..> λ± λͺ¨λ νμ©
|
| 51 |
FIELD_PAIR_RE_TMPL = (
|
| 52 |
r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
|
| 53 |
r'(.*?)'
|
| 54 |
r'<(?P=prefix):fieldEnd\b[^>]*/>'
|
| 55 |
)
|
| 56 |
|
| 57 |
-
# ν ν°(λ°±μ
κ²½λ‘)
|
| 58 |
TOKEN_FMT = "{{{{{key}}}}}"
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
| 62 |
|
| 63 |
-
def
|
| 64 |
if text is None: return ""
|
| 65 |
lines = str(text).replace("\r\n","\n").split("\n")
|
| 66 |
parts = []
|
|
@@ -72,11 +80,10 @@ def _run_for_list(text: str) -> str:
|
|
| 72 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 73 |
changed_any = False
|
| 74 |
|
| 75 |
-
# 1) νλμ μμ μΉν
|
| 76 |
for k, v in mapping.items():
|
| 77 |
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 78 |
-
replacement =
|
| 79 |
-
|
| 80 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 81 |
xml_new, n = pat.subn(replacement, xml)
|
| 82 |
if n:
|
|
@@ -84,21 +91,54 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
|
| 84 |
xml = xml_new
|
| 85 |
changed_any = True
|
| 86 |
|
| 87 |
-
# 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
for k, v in mapping.items():
|
| 89 |
tok = TOKEN_FMT.format(key=k)
|
| 90 |
if tok in xml:
|
| 91 |
-
rep =
|
| 92 |
xml = xml.replace(tok, rep)
|
| 93 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 94 |
changed_any = True
|
| 95 |
|
| 96 |
if changed_any:
|
| 97 |
-
dbg["
|
| 98 |
return xml
|
| 99 |
|
| 100 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 101 |
-
dbg = {"field_hits":{}, "token_hits":{}, "touched_files": []}
|
| 102 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 103 |
out_buf = io.BytesIO()
|
| 104 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
@@ -117,7 +157,8 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 117 |
try:
|
| 118 |
s = data.decode("utf-8", errors="ignore")
|
| 119 |
before = s
|
| 120 |
-
|
|
|
|
| 121 |
if s != before:
|
| 122 |
dbg["touched_files"].append(e.filename)
|
| 123 |
data = s.encode("utf-8")
|
|
@@ -132,10 +173,11 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 132 |
# ================== UI ==================
|
| 133 |
with st.expander("μ¬μ©λ²", expanded=True):
|
| 134 |
st.markdown("""
|
| 135 |
-
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
| 139 |
""")
|
| 140 |
|
| 141 |
tpl = st.file_uploader("π HWPX ν
νλ¦Ώ μ
λ‘λ", type=["hwpx"])
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
+
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
+
st.set_page_config(page_title="π¦ λ°μ€λΌλ²¨(HWPX) β μμ μΉν(νλ/ν ν°/ν
μ€νΈ)", layout="wide")
|
| 7 |
+
st.title("π¦ λ°μ€λΌλ²¨ μλ μμ±κΈ° β HWPX νλΒ·ν ν°Β·ν
μ€νΈ μ리νμμ **μμ μΉν**")
|
| 8 |
|
| 9 |
# ================== λ°μ΄ν° μ νΈ ==================
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
|
|
|
| 47 |
return merged
|
| 48 |
|
| 49 |
# ================== μΉν μ νΈ ==================
|
| 50 |
+
# 1) μ λμ΄ μμΌλμΉ΄λ: <hp:..> λΏ μλλΌ <hwp:..>, <h:..> λ± λͺ¨λ νμ©
|
| 51 |
# Regex template for one fieldBegin/fieldEnd pair whose name attribute equals
# the substituted {name}. The XML namespace prefix is captured as (?P<prefix>)
# so <hp:...>, <hwp:...>, <h:...> all match, and (?P=prefix) forces the
# closing fieldEnd tag to use the same prefix as the opening fieldBegin.
# Intended use: re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
FIELD_PAIR_RE_TMPL = (
    r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b'
    r'[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'                                  # field body (group 2), replaced wholesale
    r'<(?P=prefix):fieldEnd\b[^>]*/>'
)
|
| 56 |
|
| 57 |
+
# 2) ν ν°(λ°±μ
κ²½λ‘)
|
| 58 |
# Token template: TOKEN_FMT.format(key=k) yields the literal "{{k}}" marker
# (the quadrupled braces survive str.format as doubled braces in the output).
TOKEN_FMT = "{{{{" + "{key}" + "}}}}"
|
| 59 |
|
| 60 |
+
# 3) μμ ν
μ€νΈ μ리νμμ: <hp:run>β¦<hp:t>ν€</hp:t>β¦</hp:run> μ 체λ₯Ό κ° runλ€λ‘ κ΅μ²΄
|
| 61 |
+
# Regex template matching a whole run element whose text node is exactly the
# substituted {name} (modulo surrounding whitespace):
#   <*:run ...> <*:t ...> {name} </*:t> </*:run>
# Group 1 captures the opening run/t tags and group 3 the closing tags, with
# (?P=prefix) keeping every tag in the same namespace prefix. The entire match
# is meant to be replaced by freshly generated run elements.
# Intended use: re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
TEXT_RUN_RE_TMPL = (
    r'(<(?P<prefix>[a-zA-Z0-9_]+):run\b[^>]*>\s*(?:<(?P=prefix):t[^>]*>)\s*)'
    r'{name}'
    r'(\s*(?:</(?P=prefix):t>)\s*</(?P=prefix):run>)'
)
|
| 67 |
+
|
| 68 |
+
def _runs_plain(text: str) -> str:
|
| 69 |
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
| 70 |
|
| 71 |
+
def _runs_list(text: str) -> str:
|
| 72 |
if text is None: return ""
|
| 73 |
lines = str(text).replace("\r\n","\n").split("\n")
|
| 74 |
parts = []
|
|
|
|
| 80 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 81 |
changed_any = False
|
| 82 |
|
| 83 |
+
# 1) νλμ μμ μΉν
|
| 84 |
for k, v in mapping.items():
|
| 85 |
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 86 |
+
replacement = _runs_list(v) if is_list else _runs_plain(v)
|
|
|
|
| 87 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 88 |
xml_new, n = pat.subn(replacement, xml)
|
| 89 |
if n:
|
|
|
|
| 91 |
xml = xml_new
|
| 92 |
changed_any = True
|
| 93 |
|
| 94 |
+
# 2) μμ ν
μ€νΈ μ리νμμ μΉν (<*:t>ν€</*:t>λ₯Ό κ° runλ€λ‘)
|
| 95 |
+
for k, v in mapping.items():
|
| 96 |
+
is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
|
| 97 |
+
replacement = _runs_list(v) if is_list else _runs_plain(v)
|
| 98 |
+
# μ ν μΌμΉ μΉν (곡백·κ°ν μμ΄ λ± κ·Έ ν
μ€νΈμΌ λ)
|
| 99 |
+
pat_text = re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 100 |
+
xml_new, n = pat_text.subn(replacement, xml)
|
| 101 |
+
if n:
|
| 102 |
+
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + n
|
| 103 |
+
xml = xml_new
|
| 104 |
+
changed_any = True
|
| 105 |
+
else:
|
| 106 |
+
# λΆλΆ μΌμΉ(ν run μμ λ€λ₯Έ λ¬Έμμ μμ¬ μμ λ) β <t>λ΄μ©</t>λ§ κ°μΌλ‘ κ°μλΌμ(μ€λ°κΏμ \n κ·Έλλ‘)
|
| 107 |
+
# μμ μ μν΄ 'ν run λ΄'μμλ§ μ²λ¦¬
|
| 108 |
+
pat_tnode = re.compile(
|
| 109 |
+
r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
|
| 110 |
+
re.DOTALL
|
| 111 |
+
)
|
| 112 |
+
def repl_tnode(m):
|
| 113 |
+
text_node = m.group(3)
|
| 114 |
+
if k not in text_node:
|
| 115 |
+
return m.group(0)
|
| 116 |
+
# λ¨μ ν
μ€νΈ μΉν (리μ€νΈλ©΄ μ€λ°κΏμ μ€μ lineBreakλ‘ λ°κΎΈκΈ° μ΄λ €μμ μ¬κΈ°μ plain ν
μ€νΈλ‘ μ°μ λμ
)
|
| 117 |
+
val = "" if v is None else str(v)
|
| 118 |
+
new_text = html.escape(text_node.replace(k, val))
|
| 119 |
+
return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
|
| 120 |
+
|
| 121 |
+
xml2 = pat_tnode.sub(repl_tnode, xml)
|
| 122 |
+
if xml2 != xml:
|
| 123 |
+
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
|
| 124 |
+
xml = xml2
|
| 125 |
+
changed_any = True
|
| 126 |
+
|
| 127 |
+
# 3) ν ν° μΉν
|
| 128 |
for k, v in mapping.items():
|
| 129 |
tok = TOKEN_FMT.format(key=k)
|
| 130 |
if tok in xml:
|
| 131 |
+
rep = _runs_list(v) if re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE) else html.escape("" if v is None else str(v))
|
| 132 |
xml = xml.replace(tok, rep)
|
| 133 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 134 |
changed_any = True
|
| 135 |
|
| 136 |
if changed_any:
|
| 137 |
+
dbg["touched"] = True
|
| 138 |
return xml
|
| 139 |
|
| 140 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 141 |
+
dbg = {"field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
|
| 142 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 143 |
out_buf = io.BytesIO()
|
| 144 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
|
|
| 157 |
try:
|
| 158 |
s = data.decode("utf-8", errors="ignore")
|
| 159 |
before = s
|
| 160 |
+
local_dbg = {"field_hits": dbg["field_hits"], "text_hits": dbg["text_hits"], "token_hits": dbg["token_hits"], "touched": False}
|
| 161 |
+
s = _apply_to_xml(s, mapping, local_dbg)
|
| 162 |
if s != before:
|
| 163 |
dbg["touched_files"].append(e.filename)
|
| 164 |
data = s.encode("utf-8")
|
|
|
|
| 173 |
# ================== UI ==================
|
| 174 |
with st.expander("μ¬μ©λ²", expanded=True):
|
| 175 |
st.markdown("""
|
| 176 |
+
- μ΄ μ±μ HWPXμ **λͺ¨λ XML**μμ λ€μ 3κ°μ§λ₯Ό μμλλ‘ μΉνν©λλ€.
|
| 177 |
+
1) `fieldBegin(name=ν€)` ~ `fieldEnd` **νλμ** μ 체λ₯Ό κ°μΌλ‘ κ΅μ²΄
|
| 178 |
+
2) `<*:t>ν€</*:t>` κ°μ **μμ ν
μ€νΈ μ리νμμ**λ₯Ό κ°(runλ€)μΌλ‘ κ΅μ²΄
|
| 179 |
+
3) `{{ν€}}` **ν ν°**μ κ°μΌλ‘ κ΅μ²΄
|
| 180 |
+
- λλ²κ·Έ JSONμ `field_hits`/`text_hits`/`token_hits`κ° 0μ΄λ©΄ ν
νλ¦Ώμ ν΄λΉ νμ
μ μ리νμμκ° μλ κ²μ
λλ€.
|
| 181 |
""")
|
| 182 |
|
| 183 |
tpl = st.file_uploader("π HWPX ν
νλ¦Ώ μ
λ‘λ", type=["hwpx"])
|