Update app.py
Browse files
app.py
CHANGED
|
@@ -3,18 +3,16 @@ import pandas as pd
|
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
-
st.set_page_config(page_title="๐ฆ ๋ฐ์ค๋ผ๋ฒจ(HWPX) โ ์์ ์นํ", layout="wide")
|
| 7 |
-
st.title("๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ โ HWPX ํ๋ยทํ ํฐยทํ
์คํธ
|
| 8 |
|
| 9 |
-
#
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
| 11 |
s = series.astype(str).fillna("")
|
| 12 |
v = s[~s.isin(["", "0", "0000"])]
|
| 13 |
-
if v.empty:
|
| 14 |
-
return "0000-0000"
|
| 15 |
nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
|
| 16 |
-
if nums.empty:
|
| 17 |
-
return "0000-0000"
|
| 18 |
return f"{nums.min():04d}-{nums.max():04d}"
|
| 19 |
|
| 20 |
def build_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -34,68 +32,89 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 34 |
has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
|
| 35 |
lists = []
|
| 36 |
for b, g in df.groupby("๋ฐ์ค๋ฒํธ"):
|
| 37 |
-
lines = [
|
| 38 |
-
|
| 39 |
-
for _, r in g.iterrows()
|
| 40 |
-
]
|
| 41 |
lists.append({"๋ฐ์ค๋ฒํธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
|
| 42 |
list_df = pd.DataFrame(lists)
|
| 43 |
|
| 44 |
# ๋ํ ๋ฉํ
|
| 45 |
-
meta_cols = ["๋ฐ์ค๋ฒํธ",
|
| 46 |
meta_exist = [c for c in meta_cols if c in df.columns]
|
| 47 |
-
meta = (
|
| 48 |
-
|
| 49 |
-
if meta_exist
|
| 50 |
-
else pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique()})
|
| 51 |
-
)
|
| 52 |
|
| 53 |
merged = meta.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(yr, on="๋ฐ์ค๋ฒํธ", how="left")
|
| 54 |
return merged
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
# 1) ์ ๋์ด ์์ผ๋์นด๋: <hp:..> ๋ฟ ์๋๋ผ <hwp:..>, <h:..> ๋ฑ ๋ชจ๋ ํ์ฉ
|
| 58 |
FIELD_PAIR_RE_TMPL = (
|
| 59 |
-
r'<(?P<
|
| 60 |
r'(.*?)'
|
| 61 |
-
r'<(?P=
|
| 62 |
)
|
| 63 |
-
|
| 64 |
-
# 2) ํ ํฐ(๋ฐฑ์
๊ฒฝ๋ก)
|
| 65 |
TOKEN_FMT = "{{{{{key}}}}}"
|
| 66 |
|
| 67 |
-
#
|
| 68 |
-
|
| 69 |
-
r'
|
| 70 |
-
|
| 71 |
-
r'{name}'
|
| 72 |
-
r'(\s*(?:</(?P=prefix):t>)\s*</(?P=prefix):run>)'
|
| 73 |
)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
def
|
| 79 |
"""
|
| 80 |
-
|
| 81 |
-
|
| 82 |
"""
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 93 |
changed_any = False
|
| 94 |
|
| 95 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
for k, v in mapping.items():
|
| 97 |
-
|
| 98 |
-
|
|
|
|
| 99 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 100 |
xml_new, n = pat.subn(replacement, xml)
|
| 101 |
if n:
|
|
@@ -103,54 +122,42 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
|
| 103 |
xml = xml_new
|
| 104 |
changed_any = True
|
| 105 |
|
| 106 |
-
# 2) ์์ ํ
์คํธ ์๋ฆฌํ์์
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
for k, v in mapping.items():
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
| 117 |
changed_any = True
|
| 118 |
-
else:
|
| 119 |
-
# ๋ถ๋ถ ์ผ์น: ๊ฐ์ <t> ์์ ๋ค๋ฅธ ๋ฌธ์์ ์์ฌ ์์ ๋
|
| 120 |
-
pat_tnode = re.compile(
|
| 121 |
-
r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
|
| 122 |
-
re.DOTALL
|
| 123 |
-
)
|
| 124 |
-
def repl_tnode(m):
|
| 125 |
-
text_node = m.group(3)
|
| 126 |
-
if k not in text_node:
|
| 127 |
-
return m.group(0)
|
| 128 |
-
val = "" if v is None else str(v)
|
| 129 |
-
# ๋ถ๋ถ ์นํ์ ๋ฌธ๋จ ๊ตฌ์กฐ๋ฅผ ๊ฑด๋๋ฆฌ์ง ์๊ณ ๋ฌธ์์ด๋ง ๊ต์ฒด
|
| 130 |
-
new_text = html.escape(text_node.replace(k, val))
|
| 131 |
-
return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
|
| 132 |
-
|
| 133 |
-
xml2 = pat_tnode.sub(repl_tnode, xml)
|
| 134 |
-
if xml2 != xml:
|
| 135 |
-
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
|
| 136 |
-
xml = xml2
|
| 137 |
-
changed_any = True
|
| 138 |
|
| 139 |
-
# 3) ํ ํฐ ์นํ
|
| 140 |
for k, v in mapping.items():
|
|
|
|
|
|
|
| 141 |
tok = TOKEN_FMT.format(key=k)
|
| 142 |
if tok in xml:
|
| 143 |
-
|
| 144 |
-
xml = xml.replace(tok, rep)
|
| 145 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 146 |
changed_any = True
|
| 147 |
|
| 148 |
if changed_any:
|
| 149 |
-
dbg["
|
| 150 |
return xml
|
| 151 |
|
| 152 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 153 |
-
dbg = {"field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
|
| 154 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 155 |
out_buf = io.BytesIO()
|
| 156 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
@@ -158,8 +165,7 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 158 |
# mimetype ๋ฌด์์ถ + ๋งจ์
|
| 159 |
names = zin.namelist()
|
| 160 |
if "mimetype" in names:
|
| 161 |
-
zi = zipfile.ZipInfo("mimetype")
|
| 162 |
-
zi.compress_type = zipfile.ZIP_STORED
|
| 163 |
zout.writestr(zi, zin.read("mimetype"))
|
| 164 |
|
| 165 |
for e in zin.infolist():
|
|
@@ -170,33 +176,25 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
|
|
| 170 |
try:
|
| 171 |
s = data.decode("utf-8", errors="ignore")
|
| 172 |
before = s
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
"token_hits": dbg["token_hits"],
|
| 177 |
-
"touched": False
|
| 178 |
-
}
|
| 179 |
-
s = _apply_to_xml(s, mapping, local_dbg)
|
| 180 |
if s != before:
|
| 181 |
dbg["touched_files"].append(e.filename)
|
| 182 |
data = s.encode("utf-8")
|
| 183 |
except Exception:
|
| 184 |
pass
|
| 185 |
-
zi = zipfile.ZipInfo(e.filename)
|
| 186 |
-
zi.compress_type = zipfile.ZIP_DEFLATED
|
| 187 |
zout.writestr(zi, data)
|
| 188 |
|
| 189 |
zout.close(); out_buf.seek(0); zin.close()
|
| 190 |
return out_buf.getvalue(), dbg
|
| 191 |
|
| 192 |
-
#
|
| 193 |
with st.expander("์ฌ์ฉ๋ฒ", expanded=True):
|
| 194 |
st.markdown("""
|
| 195 |
-
-
|
| 196 |
-
|
| 197 |
-
2) `<*:t>ํค</*:t>` ๊ฐ์ **์์ ํ
์คํธ ์๋ฆฌํ์์** run ๊ต์ฒด
|
| 198 |
-
3) `{{ํค}}` **ํ ํฐ** ๊ต์ฒด
|
| 199 |
-
- โ๋ชฉ๋ก/์ ๋ชฉ/์
๋ฌด๋ช
โ ๊ฐ์ด ์ฌ๋ฌ ์ค์ด ๋ค์ด๊ฐ ์ ์๋ ๊ฐ์ **๊ฐ ์ค์ ๋
๋ฆฝ run + `lineBreak`**๋ก ๋ฃ์ด ๊ฒน์นจ์ ๋ฐฉ์งํฉ๋๋ค.
|
| 200 |
""")
|
| 201 |
|
| 202 |
tpl = st.file_uploader("๐ HWPX ํ
ํ๋ฆฟ ์
๋ก๋", type=["hwpx"])
|
|
@@ -208,8 +206,7 @@ if tpl and data:
|
|
| 208 |
df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
|
| 209 |
|
| 210 |
if "๋ฐ์ค๋ฒํธ" not in df.columns:
|
| 211 |
-
st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค.")
|
| 212 |
-
st.stop()
|
| 213 |
|
| 214 |
st.success("โ
์์น ๋งคํ ์๋ฃ (์์
์ธก)")
|
| 215 |
st.dataframe(df.head(10), use_container_width=True)
|
|
@@ -227,60 +224,55 @@ if tpl and data:
|
|
| 227 |
|
| 228 |
# 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ
|
| 229 |
st.subheader("๐งช 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ")
|
| 230 |
-
keys = ["๋ฐ์ค๋ฒํธ",
|
| 231 |
mapping_preview = {}
|
| 232 |
for i in range(int(n_per_page)):
|
| 233 |
if i < len(records):
|
| 234 |
r = records[i]
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
else:
|
| 239 |
-
for k in keys:
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
pd.DataFrame([{"ํค": k, "๊ฐ ์๋ถ๋ถ": str(v)[:120]} for k, v in sorted(mapping_preview.items())]),
|
| 243 |
-
use_container_width=True,
|
| 244 |
-
height=320,
|
| 245 |
-
)
|
| 246 |
|
| 247 |
if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
|
| 248 |
-
mem = io.BytesIO()
|
| 249 |
-
zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
|
| 250 |
pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
|
| 251 |
debug_all = []
|
| 252 |
|
| 253 |
for p in range(pages):
|
| 254 |
-
chunk = records[p
|
| 255 |
-
# ๋งคํ ๊ตฌ์ถ (์ ๋ชฉ == ์
๋ฌด๋ช
๋์น)
|
| 256 |
mapping = {}
|
| 257 |
for i in range(int(n_per_page)):
|
| 258 |
if i < len(chunk):
|
| 259 |
r = chunk[i]
|
| 260 |
-
mapping[f"๋ฐ์ค๋ฒํธ{i+1}"] = r.get("๋ฐ์ค๋ฒํธ",
|
| 261 |
-
mapping[f"์ข
๋ฃ์ฐ๋{i+1}"] = r.get("์์ฐ์ฐ๋",
|
| 262 |
-
mapping[f"๋ณด์กด๊ธฐ๊ฐ{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ",
|
| 263 |
-
mapping[f"๋จ์์
๋ฌด{i+1}"] = r.get("๋จ์์
๋ฌด",
|
| 264 |
-
mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ",
|
| 265 |
-
mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก",
|
| 266 |
-
title_val = r.get("์ ๋ชฉ",
|
| 267 |
mapping[f"์ ๋ชฉ{i+1}"] = title_val
|
| 268 |
-
mapping[f"์
๋ฌด๋ช
{i+1}"] = title_val
|
| 269 |
else:
|
| 270 |
-
for k in keys:
|
| 271 |
-
mapping[f"{k}{i+1}"] = ""
|
| 272 |
|
| 273 |
out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
|
| 274 |
-
debug_all.append({"page": p
|
| 275 |
-
name = "_".join([r.get("๋ฐ์ค๋ฒํธ",
|
| 276 |
zout.writestr(f"label_{name}.hwpx", out_hwpx)
|
| 277 |
|
| 278 |
-
zout.close()
|
| 279 |
-
mem.seek(0)
|
| 280 |
st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem, file_name="labels_by_page.zip", mime="application/zip")
|
| 281 |
-
st.download_button(
|
| 282 |
-
|
| 283 |
-
data=json.dumps(debug_all, ensure_ascii=False, indent=2),
|
| 284 |
-
file_name="debug.json",
|
| 285 |
-
mime="application/json",
|
| 286 |
-
)
|
|
|
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
from typing import Dict, Tuple
|
| 5 |
|
| 6 |
+
st.set_page_config(page_title="๐ฆ ๋ฐ์ค๋ผ๋ฒจ(HWPX) โ ๋ฌธ๋จ ๋จ์ ์์ ์นํ", layout="wide")
|
| 7 |
+
st.title("๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ โ HWPX ํ๋ยทํ ํฐยทํ
์คํธ ์์ ์นํ(๋ฌธ๋จ ๋จ์)")
|
| 8 |
|
| 9 |
+
# -------------------- ๋ฐ์ดํฐ ์ ํธ --------------------
|
| 10 |
def _year_range(series: pd.Series) -> str:
|
| 11 |
s = series.astype(str).fillna("")
|
| 12 |
v = s[~s.isin(["", "0", "0000"])]
|
| 13 |
+
if v.empty: return "0000-0000"
|
|
|
|
| 14 |
nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
|
| 15 |
+
if nums.empty: return "0000-0000"
|
|
|
|
| 16 |
return f"{nums.min():04d}-{nums.max():04d}"
|
| 17 |
|
| 18 |
def build_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 32 |
has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
|
| 33 |
lists = []
|
| 34 |
for b, g in df.groupby("๋ฐ์ค๋ฒํธ"):
|
| 35 |
+
lines = [f"- {r['๊ด๋ฆฌ๋ฒํธ']} {r.get('์ ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ ๋ชฉ','')}"
|
| 36 |
+
for _, r in g.iterrows()]
|
|
|
|
|
|
|
| 37 |
lists.append({"๋ฐ์ค๋ฒํธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
|
| 38 |
list_df = pd.DataFrame(lists)
|
| 39 |
|
| 40 |
# ๋ํ ๋ฉํ
|
| 41 |
+
meta_cols = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ ๋ชฉ"]
|
| 42 |
meta_exist = [c for c in meta_cols if c in df.columns]
|
| 43 |
+
meta = df.groupby("๋ฐ์ค๋ฒํธ", as_index=False).first()[meta_exist] if meta_exist \
|
| 44 |
+
else pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique()})
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
merged = meta.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(yr, on="๋ฐ์ค๋ฒํธ", how="left")
|
| 47 |
return merged
|
| 48 |
|
| 49 |
+
# -------------------- ์นํ ์ ํธ --------------------
|
|
|
|
| 50 |
# Regex template for a self-closing fieldBegin tag carrying name="<key>",
# everything up to the matching fieldEnd tag of the same namespace prefix.
# Fill in with FIELD_PAIR_RE_TMPL.format(name=re.escape(key)).
FIELD_PAIR_RE_TMPL = (
    r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'
    r'<(?P=fprefix):fieldEnd\b[^>]*/>'
)

# Literal placeholder form: TOKEN_FMT.format(key="abc") -> "{{abc}}".
TOKEN_FMT = "{{{{{key}}}}}"

# Matches one <prefix:p ...> ... </prefix:p> element.
#   pprefix - namespace prefix of the paragraph tag
#   pattrs  - raw attribute text (including its leading whitespace), or ""
#   pbody   - everything between the opening and the first closing tag
# The tag name must be exactly "p": it has to be followed by whitespace or
# ">" so that other tags beginning with "p" (e.g. <hp:pic>) are not taken
# for paragraphs, and a self-closing <prefix:p .../> is not treated as an
# opening tag.
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>(?:\s[^>]*?)?)(?<!/)>'
    r'(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL,
)
|
| 62 |
|
| 63 |
+
# ๋ฌธ๋จ ํ๋๋ฅผ ๊ฐ์ ์คํ์ผ๋ก ๋ณต์ ํด์ฃผ๋ ํฌํผ
|
| 64 |
+
def _make_para(pprefix: str, pattrs: str, text: str) -> str:
|
| 65 |
+
esc = html.escape("" if text is None else str(text))
|
| 66 |
+
return f'<{pprefix}:p{pattrs}><{pprefix}:run><{pprefix}:t>{esc}</{pprefix}:t></{pprefix}:run></{pprefix}:p>'
|
| 67 |
+
|
| 68 |
+
def _split_lines(val) -> list:
|
| 69 |
+
if val is None: return [""]
|
| 70 |
+
return str(val).replace("\r\n","\n").split("\n")
|
| 71 |
|
| 72 |
+
def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """Replace each paragraph that mentions *key* with one paragraph per line of *value*.

    A paragraph (as matched by ``PARA_RE``) counts as mentioning the key when
    its body contains any of:

    * a fieldBegin/fieldEnd pair whose ``name`` is *key*,
    * a ``<*:t>`` text node whose text contains *key*,
    * the ``{{key}}`` token.

    The whole matching paragraph is dropped and replaced by new paragraphs
    that reuse its namespace prefix and attribute text.

    Args:
        xml: XML document text to process.
        key: Placeholder name to look for.
        value: Replacement text; split into lines with ``_split_lines``.
        dbg: Debug accumulator.  ``dbg["para_hits"][key]`` is incremented per
            replaced paragraph and ``dbg["files_touched"]`` is set to ``True``
            when the document changed.

    Returns:
        The updated XML text (identical to *xml* when nothing matched).
    """
    escaped = re.escape(key)
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=escaped), re.DOTALL)
    # (?!\d): keys end in a slot number, so "X1" must not match inside "X10".
    # The tag name must be exactly "t" (followed by whitespace or ">") so
    # that other tags beginning with "t" are not taken for text nodes.
    tnode_pat = re.compile(
        rf'<(?P<p>[a-zA-Z0-9_]+):t(?:\s[^>]*)?>[^<]*{escaped}(?!\d)[^<]*</(?P=p):t>',
        re.DOTALL,
    )
    token_str = TOKEN_FMT.format(key=key)
    lines = _split_lines(value)  # same for every paragraph; compute once
    hits = dbg.setdefault("para_hits", {})

    # NOTE(review): PARA_RE stops at the first closing paragraph tag, so a
    # paragraph that itself contains nested paragraphs would be cut short.
    # Confirm templates keep placeholders in leaf paragraphs.
    def para_repl(m):
        body = m.group("pbody")
        mentions_key = (
            pair_pat.search(body) is not None
            or tnode_pat.search(body) is not None
            or token_str in body
        )
        if not mentions_key:
            return m.group(0)

        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")
        hits[key] = hits.get(key, 0) + 1
        return "".join(_make_para(pprefix, pattrs, ln) for ln in lines)

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        # Same flag name the rest of the file uses for "document changed".
        dbg["files_touched"] = True
    return xml2
|
| 97 |
+
|
| 98 |
+
def _runs_plain(text: str) -> str:
|
| 99 |
+
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
| 100 |
|
| 101 |
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
|
| 102 |
changed_any = False
|
| 103 |
|
| 104 |
+
# 0) ๋ค์ค์ค ํค๋ ๋จผ์ "๋ถ๋ชจ ๋ฌธ๋จ ๊ต์ฒด"๋ก ์ฒ๋ฆฌ
|
| 105 |
+
multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ ๋ชฉ|์
๋ฌด๋ช
)\d+$", re.IGNORECASE)
|
| 106 |
+
for k, v in mapping.items():
|
| 107 |
+
if multi_key.match(k):
|
| 108 |
+
xml_new = _replace_para_multiline(xml, k, v, dbg)
|
| 109 |
+
if xml_new != xml:
|
| 110 |
+
xml = xml_new
|
| 111 |
+
changed_any = True
|
| 112 |
+
|
| 113 |
+
# 1) ํ๋์(์ธ๋ผ์ธ) ์นํ โ ๋จ์ผ์ค ํค๋ง
|
| 114 |
for k, v in mapping.items():
|
| 115 |
+
if multi_key.match(k):
|
| 116 |
+
continue
|
| 117 |
+
replacement = _runs_plain(v)
|
| 118 |
pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
|
| 119 |
xml_new, n = pat.subn(replacement, xml)
|
| 120 |
if n:
|
|
|
|
| 122 |
xml = xml_new
|
| 123 |
changed_any = True
|
| 124 |
|
| 125 |
+
# 2) ์์ ํ
์คํธ ์๋ฆฌํ์์(<*:t>ํค</*:t>) ๋ถ๋ถ์นํ โ ๋จ์ผ์ค ํค๋ง
|
| 126 |
+
tnode_all = re.compile(
|
| 127 |
+
r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
|
| 128 |
+
re.DOTALL
|
| 129 |
+
)
|
| 130 |
for k, v in mapping.items():
|
| 131 |
+
if multi_key.match(k):
|
| 132 |
+
continue
|
| 133 |
+
def repl_tnode(m):
|
| 134 |
+
text_node = m.group(3)
|
| 135 |
+
if k not in text_node:
|
| 136 |
+
return m.group(0)
|
| 137 |
+
new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
|
| 138 |
+
return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
|
| 139 |
+
xml2 = tnode_all.sub(repl_tnode, xml)
|
| 140 |
+
if xml2 != xml:
|
| 141 |
+
dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
|
| 142 |
+
xml = xml2
|
| 143 |
changed_any = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
+
# 3) ํ ํฐ ์นํ โ ๋จ์ผ์ค ํค๋ง
|
| 146 |
for k, v in mapping.items():
|
| 147 |
+
if multi_key.match(k):
|
| 148 |
+
continue
|
| 149 |
tok = TOKEN_FMT.format(key=k)
|
| 150 |
if tok in xml:
|
| 151 |
+
xml = xml.replace(tok, html.escape("" if v is None else str(v)))
|
|
|
|
| 152 |
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 153 |
changed_any = True
|
| 154 |
|
| 155 |
if changed_any:
|
| 156 |
+
dbg["files_touched"] = True
|
| 157 |
return xml
|
| 158 |
|
| 159 |
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
|
| 160 |
+
dbg = {"para_hits":{}, "field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
|
| 161 |
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 162 |
out_buf = io.BytesIO()
|
| 163 |
zout = zipfile.ZipFile(out_buf, "w")
|
|
|
|
| 165 |
# mimetype ๋ฌด์์ถ + ๋งจ์
|
| 166 |
names = zin.namelist()
|
| 167 |
if "mimetype" in names:
|
| 168 |
+
zi = zipfile.ZipInfo("mimetype"); zi.compress_type = zipfile.ZIP_STORED
|
|
|
|
| 169 |
zout.writestr(zi, zin.read("mimetype"))
|
| 170 |
|
| 171 |
for e in zin.infolist():
|
|
|
|
| 176 |
try:
|
| 177 |
s = data.decode("utf-8", errors="ignore")
|
| 178 |
before = s
|
| 179 |
+
s = _apply_to_xml(s, mapping, {"para_hits":dbg["para_hits"], "field_hits":dbg["field_hits"],
|
| 180 |
+
"text_hits":dbg["text_hits"], "token_hits":dbg["token_hits"],
|
| 181 |
+
"files_touched":False})
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
if s != before:
|
| 183 |
dbg["touched_files"].append(e.filename)
|
| 184 |
data = s.encode("utf-8")
|
| 185 |
except Exception:
|
| 186 |
pass
|
| 187 |
+
zi = zipfile.ZipInfo(e.filename); zi.compress_type = zipfile.ZIP_DEFLATED
|
|
|
|
| 188 |
zout.writestr(zi, data)
|
| 189 |
|
| 190 |
zout.close(); out_buf.seek(0); zin.close()
|
| 191 |
return out_buf.getvalue(), dbg
|
| 192 |
|
| 193 |
+
# -------------------- UI --------------------
|
| 194 |
with st.expander("์ฌ์ฉ๋ฒ", expanded=True):
|
| 195 |
st.markdown("""
|
| 196 |
+
- **๋ค์ค ์ค(๋ชฉ๋ก/์ ๋ชฉ/์
๋ฌด๋ช
)์ ๋ถ๋ชจ ๋ฌธ๋จ์ ์ฌ๋ฌ ๋ฌธ๋จ์ผ๋ก ๊ต์ฒด**ํ์ฌ ๊ฒน์นจ ์์ด ํ์ํฉ๋๋ค.
|
| 197 |
+
- ๋๋จธ์ง ํค๋ ํ๋์/ํ
์คํธ/ํ ํฐ์ ์ธ๋ผ์ธ ์นํํฉ๋๋ค.
|
|
|
|
|
|
|
|
|
|
| 198 |
""")
|
| 199 |
|
| 200 |
tpl = st.file_uploader("๐ HWPX ํ
ํ๋ฆฟ ์
๋ก๋", type=["hwpx"])
|
|
|
|
| 206 |
df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
|
| 207 |
|
| 208 |
if "๋ฐ์ค๋ฒํธ" not in df.columns:
|
| 209 |
+
st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค."); st.stop()
|
|
|
|
| 210 |
|
| 211 |
st.success("โ
์์น ๋งคํ ์๋ฃ (์์
์ธก)")
|
| 212 |
st.dataframe(df.head(10), use_container_width=True)
|
|
|
|
| 224 |
|
| 225 |
# 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ
|
| 226 |
st.subheader("๐งช 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ")
|
| 227 |
+
keys = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ ๋ชฉ","์
๋ฌด๋ช
"]
|
| 228 |
mapping_preview = {}
|
| 229 |
for i in range(int(n_per_page)):
|
| 230 |
if i < len(records):
|
| 231 |
r = records[i]
|
| 232 |
+
mapping_preview.update({
|
| 233 |
+
f"๋ฐ์ค๋ฒํธ{i+1}": r.get("๋ฐ์ค๋ฒํธ",""),
|
| 234 |
+
f"์ข
๋ฃ์ฐ๋{i+1}": r.get("์์ฐ์ฐ๋",""),
|
| 235 |
+
f"๋ณด์กด๊ธฐ๊ฐ{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ",""),
|
| 236 |
+
f"๋จ์์
๋ฌด{i+1}": r.get("๋จ์์
๋ฌด",""),
|
| 237 |
+
f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
|
| 238 |
+
f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
|
| 239 |
+
f"์ ๋ชฉ{i+1}": r.get("์ ๋ชฉ",""),
|
| 240 |
+
f"์
๋ฌด๋ช
{i+1}": r.get("์ ๋ชฉ",""), # ํ
ํ๋ฆฟ์ด '์
๋ฌด๋ช
1'์ ์ฐ๋ ๊ฒฝ์ฐ ๋์
|
| 241 |
+
})
|
| 242 |
else:
|
| 243 |
+
for k in keys: mapping_preview[f"{k}{i+1}"] = ""
|
| 244 |
+
st.dataframe(pd.DataFrame([{"ํค":k, "๊ฐ ์๋ถ๋ถ":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
|
| 245 |
+
use_container_width=True, height=320)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
|
| 248 |
+
mem = io.BytesIO(); zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
|
|
|
|
| 249 |
pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
|
| 250 |
debug_all = []
|
| 251 |
|
| 252 |
for p in range(pages):
|
| 253 |
+
chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
|
|
|
|
| 254 |
mapping = {}
|
| 255 |
for i in range(int(n_per_page)):
|
| 256 |
if i < len(chunk):
|
| 257 |
r = chunk[i]
|
| 258 |
+
mapping[f"๋ฐ์ค๋ฒํธ{i+1}"] = r.get("๋ฐ์ค๋ฒํธ","")
|
| 259 |
+
mapping[f"์ข
๋ฃ์ฐ๋{i+1}"] = r.get("์์ฐ์ฐ๋","")
|
| 260 |
+
mapping[f"๋ณด์กด๊ธฐ๊ฐ{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ","")
|
| 261 |
+
mapping[f"๋จ์์
๋ฌด{i+1}"] = r.get("๋จ์์
๋ฌด","")
|
| 262 |
+
mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
|
| 263 |
+
mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
|
| 264 |
+
title_val = r.get("์ ๋ชฉ","")
|
| 265 |
mapping[f"์ ๋ชฉ{i+1}"] = title_val
|
| 266 |
+
mapping[f"์
๋ฌด๋ช
{i+1}"] = title_val
|
| 267 |
else:
|
| 268 |
+
for k in keys: mapping[f"{k}{i+1}"] = ""
|
|
|
|
| 269 |
|
| 270 |
out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
|
| 271 |
+
debug_all.append({"page": p+1, "stats": dbg})
|
| 272 |
+
name = "_".join([r.get("๋ฐ์ค๋ฒํธ","") for r in chunk]) if chunk else f"empty_{p+1}"
|
| 273 |
zout.writestr(f"label_{name}.hwpx", out_hwpx)
|
| 274 |
|
| 275 |
+
zout.close(); mem.seek(0)
|
|
|
|
| 276 |
st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem, file_name="labels_by_page.zip", mime="application/zip")
|
| 277 |
+
st.download_button("โฌ๏ธ ๋๋ฒ๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
|
| 278 |
+
file_name="debug.json", mime="application/json")
|
|
|
|
|
|
|
|
|
|
|
|