import streamlit as st
import pandas as pd
import io
import zipfile
import re
import html
import json
from typing import Dict, Tuple

st.set_page_config(page_title="๐Ÿ“ฆ ๋ณด์กด์ƒ์ž ๋ผ๋ฒจ ์ƒ์„ฑ๊ธฐ", layout="wide")
st.title("๐Ÿ“ฆ ๋ณด์กด์ƒ์ž ๋ผ๋ฒจ ์ƒ์„ฑ๊ธฐ ๐Ÿ“ฆ")


# -------------------- data utilities --------------------

def _year_range(series: pd.Series) -> str:
    """Return "MIN-MAX" (4-digit zero-padded) over the numeric values of
    *series*, ignoring blanks, "0" and "0000"; "0000-0000" if nothing usable."""
    s = series.astype(str).fillna("")
    v = s[~s.isin(["", "0", "0000"])]
    if v.empty:
        return "0000-0000"
    nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
    if nums.empty:
        return "0000-0000"
    return f"{nums.min():04d}-{nums.max():04d}"


def build_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the uploaded sheet into one row per box ("๋ฐ•์Šค๋ฒˆํ˜ธ"):
    a year range ("์ƒ์‚ฐ์—ฐ๋„"), an item list ("๋ชฉ๋ก"), and representative
    metadata taken from the first row of each box group."""
    df = df.copy()
    df["๋ฐ•์Šค๋ฒˆํ˜ธ"] = df["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4)
    if "์ œ๋ชฉ" in df.columns:
        df["์ œ๋ชฉ"] = df["์ œ๋ชฉ"].astype(str)

    # Year range per box = range of the "์ข…๋ฃŒ์—ฐ๋„" column within the group.
    if "์ข…๋ฃŒ์—ฐ๋„" in df.columns:
        yr = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(_year_range).reset_index()
        yr.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
    else:
        yr = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})

    # Item list per box: "- <mgmt no> <title>" (mgmt no only if present).
    has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
    lists = []
    for b, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
        lines = [f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r.get('์ œ๋ชฉ','')}" if has_mgmt
                 else f"- {r.get('์ œ๋ชฉ','')}"
                 for _, r in g.iterrows()]
        # CRLF so the value is later split into one paragraph per line.
        lists.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
    list_df = pd.DataFrame(lists)

    # Representative metadata: first row of each box group.
    meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "์ œ๋ชฉ"]
    meta_exist = [c for c in meta_cols if c in df.columns]
    meta = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist] if meta_exist \
        else pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})

    merged = meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(yr, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
    return merged


# -------------------- replacement utilities --------------------

# fieldBegin/fieldEnd pair for a given field name; the namespace prefix
# captured at fieldBegin must match at fieldEnd. Used via .format(name=...).
FIELD_PAIR_RE_TMPL = (
    r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'
    r'<(?P=fprefix):fieldEnd\b[^>]*/>'
)

# Literal placeholder token form: {{key}}
TOKEN_FMT = "{{{{{key}}}}}"

# Paragraph (<*:p ...>...</*:p>) search pattern.
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL,
)


def _extract_run_style(body: str, pprefix: str) -> str:
    """Extract the first <prefix:run>...</prefix:run> element of a paragraph
    body to reuse as a style template; fall back to a bare empty run."""
    run_pattern = re.compile(
        rf'<{pprefix}:run[^>]*>.*?</{pprefix}:run>',
        re.DOTALL,
    )
    match = run_pattern.search(body)
    if match:
        return match.group(0)
    return f'<{pprefix}:run><{pprefix}:t></{pprefix}:t></{pprefix}:run>'


def _make_para_with_style(pprefix: str, pattrs: str, text: str, original_run: str) -> str:
    """Build one paragraph carrying *text*, reusing the markup of
    *original_run* so the original fonts/styles are preserved."""
    esc = html.escape("" if text is None else str(text))
    # Swap only the text content of the template run. A callable replacement
    # is used so backslashes in the value are not treated as regex escapes.
    text_pattern = re.compile(rf'(<{pprefix}:t[^>]*>)[^<]*(</{pprefix}:t>)')
    new_run = text_pattern.sub(lambda m: m.group(1) + esc + m.group(2), original_run)
    # If the run contained no text node, insert one inside the run element.
    if new_run == original_run:
        t_pattern = re.compile(rf'(<{pprefix}:run[^>]*>)(.*?)(</{pprefix}:run>)', re.DOTALL)
        new_run = t_pattern.sub(
            lambda m: m.group(1) + f'<{pprefix}:t>{esc}</{pprefix}:t>' + m.group(3),
            original_run,
        )
    return f'<{pprefix}:p{pattrs}>{new_run}</{pprefix}:p>'


def _split_lines(val) -> list:
    """Split a value into lines, normalising CRLF to LF; None -> [""]."""
    if val is None:
        return [""]
    return str(val).replace("\r\n", "\n").split("\n")


def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """Replace the whole parent paragraph that contains *key* with one
    paragraph per line of *value*, keeping the original run style."""
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
    tnode_pat = re.compile(
        rf'<(?P<tprefix>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=tprefix):t>',
        re.DOTALL,
    )
    token_str = TOKEN_FMT.format(key=key)

    def para_repl(m):
        body = m.group("pbody")
        # Only rewrite paragraphs that actually reference the key (as a
        # field pair, as text-node content, or as a literal {{key}} token).
        if not (pair_pat.search(body) or tnode_pat.search(body) or (token_str in body)):
            return m.group(0)
        lines = _split_lines(value)
        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")
        original_run = _extract_run_style(body, pprefix)
        new_paras = "".join(
            _make_para_with_style(pprefix, pattrs, ln, original_run) for ln in lines
        )
        dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
        return new_paras

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        dbg["touched"] = True
    return xml2


def _runs_plain(text: str) -> str:
    # NOTE(review): this replaces an entire fieldBegin..fieldEnd span with
    # escaped plain text (no run markup) — confirm the HWPX consumer
    # tolerates text at that position.
    return f"{html.escape('' if text is None else str(text))}"


def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
    """Apply *mapping* to one XML document using four strategies in order:
    0) multi-line keys -> parent-paragraph replacement,
    1) fieldBegin/fieldEnd pair replacement,
    2) partial replacement inside <*:t> text nodes,
    3) literal {{key}} token replacement.
    Hit counters are accumulated into *dbg*."""
    changed_any = False
    # Keys whose values may span multiple lines; handled by replacing the
    # parent paragraph ("์—…๋ฌด๋ช…" is deliberately excluded to avoid font issues).
    multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ)\d+$", re.IGNORECASE)

    # 0) multi-line keys: replace the parent paragraph
    for k, v in mapping.items():
        if multi_key.match(k):
            xml_new = _replace_para_multiline(xml, k, v, dbg)
            if xml_new != xml:
                xml = xml_new
                changed_any = True

    # 1) inline field-pair replacement — single-line keys only
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        replacement = _runs_plain(v)
        pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
        # Callable replacement: value text must not be parsed for \-escapes.
        xml_new, n = pat.subn(lambda m, rep=replacement: rep, xml)
        if n:
            dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + n
            xml = xml_new
            changed_any = True

    # 2) partial replacement of bare key text inside <*:t> nodes
    tnode_all = re.compile(
        r'(<(?P<tprefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=tprefix):t>',
        re.DOTALL,
    )
    for k, v in mapping.items():
        if multi_key.match(k):
            continue

        def repl_tnode(m, k=k, v=v):
            text_node = m.group(3)
            if k not in text_node:
                return m.group(0)
            # Escape only the inserted value: the surrounding node text is
            # already XML-escaped, so escaping it again would corrupt
            # existing entities (e.g. &amp; -> &amp;amp;).
            new_text = text_node.replace(k, "" if v is None else html.escape(str(v)))
            return f"{m.group(1)}{new_text}</{m.group('tprefix')}:t>"

        xml2 = tnode_all.sub(repl_tnode, xml)
        if xml2 != xml:
            dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
            xml = xml2
            changed_any = True

    # 3) literal {{key}} token replacement — single-line keys only
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        tok = TOKEN_FMT.format(key=k)
        if tok in xml:
            xml = xml.replace(tok, html.escape("" if v is None else str(v)))
            dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
            changed_any = True

    if changed_any:
        dbg["files_touched"] = True
    return xml


def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str, str]) -> Tuple[bytes, dict]:
    """Rewrite every XML part of an HWPX (zip) archive through _apply_to_xml.

    Returns (new archive bytes, debug hit counters). Entries are re-written
    with fresh ZipInfo records so read-only attributes are not carried over,
    and "mimetype" is stored uncompressed as the first entry.
    """
    import time  # local import: only needed to timestamp the new zip entries

    dbg = {"para_hits": {}, "field_hits": {}, "text_hits": {}, "token_hits": {},
           "touched_files": []}
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
    out_buf = io.BytesIO()
    zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)

    now = time.localtime()

    # "mimetype" must be uncompressed and first in the archive.
    names = zin.namelist()
    if "mimetype" in names:
        zi = zipfile.ZipInfo("mimetype")
        zi.compress_type = zipfile.ZIP_STORED
        zi.external_attr = 0o100666 << 16  # regular file, rw for all (avoid read-only output)
        zi.create_system = 0               # DOS/Windows
        zi.date_time = now[:6]
        zout.writestr(zi, zin.read("mimetype"))

    for e in zin.infolist():
        if e.filename == "mimetype":
            continue
        data = zin.read(e.filename)
        if e.filename.lower().endswith(".xml"):
            try:
                s = data.decode("utf-8", errors="ignore")
                before = s
                s = _apply_to_xml(s, mapping,
                                  {"para_hits": dbg["para_hits"],
                                   "field_hits": dbg["field_hits"],
                                   "text_hits": dbg["text_hits"],
                                   "token_hits": dbg["token_hits"],
                                   "files_touched": False})
                if s != before:
                    dbg["touched_files"].append(e.filename)
                data = s.encode("utf-8")
            except Exception:
                pass  # best effort: keep the original entry bytes on failure
        # Fresh ZipInfo so stale attributes (e.g. read-only) are not copied.
        zi = zipfile.ZipInfo(e.filename)
        zi.compress_type = zipfile.ZIP_DEFLATED
        zi.external_attr = 0o100666 << 16
        zi.create_system = 0
        zi.date_time = now[:6]
        zi.flag_bits = 0
        zout.writestr(zi, data)

    zout.close()
    out_buf.seek(0)
    zin.close()
    return out_buf.getvalue(), dbg


# -------------------- UI --------------------
with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
    st.markdown("""
1. ํ…œํ”Œ๋ฆฟ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.
2. ๋ณด์กด์ƒ์ž ์ •๋ณด๊ฐ€ ๋“ค์–ด์žˆ๋Š” ์—‘์…€ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.
3. ์ถœ๋ ฅํ•  ๋ผ๋ฒจ ๋ฒˆํ˜ธ๋ฅผ ์„ ํƒํ•ด์ฃผ์„ธ์š”.
4. ์••์ถ• ํŒŒ์ผ์„ ๋‹ค์šด๋ฐ›๊ณ , ์••์ถ•ํ•ด์ œ ํ›„ ํŒŒ์ผ์„ ๋ณ‘ํ•ฉํ•ด์ฃผ์„ธ์š”.
5. ๋ณ‘ํ•ฉ ํ›„, ๋ผ๋ฒจ์„ ์ถœ๋ ฅํ•˜์„ธ์š”.
๋‹จ, ํ…œํ”Œ๋ฆฟ์€ .HWPX(ํ•œ๊ธ€) ํŒŒ์ผ์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. (.HWP ๋ถˆ๊ฐ€)
""")

tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
n_per_page = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", 1, 12, 3, 1)
data = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx", "xls", "csv"])

if tpl and data:
    tpl_bytes = tpl.read()
    df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
    if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
        st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
        st.stop()
    st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
    st.dataframe(df.head(10), use_container_width=True)

    merged = build_rows(df)
    boxes = merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4).unique().tolist()

    st.subheader("๐Ÿ”Ž ์—…๋กœ๋“œ๋œ ๋ฐ•์Šค๋ฒˆํ˜ธ ๋ชฉ๋ก")
    st.write(f"์ด **{len(boxes)}**๊ฐœ")
    st.dataframe(pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": boxes}), use_container_width=True, height=240)

    sel = st.multiselect("์ƒ์„ฑํ•  ๋ฐ•์Šค๋ฒˆํ˜ธ ์„ ํƒ (๋น„์šฐ๋ฉด ์ „์ฒด)", options=boxes)
    work = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(sel)] if sel else merged
    records = work.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")

    # Preview of the key->value mapping for the first page.
    st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
    keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "๋ชฉ๋ก", "์ œ๋ชฉ", "์—…๋ฌด๋ช…"]
    mapping_preview = {}
    for i in range(int(n_per_page)):
        if i < len(records):
            r = records[i]
            mapping_preview.update({
                f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}": r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", ""),
                f"์ข…๋ฃŒ์—ฐ๋„{i+1}": r.get("์ƒ์‚ฐ์—ฐ๋„", ""),
                f"๋ณด์กด๊ธฐ๊ฐ„{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ„", ""),
                f"๋‹จ์œ„์—…๋ฌด{i+1}": r.get("๋‹จ์œ„์—…๋ฌด", ""),
                f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ", ""),
                f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก", ""),
                f"์ œ๋ชฉ{i+1}": r.get("์ œ๋ชฉ", ""),
                f"์—…๋ฌด๋ช…{i+1}": r.get("์ œ๋ชฉ", ""),  # templates using '์—…๋ฌด๋ช…1' get the title
            })
        else:
            # Pad unused label slots on the page with empty strings.
            for k in keys:
                mapping_preview[f"{k}{i+1}"] = ""
    st.dataframe(
        pd.DataFrame([{"ํ‚ค": k, "๊ฐ’ ์•ž๋ถ€๋ถ„": str(v)[:120]} for k, v in sorted(mapping_preview.items())]),
        use_container_width=True, height=320,
    )

    if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
        mem = io.BytesIO()
        zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
        pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
        debug_all = []
        for p in range(pages):
            chunk = records[p * int(n_per_page):(p + 1) * int(n_per_page)]
            mapping = {}
            for i in range(int(n_per_page)):
                if i < len(chunk):
                    r = chunk[i]
                    mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", "")
                    mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„", "")
                    mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„", "")
                    mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด", "")
                    mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ", "")
                    mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก", "")
                    title_val = r.get("์ œ๋ชฉ", "")
                    mapping[f"์ œ๋ชฉ{i+1}"] = title_val
                    mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val
                else:
                    for k in keys:
                        mapping[f"{k}{i+1}"] = ""
            out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
            debug_all.append({"page": p + 1, "stats": dbg})
            name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", "") for r in chunk]) if chunk else f"empty_{p+1}"
            zout.writestr(f"label_{name}.hwpx", out_hwpx)
        zout.close()
        mem.seek(0)
        st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem, file_name="labels_by_page.zip",
                           mime="application/zip")
        st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)",
                           data=json.dumps(debug_all, ensure_ascii=False, indent=2),
                           file_name="debug.json", mime="application/json")