|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import io, zipfile, re, html, json |
|
|
from typing import Dict, Tuple |
|
|
|
|
|
# Streamlit page chrome: wide layout; title text is the Korean
# "preservation-box label generator" (bytes kept exactly as in source).
st.set_page_config(page_title="๐ฆ ๋ณด์กด์์ ๋ผ๋ฒจ ์์ฑ๊ธฐ", layout="wide")
st.title("๐ฆ ๋ณด์กด์์ ๋ผ๋ฒจ ์์ฑ๊ธฐ ๐ฆ")
|
|
|
|
|
|
|
|
def _year_range(series: pd.Series) -> str: |
|
|
s = series.astype(str).fillna("") |
|
|
v = s[~s.isin(["", "0", "0000"])] |
|
|
if v.empty: return "0000-0000" |
|
|
nums = pd.to_numeric(v, errors="coerce").dropna().astype(int) |
|
|
if nums.empty: return "0000-0000" |
|
|
return f"{nums.min():04d}-{nums.max():04d}" |
|
|
|
|
|
def build_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the raw item rows into one row per box number.

    Per "๋ฐ์ค๋ฒํธ" (box number) group it derives: the production-year range
    ("์์ฐ์ฐ๋"), a CRLF-joined bullet list of items ("๋ชฉ๋ก"), and the first
    row's metadata columns. Returns the merged per-box frame.
    """
    df = df.copy()
    # Normalize box numbers to 4-digit zero-padded strings (the grouping key).
    df["๋ฐ์ค๋ฒํธ"] = df["๋ฐ์ค๋ฒํธ"].astype(str).str.zfill(4)
    if "์ ๋ชฉ" in df.columns:
        df["์ ๋ชฉ"] = df["์ ๋ชฉ"].astype(str)

    # Year range per box, or a constant placeholder when the column is absent.
    if "์ข๋ฃ์ฐ๋" in df.columns:
        yr = df.groupby("๋ฐ์ค๋ฒํธ")["์ข๋ฃ์ฐ๋"].apply(_year_range).reset_index()
        yr.columns = ["๋ฐ์ค๋ฒํธ", "์์ฐ์ฐ๋"]
    else:
        yr = pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique(), "์์ฐ์ฐ๋": "0000-0000"})

    # Bullet list per box; prepend the management number when that column exists.
    has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
    lists = []
    for b, g in df.groupby("๋ฐ์ค๋ฒํธ"):
        lines = [f"- {r['๊ด๋ฆฌ๋ฒํธ']} {r.get('์ ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ ๋ชฉ','')}"
                 for _, r in g.iterrows()]
        # CRLF is deliberate: consumed later by _split_lines for multi-paragraph output.
        lists.append({"๋ฐ์ค๋ฒํธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
    list_df = pd.DataFrame(lists)

    # Representative (first-row) metadata for whichever known columns exist.
    meta_cols = ["๋ฐ์ค๋ฒํธ","์ข๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ ๋ชฉ"]
    meta_exist = [c for c in meta_cols if c in df.columns]
    meta = df.groupby("๋ฐ์ค๋ฒํธ", as_index=False).first()[meta_exist] if meta_exist \
        else pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique()})

    merged = meta.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(yr, on="๋ฐ์ค๋ฒํธ", how="left")
    return merged
|
|
|
|
|
|
|
|
# Matches a <prefix:fieldBegin name="..."/> ... <prefix:fieldEnd/> pair for one
# named field; "{name}" is substituted (already re.escape-d by callers) before
# the pattern is compiled. The backreference keeps begin/end prefixes equal.
FIELD_PAIR_RE_TMPL = (
    r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'
    r'<(?P=fprefix):fieldEnd\b[^>]*/>'
)
# TOKEN_FMT.format(key=k) yields the literal plain-text placeholder "{{k}}".
TOKEN_FMT = "{{{{{key}}}}}"

# One whole <prefix:p ...>...</prefix:p> paragraph element; body captured
# non-greedily so adjacent paragraphs are matched separately.
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL
)
|
|
|
|
|
|
|
|
def _extract_run_style(body: str, pprefix: str) -> str: |
|
|
"""๋ฌธ๋จ ๋ด์ฉ์์ ์ฒซ ๋ฒ์งธ run ์์์ ์คํ์ผ์ ์ถ์ถ""" |
|
|
run_pattern = re.compile( |
|
|
rf'<{pprefix}:run[^>]*>.*?</{pprefix}:run>', |
|
|
re.DOTALL |
|
|
) |
|
|
match = run_pattern.search(body) |
|
|
if match: |
|
|
return match.group(0) |
|
|
return f'<{pprefix}:run><{pprefix}:t><//{pprefix}:t></{pprefix}:run>' |
|
|
|
|
|
|
|
|
def _make_para_with_style(pprefix: str, pattrs: str, text: str, original_run: str) -> str: |
|
|
esc = html.escape("" if text is None else str(text)) |
|
|
|
|
|
|
|
|
text_pattern = re.compile(rf'(<{pprefix}:t[^>]*>)[^<]*(</{pprefix}:t>)') |
|
|
new_run = text_pattern.sub(rf'\g<1>{esc}\g<2>', original_run) |
|
|
|
|
|
|
|
|
if new_run == original_run: |
|
|
t_pattern = re.compile(rf'(<{pprefix}:run[^>]*>)(.*?)(</{pprefix}:run>)', re.DOTALL) |
|
|
new_run = t_pattern.sub(rf'\g<1><{pprefix}:t>{esc}</{pprefix}:t>\g<3>', original_run) |
|
|
|
|
|
return f'<{pprefix}:p{pattrs}>{new_run}</{pprefix}:p>' |
|
|
|
|
|
def _split_lines(val) -> list: |
|
|
if val is None: return [""] |
|
|
return str(val).replace("\r\n","\n").split("\n") |
|
|
|
|
|
def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """
    Replace the whole parent paragraph containing *key* with one paragraph
    per line of *value*, keeping the original run style.

    (Docstring translated from Korean.) Increments dbg["para_hits"][key] per
    rewritten paragraph and sets dbg["touched"] when anything changed.
    """
    # Three placeholder forms may appear inside a paragraph: a
    # fieldBegin/fieldEnd pair, a <t> text node containing the key, or the
    # literal "{{key}}" token.
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
    tnode_pat = re.compile(rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=p):t>', re.DOTALL)
    token_str = TOKEN_FMT.format(key=key)

    def para_repl(m):
        body = m.group("pbody")
        # Paragraphs that do not reference this key pass through unchanged.
        if not (pair_pat.search(body) or tnode_pat.search(body) or (token_str in body)):
            return m.group(0)

        lines = _split_lines(value)
        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")

        # Borrow the paragraph's first run as the style template.
        original_run = _extract_run_style(body, pprefix)

        # One styled paragraph per value line, concatenated in place of the
        # original paragraph.
        new_paras = "".join(_make_para_with_style(pprefix, pattrs, ln, original_run) for ln in lines)
        dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
        return new_paras

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        dbg["touched"] = True
    return xml2
|
|
|
|
|
def _runs_plain(text: str) -> str: |
|
|
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>" |
|
|
|
|
|
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
    """Apply every mapping entry to one XML document via four substitution
    passes (multi-line paragraphs, field pairs, <t> text nodes, literal
    tokens), accumulating per-key hit counts in *dbg*."""
    changed_any = False

    # Keys matching "๋ชฉ๋ก"/"list"/"์ ๋ชฉ" + digits carry multi-line values and
    # are expanded into one paragraph per line first.
    multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ ๋ชฉ)\d+$", re.IGNORECASE)
    for k, v in mapping.items():
        if multi_key.match(k):
            xml_new = _replace_para_multiline(xml, k, v, dbg)
            if xml_new != xml:
                xml = xml_new
                changed_any = True

    # Pass 2: replace fieldBegin/fieldEnd pairs with a plain run holding the value.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        replacement = _runs_plain(v)
        pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
        xml_new, n = pat.subn(replacement, xml)
        if n:
            dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + n
            xml = xml_new
            changed_any = True

    # Pass 3: replace bare key text occurring inside any <prefix:t> node.
    tnode_all = re.compile(
        r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
        re.DOTALL
    )
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        def repl_tnode(m):
            text_node = m.group(3)
            if k not in text_node:
                return m.group(0)
            # NOTE(review): html.escape re-escapes the entire node text, so
            # pre-existing entities (e.g. "&amp;") would be double-escaped —
            # confirm template text nodes never contain entities.
            new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
            return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
        xml2 = tnode_all.sub(repl_tnode, xml)
        if xml2 != xml:
            dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
            xml = xml2
            changed_any = True

    # Pass 4: literal "{{key}}" tokens anywhere in the document.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        tok = TOKEN_FMT.format(key=k)
        if tok in xml:
            xml = xml.replace(tok, html.escape("" if v is None else str(v)))
            dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
            changed_any = True

    if changed_any:
        dbg["files_touched"] = True
    return xml
|
|
|
|
|
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
    """Apply *mapping* to every .xml entry of an HWPX (zip) archive.

    Returns (rewritten archive bytes, debug dict of substitution hits and
    touched file names). The "mimetype" entry is written first and STORED
    (uncompressed), as the container format requires.
    """
    import stat, time
    dbg = {"para_hits":{}, "field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
    out_buf = io.BytesIO()
    zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)

    # One timestamp, applied to every rewritten entry for reproducible metadata.
    now = time.localtime()

    names = zin.namelist()
    if "mimetype" in names:
        zi = zipfile.ZipInfo("mimetype")
        zi.compress_type = zipfile.ZIP_STORED
        # Unix attrs: regular file, mode 0666.
        zi.external_attr = 0o100666 << 16
        zi.create_system = 0
        zi.date_time = now[:6]
        zout.writestr(zi, zin.read("mimetype"))

    for e in zin.infolist():
        if e.filename == "mimetype":
            continue  # already written above, uncompressed
        data = zin.read(e.filename)
        if e.filename.lower().endswith(".xml"):
            try:
                s = data.decode("utf-8", errors="ignore")
                before = s
                # Shares the hit-count dicts with dbg but passes a throwaway
                # "files_touched" flag; touched files are tracked below.
                s = _apply_to_xml(s, mapping, {"para_hits":dbg["para_hits"], "field_hits":dbg["field_hits"],
                                               "text_hits":dbg["text_hits"], "token_hits":dbg["token_hits"],
                                               "files_touched":False})
                if s != before:
                    dbg["touched_files"].append(e.filename)
                    data = s.encode("utf-8")
            except Exception:
                # Best effort: on any failure keep the entry's original bytes.
                pass

        # Re-create each entry with normalized metadata (fresh ZipInfo).
        zi = zipfile.ZipInfo(e.filename)
        zi.compress_type = zipfile.ZIP_DEFLATED
        zi.external_attr = 0o100666 << 16
        zi.create_system = 0
        zi.date_time = now[:6]
        # NOTE(review): clearing flag_bits drops the UTF-8 filename flag —
        # confirm HWPX entry names are always ASCII.
        zi.flag_bits = 0
        zout.writestr(zi, data)

    zout.close()
    out_buf.seek(0)
    zin.close()
    return out_buf.getvalue(), dbg
|
|
|
|
|
|
|
|
# --- UI: usage notes (Korean) and the three input widgets ---
with st.expander("์ฌ์ฉ๋ฒ", expanded=True):
    st.markdown("""
1. ํํ๋ฆฟ ํ์ผ์ ์๋ก๋ํด์ฃผ์ธ์.
2. ๋ณด์กด์์ ์ ๋ณด๊ฐ ๋ค์ด์๋ ์์ํ์ผ์ ์๋ก๋ํด์ฃผ์ธ์.
3. ์ถ๋ ฅํ ๋ผ๋ฒจ ๋ฒํธ๋ฅผ ์ ํํด์ฃผ์ธ์.
4. ์์ถ ํ์ผ์ ๋ค์ด๋ฐ๊ณ , ์์ถํด์ ํ ํ์ผ์ ๋ณํฉํด์ฃผ์ธ์.
5. ๋ณํฉ ํ, ๋ผ๋ฒจ์ ์ถ๋ ฅํ์ธ์.
๋จ, ํํ๋ฆฟ์ .HWPX(ํ๊ธ) ํ์ผ์ด์ด์ผ ํฉ๋๋ค. (.HWP ๋ถ๊ฐ)
""")

# Template archive (HWPX only), labels-per-page count (1..12, default 3),
# and the spreadsheet holding the box data.
tpl = st.file_uploader("๐ HWPX ํํ๋ฆฟ ์๋ก๋", type=["hwpx"])
n_per_page = st.number_input("ํํ๋ฆฟ์ ๋ผ๋ฒจ ์ธํธ ๊ฐ์(ํ ํ์ด์ง N๊ฐ)", 1, 12, 3, 1)
data = st.file_uploader("๐ ๋ฐ์ดํฐ ์๋ก๋ (Excel/CSV)", type=["xlsx","xls","csv"])
|
|
|
|
|
# --- Main flow: runs once both the template and the data file are uploaded ---
if tpl and data:
    tpl_bytes = tpl.read()
    # CSV vs Excel decided purely by the uploaded file's extension.
    df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)

    # "๋ฐ์ค๋ฒํธ" (box number) is the mandatory grouping column.
    if "๋ฐ์ค๋ฒํธ" not in df.columns:
        st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค."); st.stop()

    st.success("โ์์น ๋งคํ ์๋ฃ (์์์ธก)")
    st.dataframe(df.head(10), use_container_width=True)

    merged = build_rows(df)
    boxes = merged["๋ฐ์ค๋ฒํธ"].astype(str).str.zfill(4).unique().tolist()

    st.subheader("๐ ์๋ก๋๋ ๋ฐ์ค๋ฒํธ ๋ชฉ๋ก")
    st.write(f"์ด **{len(boxes)}**๊ฐ")
    st.dataframe(pd.DataFrame({"๋ฐ์ค๋ฒํธ": boxes}), use_container_width=True, height=240)

    # Empty selection means "generate for all boxes".
    sel = st.multiselect("์์ฑํ ๋ฐ์ค๋ฒํธ ์ ํ (๋น์ฐ๋ฉด ์ ์ฒด)", options=boxes)
    work = merged[merged["๋ฐ์ค๋ฒํธ"].isin(sel)] if sel else merged
    records = work.sort_values("๋ฐ์ค๋ฒํธ").to_dict(orient="records")

    # --- Preview: the key->value mapping that page 1 would receive ---
    st.subheader("๐งช 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ")
    keys = ["๋ฐ์ค๋ฒํธ","์ข๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ ๋ชฉ","์๋ฌด๋ช"]
    mapping_preview = {}
    for i in range(int(n_per_page)):
        if i < len(records):
            r = records[i]
            mapping_preview.update({
                f"๋ฐ์ค๋ฒํธ{i+1}": r.get("๋ฐ์ค๋ฒํธ",""),
                # Template key "์ข๋ฃ์ฐ๋N" is filled from the derived range column.
                f"์ข๋ฃ์ฐ๋{i+1}": r.get("์์ฐ์ฐ๋",""),
                f"๋ณด์กด๊ธฐ๊ฐ{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ",""),
                f"๋จ์์๋ฌด{i+1}": r.get("๋จ์์๋ฌด",""),
                f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
                f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
                f"์ ๋ชฉ{i+1}": r.get("์ ๋ชฉ",""),
                # Task-name key mirrors the title value.
                f"์๋ฌด๋ช{i+1}": r.get("์ ๋ชฉ",""),
            })
        else:
            # Pad unused label slots on the page with empty strings.
            for k in keys: mapping_preview[f"{k}{i+1}"] = ""
    st.dataframe(pd.DataFrame([{"ํค":k, "๊ฐ ์๋ถ๋ถ":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
                 use_container_width=True, height=320)

    if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
        mem = io.BytesIO(); zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
        # Ceiling division: number of template pages needed for all records.
        pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
        debug_all = []

        for p in range(pages):
            chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
            mapping = {}
            for i in range(int(n_per_page)):
                if i < len(chunk):
                    r = chunk[i]
                    mapping[f"๋ฐ์ค๋ฒํธ{i+1}"] = r.get("๋ฐ์ค๋ฒํธ","")
                    mapping[f"์ข๋ฃ์ฐ๋{i+1}"] = r.get("์์ฐ์ฐ๋","")
                    mapping[f"๋ณด์กด๊ธฐ๊ฐ{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ","")
                    mapping[f"๋จ์์๋ฌด{i+1}"] = r.get("๋จ์์๋ฌด","")
                    mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
                    mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
                    title_val = r.get("์ ๋ชฉ","")
                    mapping[f"์ ๋ชฉ{i+1}"] = title_val
                    # Task-name key mirrors the title value.
                    mapping[f"์๋ฌด๋ช{i+1}"] = title_val
                else:
                    for k in keys: mapping[f"{k}{i+1}"] = ""

            out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
            debug_all.append({"page": p+1, "stats": dbg})
            # One HWPX per page, named after the box numbers it contains.
            name = "_".join([r.get("๋ฐ์ค๋ฒํธ","") for r in chunk]) if chunk else f"empty_{p+1}"
            zout.writestr(f"label_{name}.hwpx", out_hwpx)

        zout.close(); mem.seek(0)
        st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem, file_name="labels_by_page.zip", mime="application/zip")
        st.download_button("โฌ๏ธ ๋๋ฒ๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
                           file_name="debug.json", mime="application/json")