# boxlabel / app.py — Streamlit app (Hugging Face Space, commit c0ef99a,
# "Update app.py" by dohyune). These lines were Hugging Face page chrome
# captured into the source; kept as a comment so the file is valid Python.
import streamlit as st
import pandas as pd
import io, zipfile, re, html, json
from typing import Dict, Tuple
st.set_page_config(page_title="๐Ÿ“ฆ ๋ณด์กด์ƒ์ž ๋ผ๋ฒจ ์ƒ์„ฑ๊ธฐ", layout="wide")
st.title("๐Ÿ“ฆ ๋ณด์กด์ƒ์ž ๋ผ๋ฒจ ์ƒ์„ฑ๊ธฐ ๐Ÿ“ฆ")
# -------------------- ๋ฐ์ดํ„ฐ ์œ ํ‹ธ --------------------
def _year_range(series: pd.Series) -> str:
s = series.astype(str).fillna("")
v = s[~s.isin(["", "0", "0000"])]
if v.empty: return "0000-0000"
nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
if nums.empty: return "0000-0000"
return f"{nums.min():04d}-{nums.max():04d}"
def build_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the raw sheet into one row per box (๋ฐ•์Šค๋ฒˆํ˜ธ).

    Each output row carries: representative metadata (first row of the
    group), a CRLF-joined item listing (๋ชฉ๋ก), and the production-year
    range (์ƒ์‚ฐ์—ฐ๋„) derived from the ์ข…๋ฃŒ์—ฐ๋„ column.
    """
    frame = df.copy()
    frame["๋ฐ•์Šค๋ฒˆํ˜ธ"] = frame["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4)
    if "์ œ๋ชฉ" in frame.columns:
        frame["์ œ๋ชฉ"] = frame["์ œ๋ชฉ"].astype(str)

    # Year range per box; dummy range when the column is absent entirely.
    if "์ข…๋ฃŒ์—ฐ๋„" in frame.columns:
        year_df = frame.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(_year_range).reset_index()
        year_df.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
    else:
        year_df = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": frame["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})

    # Per-box listing: one "- <๊ด€๋ฆฌ๋ฒˆํ˜ธ> <์ œ๋ชฉ>" (or title-only) line per record.
    with_mgmt_no = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in frame.columns
    listing_rows = []
    for box_no, group in frame.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
        entries = []
        for _, row in group.iterrows():
            if with_mgmt_no:
                entries.append(f"- {row['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {row.get('์ œ๋ชฉ','')}")
            else:
                entries.append(f"- {row.get('์ œ๋ชฉ','')}")
        listing_rows.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": box_no, "๋ชฉ๋ก": "\r\n".join(entries)})
    list_df = pd.DataFrame(listing_rows)

    # Representative metadata: first row of each group, known columns only.
    meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "์ œ๋ชฉ"]
    present = [c for c in meta_cols if c in frame.columns]
    if present:
        meta = frame.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[present]
    else:
        meta = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": frame["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})

    return meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(year_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
# -------------------- ์น˜ํ™˜ ์œ ํ‹ธ --------------------
# Regex template matching a fieldBegin/fieldEnd pair named {name}; the
# namespace prefix captured at fieldBegin must be repeated at fieldEnd.
FIELD_PAIR_RE_TMPL = (
    r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'
    r'<(?P=fprefix):fieldEnd\b[^>]*/>'
)
# .format(key=...) yields the literal placeholder token "{{key}}".
TOKEN_FMT = "{{{{{key}}}}}"
# Matches one whole paragraph element <*:p ...>...</*:p> (any namespace prefix).
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL
)
# ์›๋ณธ run ์Šคํƒ€์ผ์„ ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜
def _extract_run_style(body: str, pprefix: str) -> str:
"""๋ฌธ๋‹จ ๋‚ด์šฉ์—์„œ ์ฒซ ๋ฒˆ์งธ run ์š”์†Œ์˜ ์Šคํƒ€์ผ์„ ์ถ”์ถœ"""
run_pattern = re.compile(
rf'<{pprefix}:run[^>]*>.*?</{pprefix}:run>',
re.DOTALL
)
match = run_pattern.search(body)
if match:
return match.group(0)
return f'<{pprefix}:run><{pprefix}:t><//{pprefix}:t></{pprefix}:run>'
# ๋ฌธ๋‹จ ํ•˜๋‚˜๋ฅผ ๊ฐ™์€ ์Šคํƒ€์ผ๋กœ ๋ณต์ œํ•ด์ฃผ๋Š” ํ—ฌํผ (์Šคํƒ€์ผ ๋ณด์กด)
def _make_para_with_style(pprefix: str, pattrs: str, text: str, original_run: str) -> str:
esc = html.escape("" if text is None else str(text))
# ์›๋ณธ run์—์„œ ํ…์ŠคํŠธ ๋ถ€๋ถ„๋งŒ ๊ต์ฒด
text_pattern = re.compile(rf'(<{pprefix}:t[^>]*>)[^<]*(</{pprefix}:t>)')
new_run = text_pattern.sub(rf'\g<1>{esc}\g<2>', original_run)
# ๋งŒ์•ฝ ํ…์ŠคํŠธ ๋…ธ๋“œ๊ฐ€ ์—†๋‹ค๋ฉด ๊ธฐ๋ณธ ํ˜•ํƒœ๋กœ
if new_run == original_run:
t_pattern = re.compile(rf'(<{pprefix}:run[^>]*>)(.*?)(</{pprefix}:run>)', re.DOTALL)
new_run = t_pattern.sub(rf'\g<1><{pprefix}:t>{esc}</{pprefix}:t>\g<3>', original_run)
return f'<{pprefix}:p{pattrs}>{new_run}</{pprefix}:p>'
def _split_lines(val) -> list:
if val is None: return [""]
return str(val).replace("\r\n","\n").split("\n")
def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """Replace the whole parent paragraph containing *key* with one cloned
    paragraph per line of *value*, preserving the original run styling.

    A paragraph matches when its body references the key as a
    fieldBegin/fieldEnd pair, inside a <*:t> text node, or as a {{key}}
    token. Increments dbg["para_hits"][key] per replaced paragraph and sets
    dbg["touched"] when anything changed.
    """
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
    tnode_pat = re.compile(rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=p):t>', re.DOTALL)
    token_str = TOKEN_FMT.format(key=key)

    def para_repl(m):
        body = m.group("pbody")
        # Leave paragraphs untouched unless they reference this key somehow.
        if not (pair_pat.search(body) or tnode_pat.search(body) or (token_str in body)):
            return m.group(0)
        lines = _split_lines(value)
        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")
        # Reuse the first run of the matched paragraph as the style template.
        original_run = _extract_run_style(body, pprefix)
        # One cloned paragraph per value line, same prefix/attrs/run style.
        new_paras = "".join(_make_para_with_style(pprefix, pattrs, ln, original_run) for ln in lines)
        dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
        return new_paras

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        # NOTE(review): callers track modification by string comparison; this
        # "touched" flag appears informational only.
        dbg["touched"] = True
    return xml2
def _runs_plain(text: str) -> str:
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
    """Apply every key/value in *mapping* to one XML document string.

    Four passes, in this order:
      0) multiline keys (๋ชฉ๋กN / listN / ์ œ๋ชฉN) replace their parent paragraph;
      1) fieldBegin/fieldEnd pairs are swapped for a plain run (single-line keys);
      2) <*:t> text nodes containing a key get an in-place substring replace;
      3) bare {{key}} tokens are replaced directly.
    Hit counts accumulate into *dbg*; returns the (possibly) modified XML.
    """
    changed_any = False
    # 0) Multiline keys go through whole-paragraph replacement first
    #    (์—…๋ฌด๋ช…N is deliberately excluded to avoid the font issue noted by
    #    the original author).
    multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ)\d+$", re.IGNORECASE)
    for k, v in mapping.items():
        if multi_key.match(k):
            xml_new = _replace_para_multiline(xml, k, v, dbg)
            if xml_new != xml:
                xml = xml_new
                changed_any = True
    # 1) Inline field-pair substitution — single-line keys only.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        replacement = _runs_plain(v)
        pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
        xml_new, n = pat.subn(replacement, xml)
        if n:
            dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + n
            xml = xml_new
            changed_any = True
    # 2) Plain-text placeholder inside <*:t> nodes — single-line keys only.
    tnode_all = re.compile(
        r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
        re.DOTALL
    )
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        def repl_tnode(m):
            # Defined and used within the same iteration, so closing over
            # k/v is safe (no late-binding issue).
            text_node = m.group(3)
            if k not in text_node:
                return m.group(0)
            # NOTE(review): escaping the whole node re-escapes entities that
            # were already present ('&amp;' -> '&amp;amp;') — confirm intended.
            new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
            return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
        xml2 = tnode_all.sub(repl_tnode, xml)
        if xml2 != xml:
            dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
            xml = xml2
            changed_any = True
    # 3) Bare {{key}} token substitution — single-line keys only.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        tok = TOKEN_FMT.format(key=k)
        if tok in xml:
            xml = xml.replace(tok, html.escape("" if v is None else str(v)))
            dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
            changed_any = True
    if changed_any:
        dbg["files_touched"] = True
    return xml
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str, str]) -> Tuple[bytes, dict]:
    """Rewrite an HWPX archive, applying *mapping* to every XML member.

    An HWPX file is a zip container: 'mimetype' is written first and stored
    uncompressed; every other member gets a fresh ZipInfo (regular-file
    attributes, current timestamp) so the output never inherits read-only
    flags from the template.

    Returns (new archive bytes, debug-stats dict).

    Changes from the original: the unused `stat` import was removed, and XML
    members are only re-encoded when actually modified — the decode uses
    errors="ignore", so a blind re-encode could silently alter untouched
    bytes.
    """
    import time  # local import, matching the original function's style

    dbg = {"para_hits": {}, "field_hits": {}, "text_hits": {}, "token_hits": {}, "touched_files": []}
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
    out_buf = io.BytesIO()
    zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)
    now = time.localtime()

    # 'mimetype' must come first in the archive and be stored uncompressed.
    names = zin.namelist()
    if "mimetype" in names:
        zi = zipfile.ZipInfo("mimetype")
        zi.compress_type = zipfile.ZIP_STORED
        zi.external_attr = 0o100666 << 16  # regular file, rw for everyone
        zi.create_system = 0               # DOS/Windows
        zi.date_time = now[:6]
        zout.writestr(zi, zin.read("mimetype"))

    for e in zin.infolist():
        if e.filename == "mimetype":
            continue
        data = zin.read(e.filename)
        if e.filename.lower().endswith(".xml"):
            try:
                s = data.decode("utf-8", errors="ignore")
                before = s
                s = _apply_to_xml(s, mapping,
                                  {"para_hits": dbg["para_hits"], "field_hits": dbg["field_hits"],
                                   "text_hits": dbg["text_hits"], "token_hits": dbg["token_hits"],
                                   "files_touched": False})
                if s != before:
                    dbg["touched_files"].append(e.filename)
                    # Re-encode only when modified; untouched members keep
                    # their original bytes exactly.
                    data = s.encode("utf-8")
            except Exception:
                # Best-effort: an unprocessable XML part is copied through unchanged.
                pass
        # Fresh ZipInfo so no read-only/permission flags leak from the template.
        zi = zipfile.ZipInfo(e.filename)
        zi.compress_type = zipfile.ZIP_DEFLATED
        zi.external_attr = 0o100666 << 16  # regular file, rw for everyone
        zi.create_system = 0               # DOS/Windows
        zi.date_time = now[:6]             # current timestamp
        zi.flag_bits = 0                   # no special flags
        zout.writestr(zi, data)

    zout.close()
    out_buf.seek(0)
    zin.close()
    return out_buf.getvalue(), dbg
# -------------------- UI --------------------
# Usage guide (Korean UI text), shown expanded by default at the top of the page.
with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
    st.markdown("""
1. ํ…œํ”Œ๋ฆฟ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.
2. ๋ณด์กด์ƒ์ž ์ •๋ณด๊ฐ€ ๋“ค์–ด์žˆ๋Š” ์—‘์…€ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.
3. ์ถœ๋ ฅํ•  ๋ผ๋ฒจ ๋ฒˆํ˜ธ๋ฅผ ์„ ํƒํ•ด์ฃผ์„ธ์š”.
4. ์••์ถ• ํŒŒ์ผ์„ ๋‹ค์šด๋ฐ›๊ณ , ์••์ถ•ํ•ด์ œ ํ›„ ํŒŒ์ผ์„ ๋ณ‘ํ•ฉํ•ด์ฃผ์„ธ์š”.
5. ๋ณ‘ํ•ฉ ํ›„, ๋ผ๋ฒจ์„ ์ถœ๋ ฅํ•˜์„ธ์š”.
๋‹จ, ํ…œํ”Œ๋ฆฟ์€ .HWPX(ํ•œ๊ธ€) ํŒŒ์ผ์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. (.HWP ๋ถˆ๊ฐ€)
""")
# Inputs: the HWPX template, how many label slots one template page holds
# (1..12, default 3), and the Excel/CSV sheet with one row per archived item.
tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
n_per_page = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", 1, 12, 3, 1)
data = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx","xls","csv"])
# Main flow: runs once both the template and the data sheet are uploaded.
if tpl and data:
    tpl_bytes = tpl.read()
    # File type decided by extension only: .csv -> read_csv, else Excel.
    df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
    if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
        st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."); st.stop()
    st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
    st.dataframe(df.head(10), use_container_width=True)
    # One aggregated row per box: metadata + item listing + year range.
    merged = build_rows(df)
    boxes = merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4).unique().tolist()
    st.subheader("๐Ÿ”Ž ์—…๋กœ๋“œ๋œ ๋ฐ•์Šค๋ฒˆํ˜ธ ๋ชฉ๋ก")
    st.write(f"์ด **{len(boxes)}**๊ฐœ")
    st.dataframe(pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": boxes}), use_container_width=True, height=240)
    # An empty selection means "generate labels for every box".
    sel = st.multiselect("์ƒ์„ฑํ•  ๋ฐ•์Šค๋ฒˆํ˜ธ ์„ ํƒ (๋น„์šฐ๋ฉด ์ „์ฒด)", options=boxes)
    work = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(sel)] if sel else merged
    records = work.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")
    # Preview of the placeholder mapping the first page would receive.
    st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
    keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ œ๋ชฉ","์—…๋ฌด๋ช…"]
    mapping_preview = {}
    for i in range(int(n_per_page)):
        if i < len(records):
            r = records[i]
            mapping_preview.update({
                f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}": r.get("๋ฐ•์Šค๋ฒˆํ˜ธ",""),
                # The ์ข…๋ฃŒ์—ฐ๋„N slot is filled with the computed ์ƒ์‚ฐ์—ฐ๋„ range.
                f"์ข…๋ฃŒ์—ฐ๋„{i+1}": r.get("์ƒ์‚ฐ์—ฐ๋„",""),
                f"๋ณด์กด๊ธฐ๊ฐ„{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ„",""),
                f"๋‹จ์œ„์—…๋ฌด{i+1}": r.get("๋‹จ์œ„์—…๋ฌด",""),
                f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
                f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
                f"์ œ๋ชฉ{i+1}": r.get("์ œ๋ชฉ",""),
                f"์—…๋ฌด๋ช…{i+1}": r.get("์ œ๋ชฉ",""),  # in case the template uses '์—…๋ฌด๋ช…1'-style names
            })
        else:
            # Unused slots on the page are blanked out.
            for k in keys: mapping_preview[f"{k}{i+1}"] = ""
    st.dataframe(pd.DataFrame([{"ํ‚ค":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
                 use_container_width=True, height=320)
    if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
        # One filled-in HWPX per page, all bundled into a single in-memory ZIP.
        mem = io.BytesIO(); zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
        pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)  # ceiling division
        debug_all = []
        for p in range(pages):
            chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
            mapping = {}
            for i in range(int(n_per_page)):
                if i < len(chunk):
                    r = chunk[i]
                    mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","")
                    mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","")
                    mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„","")
                    mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด","")
                    mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
                    mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
                    title_val = r.get("์ œ๋ชฉ","")
                    mapping[f"์ œ๋ชฉ{i+1}"] = title_val
                    # Duplicate under ์—…๋ฌด๋ช…N in case the template uses that field name.
                    mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val
                else:
                    for k in keys: mapping[f"{k}{i+1}"] = ""
            out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
            debug_all.append({"page": p+1, "stats": dbg})
            # Output filename strings the page's box numbers together.
            name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]) if chunk else f"empty_{p+1}"
            zout.writestr(f"label_{name}.hwpx", out_hwpx)
        zout.close(); mem.seek(0)
        st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem, file_name="labels_by_page.zip", mime="application/zip")
        st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
                           file_name="debug.json", mime="application/json")