dohyune committed on
Commit
4d98592
·
verified ·
1 Parent(s): f2a332f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +195 -110
app.py CHANGED
@@ -1,50 +1,93 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import io, zipfile, re, html
4
-
5
- st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ", layout="wide")
6
-
7
- # =========================================================
8
- # HWPX ํ† ํฐ ์น˜ํ™˜ ํ•จ์ˆ˜
9
- # =========================================================
10
- def replace_tokens_in_hwpx(hwpx_bytes: bytes, mapping: dict, collect_debug: bool=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
- - {{ํ† ํฐ}} ๋ฌธ์ž์—ด์„ ์ง์ ‘ ์น˜ํ™˜
13
- - ๋ชฉ๋ก(list) ๊ณ„์—ด์€ ์ค„๋ฐ”๊ฟˆ์„ <hp:lineBreak/> ๋กœ ์ฒ˜๋ฆฌ
14
- - collect_debug=True ์‹œ debug_info ๋ฐ˜ํ™˜
 
 
15
  """
16
- debug_info = {"token_hits": {}, "files_touched": []} if collect_debug else None
17
 
18
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
19
  mem_out = io.BytesIO()
20
  zout = zipfile.ZipFile(mem_out, "w")
21
 
22
- def build_runs(value: str, is_list: bool) -> str:
23
- if value is None:
24
- return ""
25
- text = str(value)
26
- if not is_list:
27
- return html.escape(text)
28
- # ์ค„๋ฐ”๊ฟˆ์€ ํ•œ๊ธ€์—์„œ <hp:lineBreak/> ํ•„์š”
29
- lines = text.replace("\r\n", "\n").split("\n")
30
- parts = []
31
- for i, ln in enumerate(lines):
32
- if i > 0:
33
- parts.append("<hp:lineBreak/>")
34
- parts.append(html.escape(ln))
35
- return "".join(parts)
36
-
37
- def repl_xml(xml_text: str, kv: dict) -> str:
38
- for k, v in kv.items():
39
- is_list = bool(re.match(r"^(๋ชฉ๋ก|list)\d*$", k, flags=re.IGNORECASE))
40
- token = f"{{{{{k}}}}}" # e.g. {{๋ฐ•์Šค๋ฒˆํ˜ธ1}}
41
- if token in xml_text:
42
- xml_text = xml_text.replace(token, build_runs(v, is_list))
43
- if collect_debug:
44
- debug_info["token_hits"][k] = debug_info["token_hits"].get(k, 0) + 1
45
- return xml_text
46
-
47
- # mimetype์€ ๋ฌด์••์ถ• + ์ œ์ผ ๋จผ์ € ๊ธฐ๋ก
48
  names = zin.namelist()
49
  if "mimetype" in names:
50
  data = zin.read("mimetype")
@@ -52,6 +95,26 @@ def replace_tokens_in_hwpx(hwpx_bytes: bytes, mapping: dict, collect_debug: bool
52
  zi.compress_type = zipfile.ZIP_STORED
53
  zout.writestr(zi, data)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  for e in zin.infolist():
56
  if e.filename == "mimetype":
57
  continue
@@ -59,9 +122,9 @@ def replace_tokens_in_hwpx(hwpx_bytes: bytes, mapping: dict, collect_debug: bool
59
  if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
60
  try:
61
  s = data.decode("utf-8", errors="ignore")
62
- s2 = repl_xml(s, mapping)
63
- if collect_debug and s2 != s:
64
- debug_info["files_touched"].append(e.filename)
65
  data = s2.encode("utf-8")
66
  except Exception:
67
  pass
@@ -69,78 +132,100 @@ def replace_tokens_in_hwpx(hwpx_bytes: bytes, mapping: dict, collect_debug: bool
69
  zi.compress_type = zipfile.ZIP_DEFLATED
70
  zout.writestr(zi, data)
71
 
72
- zin.close()
73
- zout.close()
74
- mem_out.seek(0)
75
- return (mem_out.getvalue(), debug_info) if collect_debug else mem_out.getvalue()
76
-
77
 
78
- # =========================================================
79
- # Streamlit UI
80
- # =========================================================
81
- st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (.HWPX ํ† ํฐ ๋ฒ„์ „)")
82
-
83
- st.markdown("""
84
- HWPX ํ…œํ”Œ๋ฆฟ ์•ˆ์— `{{๋ฐ•์Šค๋ฒˆํ˜ธ1}}`, `{{์ข…๋ฃŒ์—ฐ๋„1}}`, `{{๋ณด์กด๊ธฐ๊ฐ„1}}`, `{{๋‹จ์œ„์—…๋ฌด1}}`,
85
- `{{๊ธฐ๋ก๋ฌผ์ฒ 1}}`, `{{๋ชฉ๋ก1}}` ๊ฐ™์€ ํ† ํฐ์„ ๋„ฃ์–ด์ฃผ์„ธ์š”.
86
-
87
- - ์—‘์…€/CSV ์—…๋กœ๋“œ โ†’ ์ปฌ๋Ÿผ๋ช… ๋งคํ•‘
88
- - ๋ผ๋ฒจ ์ƒ์„ฑ โ†’ ZIP์œผ๋กœ ๋‹ค์šด๋กœ๋“œ
89
- """)
90
-
91
- # ------------------------
92
- # ์—…๋กœ๋“œ ์˜์—ญ
93
- # ------------------------
94
  tpl_file = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
95
- excel_file = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx", "xls", "csv"])
 
96
 
97
- if tpl_file and excel_file:
98
- # ํ…œํ”Œ๋ฆฟ ์ฝ๊ธฐ
99
  tpl_bytes = tpl_file.read()
 
100
 
101
- # ๋ฐ์ดํ„ฐ ์ฝ๊ธฐ
102
- if excel_file.name.endswith(".csv"):
103
- df = pd.read_csv(excel_file)
104
- else:
105
- df = pd.read_excel(excel_file)
106
-
107
- st.subheader("๐Ÿ“‹ ๋ฐ์ดํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
108
- st.dataframe(df.head())
109
-
110
- box_col = "๋ฐ•์Šค๋ฒˆํ˜ธ"
111
- if box_col not in df.columns:
112
  st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
113
- else:
114
- st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
115
-
116
- # ๋ฐ•์Šค๋ฒˆํ˜ธ ๋ชฉ๋ก
117
- st.subheader("๐Ÿ”Ž ์—…๋กœ๋“œ๋œ ๋ฐ•์Šค๋ฒˆํ˜ธ ๋ชฉ๋ก")
118
- st.write(f"์ด {len(df[box_col].unique())}๊ฐœ")
119
- selected_boxes = st.multiselect("์ƒ์„ฑํ•  ๋ฐ•์Šค๋ฒˆํ˜ธ ์„ ํƒ (๋น„์šฐ๋ฉด ์ „์ฒด)",
120
- df[box_col].unique().tolist())
121
-
122
- # ๋ผ๋ฒจ ์ƒ์„ฑ ๋ฒ„ํŠผ
123
- if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ZIP)"):
124
- mem_zip = io.BytesIO()
125
- zout = zipfile.ZipFile(mem_zip, "w")
126
-
127
- for _, row in df.iterrows():
128
- box_no = str(row[box_col])
129
- if selected_boxes and box_no not in selected_boxes:
130
- continue
131
-
132
- mapping = {}
133
- for i, col in enumerate(df.columns, start=1):
134
- key = col
135
- value = row[col]
136
- mapping[key] = value
137
-
138
- hwpx_bytes, dbg = replace_tokens_in_hwpx(tpl_bytes, mapping, collect_debug=True)
139
-
140
- fn = f"label_{box_no}.hwpx"
141
- zout.writestr(fn, hwpx_bytes)
142
-
143
- zout.close()
144
- mem_zip.seek(0)
145
- st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem_zip,
146
- file_name="labels.zip", mime="application/zip")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import io, zipfile, re, html, json
4
+
5
+ st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (ํ† ํฐยท๋ฐฐ์น˜)", layout="wide")
6
+ st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (.HWPX ํ† ํฐยท๋ฐฐ์น˜ ์ง€์›)")
7
+
8
+ with st.expander("์‚ฌ์šฉ ๋ฐฉ๋ฒ•", expanded=True):
9
+ st.markdown("""
10
+ 1) **HWPX ํ…œํ”Œ๋ฆฟ**: ๋ผ๋ฒจ ํ•œ ํŽ˜์ด์ง€์— `{{๋ฐ•์Šค๋ฒˆํ˜ธ1}} โ€ฆ {{๋ฐ•์Šค๋ฒˆํ˜ธN}}`, `{{์ข…๋ฃŒ์—ฐ๋„1}} โ€ฆ` ์ฒ˜๋Ÿผ **๋ฒˆํ˜ธ๊ฐ€ ๋ถ™์€ ํ† ํฐ**์„ ๋„ฃ์–ด ์ฃผ์„ธ์š”.
11
+ - ์‚ฌ์šฉ ํ† ํฐ ์˜ˆ: `{{๋ฐ•์Šค๋ฒˆํ˜ธi}}`, `{{์ข…๋ฃŒ์—ฐ๋„i}}`, `{{๋ณด์กด๊ธฐ๊ฐ„i}}`, `{{๋‹จ์œ„์—…๋ฌดi}}`, `{{๊ธฐ๋ก๋ฌผ์ฒ i}}`, `{{๋ชฉ๋กi}}` (i = 1..N)
12
+ 2) **์—‘์…€/CSV ์—…๋กœ๋“œ** โ†’ `๋ฐ•์Šค๋ฒˆํ˜ธ`๋Š” ํ•„์ˆ˜, ๋‚˜๋จธ์ง€๋Š” ์žˆ์œผ๋ฉด ์ž๋™ ๋ฐ˜์˜
13
+ 3) **ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(N)** ๋ฅผ ์ง€์ •ํ•˜๋ฉด N๊ฐœ์”ฉ ๋ฌถ์–ด ํ•œ ํŽ˜์ด์ง€๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
14
+ 4) **ZIP ๋‹ค์šด๋กœ๋“œ**๋ฅผ ๋ฐ›์œผ๋ฉด `label_0001_0003.hwpx` ์ฒ˜๋Ÿผ ํŽ˜์ด์ง€๋ณ„ ํŒŒ์ผ์ด ๋“ค์–ด ์žˆ์Šต๋‹ˆ๋‹ค.
15
+ """)
16
+
17
+ # =========================
18
+ # ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ
19
+ # =========================
20
def compute_year_range(series: pd.Series) -> str:
    """Return the year span of *series* as a zero-padded 'MIN-MAX' string.

    Values that are empty, whitespace-only, '0', '0000', or non-numeric are
    ignored. When nothing valid remains, the placeholder '0000-0000' is
    returned so the label template always has something to show.
    """
    # fillna must run BEFORE astype(str): the original order turned NaN into
    # the literal string "nan", which slipped past the placeholder filter and
    # was only rescued incidentally by to_numeric's coercion below.
    s = series.fillna("").astype(str).str.strip()
    valid = s[~s.isin(["", "0", "0000"])]
    years = pd.to_numeric(valid, errors="coerce").dropna().astype(int)
    if years.empty:
        return "0000-0000"
    return f"{years.min():04d}-{years.max():04d}"
29
+
30
def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the row-level sheet into one record per 박스번호.

    Produces, per box:
      - 생산연도: 'MIN-MAX' range of 종료연도 (via compute_year_range),
        or the '0000-0000' placeholder when the column is absent
      - 목록: a CRLF-joined bullet list of '관리번호 제목' (or just 제목)
      - representative metadata columns taken from the first row of each box

    Requires a 박스번호 column; 종료연도 / 관리번호 / 제목 are optional.
    """
    df = df.copy()
    # 표준화: zero-pad box numbers so grouping and sorting are stable
    df["박스번호"] = df["박스번호"].astype(str).str.zfill(4)
    if "제목" in df.columns:
        df["제목"] = df["제목"].astype(str)

    # 생산연도(범위) = 종료연도 그룹 범위
    if "종료연도" in df.columns:
        prod_df = df.groupby("박스번호")["종료연도"].apply(compute_year_range).reset_index()
        prod_df.columns = ["박스번호", "생산연도"]
    else:
        prod_df = pd.DataFrame({"박스번호": df["박스번호"].unique(), "생산연도": "0000-0000"})

    # 목록(관리번호 + 제목) — 제목 column is optional; the original code read
    # r['제목'] unconditionally and raised KeyError when it was missing.
    has_mgmt = "관리번호" in df.columns
    has_title = "제목" in df.columns
    list_rows = []
    for box, g in df.groupby("박스번호"):
        lines = []
        for _, r in g.iterrows():
            title = r["제목"] if has_title else ""
            if has_mgmt:
                lines.append(f"- {r['관리번호']} {title}")
            else:
                lines.append(f"- {title}")
        list_rows.append({"박스번호": box, "목록": "\r\n".join(lines)})
    # columns= keeps the merge keys present even when list_rows is empty
    list_df = pd.DataFrame(list_rows, columns=["박스번호", "목록"])

    # 대표 메타: first row of each box for whichever columns exist
    meta_cols = ["박스번호", "종료연도", "보존기간", "단위업무", "기록물철", "제목"]
    meta_exist = [c for c in meta_cols if c in df.columns]
    if meta_exist:
        meta_df = df.groupby("박스번호", as_index=False).first()[meta_exist]
    else:
        meta_df = pd.DataFrame({"박스번호": df["박스번호"].unique()})

    return meta_df.merge(list_df, on="박스번호", how="left").merge(prod_df, on="박스번호", how="left")
61
+
62
+ # =========================
63
+ # HWPX ํ† ํฐ ์น˜ํ™˜ (๋ฐฐ์น˜)
64
+ # =========================
65
+ def _build_runs_for_list(text: str) -> str:
66
+ """ ๋ชฉ๋ก ์ค„๋ฐ”๊ฟˆ์„ <hp:lineBreak/>๋กœ ๋ฐ”๊พผ ๋ฌธ์ž์—ด(ํ† ํฐ ์ž๋ฆฌ์— ๋“ค์–ด๊ฐˆ ํ…์ŠคํŠธ) """
67
+ if text is None: return ""
68
+ text = str(text)
69
+ lines = text.replace("\r\n", "\n").split("\n")
70
+ parts = []
71
+ for i, ln in enumerate(lines):
72
+ if i > 0:
73
+ parts.append("<hp:lineBreak/>")
74
+ parts.append(html.escape(ln))
75
+ return "".join(parts)
76
+
77
+ def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug: bool=False):
78
  """
79
+ mapping ์˜ˆ:
80
+ {'๋ฐ•์Šค๋ฒˆํ˜ธ1': '0001', '์ข…๋ฃŒ์—ฐ๋„1': '1999-2002', '๋ชฉ๋ก1': '<hp:run..>',
81
+ '๋ฐ•์Šค๋ฒˆํ˜ธ2': '0002', ...}
82
+ - {{ํ† ํฐ}} ๋ฌธ์ž์—ด์„ ์ง์ ‘ ์น˜ํ™˜ (ํ† ํฐ์€ ํ•œ run ์•ˆ์— ์žˆ์–ด์•ผ ์•ˆ์ „)
83
+ - mimetype์€ ๋ฌด์••์ถ• + ์ฒซ ์—”ํŠธ๋ฆฌ
84
  """
85
+ dbg = {"token_hits": {}, "files_touched": []} if collect_debug else None
86
 
87
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
88
  mem_out = io.BytesIO()
89
  zout = zipfile.ZipFile(mem_out, "w")
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  names = zin.namelist()
92
  if "mimetype" in names:
93
  data = zin.read("mimetype")
 
95
  zi.compress_type = zipfile.ZIP_STORED
96
  zout.writestr(zi, data)
97
 
98
+ token_keys = list(mapping.keys())
99
+
100
+ def repl_xml(xml_text: str) -> str:
101
+ changed = False
102
+ # ๋น ๋ฅธ ๊ฒฝ๋กœ: ํฌํ•จ๋œ ํ‚ค๋งŒ ์ˆœํšŒ (๊ฐ„๋‹จ/์•ˆ์ „)
103
+ for k in token_keys:
104
+ tok = f"{{{{{k}}}}}"
105
+ if tok in xml_text:
106
+ v = mapping.get(k, "")
107
+ # ๋ชฉ๋ก ๊ณ„์—ด ์ค„๋ฐ”๊ฟˆ ์ฒ˜๋ฆฌ
108
+ if re.match(r"^(๋ชฉ๋ก|list)\d+$", k):
109
+ v = _build_runs_for_list(v)
110
+ else:
111
+ v = html.escape("" if v is None else str(v))
112
+ xml_text = xml_text.replace(tok, v)
113
+ changed = True
114
+ if collect_debug:
115
+ dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
116
+ return xml_text, changed
117
+
118
  for e in zin.infolist():
119
  if e.filename == "mimetype":
120
  continue
 
122
  if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
123
  try:
124
  s = data.decode("utf-8", errors="ignore")
125
+ s2, changed = repl_xml(s)
126
+ if collect_debug and changed:
127
+ dbg["files_touched"].append(e.filename)
128
  data = s2.encode("utf-8")
129
  except Exception:
130
  pass
 
132
  zi.compress_type = zipfile.ZIP_DEFLATED
133
  zout.writestr(zi, data)
134
 
135
+ zin.close(); zout.close(); mem_out.seek(0)
136
+ return (mem_out.getvalue(), dbg) if collect_debug else (mem_out.getvalue(), None)
 
 
 
137
 
138
# =========================
# UI
# =========================
tpl_file = st.file_uploader("📄 HWPX 템플릿 업로드", type=["hwpx"])
batch_size = st.number_input("템플릿의 라벨 세트 개수 (한 페이지에 몇 개?)", min_value=1, max_value=12, value=3, step=1)
data_file = st.file_uploader("📊 데이터 업로드 (Excel/CSV)", type=["xlsx","xls","csv"])

# Token base names expected in the template ({{박스번호1}}, {{목록2}}, ...).
TOKEN_KEYS = ["박스번호", "종료연도", "보존기간", "단위업무", "기록물철", "목록"]

def _page_mapping(chunk: list, n: int) -> dict:
    """Build the {token-name -> value} dict for one page of up to *n* label sets.

    Slot i (1-based) takes row i-1 of *chunk*; slots past the end of *chunk*
    are mapped to "" so leftover {{token}} markers get blanked in the output.
    The 종료연도 token is fed the aggregated 생산연도 range computed upstream.
    (Extracted because preview and generation previously duplicated this
    logic verbatim — a drift hazard.)
    """
    mapping = {}
    for i in range(n):
        row = chunk[i] if i < len(chunk) else None
        for k in TOKEN_KEYS:
            if row is None:
                mapping[f"{k}{i+1}"] = ""
            elif k == "종료연도":
                mapping[f"{k}{i+1}"] = row.get("생산연도", "")
            else:
                mapping[f"{k}{i+1}"] = row.get(k, "")
    return mapping

if tpl_file and data_file:
    tpl_bytes = tpl_file.read()
    df = pd.read_csv(data_file) if data_file.name.lower().endswith(".csv") else pd.read_excel(data_file)

    if "박스번호" not in df.columns:
        st.error("❌ 필수 컬럼 '박스번호'가 없습니다.")
        st.stop()

    st.success("✅ 위치 매핑 완료 (엑셀 측)")
    st.dataframe(df.head(10), use_container_width=True)

    merged = build_merged_df(df)
    box_list = merged["박스번호"].astype(str).str.zfill(4).unique().tolist()

    st.subheader("🔎 업로드된 박스번호 목록")
    st.write(f"총 **{len(box_list)}**개")
    st.dataframe(pd.DataFrame({"박스번호": box_list}), use_container_width=True, height=240)

    selected = st.multiselect("생성할 박스번호 선택 (비우면 전체 생성)", options=box_list)

    work_df = merged[merged["박스번호"].isin(selected)] if selected else merged
    rows = work_df.sort_values("박스번호").to_dict(orient="records")

    # 1페이지 미리보기 — 생성 시와 동일한 _page_mapping을 사용하므로 표시와
    # 실제 출력이 어긋날 수 없다.
    st.subheader("🧪 1페이지 토큰 매핑 미리보기")
    n = int(batch_size)
    mapping_preview = _page_mapping(rows[:n], n)

    st.dataframe(
        pd.DataFrame(
            [{"토큰": k, "값(앞부분)": (str(v)[:120] if v is not None else ""), "길이": (len(str(v)) if v is not None else 0)}
             for k, v in sorted(mapping_preview.items())]
        ),
        use_container_width=True, height=320
    )

    if st.button("🚀 라벨 생성 (페이지별 HWPX ZIP)"):
        mem_zip = io.BytesIO()
        zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)

        total = len(rows)
        pages = (total + n - 1) // n  # ceiling division: pages of n label sets
        all_debug = []

        for p in range(pages):
            chunk = rows[p * n : (p + 1) * n]
            mapping = _page_mapping(chunk, n)

            out_hwpx, dbg = replace_tokens_in_hwpx_batch(tpl_bytes, mapping, collect_debug=True)
            all_debug.append({"page": p + 1, "mapping_keys": sorted(mapping.keys()), "stats": dbg})

            # File name carries the box numbers on that page, e.g. label_0001_0003.hwpx
            page_boxes = [r.get("박스번호", "") for r in chunk]
            safe = "_".join(page_boxes) if page_boxes else f"empty_{p+1}"
            zout.writestr(f"label_{safe}.hwpx", out_hwpx)

        zout.close()
        mem_zip.seek(0)
        st.download_button("⬇️ ZIP 다운로드", data=mem_zip, file_name="labels_by_page.zip", mime="application/zip")
        st.download_button("⬇️ 디버그 리포트(JSON)", data=json.dumps(all_debug, ensure_ascii=False, indent=2),
                           file_name="debug_by_page.json", mime="application/json")

    st.caption("※ 템플릿의 토큰은 **반드시 run 하나에 온전한 문자열**로 넣어주세요(예: `{{박스번호1}}`). 토큰이 글자 단위로 쪼개져 여러 run에 나뉘면 치환이 되지 않을 수 있습니다.")