Update app.py
Browse files
app.py
CHANGED
|
@@ -2,28 +2,10 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
|
| 5 |
-
st.set_page_config(page_title="๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ (ํ ํฐ
|
| 6 |
-
st.title("๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ (.HWPX โ ํ ํฐ
|
| 7 |
|
| 8 |
-
|
| 9 |
-
st.markdown("""
|
| 10 |
-
**ํ
ํ๋ฆฟ ์ค๋น**
|
| 11 |
-
- ๋ผ๋ฒจ ํ ํ์ด์ง์ `{{๋ฐ์ค๋ฒํธ1}} ... {{๋ฐ์ค๋ฒํธN}}`, `{{์ข
๋ฃ์ฐ๋1}} ...`, `{{๋ณด์กด๊ธฐ๊ฐ1}} ...`, `{{๋จ์์
๋ฌด1}} ...`, `{{๊ธฐ๋ก๋ฌผ์ฒ 1}} ...`, `{{๋ชฉ๋ก1}} ...` ์ฒ๋ผ **๋ฒํธ๊ฐ ๋ถ์ ํ ํฐ**์ ๋ฃ์ด ์ฃผ์ธ์.
|
| 12 |
-
- ํ ํฐ์ ๊ฐ๋ฅํ๋ฉด ํ ๋ฉ์ด๋ฆฌ ํ
์คํธ๋ก ์
๋ ฅํ์ธ์. (ํ์ง๋ง ์ด ์ฑ์ ํ ํฐ์ด ์ฌ๋ฌ run์ผ๋ก ์ชผ๊ฐ์ ธ ์์ด๋ **์๋ ๋ณํฉ**ํด์ ์นํํฉ๋๋ค.)
|
| 13 |
-
|
| 14 |
-
**๋ฐ์ดํฐ**
|
| 15 |
-
- ํ์: `๋ฐ์ค๋ฒํธ`
|
| 16 |
-
- ๊ถ์ฅ: `์ข
๋ฃ์ฐ๋`, `๋ณด์กด๊ธฐ๊ฐ`, `๋จ์์
๋ฌด`, `๊ธฐ๋ก๋ฌผ์ฒ `, `์ ๋ชฉ`
|
| 17 |
-
- ๋ชฉ๋ก์ (๊ด๋ฆฌ๋ฒํธ + ์ ๋ชฉ) ์กฐํฉ์ผ๋ก ์๋ ์์ฑ. `์ข
๋ฃ์ฐ๋`๋ ๋ฐ์ค๋ณ ์ต์~์ต๋๋ก ๋ฌถ์ด **์์ฐ์ฐ๋(๋ฒ์)** ๋ก ๋ค์ด๊ฐ๋๋ค.
|
| 18 |
-
|
| 19 |
-
**์ถ๋ ฅ**
|
| 20 |
-
- ํ
ํ๋ฆฟ์ ๋ผ๋ฒจ ์ธํธ ๊ฐ์(N)๋ฅผ ์ง์ ํ๋ฉด N๊ฐ์ฉ ๋ฌถ์ด **ํ์ด์ง ๋จ์ HWPX**๋ฅผ ๋ง๋ญ๋๋ค.
|
| 21 |
-
- ZIP ์ ํ์ผ๋ช
์: `label_0001_0003.hwpx` (ํด๋น ํ์ด์ง์ ๋ค์ด๊ฐ ๋ฐ์ค๋ฒํธ)
|
| 22 |
-
""")
|
| 23 |
-
|
| 24 |
-
# ---------------------------
|
| 25 |
-
# ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ
|
| 26 |
-
# ---------------------------
|
| 27 |
def compute_year_range(series: pd.Series) -> str:
|
| 28 |
s = series.astype(str).fillna("")
|
| 29 |
valid = s[~s.isin(["", "0", "0000"])]
|
|
@@ -40,14 +22,14 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 40 |
if "์ ๋ชฉ" in df.columns:
|
| 41 |
df["์ ๋ชฉ"] = df["์ ๋ชฉ"].astype(str)
|
| 42 |
|
| 43 |
-
# ์์ฐ์ฐ๋(๋ฒ์)
|
| 44 |
if "์ข
๋ฃ์ฐ๋" in df.columns:
|
| 45 |
prod_df = df.groupby("๋ฐ์ค๋ฒํธ")["์ข
๋ฃ์ฐ๋"].apply(compute_year_range).reset_index()
|
| 46 |
prod_df.columns = ["๋ฐ์ค๋ฒํธ", "์์ฐ์ฐ๋"]
|
| 47 |
else:
|
| 48 |
prod_df = pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique(), "์์ฐ์ฐ๋": "0000-0000"})
|
| 49 |
|
| 50 |
-
# ๋ชฉ๋ก
|
| 51 |
has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
|
| 52 |
list_rows = []
|
| 53 |
for box, g in df.groupby("๋ฐ์ค๋ฒํธ"):
|
|
@@ -58,7 +40,6 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 58 |
list_rows.append({"๋ฐ์ค๋ฒํธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
|
| 59 |
list_df = pd.DataFrame(list_rows)
|
| 60 |
|
| 61 |
-
# ๋ํ ๋ฉํ
|
| 62 |
meta_cols = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ ๋ชฉ"]
|
| 63 |
meta_exist = [c for c in meta_cols if c in df.columns]
|
| 64 |
meta_df = df.groupby("๋ฐ์ค๋ฒํธ", as_index=False).first()[meta_exist] if meta_exist \
|
|
@@ -66,72 +47,32 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 66 |
|
| 67 |
return meta_df.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(prod_df, on="๋ฐ์ค๋ฒํธ", how="left")
|
| 68 |
|
| 69 |
-
|
| 70 |
-
# HWPX ํ ํฐ ์นํ (๋ฐ ๋ณํฉ ํฌํจ)
|
| 71 |
-
# ---------------------------
|
| 72 |
-
# run ๊ฒฝ๊ณ ๋ณํฉ: </hp:t></hp:run><hp:run...><hp:t> ์ฌ์ด ํ๊ทธ๋ค์ ์ง์ ํ
์คํธ๋ฅผ ์ด์ด ๋ถ์
|
| 73 |
-
RUN_JOIN_RE = re.compile(
|
| 74 |
-
r'</hp:t>\s*</hp:run>\s*<hp:run[^>]*>\s*<hp:t>',
|
| 75 |
-
flags=re.DOTALL
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
def _build_list_text(text: str) -> str:
|
| 79 |
if text is None: return ""
|
| 80 |
-
|
| 81 |
-
lines = text.replace("\r\n", "\n").split("\n")
|
| 82 |
parts = []
|
| 83 |
for i, ln in enumerate(lines):
|
| 84 |
if i > 0:
|
| 85 |
parts.append("<hp:lineBreak/>")
|
| 86 |
-
parts.append(html.escape(ln))
|
| 87 |
return "".join(parts)
|
| 88 |
|
| 89 |
-
def
|
| 90 |
-
""
|
| 91 |
-
mapping: {'๋ฐ์ค๋ฒํธ1': '0001', '์ข
๋ฃ์ฐ๋1': '1999-2002', '๋ชฉ๋ก1': '- a\\n- b', ...}
|
| 92 |
-
์ ์ฐจ:
|
| 93 |
-
1) XML ๋ก๋
|
| 94 |
-
2) ์ธ์ run ๋ณํฉ (RUN_JOIN_RE)
|
| 95 |
-
3) {{ํ ํฐ}} -> ๊ฐ (๋ชฉ๋ก์ <hp:lineBreak/>)
|
| 96 |
-
4) mimetype: ๋ฌด์์ถ(STORED) + ์ฒซ ์ํธ๋ฆฌ
|
| 97 |
-
"""
|
| 98 |
-
dbg = {"token_hits": {}, "files_touched": []} if collect_debug else None
|
| 99 |
-
|
| 100 |
-
zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
|
| 101 |
-
out_buf = io.BytesIO()
|
| 102 |
-
zout = zipfile.ZipFile(out_buf, "w")
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
names = zin.namelist()
|
|
|
|
|
|
|
| 105 |
if "mimetype" in names:
|
| 106 |
-
data = zin.read("mimetype")
|
| 107 |
zi = zipfile.ZipInfo("mimetype")
|
| 108 |
zi.compress_type = zipfile.ZIP_STORED
|
| 109 |
-
zout.writestr(zi,
|
| 110 |
-
|
| 111 |
-
token_keys = list(mapping.keys())
|
| 112 |
-
|
| 113 |
-
def do_replace(s: str) -> (str, bool):
|
| 114 |
-
changed_any = False
|
| 115 |
-
# 1) run ๋ณํฉ
|
| 116 |
-
s2 = RUN_JOIN_RE.sub('', s)
|
| 117 |
-
if s2 != s:
|
| 118 |
-
changed_any = True
|
| 119 |
-
s = s2
|
| 120 |
-
# 2) ํ ํฐ ์นํ
|
| 121 |
-
for k in token_keys:
|
| 122 |
-
tok = f"{{{{{k}}}}}"
|
| 123 |
-
if tok in s:
|
| 124 |
-
val = mapping.get(k, "")
|
| 125 |
-
if re.match(r"^(๋ชฉ๋ก|list)\d+$", k):
|
| 126 |
-
val = _build_list_text(val)
|
| 127 |
-
else:
|
| 128 |
-
val = html.escape("" if val is None else str(val))
|
| 129 |
-
s = s.replace(tok, val)
|
| 130 |
-
changed_any = True
|
| 131 |
-
if collect_debug:
|
| 132 |
-
dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
|
| 133 |
-
return s, changed_any
|
| 134 |
|
|
|
|
| 135 |
for e in zin.infolist():
|
| 136 |
if e.filename == "mimetype":
|
| 137 |
continue
|
|
@@ -139,9 +80,7 @@ def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug
|
|
| 139 |
if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
|
| 140 |
try:
|
| 141 |
s = data.decode("utf-8", errors="ignore")
|
| 142 |
-
s2
|
| 143 |
-
if collect_debug and changed:
|
| 144 |
-
dbg["files_touched"].append(e.filename)
|
| 145 |
data = s2.encode("utf-8")
|
| 146 |
except Exception:
|
| 147 |
pass
|
|
@@ -149,20 +88,132 @@ def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug
|
|
| 149 |
zi.compress_type = zipfile.ZIP_DEFLATED
|
| 150 |
zout.writestr(zi, data)
|
| 151 |
|
| 152 |
-
|
| 153 |
-
return
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
batch_size = st.number_input("ํ
ํ๋ฆฟ์ ๋ผ๋ฒจ ์ธํธ ๊ฐ์ (ํ ํ์ด์ง N๊ฐ)", min_value=1, max_value=12, value=3, step=1)
|
| 160 |
-
|
| 161 |
|
| 162 |
-
if
|
| 163 |
-
tpl_bytes =
|
| 164 |
-
|
|
|
|
| 165 |
|
|
|
|
| 166 |
if "๋ฐ์ค๋ฒํธ" not in df.columns:
|
| 167 |
st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค.")
|
| 168 |
st.stop()
|
|
@@ -178,71 +229,63 @@ if tpl_file and data_file:
|
|
| 178 |
st.dataframe(pd.DataFrame({"๋ฐ์ค๋ฒํธ": box_list}), use_container_width=True, height=240)
|
| 179 |
|
| 180 |
selected = st.multiselect("์์ฑํ ๋ฐ์ค๋ฒํธ ์ ํ (๋น์ฐ๋ฉด ์ ์ฒด ์์ฑ)", options=box_list)
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
rows = work_df.sort_values("๋ฐ์ค๋ฒํธ").to_dict(orient="records")
|
| 184 |
|
| 185 |
-
# 1ํ์ด์ง ๋ฏธ๋ฆฌ๋ณด๊ธฐ
|
| 186 |
-
st.subheader("๐งช 1ํ์ด์ง
|
| 187 |
-
first_page = rows[:int(batch_size)]
|
| 188 |
keys = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
| 193 |
for k in keys:
|
| 194 |
-
if k
|
| 195 |
-
mapping_preview[f"{k}{i+1}"] = r.get("์์ฐ์ฐ๋","")
|
| 196 |
-
else:
|
| 197 |
-
mapping_preview[f"{k}{i+1}"] = r.get(k,"")
|
| 198 |
else:
|
| 199 |
for k in keys:
|
| 200 |
-
|
| 201 |
-
|
| 202 |
st.dataframe(
|
| 203 |
-
pd.DataFrame(
|
| 204 |
-
[{"ํ ํฐ": k, "๊ฐ(์๋ถ๋ถ)": (str(v)[:120] if v is not None else ""), "๊ธธ์ด": (len(str(v)) if v is not None else 0)}
|
| 205 |
-
for k, v in sorted(mapping_preview.items())]
|
| 206 |
-
),
|
| 207 |
use_container_width=True, height=320
|
| 208 |
)
|
| 209 |
|
| 210 |
if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
|
| 211 |
mem_zip = io.BytesIO()
|
| 212 |
zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
n = int(batch_size)
|
| 215 |
-
total = len(rows)
|
| 216 |
-
pages = (total + n - 1) // n
|
| 217 |
-
|
| 218 |
-
all_debug = []
|
| 219 |
for p in range(pages):
|
| 220 |
-
|
| 221 |
-
chunk = rows[start:start+n]
|
| 222 |
mapping = {}
|
| 223 |
for i in range(n):
|
| 224 |
if i < len(chunk):
|
| 225 |
r = chunk[i]
|
| 226 |
for k in keys:
|
| 227 |
-
if k
|
| 228 |
-
mapping[f"{k}{i+1}"] = r.get("์์ฐ์ฐ๋","")
|
| 229 |
-
else:
|
| 230 |
-
mapping[f"{k}{i+1}"] = r.get(k,"")
|
| 231 |
else:
|
| 232 |
for k in keys:
|
| 233 |
mapping[f"{k}{i+1}"] = ""
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
page_boxes = [r.get("๋ฐ์ค๋ฒํธ","") for r in chunk]
|
| 239 |
-
|
| 240 |
-
zout.writestr(f"label_{
|
| 241 |
|
| 242 |
zout.close(); mem_zip.seek(0)
|
| 243 |
st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem_zip, file_name="labels_by_page.zip", mime="application/zip")
|
| 244 |
-
st.download_button("โฌ๏ธ ๋๋ฒ๊ทธ
|
| 245 |
-
|
| 246 |
-
file_name="debug_by_page.json", mime="application/json")
|
| 247 |
|
| 248 |
-
st.caption("
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import io, zipfile, re, html, json
|
| 4 |
|
| 5 |
+
st.set_page_config(page_title="๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ (ํ๋/ํ ํฐ ์๋๊ฐ์ง)", layout="wide")
|
| 6 |
+
st.title("๐ฆ ๋ฐ์ค๋ผ๋ฒจ ์๋ ์์ฑ๊ธฐ (.HWPX โ ํ๋/ํ ํฐ ์๋๊ฐ์ง)")
|
| 7 |
|
| 8 |
+
# =============== ๊ณตํต ์ ํธ ===============
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def compute_year_range(series: pd.Series) -> str:
|
| 10 |
s = series.astype(str).fillna("")
|
| 11 |
valid = s[~s.isin(["", "0", "0000"])]
|
|
|
|
| 22 |
if "์ ๋ชฉ" in df.columns:
|
| 23 |
df["์ ๋ชฉ"] = df["์ ๋ชฉ"].astype(str)
|
| 24 |
|
| 25 |
+
# ์์ฐ์ฐ๋(๋ฒ์) = ์ข
๋ฃ์ฐ๋ ๊ทธ๋ฃน ๋ฒ์
|
| 26 |
if "์ข
๋ฃ์ฐ๋" in df.columns:
|
| 27 |
prod_df = df.groupby("๋ฐ์ค๋ฒํธ")["์ข
๋ฃ์ฐ๋"].apply(compute_year_range).reset_index()
|
| 28 |
prod_df.columns = ["๋ฐ์ค๋ฒํธ", "์์ฐ์ฐ๋"]
|
| 29 |
else:
|
| 30 |
prod_df = pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique(), "์์ฐ์ฐ๋": "0000-0000"})
|
| 31 |
|
| 32 |
+
# ๋ชฉ๋ก(๊ด๋ฆฌ๋ฒํธ + ์ ๋ชฉ)
|
| 33 |
has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
|
| 34 |
list_rows = []
|
| 35 |
for box, g in df.groupby("๋ฐ์ค๋ฒํธ"):
|
|
|
|
| 40 |
list_rows.append({"๋ฐ์ค๋ฒํธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
|
| 41 |
list_df = pd.DataFrame(list_rows)
|
| 42 |
|
|
|
|
| 43 |
meta_cols = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ ๋ชฉ"]
|
| 44 |
meta_exist = [c for c in meta_cols if c in df.columns]
|
| 45 |
meta_df = df.groupby("๋ฐ์ค๋ฒํธ", as_index=False).first()[meta_exist] if meta_exist \
|
|
|
|
| 47 |
|
| 48 |
return meta_df.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(prod_df, on="๋ฐ์ค๋ฒํธ", how="left")
|
| 49 |
|
| 50 |
+
def _build_list_runs(text: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
if text is None: return ""
|
| 52 |
+
lines = str(text).replace("\r\n", "\n").split("\n")
|
|
|
|
| 53 |
parts = []
|
| 54 |
for i, ln in enumerate(lines):
|
| 55 |
if i > 0:
|
| 56 |
parts.append("<hp:lineBreak/>")
|
| 57 |
+
parts.append(f"<hp:run><hp:t>{html.escape(ln)}</hp:t></hp:run>")
|
| 58 |
return "".join(parts)
|
| 59 |
|
| 60 |
+
def _build_plain_runs(text: str) -> str:
|
| 61 |
+
return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# =============== HWPX ์ฐ๊ธฐ ๊ณตํต (mimetype ๋ฌด์์ถ/๋งจ์) ===============
|
| 64 |
+
def write_hwpx_like_src(zin: zipfile.ZipFile, writer_fn) -> bytes:
|
| 65 |
+
out = io.BytesIO()
|
| 66 |
+
zout = zipfile.ZipFile(out, "w")
|
| 67 |
names = zin.namelist()
|
| 68 |
+
|
| 69 |
+
# 1) mimetype ๋จผ์ ๋ฌด์์ถ
|
| 70 |
if "mimetype" in names:
|
|
|
|
| 71 |
zi = zipfile.ZipInfo("mimetype")
|
| 72 |
zi.compress_type = zipfile.ZIP_STORED
|
| 73 |
+
zout.writestr(zi, zin.read("mimetype"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
+
# 2) ๋๋จธ์ง ํ์ผ
|
| 76 |
for e in zin.infolist():
|
| 77 |
if e.filename == "mimetype":
|
| 78 |
continue
|
|
|
|
| 80 |
if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
|
| 81 |
try:
|
| 82 |
s = data.decode("utf-8", errors="ignore")
|
| 83 |
+
s2 = writer_fn(e.filename, s)
|
|
|
|
|
|
|
| 84 |
data = s2.encode("utf-8")
|
| 85 |
except Exception:
|
| 86 |
pass
|
|
|
|
| 88 |
zi.compress_type = zipfile.ZIP_DEFLATED
|
| 89 |
zout.writestr(zi, data)
|
| 90 |
|
| 91 |
+
zout.close(); out.seek(0)
|
| 92 |
+
return out.getvalue()
|
| 93 |
|
| 94 |
+
# =============== ๋ชจ๋1: ํ ํฐ ์นํ ({{ํค}}) ===============
|
| 95 |
+
RUN_JOIN_RE = re.compile(r'</hp:t>\s*</hp:run>\s*<hp:run[^>]*>\s*<hp:t>', re.DOTALL)
|
| 96 |
+
|
| 97 |
+
def token_mode_apply(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
    """Substitute ``{{key}}`` placeholders inside every Contents/*.xml part.

    Args:
        hwpx_bytes: raw bytes of the template .hwpx (a zip archive).
        mapping: placeholder key -> replacement value. Keys that look like
            list tokens (regex below) are expanded into full <hp:run>
            structures with line breaks; all other values become escaped text.
        collect_debug: when True, also collect per-token hit counts and the
            names of the XML parts that were modified.

    Returns:
        (new_hwpx_bytes, debug_dict) — debug_dict is None unless
        collect_debug is True.
    """
    dbg = {"mode":"token","files_touched":[], "token_hits":{}} if collect_debug else None
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")

    token_keys = list(mapping.keys())

    def writer_fn(fname: str, xml: str) -> str:
        # Callback invoked by write_hwpx_like_src for each Contents/*.xml part.
        changed = False
        # Merge adjacent run boundaries first, so a token the editor split
        # across several <hp:run> elements becomes one contiguous string
        # before the substring replacement below.
        xml2 = RUN_JOIN_RE.sub('', xml)
        if xml2 != xml:
            changed = True
            xml = xml2
        # Plain substring substitution of each {{key}} token.
        for k in token_keys:
            tok = f"{{{{{k}}}}}"
            if tok in xml:
                val = mapping[k]
                if re.match(r"^(๋ชฉ๋ก|list)\d+$", k, re.IGNORECASE):
                    # List-valued token: it sits inside a run, so replace it
                    # with whole <hp:run>/<hp:lineBreak/> structures rather
                    # than bare text.
                    xml = xml.replace(tok, _build_list_runs(val))
                else:
                    xml = xml.replace(tok, html.escape("" if val is None else str(val)))
                changed = True
                if dbg: dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
        if changed and dbg and fname not in dbg["files_touched"]:
            dbg["files_touched"].append(fname)
        return xml

    out = write_hwpx_like_src(zin, writer_fn)
    zin.close()
    return (out, dbg) if collect_debug else (out, None)
|
| 129 |
+
|
| 130 |
+
# =============== ๋ชจ๋2: ํ๋์ปจํธ๋กค ์นํ (๊ฐ์ ํ
์คํธ ์ค๋ณต ์ฝ์
) ===============
|
| 131 |
+
# <hp:fieldBegin ... name="ํค"> ... </hp:fieldBegin> [๋ณธ๋ฌธ] <hp:fieldEnd ... />
|
| 132 |
+
FIELD_BLOCK_RE_TMPL = r'(<hp:fieldBegin[^>]*name="{name}"[^>]*>.*?</hp:fieldBegin>)(.*?)(<hp:fieldEnd[^>]*/>)'
|
| 133 |
+
|
| 134 |
+
def field_mode_apply(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
    """Fill HWPX field controls (<hp:fieldBegin name="...">) from *mapping*.

    For every mapping key that actually appears as a field name in some
    Contents/*.xml part, the text between the field's begin marker and its
    <hp:fieldEnd/> is replaced with runs built from the mapped value, and the
    same runs are inserted once more directly after <hp:fieldEnd/> so the
    value stays visible even in viewers that hide field contents.

    Args:
        hwpx_bytes: raw bytes of the template .hwpx (a zip archive).
        mapping: field name -> replacement value; list-like keys (regex
            below) are rendered as multi-line run structures.
        collect_debug: when True, also collect per-field hit counts and the
            names of the XML parts that were modified.

    Returns:
        (new_hwpx_bytes, debug_dict) — debug_dict is None unless
        collect_debug is True.
    """
    dbg = {"mode":"field","files_touched":[], "field_hits":{}} if collect_debug else None
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")

    # Pre-scan which keys really exist as field names, so the expensive
    # per-key regex pass below only runs for fields that are present.
    contents = [e.filename for e in zin.infolist()
                if e.filename.startswith("Contents/") and e.filename.endswith(".xml")]
    present_keys = set()
    for fn in contents:
        try:
            s = zin.read(fn).decode("utf-8", errors="ignore")
            for k in mapping.keys():
                if f'name="{k}"' in s:
                    present_keys.add(k)
        except Exception:
            # Best-effort scan: an unreadable part is skipped, but (unlike the
            # previous bare `except:`) SystemExit/KeyboardInterrupt propagate.
            pass

    def writer_fn(fname: str, xml: str) -> str:
        # Callback invoked by write_hwpx_like_src for each Contents/*.xml part.
        any_change = False
        for k in present_keys:
            val = mapping.get(k, "")
            is_list = bool(re.match(r"^(๋ชฉ๋ก|list)\d+$", k, re.IGNORECASE))
            pattern = re.compile(FIELD_BLOCK_RE_TMPL.format(name=re.escape(k)), re.DOTALL)

            def _repl(m):
                # Replace the field's interior with freshly built runs.
                inner = _build_list_runs(val) if is_list else _build_plain_runs(val)
                # Duplicate the text once after fieldEnd so it always renders.
                visible_dup = inner
                if dbg: dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + 1
                return f'{m.group(1)}{inner}{m.group(3)}{visible_dup}'

            xml_new, n = pattern.subn(_repl, xml)
            if n:
                any_change = True
                xml = xml_new

        if any_change and dbg and fname not in dbg["files_touched"]:
            dbg["files_touched"].append(fname)
        return xml

    out = write_hwpx_like_src(zin, writer_fn)
    zin.close()
    return (out, dbg) if collect_debug else (out, None)
|
| 177 |
+
|
| 178 |
+
# =============== ๋ชจ๋ ์๋๊ฐ์ง ===============
|
| 179 |
+
def detect_template_mode(hwpx_bytes: bytes) -> str:
    """Sniff which templating convention an HWPX template uses.

    Scans every Contents/*.xml part and returns:
      * "token"   — some part contains a ``{{...}}`` placeholder
                    (token style wins when both styles are present),
      * "field"   — some part contains a named <hp:fieldBegin> control,
      * "unknown" — neither marker was found.
    """
    has_token = False
    has_field = False
    # `with` guarantees the archive is closed even if a read raises
    # (the original leaked the handle on error).
    with zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r") as zin:
        for entry in zin.infolist():
            if not (entry.filename.startswith("Contents/") and entry.filename.endswith(".xml")):
                continue
            try:
                text = zin.read(entry.filename).decode("utf-8", errors="ignore")
            except Exception:
                # Best-effort scan; was a bare `except:`, which also
                # swallowed SystemExit/KeyboardInterrupt.
                continue
            if "{{" in text and "}}" in text:
                has_token = True
            if "<hp:fieldBegin" in text and 'name="' in text:
                has_field = True
            if has_token and has_field:
                break  # both markers seen — nothing more to learn
    if has_token:
        return "token"
    if has_field:
        return "field"
    return "unknown"
|
| 198 |
+
|
| 199 |
+
# =============== Streamlit UI ===============
|
| 200 |
+
with st.expander("์ฌ์ฉ ๋ฐฉ๋ฒ ์์ฝ", expanded=True):
|
| 201 |
+
st.markdown("""
|
| 202 |
+
- ํ
ํ๋ฆฟ์ด **ํ ํฐ(`{{๋ฐ์ค๋ฒํธ1}}` ๋ฑ)** ์ด๋ฉด ์๋์ผ๋ก ํ ํฐ ๋ชจ๋,
|
| 203 |
+
**ํ๊ธ ํ๋์ปจํธ๋กค(`name="๋ฐ์ค๋ฒํธ1"`)** ์ด๋ฉด ํ๋ ๋ชจ๋๋ก ์๋ ์ฒ๋ฆฌํฉ๋๋ค.
|
| 204 |
+
- ํ๋ ๋ชจ๋์์๋ ๊ฐ์ด ์ ๋ณด์ด๋ ๋ฌธ์ ๋ฅผ ๋ง๊ธฐ ์ํด **fieldEnd ๋ค์ ๊ฐ์ ํ
์คํธ๋ฅผ ํ ๋ฒ ๋ ๋ฃ์ต๋๋ค.**
|
| 205 |
+
""")
|
| 206 |
+
|
| 207 |
+
tpl = st.file_uploader("๐ HWPX ํ
ํ๋ฆฟ ์
๋ก๋", type=["hwpx"])
|
| 208 |
batch_size = st.number_input("ํ
ํ๋ฆฟ์ ๋ผ๋ฒจ ์ธํธ ๊ฐ์ (ํ ํ์ด์ง N๊ฐ)", min_value=1, max_value=12, value=3, step=1)
|
| 209 |
+
data = st.file_uploader("๐ ๋ฐ์ดํฐ ์
๋ก๋ (Excel/CSV)", type=["xlsx","xls","csv"])
|
| 210 |
|
| 211 |
+
if tpl and data:
|
| 212 |
+
tpl_bytes = tpl.read()
|
| 213 |
+
mode = detect_template_mode(tpl_bytes)
|
| 214 |
+
st.info(f"ํ์ง๋ ํ
ํ๋ฆฟ ๋ชจ๋: **{mode}**")
|
| 215 |
|
| 216 |
+
df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
|
| 217 |
if "๋ฐ์ค๋ฒํธ" not in df.columns:
|
| 218 |
st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค.")
|
| 219 |
st.stop()
|
|
|
|
| 229 |
st.dataframe(pd.DataFrame({"๋ฐ์ค๋ฒํธ": box_list}), use_container_width=True, height=240)
|
| 230 |
|
| 231 |
selected = st.multiselect("์์ฑํ ๋ฐ์ค๋ฒํธ ์ ํ (๋น์ฐ๋ฉด ์ ์ฒด ์์ฑ)", options=box_list)
|
| 232 |
+
work = merged[merged["๋ฐ์ค๋ฒํธ"].isin(selected)] if selected else merged
|
| 233 |
+
rows = work.sort_values("๋ฐ์ค๋ฒํธ").to_dict(orient="records")
|
|
|
|
| 234 |
|
| 235 |
+
# 1ํ์ด์ง ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋งคํ
|
| 236 |
+
st.subheader("๐งช 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ")
|
|
|
|
| 237 |
keys = ["๋ฐ์ค๋ฒํธ","์ข
๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์
๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
|
| 238 |
+
preview = {}
|
| 239 |
+
n = int(batch_size)
|
| 240 |
+
for i in range(n):
|
| 241 |
+
if i < len(rows):
|
| 242 |
+
r = rows[i]
|
| 243 |
for k in keys:
|
| 244 |
+
preview[f"{k}{i+1}"] = r.get("์์ฐ์ฐ๋","") if k=="์ข
๋ฃ์ฐ๋" else r.get(k,"")
|
|
|
|
|
|
|
|
|
|
| 245 |
else:
|
| 246 |
for k in keys:
|
| 247 |
+
preview[f"{k}{i+1}"] = ""
|
|
|
|
| 248 |
st.dataframe(
|
| 249 |
+
pd.DataFrame([{"ํ ํฐ/ํ๋":k, "๊ฐ ์๋ถ๋ถ":str(v)[:120]} for k,v in sorted(preview.items())]),
|
|
|
|
|
|
|
|
|
|
| 250 |
use_container_width=True, height=320
|
| 251 |
)
|
| 252 |
|
| 253 |
if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
|
| 254 |
mem_zip = io.BytesIO()
|
| 255 |
zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
|
| 256 |
+
pages = (len(rows) + n - 1) // n
|
| 257 |
+
all_dbg = []
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
for p in range(pages):
|
| 260 |
+
chunk = rows[p*n:(p+1)*n]
|
|
|
|
| 261 |
mapping = {}
|
| 262 |
for i in range(n):
|
| 263 |
if i < len(chunk):
|
| 264 |
r = chunk[i]
|
| 265 |
for k in keys:
|
| 266 |
+
mapping[f"{k}{i+1}"] = r.get("์์ฐ์ฐ๋","") if k=="์ข
๋ฃ์ฐ๋" else r.get(k,"")
|
|
|
|
|
|
|
|
|
|
| 267 |
else:
|
| 268 |
for k in keys:
|
| 269 |
mapping[f"{k}{i+1}"] = ""
|
| 270 |
|
| 271 |
+
if mode == "token":
|
| 272 |
+
out, dbg = token_mode_apply(tpl_bytes, mapping, collect_debug=True)
|
| 273 |
+
elif mode == "field":
|
| 274 |
+
out, dbg = field_mode_apply(tpl_bytes, mapping, collect_debug=True)
|
| 275 |
+
else:
|
| 276 |
+
# ์์ ๋นต: ๋ ๋ค ์๋ (token -> field)
|
| 277 |
+
out, dbg = token_mode_apply(tpl_bytes, mapping, collect_debug=True)
|
| 278 |
+
if dbg and not dbg["files_touched"]:
|
| 279 |
+
out, dbg = field_mode_apply(tpl_bytes, mapping, collect_debug=True)
|
| 280 |
+
|
| 281 |
+
all_dbg.append({"page": p+1, "mode": dbg.get("mode") if dbg else mode, "stats": dbg})
|
| 282 |
page_boxes = [r.get("๋ฐ์ค๋ฒํธ","") for r in chunk]
|
| 283 |
+
name = "_".join(page_boxes) if page_boxes else f"empty_{p+1}"
|
| 284 |
+
zout.writestr(f"label_{name}.hwpx", out)
|
| 285 |
|
| 286 |
zout.close(); mem_zip.seek(0)
|
| 287 |
st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem_zip, file_name="labels_by_page.zip", mime="application/zip")
|
| 288 |
+
st.download_button("โฌ๏ธ ๋๋ฒ๊ทธ(JSON)", data=json.dumps(all_dbg, ensure_ascii=False, indent=2),
|
| 289 |
+
file_name="debug.json", mime="application/json")
|
|
|
|
| 290 |
|
| 291 |
+
st.caption("ํ๋ ๋ชจ๋: ๊ฐ์ ํ๋ ๋ด๋ถ + fieldEnd ๋ค์ ์ผ๋ฐ ํ
์คํธ๋ก ํ ๋ฒ ๋ ๋ฃ์ต๋๋ค(ํญ์ ๋ณด์ด๋๋ก). ํ ํฐ ๋ชจ๋: run ๋ณํฉ ํ ์นํํฉ๋๋ค.")
|