dohyune committed on
Commit
45aa168
ยท
verified ยท
1 Parent(s): 105b0bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -130
app.py CHANGED
@@ -2,28 +2,10 @@ import streamlit as st
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
 
5
- st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (ํ† ํฐยท๋ฐฐ์น˜ยท๋Ÿฐ๋ณ‘ํ•ฉ)", layout="wide")
6
- st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (.HWPX โ€” ํ† ํฐยท๋ฐฐ์น˜ยท๋Ÿฐ ๋ณ‘ํ•ฉ)")
7
 
8
- with st.expander("์‚ฌ์šฉ ๋ฐฉ๋ฒ•", expanded=True):
9
- st.markdown("""
10
- **ํ…œํ”Œ๋ฆฟ ์ค€๋น„**
11
- - ๋ผ๋ฒจ ํ•œ ํŽ˜์ด์ง€์— `{{๋ฐ•์Šค๋ฒˆํ˜ธ1}} ... {{๋ฐ•์Šค๋ฒˆํ˜ธN}}`, `{{์ข…๋ฃŒ์—ฐ๋„1}} ...`, `{{๋ณด์กด๊ธฐ๊ฐ„1}} ...`, `{{๋‹จ์œ„์—…๋ฌด1}} ...`, `{{๊ธฐ๋ก๋ฌผ์ฒ 1}} ...`, `{{๋ชฉ๋ก1}} ...` ์ฒ˜๋Ÿผ **๋ฒˆํ˜ธ๊ฐ€ ๋ถ™์€ ํ† ํฐ**์„ ๋„ฃ์–ด ์ฃผ์„ธ์š”.
12
- - ํ† ํฐ์€ ๊ฐ€๋Šฅํ•˜๋ฉด ํ•œ ๋ฉ์–ด๋ฆฌ ํ…์ŠคํŠธ๋กœ ์ž…๋ ฅํ•˜์„ธ์š”. (ํ•˜์ง€๋งŒ ์ด ์•ฑ์€ ํ† ํฐ์ด ์—ฌ๋Ÿฌ run์œผ๋กœ ์ชผ๊ฐœ์ ธ ์žˆ์–ด๋„ **์ž๋™ ๋ณ‘ํ•ฉ**ํ•ด์„œ ์น˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.)
13
-
14
- **๋ฐ์ดํ„ฐ**
15
- - ํ•„์ˆ˜: `๋ฐ•์Šค๋ฒˆํ˜ธ`
16
- - ๊ถŒ์žฅ: `์ข…๋ฃŒ์—ฐ๋„`, `๋ณด์กด๊ธฐ๊ฐ„`, `๋‹จ์œ„์—…๋ฌด`, `๊ธฐ๋ก๋ฌผ์ฒ `, `์ œ๋ชฉ`
17
- - ๋ชฉ๋ก์€ (๊ด€๋ฆฌ๋ฒˆํ˜ธ + ์ œ๋ชฉ) ์กฐํ•ฉ์œผ๋กœ ์ž๋™ ์ƒ์„ฑ. `์ข…๋ฃŒ์—ฐ๋„`๋Š” ๋ฐ•์Šค๋ณ„ ์ตœ์†Œ~์ตœ๋Œ€๋กœ ๋ฌถ์–ด **์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„)** ๋กœ ๋“ค์–ด๊ฐ‘๋‹ˆ๋‹ค.
18
-
19
- **์ถœ๋ ฅ**
20
- - ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(N)๋ฅผ ์ง€์ •ํ•˜๋ฉด N๊ฐœ์”ฉ ๋ฌถ์–ด **ํŽ˜์ด์ง€ ๋‹จ์œ„ HWPX**๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค.
21
- - ZIP ์•ˆ ํŒŒ์ผ๋ช… ์˜ˆ: `label_0001_0003.hwpx` (ํ•ด๋‹น ํŽ˜์ด์ง€์— ๋“ค์–ด๊ฐ„ ๋ฐ•์Šค๋ฒˆํ˜ธ)
22
- """)
23
-
24
- # ---------------------------
25
- # ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ
26
- # ---------------------------
27
  def compute_year_range(series: pd.Series) -> str:
28
  s = series.astype(str).fillna("")
29
  valid = s[~s.isin(["", "0", "0000"])]
@@ -40,14 +22,14 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
40
  if "์ œ๋ชฉ" in df.columns:
41
  df["์ œ๋ชฉ"] = df["์ œ๋ชฉ"].astype(str)
42
 
43
- # ์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„)
44
  if "์ข…๋ฃŒ์—ฐ๋„" in df.columns:
45
  prod_df = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(compute_year_range).reset_index()
46
  prod_df.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
47
  else:
48
  prod_df = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})
49
 
50
- # ๋ชฉ๋ก
51
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
52
  list_rows = []
53
  for box, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
@@ -58,7 +40,6 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
58
  list_rows.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
59
  list_df = pd.DataFrame(list_rows)
60
 
61
- # ๋Œ€ํ‘œ ๋ฉ”ํƒ€
62
  meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ œ๋ชฉ"]
63
  meta_exist = [c for c in meta_cols if c in df.columns]
64
  meta_df = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist] if meta_exist \
@@ -66,72 +47,32 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
66
 
67
  return meta_df.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(prod_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
68
 
69
- # ---------------------------
70
- # HWPX ํ† ํฐ ์น˜ํ™˜ (๋Ÿฐ ๋ณ‘ํ•ฉ ํฌํ•จ)
71
- # ---------------------------
72
- # run ๊ฒฝ๊ณ„ ๋ณ‘ํ•ฉ: </hp:t></hp:run><hp:run...><hp:t> ์‚ฌ์ด ํƒœ๊ทธ๋“ค์„ ์ง€์›Œ ํ…์ŠคํŠธ๋ฅผ ์ด์–ด ๋ถ™์ž„
73
- RUN_JOIN_RE = re.compile(
74
- r'</hp:t>\s*</hp:run>\s*<hp:run[^>]*>\s*<hp:t>',
75
- flags=re.DOTALL
76
- )
77
-
78
- def _build_list_text(text: str) -> str:
79
  if text is None: return ""
80
- text = str(text)
81
- lines = text.replace("\r\n", "\n").split("\n")
82
  parts = []
83
  for i, ln in enumerate(lines):
84
  if i > 0:
85
  parts.append("<hp:lineBreak/>")
86
- parts.append(html.escape(ln))
87
  return "".join(parts)
88
 
89
- def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug: bool=False):
90
- """
91
- mapping: {'๋ฐ•์Šค๋ฒˆํ˜ธ1': '0001', '์ข…๋ฃŒ์—ฐ๋„1': '1999-2002', '๋ชฉ๋ก1': '- a\\n- b', ...}
92
- ์ ˆ์ฐจ:
93
- 1) XML ๋กœ๋“œ
94
- 2) ์ธ์ ‘ run ๋ณ‘ํ•ฉ (RUN_JOIN_RE)
95
- 3) {{ํ† ํฐ}} -> ๊ฐ’ (๋ชฉ๋ก์€ <hp:lineBreak/>)
96
- 4) mimetype: ๋ฌด์••์ถ•(STORED) + ์ฒซ ์—”ํŠธ๋ฆฌ
97
- """
98
- dbg = {"token_hits": {}, "files_touched": []} if collect_debug else None
99
-
100
- zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
101
- out_buf = io.BytesIO()
102
- zout = zipfile.ZipFile(out_buf, "w")
103
 
 
 
 
 
104
  names = zin.namelist()
 
 
105
  if "mimetype" in names:
106
- data = zin.read("mimetype")
107
  zi = zipfile.ZipInfo("mimetype")
108
  zi.compress_type = zipfile.ZIP_STORED
109
- zout.writestr(zi, data)
110
-
111
- token_keys = list(mapping.keys())
112
-
113
- def do_replace(s: str) -> (str, bool):
114
- changed_any = False
115
- # 1) run ๋ณ‘ํ•ฉ
116
- s2 = RUN_JOIN_RE.sub('', s)
117
- if s2 != s:
118
- changed_any = True
119
- s = s2
120
- # 2) ํ† ํฐ ์น˜ํ™˜
121
- for k in token_keys:
122
- tok = f"{{{{{k}}}}}"
123
- if tok in s:
124
- val = mapping.get(k, "")
125
- if re.match(r"^(๋ชฉ๋ก|list)\d+$", k):
126
- val = _build_list_text(val)
127
- else:
128
- val = html.escape("" if val is None else str(val))
129
- s = s.replace(tok, val)
130
- changed_any = True
131
- if collect_debug:
132
- dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
133
- return s, changed_any
134
 
 
135
  for e in zin.infolist():
136
  if e.filename == "mimetype":
137
  continue
@@ -139,9 +80,7 @@ def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug
139
  if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
140
  try:
141
  s = data.decode("utf-8", errors="ignore")
142
- s2, changed = do_replace(s)
143
- if collect_debug and changed:
144
- dbg["files_touched"].append(e.filename)
145
  data = s2.encode("utf-8")
146
  except Exception:
147
  pass
@@ -149,20 +88,132 @@ def replace_tokens_in_hwpx_batch(hwpx_bytes: bytes, mapping: dict, collect_debug
149
  zi.compress_type = zipfile.ZIP_DEFLATED
150
  zout.writestr(zi, data)
151
 
152
- zin.close(); zout.close(); out_buf.seek(0)
153
- return (out_buf.getvalue(), dbg) if collect_debug else (out_buf.getvalue(), None)
154
 
155
- # ---------------------------
156
- # UI
157
- # ---------------------------
158
- tpl_file = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  batch_size = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜ (ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", min_value=1, max_value=12, value=3, step=1)
160
- data_file = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx","xls","csv"])
161
 
162
- if tpl_file and data_file:
163
- tpl_bytes = tpl_file.read()
164
- df = pd.read_csv(data_file) if data_file.name.lower().endswith(".csv") else pd.read_excel(data_file)
 
165
 
 
166
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
167
  st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
168
  st.stop()
@@ -178,71 +229,63 @@ if tpl_file and data_file:
178
  st.dataframe(pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": box_list}), use_container_width=True, height=240)
179
 
180
  selected = st.multiselect("์ƒ์„ฑํ•  ๋ฐ•์Šค๋ฒˆํ˜ธ ์„ ํƒ (๋น„์šฐ๋ฉด ์ „์ฒด ์ƒ์„ฑ)", options=box_list)
181
- work_df = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(selected)] if selected else merged
182
-
183
- rows = work_df.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")
184
 
185
- # 1ํŽ˜์ด์ง€ ๋ฏธ๋ฆฌ๋ณด๊ธฐ
186
- st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ํ† ํฐ ๋งคํ•‘ ๋ฏธ๋ฆฌ๋ณด๊ธฐ")
187
- first_page = rows[:int(batch_size)]
188
  keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
189
- mapping_preview = {}
190
- for i in range(int(batch_size)):
191
- if i < len(first_page):
192
- r = first_page[i]
 
193
  for k in keys:
194
- if k == "์ข…๋ฃŒ์—ฐ๋„":
195
- mapping_preview[f"{k}{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","")
196
- else:
197
- mapping_preview[f"{k}{i+1}"] = r.get(k,"")
198
  else:
199
  for k in keys:
200
- mapping_preview[f"{k}{i+1}"] = ""
201
-
202
  st.dataframe(
203
- pd.DataFrame(
204
- [{"ํ† ํฐ": k, "๊ฐ’(์•ž๋ถ€๋ถ„)": (str(v)[:120] if v is not None else ""), "๊ธธ์ด": (len(str(v)) if v is not None else 0)}
205
- for k, v in sorted(mapping_preview.items())]
206
- ),
207
  use_container_width=True, height=320
208
  )
209
 
210
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
211
  mem_zip = io.BytesIO()
212
  zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
 
 
213
 
214
- n = int(batch_size)
215
- total = len(rows)
216
- pages = (total + n - 1) // n
217
-
218
- all_debug = []
219
  for p in range(pages):
220
- start = p * n
221
- chunk = rows[start:start+n]
222
  mapping = {}
223
  for i in range(n):
224
  if i < len(chunk):
225
  r = chunk[i]
226
  for k in keys:
227
- if k == "์ข…๋ฃŒ์—ฐ๋„":
228
- mapping[f"{k}{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","")
229
- else:
230
- mapping[f"{k}{i+1}"] = r.get(k,"")
231
  else:
232
  for k in keys:
233
  mapping[f"{k}{i+1}"] = ""
234
 
235
- out_hwpx, dbg = replace_tokens_in_hwpx_batch(tpl_bytes, mapping, collect_debug=True)
236
- all_debug.append({"page": p+1, "mapping_keys": sorted(list(mapping.keys())), "stats": dbg})
237
-
 
 
 
 
 
 
 
 
238
  page_boxes = [r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]
239
- safe = "_".join(page_boxes) if page_boxes else f"empty_{p+1}"
240
- zout.writestr(f"label_{safe}.hwpx", out_hwpx)
241
 
242
  zout.close(); mem_zip.seek(0)
243
  st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem_zip, file_name="labels_by_page.zip", mime="application/zip")
244
- st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ ๋ฆฌํฌํŠธ(JSON)",
245
- data=json.dumps(all_debug, ensure_ascii=False, indent=2),
246
- file_name="debug_by_page.json", mime="application/json")
247
 
248
- st.caption("โ€ป ํ† ํฐ์ด run์œผ๋กœ ์ชผ๊ฐœ์ ธ ์žˆ์–ด๋„ ์ž๋™ ๋ณ‘ํ•ฉ ํ›„ ์น˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค. ๊ทธ๋ž˜๋„ ์•ˆ ๋ฐ”๋€Œ๋Š” ํ† ํฐ์ด ์žˆ์œผ๋ฉด ๊ทธ ํ† ํฐ ๋ฌธ์ž์—ด์„ ์•Œ๋ ค์ฃผ์„ธ์š”.")
 
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
 
5
+ st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (ํ•„๋“œ/ํ† ํฐ ์ž๋™๊ฐ์ง€)", layout="wide")
6
+ st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (.HWPX โ€” ํ•„๋“œ/ํ† ํฐ ์ž๋™๊ฐ์ง€)")
7
 
8
+ # =============== ๊ณตํ†ต ์œ ํ‹ธ ===============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def compute_year_range(series: pd.Series) -> str:
10
  s = series.astype(str).fillna("")
11
  valid = s[~s.isin(["", "0", "0000"])]
 
22
  if "์ œ๋ชฉ" in df.columns:
23
  df["์ œ๋ชฉ"] = df["์ œ๋ชฉ"].astype(str)
24
 
25
+ # ์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„) = ์ข…๋ฃŒ์—ฐ๋„ ๊ทธ๋ฃน ๋ฒ”์œ„
26
  if "์ข…๋ฃŒ์—ฐ๋„" in df.columns:
27
  prod_df = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(compute_year_range).reset_index()
28
  prod_df.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
29
  else:
30
  prod_df = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})
31
 
32
+ # ๋ชฉ๋ก(๊ด€๋ฆฌ๋ฒˆํ˜ธ + ์ œ๋ชฉ)
33
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
34
  list_rows = []
35
  for box, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
 
40
  list_rows.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
41
  list_df = pd.DataFrame(list_rows)
42
 
 
43
  meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ œ๋ชฉ"]
44
  meta_exist = [c for c in meta_cols if c in df.columns]
45
  meta_df = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist] if meta_exist \
 
47
 
48
  return meta_df.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(prod_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
49
 
50
+ def _build_list_runs(text: str) -> str:
 
 
 
 
 
 
 
 
 
51
  if text is None: return ""
52
+ lines = str(text).replace("\r\n", "\n").split("\n")
 
53
  parts = []
54
  for i, ln in enumerate(lines):
55
  if i > 0:
56
  parts.append("<hp:lineBreak/>")
57
+ parts.append(f"<hp:run><hp:t>{html.escape(ln)}</hp:t></hp:run>")
58
  return "".join(parts)
59
 
60
+ def _build_plain_runs(text: str) -> str:
61
+ return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ # =============== HWPX ์“ฐ๊ธฐ ๊ณตํ†ต (mimetype ๋ฌด์••์ถ•/๋งจ์•ž) ===============
64
+ def write_hwpx_like_src(zin: zipfile.ZipFile, writer_fn) -> bytes:
65
+ out = io.BytesIO()
66
+ zout = zipfile.ZipFile(out, "w")
67
  names = zin.namelist()
68
+
69
+ # 1) mimetype ๋จผ์ € ๋ฌด์••์ถ•
70
  if "mimetype" in names:
 
71
  zi = zipfile.ZipInfo("mimetype")
72
  zi.compress_type = zipfile.ZIP_STORED
73
+ zout.writestr(zi, zin.read("mimetype"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ # 2) ๋‚˜๋จธ์ง€ ํŒŒ์ผ
76
  for e in zin.infolist():
77
  if e.filename == "mimetype":
78
  continue
 
80
  if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
81
  try:
82
  s = data.decode("utf-8", errors="ignore")
83
+ s2 = writer_fn(e.filename, s)
 
 
84
  data = s2.encode("utf-8")
85
  except Exception:
86
  pass
 
88
  zi.compress_type = zipfile.ZIP_DEFLATED
89
  zout.writestr(zi, data)
90
 
91
+ zout.close(); out.seek(0)
92
+ return out.getvalue()
93
 
94
+ # =============== ๋ชจ๋“œ1: ํ† ํฐ ์น˜ํ™˜ ({{ํ‚ค}}) ===============
95
+ RUN_JOIN_RE = re.compile(r'</hp:t>\s*</hp:run>\s*<hp:run[^>]*>\s*<hp:t>', re.DOTALL)
96
+
97
def token_mode_apply(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
    """Token mode: replace every {{key}} placeholder in the HWPX Contents/*.xml.

    Adjacent runs are merged first (RUN_JOIN_RE) so tokens split across run
    boundaries are stitched back together before substitution. List-style keys
    (`๋ชฉ๋กN` / `listN`) expand into full run markup via _build_list_runs; all
    other values are inserted as escaped plain text.

    Returns (hwpx_bytes, debug_dict_or_None).
    """
    dbg = {"mode": "token", "files_touched": [], "token_hits": {}} if collect_debug else None
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
    token_keys = list(mapping.keys())

    def writer_fn(fname: str, xml: str) -> str:
        changed = False
        # Stitch run boundaries so split tokens become contiguous text.
        merged = RUN_JOIN_RE.sub('', xml)
        if merged != xml:
            changed = True
            xml = merged
        # Substitute each {{key}} occurrence.
        for key in token_keys:
            token = f"{{{{{key}}}}}"
            if token not in xml:
                continue
            value = mapping[key]
            if re.match(r"^(๋ชฉ๋ก|list)\d+$", key, re.IGNORECASE):
                # Token sits inside a run, so emit whole run structures.
                xml = xml.replace(token, _build_list_runs(value))
            else:
                xml = xml.replace(token, html.escape("" if value is None else str(value)))
            changed = True
            if dbg:
                dbg["token_hits"][key] = dbg["token_hits"].get(key, 0) + 1
        if changed and dbg and fname not in dbg["files_touched"]:
            dbg["files_touched"].append(fname)
        return xml

    result = write_hwpx_like_src(zin, writer_fn)
    zin.close()
    return (result, dbg) if collect_debug else (result, None)
129
+
130
+ # =============== ๋ชจ๋“œ2: ํ•„๋“œ์ปจํŠธ๋กค ์น˜ํ™˜ (๊ฐ€์‹œ ํ…์ŠคํŠธ ์ค‘๋ณต ์‚ฝ์ž…) ===============
131
+ # <hp:fieldBegin ... name="ํ‚ค"> ... </hp:fieldBegin> [๋ณธ๋ฌธ] <hp:fieldEnd ... />
132
+ FIELD_BLOCK_RE_TMPL = r'(<hp:fieldBegin[^>]*name="{name}"[^>]*>.*?</hp:fieldBegin>)(.*?)(<hp:fieldEnd[^>]*/>)'
133
+
134
def field_mode_apply(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
    """Field mode: rewrite Hangul field controls (<hp:fieldBegin name="key">).

    For every mapping key that actually appears as a field name in the
    template, the content between fieldBegin and fieldEnd is replaced with
    freshly built run markup, and the same markup is duplicated AFTER the
    fieldEnd so the value stays visible even if the viewer hides field bodies.
    List-style keys (`๋ชฉ๋กN` / `listN`) use _build_list_runs, others
    _build_plain_runs.

    Returns (hwpx_bytes, debug_dict_or_None).
    """
    dbg = {"mode": "field", "files_touched": [], "field_hits": {}} if collect_debug else None
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")

    # Pre-scan which keys exist at all (accuracy up, speed up: skips regex
    # passes for keys that never occur in any content part).
    contents = [e.filename for e in zin.infolist()
                if e.filename.startswith("Contents/") and e.filename.endswith(".xml")]
    present_keys = set()
    for fn in contents:
        try:
            s = zin.read(fn).decode("utf-8", errors="ignore")
            for k in mapping.keys():
                if f'name="{k}"' in s:
                    present_keys.add(k)
        except Exception:
            # Best-effort scan: an unreadable part simply contributes no keys.
            pass

    def writer_fn(fname: str, xml: str) -> str:
        any_change = False
        # sorted() makes the replacement order deterministic across runs
        # (set iteration order varies with hash randomization).
        for k in sorted(present_keys):
            val = mapping.get(k, "")
            is_list = bool(re.match(r"^(๋ชฉ๋ก|list)\d+$", k, re.IGNORECASE))
            pattern = re.compile(FIELD_BLOCK_RE_TMPL.format(name=re.escape(k)), re.DOTALL)

            def _repl(m):
                # Replace the field body with generated runs...
                inner = _build_list_runs(val) if is_list else _build_plain_runs(val)
                # ...and duplicate them after fieldEnd so the text is always visible.
                visible_dup = inner
                if dbg: dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + 1
                return f'{m.group(1)}{inner}{m.group(3)}{visible_dup}'

            xml_new, n_subs = pattern.subn(_repl, xml)
            if n_subs:
                any_change = True
                xml = xml_new

        if any_change and dbg and fname not in dbg["files_touched"]:
            dbg["files_touched"].append(fname)
        return xml

    out = write_hwpx_like_src(zin, writer_fn)
    zin.close()
    return (out, dbg) if collect_debug else (out, None)
177
+
178
+ # =============== ๋ชจ๋“œ ์ž๋™๊ฐ์ง€ ===============
179
def detect_template_mode(hwpx_bytes: bytes) -> str:
    """Classify an HWPX template by scanning its Contents/*.xml parts.

    Returns:
        "token"   if any part contains both "{{" and "}}" (token mode wins
                  when both styles coexist),
        "field"   if any part contains an <hp:fieldBegin ... name="..."> control,
        "unknown" otherwise.
    """
    has_token = False
    has_field = False
    # `with` guarantees the archive handle is closed even if a read raises
    # (the original leaked it on error and used a bare `except:`).
    with zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r") as zin:
        for e in zin.infolist():
            if not (e.filename.startswith("Contents/") and e.filename.endswith(".xml")):
                continue
            try:
                s = zin.read(e.filename).decode("utf-8", errors="ignore")
            except Exception:
                # Best-effort detection: skip unreadable parts, keep scanning.
                continue
            if "{{" in s and "}}" in s:
                has_token = True
            if "<hp:fieldBegin" in s and 'name="' in s:
                has_field = True
    if has_token: return "token"
    if has_field: return "field"
    return "unknown"
198
+
199
+ # =============== Streamlit UI ===============
200
+ with st.expander("์‚ฌ์šฉ ๋ฐฉ๋ฒ• ์š”์•ฝ", expanded=True):
201
+ st.markdown("""
202
+ - ํ…œํ”Œ๋ฆฟ์ด **ํ† ํฐ(`{{๋ฐ•์Šค๋ฒˆํ˜ธ1}}` ๋“ฑ)** ์ด๋ฉด ์ž๋™์œผ๋กœ ํ† ํฐ ๋ชจ๋“œ,
203
+ **ํ•œ๊ธ€ ํ•„๋“œ์ปจํŠธ๋กค(`name="๋ฐ•์Šค๋ฒˆํ˜ธ1"`)** ์ด๋ฉด ํ•„๋“œ ๋ชจ๋“œ๋กœ ์ž๋™ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค.
204
+ - ํ•„๋“œ ๋ชจ๋“œ์—์„œ๋Š” ๊ฐ’์ด ์•ˆ ๋ณด์ด๋Š” ๋ฌธ์ œ๋ฅผ ๋ง‰๊ธฐ ์œ„ํ•ด **fieldEnd ๋’ค์— ๊ฐ€์‹œ ํ…์ŠคํŠธ๋ฅผ ํ•œ ๋ฒˆ ๋” ๋„ฃ์Šต๋‹ˆ๋‹ค.**
205
+ """)
206
+
207
+ tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
208
  batch_size = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜ (ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", min_value=1, max_value=12, value=3, step=1)
209
+ data = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx","xls","csv"])
210
 
211
+ if tpl and data:
212
+ tpl_bytes = tpl.read()
213
+ mode = detect_template_mode(tpl_bytes)
214
+ st.info(f"ํƒ์ง€๋œ ํ…œํ”Œ๋ฆฟ ๋ชจ๋“œ: **{mode}**")
215
 
216
+ df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
217
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
218
  st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
219
  st.stop()
 
229
  st.dataframe(pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": box_list}), use_container_width=True, height=240)
230
 
231
  selected = st.multiselect("์ƒ์„ฑํ•  ๋ฐ•์Šค๋ฒˆํ˜ธ ์„ ํƒ (๋น„์šฐ๋ฉด ์ „์ฒด ์ƒ์„ฑ)", options=box_list)
232
+ work = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(selected)] if selected else merged
233
+ rows = work.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")
 
234
 
235
+ # 1ํŽ˜์ด์ง€ ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋งคํ•‘
236
+ st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
 
237
  keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
238
+ preview = {}
239
+ n = int(batch_size)
240
+ for i in range(n):
241
+ if i < len(rows):
242
+ r = rows[i]
243
  for k in keys:
244
+ preview[f"{k}{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","") if k=="์ข…๋ฃŒ์—ฐ๋„" else r.get(k,"")
 
 
 
245
  else:
246
  for k in keys:
247
+ preview[f"{k}{i+1}"] = ""
 
248
  st.dataframe(
249
+ pd.DataFrame([{"ํ† ํฐ/ํ•„๋“œ":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(preview.items())]),
 
 
 
250
  use_container_width=True, height=320
251
  )
252
 
253
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
254
  mem_zip = io.BytesIO()
255
  zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
256
+ pages = (len(rows) + n - 1) // n
257
+ all_dbg = []
258
 
 
 
 
 
 
259
  for p in range(pages):
260
+ chunk = rows[p*n:(p+1)*n]
 
261
  mapping = {}
262
  for i in range(n):
263
  if i < len(chunk):
264
  r = chunk[i]
265
  for k in keys:
266
+ mapping[f"{k}{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","") if k=="์ข…๋ฃŒ์—ฐ๋„" else r.get(k,"")
 
 
 
267
  else:
268
  for k in keys:
269
  mapping[f"{k}{i+1}"] = ""
270
 
271
+ if mode == "token":
272
+ out, dbg = token_mode_apply(tpl_bytes, mapping, collect_debug=True)
273
+ elif mode == "field":
274
+ out, dbg = field_mode_apply(tpl_bytes, mapping, collect_debug=True)
275
+ else:
276
+ # ์•ˆ์ „๋นต: ๋‘˜ ๋‹ค ์‹œ๋„ (token -> field)
277
+ out, dbg = token_mode_apply(tpl_bytes, mapping, collect_debug=True)
278
+ if dbg and not dbg["files_touched"]:
279
+ out, dbg = field_mode_apply(tpl_bytes, mapping, collect_debug=True)
280
+
281
+ all_dbg.append({"page": p+1, "mode": dbg.get("mode") if dbg else mode, "stats": dbg})
282
  page_boxes = [r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]
283
+ name = "_".join(page_boxes) if page_boxes else f"empty_{p+1}"
284
+ zout.writestr(f"label_{name}.hwpx", out)
285
 
286
  zout.close(); mem_zip.seek(0)
287
  st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem_zip, file_name="labels_by_page.zip", mime="application/zip")
288
+ st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(all_dbg, ensure_ascii=False, indent=2),
289
+ file_name="debug.json", mime="application/json")
 
290
 
291
+ st.caption("ํ•„๋“œ ๋ชจ๋“œ: ๊ฐ’์€ ํ•„๋“œ ๋‚ด๋ถ€ + fieldEnd ๋’ค์— ์ผ๋ฐ˜ ํ…์ŠคํŠธ๋กœ ํ•œ ๋ฒˆ ๋” ๋„ฃ์Šต๋‹ˆ๋‹ค(ํ•ญ์ƒ ๋ณด์ด๋„๋ก). ํ† ํฐ ๋ชจ๋“œ: run ๋ณ‘ํ•ฉ ํ›„ ์น˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.")