dohyune commited on
Commit
f84af8d
ยท
verified ยท
1 Parent(s): 3902f45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +289 -267
app.py CHANGED
@@ -1,27 +1,40 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import io, zipfile, re, html, json
 
 
 
4
  from typing import Dict, Tuple
5
 
6
- st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ(HWPX) โ€” ๋ฌธ๋‹จ ๋‹จ์œ„ ์™„์ „ ์น˜ํ™˜", layout="wide")
7
- st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX ํ•„๋“œยทํ† ํฐยทํ…์ŠคํŠธ ์™„์ „ ์น˜ํ™˜(๋ฌธ๋‹จ ๋‹จ์œ„)")
 
 
 
 
 
8
 
9
- # -------------------- ๋ฐ์ดํ„ฐ ์œ ํ‹ธ --------------------
 
10
  def _year_range(series: pd.Series) -> str:
11
  s = series.astype(str).fillna("")
12
  v = s[~s.isin(["", "0", "0000"])]
13
- if v.empty: return "0000-0000"
 
14
  nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
15
- if nums.empty: return "0000-0000"
 
16
  return f"{nums.min():04d}-{nums.max():04d}"
17
 
 
18
  def build_rows(df: pd.DataFrame) -> pd.DataFrame:
 
19
  df = df.copy()
20
  df["๋ฐ•์Šค๋ฒˆํ˜ธ"] = df["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4)
21
  if "์ œ๋ชฉ" in df.columns:
22
  df["์ œ๋ชฉ"] = df["์ œ๋ชฉ"].astype(str)
23
 
24
- # ์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„) = ์ข…๋ฃŒ์—ฐ๋„ ๊ทธ๋ฃน ๋ฒ”์œ„
25
  if "์ข…๋ฃŒ์—ฐ๋„" in df.columns:
26
  yr = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(_year_range).reset_index()
27
  yr.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
@@ -32,72 +45,102 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
32
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
33
  lists = []
34
  for b, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
35
- lines = [f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r.get('์ œ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ œ๋ชฉ','')}"
36
- for _, r in g.iterrows()]
 
 
37
  lists.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
38
  list_df = pd.DataFrame(lists)
39
 
40
  # ๋Œ€ํ‘œ ๋ฉ”ํƒ€
41
- meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ œ๋ชฉ"]
42
  meta_exist = [c for c in meta_cols if c in df.columns]
43
- meta = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist] if meta_exist \
44
- else pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})
 
 
45
 
46
  merged = meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(yr, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
47
  return merged
48
 
49
- # -------------------- ์น˜ํ™˜ ์œ ํ‹ธ --------------------
 
 
 
50
  FIELD_PAIR_RE_TMPL = (
51
  r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
52
  r'(.*?)'
53
  r'<(?P=fprefix):fieldEnd\b[^>]*/>'
54
  )
 
55
  TOKEN_FMT = "{{{{{key}}}}}"
56
 
57
- # ๋ฌธ๋‹จ(<*:p>) ํƒ์ƒ‰ ํŒจํ„ด
58
  PARA_RE = re.compile(
59
  r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
60
- re.DOTALL
61
  )
62
 
63
- # ์›๋ณธ run ์Šคํƒ€์ผ์„ ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜
64
- def _extract_run_style(body: str, pprefix: str) -> str:
65
- """๋ฌธ๋‹จ ๋‚ด์šฉ์—์„œ ์ฒซ ๋ฒˆ์งธ run ์š”์†Œ์˜ ์Šคํƒ€์ผ์„ ์ถ”์ถœ"""
66
- run_pattern = re.compile(
67
- rf'<{pprefix}:run[^>]*>.*?</{pprefix}:run>',
68
- re.DOTALL
69
- )
70
- match = run_pattern.search(body)
71
- if match:
72
- return match.group(0)
73
- return f'<{pprefix}:run><{pprefix}:t><//{pprefix}:t></{pprefix}:run>'
74
-
75
- # ๋ฌธ๋‹จ ํ•˜๋‚˜๋ฅผ ๊ฐ™์€ ์Šคํƒ€์ผ๋กœ ๋ณต์ œํ•ด์ฃผ๋Š” ํ—ฌํผ (์Šคํƒ€์ผ ๋ณด์กด)
76
- def _make_para_with_style(pprefix: str, pattrs: str, text: str, original_run: str) -> str:
77
- esc = html.escape("" if text is None else str(text))
78
-
79
- # ์›๋ณธ run์—์„œ ํ…์ŠคํŠธ ๋ถ€๋ถ„๋งŒ ๊ต์ฒด
80
- text_pattern = re.compile(rf'(<{pprefix}:t[^>]*>)[^<]*(</{pprefix}:t>)')
81
- new_run = text_pattern.sub(rf'\g<1>{esc}\g<2>', original_run)
82
-
83
- # ๋งŒ์•ฝ ํ…์ŠคํŠธ ๋…ธ๋“œ๊ฐ€ ์—†๋‹ค๋ฉด ๊ธฐ๋ณธ ํ˜•ํƒœ๋กœ
84
- if new_run == original_run:
85
- t_pattern = re.compile(rf'(<{pprefix}:run[^>]*>)(.*?)(</{pprefix}:run>)', re.DOTALL)
86
- new_run = t_pattern.sub(rf'\g<1><{pprefix}:t>{esc}</{pprefix}:t>\g<3>', original_run)
87
-
88
- return f'<{pprefix}:p{pattrs}>{new_run}</{pprefix}:p>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def _split_lines(val) -> list:
91
- if val is None: return [""]
92
- return str(val).replace("\r\n","\n").split("\n")
 
 
93
 
94
  def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
95
  """
96
- key๊ฐ€ ํฌํ•จ๋œ '๋ถ€๋ชจ ๋ฌธ๋‹จ ์ „์ฒด'๋ฅผ, ๊ฐ’์˜ ๊ฐ ์ค„์„ ๋‹ด์€ ์—ฌ๋Ÿฌ ๋ฌธ๋‹จ์œผ๋กœ ๊ต์ฒด.
97
- ์›๋ณธ ์Šคํƒ€์ผ์„ ์œ ์ง€ํ•˜๋ฉด์„œ ๊ต์ฒด.
98
  """
99
  pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
100
- tnode_pat = re.compile(rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=p):t>', re.DOTALL)
 
 
 
101
  token_str = TOKEN_FMT.format(key=key)
102
 
103
  def para_repl(m):
@@ -107,29 +150,28 @@ def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
107
 
108
  lines = _split_lines(value)
109
  pprefix = m.group("pprefix")
110
- pattrs = m.group("pattrs")
111
-
112
- # ์›๋ณธ run ์Šคํƒ€์ผ ์ถ”์ถœ
113
- original_run = _extract_run_style(body, pprefix)
114
-
115
- # ๊ฐ ์ค„์— ๋Œ€ํ•ด ์›๋ณธ ์Šคํƒ€์ผ์„ ์œ ์ง€ํ•˜๋ฉด์„œ ์ƒˆ ๋ฌธ๋‹จ ์ƒ์„ฑ
116
- new_paras = "".join(_make_para_with_style(pprefix, pattrs, ln, original_run) for ln in lines)
117
  dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
118
  return new_paras
119
 
120
  xml2 = PARA_RE.sub(para_repl, xml)
121
  if xml2 != xml:
122
- dbg["touched"] = True
123
  return xml2
124
 
 
125
  def _runs_plain(text: str) -> str:
126
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
127
 
 
128
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
129
  changed_any = False
130
 
131
- # 0) ๋‹ค์ค‘์ค„ ํ‚ค๋Š” ๋จผ์ € "๋ถ€๋ชจ ๋ฌธ๋‹จ ๊ต์ฒด"๋กœ ์ฒ˜๋ฆฌ (์—…๋ฌด๋ช…์€ ์ œ์™ธํ•˜์—ฌ ํฐํŠธ ๋ฌธ์ œ ํ•ด๊ฒฐ)
132
- multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ)\d+$", re.IGNORECASE)
133
  for k, v in mapping.items():
134
  if multi_key.match(k):
135
  xml_new = _replace_para_multiline(xml, k, v, dbg)
@@ -137,7 +179,7 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
137
  xml = xml_new
138
  changed_any = True
139
 
140
- # 1) ํ•„๋“œ์Œ(์ธ๋ผ์ธ) ์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
141
  for k, v in mapping.items():
142
  if multi_key.match(k):
143
  continue
@@ -149,27 +191,29 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
149
  xml = xml_new
150
  changed_any = True
151
 
152
- # 2) ์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž(<*:t>ํ‚ค</*:t>) ๋ถ€๋ถ„์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
153
  tnode_all = re.compile(
154
  r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
155
- re.DOTALL
156
  )
157
  for k, v in mapping.items():
158
  if multi_key.match(k):
159
  continue
 
160
  def repl_tnode(m):
161
  text_node = m.group(3)
162
  if k not in text_node:
163
  return m.group(0)
164
  new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
165
  return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
 
166
  xml2 = tnode_all.sub(repl_tnode, xml)
167
  if xml2 != xml:
168
  dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
169
  xml = xml2
170
  changed_any = True
171
 
172
- # 3) ํ† ํฐ ์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
173
  for k, v in mapping.items():
174
  if multi_key.match(k):
175
  continue
@@ -183,24 +227,25 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
183
  dbg["files_touched"] = True
184
  return xml
185
 
186
- def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
187
- import stat, time
188
- dbg = {"para_hits":{}, "field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
 
 
 
189
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
190
  out_buf = io.BytesIO()
191
  zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)
192
 
193
- # ํ˜„์žฌ ์‹œ๊ฐ„
194
  now = time.localtime()
195
 
196
- # mimetype ๋ฌด์••์ถ• + ๋งจ์•ž
197
  names = zin.namelist()
198
  if "mimetype" in names:
199
  zi = zipfile.ZipInfo("mimetype")
200
  zi.compress_type = zipfile.ZIP_STORED
201
- # ์™„์ „ํžˆ ์ƒˆ๋กœ์šด ZipInfo๋กœ ์ฝ๊ธฐ์ „์šฉ ๋ฐฉ์ง€
202
- zi.external_attr = 0o100666 << 16 # ์ผ๋ฐ˜ ํŒŒ์ผ + ๋ชจ๋“  ๊ถŒํ•œ
203
- zi.create_system = 0 # DOS/Windows
204
  zi.date_time = now[:6]
205
  zout.writestr(zi, zin.read("mimetype"))
206
 
@@ -212,22 +257,29 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
212
  try:
213
  s = data.decode("utf-8", errors="ignore")
214
  before = s
215
- s = _apply_to_xml(s, mapping, {"para_hits":dbg["para_hits"], "field_hits":dbg["field_hits"],
216
- "text_hits":dbg["text_hits"], "token_hits":dbg["token_hits"],
217
- "files_touched":False})
 
 
 
 
 
 
 
 
218
  if s != before:
219
  dbg["touched_files"].append(e.filename)
220
  data = s.encode("utf-8")
221
  except Exception:
222
  pass
223
-
224
- # ์™„์ „ํžˆ ์ƒˆ๋กœ์šด ZipInfo ์ƒ์„ฑ์œผ๋กœ ์ฝ๊ธฐ์ „์šฉ ๋ฐฉ์ง€
225
  zi = zipfile.ZipInfo(e.filename)
226
  zi.compress_type = zipfile.ZIP_DEFLATED
227
- zi.external_attr = 0o100666 << 16 # ์ผ๋ฐ˜ ํŒŒ์ผ + ๋ชจ๋“  ๊ถŒํ•œ
228
- zi.create_system = 0 # DOS/Windows ์‹œ์Šคํ…œ
229
- zi.date_time = now[:6] # ํ˜„์žฌ ์‹œ๊ฐ„
230
- zi.flag_bits = 0 # ํŠน๋ณ„ํ•œ ํ”Œ๋ž˜๊ทธ ์—†์Œ
231
  zout.writestr(zi, data)
232
 
233
  zout.close()
@@ -235,200 +287,155 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
235
  zin.close()
236
  return out_buf.getvalue(), dbg
237
 
 
 
 
238
  def merge_hwpx_pages(base_hwpx: bytes, additional_hwpx: bytes) -> bytes:
239
- """HWPX ํŒŒ์ผ๋“ค์„ ์„น์…˜ ๋‹จ์œ„๋กœ ๋ณ‘ํ•ฉ (COM InsertFile๊ณผ ์œ ์‚ฌํ•œ ๋ฐฉ์‹)"""
240
  import time
241
-
242
  base_zip = zipfile.ZipFile(io.BytesIO(base_hwpx), "r")
243
  add_zip = zipfile.ZipFile(io.BytesIO(additional_hwpx), "r")
244
-
245
  out_buf = io.BytesIO()
246
  out_zip = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)
247
-
248
  now = time.localtime()
249
-
250
- # mimetype ๋จผ์ € ์ฒ˜๋ฆฌ
251
  if "mimetype" in base_zip.namelist():
252
  zi = zipfile.ZipInfo("mimetype")
253
  zi.compress_type = zipfile.ZIP_STORED
254
  zi.external_attr = 0o100666 << 16
255
  zi.create_system = 0
256
  zi.date_time = now[:6]
257
- zi.flag_bits = 0
258
  out_zip.writestr(zi, base_zip.read("mimetype"))
259
-
260
- # ๋ฒ ์ด์Šค ํŒŒ์ผ๋“ค ์ฒ˜๋ฆฌ
261
- base_sections = {}
262
- base_files = {}
263
-
264
- for filename in base_zip.namelist():
265
- if filename == "mimetype":
266
  continue
267
- data = base_zip.read(filename)
268
-
269
- if filename.startswith("Contents/section") and filename.endswith(".xml"):
270
- base_sections[filename] = data.decode("utf-8", errors="ignore")
271
  else:
272
- base_files[filename] = data
273
-
274
- # ์ถ”๊ฐ€ ํŒŒ์ผ์˜ ์„น์…˜๋“ค ์ˆ˜์ง‘
275
- add_sections = {}
276
- next_section_num = len(base_sections) + 1
277
-
278
- for filename in add_zip.namelist():
279
- if filename.startswith("Contents/section") and filename.endswith(".xml"):
280
- # ์ƒˆ๋กœ์šด ์„น์…˜ ๋ฒˆํ˜ธ๋กœ ๋ณ€๊ฒฝ
281
- new_filename = f"Contents/section{next_section_num}.xml"
282
- add_sections[new_filename] = add_zip.read(filename).decode("utf-8", errors="ignore")
283
- next_section_num += 1
284
-
285
- # ๋ชจ๋“  ๋ฒ ์ด์Šค ํŒŒ์ผ๋“ค ๋ณต์‚ฌ
286
- for filename, data in base_files.items():
287
- zi = zipfile.ZipInfo(filename)
288
- zi.compress_type = zipfile.ZIP_DEFLATED
289
- zi.external_attr = 0o100666 << 16
290
- zi.create_system = 0
291
- zi.date_time = now[:6]
292
- zi.flag_bits = 0
293
- out_zip.writestr(zi, data)
294
-
295
- # ๋ฒ ์ด์Šค ์„น์…˜๋“ค ๋ณต์‚ฌ
296
- for filename, content in base_sections.items():
297
- zi = zipfile.ZipInfo(filename)
298
- zi.compress_type = zipfile.ZIP_DEFLATED
299
- zi.external_attr = 0o100666 << 16
300
- zi.create_system = 0
301
- zi.date_time = now[:6]
302
- zi.flag_bits = 0
303
- out_zip.writestr(zi, content.encode("utf-8"))
304
-
305
- # ์ƒˆ๋กœ์šด ์„น์…˜๋“ค ์ถ”๊ฐ€
306
- for filename, content in add_sections.items():
307
- zi = zipfile.ZipInfo(filename)
308
  zi.compress_type = zipfile.ZIP_DEFLATED
309
  zi.external_attr = 0o100666 << 16
310
  zi.create_system = 0
311
  zi.date_time = now[:6]
312
  zi.flag_bits = 0
313
  out_zip.writestr(zi, content.encode("utf-8"))
314
-
315
- # BodyText ์—…๋ฐ์ดํŠธ (์ƒˆ ์„น์…˜ ์ฐธ์กฐ ์ถ”๊ฐ€)
316
- if "Contents/bodytext.xml" in base_files:
317
- bodytext = base_files["Contents/bodytext.xml"].decode("utf-8", errors="ignore")
318
- updated_bodytext = add_sections_to_bodytext(bodytext, list(add_sections.keys()))
319
-
320
- zi = zipfile.ZipInfo("Contents/bodytext.xml")
321
- zi.compress_type = zipfile.ZIP_DEFLATED
322
- zi.external_attr = 0o100666 << 16
323
- zi.create_system = 0
324
- zi.date_time = now[:6]
325
- zi.flag_bits = 0
326
- out_zip.writestr(zi, updated_bodytext.encode("utf-8"))
327
-
328
  base_zip.close()
329
  add_zip.close()
330
  out_zip.close()
331
  out_buf.seek(0)
332
-
333
  return out_buf.getvalue()
334
 
335
- def add_sections_to_bodytext(bodytext: str, new_section_files: list) -> str:
336
- """BodyText์— ์ƒˆ ์„น์…˜ ์ฐธ์กฐ ์ถ”๊ฐ€"""
337
- # ๋งˆ์ง€๋ง‰ ์„น์…˜ ๋’ค์— ์ƒˆ ์„น์…˜๋“ค ์ถ”๊ฐ€
338
- # </hml:body> ํƒœ๊ทธ ์•ž์— ์ƒˆ ์„น์…˜ ์ฐธ์กฐ ์‚ฝ์ž…
339
-
340
- section_refs = []
341
- for section_file in new_section_files:
342
- # section1.xml -> 1 ์ถ”์ถœ
343
- section_num = section_file.split("section")[1].split(".xml")[0]
344
- section_ref = f'<hml:secDef><hml:secPtr hml:hRef="../Contents/section{section_num}.xml#0"/></hml:secDef>'
345
- section_refs.append(section_ref)
346
-
347
- if section_refs:
348
- # </hml:body> ์•ž์— ์‚ฝ์ž…
349
- body_close_pattern = re.compile(r'(</hml:body>)')
350
- new_sections_xml = ''.join(section_refs)
351
- bodytext = body_close_pattern.sub(new_sections_xml + r'\1', bodytext)
352
-
353
- return bodytext
354
-
355
- def update_page_id(base_xml: str, new_page: str) -> str:
356
- """ํŽ˜์ด์ง€ ID๋ฅผ ์ค‘๋ณต๋˜์ง€ ์•Š๊ฒŒ ์—…๋ฐ์ดํŠธ (๋” ์ด์ƒ ์‚ฌ์šฉํ•˜์ง€ ์•Š์Œ)"""
357
- return new_page
358
-
359
- def add_page_to_section(base_xml: str, add_xml: str) -> str:
360
- """์„น์…˜์— ์ƒˆ ํŽ˜์ด์ง€ ์ถ”๊ฐ€ (๋” ์ด์ƒ ์‚ฌ์šฉํ•˜์ง€ ์•Š์Œ)"""
361
- return base_xml
362
-
363
- def merge_section_xml_list(xml_list: list) -> str:
364
- """์—ฌ๋Ÿฌ ์„น์…˜ XML์„ ํ•˜๋‚˜๋กœ ๋ณ‘ํ•ฉ (์‚ฌ์šฉํ•˜์ง€ ์•Š์ง€๋งŒ ํ˜ธํ™˜์„ฑ ์œ ์ง€)"""
365
- if len(xml_list) <= 1:
366
- return xml_list[0] if xml_list else ""
367
-
368
- base_xml = xml_list[0]
369
- for additional_xml in xml_list[1:]:
370
- base_xml = add_page_to_section(base_xml, additional_xml)
371
-
372
- return base_xml
373
 
374
  def merge_sections(base_sections: dict, add_sections: dict) -> dict:
375
- """์„น์…˜ XML๋“ค์„ ๋ณ‘ํ•ฉ"""
376
  merged = base_sections.copy()
377
-
378
- for filename, add_content in add_sections.items():
379
- if filename in merged:
380
- # ๊ธฐ์กด ์„น์…˜์— ํŽ˜์ด์ง€ ์ถ”๊ฐ€
381
- merged[filename] = merge_section_content(merged[filename], add_content)
382
  else:
383
- # ์ƒˆ๋กœ์šด ์„น์…˜ ์ถ”๊ฐ€
384
- merged[filename] = add_content
385
-
386
  return merged
387
 
 
388
  def merge_section_content(base_xml: str, add_xml: str) -> str:
389
- """๋‹จ์ผ ์„น์…˜ XML ๋‚ด์šฉ์„ ๋ณ‘ํ•ฉ"""
390
- # ์ถ”๊ฐ€ํ•  XML์—์„œ ํŽ˜์ด์ง€๋“ค ์ถ”์ถœ - ๋” ์ •ํ™•ํ•œ ํŒจํ„ด
391
- page_pattern = re.compile(
392
- r'<(?P<prefix>[a-zA-Z0-9_]+):page\b[^>]*>.*?</(?P=prefix):page>',
393
- re.DOTALL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  )
395
-
396
- # ํŽ˜์ด์ง€ ๋งค์นญ
397
- page_matches = list(page_pattern.finditer(add_xml))
398
- if not page_matches:
399
- return base_xml
400
-
401
- # ์ถ”๊ฐ€ํ•  ํŽ˜์ด์ง€๋“ค
402
- pages_to_add = [match.group(0) for match in page_matches]
403
-
404
- # ๋ฒ ์ด์Šค XML์˜ </hp:pages> ๋˜๋Š” </hml:pages> ํƒœ๊ทธ ์•ž์— ์‚ฝ์ž…
405
- pages_end_pattern = re.compile(r'(</[a-zA-Z0-9_]+:pages>)')
406
- pages_str = ''.join(pages_to_add)
407
-
408
- merged_xml = pages_end_pattern.sub(pages_str + r'\1', base_xml)
409
-
410
- return merged_xml
411
-
412
- # -------------------- UI --------------------
 
 
 
 
 
413
  with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
414
- st.markdown("""
415
- - **ํ…œํ”Œ๋ฆฟ์€ 1ํŽ˜์ด์ง€์— N๊ฐœ ๋ผ๋ฒจ**์ด ์žˆ๋Š” ํ‘œ ํ˜•ํƒœ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
416
- - **๋ฐ•์Šค ์ˆ˜๊ฐ€ N๊ฐœ๋ฅผ ์ดˆ๊ณผํ•˜๋ฉด ์ƒˆ ํŽ˜์ด์ง€๊ฐ€ ์ž๋™ ์ถ”๊ฐ€**๋ฉ๋‹ˆ๋‹ค.
417
- - **๋‹ค์ค‘ ์ค„(๋ชฉ๋ก/์ œ๋ชฉ)์€ ๋ถ€๋ชจ ๋ฌธ๋‹จ์„ ์—ฌ๋Ÿฌ ๋ฌธ๋‹จ์œผ๋กœ ๊ต์ฒด**ํ•˜์—ฌ ๊ฒน์นจ ์—†์ด ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.
418
- - **์—…๋ฌด๋ช…์€ ๋‹จ์ผ์ค„๋กœ ์ฒ˜๋ฆฌ**ํ•˜์—ฌ ์›๋ณธ ํฐํŠธ ์Šคํƒ€์ผ์„ ์œ ์ง€ํ•ฉ๋‹ˆ๋‹ค.
419
- - **์ƒ์„ฑ๋œ HWPX ํŒŒ์ผ์˜ ์ฝ๊ธฐ์ „์šฉ ์†์„ฑ์ด ํ•ด์ œ**๋˜์–ด ํŽธ์ง‘ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค.
420
- """)
421
 
422
  tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
423
  n_per_page = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", 1, 12, 3, 1)
424
- data = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx","xls","csv"])
425
 
426
  if tpl and data:
427
  tpl_bytes = tpl.read()
428
  df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
429
 
430
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
431
- st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."); st.stop()
 
432
 
433
  st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
434
  st.dataframe(df.head(10), use_container_width=True)
@@ -446,66 +453,81 @@ if tpl and data:
446
 
447
  # 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ
448
  st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
449
- keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ œ๋ชฉ","์—…๋ฌด๋ช…"]
450
  mapping_preview = {}
451
  for i in range(int(n_per_page)):
452
  if i < len(records):
453
  r = records[i]
454
- mapping_preview.update({
455
- f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}": r.get("๋ฐ•์Šค๋ฒˆํ˜ธ",""),
456
- f"์ข…๋ฃŒ์—ฐ๋„{i+1}": r.get("์ƒ์‚ฐ์—ฐ๋„",""),
457
- f"๋ณด์กด๊ธฐ๊ฐ„{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ„",""),
458
- f"๋‹จ์œ„์—…๋ฌด{i+1}": r.get("๋‹จ์œ„์—…๋ฌด",""),
459
- f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
460
- f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
461
- f"์ œ๋ชฉ{i+1}": r.get("์ œ๋ชฉ",""),
462
- f"์—…๋ฌด๋ช…{i+1}": r.get("์ œ๋ชฉ",""), # ํ…œํ”Œ๋ฆฟ์ด '์—…๋ฌด๋ช…1'์„ ์“ฐ๋Š” ๊ฒฝ์šฐ ๋Œ€์‘
463
- })
 
 
464
  else:
465
- for k in keys: mapping_preview[f"{k}{i+1}"] = ""
466
- st.dataframe(pd.DataFrame([{"ํ‚ค":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
467
- use_container_width=True, height=320)
 
 
 
 
468
 
469
- if st.button("๐Ÿš€ ํ†ตํ•ฉ ๋ผ๋ฒจ ์ƒ์„ฑ (๋‹จ์ผ HWPX ํŒŒ์ผ)"):
470
  pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
471
  debug_all = []
472
-
473
- # ์ฒซ ํŽ˜์ด์ง€๋กœ ์‹œ์ž‘
474
- merged_hwpx = None
475
-
476
  for p in range(pages):
477
- chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
478
- mapping = {}
479
  for i in range(int(n_per_page)):
480
  if i < len(chunk):
481
  r = chunk[i]
482
- mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","")
483
- mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","")
484
- mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„","")
485
- mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด","")
486
- mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
487
- mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
488
- title_val = r.get("์ œ๋ชฉ","")
489
- mapping[f"์ œ๋ชฉ{i+1}"] = title_val
490
  mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val
491
  else:
492
- for k in keys: mapping[f"{k}{i+1}"] = ""
 
493
 
494
  if p == 0:
495
- # ์ฒซ ํŽ˜์ด์ง€: ํ…œํ”Œ๋ฆฟ ๊ธฐ๋ฐ˜์œผ๋กœ ์ƒ์„ฑ
496
  merged_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
497
  else:
498
- # ๋‘ ๋ฒˆ์งธ ํŽ˜์ด์ง€๋ถ€ํ„ฐ: ๊ธฐ์กด HWPX์— ํŽ˜์ด์ง€ ์ถ”๊ฐ€
499
  page_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
500
  merged_hwpx = merge_hwpx_pages(merged_hwpx, page_hwpx)
501
-
502
- debug_all.append({"page": p+1, "stats": dbg})
503
 
504
- # ๋ฐ•์Šค๋ฒˆํ˜ธ ๋ฒ”์œ„๋กœ ํŒŒ์ผ๋ช… ์ƒ์„ฑ
 
 
505
  first_box = records[0].get("๋ฐ•์Šค๋ฒˆํ˜ธ", "0000") if records else "0000"
506
  last_box = records[-1].get("๋ฐ•์Šค๋ฒˆํ˜ธ", "0000") if records else "0000"
507
- filename = f"labels_{first_box}to{last_box}.hwpx" if first_box != last_box else f"labels_{first_box}.hwpx"
508
-
509
- st.download_button("โฌ‡๏ธ ํ†ตํ•ฉ HWPX ๋‹ค์šด๋กœ๋“œ", data=merged_hwpx, file_name=filename, mime="application/zip")
510
- st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
511
- file_name="debug.json", mime="application/json")
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import io
3
+ import json
4
+ import html
5
+ import re
6
+ import zipfile
7
  from typing import Dict, Tuple
8
 
9
+ import pandas as pd
10
+ import streamlit as st
11
+
12
+
13
+ # ====================== Streamlit ======================
14
+ st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ(HWPX) โ€” ํ†ตํ•ฉ ํŒŒ์ผ ์ถœ๋ ฅ", layout="wide")
15
+ st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX ํ•„๋“œยทํ† ํฐยท๋ฌธ๋‹จ ์™„์ „ ์น˜ํ™˜ + ๋‹คํŽ˜์ด์ง€ ํ†ตํ•ฉ ์ถœ๋ ฅ")
16
 
17
+
18
+ # ====================== ๋ฐ์ดํ„ฐ ์œ ํ‹ธ ======================
19
  def _year_range(series: pd.Series) -> str:
20
  s = series.astype(str).fillna("")
21
  v = s[~s.isin(["", "0", "0000"])]
22
+ if v.empty:
23
+ return "0000-0000"
24
  nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
25
+ if nums.empty:
26
+ return "0000-0000"
27
  return f"{nums.min():04d}-{nums.max():04d}"
28
 
29
+
30
  def build_rows(df: pd.DataFrame) -> pd.DataFrame:
31
+ """๋ฐ•์Šค๋ฒˆํ˜ธ ๊ธฐ์ค€ ๋Œ€ํ‘œ ๋ฉ”ํƒ€ + ๋ชฉ๋ก(์—ฌ๋Ÿฌ ์ค„) + ์ƒ์‚ฐ์—ฐ๋„ ๋ฒ”์œ„ ์ƒ์„ฑ"""
32
  df = df.copy()
33
  df["๋ฐ•์Šค๋ฒˆํ˜ธ"] = df["๋ฐ•์Šค๋ฒˆํ˜ธ"].astype(str).str.zfill(4)
34
  if "์ œ๋ชฉ" in df.columns:
35
  df["์ œ๋ชฉ"] = df["์ œ๋ชฉ"].astype(str)
36
 
37
+ # ์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„)
38
  if "์ข…๋ฃŒ์—ฐ๋„" in df.columns:
39
  yr = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ")["์ข…๋ฃŒ์—ฐ๋„"].apply(_year_range).reset_index()
40
  yr.columns = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ƒ์‚ฐ์—ฐ๋„"]
 
45
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
46
  lists = []
47
  for b, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
48
+ lines = [
49
+ f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r.get('์ œ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ œ๋ชฉ','')}"
50
+ for _, r in g.iterrows()
51
+ ]
52
  lists.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
53
  list_df = pd.DataFrame(lists)
54
 
55
  # ๋Œ€ํ‘œ ๋ฉ”ํƒ€
56
+ meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "์ œ๋ชฉ"]
57
  meta_exist = [c for c in meta_cols if c in df.columns]
58
+ if meta_exist:
59
+ meta = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist]
60
+ else:
61
+ meta = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})
62
 
63
  merged = meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(yr, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
64
  return merged
65
 
66
+
67
+ # ====================== ์น˜ํ™˜ ์œ ํ‹ธ (์ธ๋ผ์ธ/๋ฌธ๋‹จ) ======================
68
+
69
+ # fieldBegin/fieldEnd ์Œ (์ ‘๋‘์–ด ์™€์ผ๋“œ์นด๋“œ)
70
  FIELD_PAIR_RE_TMPL = (
71
  r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
72
  r'(.*?)'
73
  r'<(?P=fprefix):fieldEnd\b[^>]*/>'
74
  )
75
+ # ํ† ํฐ ํฌ๋งท
76
  TOKEN_FMT = "{{{{{key}}}}}"
77
 
78
+ # ๋ฌธ๋‹จ ํƒ์ƒ‰์šฉ
79
  PARA_RE = re.compile(
80
  r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
81
+ re.DOTALL,
82
  )
83
 
84
+ # run / t ๏ฟฝ๏ฟฝ๏ฟฝ๋“œ ์ถ”์ถœ์šฉ
85
+ RUN_RE = re.compile(
86
+ r'<(?P<prefix>[a-zA-Z0-9_]+):run(?P<rattrs>[^>]*)>(?P<body>.*?)</(?P=prefix):run>',
87
+ re.DOTALL,
88
+ )
89
+ TP_RE = re.compile(
90
+ r'<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>(?P<text>.*?)</(?P=prefix):t>',
91
+ re.DOTALL,
92
+ )
93
+
94
+
95
+ def _clone_run_with_text(run_xml: str, text: str) -> str:
96
+ """๊ธฐ์กด run์˜ rPr/์†์„ฑ ๋ณด์กด, t ๋‚ด์šฉ๋งŒ ๊ต์ฒด"""
97
+ def _repl_t(m):
98
+ return f"<{m.group('prefix')}:t>{html.escape(text)}</{m.group('prefix')}:t>"
99
+
100
+ if TP_RE.search(run_xml):
101
+ return TP_RE.sub(_repl_t, run_xml, count=1)
102
+ # t ๋…ธ๋“œ ์—†์œผ๋ฉด ๊ธฐ๋ณธ ์‚ฝ์ž…
103
+ m = RUN_RE.search(run_xml)
104
+ if not m:
105
+ return f"<hp:run><hp:t>{html.escape(text)}</hp:t></hp:run>"
106
+ prefix = m.group("prefix")
107
+ return f"<{prefix}:run><{prefix}:t>{html.escape(text)}</{prefix}:t></{prefix}:run>"
108
+
109
+
110
+ def _extract_ppr_and_template_run(pbody: str):
111
+ """๋ฌธ๋‹จ pPr(์žˆ์œผ๋ฉด)๊ณผ ์ฒซ ๋ฒˆ์งธ run ์›ํ˜•์„ ์ถ”์ถœ"""
112
+ ppr_match = re.search(r'<(?P<prefix>[a-zA-Z0-9_]+):pPr\b[^>]*/>', pbody)
113
+ ppr_xml = ppr_match.group(0) if ppr_match else ""
114
+
115
+ run_match = RUN_RE.search(pbody)
116
+ if run_match:
117
+ template_run = run_match.group(0) # rPr ํฌํ•จ
118
+ else:
119
+ template_run = "<hp:run><hp:t></hp:t></hp:run>"
120
+ return ppr_xml, template_run
121
+
122
+
123
+ def _make_para_from_templates(pprefix: str, pattrs: str, ppr_xml: str, template_run: str, text: str) -> str:
124
+ cloned_run = _clone_run_with_text(template_run, text)
125
+ return f"<{pprefix}:p{pattrs}>{ppr_xml}{cloned_run}</{pprefix}:p>"
126
+
127
 
128
  def _split_lines(val) -> list:
129
+ if val is None:
130
+ return [""]
131
+ return str(val).replace("\r\n", "\n").split("\n")
132
+
133
 
134
  def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
135
  """
136
+ key๊ฐ€ ํฌํ•จ๋œ '๋ถ€๋ชจ ๋ฌธ๋‹จ ์ „์ฒด'๋ฅผ ๊ฐ’์˜ ๊ฐ ์ค„์„ ๋‹ด์€ ๋‹ค์ˆ˜ ๋ฌธ๋‹จ์œผ๋กœ ๊ต์ฒด.
137
+ ์› ๋ฌธ๋‹จ pPr/rPr ์Šคํƒ€์ผ ์œ ์ง€.
138
  """
139
  pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
140
+ tnode_pat = re.compile(
141
+ rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=p):t>',
142
+ re.DOTALL,
143
+ )
144
  token_str = TOKEN_FMT.format(key=key)
145
 
146
  def para_repl(m):
 
150
 
151
  lines = _split_lines(value)
152
  pprefix = m.group("pprefix")
153
+ pattrs = m.group("pattrs")
154
+ ppr_xml, template_run = _extract_ppr_and_template_run(body)
155
+
156
+ new_paras = "".join(_make_para_from_templates(pprefix, pattrs, ppr_xml, template_run, ln) for ln in lines)
 
 
 
157
  dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
158
  return new_paras
159
 
160
  xml2 = PARA_RE.sub(para_repl, xml)
161
  if xml2 != xml:
162
+ dbg["files_touched"] = True
163
  return xml2
164
 
165
+
166
  def _runs_plain(text: str) -> str:
167
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
168
 
169
+
170
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
171
  changed_any = False
172
 
173
+ # (A) ๋‹ค์ค‘ ์ค„ ํ‚ค๋Š” "๋ฌธ๋‹จ ๊ต์ฒด"๋กœ ๋จผ์ € ์ฒ˜๋ฆฌ (๋ชฉ๋ก/์ œ๋ชฉ/์—…๋ฌด๋ช… ๋ชจ๋‘ ์ค„๋ฐ”๊ฟˆ ๊ฐ•์ œ)
174
+ multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ|์—…๋ฌด๋ช…)\d+$", re.IGNORECASE)
175
  for k, v in mapping.items():
176
  if multi_key.match(k):
177
  xml_new = _replace_para_multiline(xml, k, v, dbg)
 
179
  xml = xml_new
180
  changed_any = True
181
 
182
+ # (B) ์ธ๋ผ์ธ ํ•„๋“œ์Œ ์น˜ํ™˜ โ€” ๋‹จ์ผ ์ค„๋งŒ
183
  for k, v in mapping.items():
184
  if multi_key.match(k):
185
  continue
 
191
  xml = xml_new
192
  changed_any = True
193
 
194
+ # (C) ์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž(<*:t>ํ‚ค</*:t>) ์น˜ํ™˜ โ€” ๋‹จ์ผ ์ค„๋งŒ
195
  tnode_all = re.compile(
196
  r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
197
+ re.DOTALL,
198
  )
199
  for k, v in mapping.items():
200
  if multi_key.match(k):
201
  continue
202
+
203
  def repl_tnode(m):
204
  text_node = m.group(3)
205
  if k not in text_node:
206
  return m.group(0)
207
  new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
208
  return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
209
+
210
  xml2 = tnode_all.sub(repl_tnode, xml)
211
  if xml2 != xml:
212
  dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
213
  xml = xml2
214
  changed_any = True
215
 
216
+ # (D) ํ† ํฐ ์น˜ํ™˜ โ€” ๋‹จ์ผ ์ค„๋งŒ
217
  for k, v in mapping.items():
218
  if multi_key.match(k):
219
  continue
 
227
  dbg["files_touched"] = True
228
  return xml
229
 
230
+
231
+ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str, str]) -> Tuple[bytes, dict]:
232
+ """HWPX(zip) ๋‚ด๋ถ€ ๋ชจ๋“  XML์— ์น˜ํ™˜ ์ ์šฉ"""
233
+ import time
234
+
235
+ dbg = {"para_hits": {}, "field_hits": {}, "text_hits": {}, "token_hits": {}, "touched_files": []}
236
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
237
  out_buf = io.BytesIO()
238
  zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)
239
 
 
240
  now = time.localtime()
241
 
242
+ # mimetype: ๋ฌด์••์ถ• + ๋งจ์•ž
243
  names = zin.namelist()
244
  if "mimetype" in names:
245
  zi = zipfile.ZipInfo("mimetype")
246
  zi.compress_type = zipfile.ZIP_STORED
247
+ zi.external_attr = 0o100666 << 16
248
+ zi.create_system = 0
 
249
  zi.date_time = now[:6]
250
  zout.writestr(zi, zin.read("mimetype"))
251
 
 
257
  try:
258
  s = data.decode("utf-8", errors="ignore")
259
  before = s
260
+ s = _apply_to_xml(
261
+ s,
262
+ mapping,
263
+ {
264
+ "para_hits": dbg["para_hits"],
265
+ "field_hits": dbg["field_hits"],
266
+ "text_hits": dbg["text_hits"],
267
+ "token_hits": dbg["token_hits"],
268
+ "files_touched": False,
269
+ },
270
+ )
271
  if s != before:
272
  dbg["touched_files"].append(e.filename)
273
  data = s.encode("utf-8")
274
  except Exception:
275
  pass
276
+
 
277
  zi = zipfile.ZipInfo(e.filename)
278
  zi.compress_type = zipfile.ZIP_DEFLATED
279
+ zi.external_attr = 0o100666 << 16
280
+ zi.create_system = 0
281
+ zi.date_time = now[:6]
282
+ zi.flag_bits = 0
283
  zout.writestr(zi, data)
284
 
285
  zout.close()
 
287
  zin.close()
288
  return out_buf.getvalue(), dbg
289
 
290
+
291
+ # ====================== ์„น์…˜/ํŽ˜์ด์ง€ ๋ณ‘ํ•ฉ (๋‹จ์ผ HWPX๋กœ ์ถœ๋ ฅ) ======================
292
+
293
  def merge_hwpx_pages(base_hwpx: bytes, additional_hwpx: bytes) -> bytes:
294
+ """๋‘ HWPX๋ฅผ 1๊ฐœ๋กœ ๋ณ‘ํ•ฉ: pages ๋ชฉ๋ก๊ณผ ๋ณธ๋ฌธ ๋ฌธ๋‹จ๊นŒ์ง€ ํ•ฉ์นจ"""
295
  import time
296
+
297
  base_zip = zipfile.ZipFile(io.BytesIO(base_hwpx), "r")
298
  add_zip = zipfile.ZipFile(io.BytesIO(additional_hwpx), "r")
299
+
300
  out_buf = io.BytesIO()
301
  out_zip = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)
 
302
  now = time.localtime()
303
+
304
+ # mimetype
305
  if "mimetype" in base_zip.namelist():
306
  zi = zipfile.ZipInfo("mimetype")
307
  zi.compress_type = zipfile.ZIP_STORED
308
  zi.external_attr = 0o100666 << 16
309
  zi.create_system = 0
310
  zi.date_time = now[:6]
 
311
  out_zip.writestr(zi, base_zip.read("mimetype"))
312
+
313
+ # ์„น์…˜ XML ์ˆ˜์ง‘
314
+ base_sections, add_sections = {}, {}
315
+ for fn in base_zip.namelist():
316
+ if fn == "mimetype":
 
 
317
  continue
318
+ if fn.startswith("Contents/section") and fn.endswith(".xml"):
319
+ base_sections[fn] = base_zip.read(fn).decode("utf-8", errors="ignore")
 
 
320
  else:
321
+ zi = zipfile.ZipInfo(fn)
322
+ zi.compress_type = zipfile.ZIP_DEFLATED
323
+ zi.external_attr = 0o100666 << 16
324
+ zi.create_system = 0
325
+ zi.date_time = now[:6]
326
+ zi.flag_bits = 0
327
+ out_zip.writestr(zi, base_zip.read(fn))
328
+
329
+ for fn in add_zip.namelist():
330
+ if fn.startswith("Contents/section") and fn.endswith(".xml"):
331
+ add_sections[fn] = add_zip.read(fn).decode("utf-8", errors="ignore")
332
+
333
+ # ์„น์…˜ ๋ณ‘ํ•ฉ
334
+ merged_sections = merge_sections(base_sections, add_sections)
335
+
336
+ # ๊ฒฐ๊ณผ ๊ธฐ๋ก
337
+ for fn, content in merged_sections.items():
338
+ zi = zipfile.ZipInfo(fn)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  zi.compress_type = zipfile.ZIP_DEFLATED
340
  zi.external_attr = 0o100666 << 16
341
  zi.create_system = 0
342
  zi.date_time = now[:6]
343
  zi.flag_bits = 0
344
  out_zip.writestr(zi, content.encode("utf-8"))
345
+
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  base_zip.close()
347
  add_zip.close()
348
  out_zip.close()
349
  out_buf.seek(0)
 
350
  return out_buf.getvalue()
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
def merge_sections(base_sections: dict, add_sections: dict) -> dict:
    """Combine two {filename: section-XML} maps into one.

    Filenames present only in one map are copied through unchanged;
    filenames present in both have their XML content merged via
    merge_section_content(). Neither input dict is mutated.
    """
    merged = dict(base_sections)
    for name, xml in add_sections.items():
        # Overlapping section files get a content-level merge; new ones
        # are simply adopted as-is.
        merged[name] = (
            merge_section_content(merged[name], xml) if name in merged else xml
        )
    return merged
361
 
362
+
363
def merge_section_content(base_xml: str, add_xml: str) -> str:
    """Merge one HWPX section XML string into another.

    Two independent merge steps:
      1) Append the <*:page> entries of add_xml's <*:pages> block to the
         matching block in base_xml (both self-closing and open/close
         entry forms are handled).
      2) Append every body paragraph (<*:p>) of add_xml just before
         base_xml's closing </*:section> tag, preceded by a paragraph
         containing only a <*:pageBreak/> so the appended content starts
         on a new page.

    Args:
        base_xml: section XML that receives the merged content.
        add_xml:  section XML whose pages/paragraphs are appended.

    Returns:
        The merged section XML string (base_xml unchanged if add_xml
        contributes nothing).
    """
    # --- 1) merge the <*:pages> lists -----------------------------------
    pages_block_re = re.compile(
        r'<(?P<pfx>[a-zA-Z0-9_]+):pages\b[^>]*>(?P<body>.*?)</(?P=pfx):pages>',
        re.DOTALL,
    )
    m_base_pages = pages_block_re.search(base_xml)
    m_add_pages = pages_block_re.search(add_xml)
    if m_base_pages and m_add_pages:
        # BUGFIX: match the page entries with the *add* document's own
        # namespace prefix — the base prefix may differ between files,
        # in which case the old code silently dropped every entry.
        pfx_add = m_add_pages.group("pfx")
        body_base = m_base_pages.group("body")
        body_add = m_add_pages.group("body")
        add_entries = re.findall(
            rf'<{pfx_add}:page\b[^>]*/>|<{pfx_add}:page\b[^>]*>.*?</{pfx_add}:page>',
            body_add,
            re.DOTALL,
        )
        if add_entries:
            new_body = body_base + "".join(add_entries)
            base_xml = (
                base_xml[: m_base_pages.start("body")]
                + new_body
                + base_xml[m_base_pages.end("body") :]
            )

    # --- 2) append body paragraphs --------------------------------------
    para_re = re.compile(
        r'<(?P<pfx>[a-zA-Z0-9_]+):p\b[^>]*>.*?</(?P=pfx):p>', re.DOTALL
    )
    # Prefix is taken from base's first paragraph; without one we cannot
    # build a structurally consistent page-break paragraph, so we skip.
    m0 = para_re.search(base_xml)
    pfx_in_base = m0.group("pfx") if m0 else None

    add_paras = [m.group(0) for m in para_re.finditer(add_xml)]
    if add_paras and pfx_in_base:
        # Standalone paragraph holding only a page break, so the appended
        # paragraphs start on a fresh page.
        pagebreak_para = (
            f'<{pfx_in_base}:p><{pfx_in_base}:run>'
            f'<{pfx_in_base}:pageBreak/>'
            f'</{pfx_in_base}:run></{pfx_in_base}:p>'
        )
        m_end = re.search(rf'</{pfx_in_base}:section>', base_xml)
        if m_end:
            insert_at = m_end.start()
            base_xml = (
                base_xml[:insert_at]
                + pagebreak_para
                + "".join(add_paras)
                + base_xml[insert_at:]
            )
    return base_xml
416
+
417
+
418
# ====================== UI ======================
# Top-level Streamlit widgets: a usage note plus the three inputs the
# generator needs (HWPX template, labels-per-page count, data table).
with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
    st.markdown(
        """
        - **๋‹ค์ค‘ ์ค„(๋ชฉ๋ก/์ œ๋ชฉ/์—…๋ฌด๋ช…)** ์€ ์› ๋ฌธ๋‹จ ์Šคํƒ€์ผ์„ ์œ ์ง€ํ•œ ์ฑ„ **๋ถ€๋ชจ ๋ฌธ๋‹จ์„ ์ค„ ์ˆ˜๋งŒํผ ๋ณต์ œ**ํ•˜์—ฌ ๊ฒน์นจ ์—†์ด ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.
        - ๋ฐ•์Šค๊ฐ€ ๋งŽ์•„๋„ **๋งˆ์ง€๋ง‰์— ํ•œ ๊ฐœ์˜ HWPX ํŒŒ์ผ**๋กœ ํ†ตํ•ฉํ•ด ๋‚ด๋ ค์ค๋‹ˆ๋‹ค.
        - ํ…œํ”Œ๋ฆฟ์€ ๋ฐ˜๋“œ์‹œ **.HWPX** ์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค. (.HWP ๋ถˆ๊ฐ€)
        """
    )

# Template must be .hwpx (binary .hwp is not supported); n_per_page is the
# number of label sets the template lays out on one page (1..12, default 3).
tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
n_per_page = st.number_input("ํ…œํ”Œ๋ฆฟ์˜ ๋ผ๋ฒจ ์„ธํŠธ ๊ฐœ์ˆ˜(ํ•œ ํŽ˜์ด์ง€ N๊ฐœ)", 1, 12, 3, 1)
data = st.file_uploader("๐Ÿ“Š ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (Excel/CSV)", type=["xlsx", "xls", "csv"])
431
 
432
  if tpl and data:
433
  tpl_bytes = tpl.read()
434
  df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
435
 
436
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
437
+ st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
438
+ st.stop()
439
 
440
  st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
441
  st.dataframe(df.head(10), use_container_width=True)
 
453
 
454
  # 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ
455
  st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
456
+ keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "๋ชฉ๋ก", "์ œ๋ชฉ", "์—…๋ฌด๋ช…"]
457
  mapping_preview = {}
458
  for i in range(int(n_per_page)):
459
  if i < len(records):
460
  r = records[i]
461
+ mapping_preview.update(
462
+ {
463
+ f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}": r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", ""),
464
+ f"์ข…๋ฃŒ์—ฐ๋„{i+1}": r.get("์ƒ์‚ฐ์—ฐ๋„", ""),
465
+ f"๋ณด์กด๊ธฐ๊ฐ„{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ„", ""),
466
+ f"๋‹จ์œ„์—…๋ฌด{i+1}": r.get("๋‹จ์œ„์—…๋ฌด", ""),
467
+ f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ", ""),
468
+ f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก", ""),
469
+ f"์ œ๋ชฉ{i+1}": r.get("์ œ๋ชฉ", ""),
470
+ f"์—…๋ฌด๋ช…{i+1}": r.get("์ œ๋ชฉ", ""), # ํ…œํ”Œ๋ฆฟ์ด '์—…๋ฌด๋ช…X'์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์–ด ๋™์‹œ ๋งคํ•‘
471
+ }
472
+ )
473
  else:
474
+ for k in keys:
475
+ mapping_preview[f"{k}{i+1}"] = ""
476
+ st.dataframe(
477
+ pd.DataFrame([{"ํ‚ค": k, "๊ฐ’ ์•ž๋ถ€๋ถ„": str(v)[:120]} for k, v in sorted(mapping_preview.items())]),
478
+ use_container_width=True,
479
+ height=320,
480
+ )
481
 
482
+ if st.button("๐Ÿš€ ํ†ตํ•ฉ HWPX ์ƒ์„ฑ (ํ•œ ํŒŒ์ผ๋กœ ๋‹ค์šด๋กœ๋“œ)"):
483
  pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
484
  debug_all = []
485
+
486
+ merged_hwpx: bytes | None = None
487
+
 
488
  for p in range(pages):
489
+ chunk = records[p * int(n_per_page) : (p + 1) * int(n_per_page)]
490
+ mapping: Dict[str, str] = {}
491
  for i in range(int(n_per_page)):
492
  if i < len(chunk):
493
  r = chunk[i]
494
+ mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", "")
495
+ mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„", "")
496
+ mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„", "")
497
+ mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด", "")
498
+ mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ", "")
499
+ mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก", "")
500
+ title_val = r.get("์ œ๋ชฉ", "")
501
+ mapping[f"์ œ๋ชฉ{i+1}"] = title_val
502
  mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val
503
  else:
504
+ for k in keys:
505
+ mapping[f"{k}{i+1}"] = ""
506
 
507
  if p == 0:
 
508
  merged_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
509
  else:
 
510
  page_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
511
  merged_hwpx = merge_hwpx_pages(merged_hwpx, page_hwpx)
 
 
512
 
513
+ debug_all.append({"page": p + 1, "stats": dbg})
514
+
515
+ # ํŒŒ์ผ๋ช…
516
  first_box = records[0].get("๋ฐ•์Šค๋ฒˆํ˜ธ", "0000") if records else "0000"
517
  last_box = records[-1].get("๋ฐ•์Šค๋ฒˆํ˜ธ", "0000") if records else "0000"
518
+ filename = (
519
+ f"labels_{first_box}to{last_box}.hwpx" if first_box != last_box else f"labels_{first_box}.hwpx"
520
+ )
521
+
522
+ st.download_button(
523
+ "โฌ‡๏ธ ํ†ตํ•ฉ HWPX ๋‹ค์šด๋กœ๋“œ",
524
+ data=merged_hwpx,
525
+ file_name=filename,
526
+ mime="application/vnd.hancom.hwpx",
527
+ )
528
+ st.download_button(
529
+ "โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)",
530
+ data=json.dumps(debug_all, ensure_ascii=False, indent=2),
531
+ file_name="debug.json",
532
+ mime="application/json",
533
+ )