dohyune committed on
Commit
2a35ebc
·
verified ·
1 Parent(s): 0fbe1ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -134
app.py CHANGED
@@ -3,18 +3,16 @@ import pandas as pd
3
  import io, zipfile, re, html, json
4
  from typing import Dict, Tuple
5
 
6
- st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ(HWPX) โ€” ์™„์ „ ์น˜ํ™˜", layout="wide")
7
- st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX ํ•„๋“œยทํ† ํฐยทํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž ์™„์ „ ์น˜ํ™˜")
8
 
9
- # ================== ๋ฐ์ดํ„ฐ ์œ ํ‹ธ ==================
10
  def _year_range(series: pd.Series) -> str:
11
  s = series.astype(str).fillna("")
12
  v = s[~s.isin(["", "0", "0000"])]
13
- if v.empty:
14
- return "0000-0000"
15
  nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
16
- if nums.empty:
17
- return "0000-0000"
18
  return f"{nums.min():04d}-{nums.max():04d}"
19
 
20
  def build_rows(df: pd.DataFrame) -> pd.DataFrame:
@@ -34,68 +32,89 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
34
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
35
  lists = []
36
  for b, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
37
- lines = [
38
- f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r.get('์ œ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ œ๋ชฉ','')}"
39
- for _, r in g.iterrows()
40
- ]
41
  lists.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
42
  list_df = pd.DataFrame(lists)
43
 
44
  # ๋Œ€ํ‘œ ๋ฉ”ํƒ€
45
- meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "์ œ๋ชฉ"]
46
  meta_exist = [c for c in meta_cols if c in df.columns]
47
- meta = (
48
- df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist]
49
- if meta_exist
50
- else pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})
51
- )
52
 
53
  merged = meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(yr, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
54
  return merged
55
 
56
- # ================== ์น˜ํ™˜ ์œ ํ‹ธ ==================
57
- # 1) ์ ‘๋‘์–ด ์™€์ผ๋“œ์นด๋“œ: <hp:..> ๋ฟ ์•„๋‹ˆ๋ผ <hwp:..>, <h:..> ๋“ฑ ๋ชจ๋‘ ํ—ˆ์šฉ
58
  FIELD_PAIR_RE_TMPL = (
59
- r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
60
  r'(.*?)'
61
- r'<(?P=prefix):fieldEnd\b[^>]*/>'
62
  )
63
-
64
- # 2) ํ† ํฐ(๋ฐฑ์—… ๊ฒฝ๋กœ)
65
  TOKEN_FMT = "{{{{{key}}}}}"
66
 
67
- # 3) ์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž: <hp:run>โ€ฆ<hp:t>ํ‚ค</hp:t>โ€ฆ</hp:run> ์ „์ฒด๋ฅผ ๊ฐ’ run๋“ค๋กœ ๊ต์ฒด
68
- TEXT_RUN_RE_TMPL = (
69
- r'(<(?P<prefix>[a-zA-Z0-9_]+):run\b[^>]*>\s*'
70
- r'(?:<(?P=prefix):t[^>]*>)\s*)'
71
- r'{name}'
72
- r'(\s*(?:</(?P=prefix):t>)\s*</(?P=prefix):run>)'
73
  )
74
 
75
- def _runs_plain(text: str) -> str:
76
- return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
 
 
 
 
 
 
77
 
78
- def _runs_multiline(text: str) -> str:
79
  """
80
- ๋‹ค์ค‘ ์ค„ ํ…์ŠคํŠธ๋ฅผ ์•ˆ์ „ํ•˜๊ฒŒ ํ‘œ์‹œ:
81
- - ๊ฐ ์ค„์„ ์•„์˜ˆ ๋…๋ฆฝ <hp:p> ๋ฌธ๋‹จ์œผ๋กœ ์ƒ์„ฑ
82
  """
83
- if text is None:
84
- return ""
85
- lines = str(text).replace("\r\n", "\n").split("\n")
86
- parts = []
87
- for ln in lines:
88
- escaped = html.escape(ln)
89
- parts.append(f"<hp:p><hp:run><hp:t>{escaped}</hp:t></hp:run></hp:p>")
90
- return "".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
93
  changed_any = False
94
 
95
- # 1) ํ•„๋“œ์Œ ์™„์ „ ์น˜ํ™˜
 
 
 
 
 
 
 
 
 
96
  for k, v in mapping.items():
97
- is_multiline = bool(re.match(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ|์—…๋ฌด๋ช…)\d+$", k, re.IGNORECASE))
98
- replacement = _runs_multiline(v) if is_multiline else _runs_plain(v)
 
99
  pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
100
  xml_new, n = pat.subn(replacement, xml)
101
  if n:
@@ -103,54 +122,42 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
103
  xml = xml_new
104
  changed_any = True
105
 
106
- # 2) ์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž ์น˜ํ™˜
 
 
 
 
107
  for k, v in mapping.items():
108
- is_multiline = bool(re.match(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ|์—…๋ฌด๋ช…)\d+$", k, re.IGNORECASE))
109
- replacement = _runs_multiline(v) if is_multiline else _runs_plain(v)
110
-
111
- # ์ •ํ™• ์ผ์น˜: run ์•ˆ์˜ ํ…์ŠคํŠธ๊ฐ€ ํ‚ค๋งŒ ์žˆ๋Š” ๊ฒฝ์šฐ
112
- pat_text = re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
113
- xml_new, n = pat_text.subn(replacement, xml)
114
- if n:
115
- dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + n
116
- xml = xml_new
 
 
 
117
  changed_any = True
118
- else:
119
- # ๋ถ€๋ถ„ ์ผ์น˜: ๊ฐ™์€ <t> ์•ˆ์— ๋‹ค๋ฅธ ๋ฌธ์ž์™€ ์„ž์—ฌ ์žˆ์„ ๋•Œ
120
- pat_tnode = re.compile(
121
- r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
122
- re.DOTALL
123
- )
124
- def repl_tnode(m):
125
- text_node = m.group(3)
126
- if k not in text_node:
127
- return m.group(0)
128
- val = "" if v is None else str(v)
129
- # ๋ถ€๋ถ„ ์น˜ํ™˜์€ ๋ฌธ๋‹จ ๊ตฌ์กฐ๋ฅผ ๊ฑด๋“œ๋ฆฌ์ง€ ์•Š๊ณ  ๋ฌธ์ž์—ด๋งŒ ๊ต์ฒด
130
- new_text = html.escape(text_node.replace(k, val))
131
- return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
132
-
133
- xml2 = pat_tnode.sub(repl_tnode, xml)
134
- if xml2 != xml:
135
- dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
136
- xml = xml2
137
- changed_any = True
138
 
139
- # 3) ํ† ํฐ ์น˜ํ™˜
140
  for k, v in mapping.items():
 
 
141
  tok = TOKEN_FMT.format(key=k)
142
  if tok in xml:
143
- rep = _runs_multiline(v) if re.match(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ|์—…๋ฌด๋ช…)\d+$", k, re.IGNORECASE) else html.escape("" if v is None else str(v))
144
- xml = xml.replace(tok, rep)
145
  dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
146
  changed_any = True
147
 
148
  if changed_any:
149
- dbg["touched"] = True
150
  return xml
151
 
152
  def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
153
- dbg = {"field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
154
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
155
  out_buf = io.BytesIO()
156
  zout = zipfile.ZipFile(out_buf, "w")
@@ -158,8 +165,7 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
158
  # mimetype ๋ฌด์••์ถ• + ๋งจ์•ž
159
  names = zin.namelist()
160
  if "mimetype" in names:
161
- zi = zipfile.ZipInfo("mimetype")
162
- zi.compress_type = zipfile.ZIP_STORED
163
  zout.writestr(zi, zin.read("mimetype"))
164
 
165
  for e in zin.infolist():
@@ -170,33 +176,25 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
170
  try:
171
  s = data.decode("utf-8", errors="ignore")
172
  before = s
173
- local_dbg = {
174
- "field_hits": dbg["field_hits"],
175
- "text_hits": dbg["text_hits"],
176
- "token_hits": dbg["token_hits"],
177
- "touched": False
178
- }
179
- s = _apply_to_xml(s, mapping, local_dbg)
180
  if s != before:
181
  dbg["touched_files"].append(e.filename)
182
  data = s.encode("utf-8")
183
  except Exception:
184
  pass
185
- zi = zipfile.ZipInfo(e.filename)
186
- zi.compress_type = zipfile.ZIP_DEFLATED
187
  zout.writestr(zi, data)
188
 
189
  zout.close(); out_buf.seek(0); zin.close()
190
  return out_buf.getvalue(), dbg
191
 
192
- # ================== UI ==================
193
  with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
194
  st.markdown("""
195
- - HWPX ZIP ๋‚ด๋ถ€ **๋ชจ๋“  XML**์—์„œ ์•„๋ž˜ ์ˆœ์„œ๋กœ ์น˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
196
- 1) `fieldBegin(name=ํ‚ค)`~`fieldEnd` **ํ•„๋“œ์Œ** ํ†ต์งธ ๊ต์ฒด
197
- 2) `<*:t>ํ‚ค</*:t>` ๊ฐ™์€ **์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž** run ๊ต์ฒด
198
- 3) `{{ํ‚ค}}` **ํ† ํฐ** ๊ต์ฒด
199
- - โ€˜๋ชฉ๋ก/์ œ๋ชฉ/์—…๋ฌด๋ช…โ€™ ๊ฐ™์ด ์—ฌ๋Ÿฌ ์ค„์ด ๋“ค์–ด๊ฐˆ ์ˆ˜ ์žˆ๋Š” ๊ฐ’์€ **๊ฐ ์ค„์„ ๋…๋ฆฝ run + `lineBreak`**๋กœ ๋„ฃ์–ด ๊ฒน์นจ์„ ๋ฐฉ์ง€ํ•ฉ๋‹ˆ๋‹ค.
200
  """)
201
 
202
  tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
@@ -208,8 +206,7 @@ if tpl and data:
208
  df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
209
 
210
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
211
- st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
212
- st.stop()
213
 
214
  st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
215
  st.dataframe(df.head(10), use_container_width=True)
@@ -227,60 +224,55 @@ if tpl and data:
227
 
228
  # 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ
229
  st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
230
- keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ", "์ข…๋ฃŒ์—ฐ๋„", "๋ณด์กด๊ธฐ๊ฐ„", "๋‹จ์œ„์—…๋ฌด", "๊ธฐ๋ก๋ฌผ์ฒ ", "๋ชฉ๋ก", "์ œ๋ชฉ", "์—…๋ฌด๋ช…"]
231
  mapping_preview = {}
232
  for i in range(int(n_per_page)):
233
  if i < len(records):
234
  r = records[i]
235
- for k in keys:
236
- val = r.get("์ƒ์‚ฐ์—ฐ๋„","") if k=="์ข…๋ฃŒ์—ฐ๋„" else r.get("์ œ๋ชฉ","") if k=="์—…๋ฌด๋ช…" else r.get(k,"")
237
- mapping_preview[f"{k}{i+1}"] = val
 
 
 
 
 
 
 
238
  else:
239
- for k in keys:
240
- mapping_preview[f"{k}{i+1}"] = ""
241
- st.dataframe(
242
- pd.DataFrame([{"ํ‚ค": k, "๊ฐ’ ์•ž๋ถ€๋ถ„": str(v)[:120]} for k, v in sorted(mapping_preview.items())]),
243
- use_container_width=True,
244
- height=320,
245
- )
246
 
247
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
248
- mem = io.BytesIO()
249
- zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
250
  pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
251
  debug_all = []
252
 
253
  for p in range(pages):
254
- chunk = records[p * int(n_per_page) : (p + 1) * int(n_per_page)]
255
- # ๋งคํ•‘ ๊ตฌ์ถ• (์ œ๋ชฉ == ์—…๋ฌด๋ช… ๋™์น˜)
256
  mapping = {}
257
  for i in range(int(n_per_page)):
258
  if i < len(chunk):
259
  r = chunk[i]
260
- mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", "")
261
- mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„", "")
262
- mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„", "")
263
- mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด", "")
264
- mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ", "")
265
- mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก", "")
266
- title_val = r.get("์ œ๋ชฉ", "")
267
  mapping[f"์ œ๋ชฉ{i+1}"] = title_val
268
- mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val # ํ…œํ”Œ๋ฆฟ์ด '์—…๋ฌด๋ช…1' ๊ฐ™์€ ํ‚ค๋ฅผ ์“ธ ์ˆ˜ ์žˆ์Œ
269
  else:
270
- for k in keys:
271
- mapping[f"{k}{i+1}"] = ""
272
 
273
  out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
274
- debug_all.append({"page": p + 1, "stats": dbg})
275
- name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ", "") for r in chunk]) if chunk else f"empty_{p+1}"
276
  zout.writestr(f"label_{name}.hwpx", out_hwpx)
277
 
278
- zout.close()
279
- mem.seek(0)
280
  st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem, file_name="labels_by_page.zip", mime="application/zip")
281
- st.download_button(
282
- "โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)",
283
- data=json.dumps(debug_all, ensure_ascii=False, indent=2),
284
- file_name="debug.json",
285
- mime="application/json",
286
- )
 
3
  import io, zipfile, re, html, json
4
  from typing import Dict, Tuple
5
 
6
+ st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ(HWPX) โ€” ๋ฌธ๋‹จ ๋‹จ์œ„ ์™„์ „ ์น˜ํ™˜", layout="wide")
7
+ st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX ํ•„๋“œยทํ† ํฐยทํ…์ŠคํŠธ ์™„์ „ ์น˜ํ™˜(๋ฌธ๋‹จ ๋‹จ์œ„)")
8
 
9
+ # -------------------- ๋ฐ์ดํ„ฐ ์œ ํ‹ธ --------------------
10
  def _year_range(series: pd.Series) -> str:
11
  s = series.astype(str).fillna("")
12
  v = s[~s.isin(["", "0", "0000"])]
13
+ if v.empty: return "0000-0000"
 
14
  nums = pd.to_numeric(v, errors="coerce").dropna().astype(int)
15
+ if nums.empty: return "0000-0000"
 
16
  return f"{nums.min():04d}-{nums.max():04d}"
17
 
18
  def build_rows(df: pd.DataFrame) -> pd.DataFrame:
 
32
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
33
  lists = []
34
  for b, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
35
+ lines = [f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r.get('์ œ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ œ๋ชฉ','')}"
36
+ for _, r in g.iterrows()]
 
 
37
  lists.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
38
  list_df = pd.DataFrame(lists)
39
 
40
  # ๋Œ€ํ‘œ ๋ฉ”ํƒ€
41
+ meta_cols = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ œ๋ชฉ"]
42
  meta_exist = [c for c in meta_cols if c in df.columns]
43
+ meta = df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ", as_index=False).first()[meta_exist] if meta_exist \
44
+ else pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique()})
 
 
 
45
 
46
  merged = meta.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(yr, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
47
  return merged
48
 
49
+ # -------------------- ์น˜ํ™˜ ์œ ํ‹ธ --------------------
 
50
  FIELD_PAIR_RE_TMPL = (
51
+ r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
52
  r'(.*?)'
53
+ r'<(?P=fprefix):fieldEnd\b[^>]*/>'
54
  )
 
 
55
  TOKEN_FMT = "{{{{{key}}}}}"
56
 
57
+ # ๋ฌธ๋‹จ(<*:p>) ํƒ์ƒ‰ ํŒจํ„ด
58
# Matches one paragraph element ``<prefix:p ...> ... </prefix:p>`` for any
# namespace prefix. Named groups: ``pprefix`` (namespace prefix), ``pattrs``
# (raw attribute text including its leading whitespace, possibly empty) and
# ``pbody`` (everything between the opening and closing tag).
#
# The tag name must be exactly ``p``: attributes are only accepted after
# whitespace, so elements whose name merely starts with "p" (``<hp:pic>``,
# ``<hp:pPr>``) are not mistaken for paragraphs. The ``(?<!/)`` lookbehind
# rejects self-closing ``<hp:p/>``, which has no body or closing tag.
#
# NOTE(review): ``pbody`` is non-greedy, so the match ends at the first
# closing tag; a paragraph nested inside another one (e.g. via a table cell)
# would be cut short. Confirm whether the templates in use nest paragraphs.
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>(?:\s[^>]*)?)(?<!/)>'
    r'(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL,
)
62
 
63
+ # ๋ฌธ๋‹จ ํ•˜๋‚˜๋ฅผ ๊ฐ™์€ ์Šคํƒ€์ผ๋กœ ๋ณต์ œํ•ด์ฃผ๋Š” ํ—ฌํผ
64
+ def _make_para(pprefix: str, pattrs: str, text: str) -> str:
65
+ esc = html.escape("" if text is None else str(text))
66
+ return f'<{pprefix}:p{pattrs}><{pprefix}:run><{pprefix}:t>{esc}</{pprefix}:t></{pprefix}:run></{pprefix}:p>'
67
+
68
+ def _split_lines(val) -> list:
69
+ if val is None: return [""]
70
+ return str(val).replace("\r\n","\n").split("\n")
71
 
72
def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """Replace every paragraph that mentions *key* with one paragraph per line of *value*.

    A paragraph (as matched by ``PARA_RE``) mentions *key* when its body
    contains any of:

    - a ``fieldBegin``/``fieldEnd`` pair whose ``name`` is *key*,
    - a ``<*:t>`` text node whose text contains *key*,
    - the ``{{key}}`` token.

    The whole matched paragraph is discarded and rebuilt with ``_make_para``,
    reusing the original namespace prefix and attribute text for every new
    paragraph.

    Args:
        xml: XML document text to process.
        key: Placeholder name to look for.
        value: Replacement text; it is split into lines by ``_split_lines``.
        dbg: Mutable stats dict. ``dbg["para_hits"][key]`` is incremented once
            per replaced paragraph and ``dbg["files_touched"]`` is set to
            ``True`` when the text changed.

    Returns:
        The XML text with matching paragraphs replaced; unchanged if none match.
    """
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
    # A key ending in a digit ("list1") must not fire on a longer key that
    # merely starts with it ("list10"), otherwise slot 10's paragraph would be
    # filled with slot 1's value.
    tail_guard = r'(?!\d)' if key[-1:].isdigit() else ''
    tnode_pat = re.compile(
        rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}{tail_guard}[^<]*</(?P=p):t>',
        re.DOTALL,
    )
    token_str = TOKEN_FMT.format(key=key)

    # The replacement lines do not depend on the matched paragraph: split once.
    lines = _split_lines(value)
    # Tolerate a stats dict that was created without the "para_hits" bucket.
    para_hits = dbg.setdefault("para_hits", {})

    def para_repl(m):
        body = m.group("pbody")
        mentions_key = (
            token_str in body
            or pair_pat.search(body) is not None
            or tnode_pat.search(body) is not None
        )
        if not mentions_key:
            return m.group(0)

        para_hits[key] = para_hits.get(key, 0) + 1
        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")
        return "".join(_make_para(pprefix, pattrs, ln) for ln in lines)

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        # Same flag name that _apply_to_xml sets and replace_in_hwpx seeds.
        dbg["files_touched"] = True
    return xml2
97
+
98
+ def _runs_plain(text: str) -> str:
99
+ return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
100
 
101
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
102
  changed_any = False
103
 
104
+ # 0) ๋‹ค์ค‘์ค„ ํ‚ค๋Š” ๋จผ์ € "๋ถ€๋ชจ ๋ฌธ๋‹จ ๊ต์ฒด"๋กœ ์ฒ˜๋ฆฌ
105
+ multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ œ๋ชฉ|์—…๋ฌด๋ช…)\d+$", re.IGNORECASE)
106
+ for k, v in mapping.items():
107
+ if multi_key.match(k):
108
+ xml_new = _replace_para_multiline(xml, k, v, dbg)
109
+ if xml_new != xml:
110
+ xml = xml_new
111
+ changed_any = True
112
+
113
+ # 1) ํ•„๋“œ์Œ(์ธ๋ผ์ธ) ์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
114
  for k, v in mapping.items():
115
+ if multi_key.match(k):
116
+ continue
117
+ replacement = _runs_plain(v)
118
  pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
119
  xml_new, n = pat.subn(replacement, xml)
120
  if n:
 
122
  xml = xml_new
123
  changed_any = True
124
 
125
+ # 2) ์ˆœ์ˆ˜ ํ…์ŠคํŠธ ์ž๋ฆฌํ‘œ์‹œ์ž(<*:t>ํ‚ค</*:t>) ๋ถ€๋ถ„์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
126
+ tnode_all = re.compile(
127
+ r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
128
+ re.DOTALL
129
+ )
130
  for k, v in mapping.items():
131
+ if multi_key.match(k):
132
+ continue
133
+ def repl_tnode(m):
134
+ text_node = m.group(3)
135
+ if k not in text_node:
136
+ return m.group(0)
137
+ new_text = html.escape(text_node.replace(k, "" if v is None else str(v)))
138
+ return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
139
+ xml2 = tnode_all.sub(repl_tnode, xml)
140
+ if xml2 != xml:
141
+ dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
142
+ xml = xml2
143
  changed_any = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ # 3) ํ† ํฐ ์น˜ํ™˜ โ€” ๋‹จ์ผ์ค„ ํ‚ค๋งŒ
146
  for k, v in mapping.items():
147
+ if multi_key.match(k):
148
+ continue
149
  tok = TOKEN_FMT.format(key=k)
150
  if tok in xml:
151
+ xml = xml.replace(tok, html.escape("" if v is None else str(v)))
 
152
  dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
153
  changed_any = True
154
 
155
  if changed_any:
156
+ dbg["files_touched"] = True
157
  return xml
158
 
159
  def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
160
+ dbg = {"para_hits":{}, "field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
161
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
162
  out_buf = io.BytesIO()
163
  zout = zipfile.ZipFile(out_buf, "w")
 
165
  # mimetype ๋ฌด์••์ถ• + ๋งจ์•ž
166
  names = zin.namelist()
167
  if "mimetype" in names:
168
+ zi = zipfile.ZipInfo("mimetype"); zi.compress_type = zipfile.ZIP_STORED
 
169
  zout.writestr(zi, zin.read("mimetype"))
170
 
171
  for e in zin.infolist():
 
176
  try:
177
  s = data.decode("utf-8", errors="ignore")
178
  before = s
179
+ s = _apply_to_xml(s, mapping, {"para_hits":dbg["para_hits"], "field_hits":dbg["field_hits"],
180
+ "text_hits":dbg["text_hits"], "token_hits":dbg["token_hits"],
181
+ "files_touched":False})
 
 
 
 
182
  if s != before:
183
  dbg["touched_files"].append(e.filename)
184
  data = s.encode("utf-8")
185
  except Exception:
186
  pass
187
+ zi = zipfile.ZipInfo(e.filename); zi.compress_type = zipfile.ZIP_DEFLATED
 
188
  zout.writestr(zi, data)
189
 
190
  zout.close(); out_buf.seek(0); zin.close()
191
  return out_buf.getvalue(), dbg
192
 
193
+ # -------------------- UI --------------------
194
  with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
195
  st.markdown("""
196
+ - **๋‹ค์ค‘ ์ค„(๋ชฉ๋ก/์ œ๋ชฉ/์—…๋ฌด๋ช…)์€ ๋ถ€๋ชจ ๋ฌธ๋‹จ์„ ์—ฌ๋Ÿฌ ๋ฌธ๋‹จ์œผ๋กœ ๊ต์ฒด**ํ•˜์—ฌ ๊ฒน์นจ ์—†์ด ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.
197
+ - ๋‚˜๋จธ์ง€ ํ‚ค๋Š” ํ•„๋“œ์Œ/ํ…์ŠคํŠธ/ํ† ํฐ์„ ์ธ๋ผ์ธ ์น˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
 
 
 
198
  """)
199
 
200
  tpl = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
 
206
  df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)
207
 
208
  if "๋ฐ•์Šค๋ฒˆํ˜ธ" not in df.columns:
209
+ st.error("โŒ ํ•„์ˆ˜ ์ปฌ๋Ÿผ '๋ฐ•์Šค๋ฒˆํ˜ธ'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."); st.stop()
 
210
 
211
  st.success("โœ… ์œ„์น˜ ๋งคํ•‘ ์™„๋ฃŒ (์—‘์…€ ์ธก)")
212
  st.dataframe(df.head(10), use_container_width=True)
 
224
 
225
  # 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ
226
  st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
227
+ keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ œ๋ชฉ","์—…๋ฌด๋ช…"]
228
  mapping_preview = {}
229
  for i in range(int(n_per_page)):
230
  if i < len(records):
231
  r = records[i]
232
+ mapping_preview.update({
233
+ f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}": r.get("๋ฐ•์Šค๋ฒˆํ˜ธ",""),
234
+ f"์ข…๋ฃŒ์—ฐ๋„{i+1}": r.get("์ƒ์‚ฐ์—ฐ๋„",""),
235
+ f"๋ณด์กด๊ธฐ๊ฐ„{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ„",""),
236
+ f"๋‹จ์œ„์—…๋ฌด{i+1}": r.get("๋‹จ์œ„์—…๋ฌด",""),
237
+ f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
238
+ f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
239
+ f"์ œ๋ชฉ{i+1}": r.get("์ œ๋ชฉ",""),
240
+ f"์—…๋ฌด๋ช…{i+1}": r.get("์ œ๋ชฉ",""), # ํ…œํ”Œ๋ฆฟ์ด '์—…๋ฌด๋ช…1'์„ ์“ฐ๋Š” ๊ฒฝ์šฐ ๋Œ€์‘
241
+ })
242
  else:
243
+ for k in keys: mapping_preview[f"{k}{i+1}"] = ""
244
+ st.dataframe(pd.DataFrame([{"ํ‚ค":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
245
+ use_container_width=True, height=320)
 
 
 
 
246
 
247
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
248
+ mem = io.BytesIO(); zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
 
249
  pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)
250
  debug_all = []
251
 
252
  for p in range(pages):
253
+ chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
 
254
  mapping = {}
255
  for i in range(int(n_per_page)):
256
  if i < len(chunk):
257
  r = chunk[i]
258
+ mapping[f"๋ฐ•์Šค๋ฒˆํ˜ธ{i+1}"] = r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","")
259
+ mapping[f"์ข…๋ฃŒ์—ฐ๋„{i+1}"] = r.get("์ƒ์‚ฐ์—ฐ๋„","")
260
+ mapping[f"๋ณด์กด๊ธฐ๊ฐ„{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ„","")
261
+ mapping[f"๋‹จ์œ„์—…๋ฌด{i+1}"] = r.get("๋‹จ์œ„์—…๋ฌด","")
262
+ mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
263
+ mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
264
+ title_val = r.get("์ œ๋ชฉ","")
265
  mapping[f"์ œ๋ชฉ{i+1}"] = title_val
266
+ mapping[f"์—…๋ฌด๋ช…{i+1}"] = title_val
267
  else:
268
+ for k in keys: mapping[f"{k}{i+1}"] = ""
 
269
 
270
  out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
271
+ debug_all.append({"page": p+1, "stats": dbg})
272
+ name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]) if chunk else f"empty_{p+1}"
273
  zout.writestr(f"label_{name}.hwpx", out_hwpx)
274
 
275
+ zout.close(); mem.seek(0)
 
276
  st.download_button("โฌ‡๏ธ ZIP ๋‹ค์šด๋กœ๋“œ", data=mem, file_name="labels_by_page.zip", mime="application/zip")
277
+ st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
278
+ file_name="debug.json", mime="application/json")