dohyune committed on
Commit
715aec7
·
verified ·
1 Parent(s): bd43ad1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -41
app.py CHANGED
@@ -2,10 +2,10 @@ import streamlit as st
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
 
5
- st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (HWPX ํ•„๋“œ ์ „์šฉ)", layout="wide")
6
- st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX **ํ•„๋“œ์ปจํŠธ๋กค ์ „์šฉ**")
7
 
8
- # ---------- 데이터 전처리 ----------
9
  def compute_year_range(series: pd.Series) -> str:
10
  s = series.astype(str).fillna("")
11
  valid = s[~s.isin(["", "0", "0000"])]
@@ -29,11 +29,12 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
29
  else:
30
  prod_df = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})
31
 
32
- # ๋ชฉ๋ก(๊ด€๋ฆฌ๋ฒˆํ˜ธ+์ œ๋ชฉ)
33
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
34
  list_rows = []
35
  for box, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
36
- lines = [f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r['์ œ๋ชฉ']}" if has_mgmt else f"- {r['์ œ๋ชฉ']}" for _, r in g.iterrows()]
 
37
  list_rows.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
38
  list_df = pd.DataFrame(list_rows)
39
 
@@ -44,9 +45,6 @@ def build_merged_df(df: pd.DataFrame) -> pd.DataFrame:
44
 
45
  return meta_df.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(prod_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
46
 
47
- # ---------- HWPX 필드 치환 ----------
48
- FIELD_BLOCK_RE_TMPL = r'(<hp:fieldBegin[^>]*name="{name}"[^>]*>.*?</hp:fieldBegin>)(.*?)(<hp:fieldEnd[^>]*/>)'
49
-
50
  def _runs_plain(text: str) -> str:
51
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
52
 
@@ -60,17 +58,16 @@ def _runs_list(text: str) -> str:
60
  parts.append(f"<hp:run><hp:t>{html.escape(ln)}</hp:t></hp:run>")
61
  return "".join(parts)
62
 
63
- def _write_hwpx_like_src(zin: zipfile.ZipFile, writer_fn) -> bytes:
 
64
  out = io.BytesIO()
65
  zout = zipfile.ZipFile(out, "w")
66
 
67
- # 1) mimetype: 무압축 + 맨 앞
68
  if "mimetype" in zin.namelist():
69
  zi = zipfile.ZipInfo("mimetype")
70
  zi.compress_type = zipfile.ZIP_STORED
71
  zout.writestr(zi, zin.read("mimetype"))
72
 
73
- # 2) 나머지 파일
74
  for e in zin.infolist():
75
  if e.filename == "mimetype":
76
  continue
@@ -89,53 +86,64 @@ def _write_hwpx_like_src(zin: zipfile.ZipFile, writer_fn) -> bytes:
89
  zout.close(); out.seek(0)
90
  return out.getvalue()
91
 
92
- def apply_field_mode(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
93
- """ ํ•œ๊ธ€ ํ•„๋“œ์ปจํŠธ๋กค(name=...)์„ ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ณ ,
94
- fieldEnd ๋’ค์— ๋™์ผ ๋‚ด์šฉ์„ '๊ฐ€์‹œ ํ…์ŠคํŠธ'๋กœ ํ•œ ๋ฒˆ ๋” ์‚ฝ์ž…ํ•˜์—ฌ ํ•ญ์ƒ ๋ณด์ด๊ฒŒ ํ•œ๋‹ค. """
95
- dbg = {"mode":"field","files_touched":[], "field_hits":{}} if collect_debug else None
 
 
 
 
 
 
 
 
 
 
 
 
96
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
97
 
98
- # ์‹ค์ œ๋กœ ์กด์žฌํ•˜๋Š” name๋งŒ ๊ณจ๋ผ๋‚ด๊ธฐ(์†๋„โ†‘)
99
  present = set()
100
  for e in zin.infolist():
101
  if e.filename.startswith("Contents/") and e.filename.endswith(".xml"):
102
- s = zin.read(e.filename).decode("utf-8", errors="ignore")
103
- for k in mapping.keys():
104
- if f'name="{k}"' in s:
105
- present.add(k)
 
 
 
106
 
107
  def writer(fname: str, xml: str) -> str:
108
  changed = False
109
  for k in present:
110
  val = mapping.get(k, "")
111
  is_list = bool(re.match(r"^(๋ชฉ๋ก|list)\d+$", k, re.IGNORECASE))
112
- inner = _runs_list(val) if is_list else _runs_plain(val)
113
- pat = re.compile(FIELD_BLOCK_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
114
-
115
- def _repl(m):
116
- # 필드 내부 채우고, fieldEnd 뒤에 가시 텍스트 한 번 더
117
- vis = inner
118
- if dbg: dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + 1
119
- return f"{m.group(1)}{inner}{m.group(3)}{vis}"
120
 
121
- xml2, n = pat.subn(_repl, xml)
 
122
  if n:
123
  changed = True
124
  xml = xml2
 
 
125
  if changed and dbg and fname not in dbg["files_touched"]:
126
  dbg["files_touched"].append(fname)
127
  return xml
128
 
129
- out = _write_hwpx_like_src(zin, writer)
130
  zin.close()
131
  return (out, dbg) if collect_debug else (out, None)
132
 
133
- # ---------- UI ----------
134
  with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
135
  st.markdown("""
136
- - ํ…œํ”Œ๋ฆฟ์€ **ํ•œ๊ธ€ ํ•„๋“œ์ปจํŠธ๋กค**(์˜ˆ: `name="๋ฐ•์Šค๋ฒˆํ˜ธ1"`)์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. (ํ˜„์žฌ ์•ฑ์€ ํ•„๋“œ ๋ชจ๋“œ ์ „์šฉ)
137
- - ํ•œ ํŽ˜์ด์ง€์— ๋ผ๋ฒจ N๊ฐœ๋ผ๋ฉด, ํ•„๋“œ ์ด๋ฆ„์€ `๋ฐ•์Šค๋ฒˆํ˜ธ1..N`, `์ข…๋ฃŒ์—ฐ๋„1..N`, `๋ณด์กด๊ธฐ๊ฐ„1..N`, `๋‹จ์œ„์—…๋ฌด1..N`, `๊ธฐ๋ก๋ฌผ์ฒ 1..N`, `๋ชฉ๋ก1..N` ํ˜•ํƒœ์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค.
138
- - `์ข…๋ฃŒ์—ฐ๋„` ๊ฐ’์€ ๋ฐ•์Šค๋ณ„ ์ตœ์†Œ~์ตœ๋Œ€๋กœ ๋ฌถ์–ด **์ƒ์‚ฐ์—ฐ๋„(๋ฒ”์œ„)** ๋กœ ์ž…๋ ฅ๋ฉ๋‹ˆ๋‹ค.
139
  """)
140
 
141
  tpl_file = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
@@ -164,8 +172,8 @@ if tpl_file and data_file:
164
  work = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(selected)] if selected else merged
165
  rows = work.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")
166
 
167
- # 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ
168
- st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ (ํ•„๋“œ ์ด๋ฆ„ โ†” ๊ฐ’)")
169
  keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
170
  n = int(batch_size)
171
  preview = {}
@@ -177,15 +185,17 @@ if tpl_file and data_file:
177
  else:
178
  for k in keys:
179
  preview[f"{k}{i+1}"] = ""
180
- st.dataframe(pd.DataFrame([{"ํ•„๋“œ๋ช…":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(preview.items())]),
181
- use_container_width=True, height=320)
 
 
182
 
183
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
184
  mem_zip = io.BytesIO()
185
  zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
186
-
187
  pages = (len(rows) + n - 1) // n
188
  all_dbg = []
 
189
  for p in range(pages):
190
  chunk = rows[p*n:(p+1)*n]
191
  mapping = {}
@@ -198,7 +208,7 @@ if tpl_file and data_file:
198
  for k in keys:
199
  mapping[f"{k}{i+1}"] = ""
200
 
201
- out_hwpx, dbg = apply_field_mode(tpl_bytes, mapping, collect_debug=True)
202
  all_dbg.append({"page": p+1, "stats": dbg})
203
  name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]) if chunk else f"empty_{p+1}"
204
  zout.writestr(f"label_{name}.hwpx", out_hwpx)
@@ -208,4 +218,4 @@ if tpl_file and data_file:
208
  st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(all_dbg, ensure_ascii=False, indent=2),
209
  file_name="debug.json", mime="application/json")
210
 
211
- st.caption("ํ•„๋“œ ๋‚ด๋ถ€์— ๊ฐ’ + fieldEnd ๋’ค์— ๊ฐ€์‹œ ํ…์ŠคํŠธ๋ฅผ **์ค‘๋ณต ์‚ฝ์ž…**ํ•˜๋ฏ€๋กœ, ํ•œ๊ธ€์—์„œ ๊ฐ’์ด ํ•ญ์ƒ ๋ณด์ž…๋‹ˆ๋‹ค.")
 
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
 
5
+ st.set_page_config(page_title="๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ (HWPX ํ•„๋“œ ํ‰๋ฌธํ™”)", layout="wide")
6
+ st.title("๐Ÿ“ฆ ๋ฐ•์Šค๋ผ๋ฒจ ์ž๋™ ์ƒ์„ฑ๊ธฐ โ€” HWPX **ํ•„๋“œ ์ œ๊ฑฐ/ํ‰๋ฌธํ™” ๋ฐฉ์‹**")
7
 
8
+ # ================= 공통 유틸 =================
9
  def compute_year_range(series: pd.Series) -> str:
10
  s = series.astype(str).fillna("")
11
  valid = s[~s.isin(["", "0", "0000"])]
 
29
  else:
30
  prod_df = pd.DataFrame({"๋ฐ•์Šค๋ฒˆํ˜ธ": df["๋ฐ•์Šค๋ฒˆํ˜ธ"].unique(), "์ƒ์‚ฐ์—ฐ๋„": "0000-0000"})
31
 
32
+ # ๋ชฉ๋ก(๊ด€๋ฆฌ๋ฒˆํ˜ธ + ์ œ๋ชฉ)
33
  has_mgmt = "๊ด€๋ฆฌ๋ฒˆํ˜ธ" in df.columns
34
  list_rows = []
35
  for box, g in df.groupby("๋ฐ•์Šค๋ฒˆํ˜ธ"):
36
+ lines = [f"- {r['๊ด€๋ฆฌ๋ฒˆํ˜ธ']} {r['์ œ๋ชฉ']}" if has_mgmt else f"- {r['์ œ๋ชฉ']}"
37
+ for _, r in g.iterrows()]
38
  list_rows.append({"๋ฐ•์Šค๋ฒˆํ˜ธ": box, "๋ชฉ๋ก": "\r\n".join(lines)})
39
  list_df = pd.DataFrame(list_rows)
40
 
 
45
 
46
  return meta_df.merge(list_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left").merge(prod_df, on="๋ฐ•์Šค๋ฒˆํ˜ธ", how="left")
47
 
 
 
 
48
  def _runs_plain(text: str) -> str:
49
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
50
 
 
58
  parts.append(f"<hp:run><hp:t>{html.escape(ln)}</hp:t></hp:run>")
59
  return "".join(parts)
60
 
61
+ # =============== HWPX 쓰기 (mimetype 맨앞/무압축) ===============
62
+ def write_hwpx_like_src(zin: zipfile.ZipFile, writer_fn) -> bytes:
63
  out = io.BytesIO()
64
  zout = zipfile.ZipFile(out, "w")
65
 
 
66
  if "mimetype" in zin.namelist():
67
  zi = zipfile.ZipInfo("mimetype")
68
  zi.compress_type = zipfile.ZIP_STORED
69
  zout.writestr(zi, zin.read("mimetype"))
70
 
 
71
  for e in zin.infolist():
72
  if e.filename == "mimetype":
73
  continue
 
86
  zout.close(); out.seek(0)
87
  return out.getvalue()
88
 
# =============== Field flattening (field removal) substitution ===============
# Hangul (HWP) usually stores a field control like this:
#   <hp:run> ... <hp:fieldBegin name="KEY" .../> ... </hp:run>
#   (several runs / text in between)
#   <hp:run> ... <hp:fieldEnd/> ... </hp:run>
# The pattern below matches the whole "fieldBegin run ~ fieldEnd run" span so
# that the span can be replaced by plain value runs.
FIELD_RANGE_RE_TMPL = (
    r'(<hp:run[^>]*>[^<]*'
    r'<hp:fieldBegin[^>]*name="{name}"[^>]*/>'
    r'.*?</hp:run>)'
    r'(.*?)'
    r'(<hp:run[^>]*>.*?<hp:fieldEnd[^>]*/>.*?</hp:run>)'
)


def apply_field_flatten(hwpx_bytes: bytes, mapping: dict, collect_debug=False):
    """Replace every named field span in an HWPX template with plain runs.

    The XML parts under ``Contents/`` are scanned for ``name="<key>"`` for
    each key of *mapping*; only keys that occur are processed. For each such
    key, every span matched by ``FIELD_RANGE_RE_TMPL`` is replaced by the
    runs built from the mapped value (``_runs_list`` for keys matching the
    list-name pattern, ``_runs_plain`` otherwise).

    Args:
        hwpx_bytes: Raw bytes of the HWPX (zip) template.
        mapping: Field name -> value to insert.
        collect_debug: When true, a statistics dict is returned as the
            second tuple element; otherwise that element is ``None``.

    Returns:
        ``(out, dbg)`` where ``out`` is the value returned by
        ``write_hwpx_like_src`` and ``dbg`` is either ``None`` or a dict with
        ``mode``, ``files_touched`` (names of rewritten parts) and
        ``field_hits`` (field name -> number of replaced spans).
    """
    dbg = (
        {"mode": "field-flatten", "files_touched": [], "field_hits": {}}
        if collect_debug
        else None
    )

    # The context manager closes the archive even if rewriting raises.
    with zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r") as zin:
        # Keep only the field names that actually occur in the template.
        present = set()
        for entry in zin.infolist():
            member = entry.filename
            if not (member.startswith("Contents/") and member.endswith(".xml")):
                continue
            try:
                text = zin.read(member).decode("utf-8", errors="ignore")
            except Exception:
                # Best-effort scan: an unreadable part contributes no names,
                # but KeyboardInterrupt / SystemExit are no longer swallowed.
                continue
            present.update(k for k in mapping if f'name="{k}"' in text)

        # Build each pattern and its replacement once rather than once per
        # XML part; sorted() keeps the processing order deterministic.
        rules = []
        for key in sorted(present):
            value = mapping.get(key, "")
            # NOTE(review): the non-ASCII alternative in this literal looks
            # mis-encoded (probably UTF-8 Korean decoded with the wrong code
            # page). It is kept verbatim so it still agrees with the keys
            # built elsewhere in this file - confirm against the real app.py.
            is_list = re.match(r"^(๋ชฉ๋ก|list)\d+$", key, re.IGNORECASE) is not None
            runs = _runs_list(value) if is_list else _runs_plain(value)
            pattern = re.compile(
                FIELD_RANGE_RE_TMPL.format(name=re.escape(key)), re.DOTALL
            )
            rules.append((key, pattern, runs))

        def writer(fname: str, xml: str) -> str:
            touched = False
            for key, pattern, runs in rules:
                # A callable replacement is inserted verbatim. A plain string
                # would have any backslash in the value parsed as a regex
                # escape or group reference (and may raise re.error).
                xml, hits = pattern.subn(lambda _match: runs, xml)
                if hits:
                    touched = True
                    if dbg is not None:
                        # Count every replaced span, not just one per part.
                        dbg["field_hits"][key] = dbg["field_hits"].get(key, 0) + hits

            if touched and dbg is not None and fname not in dbg["files_touched"]:
                dbg["files_touched"].append(fname)
            return xml

        out = write_hwpx_like_src(zin, writer)

    return out, dbg
140
 
141
+ # ================= UI =================
142
  with st.expander("์‚ฌ์šฉ๋ฒ•", expanded=True):
143
  st.markdown("""
144
+ - ํ…œํ”Œ๋ฆฟ์€ **ํ•œ๊ธ€ ํ•„๋“œ์ปจํŠธ๋กค**์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. (์˜ˆ: `name="๋ฐ•์Šค๋ฒˆํ˜ธ1"`)
145
+ - ์ด ์•ฑ์€ ํ•„๋“œ ๊ตฌ๊ฐ„์„ **ํ‰๋ฌธํ™”(ํ•„๋“œ ์ œ๊ฑฐ)** ํ•˜์—ฌ ๊ฐ’ run๋“ค๋กœ ๋ฐ”๊ฟ‰๋‹ˆ๋‹ค. โ†’ ํ•œ๊ธ€ ๋ทฐ์–ด์—์„œ **ํ•ญ์ƒ ๋ณด์ž„**.
146
+ - ๋ผ๋ฒจ ํ•œ ํŽ˜์ด์ง€์— N๊ฐœ๋ฉด, ํ•„๋“œ๋ช…์€ `๋ฐ•์Šค๋ฒˆํ˜ธ1..N`, `์ข…๋ฃŒ์—ฐ๋„1..N`, `๋ณด์กด๊ธฐ๊ฐ„1..N`, `๋‹จ์œ„์—…๋ฌด1..N`, `๊ธฐ๋ก๋ฌผ์ฒ 1..N`, `๋ชฉ๋ก1..N`.
147
  """)
148
 
149
  tpl_file = st.file_uploader("๐Ÿ“„ HWPX ํ…œํ”Œ๋ฆฟ ์—…๋กœ๋“œ", type=["hwpx"])
 
172
  work = merged[merged["๋ฐ•์Šค๋ฒˆํ˜ธ"].isin(selected)] if selected else merged
173
  rows = work.sort_values("๋ฐ•์Šค๋ฒˆํ˜ธ").to_dict(orient="records")
174
 
175
+ # 1ํŽ˜์ด์ง€ ํ”„๋ฆฌ๋ทฐ
176
+ st.subheader("๐Ÿงช 1ํŽ˜์ด์ง€ ๋งคํ•‘ ํ”„๋ฆฌ๋ทฐ")
177
  keys = ["๋ฐ•์Šค๋ฒˆํ˜ธ","์ข…๋ฃŒ์—ฐ๋„","๋ณด์กด๊ธฐ๊ฐ„","๋‹จ์œ„์—…๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก"]
178
  n = int(batch_size)
179
  preview = {}
 
185
  else:
186
  for k in keys:
187
  preview[f"{k}{i+1}"] = ""
188
+ st.dataframe(
189
+ pd.DataFrame([{"ํ•„๋“œ๋ช…":k, "๊ฐ’ ์•ž๋ถ€๋ถ„":str(v)[:120]} for k,v in sorted(preview.items())]),
190
+ use_container_width=True, height=320
191
+ )
192
 
193
  if st.button("๐Ÿš€ ๋ผ๋ฒจ ์ƒ์„ฑ (ํŽ˜์ด์ง€๋ณ„ HWPX ZIP)"):
194
  mem_zip = io.BytesIO()
195
  zout = zipfile.ZipFile(mem_zip, "w", zipfile.ZIP_DEFLATED)
 
196
  pages = (len(rows) + n - 1) // n
197
  all_dbg = []
198
+
199
  for p in range(pages):
200
  chunk = rows[p*n:(p+1)*n]
201
  mapping = {}
 
208
  for k in keys:
209
  mapping[f"{k}{i+1}"] = ""
210
 
211
+ out_hwpx, dbg = apply_field_flatten(tpl_bytes, mapping, collect_debug=True)
212
  all_dbg.append({"page": p+1, "stats": dbg})
213
  name = "_".join([r.get("๋ฐ•์Šค๋ฒˆํ˜ธ","") for r in chunk]) if chunk else f"empty_{p+1}"
214
  zout.writestr(f"label_{name}.hwpx", out_hwpx)
 
218
  st.download_button("โฌ‡๏ธ ๋””๋ฒ„๊ทธ(JSON)", data=json.dumps(all_dbg, ensure_ascii=False, indent=2),
219
  file_name="debug.json", mime="application/json")
220
 
221
+ st.caption("ํ•„๋“œ ๊ตฌ๊ฐ„์„ ํ†ต์งธ๋กœ ๊ฐ’ run๋“ค๋กœ ๊ต์ฒดํ•ฉ๋‹ˆ๋‹ค. (ํ•„๋“œ ์ œ๊ฑฐ โ†’ ๊ฐ’์ด ํ™•์‹คํžˆ ๋ณด์ž…๋‹ˆ๋‹ค)")