dohyune committed on
Commit
b6bf769
·
verified ·
1 Parent(s): 57565d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -19
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
- from typing import Dict, Tuple, Optional
5
 
6
- st.set_page_config(page_title="πŸ“¦ λ°•μŠ€λΌλ²¨(HWPX) β€” ν•„λ“œ μ™„μ „ μΉ˜ν™˜", layout="wide")
7
- st.title("πŸ“¦ λ°•μŠ€λΌλ²¨ μžλ™ 생성기 β€” HWPX ν•„λ“œ μ™„μ „ μΉ˜ν™˜(λͺ¨λ“  XML / 접두어 μ™€μΌλ“œμΉ΄λ“œ)")
8
 
9
  # ================== 데이터 유틸 ==================
10
  def _year_range(series: pd.Series) -> str:
@@ -47,20 +47,28 @@ def build_rows(df: pd.DataFrame) -> pd.DataFrame:
47
  return merged
48
 
49
  # ================== 치환 유틸 ==================
50
- # 접두어 와일드카드: <hp:..> 뿐 아니라 <hwp:..>, <h:..> 등 모두 허용
51
  FIELD_PAIR_RE_TMPL = (
52
  r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
53
  r'(.*?)'
54
  r'<(?P=prefix):fieldEnd\b[^>]*/>'
55
  )
56
 
57
- # 토큰(백업 경로)
58
  TOKEN_FMT = "{{{{{key}}}}}"
59
 
60
- def _run_for_plain(text: str) -> str:
 
 
 
 
 
 
 
 
61
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
62
 
63
- def _run_for_list(text: str) -> str:
64
  if text is None: return ""
65
  lines = str(text).replace("\r\n","\n").split("\n")
66
  parts = []
@@ -72,11 +80,10 @@ def _run_for_list(text: str) -> str:
72
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
73
  changed_any = False
74
 
75
- # 1) ν•„λ“œμŒ μ™„μ „ μΉ˜ν™˜ (λͺ¨λ“  접두어, λͺ¨λ“  XML λŒ€μƒ)
76
  for k, v in mapping.items():
77
  is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
78
- replacement = _run_for_list(v) if is_list else _run_for_plain(v)
79
-
80
  pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
81
  xml_new, n = pat.subn(replacement, xml)
82
  if n:
@@ -84,21 +91,54 @@ def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
84
  xml = xml_new
85
  changed_any = True
86
 
87
- # 2) λ°±μ—… 경둜: 토큰 μΉ˜ν™˜ ({{ν‚€}}κ°€ μ–΄λ”˜κ°€ λ‚¨μ•„μžˆλ‹€λ©΄)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  for k, v in mapping.items():
89
  tok = TOKEN_FMT.format(key=k)
90
  if tok in xml:
91
- rep = _run_for_list(v) if re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE) else html.escape("" if v is None else str(v))
92
  xml = xml.replace(tok, rep)
93
  dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
94
  changed_any = True
95
 
96
  if changed_any:
97
- dbg["files_touched"] = True
98
  return xml
99
 
100
  def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
101
- dbg = {"field_hits":{}, "token_hits":{}, "touched_files": []}
102
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
103
  out_buf = io.BytesIO()
104
  zout = zipfile.ZipFile(out_buf, "w")
@@ -117,7 +157,8 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
117
  try:
118
  s = data.decode("utf-8", errors="ignore")
119
  before = s
120
- s = _apply_to_xml(s, mapping, {"field_hits": dbg["field_hits"], "token_hits": dbg["token_hits"], "files_touched": False})
 
121
  if s != before:
122
  dbg["touched_files"].append(e.filename)
123
  data = s.encode("utf-8")
@@ -132,10 +173,11 @@ def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, d
132
  # ================== UI ==================
133
  with st.expander("μ‚¬μš©λ²•", expanded=True):
134
  st.markdown("""
135
- - ν…œν”Œλ¦Ώμ€ **HWPX(ν•œκΈ€)**이며, ν•„λ“œμ»¨νŠΈλ‘€ `name="λ°•μŠ€λ²ˆν˜Έ1"` λ“±μ˜ 이름을 κ·ΈλŒ€λ‘œ μ‚¬μš©ν•©λ‹ˆλ‹€.
136
- - 이 앱은 ZIP λ‚΄λΆ€μ˜ **λͺ¨λ“  XML**을 ν›‘μœΌλ©°, 접두어가 무엇이든(`<hp:...>`, `<hwp:...>` λ“±)
137
- **`fieldBegin(name=ν‚€)` ~ `fieldEnd` 사이 전체 블둝을 κ°’ runλ“€λ‘œ ꡐ체**ν•©λ‹ˆλ‹€.
138
- - 토큰(`{{λ°•μŠ€λ²ˆν˜Έ1}}`)이 남아 있으면 **λ°±μ—… 경둜**둜 κ·Έ μžλ¦¬λ„ μΉ˜ν™˜ν•©λ‹ˆλ‹€.
 
139
  """)
140
 
141
  tpl = st.file_uploader("πŸ“„ HWPX ν…œν”Œλ¦Ώ μ—…λ‘œλ“œ", type=["hwpx"])
 
1
  import streamlit as st
2
  import pandas as pd
3
  import io, zipfile, re, html, json
4
+ from typing import Dict, Tuple
5
 
6
+ st.set_page_config(page_title="πŸ“¦ λ°•μŠ€λΌλ²¨(HWPX) β€” μ™„μ „ μΉ˜ν™˜(ν•„λ“œ/토큰/ν…μŠ€νŠΈ)", layout="wide")
7
+ st.title("πŸ“¦ λ°•μŠ€λΌλ²¨ μžλ™ 생성기 β€” HWPX ν•„λ“œΒ·ν† ν°Β·ν…μŠ€νŠΈ μžλ¦¬ν‘œμ‹œμž **μ™„μ „ μΉ˜ν™˜**")
8
 
9
  # ================== 데이터 유틸 ==================
10
  def _year_range(series: pd.Series) -> str:
 
47
  return merged
48
 
49
  # ================== 치환 유틸 ==================
50
+ # 1) 접두어 와일드카드: <hp:..> 뿐 아니라 <hwp:..>, <h:..> 등 모두 허용
51
  FIELD_PAIR_RE_TMPL = (
52
  r'<(?P<prefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
53
  r'(.*?)'
54
  r'<(?P=prefix):fieldEnd\b[^>]*/>'
55
  )
56
 
57
+ # 2) 토큰(백업 경로)
58
  TOKEN_FMT = "{{{{{key}}}}}"
59
 
60
+ # 3) 순수 텍스트 자리표시자: <hp:run>…<hp:t>키</hp:t>…</hp:run> 전체를 값 run들로 교체
61
+ TEXT_RUN_RE_TMPL = (
62
+ r'(<(?P<prefix>[a-zA-Z0-9_]+):run\b[^>]*>\s*'
63
+ r'(?:<(?P=prefix):t[^>]*>)\s*)'
64
+ r'{name}'
65
+ r'(\s*(?:</(?P=prefix):t>)\s*</(?P=prefix):run>)'
66
+ )
67
+
68
+ def _runs_plain(text: str) -> str:
69
  return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>"
70
 
71
+ def _runs_list(text: str) -> str:
72
  if text is None: return ""
73
  lines = str(text).replace("\r\n","\n").split("\n")
74
  parts = []
 
80
  def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
81
  changed_any = False
82
 
83
+ # 1) ν•„λ“œμŒ μ™„μ „ μΉ˜ν™˜
84
  for k, v in mapping.items():
85
  is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
86
+ replacement = _runs_list(v) if is_list else _runs_plain(v)
 
87
  pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
88
  xml_new, n = pat.subn(replacement, xml)
89
  if n:
 
91
  xml = xml_new
92
  changed_any = True
93
 
94
+ # 2) 순수 ν…μŠ€νŠΈ μžλ¦¬ν‘œμ‹œμž μΉ˜ν™˜ (<*:t>ν‚€</*:t>λ₯Ό κ°’ runλ“€λ‘œ)
95
+ for k, v in mapping.items():
96
+ is_list = bool(re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE))
97
+ replacement = _runs_list(v) if is_list else _runs_plain(v)
98
+ # μ •ν™• 일치 μΉ˜ν™˜ (κ³΅λ°±Β·κ°œν–‰ 없이 λ”± κ·Έ ν…μŠ€νŠΈμΌ λ•Œ)
99
+ pat_text = re.compile(TEXT_RUN_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
100
+ xml_new, n = pat_text.subn(replacement, xml)
101
+ if n:
102
+ dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + n
103
+ xml = xml_new
104
+ changed_any = True
105
+ else:
106
+ # λΆ€λΆ„ 일치(ν•œ run μ•ˆμ— λ‹€λ₯Έ λ¬Έμžμ™€ μ„žμ—¬ μžˆμ„ λ•Œ) β†’ <t>λ‚΄μš©</t>만 κ°’μœΌλ‘œ κ°ˆμ•„λΌμ›€(μ€„λ°”κΏˆμ€ \n κ·ΈλŒ€λ‘œ)
107
+ # μ•ˆμ „μ„ μœ„ν•΄ 'ν•œ run λ‚΄'μ—μ„œλ§Œ 처리
108
+ pat_tnode = re.compile(
109
+ r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
110
+ re.DOTALL
111
+ )
112
+ def repl_tnode(m):
113
+ text_node = m.group(3)
114
+ if k not in text_node:
115
+ return m.group(0)
116
+ # λ‹¨μˆœ ν…μŠ€νŠΈ μΉ˜ν™˜ (리슀트면 μ€„λ°”κΏˆμ„ μ‹€μ œ lineBreak둜 λ°”κΎΈκΈ° μ–΄λ €μ›Œμ„œ μ—¬κΈ°μ„  plain ν…μŠ€νŠΈλ‘œ μš°μ„  λŒ€μž…)
117
+ val = "" if v is None else str(v)
118
+ new_text = html.escape(text_node.replace(k, val))
119
+ return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
120
+
121
+ xml2 = pat_tnode.sub(repl_tnode, xml)
122
+ if xml2 != xml:
123
+ dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
124
+ xml = xml2
125
+ changed_any = True
126
+
127
+ # 3) 토큰 μΉ˜ν™˜
128
  for k, v in mapping.items():
129
  tok = TOKEN_FMT.format(key=k)
130
  if tok in xml:
131
+ rep = _runs_list(v) if re.match(r"^(λͺ©λ‘|list)\d+$", k, re.IGNORECASE) else html.escape("" if v is None else str(v))
132
  xml = xml.replace(tok, rep)
133
  dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
134
  changed_any = True
135
 
136
  if changed_any:
137
+ dbg["touched"] = True
138
  return xml
139
 
140
  def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
141
+ dbg = {"field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
142
  zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
143
  out_buf = io.BytesIO()
144
  zout = zipfile.ZipFile(out_buf, "w")
 
157
  try:
158
  s = data.decode("utf-8", errors="ignore")
159
  before = s
160
+ local_dbg = {"field_hits": dbg["field_hits"], "text_hits": dbg["text_hits"], "token_hits": dbg["token_hits"], "touched": False}
161
+ s = _apply_to_xml(s, mapping, local_dbg)
162
  if s != before:
163
  dbg["touched_files"].append(e.filename)
164
  data = s.encode("utf-8")
 
173
  # ================== UI ==================
174
  with st.expander("μ‚¬μš©λ²•", expanded=True):
175
  st.markdown("""
176
+ - 이 앱은 HWPX의 **λͺ¨λ“  XML**μ—μ„œ λ‹€μŒ 3κ°€μ§€λ₯Ό μˆœμ„œλŒ€λ‘œ μΉ˜ν™˜ν•©λ‹ˆλ‹€.
177
+ 1) `fieldBegin(name=ν‚€)` ~ `fieldEnd` **ν•„λ“œμŒ** 전체λ₯Ό κ°’μœΌλ‘œ ꡐ체
178
+ 2) `<*:t>ν‚€</*:t>` 같은 **순수 ν…μŠ€νŠΈ μžλ¦¬ν‘œμ‹œμž**λ₯Ό κ°’(runλ“€)으둜 ꡐ체
179
+ 3) `{{ν‚€}}` **토큰**을 κ°’μœΌλ‘œ ꡐ체
180
+ - 디버그 JSON의 `field_hits`/`text_hits`/`token_hits`κ°€ 0이면 ν…œν”Œλ¦Ώμ— ν•΄λ‹Ή νƒ€μž…μ˜ μžλ¦¬ν‘œμ‹œμžκ°€ μ—†λŠ” κ²ƒμž…λ‹ˆλ‹€.
181
  """)
182
 
183
  tpl = st.file_uploader("πŸ“„ HWPX ν…œν”Œλ¦Ώ μ—…λ‘œλ“œ", type=["hwpx"])