seawolf2357 committed on
Commit
2dfa8dd
·
verified ·
1 Parent(s): cfe72d3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +405 -0
app.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HWP 파일 변환기 - Gradio 웹 앱
3
+ pyhwp 저장소 기반으로 동작
4
+ """
5
+ import gradio as gr
6
+ import tempfile
7
+ import os
8
+ import subprocess
9
+ import shutil
10
+ import sys
11
+ from pathlib import Path
12
+
13
# Prefer a pyhwp checkout bundled next to this script: when a "pyhwp"
# directory exists here, it is placed at the front of the import path.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PYHWP_PATH = os.path.join(SCRIPT_DIR, 'pyhwp')
if os.path.exists(PYHWP_PATH):
    sys.path.insert(0, PYHWP_PATH)
    print(f"Added local pyhwp path: {PYHWP_PATH}")

# Probe for the pyhwp modules and record the outcome in PYHWP_AVAILABLE.
try:
    from hwp5.filestructure import Hwp5File
    from hwp5 import plat
except ImportError as e:
    PYHWP_AVAILABLE = False
    print(f"Warning: Could not import pyhwp modules: {e}")
else:
    PYHWP_AVAILABLE = True
    print("pyhwp modules loaded successfully")

# olefile backs the basic text-extraction path; note whether it is installed.
try:
    import olefile
except ImportError:
    OLEFILE_AVAILABLE = False
else:
    OLEFILE_AVAILABLE = True
36
+
37
+
38
def check_hwp_version(file_path):
    """Classify a file as a supported HWP document from its first 32 bytes.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A ``(label, is_supported)`` tuple:

        * ``("HWP v5 (OLE)", True)`` - the file starts with the OLE
          compound-file magic, the container used by HWP v5.
        * ``("HWP v5", True)`` - an ``HWP Document File`` marker other than
          the v3 one is present (kept for backward compatibility).
        * ``("HWP v3", False)`` - the legacy plain (non-OLE) HWP 3.x format.
        * ``("Unknown", False)`` - anything else, including an empty file.
        * ``("Error: ...", False)`` - the file could not be read.
    """
    ole_magic = b'\xd0\xcf\x11\xe0'

    try:
        with open(file_path, 'rb') as f:
            header = f.read(32)
    except (OSError, ValueError, TypeError) as e:
        return f"Error: {e}", False

    # HWP 3.x files begin with "HWP Document File V3.00"; the pyhwp tools
    # only handle v5, so report the format as unsupported instead of letting
    # the conversion fail later with a misleading message.
    if header.startswith(b'HWP Document File V3'):
        return "HWP v3", False
    if b'HWP Document File' in header:
        return "HWP v5", True
    if header.startswith(ole_magic):
        return "HWP v5 (OLE)", True
    return "Unknown", False
52
+
53
+
54
def extract_text_with_olefile(input_path):
    """Best-effort text extraction straight from the OLE container.

    Every stream whose path contains ``BodyText`` or ``Section`` is read,
    decoded as UTF-16 LE, and stripped of non-printable characters. The
    record structure of the stream is not parsed, so the result is rough.

    Args:
        input_path: Path of the HWP (OLE) file.

    Returns:
        ``(text, None)`` when any text was recovered, otherwise
        ``(None, error_message)``. This function does not raise.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈이 없습니다."

    import zlib  # local import: only this fallback path needs it

    try:
        ole = olefile.OleFileIO(input_path)
    except Exception as e:
        return None, f"OLE 파일 처리 오류: {str(e)}"

    text_parts = []
    try:
        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            if 'BodyText' not in entry_path and 'Section' not in entry_path:
                continue

            # Deliberately best-effort: an unreadable stream is skipped so
            # the remaining sections can still contribute text.
            try:
                data = ole.openstream(entry).read()
            except Exception:
                continue

            # HWP v5 normally stores section streams raw-deflate compressed
            # (NOTE(review): per the HWP 5.0 format spec - confirm). Inflate
            # when possible, otherwise keep the bytes as they are.
            try:
                data = zlib.decompress(data, -15)
            except zlib.error:
                pass

            text = data.decode('utf-16-le', errors='ignore')
            cleaned = ''.join(
                c for c in text
                if c.isprintable() or c in '\n\r\t'
            ).strip()
            if cleaned:
                text_parts.append(cleaned)
    except Exception as e:
        return None, f"OLE 파일 처리 오류: {str(e)}"
    finally:
        # Close the container on every path, including errors.
        ole.close()

    if text_parts:
        return '\n\n'.join(text_parts), None
    return None, "텍스트를 추출할 수 없습니다."
97
+
98
+
99
+ def convert_to_html_subprocess(input_path, output_dir):
100
+ """subprocess둜 hwp5html μ‹€ν–‰"""
101
+ output_path = os.path.join(output_dir, "output.html")
102
+
103
+ # κ°€λŠ₯ν•œ hwp5html κ²½λ‘œλ“€
104
+ hwp5html_paths = [
105
+ 'hwp5html', # PATH에 μžˆλŠ” 경우
106
+ os.path.join(SCRIPT_DIR, 'bin', 'hwp5html'),
107
+ os.path.join(PYHWP_PATH, 'bin', 'hwp5html'),
108
+ sys.executable.replace('python', 'hwp5html'),
109
+ ]
110
+
111
+ for cmd_path in hwp5html_paths:
112
+ try:
113
+ result = subprocess.run(
114
+ [cmd_path, '--output', output_path, input_path],
115
+ capture_output=True,
116
+ text=True,
117
+ timeout=120
118
+ )
119
+
120
+ if result.returncode == 0 and os.path.exists(output_path):
121
+ return output_path, None
122
+ except (FileNotFoundError, subprocess.TimeoutExpired):
123
+ continue
124
+ except Exception as e:
125
+ continue
126
+
127
+ return None, "hwp5html λͺ…령을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
128
+
129
+
130
+ def convert_to_odt_subprocess(input_path, output_dir):
131
+ """subprocess둜 hwp5odt μ‹€ν–‰"""
132
+ output_path = os.path.join(output_dir, "output.odt")
133
+
134
+ hwp5odt_paths = [
135
+ 'hwp5odt',
136
+ os.path.join(SCRIPT_DIR, 'bin', 'hwp5odt'),
137
+ os.path.join(PYHWP_PATH, 'bin', 'hwp5odt'),
138
+ ]
139
+
140
+ for cmd_path in hwp5odt_paths:
141
+ try:
142
+ result = subprocess.run(
143
+ [cmd_path, '--output', output_path, input_path],
144
+ capture_output=True,
145
+ text=True,
146
+ timeout=120
147
+ )
148
+
149
+ if result.returncode == 0 and os.path.exists(output_path):
150
+ return output_path, None
151
+ except (FileNotFoundError, subprocess.TimeoutExpired):
152
+ continue
153
+ except Exception as e:
154
+ continue
155
+
156
+ return None, "hwp5odt λͺ…령을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
157
+
158
+
159
+ def convert_to_txt_subprocess(input_path, output_dir):
160
+ """subprocess둜 hwp5txt μ‹€ν–‰"""
161
+ output_path = os.path.join(output_dir, "output.txt")
162
+
163
+ hwp5txt_paths = [
164
+ 'hwp5txt',
165
+ os.path.join(SCRIPT_DIR, 'bin', 'hwp5txt'),
166
+ os.path.join(PYHWP_PATH, 'bin', 'hwp5txt'),
167
+ ]
168
+
169
+ for cmd_path in hwp5txt_paths:
170
+ try:
171
+ result = subprocess.run(
172
+ [cmd_path, input_path],
173
+ capture_output=True,
174
+ text=True,
175
+ timeout=120
176
+ )
177
+
178
+ if result.returncode == 0 and result.stdout:
179
+ with open(output_path, 'w', encoding='utf-8') as f:
180
+ f.write(result.stdout)
181
+ return output_path, None
182
+ except (FileNotFoundError, subprocess.TimeoutExpired):
183
+ continue
184
+ except Exception as e:
185
+ continue
186
+
187
+ return None, "hwp5txt λͺ…령을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
188
+
189
+
190
# Output-format labels. convert_hwp() and the format selector in the UI both
# read these constants, so the two cannot drift apart.
_FORMAT_HTML = "HTML"
_FORMAT_ODT = "ODT (OpenDocument)"
_FORMAT_TXT = "TXT (텍스트)"
_FORMAT_EXTENSIONS = {
    _FORMAT_HTML: ".html",
    _FORMAT_ODT: ".odt",
    _FORMAT_TXT: ".txt",
}

# Maximum number of characters shown in the preview box.
_PREVIEW_CHARS = 5000


def convert_hwp(file, output_format, progress=gr.Progress()):
    """Convert an uploaded HWP file; main handler behind the convert button.

    Args:
        file: The upload - a path string, or an object whose ``name``
            attribute holds the path. ``None`` when nothing was uploaded.
        output_format: One of the labels offered by the format selector.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        ``(output_file, status_message, preview_text)``. ``output_file`` is
        ``None`` and ``preview_text`` is empty on any failure. This function
        does not raise; unexpected errors are printed and reported in the
        status message.
    """
    if file is None:
        return None, "❌ 파일을 업로드해주세요.", ""

    input_file = file.name if hasattr(file, 'name') else str(file)

    if not input_file.lower().endswith('.hwp'):
        return None, "❌ HWP 파일만 지원됩니다.", ""

    progress(0.1, desc="파일 분석 중...")

    version, is_valid = check_hwp_version(input_file)
    if not is_valid:
        return None, f"❌ 지원하지 않는 파일 형식입니다: {version}", ""

    # Reject unknown formats before creating the working directory, so this
    # path leaves nothing behind.
    ext = _FORMAT_EXTENSIONS.get(output_format)
    if ext is None:
        return None, f"❌ 지원하지 않는 형식: {output_format}", ""

    tmp_dir = tempfile.mkdtemp()
    succeeded = False

    try:
        # Work on a private copy of the upload.
        input_filename = os.path.basename(input_file)
        input_path = os.path.join(tmp_dir, input_filename)
        shutil.copy(input_file, input_path)

        progress(0.3, desc=f"{output_format}로 변환 중...")

        if output_format == _FORMAT_HTML:
            output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
        elif output_format == _FORMAT_ODT:
            output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
        else:
            # Try hwp5txt first; fall back to direct extraction with olefile.
            output_path, error = convert_to_txt_subprocess(input_path, tmp_dir)
            if output_path is None:
                progress(0.5, desc="기본 텍스트 추출 중...")
                text, error = extract_text_with_olefile(input_path)
                if text:
                    output_path = os.path.join(tmp_dir, "output.txt")
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(text)
                    error = None

        if output_path is None or not os.path.exists(output_path):
            error_msg = error or "변환에 실패했습니다."
            return None, f"❌ {error_msg}", ""

        progress(0.8, desc="파일 준비 중...")

        # Name the download after the uploaded file.
        final_filename = f"{Path(input_filename).stem}{ext}"
        final_output = os.path.join(tmp_dir, final_filename)
        if output_path != final_output:
            shutil.move(output_path, final_output)

        file_size = os.path.getsize(final_output)
        if file_size >= 1024:
            size_str = f"{file_size / 1024:.1f} KB"
        else:
            size_str = f"{file_size} bytes"

        progress(1.0, desc="완료!")

        # Preview for text-like outputs only. One extra character is read so
        # the truncation note appears only when something was really cut.
        preview = ""
        if ext in ('.txt', '.html'):
            try:
                with open(final_output, 'r', encoding='utf-8', errors='ignore') as f:
                    preview = f.read(_PREVIEW_CHARS + 1)
            except OSError:
                preview = ""  # the preview is optional; the file is still returned
            else:
                if len(preview) > _PREVIEW_CHARS:
                    preview = preview[:_PREVIEW_CHARS] + "\n\n... (미리보기 생략)"

        succeeded = True
        return final_output, f"✅ 변환 완료: {final_filename} ({size_str})", preview

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ 오류 발생: {str(e)}", ""

    finally:
        # On success the directory must survive, because the returned path
        # points into it. On every failure path it is removed here.
        if not succeeded:
            shutil.rmtree(tmp_dir, ignore_errors=True)


# CSS styles
css = """
#col-container {
    margin: 0 auto;
    max-width: 900px;
}
.upload-box {
    border: 2px dashed #6366f1 !important;
    border-radius: 12px !important;
}
.download-box {
    border: 2px solid #22c55e !important;
    border-radius: 12px !important;
    background: linear-gradient(135deg, #f0fdf4 0%, #dcfce7 100%) !important;
}
.preview-box {
    max-height: 400px;
    overflow-y: auto;
    font-family: 'Nanum Gothic', sans-serif;
    white-space: pre-wrap;
    background: #f8fafc;
    padding: 16px;
    border-radius: 8px;
}
"""

# Markdown shown at the top of the page.
_INTRO_MD = """
# 📄 HWP 파일 변환기

한글(HWP) 문서를 다양한 형식으로 변환합니다.
"""

# Markdown shown below the converter: format guide and limitations.
_GUIDE_MD = """
---
### ℹ️ 안내사항

| 형식 | 설명 | 용도 |
|------|------|------|
| **HTML** | 웹 페이지 형식 | 브라우저에서 보기, 웹 게시 |
| **ODT** | OpenDocument | LibreOffice, Google Docs |
| **TXT** | 순수 텍스트 | 텍스트 추출, 검색 |

> ⚠️ **제한사항:** HWP v5 형식(한글 2007+)만 지원 | 암호화 파일 불가 | 복잡한 레이아웃은 일부 손실 가능

---
*Powered by [pyhwp](https://github.com/mete0r/pyhwp) | GNU AGPL v3.0*
"""

# Gradio interface.
# NOTE(review): the component nesting below is reconstructed; the source this
# was recovered from had lost its indentation - confirm the layout.
with gr.Blocks(css=css, title="HWP 변환기", theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(_INTRO_MD)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📤 파일 업로드")
                file_input = gr.File(
                    label="HWP 파일 선택",
                    file_types=[".hwp"],
                    type="filepath",
                    elem_classes=["upload-box"],
                )

                format_select = gr.Radio(
                    label="변환 형식",
                    choices=[_FORMAT_HTML, _FORMAT_ODT, _FORMAT_TXT],
                    value=_FORMAT_TXT,
                    info="원하는 출력 형식을 선택하세요",
                )

                convert_btn = gr.Button(
                    "🔄 변환하기",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=1):
                gr.Markdown("### 📥 변환 결과")
                status_output = gr.Textbox(
                    label="상태",
                    interactive=False,
                    lines=2,
                )

                file_output = gr.File(
                    label="다운로드",
                    elem_classes=["download-box"],
                )

        # Preview area
        with gr.Accordion("📋 내용 미리보기", open=False):
            preview_output = gr.Textbox(
                label="",
                lines=15,
                max_lines=25,
                interactive=False,
                elem_classes=["preview-box"],
            )

        gr.Markdown(_GUIDE_MD)

    # Event handler: outputs are in the order convert_hwp returns them.
    convert_btn.click(
        fn=convert_hwp,
        inputs=[file_input, format_select],
        outputs=[file_output, status_output, preview_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()