seawolf2357 committed on
Commit
edb886d
·
verified ·
1 Parent(s): 1df7414

Create app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +1483 -0
app-backup.py ADDED
@@ -0,0 +1,1483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HWP AI 어시스턴트 - Gradio 웹 앱
3
+ AI가 HWP 파일을 읽고, 보고, 말하며, 생각하고 기억합니다.
4
+ - Tab 1: LLM 채팅 (스트리밍, 파일 첨부 지원)
5
+ - Tab 2: HWP 변환기
6
+ """
7
+ import gradio as gr
8
+ import tempfile
9
+ import os
10
+ import subprocess
11
+ import shutil
12
+ import sys
13
+ import re
14
+ import json
15
+ import uuid
16
+ import sqlite3
17
+ import base64
18
+ import requests
19
+ import zlib
20
+ import zipfile
21
+ from pathlib import Path
22
+ from datetime import datetime
23
+ from typing import Generator, List, Dict, Optional
24
+ from xml.etree import ElementTree as ET
25
+
26
+ # Groq 라이브러리 임포트
27
+ try:
28
+ from groq import Groq
29
+ GROQ_AVAILABLE = True
30
+ print("✅ Groq library loaded")
31
+ except ImportError:
32
+ GROQ_AVAILABLE = False
33
+ print("❌ Groq library not available - pip install groq")
34
+
35
+ # ============== Comic Style CSS ==============
36
+ COMIC_CSS = """
37
+ @import url('https://fonts.googleapis.com/css2?family=Bangers&family=Comic+Neue:wght@400;700&display=swap');
38
+
39
+ .gradio-container {
40
+ background-color: #FEF9C3 !important;
41
+ background-image: radial-gradient(#1F2937 1px, transparent 1px) !important;
42
+ background-size: 20px 20px !important;
43
+ min-height: 100vh !important;
44
+ font-family: 'Comic Neue', cursive, sans-serif !important;
45
+ }
46
+
47
+ footer, .footer, .gradio-container footer, .built-with, [class*="footer"], .gradio-footer, a[href*="gradio.app"] {
48
+ display: none !important;
49
+ visibility: hidden !important;
50
+ height: 0 !important;
51
+ }
52
+
53
+ /* HOME Button Style */
54
+ .home-button-container {
55
+ display: flex;
56
+ justify-content: center;
57
+ align-items: center;
58
+ gap: 15px;
59
+ margin-bottom: 15px;
60
+ padding: 12px 20px;
61
+ background: linear-gradient(135deg, #10B981 0%, #059669 100%);
62
+ border: 4px solid #1F2937;
63
+ border-radius: 12px;
64
+ box-shadow: 6px 6px 0 #1F2937;
65
+ }
66
+
67
+ .home-button {
68
+ display: inline-flex;
69
+ align-items: center;
70
+ gap: 8px;
71
+ padding: 10px 25px;
72
+ background: linear-gradient(135deg, #FACC15 0%, #F59E0B 100%);
73
+ color: #1F2937;
74
+ font-family: 'Bangers', cursive;
75
+ font-size: 1.4rem;
76
+ letter-spacing: 2px;
77
+ text-decoration: none;
78
+ border: 3px solid #1F2937;
79
+ border-radius: 8px;
80
+ box-shadow: 4px 4px 0 #1F2937;
81
+ transition: all 0.2s ease;
82
+ }
83
+
84
+ .home-button:hover {
85
+ background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%);
86
+ transform: translate(-2px, -2px);
87
+ box-shadow: 6px 6px 0 #1F2937;
88
+ }
89
+
90
+ .home-button:active {
91
+ transform: translate(2px, 2px);
92
+ box-shadow: 2px 2px 0 #1F2937;
93
+ }
94
+
95
+ .url-display {
96
+ font-family: 'Comic Neue', cursive;
97
+ font-size: 1.1rem;
98
+ font-weight: 700;
99
+ color: #FFF;
100
+ background: rgba(0,0,0,0.3);
101
+ padding: 8px 16px;
102
+ border-radius: 6px;
103
+ border: 2px solid rgba(255,255,255,0.3);
104
+ }
105
+
106
+ .header-container {
107
+ text-align: center;
108
+ padding: 25px 20px;
109
+ background: linear-gradient(135deg, #3B82F6 0%, #8B5CF6 100%);
110
+ border: 4px solid #1F2937;
111
+ border-radius: 12px;
112
+ margin-bottom: 20px;
113
+ box-shadow: 8px 8px 0 #1F2937;
114
+ position: relative;
115
+ }
116
+
117
+ .header-title {
118
+ font-family: 'Bangers', cursive !important;
119
+ color: #FFF !important;
120
+ font-size: 2.8rem !important;
121
+ text-shadow: 3px 3px 0 #1F2937 !important;
122
+ letter-spacing: 3px !important;
123
+ margin: 0 !important;
124
+ }
125
+
126
+ .header-subtitle {
127
+ font-family: 'Comic Neue', cursive !important;
128
+ font-size: 1.1rem !important;
129
+ color: #FEF9C3 !important;
130
+ margin-top: 8px !important;
131
+ font-weight: 700 !important;
132
+ }
133
+
134
+ .stats-badge {
135
+ display: inline-block;
136
+ background: #FACC15;
137
+ color: #1F2937;
138
+ padding: 6px 14px;
139
+ border-radius: 20px;
140
+ font-size: 0.9rem;
141
+ margin: 3px;
142
+ font-weight: 700;
143
+ border: 2px solid #1F2937;
144
+ box-shadow: 2px 2px 0 #1F2937;
145
+ }
146
+
147
+ /* 무료 서비스 안내 박스 */
148
+ .free-service-notice {
149
+ text-align: center;
150
+ padding: 10px 15px;
151
+ background: linear-gradient(135deg, #FEE2E2 0%, #FECACA 100%);
152
+ border: 3px solid #1F2937;
153
+ border-radius: 8px;
154
+ margin: 10px 0;
155
+ box-shadow: 4px 4px 0 #1F2937;
156
+ font-family: 'Comic Neue', cursive;
157
+ font-weight: 700;
158
+ color: #991B1B;
159
+ }
160
+
161
+ .free-service-notice a {
162
+ color: #1D4ED8;
163
+ text-decoration: none;
164
+ font-weight: 700;
165
+ }
166
+
167
+ .free-service-notice a:hover {
168
+ text-decoration: underline;
169
+ }
170
+
171
+ .gr-panel, .gr-box, .gr-form, .block, .gr-group {
172
+ background: #FFF !important;
173
+ border: 3px solid #1F2937 !important;
174
+ border-radius: 8px !important;
175
+ box-shadow: 5px 5px 0 #1F2937 !important;
176
+ }
177
+
178
+ .gr-button-primary, button.primary, .gr-button.primary {
179
+ background: linear-gradient(135deg, #EF4444 0%, #F97316 100%) !important;
180
+ border: 3px solid #1F2937 !important;
181
+ border-radius: 8px !important;
182
+ color: #FFF !important;
183
+ font-family: 'Bangers', cursive !important;
184
+ font-size: 1.3rem !important;
185
+ letter-spacing: 2px !important;
186
+ padding: 12px 24px !important;
187
+ box-shadow: 4px 4px 0 #1F2937 !important;
188
+ text-shadow: 1px 1px 0 #1F2937 !important;
189
+ transition: all 0.2s ease !important;
190
+ }
191
+
192
+ .gr-button-primary:hover, button.primary:hover {
193
+ background: linear-gradient(135deg, #DC2626 0%, #EA580C 100%) !important;
194
+ transform: translate(-2px, -2px) !important;
195
+ box-shadow: 6px 6px 0 #1F2937 !important;
196
+ }
197
+
198
+ .gr-button-primary:active, button.primary:active {
199
+ transform: translate(2px, 2px) !important;
200
+ box-shadow: 2px 2px 0 #1F2937 !important;
201
+ }
202
+
203
+ textarea, input[type="text"], input[type="number"] {
204
+ background: #FFF !important;
205
+ border: 3px solid #1F2937 !important;
206
+ border-radius: 8px !important;
207
+ color: #1F2937 !important;
208
+ font-family: 'Comic Neue', cursive !important;
209
+ font-weight: 700 !important;
210
+ }
211
+
212
+ textarea:focus, input[type="text"]:focus {
213
+ border-color: #3B82F6 !important;
214
+ box-shadow: 3px 3px 0 #3B82F6 !important;
215
+ }
216
+
217
+ .info-box {
218
+ background: linear-gradient(135deg, #FACC15 0%, #FDE047 100%) !important;
219
+ border: 3px solid #1F2937 !important;
220
+ border-radius: 8px !important;
221
+ padding: 12px 15px !important;
222
+ margin: 10px 0 !important;
223
+ box-shadow: 4px 4px 0 #1F2937 !important;
224
+ font-family: 'Comic Neue', cursive !important;
225
+ font-weight: 700 !important;
226
+ color: #1F2937 !important;
227
+ }
228
+
229
+ .feature-box {
230
+ background: linear-gradient(135deg, #E0F2FE 0%, #BAE6FD 100%) !important;
231
+ border: 3px solid #1F2937 !important;
232
+ border-radius: 12px !important;
233
+ padding: 20px !important;
234
+ margin: 15px 0 !important;
235
+ box-shadow: 5px 5px 0 #1F2937 !important;
236
+ }
237
+
238
+ .feature-title {
239
+ font-family: 'Bangers', cursive !important;
240
+ font-size: 1.5rem !important;
241
+ color: #1F2937 !important;
242
+ margin-bottom: 10px !important;
243
+ text-shadow: 1px 1px 0 #FFF !important;
244
+ }
245
+
246
+ .feature-item {
247
+ display: flex;
248
+ align-items: center;
249
+ gap: 10px;
250
+ padding: 8px 0;
251
+ font-family: 'Comic Neue', cursive !important;
252
+ font-weight: 700 !important;
253
+ font-size: 1rem !important;
254
+ color: #1F2937 !important;
255
+ }
256
+
257
+ .feature-icon {
258
+ font-size: 1.5rem;
259
+ }
260
+
261
+ /* Markdown 강조 박스 */
262
+ .markdown-highlight-box {
263
+ background: linear-gradient(135deg, #EC4899 0%, #F472B6 100%) !important;
264
+ border: 4px solid #1F2937 !important;
265
+ border-radius: 12px !important;
266
+ padding: 20px !important;
267
+ margin: 15px 0 !important;
268
+ box-shadow: 6px 6px 0 #1F2937 !important;
269
+ animation: pulse-glow 2s ease-in-out infinite;
270
+ }
271
+
272
+ @keyframes pulse-glow {
273
+ 0%, 100% { box-shadow: 6px 6px 0 #1F2937; }
274
+ 50% { box-shadow: 8px 8px 0 #1F2937, 0 0 20px rgba(236, 72, 153, 0.5); }
275
+ }
276
+
277
+ .markdown-title {
278
+ font-family: 'Bangers', cursive !important;
279
+ font-size: 2rem !important;
280
+ color: #FFF !important;
281
+ text-shadow: 3px 3px 0 #1F2937 !important;
282
+ letter-spacing: 2px !important;
283
+ margin-bottom: 15px !important;
284
+ text-align: center !important;
285
+ }
286
+
287
+ .markdown-benefits {
288
+ display: grid;
289
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
290
+ gap: 12px;
291
+ margin-top: 10px;
292
+ }
293
+
294
+ .markdown-benefit-item {
295
+ background: rgba(255,255,255,0.95) !important;
296
+ border: 3px solid #1F2937 !important;
297
+ border-radius: 8px !important;
298
+ padding: 12px !important;
299
+ box-shadow: 3px 3px 0 #1F2937 !important;
300
+ font-family: 'Comic Neue', cursive !important;
301
+ font-weight: 700 !important;
302
+ font-size: 0.95rem !important;
303
+ color: #1F2937 !important;
304
+ text-align: center !important;
305
+ }
306
+
307
+ .markdown-benefit-icon {
308
+ font-size: 1.8rem !important;
309
+ display: block !important;
310
+ margin-bottom: 5px !important;
311
+ }
312
+
313
+ label, .gr-input-label, .gr-block-label {
314
+ color: #1F2937 !important;
315
+ font-family: 'Comic Neue', cursive !important;
316
+ font-weight: 700 !important;
317
+ }
318
+
319
+ .gr-accordion {
320
+ background: #E0F2FE !important;
321
+ border: 3px solid #1F2937 !important;
322
+ border-radius: 8px !important;
323
+ box-shadow: 4px 4px 0 #1F2937 !important;
324
+ }
325
+
326
+ .footer-comic {
327
+ text-align: center;
328
+ padding: 20px;
329
+ background: linear-gradient(135deg, #3B82F6 0%, #8B5CF6 100%);
330
+ border: 4px solid #1F2937;
331
+ border-radius: 12px;
332
+ margin-top: 20px;
333
+ box-shadow: 6px 6px 0 #1F2937;
334
+ }
335
+
336
+ .footer-comic p {
337
+ font-family: 'Comic Neue', cursive !important;
338
+ color: #FFF !important;
339
+ margin: 5px 0 !important;
340
+ font-weight: 700 !important;
341
+ }
342
+
343
+ ::-webkit-scrollbar {
344
+ width: 12px;
345
+ height: 12px;
346
+ }
347
+
348
+ ::-webkit-scrollbar-track {
349
+ background: #FEF9C3;
350
+ border: 2px solid #1F2937;
351
+ }
352
+
353
+ ::-webkit-scrollbar-thumb {
354
+ background: #3B82F6;
355
+ border: 2px solid #1F2937;
356
+ border-radius: 6px;
357
+ }
358
+
359
+ ::-webkit-scrollbar-thumb:hover {
360
+ background: #EF4444;
361
+ }
362
+
363
+ ::selection {
364
+ background: #FACC15;
365
+ color: #1F2937;
366
+ }
367
+
368
+ /* Chatbot Styling */
369
+ .gr-chatbot {
370
+ border: 3px solid #1F2937 !important;
371
+ border-radius: 12px !important;
372
+ box-shadow: 5px 5px 0 #1F2937 !important;
373
+ }
374
+
375
+ /* Tab Styling */
376
+ .gr-tab-nav {
377
+ background: linear-gradient(135deg, #F59E0B 0%, #FACC15 100%) !important;
378
+ border: 3px solid #1F2937 !important;
379
+ border-radius: 8px 8px 0 0 !important;
380
+ }
381
+
382
+ .gr-tab-nav button {
383
+ font-family: 'Bangers', cursive !important;
384
+ font-size: 1.2rem !important;
385
+ letter-spacing: 1px !important;
386
+ color: #1F2937 !important;
387
+ }
388
+
389
+ .gr-tab-nav button.selected {
390
+ background: #FFF !important;
391
+ border-bottom: 3px solid #FFF !important;
392
+ }
393
+
394
+ /* File Upload Box */
395
+ .upload-box {
396
+ border: 3px dashed #3B82F6 !important;
397
+ border-radius: 12px !important;
398
+ background: linear-gradient(135deg, #EFF6FF 0%, #DBEAFE 100%) !important;
399
+ box-shadow: 4px 4px 0 #1F2937 !important;
400
+ }
401
+
402
+ .download-box {
403
+ border: 3px solid #10B981 !important;
404
+ border-radius: 12px !important;
405
+ background: linear-gradient(135deg, #ECFDF5 0%, #D1FAE5 100%) !important;
406
+ box-shadow: 4px 4px 0 #1F2937 !important;
407
+ }
408
+ """
409
+
410
+ # ============== 환경 설정 ==============
411
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
412
+ PYHWP_PATH = os.path.join(SCRIPT_DIR, 'pyhwp')
413
+ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
414
+
415
+ if os.path.exists(PYHWP_PATH):
416
+ sys.path.insert(0, PYHWP_PATH)
417
+
418
+ # ============== 모듈 임포트 ==============
419
+ try:
420
+ import olefile
421
+ OLEFILE_AVAILABLE = True
422
+ print("✅ olefile loaded")
423
+ except ImportError:
424
+ OLEFILE_AVAILABLE = False
425
+
426
+ try:
427
+ from markdownify import markdownify as md
428
+ MARKDOWNIFY_AVAILABLE = True
429
+ print("✅ markdownify loaded")
430
+ except ImportError:
431
+ MARKDOWNIFY_AVAILABLE = False
432
+
433
+ try:
434
+ import html2text
435
+ HTML2TEXT_AVAILABLE = True
436
+ print("✅ html2text loaded")
437
+ except ImportError:
438
+ HTML2TEXT_AVAILABLE = False
439
+
440
+ try:
441
+ from bs4 import BeautifulSoup
442
+ BS4_AVAILABLE = True
443
+ except ImportError:
444
+ BS4_AVAILABLE = False
445
+
446
+ try:
447
+ import PyPDF2
448
+ PYPDF2_AVAILABLE = True
449
+ print("✅ PyPDF2 loaded")
450
+ except ImportError:
451
+ PYPDF2_AVAILABLE = False
452
+
453
+ try:
454
+ import pdfplumber
455
+ PDFPLUMBER_AVAILABLE = True
456
+ print("✅ pdfplumber loaded")
457
+ except ImportError:
458
+ PDFPLUMBER_AVAILABLE = False
459
+
460
+ # ============== API 키 설정 ==============
461
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
462
+ FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
463
+
464
+ # ============== SQLite 데이터베이스 ==============
465
def init_database():
    """Create the ``sessions`` and ``messages`` tables if they are missing.

    Connects to the SQLite database at ``DB_PATH``, runs two
    ``CREATE TABLE IF NOT EXISTS`` statements and commits. The connection is
    always closed, even when a statement raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS sessions (
                session_id TEXT PRIMARY KEY,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                title TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS messages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT,
                role TEXT,
                content TEXT,
                file_info TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (session_id) REFERENCES sessions(session_id)
            )
        ''')
        conn.commit()
    finally:
        # sqlite3 connections are not closed by garbage-free error paths;
        # release the handle explicitly.
        conn.close()
489
+
490
def create_session() -> str:
    """Insert a new chat session row and return its id.

    The id is a random UUID4 string; the title is built from the current
    local time. The connection is closed even if the insert fails.

    Returns:
        The newly generated session id.
    """
    session_id = str(uuid.uuid4())
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO sessions (session_id, title) VALUES (?, ?)",
            (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}"),
        )
        conn.commit()
    finally:
        conn.close()
    return session_id
499
+
500
def save_message(session_id: str, role: str, content: str, file_info: str = None):
    """Append one message to a session and bump the session's ``updated_at``.

    Args:
        session_id: Id of the session the message belongs to.
        role: Value stored in the ``role`` column.
        content: Message text.
        file_info: Optional value stored in the ``file_info`` column.

    Both statements are committed together; the connection is closed even
    when one of them raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
            (session_id, role, content, file_info),
        )
        cursor.execute(
            "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
            (session_id,),
        )
        conn.commit()
    finally:
        conn.close()
508
+
509
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
    """Return the most recent messages of a session in chronological order.

    Args:
        session_id: Session whose messages are fetched.
        limit: Maximum number of messages returned (the newest ones).

    Returns:
        A list of dicts with keys ``role``, ``content``, ``file_info`` and
        ``created_at``, oldest first.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        # CURRENT_TIMESTAMP only has one-second resolution, so messages saved
        # within the same second tie on created_at; the autoincrement id is
        # used as a tie-breaker to keep insertion order deterministic.
        cursor.execute(
            "SELECT role, content, file_info, created_at FROM messages "
            "WHERE session_id = ? ORDER BY created_at DESC, id DESC LIMIT ?",
            (session_id, limit),
        )
        rows = cursor.fetchall()
    finally:
        conn.close()
    return [
        {"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]}
        for r in reversed(rows)
    ]
517
+
518
def get_all_sessions() -> List[Dict]:
    """Return up to 50 sessions, most recently updated first.

    Returns:
        A list of dicts with keys ``session_id``, ``title``, ``created_at``
        and ``updated_at``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT session_id, title, created_at, updated_at FROM sessions "
            "ORDER BY updated_at DESC LIMIT 50"
        )
        rows = cursor.fetchall()
    finally:
        # Close the handle even when the query fails.
        conn.close()
    return [
        {"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]}
        for r in rows
    ]
525
+
526
def update_session_title(session_id: str, title: str):
    """Set the title of an existing session.

    Args:
        session_id: Session to update.
        title: New title text.

    The connection is closed even if the update raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "UPDATE sessions SET title = ? WHERE session_id = ?",
            (title, session_id),
        )
        conn.commit()
    finally:
        conn.close()
532
+
533
+ init_database()
534
+
535
+ # ============== 파일 유틸리티 ==============
536
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the text of a PDF, trying pdfplumber first and PyPDF2 second.

    Each backend collects its pages into its own list, so a pdfplumber run
    that fails partway through cannot leave pages behind that would be
    duplicated by the PyPDF2 fallback.

    Args:
        file_path: Path of the PDF file.

    Returns:
        Page texts joined by blank lines, or ``None`` when no backend is
        available or none produced any text.
    """
    if PDFPLUMBER_AVAILABLE:
        try:
            with pdfplumber.open(file_path) as pdf:
                plumber_pages = [page.extract_text() for page in pdf.pages]
            plumber_parts = [text for text in plumber_pages if text]
            if plumber_parts:
                return "\n\n".join(plumber_parts)
        except Exception as e:
            # Best effort: report and fall through to the next backend.
            print(f"pdfplumber error: {e}")

    if PYPDF2_AVAILABLE:
        try:
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                pypdf_pages = [page.extract_text() for page in reader.pages]
            pypdf_parts = [text for text in pypdf_pages if text]
            if pypdf_parts:
                return "\n\n".join(pypdf_parts)
        except Exception as e:
            print(f"PyPDF2 error: {e}")
    return None
563
+
564
def extract_text_from_txt(file_path: str) -> Optional[str]:
    """Read a text file, trying several encodings in order.

    Encodings are tried as utf-8, euc-kr, cp949, utf-16 and latin-1; the
    first one that decodes the whole file wins.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content, or ``None`` if the file cannot be opened
        or read.
    """
    for encoding in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeError:
            # Wrong encoding guess: try the next candidate.
            continue
        except OSError:
            # Missing/unreadable file: another encoding will not help.
            return None
    return None
572
+
573
def image_to_base64(file_path: str) -> str:
    """Return the content of ``file_path`` as a base64-encoded string."""
    with open(file_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
576
+
577
def get_image_mime_type(file_path: str) -> str:
    """Map a file name's extension to an image MIME type.

    The extension is compared case-insensitively. ``.jpg``/``.jpeg`` and any
    unrecognised extension yield ``image/jpeg``.
    """
    suffix = Path(file_path).suffix.lower()
    # For these extensions the MIME subtype equals the extension itself.
    if suffix in ('.png', '.gif', '.webp', '.bmp'):
        return 'image/' + suffix[1:]
    return 'image/jpeg'
581
+
582
def is_image_file(fp: str) -> bool:
    """Return True when ``fp`` has a recognised image extension (case-insensitive)."""
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    suffix = Path(fp).suffix.lower()
    return suffix in image_suffixes
584
+
585
def is_hwp_file(fp: str) -> bool:
    """Return True when ``fp`` ends with the ``.hwp`` extension (case-insensitive)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.hwp'
587
+
588
def is_hwpx_file(fp: str) -> bool:
    """Return True when ``fp`` ends with the ``.hwpx`` extension (case-insensitive)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.hwpx'
590
+
591
def is_pdf_file(fp: str) -> bool:
    """Return True when ``fp`` ends with the ``.pdf`` extension (case-insensitive)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.pdf'
593
+
594
def is_text_file(fp: str) -> bool:
    """Return True when ``fp`` has one of the plain-text extensions handled here."""
    text_suffixes = (
        '.txt', '.md', '.json', '.csv', '.xml',
        '.html', '.css', '.js', '.py',
    )
    suffix = Path(fp).suffix.lower()
    return suffix in text_suffixes
596
+
597
+ # ============== HWPX 텍스트 추출 ==============
598
def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract plain text from an HWPX (zip + XML) document.

    Section XML files inside the archive are read in sorted name order.
    Namespace declarations and tag prefixes are stripped with regexes so the
    XML can be walked with bare tag names; if a section does not parse, text
    between tags is scraped with a regex instead. Sections that cannot be
    read at all are skipped.

    Args:
        file_path: Path of the ``.hwpx`` file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
        Text of different sections is separated by a blank line.
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            section_files = sorted(
                f for f in file_list
                if f.startswith('Contents/section') and f.endswith('.xml')
            )
            if not section_files:
                # Fallback for archives that use a different layout.
                section_files = sorted(
                    f for f in file_list
                    if 'section' in f.lower() and f.endswith('.xml')
                )

            for section_file in section_files:
                try:
                    with zf.open(section_file) as sf:
                        content = sf.read()
                    content_str = content.decode('utf-8')
                    # Drop xmlns attributes and "prefix:" from tags so that
                    # elements can be matched by their local names.
                    content_str = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content_str)
                    content_str = re.sub(r'<[a-zA-Z]+:', '<', content_str)
                    content_str = re.sub(r'</[a-zA-Z]+:', '</', content_str)

                    try:
                        root = ET.fromstring(content_str)
                        texts = []
                        for elem in root.iter():
                            if elem.tag.endswith('t'):
                                if elem.text:
                                    texts.append(elem.text)
                            elif elem.text and elem.text.strip():
                                if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                    texts.append(elem.text.strip())
                        if texts:
                            text_parts.append(' '.join(texts))
                    except ET.ParseError:
                        text_matches = re.findall(r'>([^<]+)<', content.decode('utf-8', errors='ignore'))
                        clean_texts = [t.strip() for t in text_matches if t.strip() and len(t.strip()) > 1]
                        if clean_texts:
                            text_parts.append(' '.join(clean_texts))
                except Exception:
                    # Best effort: an unreadable section must not abort the
                    # whole document.
                    continue

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse runs of whitespace but keep newlines; collapsing "\s+"
            # would erase the section separators inserted just above.
            result = re.sub(r'[^\S\n]+', ' ', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None
        return None, "HWPX에서 텍스트를 찾을 수 없습니다"
    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
646
+
647
+ # ============== HWP 텍스트 추출 ==============
648
def _decode_hwp5txt_output(raw: bytes) -> Optional[str]:
    """Decode subprocess output, returning stripped text longer than 10 chars or None."""
    for enc in ['utf-8', 'cp949', 'euc-kr']:
        try:
            text = raw.decode(enc)
        except UnicodeDecodeError:
            continue
        stripped = text.strip()
        if len(stripped) > 10:
            return stripped
    return None


def extract_text_with_hwp5txt(file_path: str) -> tuple:
    """Extract text from an HWP file using pyhwp.

    First runs the ``hwp5txt`` command-line tool. If that is missing or
    yields nothing usable, runs a small script in a child Python interpreter
    that imports ``hwp5`` from ``PYHWP_PATH``.

    The bundled-library path and the document path are passed to the child
    as command-line arguments instead of being pasted into the script
    source, so paths containing quotes or backslashes cannot break or alter
    the generated code.

    Args:
        file_path: Path of the ``.hwp`` file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, "hwp5txt 실패")``.
    """
    try:
        result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
        if result.returncode == 0 and result.stdout:
            text = _decode_hwp5txt_output(result.stdout)
            if text:
                return text, None
    except FileNotFoundError:
        # hwp5txt is not installed; fall through to the in-process library.
        pass
    except Exception as e:
        print(f"hwp5txt error: {e}")

    try:
        # sys.argv inside "python -c" is ['-c', <pyhwp path>, <document path>].
        code = (
            "import sys\n"
            "sys.path.insert(0, sys.argv[1])\n"
            "from hwp5.filestructure import Hwp5File\n"
            "from hwp5.hwp5txt import extract_text\n"
            "hwp = Hwp5File(sys.argv[2])\n"
            "for idx in hwp.bodytext.sections():\n"
            "    section = hwp.bodytext.section(idx)\n"
            "    for para in extract_text(section):\n"
            "        if para.strip():\n"
            "            print(para.strip())\n"
            "hwp.close()\n"
        )
        result = subprocess.run(
            [sys.executable, '-c', code, PYHWP_PATH, file_path],
            capture_output=True,
            timeout=60,
        )
        if result.returncode == 0 and result.stdout:
            text = _decode_hwp5txt_output(result.stdout)
            if text:
                return text, None
    except Exception as e:
        print(f"hwp5txt subprocess error: {e}")

    return None, "hwp5txt 실패"
691
+
692
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP file by reading its OLE streams directly.

    Reads the ``FileHeader`` stream to decide whether body streams are
    compressed (bit 0 of byte 36; assumed compressed when the header is too
    short), then decodes every ``BodyText/Section*`` stream with
    ``extract_hwp_section_text``. Sections that fail to read are skipped.
    The OLE file handle is closed on every path.

    Args:
        file_path: Path of the ``.hwp`` file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    try:
        ole = olefile.OleFileIO(file_path)
    except Exception as e:
        return None, f"olefile 오류: {str(e)}"

    try:
        if not ole.exists('FileHeader'):
            return None, "HWP 파일 헤더 없음"

        header_data = ole.openstream('FileHeader').read()
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True

        all_texts = []
        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            if not entry_path.startswith('BodyText/Section'):
                continue
            try:
                stream_data = ole.openstream(entry).read()
                if is_compressed:
                    try:
                        # Raw deflate stream (no zlib header).
                        stream_data = zlib.decompress(stream_data, -15)
                    except zlib.error:
                        try:
                            stream_data = zlib.decompress(stream_data)
                        except zlib.error:
                            # Leave the data as-is and try to parse it raw.
                            pass

                section_text = extract_hwp_section_text(stream_data)
                if section_text:
                    all_texts.append(section_text)
            except Exception:
                # Best effort: skip sections that cannot be read or parsed.
                continue

        if all_texts:
            return '\n\n'.join(all_texts).strip(), None
        return None, "텍스트를 찾을 수 없습니다"
    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
    finally:
        ole.close()
732
+
733
def extract_hwp_section_text(data: bytes) -> Optional[str]:
    """Walk the record stream of one HWP body section and collect its text.

    Each record starts with a 4-byte little-endian header: bits 0-9 hold the
    tag id and bits 20-31 the payload size. A size field of 0xFFF means the
    real size follows as another 4-byte little-endian integer. Payloads of
    records with tag id 67 are decoded with ``decode_para_text``.

    Parsing stops at the first record that would run past the end of
    ``data``.

    Args:
        data: Decompressed bytes of a section stream.

    Returns:
        The decoded paragraphs joined by newlines, or ``None`` if no text
        was found.
    """
    texts = []
    pos = 0
    total = len(data)
    while pos < total - 4:
        header = int.from_bytes(data[pos:pos + 4], 'little')
        tag_id = header & 0x3FF
        size = (header >> 20) & 0xFFF
        pos += 4
        if size == 0xFFF:
            # Extended size: the next 4 bytes carry the real payload length.
            if pos + 4 > total:
                break
            size = int.from_bytes(data[pos:pos + 4], 'little')
            pos += 4
        if pos + size > total:
            break
        record_data = data[pos:pos + size]
        pos += size
        if tag_id == 67 and size > 0:
            text = decode_para_text(record_data)
            if text:
                texts.append(text)
    return '\n'.join(texts) if texts else None
759
+
760
def decode_para_text(data: bytes) -> Optional[str]:
    """Decode the payload of an HWP paragraph-text record into a string.

    The payload is a sequence of 16-bit little-endian code units. Codes
    below 32 are control characters. Per the HWP 5.0 format, "inline" and
    "extended" controls (codes 1-9, 11, 12, 14-23) occupy eight code units
    (16 bytes) including their parameters, while the remaining controls
    occupy a single code unit. Skipping the full 16 bytes keeps control
    parameters (for example a table's control id) from leaking into the
    output as garbage characters.

    Mapped controls: 9 -> tab, 10 and 13 -> newline, 24 -> hyphen,
    30 and 31 -> space. All other controls produce no output.

    Args:
        data: Raw record payload.

    Returns:
        The cleaned text, or ``None`` when it is two characters or shorter.
    """
    # Controls that span 8 code units (2-byte code + 12-byte payload + 2-byte code).
    wide_controls = frozenset(
        {1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
    )
    control_text = {9: '\t', 10: '\n', 13: '\n', 24: '-', 30: ' ', 31: ' '}

    result = []
    i = 0
    end = len(data) - 1
    while i < end:
        code = int.from_bytes(data[i:i + 2], 'little')
        if code < 32:
            replacement = control_text.get(code)
            if replacement:
                result.append(replacement)
            i += 16 if code in wide_controls else 2
            continue
        char = chr(code)
        # Lone surrogates and other non-printable code units are dropped.
        if char.isprintable():
            result.append(char)
        i += 2

    text = ''.join(result).strip()
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text if len(text) > 2 else None
799
+
800
def extract_text_from_hwp(file_path: str) -> tuple:
    """Extract text from an HWP file, trying each extractor in turn.

    ``extract_text_with_hwp5txt`` is tried first, then
    ``extract_text_with_olefile``. The first result whose stripped length
    exceeds 20 characters is returned as ``(text, None)``. If neither
    qualifies, the last extractor's error is printed and a generic failure
    tuple is returned.
    """
    print(f"\n📖 [HWP 읽기] {os.path.basename(file_path)}")
    error = None
    for extractor in (extract_text_with_hwp5txt, extract_text_with_olefile):
        text, error = extractor(file_path)
        if not text:
            continue
        if len(text.strip()) > 20:
            print(f" ✅ 성공: {len(text)} 글자")
            return text, None
    print(f" ❌ 실패: {error}")
    return None, "모든 추출 방법 실패"
812
+
813
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Dispatch to the HWPX or HWP extractor based on the file extension.

    Returns whatever the chosen extractor returns: a ``(text, error)`` tuple.
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)
    print(f"\n📖 [HWPX 읽기] {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
819
+
820
+ # ============== HWP 변환 함수들 ==============
821
def check_hwp_version(file_path):
    """Identify the document format from the first 32 bytes of a file.

    Returns:
        A ``(label, is_supported)`` tuple: ``("HWP v5", True)`` when the
        header contains the HWP signature text, ``("HWP v5 (OLE)", True)``
        for an OLE magic number, ``("HWPX", True)`` for a zip magic number,
        ``("Unknown", False)`` otherwise, or ``("Error: ...", False)`` when
        the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as fh:
            head = fh.read(32)
    except Exception as exc:
        return f"Error: {exc}", False

    if b'HWP Document File' in head:
        return "HWP v5", True

    magic = head[:4]
    if magic == b'\xd0\xcf\x11\xe0':
        return "HWP v5 (OLE)", True
    if magic == b'PK\x03\x04':
        return "HWPX", True
    return "Unknown", False
835
+
836
def convert_to_html_subprocess(input_path, output_dir):
    """Convert an HWP file to HTML by running the ``hwp5html`` command.

    Args:
        input_path: Path of the source HWP file.
        output_dir: Directory in which ``output.html`` is requested.

    Returns:
        ``(path, None)`` on success, where ``path`` is ``output.html`` if it
        exists, otherwise the first entry of ``output_dir`` that is an
        ``.html``/``.htm`` file or a directory. ``(None, "HTML 변환 실패")``
        when the tool is missing, fails, times out, or produces nothing.
    """
    output_path = os.path.join(output_dir, "output.html")
    cmd = ['hwp5html', '--output', output_path, input_path]
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=120)
        if result.returncode == 0:
            if os.path.exists(output_path):
                return output_path, None
            # Look for whatever the tool produced inside the output
            # directory instead of the requested file.
            for item in os.listdir(output_dir):
                item_path = os.path.join(output_dir, item)
                if item.lower().endswith(('.html', '.htm')):
                    return item_path, None
                if os.path.isdir(item_path):
                    return item_path, None
    except Exception as e:
        # Report the failure instead of swallowing it silently.
        print(f"HTML 변환 오류: {e}")
    return None, "HTML 변환 실패"
856
+
857
def html_to_markdown(html_content):
    """Convert HTML to Markdown (or plain text) with the first working backend.

    Backends are tried in order: markdownify, html2text, then BeautifulSoup
    (plain text only). A backend that raises is reported and the next one
    is tried.

    Args:
        html_content: HTML source string.

    Returns:
        ``(converted_text, None)`` on success, otherwise
        ``(None, "Markdown 변환 실패")``.
    """
    if MARKDOWNIFY_AVAILABLE:
        try:
            return md(html_content, heading_style="ATX", bullets="-"), None
        except Exception as e:
            print(f"markdownify error: {e}")
    if HTML2TEXT_AVAILABLE:
        try:
            h = html2text.HTML2Text()
            h.body_width = 0
            return h.handle(html_content), None
        except Exception as e:
            print(f"html2text error: {e}")
    if BS4_AVAILABLE:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator='\n'), None
        except Exception as e:
            print(f"BeautifulSoup error: {e}")
    return None, "Markdown 변환 실패"
877
+
878
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Return the extracted text of an HWP/HWPX file as ``(text, None)``.

    When extraction yields no text, ``(None, error)`` is returned with the
    extractor's error value.
    """
    extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
    if not extracted:
        return None, failure
    return extracted, None
883
+
884
+ # ============== LLM API (Groq 라이브러리 사용) ==============
885
def call_groq_api_stream(messages: List[Dict]) -> Generator[str, None, None]:
    """Stream a chat completion from the Groq API (model ``openai/gpt-oss-120b``).

    Args:
        messages: Chat messages passed to the API unchanged.

    Yields:
        Text fragments of the response as they arrive. If the Groq library
        or API key is missing, or the API call fails, a single error message
        string is yielded instead.
    """
    if not GROQ_AVAILABLE:
        yield "❌ Groq 라이브러리가 설치되지 않았습니다. pip install groq"
        return

    if not GROQ_API_KEY:
        yield "❌ GROQ_API_KEY 환경변수가 설정되지 않았습니다."
        return

    try:
        client = Groq(api_key=GROQ_API_KEY)

        completion = client.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=messages,
            temperature=1,
            max_completion_tokens=8192,
            top_p=1,
            reasoning_effort="medium",
            stream=True,
            stop=None
        )

        for chunk in completion:
            # Some stream chunks carry no choices; indexing them would raise
            # IndexError and cut the response short.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield content

    except Exception as e:
        error_msg = str(e)
        print(f"❌ Groq API 오류: {error_msg}")
        yield f"❌ API 오류: {error_msg}"
917
+
918
def call_fireworks_api_stream(messages: List[Dict], image_base64: str, mime_type: str) -> Generator[str, None, None]:
    """Stream a vision chat completion from the Fireworks API.

    The image is attached to the last message as a ``data:`` URL; earlier
    messages are forwarded with their ``role`` and ``content`` only.

    Args:
        messages: Chat messages; the last one receives the image.
        image_base64: Base64-encoded image bytes.
        mime_type: MIME type used in the ``data:`` URL.

    Yields:
        Text fragments parsed from the server-sent ``data:`` lines. If the
        API key is missing, the HTTP status is not 200, or the request
        fails, a single error message string is yielded instead.
    """
    if not FIREWORKS_API_KEY:
        yield "❌ FIREWORKS_API_KEY 환경변수가 설정되지 않았습니다."
        return

    try:
        formatted_messages = [{"role": m["role"], "content": m["content"]} for m in messages[:-1]]
        formatted_messages.append({
            "role": messages[-1]["role"],
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
                {"type": "text", "text": messages[-1]["content"]}
            ]
        })

        # The context manager releases the connection when the stream ends or
        # the consumer stops iterating; the timeout (connect, read) keeps a
        # stalled server from hanging the request forever.
        with requests.post(
            "https://api.fireworks.ai/inference/v1/chat/completions",
            headers={"Authorization": f"Bearer {FIREWORKS_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "accounts/fireworks/models/qwen3-vl-235b-a22b-thinking",
                "max_tokens": 4096,
                "temperature": 0.6,
                "messages": formatted_messages,
                "stream": True
            },
            stream=True,
            timeout=(10, 120),
        ) as response:
            if response.status_code != 200:
                yield f"❌ Fireworks API 오류: {response.status_code}"
                return

            for line in response.iter_lines():
                if not line:
                    continue
                line = line.decode('utf-8')
                if not line.startswith('data: ') or line[6:] == '[DONE]':
                    continue
                try:
                    data = json.loads(line[6:])
                    content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
                except (ValueError, LookupError, AttributeError, TypeError):
                    # Malformed or unexpected event payload: skip it.
                    continue
                # Yield outside the try block so that closing the generator
                # is never intercepted by the parsing error handler.
                if content:
                    yield content
    except Exception as e:
        yield f"❌ API 오류: {str(e)}"
964
+
965
+ # ============== 채팅 처리 ==============
966
def process_file(file_path: str) -> tuple:
    """Classify an uploaded file and extract the payload to send to the model.

    Args:
        file_path: Path of the uploaded file; a falsy value means "no file".

    Returns:
        A 3-tuple ``(kind, payload, mime)``:

        * ``(None, None, None)`` - no file was given.
        * ``("image", base64_data, mime_type)`` - image file.
        * ``("text", text, None)`` - text extracted from HWP/HWPX, PDF or a
          plain-text file.
        * ``("error", message, None)`` - a supported type whose extraction
          failed (for HWP/HWPX this includes 20 or fewer non-blank characters).
        * ``("unsupported", message, None)`` - any other file type.
    """
    if not file_path:
        return None, None, None
    base_name = os.path.basename(file_path)

    # Images are passed on as base64 together with their MIME type.
    if is_image_file(file_path):
        encoded = image_to_base64(file_path)
        return "image", encoded, get_image_mime_type(file_path)

    # Hangul documents: treat near-empty extractions as failures.
    if is_hwp_file(file_path) or is_hwpx_file(file_path):
        doc_text, doc_error = extract_text_from_hwp_or_hwpx(file_path)
        if not doc_text or len(doc_text.strip()) <= 20:
            return "error", f"한글 문서 추출 실패: {doc_error}", None
        print(f"📄 [문서 내용 추출 완료] {len(doc_text)} 글자")
        print(f"📄 [문서 미리보기] {doc_text[:500]}...")
        return "text", doc_text, None

    if is_pdf_file(file_path):
        pdf_text = extract_text_from_pdf(file_path)
        if not pdf_text:
            return "error", "PDF 추출 실패", None
        print(f"📄 [PDF 내용 추출 완료] {len(pdf_text)} 글자")
        return "text", pdf_text, None

    if is_text_file(file_path):
        plain_text = extract_text_from_txt(file_path)
        if not plain_text:
            return "error", "텍스트 읽기 실패", None
        return "text", plain_text, None

    return "unsupported", f"지원하지 않는 형식: {base_name}", None
996
+
997
def chat_response(message: str, history: List[Dict], file: Optional[str],
                  session_id: str) -> Generator[tuple, None, None]:
    """Handle one chat turn: process the attachment, stream the reply, persist it.

    Args:
        message: Text typed by the user; ``None`` is treated as empty.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            ``None`` is treated as an empty history.
        file: Path of an attached file, or a falsy value when there is none.
        session_id: Current session id; a new session is created when falsy.

    Yields:
        ``(history, session_id)`` tuples. With no message and no file, the
        unchanged history is yielded once. Otherwise the history is yielded
        with the new user message and an empty assistant message, then again
        after every streamed chunk with the assistant message extended.
        Attachment errors are yielded once as a finished assistant message.
    """
    if history is None:
        history = []
    # Normalise a missing message up front: the .strip() check below would
    # raise on None, while the rest of this function already accepts it.
    message = message or ""
    if not message.strip() and not file:
        yield history, session_id
        return
    if not session_id:
        session_id = create_session()

    file_type, file_content, file_mime = None, None, None
    file_info = None
    filename = None

    if file:
        filename = os.path.basename(file)
        file_type, file_content, file_mime = process_file(file)
        file_info = json.dumps({"type": file_type, "filename": filename})

        if file_type == "error":
            history = history + [
                {"role": "user", "content": message or "파일 업로드"},
                {"role": "assistant", "content": f"❌ {file_content}"}
            ]
            yield history, session_id
            return
        elif file_type == "unsupported":
            history = history + [
                {"role": "user", "content": message or "파일 업로드"},
                {"role": "assistant", "content": f"⚠️ {file_content}"}
            ]
            yield history, session_id
            return

    # Show the user's message (prefixed with the attachment name, if any)
    # together with an empty assistant placeholder that is filled while streaming.
    user_msg = message
    if file:
        user_msg = f"📎 {filename}\n\n{message}" if message else f"📎 {filename}"

    history = history + [{"role": "user", "content": user_msg}, {"role": "assistant", "content": ""}]
    yield history, session_id

    # Load earlier messages of this session to give the model context.
    db_messages = get_session_messages(session_id, limit=10)

    # System prompt: document-analysis assistant.
    system_prompt = """당신은 문서 분석 전문 AI 어시스턴트입니다.

## 핵심 역할
- 사용자가 업로드한 문서의 내용을 **정확하게 분석**하고 **구체적으로 답변**합니다.
- 문서에 있는 **실제 내용**을 기반으로만 답변합니다.
- 문서에 없는 내용은 추측하지 않습니다.

## 문서 분석 방법
1. **문서가 제공되면**: 문서 전체 내용을 꼼꼼히 읽고 핵심 정보를 파악합니다.
2. **요약 요청 시**: 문서의 주제, 목적, 핵심 내용, 주요 항목을 구조화하여 요약합니다.
3. **질문 응답 시**: 문서에서 관련 내용을 찾아 **직접 인용하거나 구체적으로 설명**합니다.

## 답변 형식
- 한국어로 자연스럽고 명확하게 답변합니다.
- 문서 내용을 인용할 때는 구체적으로 언급합니다.
- 긴 문서는 섹션별로 나누어 정리합니다.

## 주의사항
- 문서에 **실제로 있는 내용만** 답변에 포함합니다.
- 불확실한 내용은 "문서에서 확인되지 않습니다"라고 명시합니다."""

    api_messages = [{"role": "system", "content": system_prompt}]

    # Append the earlier conversation.
    for m in db_messages:
        api_messages.append({"role": m["role"], "content": m["content"]})

    # Build the current message, clearly separating document text from the question.
    if file_type == "text" and file_content:
        if message:
            current_content = f"""## 📄 업로드된 문서 내용 ({filename})

다음은 사용자가 업로드한 문서의 전체 내용입니다:

---
{file_content}
---

## 💬 사용자 질문
{message}

위 문서 내용을 바탕으로 사용자의 질문에 **구체적이고 정확하게** 답변해주세요."""
        else:
            # No question was asked: request a structured summary instead.
            current_content = f"""## 📄 업로드된 문서 내용 ({filename})

다음은 사용자가 업로드한 문서의 전체 내용입니다:

---
{file_content}
---

## 📋 요청사항
위 문서의 내용을 다음 형식으로 **상세하게 요약**해주세요:

1. **문서 제목/주제**: 문서가 다루는 주요 주제
2. **문서 목적**: 이 문서의 작성 목적
3. **핵심 내용**: 가장 중요한 내용 3-5가지
4. **세부 항목**: 문서에 포함된 주요 섹션이나 항목
5. **결론/요약**: 문서의 핵심 메시지"""
    else:
        current_content = message or ""

    api_messages.append({"role": "user", "content": current_content})

    # Debug log
    print(f"\n🤖 [API 요청]")
    print(f" - 모델: openai/gpt-oss-120b")
    print(f" - 메시지 수: {len(api_messages)}")
    print(f" - 파일 타입: {file_type}")
    print(f" - 문서 길이: {len(file_content) if file_content else 0} 글자")
    if file_content:
        print(f" - 문서 미리보기: {file_content[:200]}...")

    # Images go to the vision endpoint, everything else to the text endpoint.
    if file_type == "image":
        stream = call_fireworks_api_stream(api_messages, file_content, file_mime)
    else:
        stream = call_groq_api_stream(api_messages)

    # Stream the reply, updating the assistant placeholder after each chunk.
    full_response = ""
    for chunk in stream:
        full_response += chunk
        history[-1] = {"role": "assistant", "content": full_response}
        yield history, session_id

    # Persist the turn. The stored user content is the full prompt, including
    # any document text, so later turns can refer back to it.
    save_message(session_id, "user", current_content, file_info)
    save_message(session_id, "assistant", full_response)

    # The first message of a session doubles as the session title.
    if len(db_messages) == 0 and message:
        update_session_title(session_id, message[:50])
1135
+
1136
def new_chat():
    """Start a fresh conversation.

    Returns:
        ``(history, session_id, file)``: an empty history, the id returned by
        ``create_session()``, and ``None`` for the file slot.
    """
    fresh_session_id = create_session()
    return [], fresh_session_id, None
1138
+
1139
def load_session(session_id: str) -> tuple:
    """Load a stored conversation for display in the chat widget.

    Args:
        session_id: Id of the session to load; a falsy value loads nothing.

    Returns:
        ``(history, session_id)`` where ``history`` holds up to 50 stored
        messages reduced to their ``role`` and ``content`` keys, or
        ``([], "")`` when no session id was given.
    """
    if session_id:
        stored = get_session_messages(session_id, limit=50)
        chat_history = [
            {"role": entry["role"], "content": entry["content"]}
            for entry in stored
        ]
        return chat_history, session_id
    return [], ""
1144
+
1145
+ # ============== HWP 변환기 ==============
1146
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running the external ``hwp5odt`` command.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        ``(output_path, None)`` when the command exits with status 0 and the
        output file exists, otherwise ``(None, "ODT 변환 실패")``. Failures are
        logged and never raised.
    """
    output_path = os.path.join(output_dir, "output.odt")
    try:
        result = subprocess.run(['hwp5odt', '--output', output_path, input_path], capture_output=True, timeout=120)
    except Exception as exc:
        # Best-effort boundary: a missing executable or a timeout is reported
        # as a failed conversion. Unlike a bare ``except:``, this does not
        # swallow KeyboardInterrupt or SystemExit.
        print(f"❌ hwp5odt 실행 오류: {exc}")
        return None, "ODT 변환 실패"

    if result.returncode == 0 and os.path.exists(output_path):
        return output_path, None

    # Log the tool's own diagnostics so a failed conversion can be investigated.
    if result.stderr:
        print(f"❌ hwp5odt 오류: {result.stderr.decode('utf-8', errors='ignore')}")
    return None, "ODT 변환 실패"
1155
+
1156
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running the external ``hwp5xml`` command.

    The command's standard output is captured and written to ``output.xml``.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        ``(output_path, None)`` when the command exits with status 0 and
        produced output, otherwise ``(None, "XML 변환 실패")``. Failures are
        logged and never raised.
    """
    output_path = os.path.join(output_dir, "output.xml")
    try:
        result = subprocess.run(['hwp5xml', input_path], capture_output=True, timeout=120)
        if result.returncode == 0 and result.stdout:
            with open(output_path, 'wb') as f:
                f.write(result.stdout)
            return output_path, None
    except Exception as exc:
        # Best-effort boundary: a missing executable, a timeout or a write
        # error is reported as a failed conversion. Unlike a bare ``except:``,
        # this does not swallow KeyboardInterrupt or SystemExit.
        print(f"❌ hwp5xml 실행 오류: {exc}")
        return None, "XML 변환 실패"

    # Log the tool's own diagnostics so a failed conversion can be investigated.
    if result.stderr:
        print(f"❌ hwp5xml 오류: {result.stderr.decode('utf-8', errors='ignore')}")
    return None, "XML 변환 실패"
1167
+
1168
def convert_hwp(file, output_format, progress=gr.Progress()):
    """Convert an uploaded HWP/HWPX file to the selected output format.

    Args:
        file: The uploaded file: an object with a ``name`` attribute holding
            the path, or anything whose ``str()`` is the path.
        output_format: One of the labels offered by the format selector:
            "HTML", "ODT (OpenDocument)", "TXT (텍스트)",
            "⭐ MARKDOWN (추천)" or "XML".
        progress: Gradio progress tracker used to report conversion stages.

    Returns:
        ``(output_file, status_message, preview)``. On success ``output_file``
        is the path of the converted file, named after the input file's stem.
        On failure it is ``None`` and ``status_message`` explains why.
        ``preview`` holds up to 5000 characters of text-based outputs, a short
        note for ZIP output, and is empty otherwise.
    """
    if not file:
        return None, "❌ 파일을 업로드해주세요.", ""

    input_file = file.name if hasattr(file, 'name') else str(file)
    ext_lower = Path(input_file).suffix.lower()

    if ext_lower not in ['.hwp', '.hwpx']:
        return None, "❌ HWP 또는 HWPX 파일만 지원됩니다.", ""

    progress(0.1, desc="📖 파일 읽는 중...")
    version, is_valid = check_hwp_version(input_file)
    if not is_valid:
        return None, f"❌ 지원하지 않는 파일: {version}", ""

    tmp_dir = tempfile.mkdtemp()
    # The working directory is kept only when a result file inside it is
    # returned; every failure path removes it (see ``finally``).
    keep_tmp_dir = False

    try:
        input_filename = os.path.basename(input_file)
        input_path = os.path.join(tmp_dir, input_filename)
        shutil.copy(input_file, input_path)

        progress(0.3, desc=f"🔄 {output_format}로 변환 중...")

        output_path, error, ext = None, None, ""

        if output_format == "HTML":
            if ext_lower == '.hwpx':
                return None, "❌ HWPX는 HTML 변환을 지원하지 않습니다.", ""
            output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
            ext = ".html"
            # A directory result is packed into a single ZIP for download.
            if output_path and os.path.isdir(output_path):
                zip_path = shutil.make_archive(os.path.join(tmp_dir, "html"), 'zip', output_path)
                output_path, ext = zip_path, ".zip"

        elif output_format == "ODT (OpenDocument)":
            if ext_lower == '.hwpx':
                return None, "❌ HWPX는 ODT 변환을 지원하지 않습니다.", ""
            output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
            ext = ".odt"

        elif output_format == "TXT (텍스트)":
            text, error = extract_text_from_hwp_or_hwpx(input_path)
            if text:
                output_path = os.path.join(tmp_dir, "output.txt")
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            ext = ".txt"

        elif output_format == "⭐ MARKDOWN (추천)":
            text, error = convert_hwp_to_markdown(input_path)
            if text:
                output_path = os.path.join(tmp_dir, "output.md")
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            ext = ".md"

        elif output_format == "XML":
            if ext_lower == '.hwpx':
                # HWPX is a ZIP container: concatenate its XML members, each
                # preceded by a comment naming the member.
                try:
                    with zipfile.ZipFile(input_path, 'r') as zf:
                        xml_contents = []
                        for name in zf.namelist():
                            if name.endswith('.xml'):
                                with zf.open(name) as f:
                                    xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
                    output_path = os.path.join(tmp_dir, "output.xml")
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write('\n\n'.join(xml_contents))
                except Exception as e:
                    error = f"HWPX XML 추출 실패: {e}"
            else:
                output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
            ext = ".xml"

        if not output_path:
            return None, f"❌ {error or '변환 실패'}", ""

        if not os.path.exists(output_path):
            return None, "❌ 변환된 파일을 찾을 수 없습니다.", ""

        progress(0.8, desc="✅ 완료 중...")

        # Give the download the same base name as the uploaded file.
        base_name = Path(input_filename).stem
        final_output = os.path.join(tmp_dir, f"{base_name}{ext}")
        if output_path != final_output:
            shutil.copy2(output_path, final_output)

        file_size = os.path.getsize(final_output)
        size_str = f"{file_size/1024:.1f} KB" if file_size > 1024 else f"{file_size} bytes"

        preview = ""
        if ext in ['.txt', '.md', '.xml']:
            try:
                with open(final_output, 'r', encoding='utf-8', errors='ignore') as f:
                    preview = f.read(5000)
                if len(preview) >= 5000:
                    preview += "\n\n... (생략)"
            except OSError:
                # The preview is optional; an unreadable file just means none.
                pass
        elif ext == '.zip':
            preview = "📦 HTML이 ZIP으로 압축되었습니다."

        progress(1.0, desc="🎉 완료!")
        keep_tmp_dir = True
        return final_output, f"✅ 변환 완료: {base_name}{ext} ({size_str})", preview

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ 오류: {str(e)}", ""

    finally:
        # Without this, every failed conversion leaked a temp directory
        # containing a copy of the uploaded document.
        if not keep_tmp_dir:
            shutil.rmtree(tmp_dir, ignore_errors=True)
1278
+
1279
+ # ============== Gradio UI ==============
1280
# Build the whole UI inside one Blocks context: static banners, a chat tab,
# a converter tab, a footer, and finally the event wiring.
# NOTE(review): delete_cache=(60, 60) is presumably Gradio's (frequency, age)
# pair in seconds for purging cached files — confirm against the Gradio docs.
with gr.Blocks(title="HWPower AI 어시스턴트", css=COMIC_CSS, delete_cache=(60, 60)) as demo:

    # HOME Button
    gr.HTML("""
        <div class="home-button-container">
            <a href="https://www.humangen.ai" target="_blank" class="home-button">
                🏠 HOME
            </a>
            <span class="url-display">🌐 www.humangen.ai</span>
        </div>
    """)

    # Header
    gr.HTML("""
        <div class="header-container">
            <div class="header-title">📄 HWPower AI 어시스턴트 🤖</div>
            <div class="header-subtitle">AI가 HWP 파일을 읽고, 보고, 말하며, 생각하고 기억합니다!</div>
            <div style="margin-top:12px">
                <span class="stats-badge">📖 읽기 READ</span>
                <span class="stats-badge">👁️ 보기 SEE</span>
                <span class="stats-badge">💬 말하기 SPEAK</span>
                <span class="stats-badge">🧠 생각 THINK</span>
                <span class="stats-badge">💾 기억 MEMORY</span>
            </div>
        </div>
    """)

    # Free-service notice
    gr.HTML("""
        <div class="free-service-notice">
            🆓 본 서비스는 <b>무료 버전</b>으로 일부 기능에 제약이 있습니다.<br>
            📧 문의: <a href="mailto:arxivgpt@gmail.com">arxivgpt@gmail.com</a>
        </div>
    """)

    # Holds the id of the active chat session; the empty string means that no
    # session exists yet (chat_response creates one on the first message).
    session_state = gr.State("")

    with gr.Tabs():
        # Tab 1: AI chat
        with gr.Tab("💬 AI 채팅"):
            # Two columns: session controls on the left, conversation on the right.

            with gr.Row():
                with gr.Column(scale=1):
                    gr.HTML("""
                        <div class="info-box">
                            📁 <b>지원 파일 형식</b><br><br>
                            🖼️ <b>이미지</b>: JPG, PNG, GIF, WebP<br>
                            📑 <b>문서</b>: PDF, TXT, MD<br>
                            📄 <b>한글</b>: HWP, HWPX ✨
                        </div>
                    """)

                    new_btn = gr.Button("🆕 새 대화 시작", variant="primary")

                    # Stored sessions; clicking a row loads that conversation.
                    with gr.Accordion("📜 대화 기록 (Memory)", open=False):
                        session_list = gr.Dataframe(headers=["ID", "제목", "시간"], interactive=False)
                        refresh_btn = gr.Button("🔄 새로고침", size="sm")

                with gr.Column(scale=3):
                    chatbot = gr.Chatbot(label="💬 AI 대화", height=500)

                    with gr.Row():
                        file_upload = gr.File(
                            label="📎 파일 첨부 (HWP/HWPX/PDF/이미지)",
                            file_types=[".jpg", ".jpeg", ".png", ".gif", ".webp", ".pdf", ".txt", ".md", ".hwp", ".hwpx"],
                            scale=1,
                            elem_classes=["upload-box"]
                        )
                        msg_input = gr.Textbox(
                            placeholder="💭 메시지를 입력하세요... (파일을 업로드하면 AI가 내용을 읽고 분석합니다)",
                            lines=2,
                            show_label=False,
                            scale=4
                        )

                    with gr.Row():
                        submit_btn = gr.Button("🚀 전송", variant="primary", scale=3)
                        clear_btn = gr.Button("🗑️ 지우기", scale=1)

        # Tab 2: HWP converter
        with gr.Tab("📄 HWP 변환기"):
            gr.HTML("""
                <div class="feature-box">
                    <div class="feature-title">🔄 HWP/HWPX 파일 변환기</div>
                    <p style="font-family: 'Comic Neue', cursive; font-weight: 700; color: #1F2937;">
                        한글 문서를 다양한 형식으로 변환합니다. AI가 문서를 읽고 텍스트를 추출합니다.
                    </p>
                </div>
            """)

            # Markdown highlight box
            gr.HTML("""
                <div class="markdown-highlight-box">
                    <div class="markdown-title">⭐ MARKDOWN 변환 추천! ⭐</div>
                    <div class="markdown-benefits">
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">🤖</span>
                            <b>AI/LLM 최적화</b><br>
                            ChatGPT, Claude 등 AI에 바로 입력 가능
                        </div>
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">📝</span>
                            <b>범용 포맷</b><br>
                            GitHub, Notion, 블로그 등 어디서나 사용
                        </div>
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">🔍</span>
                            <b>구조 유지</b><br>
                            제목, 목록, 표 등 문서 구조 보존
                        </div>
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">⚡</span>
                            <b>가볍고 빠름</b><br>
                            용량이 작고 처리 속도 빠름
                        </div>
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">🔄</span>
                            <b>변환 용이</b><br>
                            HTML, PDF, Word 등으로 재변환 가능
                        </div>
                        <div class="markdown-benefit-item">
                            <span class="markdown-benefit-icon">✏️</span>
                            <b>편집 간편</b><br>
                            메모장으로도 바로 수정 가능
                        </div>
                    </div>
                </div>
            """)

            with gr.Row():
                # Left: upload and format selection.
                with gr.Column():
                    gr.HTML('<div class="info-box">📤 <b>파일 업로드</b></div>')
                    hwp_input = gr.File(
                        label="HWP/HWPX 파일 선택",
                        file_types=[".hwp", ".hwpx"],
                        elem_classes=["upload-box"]
                    )
                    # These labels are compared verbatim inside convert_hwp.
                    format_select = gr.Radio(
                        ["⭐ MARKDOWN (추천)", "TXT (텍스트)", "HTML", "ODT (OpenDocument)", "XML"],
                        value="⭐ MARKDOWN (추천)",
                        label="📋 변환 형식"
                    )
                    convert_btn = gr.Button("🔄 변환하기", variant="primary", size="lg")

                # Right: status, download and text preview.
                with gr.Column():
                    gr.HTML('<div class="info-box">📥 <b>변환 결과</b></div>')
                    status_out = gr.Textbox(label="상태", interactive=False)
                    file_out = gr.File(label="다운로드", elem_classes=["download-box"])

                    with gr.Accordion("📋 미리보기", open=False):
                        preview_out = gr.Textbox(lines=15, interactive=False)

            gr.HTML("""
                <div class="info-box">
                    ℹ️ <b>안내</b>: 변환 서비스는 개인용도로 사용시 어떠한 제약도 없습니다. * Special Thanks: june9713@gmail.com *
                </div>
            """)

    # Footer
    gr.HTML("""
        <div class="footer-comic">
            <p style="font-family:'Bangers',cursive;font-size:1.8rem;letter-spacing:2px">📄 HWP AI 어시스턴트 🤖</p>
            <p>AI가 HWP 파일을 읽고, 보고, 말하며, 생각하고 기억합니다!</p>
            <p>📖 READ • 👁️ SEE • 💬 SPEAK • 🧠 THINK • 💾 MEMORY</p>
            <p style="margin-top:8px;font-size:0.9rem;">🆓 무료 서비스 (일부 기능 제한) | 📧 arxivgpt@gmail.com</p>
            <p style="margin-top:10px"><a href="https://www.humangen.ai" target="_blank" style="color:#FACC15;text-decoration:none;font-weight:bold;">🏠 www.humangen.ai</a></p>
        </div>
    """)

    # ============== Event handlers ==============
    def on_submit(msg, hist, f, sid):
        # Relay chat_response's (history, session_id) updates and, with every
        # update, clear the message box ("") and the file upload (None).
        if hist is None:
            hist = []
        for r in chat_response(msg, hist, f, sid):
            yield r[0], r[1], "", None

    # The send button and pressing Enter in the textbox trigger the same handler.
    submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state],
                     [chatbot, session_state, msg_input, file_upload])
    msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state],
                     [chatbot, session_state, msg_input, file_upload])

    # "New chat" creates a new session; "clear" only empties the widgets and
    # leaves session_state untouched.
    new_btn.click(lambda: ([], create_session(), None, ""), outputs=[chatbot, session_state, file_upload, msg_input])
    clear_btn.click(lambda: ([], None, ""), outputs=[chatbot, file_upload, msg_input])

    def refresh():
        # One table row per session: first 8 characters of the id, the title
        # (or a placeholder), and the first 16 characters of updated_at.
        sessions = get_all_sessions()
        return [[s["session_id"][:8], s["title"] or "제목없음", s["updated_at"][:16] if s["updated_at"] else ""] for s in sessions]

    refresh_btn.click(refresh, outputs=[session_list])

    def select_session(evt: gr.SelectData, data):
        # The table shows only an id prefix, so the clicked row is mapped back
        # to the first session whose full id starts with that prefix.
        # NOTE(review): gr.Dataframe may hand over a pandas DataFrame depending
        # on its `type` setting, in which case data[row][0] would not index a
        # row — confirm the value type received here.
        # NOTE(review): two sessions sharing the same 8-character prefix cannot
        # be told apart; the first match wins.
        if evt.index[0] < len(data):
            for s in get_all_sessions():
                if s["session_id"].startswith(data[evt.index[0]][0]):
                    return load_session(s["session_id"])
        return [], ""

    session_list.select(select_session, [session_list], [chatbot, session_state])
    convert_btn.click(convert_hwp, [hwp_input, format_select], [file_out, status_out, preview_out])
    # Populate the session list when the page loads.
    demo.load(refresh, outputs=[session_list])
1481
+
1482
# Start the Gradio server only when this file is run as a script (not on import).
# NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
# rendering — confirm against the installed Gradio version.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)