seawolf2357 committed on
Commit
e006e27
·
verified ·
1 Parent(s): 46e1b25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +382 -243
app.py CHANGED
@@ -16,9 +16,11 @@ import sqlite3
16
  import base64
17
  import requests
18
  import zlib
 
19
  from pathlib import Path
20
  from datetime import datetime
21
  from typing import Generator, List, Dict, Optional
 
22
 
23
  # ============== 환경 설정 ==============
24
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -27,7 +29,6 @@ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
27
 
28
  if os.path.exists(PYHWP_PATH):
29
  sys.path.insert(0, PYHWP_PATH)
30
- print(f"Added local pyhwp path: {PYHWP_PATH}")
31
 
32
  # ============== 모듈 임포트 ==============
33
  try:
@@ -72,6 +73,28 @@ try:
72
  except ImportError:
73
  PDFPLUMBER_AVAILABLE = False
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # ============== API 키 설정 ==============
76
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
77
  FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
@@ -106,10 +129,8 @@ def create_session() -> str:
106
  session_id = str(uuid.uuid4())
107
  conn = sqlite3.connect(DB_PATH)
108
  cursor = conn.cursor()
109
- cursor.execute(
110
- "INSERT INTO sessions (session_id, title) VALUES (?, ?)",
111
- (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}")
112
- )
113
  conn.commit()
114
  conn.close()
115
  return session_id
@@ -117,26 +138,17 @@ def create_session() -> str:
117
  def save_message(session_id: str, role: str, content: str, file_info: str = None):
118
  conn = sqlite3.connect(DB_PATH)
119
  cursor = conn.cursor()
120
- cursor.execute(
121
- "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
122
- (session_id, role, content, file_info)
123
- )
124
- cursor.execute(
125
- "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
126
- (session_id,)
127
- )
128
  conn.commit()
129
  conn.close()
130
 
131
  def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
132
  conn = sqlite3.connect(DB_PATH)
133
  cursor = conn.cursor()
134
- cursor.execute(
135
- """SELECT role, content, file_info, created_at
136
- FROM messages WHERE session_id = ?
137
- ORDER BY created_at DESC LIMIT ?""",
138
- (session_id, limit)
139
- )
140
  rows = cursor.fetchall()
141
  conn.close()
142
  return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
@@ -144,9 +156,7 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
144
  def get_all_sessions() -> List[Dict]:
145
  conn = sqlite3.connect(DB_PATH)
146
  cursor = conn.cursor()
147
- cursor.execute(
148
- "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
149
- )
150
  rows = cursor.fetchall()
151
  conn.close()
152
  return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
@@ -204,14 +214,17 @@ def image_to_base64(file_path: str) -> str:
204
 
205
  def get_image_mime_type(file_path: str) -> str:
206
  ext = Path(file_path).suffix.lower()
207
- return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
208
  '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
209
 
210
  def is_image_file(fp: str) -> bool:
211
  return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
212
 
213
  def is_hwp_file(fp: str) -> bool:
214
- return Path(fp).suffix.lower() in ['.hwp', '.hwpx']
 
 
 
215
 
216
  def is_pdf_file(fp: str) -> bool:
217
  return Path(fp).suffix.lower() == '.pdf'
@@ -219,57 +232,242 @@ def is_pdf_file(fp: str) -> bool:
219
  def is_text_file(fp: str) -> bool:
220
  return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
221
 
222
- # ============== HWP 텍스트 추출 (핵심 - 단순하고 안정적으로) ==============
223
 
224
- def decompress_stream(data: bytes) -> bytes:
225
- """zlib 압축 해제 시도"""
226
  try:
227
- return zlib.decompress(data, -15)
228
- except:
229
- try:
230
- return zlib.decompress(data)
231
- except:
232
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- def extract_hwp_text_from_bodytext(ole) -> str:
235
- """BodyText 섹션에서 텍스트 추출 (HWP5 포맷)"""
236
- text_parts = []
 
237
 
238
- for entry in ole.listdir():
239
- entry_path = '/'.join(entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # BodyText/SectionX 스트림 찾기
242
- if entry_path.startswith('BodyText/Section'):
243
- try:
244
- stream_data = ole.openstream(entry).read()
245
-
246
- # 압축 해제 시도
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  try:
248
- decompressed = zlib.decompress(stream_data, -15)
 
 
249
  except:
250
- decompressed = stream_data
251
-
252
- # HWP5 레코드에 추출
253
- extracted = extract_text_from_hwp_records(decompressed)
254
- if extracted:
255
- text_parts.append(extracted)
256
-
257
- except Exception as e:
258
- print(f" 섹션 읽기 오류 {entry_path}: {e}")
259
- continue
260
 
261
- return '\n\n'.join(text_parts) if text_parts else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- def extract_text_from_hwp_records(data: bytes) -> str:
264
- """HWP5 레코드 구조에서 텍스트 추출"""
265
  texts = []
266
  pos = 0
267
 
268
  while pos < len(data) - 4:
269
  try:
270
- # 레코드 헤더 (4바이트)
271
  header = int.from_bytes(data[pos:pos+4], 'little')
272
  tag_id = header & 0x3FF
 
273
  size = (header >> 20) & 0xFFF
274
 
275
  pos += 4
@@ -287,44 +485,31 @@ def extract_text_from_hwp_records(data: bytes) -> str:
287
  record_data = data[pos:pos+size]
288
  pos += size
289
 
290
- # HWPTAG_PARA_TEXT = 67 (0x43)
291
  if tag_id == 67 and size > 0:
292
- # 텍스트 추출 (컨트롤 문자 처리)
293
- text = extract_para_text(record_data)
294
  if text:
295
  texts.append(text)
296
 
297
- except Exception as e:
298
  pos += 1
299
  continue
300
 
301
  return '\n'.join(texts) if texts else None
302
 
303
- def extract_para_text(data: bytes) -> str:
304
- """PARA_TEXT 레코드에서 실제 텍스트 추출"""
305
  result = []
306
  i = 0
307
 
308
  while i < len(data) - 1:
309
  code = int.from_bytes(data[i:i+2], 'little')
310
 
311
- # 일반 문자 (유니코드)
312
- if code >= 32:
313
- try:
314
- char = chr(code)
315
- # 한글, 영문, 숫자, 일반 기호만 허용
316
- if char.isprintable() and not (0x4E00 <= code <= 0x9FFF and code not in range(0xAC00, 0xD7A4)):
317
- result.append(char)
318
- elif 0xAC00 <= code <= 0xD7A3: # 한글 음절
319
- result.append(char)
320
- except:
321
- pass
322
- # 컨트롤 문자 처리
323
- elif code == 0: # NULL
324
  pass
325
- elif code == 1: # 예약
326
- i += 14 # 확장 컨트롤 건너뛰기
327
- elif code == 2: # 섹션/컬럼 정의
328
  i += 14
329
  elif code == 3: # 필드 시작
330
  i += 14
@@ -338,99 +523,59 @@ def extract_para_text(data: bytes) -> str:
338
  result.append('\n')
339
  elif code == 24: # 하이픈
340
  result.append('-')
341
- elif code == 30: # 묶음 빈칸
342
- result.append(' ')
343
- elif code == 31: # 고정폭 빈칸
344
  result.append(' ')
 
 
 
 
 
 
 
 
 
 
345
 
346
  i += 2
347
 
348
  text = ''.join(result).strip()
349
- # 의미 없는 텍스트 필터링
350
- if len(text) < 2:
351
- return None
352
- return text
353
-
354
- def extract_text_with_olefile(file_path: str) -> tuple:
355
- """olefile을 사용한 HWP 텍스트 추출"""
356
- if not OLEFILE_AVAILABLE:
357
- return None, "olefile 모듈 없음"
358
 
359
- try:
360
- ole = olefile.OleFileIO(file_path)
361
-
362
- # 파일 헤더 확인
363
- if not ole.exists('FileHeader'):
364
- ole.close()
365
- return None, "HWP 파일 헤더 없음"
366
-
367
- # 압축 여부 확인
368
- header_data = ole.openstream('FileHeader').read()
369
- is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
370
- print(f" HWP 압축 여부: {is_compressed}")
371
-
372
- # BodyText에서 텍스트 추출
373
- text = extract_hwp_text_from_bodytext(ole)
374
-
375
- ole.close()
376
-
377
- if text and len(text.strip()) > 10:
378
- return text.strip(), None
379
-
380
- return None, "텍스트 추출 실패"
381
-
382
- except Exception as e:
383
- return None, f"olefile 오류: {str(e)}"
384
-
385
- def extract_text_with_hwp5txt(file_path: str) -> tuple:
386
- """hwp5txt 명령어로 텍스트 추출"""
387
- try:
388
- result = subprocess.run(
389
- [sys.executable, '-m', 'hwp5', 'txt', file_path],
390
- capture_output=True,
391
- timeout=60
392
- )
393
-
394
- if result.returncode == 0 and result.stdout:
395
- # 여러 인코딩 시도
396
- for enc in ['utf-8', 'cp949', 'euc-kr']:
397
- try:
398
- text = result.stdout.decode(enc)
399
- if text.strip() and len(text.strip()) > 10:
400
- return text.strip(), None
401
- except:
402
- continue
403
-
404
- stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
405
- return None, f"hwp5txt 실패: {stderr[:100]}"
406
-
407
- except subprocess.TimeoutExpired:
408
- return None, "hwp5txt 타임아웃"
409
- except Exception as e:
410
- return None, f"hwp5txt 오류: {str(e)}"
411
 
412
  def extract_text_from_hwp(file_path: str) -> tuple:
413
  """HWP 파일에서 텍스트 추출 (메인 함수)"""
414
  print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")
415
 
416
- # 방법 1: hwp5txt 명령어 (가장 안정적)
417
- print(" 방법 1: hwp5txt 명령어...")
418
  text, error = extract_text_with_hwp5txt(file_path)
419
- if text:
420
  print(f" ✓ hwp5txt 성공: {len(text)} 글자")
421
  return text, None
422
  print(f" ✗ hwp5txt 실패: {error}")
423
 
424
- # 방법 2: olefile 직접 파싱
425
  print(" 방법 2: olefile 파싱...")
426
  text, error = extract_text_with_olefile(file_path)
427
- if text:
428
  print(f" ✓ olefile 성공: {len(text)} 글자")
429
  return text, None
430
  print(f" ✗ olefile 실패: {error}")
431
 
432
  return None, "모든 추출 방법 실패"
433
 
 
 
 
 
 
 
 
 
434
  # ============== HWP 변환 함수들 ==============
435
 
436
  def check_hwp_version(file_path):
@@ -441,6 +586,8 @@ def check_hwp_version(file_path):
441
  return "HWP v5", True
442
  elif header[:4] == b'\xd0\xcf\x11\xe0':
443
  return "HWP v5 (OLE)", True
 
 
444
  else:
445
  return "Unknown", False
446
  except Exception as e:
@@ -451,41 +598,32 @@ def convert_to_html_subprocess(input_path, output_dir):
451
  output_path = os.path.join(output_dir, "output.html")
452
 
453
  try:
454
- result = subprocess.run(
455
- [sys.executable, '-m', 'hwp5', 'html', '--output', output_path, input_path],
456
- capture_output=True,
457
- text=True,
458
- timeout=120
459
- )
460
-
461
- if result.returncode == 0:
462
- # 결과 파일/디렉토리 찾기
463
- if os.path.isfile(output_path):
464
- return output_path, None
465
- if os.path.isdir(output_path):
466
- return output_path, None
467
-
468
- # 다른 위치 검색
469
- for item in os.listdir(output_dir):
470
- item_path = os.path.join(output_dir, item)
471
- if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
472
- return item_path, None
473
- if os.path.isdir(item_path):
474
- for sub in os.listdir(item_path):
475
- if sub.lower().endswith(('.html', '.htm')):
476
  return item_path, None
477
- return output_dir, None
478
-
479
- except subprocess.TimeoutExpired:
480
- return None, "HTML 변환 타임아웃"
 
481
  except Exception as e:
482
- return None, f"HTML 변환 오류: {str(e)}"
483
 
484
  return None, "HTML 변환 실패"
485
 
486
  def convert_hwp_to_text(input_path: str) -> tuple:
487
- """HWP를 텍스트로 변환"""
488
- return extract_text_from_hwp(input_path)
489
 
490
  def html_to_markdown(html_content):
491
  """HTML을 Markdown으로 변환"""
@@ -503,7 +641,6 @@ def html_to_markdown(html_content):
503
  except:
504
  pass
505
 
506
- # 기본 변환
507
  if BS4_AVAILABLE:
508
  try:
509
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -514,41 +651,12 @@ def html_to_markdown(html_content):
514
  return None, "Markdown 변환 실패"
515
 
516
  def convert_hwp_to_markdown(input_path: str) -> tuple:
517
- """HWP를 Markdown으로 변환"""
518
- # 먼저 텍스트 추출 시도
519
- text, error = extract_text_from_hwp(input_path)
520
  if text:
521
  return text, None
522
-
523
- # HTML 변환 후 Markdown 변환
524
- tmp_dir = tempfile.mkdtemp()
525
- try:
526
- html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
527
- if html_output:
528
- # HTML 파일 읽기
529
- html_files = []
530
- if os.path.isfile(html_output):
531
- html_files = [html_output]
532
- elif os.path.isdir(html_output):
533
- for root, dirs, files in os.walk(html_output):
534
- for f in files:
535
- if f.lower().endswith(('.html', '.htm')):
536
- html_files.append(os.path.join(root, f))
537
-
538
- for html_file in html_files:
539
- for enc in ['utf-8', 'cp949', 'euc-kr']:
540
- try:
541
- with open(html_file, 'r', encoding=enc) as f:
542
- content = f.read()
543
- md_text, _ = html_to_markdown(content)
544
- if md_text and len(md_text.strip()) > 10:
545
- return md_text.strip(), None
546
- except:
547
- continue
548
-
549
- return None, error or "변환 실패"
550
- finally:
551
- shutil.rmtree(tmp_dir, ignore_errors=True)
552
 
553
  # ============== LLM API ==============
554
 
@@ -646,11 +754,11 @@ def process_file(file_path: str) -> tuple:
646
  if is_image_file(file_path):
647
  return "image", image_to_base64(file_path), get_image_mime_type(file_path)
648
 
649
- if is_hwp_file(file_path):
650
- text, error = extract_text_from_hwp(file_path)
651
- if text:
652
- return "text", f"[HWP 문서: {filename}]\n\n{text}", None
653
- return "error", f"HWP 추출 실패: {error}", None
654
 
655
  if is_pdf_file(file_path):
656
  text = extract_text_from_pdf(file_path)
@@ -666,7 +774,7 @@ def process_file(file_path: str) -> tuple:
666
 
667
  return "unsupported", f"지원하지 않는 형식: {filename}", None
668
 
669
- def chat_response(message: str, history: List[Dict], file: Optional[str],
670
  session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
671
  if history is None:
672
  history = []
@@ -714,7 +822,7 @@ def chat_response(message: str, history: List[Dict], file: Optional[str],
714
  db_messages = get_session_messages(session_id, limit=10)
715
  api_messages = [{
716
  "role": "system",
717
- "content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 분석하여 답변합니다."
718
  }]
719
 
720
  for m in db_messages:
@@ -760,12 +868,14 @@ def load_session(session_id: str) -> tuple:
760
  def convert_to_odt_subprocess(input_path, output_dir):
761
  output_path = os.path.join(output_dir, "output.odt")
762
  try:
763
- result = subprocess.run(
764
- [sys.executable, '-m', 'hwp5', 'odt', '--output', output_path, input_path],
765
- capture_output=True, timeout=120
766
- )
767
- if result.returncode == 0 and os.path.exists(output_path):
768
- return output_path, None
 
 
769
  except:
770
  pass
771
  return None, "ODT 변환 실패"
@@ -773,14 +883,16 @@ def convert_to_odt_subprocess(input_path, output_dir):
773
  def convert_to_xml_subprocess(input_path, output_dir):
774
  output_path = os.path.join(output_dir, "output.xml")
775
  try:
776
- result = subprocess.run(
777
- [sys.executable, '-m', 'hwp5', 'xml', input_path],
778
- capture_output=True, timeout=120
779
- )
780
- if result.returncode == 0 and result.stdout:
781
- with open(output_path, 'wb') as f:
782
- f.write(result.stdout)
783
- return output_path, None
 
 
784
  except:
785
  pass
786
  return None, "XML 변환 실패"
@@ -790,8 +902,10 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
790
  return None, "❌ 파일을 업로드해주세요.", ""
791
 
792
  input_file = file.name if hasattr(file, 'name') else str(file)
793
- if not input_file.lower().endswith('.hwp'):
794
- return None, "❌ HWP 파일만 지원됩니다.", ""
 
 
795
 
796
  progress(0.1, desc="파일 분석 중...")
797
  version, is_valid = check_hwp_version(input_file)
@@ -810,6 +924,8 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
810
  output_path, error, ext = None, None, ""
811
 
812
  if output_format == "HTML":
 
 
813
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
814
  ext = ".html"
815
  if output_path and os.path.isdir(output_path):
@@ -817,11 +933,13 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
817
  output_path, ext = zip_path, ".zip"
818
 
819
  elif output_format == "ODT (OpenDocument)":
 
 
820
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
821
  ext = ".odt"
822
 
823
  elif output_format == "TXT (텍스트)":
824
- text, error = extract_text_from_hwp(input_path)
825
  if text:
826
  output_path = os.path.join(tmp_dir, "output.txt")
827
  with open(output_path, 'w', encoding='utf-8') as f:
@@ -837,7 +955,24 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
837
  ext = ".md"
838
 
839
  elif output_format == "XML":
840
- output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  ext = ".xml"
842
 
843
  if not output_path:
@@ -886,7 +1021,7 @@ css = """
886
  with gr.Blocks(title="AI 문서 어시스턴트") as demo:
887
  session_state = gr.State("")
888
 
889
- gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP 문서 변환")
890
 
891
  with gr.Tabs():
892
  with gr.Tab("💬 AI 채팅"):
@@ -897,7 +1032,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
897
  groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
898
  fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
899
 
900
- gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT, HWP ✨")
901
  new_btn = gr.Button("🆕 새 대화", variant="primary")
902
 
903
  with gr.Accordion("📜 기록", open=False):
@@ -916,10 +1051,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
916
  clear_btn = gr.Button("🗑️ 지우기", scale=1)
917
 
918
  with gr.Tab("📄 HWP 변환기"):
919
- gr.Markdown("### HWP 파일 변환기")
920
  with gr.Row():
921
  with gr.Column():
922
- hwp_input = gr.File(label="HWP 파일", file_types=[".hwp"], elem_classes=["upload-box"])
923
  format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
924
  convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
925
  with gr.Column():
@@ -928,6 +1063,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
928
 
929
  with gr.Accordion("📋 미리보기", open=False):
930
  preview_out = gr.Textbox(lines=15, interactive=False)
 
 
 
 
931
 
932
  # 이벤트
933
  def on_submit(msg, hist, f, sid, gk, fk):
@@ -935,7 +1074,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
935
  for r in chat_response(msg, hist, f, sid, gk, fk):
936
  yield r[0], r[1], "", None
937
 
938
- submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
939
  [chatbot, session_state, msg_input, file_upload])
940
  msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
941
  [chatbot, session_state, msg_input, file_upload])
 
16
  import base64
17
  import requests
18
  import zlib
19
+ import zipfile
20
  from pathlib import Path
21
  from datetime import datetime
22
  from typing import Generator, List, Dict, Optional
23
+ from xml.etree import ElementTree as ET
24
 
25
  # ============== 환경 설정 ==============
26
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
29
 
30
  if os.path.exists(PYHWP_PATH):
31
  sys.path.insert(0, PYHWP_PATH)
 
32
 
33
  # ============== 모듈 임포트 ==============
34
  try:
 
73
  except ImportError:
74
  PDFPLUMBER_AVAILABLE = False
75
 
76
# Probe once, at import time, whether pyhwp's hwp5txt converter is usable.
# HWP5TXT_AVAILABLE ends up True if either probe below succeeds.
HWP5TXT_AVAILABLE = False
try:
    # Probe 1: the `hwp5txt` console script on PATH.
    result = subprocess.run(['hwp5txt', '--help'], capture_output=True, timeout=5)
    if result.returncode == 0:
        HWP5TXT_AVAILABLE = True
        print("hwp5txt command available")
except (OSError, subprocess.SubprocessError):
    # Missing executable (FileNotFoundError is an OSError) or a hung probe
    # (TimeoutExpired is a SubprocessError): fall through to probe 2.
    # Deliberately not a bare `except`, so Ctrl-C / SystemExit still propagate.
    pass

if not HWP5TXT_AVAILABLE:
    try:
        # Probe 2: can this interpreter import hwp5.hwp5txt? Checked in a
        # child process so the import cannot affect this process.
        result = subprocess.run(
            [sys.executable, '-c', 'from hwp5.hwp5txt import main; print("ok")'],
            capture_output=True,
            timeout=5,
        )
        if b'ok' in result.stdout:
            HWP5TXT_AVAILABLE = True
            print("hwp5txt module available")
    except (OSError, subprocess.SubprocessError):
        pass

print(f"HWP5TXT_AVAILABLE: {HWP5TXT_AVAILABLE}")
97
+
98
  # ============== API 키 설정 ==============
99
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
100
  FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
 
129
  session_id = str(uuid.uuid4())
130
  conn = sqlite3.connect(DB_PATH)
131
  cursor = conn.cursor()
132
+ cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
133
+ (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
 
 
134
  conn.commit()
135
  conn.close()
136
  return session_id
 
138
  def save_message(session_id: str, role: str, content: str, file_info: str = None):
139
  conn = sqlite3.connect(DB_PATH)
140
  cursor = conn.cursor()
141
+ cursor.execute("INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
142
+ (session_id, role, content, file_info))
143
+ cursor.execute("UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
 
 
 
 
 
144
  conn.commit()
145
  conn.close()
146
 
147
  def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
148
  conn = sqlite3.connect(DB_PATH)
149
  cursor = conn.cursor()
150
+ cursor.execute("SELECT role, content, file_info, created_at FROM messages WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
151
+ (session_id, limit))
 
 
 
 
152
  rows = cursor.fetchall()
153
  conn.close()
154
  return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
 
156
  def get_all_sessions() -> List[Dict]:
157
  conn = sqlite3.connect(DB_PATH)
158
  cursor = conn.cursor()
159
+ cursor.execute("SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50")
 
 
160
  rows = cursor.fetchall()
161
  conn.close()
162
  return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
 
214
 
215
  def get_image_mime_type(file_path: str) -> str:
216
  ext = Path(file_path).suffix.lower()
217
+ return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
218
  '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
219
 
220
  def is_image_file(fp: str) -> bool:
221
  return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
222
 
223
  def is_hwp_file(fp: str) -> bool:
224
+ return Path(fp).suffix.lower() == '.hwp'
225
+
226
def is_hwpx_file(fp: str) -> bool:
    """Return True when *fp* has a ``.hwpx`` extension (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwpx'
228
 
229
  def is_pdf_file(fp: str) -> bool:
230
  return Path(fp).suffix.lower() == '.pdf'
 
232
  def is_text_file(fp: str) -> bool:
233
  return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
234
 
235
+ # ============== HWPX 텍스트 추출 (ZIP/XML 기반) ==============
236
 
237
def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract plain text from an HWPX file (a ZIP archive of XML parts).

    Section parts (``Contents/section*.xml``) are read in numeric order and
    parsed with ElementTree; text is collected from ``t`` elements and from
    other text-like elements. If a part is not well-formed XML, the text
    between tags is scraped with a regular expression instead.

    Args:
        file_path: Path to the ``.hwpx`` file.

    Returns:
        ``(text, None)`` on success, with sections separated by a blank line,
        or ``(None, error_message)`` when nothing could be extracted.
    """

    def local_name(tag) -> str:
        # ElementTree spells qualified names as "{namespace-uri}local".
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def natural_key(name: str) -> list:
        # Compare digit runs as integers so "section10" sorts after "section2".
        return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', name)]

    def scrape_between_tags(markup: str) -> list:
        # Crude fallback: every non-trivial run of text between ">" and "<".
        found = re.findall(r'>([^<]+)<', markup)
        return [t.strip() for t in found if t.strip() and len(t.strip()) > 1]

    try:
        text_parts = []

        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            print(f" HWPX 내부 파일: {file_list[:10]}...")

            section_files = sorted(
                (f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')),
                key=natural_key,
            )
            if not section_files:
                # Non-standard layout: accept any XML part with "section" in its name.
                section_files = sorted(
                    (f for f in file_list if 'section' in f.lower() and f.endswith('.xml')),
                    key=natural_key,
                )
            print(f" 섹션 파일: {section_files}")

            for section_file in section_files:
                try:
                    raw = zf.read(section_file)
                    try:
                        # Parse the raw bytes so the XML declaration's encoding
                        # and namespace prefixes are handled by the parser.
                        root = ET.fromstring(raw)
                    except ET.ParseError as e:
                        print(f" XML 파싱 오류 {section_file}: {e}")
                        clean_texts = scrape_between_tags(raw.decode('utf-8', errors='ignore'))
                        if clean_texts:
                            text_parts.append(' '.join(clean_texts))
                        continue

                    texts = []
                    for elem in root.iter():
                        name = local_name(elem.tag)
                        if name == 't':
                            # <t> holds the run text.
                            if elem.text:
                                texts.append(elem.text)
                        elif elem.text and elem.text.strip():
                            # Other elements count only when the name looks text-related.
                            if any(x in name.lower() for x in ['text', 'run', 'para', 'char']):
                                texts.append(elem.text.strip())

                    if texts:
                        text_parts.append(' '.join(texts))

                except Exception as e:
                    print(f" 섹션 파일 읽기 오류 {section_file}: {e}")
                    continue

            # Best effort: prepend a few text nodes found in header parts.
            # NOTE(review): header.xml presumably carries document-level
            # settings rather than body text — confirm this prefix is wanted.
            for header_file in [f for f in file_list if 'header' in f.lower() and f.endswith('.xml')]:
                try:
                    content = zf.read(header_file).decode('utf-8', errors='ignore')
                    clean_texts = scrape_between_tags(content)
                    if clean_texts:
                        text_parts.insert(0, ' '.join(clean_texts[:5]))
                except Exception:
                    # An unreadable header must not fail the whole extraction.
                    pass

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse horizontal whitespace only; collapsing "\s+" would also
            # erase the blank lines that separate sections.
            result = re.sub(r'[ \t\r\f\v]+', ' ', result)
            result = re.sub(r' ?\n ?', '\n', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None

        return None, "HWPX에서 텍스트를 찾을 수 없습니다"

    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
326
 
327
+ # ============== HWP 텍스트 추출 (OLE 기반) ==============
328
+
329
def extract_text_with_hwp5txt(file_path: str) -> tuple:
    """Extract text from an HWP file with pyhwp, trying three strategies.

    1. Run the ``hwp5txt`` console script.
    2. Import ``hwp5`` in this process and walk the body-text sections.
    3. Run the same walk in a child interpreter with ``PYHWP_PATH`` prepended
       to ``sys.path``.

    Args:
        file_path: Path to the ``.hwp`` file.

    Returns:
        ``(text, None)`` from the first strategy that yields text, otherwise
        ``(None, "hwp5txt 실패")``.
    """

    def decode_output(raw: bytes):
        # Try the likely encodings in order; accept only output longer than 10 chars.
        for enc in ('utf-8', 'cp949', 'euc-kr'):
            try:
                decoded = raw.decode(enc)
            except (UnicodeDecodeError, LookupError):
                continue
            if len(decoded.strip()) > 10:
                return decoded.strip()
        return None

    # Strategy 1: the hwp5txt command.
    try:
        result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
        if result.returncode == 0 and result.stdout:
            text = decode_output(result.stdout)
            if text:
                return text, None
    except FileNotFoundError:
        # Command not installed; move on quietly.
        pass
    except Exception as e:
        print(f" hwp5txt 명령어 오류: {e}")

    # Strategy 2: use the hwp5 package in-process.
    try:
        from hwp5.hwp5txt import extract_text
        from hwp5.filestructure import Hwp5File
    except ImportError:
        pass
    else:
        try:
            hwp5file = Hwp5File(file_path)
            try:
                texts = []
                for section_idx in hwp5file.bodytext.sections():
                    section = hwp5file.bodytext.section(section_idx)
                    for para in extract_text(section):
                        if para.strip():
                            texts.append(para.strip())
            finally:
                # Release the file even when parsing fails midway.
                hwp5file.close()

            if texts:
                return '\n'.join(texts), None
        except Exception as e:
            print(f" hwp5txt 모듈 오류: {e}")

    # Strategy 3: the same walk in a child interpreter. Both paths travel as
    # argv entries rather than being pasted into the source text, so quotes or
    # backslashes in a path can neither break nor inject code.
    try:
        code = (
            'import sys\n'
            'sys.path.insert(0, sys.argv[1])\n'
            'from hwp5.filestructure import Hwp5File\n'
            'from hwp5.hwp5txt import extract_text\n'
            'hwp = Hwp5File(sys.argv[2])\n'
            'for idx in hwp.bodytext.sections():\n'
            '    section = hwp.bodytext.section(idx)\n'
            '    for para in extract_text(section):\n'
            '        if para.strip():\n'
            '            print(para.strip())\n'
            'hwp.close()\n'
        )
        result = subprocess.run(
            [sys.executable, '-c', code, PYHWP_PATH, file_path],
            capture_output=True,
            timeout=60,
        )
        if result.returncode == 0 and result.stdout:
            text = decode_output(result.stdout)
            if text:
                return text, None
    except Exception as e:
        print(f" hwp5txt 서브프로세스 오류: {e}")

    return None, "hwp5txt 실패"
401
+
402
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP file by reading its OLE streams directly.

    Reads the ``FileHeader`` stream to decide whether body streams are
    compressed, then decodes every ``BodyText/Section*`` stream with
    ``extract_hwp_section_text``.

    Args:
        file_path: Path to the ``.hwp`` file.

    Returns:
        ``(text, None)`` with sections joined by a blank line, or
        ``(None, error_message)`` on failure.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    def section_number(entry) -> int:
        # "Section12" -> 12, so sections are visited in document order.
        digits = ''.join(ch for ch in entry[-1] if ch.isdigit())
        return int(digits) if digits else 0

    def inflate(stream_data: bytes) -> bytes:
        # Try raw deflate first, then a zlib-wrapped stream; if neither
        # works, hand the bytes back unchanged.
        for wbits in (-15, zlib.MAX_WBITS):
            try:
                return zlib.decompress(stream_data, wbits)
            except zlib.error:
                continue
        return stream_data

    try:
        ole = olefile.OleFileIO(file_path)
    except Exception as e:
        return None, f"olefile 오류: {str(e)}"

    try:
        # Check that the HWP file header stream exists.
        if not ole.exists('FileHeader'):
            return None, "HWP 파일 헤더 없음"

        # Bit 0 of byte 36 of the header is read as the compression flag;
        # a too-short header is treated as compressed.
        header_data = ole.openstream('FileHeader').read()
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
        print(f" HWP 압축 여부: {is_compressed}")

        # Sort explicitly instead of relying on directory listing order.
        section_entries = sorted(
            (entry for entry in ole.listdir() if '/'.join(entry).startswith('BodyText/Section')),
            key=section_number,
        )

        all_texts = []
        for entry in section_entries:
            entry_path = '/'.join(entry)
            try:
                stream_data = ole.openstream(entry).read()
                if is_compressed:
                    stream_data = inflate(stream_data)

                # Extract text from the section's records.
                section_text = extract_hwp_section_text(stream_data)
                if section_text:
                    all_texts.append(section_text)
            except Exception as e:
                print(f" 섹션 처리 오류 {entry_path}: {e}")
                continue

        if all_texts:
            result = '\n\n'.join(all_texts)
            return result.strip(), None

        return None, "텍스트를 찾을 수 없습니다"

    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
    finally:
        # Always release the OLE handle, including on early return or error.
        ole.close()
459
 
460
+ def extract_hwp_section_text(data: bytes) -> str:
461
+ """HWP 섹션 데이터에서 텍스트 추출"""
462
  texts = []
463
  pos = 0
464
 
465
  while pos < len(data) - 4:
466
  try:
467
+ # 레코드 헤더 읽기
468
  header = int.from_bytes(data[pos:pos+4], 'little')
469
  tag_id = header & 0x3FF
470
+ level = (header >> 10) & 0x3FF
471
  size = (header >> 20) & 0xFFF
472
 
473
  pos += 4
 
485
  record_data = data[pos:pos+size]
486
  pos += size
487
 
488
+ # HWPTAG_PARA_TEXT = 67
489
  if tag_id == 67 and size > 0:
490
+ text = decode_para_text(record_data)
 
491
  if text:
492
  texts.append(text)
493
 
494
+ except:
495
  pos += 1
496
  continue
497
 
498
  return '\n'.join(texts) if texts else None
499
 
500
+ def decode_para_text(data: bytes) -> str:
501
+ """PARA_TEXT 레코드 디코딩"""
502
  result = []
503
  i = 0
504
 
505
  while i < len(data) - 1:
506
  code = int.from_bytes(data[i:i+2], 'little')
507
 
508
+ if code == 0:
 
 
 
 
 
 
 
 
 
 
 
 
509
  pass
510
+ elif code == 1: # 확장 컨트롤
511
+ i += 14
512
+ elif code == 2: # 섹션 정의
513
  i += 14
514
  elif code == 3: # 필드 시작
515
  i += 14
 
523
  result.append('\n')
524
  elif code == 24: # 하이픈
525
  result.append('-')
526
+ elif code == 30 or code == 31: # 빈칸
 
 
527
  result.append(' ')
528
+ elif code < 32: # 기타 컨트롤 문자
529
+ pass
530
+ else:
531
+ # 일반 문자
532
+ try:
533
+ char = chr(code)
534
+ if char.isprintable() or char in '\n\t ':
535
+ result.append(char)
536
+ except:
537
+ pass
538
 
539
  i += 2
540
 
541
  text = ''.join(result).strip()
 
 
 
 
 
 
 
 
 
542
 
543
+ # 정리
544
+ text = re.sub(r'[ \t]+', ' ', text)
545
+ text = re.sub(r'\n{3,}', '\n\n', text)
546
+
547
+ return text if len(text) > 2 else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
  def extract_text_from_hwp(file_path: str) -> tuple:
550
  """HWP 파일에서 텍스트 추출 (메인 함수)"""
551
  print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")
552
 
553
+ # 방법 1: hwp5txt
554
+ print(" 방법 1: hwp5txt...")
555
  text, error = extract_text_with_hwp5txt(file_path)
556
+ if text and len(text.strip()) > 20:
557
  print(f" ✓ hwp5txt 성공: {len(text)} 글자")
558
  return text, None
559
  print(f" ✗ hwp5txt 실패: {error}")
560
 
561
+ # 방법 2: olefile
562
  print(" 방법 2: olefile 파싱...")
563
  text, error = extract_text_with_olefile(file_path)
564
+ if text and len(text.strip()) > 20:
565
  print(f" ✓ olefile 성공: {len(text)} 글자")
566
  return text, None
567
  print(f" ✗ olefile 실패: {error}")
568
 
569
  return None, "모든 추출 방법 실패"
570
 
571
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Dispatch text extraction on the file extension.

    ``.hwpx`` files go to ``extract_text_from_hwpx``; everything else goes to
    ``extract_text_from_hwp``. The chosen extractor's result tuple is
    returned unchanged.
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)

    print(f"\n[HWPX 추출] 시작: {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
578
+
579
  # ============== HWP 변환 함수들 ==============
580
 
581
  def check_hwp_version(file_path):
 
586
  return "HWP v5", True
587
  elif header[:4] == b'\xd0\xcf\x11\xe0':
588
  return "HWP v5 (OLE)", True
589
+ elif header[:4] == b'PK\x03\x04': # ZIP 파일 (HWPX)
590
+ return "HWPX", True
591
  else:
592
  return "Unknown", False
593
  except Exception as e:
 
598
  output_path = os.path.join(output_dir, "output.html")
599
 
600
  try:
601
+ # hwp5html 시도
602
+ for cmd in [['hwp5html', '--output', output_path, input_path],
603
+ [sys.executable, '-c', f'from hwp5.hwp5html import main; import sys; sys.argv=["hwp5html","--output","{output_path}","{input_path}"]; main()']]:
604
+ try:
605
+ result = subprocess.run(cmd, capture_output=True, timeout=120)
606
+ if result.returncode == 0:
607
+ if os.path.exists(output_path):
608
+ return output_path, None
609
+ # 디렉토리 검색
610
+ for item in os.listdir(output_dir):
611
+ item_path = os.path.join(output_dir, item)
612
+ if item.lower().endswith(('.html', '.htm')):
 
 
 
 
 
 
 
 
 
 
613
  return item_path, None
614
+ if os.path.isdir(item_path):
615
+ return item_path, None
616
+ except:
617
+ continue
618
+
619
  except Exception as e:
620
+ print(f"HTML 변환 오류: {e}")
621
 
622
  return None, "HTML 변환 실패"
623
 
624
def convert_hwp_to_text(input_path: str) -> tuple:
    """Convert an HWP/HWPX file to plain text.

    Args:
        input_path: Path of the HWP or HWPX file.

    Returns:
        The result of ``extract_text_from_hwp_or_hwpx`` for ``input_path``,
        passed through unchanged.
    """
    extraction = extract_text_from_hwp_or_hwpx(input_path)
    return extraction
627
 
628
  def html_to_markdown(html_content):
629
  """HTML을 Markdown으로 변환"""
 
641
  except:
642
  pass
643
 
 
644
  if BS4_AVAILABLE:
645
  try:
646
  soup = BeautifulSoup(html_content, 'html.parser')
 
651
  return None, "Markdown 변환 실패"
652
 
653
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Convert an HWP/HWPX file to Markdown.

    The extracted text itself is returned as the Markdown output; no
    further formatting is applied here.

    Args:
        input_path: Path of the HWP or HWPX file.

    Returns:
        ``(text, None)`` when extraction yields non-empty text, otherwise
        ``(None, error)`` with the extractor's error value.
    """
    extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
    if not extracted:
        return None, failure
    return extracted, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # ============== LLM API ==============
662
 
 
754
  if is_image_file(file_path):
755
  return "image", image_to_base64(file_path), get_image_mime_type(file_path)
756
 
757
+ if is_hwp_file(file_path) or is_hwpx_file(file_path):
758
+ text, error = extract_text_from_hwp_or_hwpx(file_path)
759
+ if text and len(text.strip()) > 20:
760
+ return "text", f"[한글 문서: {filename}]\n\n{text}", None
761
+ return "error", f"한글 문서 추출 실패: {error}", None
762
 
763
  if is_pdf_file(file_path):
764
  text = extract_text_from_pdf(file_path)
 
774
 
775
  return "unsupported", f"지원하지 않는 형식: {filename}", None
776
 
777
+ def chat_response(message: str, history: List[Dict], file: Optional[str],
778
  session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
779
  if history is None:
780
  history = []
 
822
  db_messages = get_session_messages(session_id, limit=10)
823
  api_messages = [{
824
  "role": "system",
825
+ "content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 상세히 분석하여 답변합니다."
826
  }]
827
 
828
  for m in db_messages:
 
868
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running hwp5odt in a subprocess.

    Two launchers are tried in order: the ``hwp5odt`` console script, then
    ``hwp5.hwp5odt.main`` invoked through the current interpreter.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        ``(output_path, None)`` when a launcher exits with status 0 and the
        output file exists, otherwise ``(None, "ODT 변환 실패")``.
    """
    output_path = os.path.join(output_dir, "output.odt")
    cli_args = ['--output', output_path, input_path]

    # The paths are passed as real argv entries instead of being pasted into
    # the ``-c`` source text: a path containing a quote or backslash would
    # otherwise corrupt (or inject code into) the generated program.
    bootstrap = (
        'import sys; from hwp5.hwp5odt import main; '
        'sys.argv[0] = "hwp5odt"; main()'
    )
    commands = (
        ['hwp5odt', *cli_args],
        [sys.executable, '-c', bootstrap, *cli_args],
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
        except (OSError, ValueError, subprocess.SubprocessError):
            # Launcher missing, invalid argument, or timeout: try the next one.
            continue
        if result.returncode == 0 and os.path.exists(output_path):
            return output_path, None

    return None, "ODT 변환 실패"
 
883
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running hwp5xml in a subprocess.

    Two launchers are tried in order: the ``hwp5xml`` console script, then
    ``hwp5.hwp5xml.main`` invoked through the current interpreter. The
    converter's standard output is captured and saved as ``output.xml``.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        ``(output_path, None)`` when a launcher exits with status 0 and
        produced output, otherwise ``(None, "XML 변환 실패")``.
    """
    output_path = os.path.join(output_dir, "output.xml")

    # The input path is passed as a real argv entry instead of being pasted
    # into the ``-c`` source text: a path containing a quote or backslash
    # would otherwise corrupt (or inject code into) the generated program.
    bootstrap = (
        'import sys; from hwp5.hwp5xml import main; '
        'sys.argv[0] = "hwp5xml"; main()'
    )
    commands = (
        ['hwp5xml', input_path],
        [sys.executable, '-c', bootstrap, input_path],
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
            if result.returncode == 0 and result.stdout:
                with open(output_path, 'wb') as f:
                    f.write(result.stdout)
                return output_path, None
        except (OSError, ValueError, subprocess.SubprocessError):
            # Launcher missing, timeout, or unwritable output: try the next one.
            continue

    return None, "XML 변환 실패"
 
902
  return None, "❌ 파일을 업로드해주세요.", ""
903
 
904
  input_file = file.name if hasattr(file, 'name') else str(file)
905
+ ext_lower = Path(input_file).suffix.lower()
906
+
907
+ if ext_lower not in ['.hwp', '.hwpx']:
908
+ return None, "❌ HWP 또는 HWPX 파일만 지원됩니다.", ""
909
 
910
  progress(0.1, desc="파일 분석 중...")
911
  version, is_valid = check_hwp_version(input_file)
 
924
  output_path, error, ext = None, None, ""
925
 
926
  if output_format == "HTML":
927
+ if ext_lower == '.hwpx':
928
+ return None, "❌ HWPX는 HTML 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
929
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
930
  ext = ".html"
931
  if output_path and os.path.isdir(output_path):
 
933
  output_path, ext = zip_path, ".zip"
934
 
935
  elif output_format == "ODT (OpenDocument)":
936
+ if ext_lower == '.hwpx':
937
+ return None, "❌ HWPX는 ODT 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
938
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
939
  ext = ".odt"
940
 
941
  elif output_format == "TXT (텍스트)":
942
+ text, error = extract_text_from_hwp_or_hwpx(input_path)
943
  if text:
944
  output_path = os.path.join(tmp_dir, "output.txt")
945
  with open(output_path, 'w', encoding='utf-8') as f:
 
955
  ext = ".md"
956
 
957
  elif output_format == "XML":
958
+ if ext_lower == '.hwpx':
959
+ # HWPX는 이미 XML 기반이므로 내부 XML 추출
960
+ try:
961
+ with zipfile.ZipFile(input_path, 'r') as zf:
962
+ # 모든 XML 파일을 하나로 합침
963
+ xml_contents = []
964
+ for name in zf.namelist():
965
+ if name.endswith('.xml'):
966
+ with zf.open(name) as f:
967
+ xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
968
+
969
+ output_path = os.path.join(tmp_dir, "output.xml")
970
+ with open(output_path, 'w', encoding='utf-8') as f:
971
+ f.write('\n\n'.join(xml_contents))
972
+ except Exception as e:
973
+ error = f"HWPX XML 추출 실패: {e}"
974
+ else:
975
+ output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
976
  ext = ".xml"
977
 
978
  if not output_path:
 
1021
  with gr.Blocks(title="AI 문서 어시스턴트") as demo:
1022
  session_state = gr.State("")
1023
 
1024
+ gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP/HWPX 문서 변환")
1025
 
1026
  with gr.Tabs():
1027
  with gr.Tab("💬 AI 채팅"):
 
1032
  groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
1033
  fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
1034
 
1035
+ gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT\n- 한글: HWP, HWPX ✨")
1036
  new_btn = gr.Button("🆕 새 대화", variant="primary")
1037
 
1038
  with gr.Accordion("📜 기록", open=False):
 
1051
  clear_btn = gr.Button("🗑️ 지우기", scale=1)
1052
 
1053
  with gr.Tab("📄 HWP 변환기"):
1054
+ gr.Markdown("### HWP/HWPX 파일 변환기")
1055
  with gr.Row():
1056
  with gr.Column():
1057
+ hwp_input = gr.File(label="HWP/HWPX 파일", file_types=[".hwp", ".hwpx"], elem_classes=["upload-box"])
1058
  format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
1059
  convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
1060
  with gr.Column():
 
1063
 
1064
  with gr.Accordion("📋 미리보기", open=False):
1065
  preview_out = gr.Textbox(lines=15, interactive=False)
1066
+
1067
+ gr.Markdown("""
1068
+ > **참고**: HWPX 파일은 TXT, Markdown, XML 변환만 지원됩니다.
1069
+ """)
1070
 
1071
  # 이벤트
1072
  def on_submit(msg, hist, f, sid, gk, fk):
 
1074
  for r in chat_response(msg, hist, f, sid, gk, fk):
1075
  yield r[0], r[1], "", None
1076
 
1077
+ submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
1078
  [chatbot, session_state, msg_input, file_upload])
1079
  msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
1080
  [chatbot, session_state, msg_input, file_upload])