seawolf2357 commited on
Commit
46e1b25
·
verified ·
1 Parent(s): 021ef20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +422 -1052
app.py CHANGED
@@ -9,16 +9,16 @@ import os
9
  import subprocess
10
  import shutil
11
  import sys
12
- import zipfile
13
  import re
14
  import json
15
  import uuid
16
  import sqlite3
17
  import base64
18
  import requests
 
19
  from pathlib import Path
20
  from datetime import datetime
21
- from typing import Generator, List, Dict, Any, Optional
22
 
23
  # ============== 환경 설정 ==============
24
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -30,20 +30,13 @@ if os.path.exists(PYHWP_PATH):
30
  print(f"Added local pyhwp path: {PYHWP_PATH}")
31
 
32
  # ============== 모듈 임포트 ==============
33
- try:
34
- from hwp5.filestructure import Hwp5File
35
- PYHWP_AVAILABLE = True
36
- print("pyhwp modules loaded successfully")
37
- except ImportError as e:
38
- PYHWP_AVAILABLE = False
39
- print(f"Warning: Could not import pyhwp modules: {e}")
40
-
41
  try:
42
  import olefile
43
  OLEFILE_AVAILABLE = True
44
  print("olefile loaded successfully")
45
  except ImportError:
46
  OLEFILE_AVAILABLE = False
 
47
 
48
  try:
49
  from markdownify import markdownify as md
@@ -51,7 +44,6 @@ try:
51
  print("markdownify loaded successfully")
52
  except ImportError:
53
  MARKDOWNIFY_AVAILABLE = False
54
- print("markdownify not available")
55
 
56
  try:
57
  import html2text
@@ -59,7 +51,6 @@ try:
59
  print("html2text loaded successfully")
60
  except ImportError:
61
  HTML2TEXT_AVAILABLE = False
62
- print("html2text not available")
63
 
64
  try:
65
  from bs4 import BeautifulSoup
@@ -89,7 +80,6 @@ FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
89
  def init_database():
90
  conn = sqlite3.connect(DB_PATH)
91
  cursor = conn.cursor()
92
-
93
  cursor.execute('''
94
  CREATE TABLE IF NOT EXISTS sessions (
95
  session_id TEXT PRIMARY KEY,
@@ -98,7 +88,6 @@ def init_database():
98
  title TEXT
99
  )
100
  ''')
101
-
102
  cursor.execute('''
103
  CREATE TABLE IF NOT EXISTS messages (
104
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -110,7 +99,6 @@ def init_database():
110
  FOREIGN KEY (session_id) REFERENCES sessions(session_id)
111
  )
112
  ''')
113
-
114
  conn.commit()
115
  conn.close()
116
 
@@ -145,58 +133,36 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
145
  cursor = conn.cursor()
146
  cursor.execute(
147
  """SELECT role, content, file_info, created_at
148
- FROM messages
149
- WHERE session_id = ?
150
- ORDER BY created_at DESC
151
- LIMIT ?""",
152
  (session_id, limit)
153
  )
154
  rows = cursor.fetchall()
155
  conn.close()
156
-
157
- messages = []
158
- for row in reversed(rows):
159
- messages.append({
160
- "role": row[0],
161
- "content": row[1],
162
- "file_info": row[2],
163
- "created_at": row[3]
164
- })
165
- return messages
166
 
167
  def get_all_sessions() -> List[Dict]:
168
  conn = sqlite3.connect(DB_PATH)
169
  cursor = conn.cursor()
170
  cursor.execute(
171
- """SELECT session_id, title, created_at, updated_at
172
- FROM sessions
173
- ORDER BY updated_at DESC
174
- LIMIT 50"""
175
  )
176
  rows = cursor.fetchall()
177
  conn.close()
178
-
179
- return [
180
- {"session_id": row[0], "title": row[1], "created_at": row[2], "updated_at": row[3]}
181
- for row in rows
182
- ]
183
 
184
  def update_session_title(session_id: str, title: str):
185
  conn = sqlite3.connect(DB_PATH)
186
  cursor = conn.cursor()
187
- cursor.execute(
188
- "UPDATE sessions SET title = ? WHERE session_id = ?",
189
- (title, session_id)
190
- )
191
  conn.commit()
192
  conn.close()
193
 
194
  init_database()
195
 
196
- # ============== 파일 함수들 ==============
197
  def extract_text_from_pdf(file_path: str) -> str:
198
  text_parts = []
199
-
200
  if PDFPLUMBER_AVAILABLE:
201
  try:
202
  with pdfplumber.open(file_path) as pdf:
@@ -221,19 +187,15 @@ def extract_text_from_pdf(file_path: str) -> str:
221
  return "\n\n".join(text_parts)
222
  except Exception as e:
223
  print(f"PyPDF2 error: {e}")
224
-
225
  return None
226
 
227
  def extract_text_from_txt(file_path: str) -> str:
228
- encodings = ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']
229
-
230
- for encoding in encodings:
231
  try:
232
  with open(file_path, 'r', encoding=encoding) as f:
233
  return f.read()
234
- except (UnicodeDecodeError, UnicodeError):
235
  continue
236
-
237
  return None
238
 
239
  def image_to_base64(file_path: str) -> str:
@@ -242,332 +204,234 @@ def image_to_base64(file_path: str) -> str:
242
 
243
  def get_image_mime_type(file_path: str) -> str:
244
  ext = Path(file_path).suffix.lower()
245
- mime_types = {
246
- '.jpg': 'image/jpeg',
247
- '.jpeg': 'image/jpeg',
248
- '.png': 'image/png',
249
- '.gif': 'image/gif',
250
- '.webp': 'image/webp',
251
- '.bmp': 'image/bmp'
252
- }
253
- return mime_types.get(ext, 'image/jpeg')
254
 
255
- def is_image_file(file_path: str) -> bool:
256
- ext = Path(file_path).suffix.lower()
257
- return ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
258
 
259
- def is_hwp_file(file_path: str) -> bool:
260
- ext = Path(file_path).suffix.lower()
261
- return ext in ['.hwp', '.hwpx']
262
 
263
- def is_pdf_file(file_path: str) -> bool:
264
- return Path(file_path).suffix.lower() == '.pdf'
265
 
266
- def is_text_file(file_path: str) -> bool:
267
- ext = Path(file_path).suffix.lower()
268
- return ext in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
269
 
270
- # ============== HWP 텍스트 추출 함수들 (개선됨) ==============
271
 
272
- def extract_text_with_pyhwp(file_path: str) -> tuple:
273
- """pyhwp 라이브러리를 사용한 텍스트 추출 (가장 정확)"""
274
- if not PYHWP_AVAILABLE:
275
- return None, "pyhwp 모듈이 없습니다."
276
-
277
  try:
278
- # 방법 1: hwp5txt 모듈 직접 사용
 
279
  try:
280
- from hwp5.hwp5txt import TextExtractor
281
- from hwp5.dataio import ParseError
282
-
283
- hwp5file = Hwp5File(file_path)
284
- text_extractor = TextExtractor()
285
-
286
- text_parts = []
287
- for section_idx in range(len(hwp5file.bodytext)):
288
- section = hwp5file.bodytext[section_idx]
289
- text = text_extractor.extract(section)
290
- if text:
291
- text_parts.append(text)
292
-
293
- hwp5file.close()
294
-
295
- if text_parts:
296
- result = '\n\n'.join(text_parts)
297
- print(f"[pyhwp TextExtractor] 추출 성공: {len(result)} chars")
298
- return result, None
299
- except Exception as e:
300
- print(f"[pyhwp TextExtractor] 실패: {e}")
301
-
302
- # 방법 2: hwp5proc txt 직접 호출
303
- try:
304
- from hwp5 import plat
305
- from hwp5.hwp5txt import extract_text
306
-
307
- hwp5file = Hwp5File(file_path)
308
-
309
- text_parts = []
310
- for section in hwp5file.bodytext:
311
- paragraphs = extract_text(section)
312
- for para in paragraphs:
313
- if para.strip():
314
- text_parts.append(para.strip())
315
-
316
- hwp5file.close()
317
-
318
- if text_parts:
319
- result = '\n'.join(text_parts)
320
- print(f"[pyhwp extract_text] 추출 성공: {len(result)} chars")
321
- return result, None
322
- except Exception as e:
323
- print(f"[pyhwp extract_text] 실패: {e}")
324
 
325
- # 방법 3: XML 변환 후 텍스트 추출
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  try:
327
- from hwp5.xmlmodel import Hwp5File as XmlHwp5File
328
- import io
329
- from lxml import etree
330
-
331
- hwp5file = XmlHwp5File(file_path)
332
 
333
- # XML로 변환
334
- xml_buffer = io.BytesIO()
335
- from hwp5.xmldump import xmldump_flat
336
- xmldump_flat(hwp5file, xml_buffer)
337
- xml_buffer.seek(0)
338
 
339
- # XML에서 텍스트 추출
340
- tree = etree.parse(xml_buffer)
 
 
 
 
341
 
342
- # 모든 텍스트 노드 추출
343
- text_parts = []
344
- for elem in tree.iter():
345
- if elem.text and elem.text.strip():
346
- text_parts.append(elem.text.strip())
347
- if elem.tail and elem.tail.strip():
348
- text_parts.append(elem.tail.strip())
349
 
350
- hwp5file.close()
 
351
 
352
- if text_parts:
353
- result = '\n'.join(text_parts)
354
- print(f"[pyhwp XML] 추출 성공: {len(result)} chars")
355
- return result, None
 
 
 
356
  except Exception as e:
357
- print(f"[pyhwp XML] 실패: {e}")
358
-
359
- return None, "pyhwp로 텍스트 추출에 실패했습니다."
360
-
361
- except Exception as e:
362
- return None, f"pyhwp 오류: {str(e)}"
363
-
364
-
365
- def extract_text_with_hwp5txt_command(file_path: str) -> tuple:
366
- """hwp5txt 명령어로 텍스트 추출"""
367
- try:
368
- # python -m hwp5 txt 실행
369
- result = subprocess.run(
370
- [sys.executable, '-m', 'hwp5', 'txt', file_path],
371
- capture_output=True,
372
- timeout=60
373
- )
374
-
375
- if result.returncode == 0:
376
- # stdout을 여러 인코딩으로 디코딩 시도
377
- for encoding in ['utf-8', 'euc-kr', 'cp949']:
378
- try:
379
- text = result.stdout.decode(encoding)
380
- if text.strip():
381
- print(f"[hwp5txt command] 추출 성공: {len(text)} chars")
382
- return text.strip(), None
383
- except:
384
- continue
385
-
386
- # stderr 확인
387
- if result.stderr:
388
- print(f"[hwp5txt command] stderr: {result.stderr.decode('utf-8', errors='ignore')[:200]}")
389
-
390
- except subprocess.TimeoutExpired:
391
- print("[hwp5txt command] 타임아웃")
392
- except Exception as e:
393
- print(f"[hwp5txt command] 오류: {e}")
394
 
395
- return None, "hwp5txt 명령 실패"
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
- def extract_text_with_olefile_improved(file_path: str) -> tuple:
399
- """olefile을 사용한 개선된 텍스트 추출"""
400
  if not OLEFILE_AVAILABLE:
401
- return None, "olefile 모듈습니다."
402
 
403
  try:
404
  ole = olefile.OleFileIO(file_path)
405
 
406
- # HWP 파일 구조 확인
407
- print(f"[olefile] OLE 스트림 목록: {ole.listdir()}")
 
 
408
 
409
- text_parts = []
 
 
 
410
 
411
- # BodyText 섹션에서 텍스트 추출
412
- for entry in ole.listdir():
413
- entry_path = '/'.join(entry)
414
-
415
- # BodyText/Section 스트림 찾기
416
- if 'BodyText' in entry_path or 'Section' in entry_path:
417
- try:
418
- stream = ole.openstream(entry)
419
- data = stream.read()
420
-
421
- print(f"[olefile] 스트림 {entry_path}: {len(data)} bytes")
422
-
423
- # HWP5 레코드 파싱 시도
424
- extracted = parse_hwp_bodytext(data)
425
- if extracted:
426
- text_parts.append(extracted)
427
- continue
428
-
429
- # 단순 UTF-16 디코딩 (fallback)
430
- for encoding in ['utf-16-le', 'utf-16-be', 'utf-8', 'euc-kr', 'cp949']:
431
- try:
432
- text = data.decode(encoding, errors='ignore')
433
- # 제어 문자 제거 및 정리
434
- cleaned = clean_extracted_text(text)
435
- if cleaned and len(cleaned) > 10:
436
- text_parts.append(cleaned)
437
- break
438
- except:
439
- continue
440
- except Exception as e:
441
- print(f"[olefile] 스트림 읽기 오류 {entry_path}: {e}")
442
- continue
443
 
444
  ole.close()
445
 
446
- if text_parts:
447
- result = '\n\n'.join(text_parts)
448
- print(f"[olefile] 최종 추출: {len(result)} chars")
449
- return result, None
450
- else:
451
- return None, "텍스트를 추출할 수 없습니다."
452
-
453
  except Exception as e:
454
- return None, f"OLE 파일 처리 오류: {str(e)}"
455
-
456
 
457
- def parse_hwp_bodytext(data: bytes) -> str:
458
- """HWP5 BodyText 레코드에서 텍스트 추출"""
459
  try:
460
- # HWP5 레코드 구조: 태그(4바이트) + 데이터
461
- # 텍스트는 HWPTAG_PARA_TEXT (0x4A) 레코드에 저장됨
462
-
463
- text_parts = []
464
- offset = 0
465
 
466
- while offset < len(data) - 4:
467
- # 헤더 읽기 (4바이트)
468
- header = int.from_bytes(data[offset:offset+4], 'little')
469
-
470
- tag_id = header & 0x3FF # 하위 10비트
471
- level = (header >> 10) & 0x3FF # 다음 10비트
472
- size = (header >> 20) & 0xFFF # 상위 12비트
473
-
474
- # 확장 크기 처리
475
- if size == 0xFFF:
476
- if offset + 8 > len(data):
477
- break
478
- size = int.from_bytes(data[offset+4:offset+8], 'little')
479
- offset += 4
480
-
481
- offset += 4
482
-
483
- if offset + size > len(data):
484
- break
485
-
486
- # HWPTAG_PARA_TEXT (0x4A = 74)
487
- if tag_id == 67: # PARA_TEXT
488
- record_data = data[offset:offset+size]
489
- # UTF-16LE로 디코딩
490
  try:
491
- text = record_data.decode('utf-16-le', errors='ignore')
492
- # 제어 문자 필터링
493
- cleaned = ''.join(c for c in text if c.isprintable() or c in '\n\r\t ')
494
- if cleaned.strip():
495
- text_parts.append(cleaned.strip())
496
  except:
497
- pass
498
-
499
- offset += size
500
-
501
- if text_parts:
502
- return '\n'.join(text_parts)
503
 
504
- return None
 
505
 
 
 
506
  except Exception as e:
507
- print(f"[parse_hwp_bodytext] 오류: {e}")
508
- return None
509
-
510
-
511
- def clean_extracted_text(text: str) -> str:
512
- """추출된 텍스트 정리"""
513
- if not text:
514
- return ""
515
-
516
- # NULL 문자 제거
517
- text = text.replace('\x00', '')
518
-
519
- # 제어 문자 제거 (탭, 줄바꿈 제외)
520
- cleaned = ''.join(
521
- c for c in text
522
- if c.isprintable() or c in '\n\r\t '
523
- )
524
-
525
- # 연속된 공백 정리
526
- cleaned = re.sub(r'[ \t]+', ' ', cleaned)
527
- cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
528
-
529
- # 앞뒤 공백 제거
530
- cleaned = cleaned.strip()
531
-
532
- # 너무 짧으면 무시
533
- if len(cleaned) < 10:
534
- return ""
535
-
536
- return cleaned
537
-
538
 
539
  def extract_text_from_hwp(file_path: str) -> tuple:
540
- """HWP 파일에서 텍스트 추출 (다중 방법 시도)"""
541
-
542
- errors = []
543
 
544
  # 방법 1: hwp5txt 명령어 (가장 안정적)
545
- print("[HWP 추출] 방법 1: hwp5txt 명령어 시도...")
546
- text, error = extract_text_with_hwp5txt_command(file_path)
547
- if text and len(text.strip()) > 20:
 
548
  return text, None
549
- if error:
550
- errors.append(f"hwp5txt: {error}")
551
 
552
- # 방법 2: pyhwp API
553
- print("[HWP 추출] 방법 2: pyhwp API 시도...")
554
- text, error = extract_text_with_pyhwp(file_path)
555
- if text and len(text.strip()) > 20:
 
556
  return text, None
557
- if error:
558
- errors.append(f"pyhwp: {error}")
559
 
560
- # 방법 3: olefile (fallback)
561
- print("[HWP 추출] 방법 3: olefile 시도...")
562
- text, error = extract_text_with_olefile_improved(file_path)
563
- if text and len(text.strip()) > 20:
564
- return text, None
565
- if error:
566
- errors.append(f"olefile: {error}")
567
-
568
- # 모든 방법 실패
569
- return None, f"텍스트 추출 실패: {'; '.join(errors)}"
570
 
 
571
 
572
  def check_hwp_version(file_path):
573
  try:
@@ -582,25 +446,9 @@ def check_hwp_version(file_path):
582
  except Exception as e:
583
  return f"Error: {e}", False
584
 
585
-
586
- def get_hwp5_command_paths(command_name):
587
- paths = [
588
- command_name,
589
- os.path.join(os.path.dirname(sys.executable), command_name),
590
- os.path.join(SCRIPT_DIR, 'bin', command_name),
591
- os.path.join(PYHWP_PATH, 'bin', command_name),
592
- ]
593
-
594
- if sys.platform == 'win32':
595
- paths.append(os.path.join(os.path.dirname(sys.executable), 'Scripts', command_name))
596
- paths.append(os.path.join(os.path.dirname(sys.executable), 'Scripts', f'{command_name}.exe'))
597
-
598
- return paths
599
-
600
-
601
  def convert_to_html_subprocess(input_path, output_dir):
602
- output_name = "output.html"
603
- output_path = os.path.join(output_dir, output_name)
604
 
605
  try:
606
  result = subprocess.run(
@@ -611,288 +459,135 @@ def convert_to_html_subprocess(input_path, output_dir):
611
  )
612
 
613
  if result.returncode == 0:
 
614
  if os.path.isfile(output_path):
615
  return output_path, None
616
  if os.path.isdir(output_path):
617
  return output_path, None
618
 
 
619
  for item in os.listdir(output_dir):
620
  item_path = os.path.join(output_dir, item)
621
- if item.lower().endswith(('.html', '.htm', '.xhtml')) and os.path.isfile(item_path):
622
  return item_path, None
623
  if os.path.isdir(item_path):
624
- for sub_item in os.listdir(item_path):
625
- if sub_item.lower().endswith(('.html', '.htm', '.xhtml')):
626
  return item_path, None
627
-
628
  return output_dir, None
 
 
 
629
  except Exception as e:
630
- print(f"python -m hwp5 html 오류: {e}")
631
-
632
- return None, "hwp5html 변환 실패"
633
-
634
-
635
- def convert_to_txt_subprocess(input_path, output_dir):
636
- output_path = os.path.join(output_dir, "output.txt")
637
-
638
- try:
639
- result = subprocess.run(
640
- [sys.executable, '-m', 'hwp5', 'txt', input_path],
641
- capture_output=True,
642
- timeout=120
643
- )
644
-
645
- if result.returncode == 0 and result.stdout:
646
- # 인코딩 감지
647
- for encoding in ['utf-8', 'euc-kr', 'cp949']:
648
- try:
649
- text = result.stdout.decode(encoding)
650
- if text.strip():
651
- with open(output_path, 'w', encoding='utf-8') as f:
652
- f.write(text)
653
- return output_path, None
654
- except:
655
- continue
656
- except Exception as e:
657
- print(f"python -m hwp5 txt 오류: {e}")
658
 
659
- return None, "hwp5txt 변환 실패"
660
-
661
-
662
- # ============== HTML/Markdown 변환 ==============
663
- def html_to_markdown_with_markdownify(html_content):
664
- try:
665
- markdown_content = md(
666
- html_content,
667
- heading_style="ATX",
668
- bullets="-",
669
- strip=['script', 'style', 'meta', 'link'],
670
- code_language="",
671
- escape_asterisks=False,
672
- escape_underscores=False,
673
- )
674
- return markdown_content, None
675
- except Exception as e:
676
- return None, f"markdownify 변환 오류: {str(e)}"
677
-
678
-
679
- def html_to_markdown_with_html2text(html_content):
680
- try:
681
- h = html2text.HTML2Text()
682
- h.ignore_links = False
683
- h.ignore_images = False
684
- h.ignore_tables = False
685
- h.body_width = 0
686
- h.unicode_snob = True
687
- h.skip_internal_links = True
688
- h.inline_links = True
689
- h.protect_links = True
690
- h.ignore_emphasis = False
691
-
692
- markdown_content = h.handle(html_content)
693
- return markdown_content, None
694
- except Exception as e:
695
- return None, f"html2text 변환 오류: {str(e)}"
696
-
697
-
698
- def html_to_markdown_simple(html_content):
699
- try:
700
- if BS4_AVAILABLE:
701
- soup = BeautifulSoup(html_content, 'html.parser')
702
- for tag in soup(['script', 'style', 'meta', 'link']):
703
- tag.decompose()
704
- text = str(soup)
705
- else:
706
- text = html_content
707
-
708
- conversions = [
709
- (r'<h1[^>]*>(.*?)</h1>', r'# \1\n'),
710
- (r'<h2[^>]*>(.*?)</h2>', r'## \1\n'),
711
- (r'<h3[^>]*>(.*?)</h3>', r'### \1\n'),
712
- (r'<h4[^>]*>(.*?)</h4>', r'#### \1\n'),
713
- (r'<h5[^>]*>(.*?)</h5>', r'##### \1\n'),
714
- (r'<h6[^>]*>(.*?)</h6>', r'###### \1\n'),
715
- (r'<strong[^>]*>(.*?)</strong>', r'**\1**'),
716
- (r'<b[^>]*>(.*?)</b>', r'**\1**'),
717
- (r'<em[^>]*>(.*?)</em>', r'*\1*'),
718
- (r'<i[^>]*>(.*?)</i>', r'*\1*'),
719
- (r'<code[^>]*>(.*?)</code>', r'`\1`'),
720
- (r'<a[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>', r'[\2](\1)'),
721
- (r'<li[^>]*>(.*?)</li>', r'- \1\n'),
722
- (r'<ul[^>]*>', ''),
723
- (r'</ul>', '\n'),
724
- (r'<ol[^>]*>', ''),
725
- (r'</ol>', '\n'),
726
- (r'<p[^>]*>(.*?)</p>', r'\1\n\n'),
727
- (r'<br\s*/?>', '\n'),
728
- (r'<hr\s*/?>', '\n---\n'),
729
- (r'<blockquote[^>]*>(.*?)</blockquote>', r'> \1\n'),
730
- (r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```\n'),
731
- (r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```\n'),
732
- (r'<div[^>]*>', ''),
733
- (r'</div>', '\n'),
734
- (r'<span[^>]*>', ''),
735
- (r'</span>', ''),
736
- (r'<[^>]+>', ''),
737
- (r'&nbsp;', ' '),
738
- (r'&lt;', '<'),
739
- (r'&gt;', '>'),
740
- (r'&amp;', '&'),
741
- (r'&quot;', '"'),
742
- (r'&#39;', "'"),
743
- ]
744
-
745
- for pattern, replacement in conversions:
746
- text = re.sub(pattern, replacement, text, flags=re.DOTALL | re.IGNORECASE)
747
-
748
- text = re.sub(r'\n{3,}', '\n\n', text)
749
- text = text.strip()
750
-
751
- return text, None
752
-
753
- except Exception as e:
754
- return None, f"기본 변환 오류: {str(e)}"
755
 
 
 
 
756
 
757
- def convert_html_to_markdown(html_content):
 
758
  if MARKDOWNIFY_AVAILABLE:
759
- result, error = html_to_markdown_with_markdownify(html_content)
760
- if result:
761
- return result, None
 
762
 
763
  if HTML2TEXT_AVAILABLE:
764
- result, error = html_to_markdown_with_html2text(html_content)
765
- if result:
766
- return result, None
 
 
 
767
 
768
- result, error = html_to_markdown_simple(html_content)
769
- if result:
770
- return result, None
 
 
 
 
771
 
772
- return None, "HTML → Markdown 변환 실패했습니다."
773
-
774
 
775
  def convert_hwp_to_markdown(input_path: str) -> tuple:
776
- """HWP 파일을 텍스트/마크다운으로 변환 (개선된 버전)"""
777
-
778
- print(f"[HWP→MD] 변환 시작: {input_path}")
779
-
780
- # 1단계: 직접 텍스트 추출 시도 (가장 빠르고 안정적)
781
  text, error = extract_text_from_hwp(input_path)
782
- if text and len(text.strip()) > 20:
783
- print(f"[HWP→MD] 텍스트 추출 성공: {len(text)} chars")
784
  return text, None
785
 
786
- print(f"[HWP→MD] 텍스트 추출 실패, HTML 변환 시도...")
787
-
788
- # 2단계: HTML 변환 후 마크다운 변환
789
  tmp_dir = tempfile.mkdtemp()
790
-
791
  try:
792
  html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793
 
794
- if html_output is None:
795
- return None, f"변환 실패: {error}"
796
-
797
- # HTML 파일 찾기
798
- html_contents = []
799
-
800
- def find_html_files(search_dir):
801
- files = []
802
- for root, dirs, filenames in os.walk(search_dir):
803
- for filename in filenames:
804
- if filename.lower().endswith(('.html', '.htm', '.xhtml')):
805
- files.append(os.path.join(root, filename))
806
- return sorted(files)
807
-
808
- if os.path.isfile(html_output) and html_output.lower().endswith(('.html', '.htm', '.xhtml')):
809
- html_files = [html_output]
810
- else:
811
- search_path = html_output if os.path.isdir(html_output) else tmp_dir
812
- html_files = find_html_files(search_path)
813
-
814
- for html_file in html_files:
815
- for encoding in ['utf-8', 'euc-kr', 'cp949', 'utf-16']:
816
- try:
817
- with open(html_file, 'r', encoding=encoding) as f:
818
- content = f.read()
819
- html_contents.append(content)
820
- break
821
- except:
822
- continue
823
-
824
- if not html_contents:
825
- return None, "HTML 파일을 찾을 수 없습니다."
826
-
827
- # HTML → Markdown 변환
828
- markdown_parts = []
829
- for html_content in html_contents:
830
- md_content, error = convert_html_to_markdown(html_content)
831
- if md_content and len(md_content.strip()) > 10:
832
- markdown_parts.append(md_content)
833
-
834
- if markdown_parts:
835
- result = "\n\n---\n\n".join(markdown_parts)
836
- print(f"[HWP→MD] HTML→MD 변환 성공: {len(result)} chars")
837
- return result, None
838
-
839
- return None, "Markdown 변환에 실패했습니다."
840
-
841
- except Exception as e:
842
- return None, f"변환 오류: {str(e)}"
843
  finally:
844
  shutil.rmtree(tmp_dir, ignore_errors=True)
845
 
 
846
 
847
- # ============== LLM API 함수들 ==============
848
  def call_groq_api_stream(messages: List[Dict], api_key: str) -> Generator[str, None, None]:
849
  if not api_key:
850
  yield "❌ Groq API 키가 설정되지 않았습니다."
851
  return
852
 
853
  try:
854
- url = "https://api.groq.com/openai/v1/chat/completions"
855
-
856
- headers = {
857
- "Authorization": f"Bearer {api_key}",
858
- "Content-Type": "application/json"
859
- }
860
-
861
- payload = {
862
- "model": "meta-llama/llama-4-scout-17b-16e-instruct",
863
- "messages": messages,
864
- "temperature": 0.7,
865
- "max_tokens": 8192,
866
- "top_p": 1,
867
- "stream": True
868
- }
869
-
870
- response = requests.post(url, headers=headers, json=payload, stream=True)
871
 
872
  if response.status_code != 200:
873
- yield f"❌ Groq API 오류: {response.status_code} - {response.text}"
874
  return
875
 
876
  for line in response.iter_lines():
877
  if line:
878
  line = line.decode('utf-8')
879
- if line.startswith('data: '):
880
- data = line[6:]
881
- if data == '[DONE]':
882
- break
883
  try:
884
- json_data = json.loads(data)
885
- if 'choices' in json_data and len(json_data['choices']) > 0:
886
- delta = json_data['choices'][0].get('delta', {})
887
- content = delta.get('content', '')
888
- if content:
889
- yield content
890
- except json.JSONDecodeError:
891
  continue
892
-
893
  except Exception as e:
894
- yield f"\n\nGroq API 오류: {str(e)}"
895
-
896
 
897
  def call_fireworks_api_stream(messages: List[Dict], image_base64: str, mime_type: str, api_key: str) -> Generator[str, None, None]:
898
  if not api_key:
@@ -900,150 +595,96 @@ def call_fireworks_api_stream(messages: List[Dict], image_base64: str, mime_type
900
  return
901
 
902
  try:
903
- url = "https://api.fireworks.ai/inference/v1/chat/completions"
904
-
905
- formatted_messages = []
906
- for msg in messages[:-1]:
907
- formatted_messages.append({
908
- "role": msg["role"],
909
- "content": msg["content"]
910
- })
911
-
912
- last_msg = messages[-1]
913
  formatted_messages.append({
914
- "role": last_msg["role"],
915
  "content": [
916
- {
917
- "type": "image_url",
918
- "image_url": {
919
- "url": f"data:{mime_type};base64,{image_base64}"
920
- }
921
- },
922
- {
923
- "type": "text",
924
- "text": last_msg["content"]
925
- }
926
  ]
927
  })
928
 
929
- payload = {
930
- "model": "accounts/fireworks/models/qwen3-vl-235b-a22b-thinking",
931
- "max_tokens": 4096,
932
- "top_p": 1,
933
- "top_k": 40,
934
- "presence_penalty": 0,
935
- "frequency_penalty": 0,
936
- "temperature": 0.6,
937
- "messages": formatted_messages,
938
- "stream": True
939
- }
940
-
941
- headers = {
942
- "Accept": "application/json",
943
- "Content-Type": "application/json",
944
- "Authorization": f"Bearer {api_key}"
945
- }
946
-
947
- response = requests.post(url, headers=headers, json=payload, stream=True)
948
 
949
  if response.status_code != 200:
950
- yield f"❌ Fireworks API 오류: {response.status_code} - {response.text}"
951
  return
952
 
953
  for line in response.iter_lines():
954
  if line:
955
  line = line.decode('utf-8')
956
- if line.startswith('data: '):
957
- data = line[6:]
958
- if data == '[DONE]':
959
- break
960
  try:
961
- json_data = json.loads(data)
962
- if 'choices' in json_data and len(json_data['choices']) > 0:
963
- delta = json_data['choices'][0].get('delta', {})
964
- content = delta.get('content', '')
965
- if content:
966
- yield content
967
- except json.JSONDecodeError:
968
  continue
969
-
970
  except Exception as e:
971
- yield f"\n\nFireworks API 오류: {str(e)}"
972
 
 
973
 
974
- # ============== 채팅 처리 함수 ==============
975
  def process_file(file_path: str) -> tuple:
976
- if file_path is None:
977
  return None, None, None
978
 
979
  filename = os.path.basename(file_path)
980
 
981
  if is_image_file(file_path):
982
- base64_data = image_to_base64(file_path)
983
- mime_type = get_image_mime_type(file_path)
984
- return "image", base64_data, mime_type
985
 
986
  if is_hwp_file(file_path):
987
- print(f"[process_file] HWP 파일 처리: {filename}")
988
- markdown_content, error = convert_hwp_to_markdown(file_path)
989
- if markdown_content and len(markdown_content.strip()) > 20:
990
- # 컨텐츠가 너무 길면 요약 정보 추가
991
- content_preview = markdown_content[:500] + "..." if len(markdown_content) > 500 else markdown_content
992
- print(f"[process_file] HWP 변환 성공: {len(markdown_content)} chars")
993
- print(f"[process_file] 미리보기: {content_preview[:200]}")
994
- return "text", f"[HWP 문서: {filename}]\n\n{markdown_content}", None
995
- else:
996
- print(f"[process_file] HWP 변환 실패: {error}")
997
- return "error", f"HWP 변환 실패: {error}", None
998
 
999
  if is_pdf_file(file_path):
1000
  text = extract_text_from_pdf(file_path)
1001
  if text:
1002
  return "text", f"[PDF 문서: {filename}]\n\n{text}", None
1003
- else:
1004
- return "error", "PDF 텍스트 추출 실패", None
1005
 
1006
  if is_text_file(file_path):
1007
  text = extract_text_from_txt(file_path)
1008
  if text:
1009
  return "text", f"[텍스트 파일: {filename}]\n\n{text}", None
1010
- else:
1011
- return "error", "텍스트 파일 읽기 실패", None
1012
 
1013
- return "unsupported", f"지원하지 않는 파일 형식: {filename}", None
1014
 
1015
-
1016
- def chat_response(
1017
- message: str,
1018
- history: List[Dict],
1019
- file: Optional[str],
1020
- session_id: str,
1021
- groq_api_key: str,
1022
- fireworks_api_key: str
1023
- ) -> Generator[tuple, None, None]:
1024
- """채팅 응답 생성"""
1025
-
1026
  if history is None:
1027
  history = []
1028
 
1029
- if not message.strip() and file is None:
1030
  yield history, session_id
1031
  return
1032
 
1033
  if not session_id:
1034
  session_id = create_session()
1035
 
1036
- file_type = None
1037
- file_content = None
1038
- file_mime = None
1039
  file_info = None
1040
 
1041
- if file is not None:
1042
  file_type, file_content, file_mime = process_file(file)
1043
- file_info = json.dumps({
1044
- "type": file_type,
1045
- "filename": os.path.basename(file) if file else None
1046
- })
1047
 
1048
  if file_type == "error":
1049
  history = history + [
@@ -1060,184 +701,102 @@ def chat_response(
1060
  yield history, session_id
1061
  return
1062
 
1063
- # 사용자 메시지 구성
1064
- user_display_message = message
1065
- if file is not None:
1066
  filename = os.path.basename(file)
1067
- user_display_message = f"📎 {filename}\n\n{message}" if message else f"📎 {filename}"
1068
 
1069
- history = history + [
1070
- {"role": "user", "content": user_display_message},
1071
- {"role": "assistant", "content": ""}
1072
- ]
1073
  yield history, session_id
1074
 
1075
- # DB에서 이전 대화
1076
- db_messages = get_session_messages(session_id, limit=10)
1077
-
1078
  # API 메시지 구성
1079
- api_messages = []
1080
-
1081
- api_messages.append({
1082
  "role": "system",
1083
- "content": """당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 사용자의 질문에 정확하고 유 답변을 제공합니다.
1084
-
1085
- 파일이 첨부된 경우:
1086
- - 문서 내용을 주의 깊게 분석하세요
1087
- - 문서에서 중요한 정보를 추출하고 요약해주세요
1088
- - 사용자의 질문에 문서 내용을 기반으로 답변하세요
1089
- - 문서에 없는 내용은 추측하지 말고 문서에 기반한 답변을 하세요"""
1090
- })
1091
 
1092
- for db_msg in db_messages:
1093
- api_messages.append({
1094
- "role": db_msg["role"],
1095
- "content": db_msg["content"]
1096
- })
1097
 
1098
- # 현재 메시지 구성
1099
  current_content = message or ""
1100
  if file_type == "text" and file_content:
1101
- if message:
1102
- current_content = f"{file_content}\n\n---\n\n사용자 질문: {message}"
1103
- else:
1104
- current_content = f"{file_content}\n\n---\n\n위 문서의 내용을 요약해주세요."
1105
 
1106
- api_messages.append({
1107
- "role": "user",
1108
- "content": current_content
1109
- })
1110
 
 
1111
  full_response = ""
1112
-
1113
  if file_type == "image":
1114
- for chunk in call_fireworks_api_stream(api_messages, file_content, file_mime, fireworks_api_key):
1115
  full_response += chunk
1116
  history[-1] = {"role": "assistant", "content": full_response}
1117
  yield history, session_id
1118
  else:
1119
- for chunk in call_groq_api_stream(api_messages, groq_api_key):
1120
  full_response += chunk
1121
  history[-1] = {"role": "assistant", "content": full_response}
1122
  yield history, session_id
1123
 
1124
- # DB에 저장
1125
  save_message(session_id, "user", current_content, file_info)
1126
  save_message(session_id, "assistant", full_response)
1127
 
1128
  if len(db_messages) == 0 and message:
1129
- title = message[:50] + "..." if len(message) > 50 else message
1130
- update_session_title(session_id, title)
1131
-
1132
 
1133
  def new_chat():
1134
- session_id = create_session()
1135
- return [], session_id, None
1136
-
1137
 
1138
  def load_session(session_id: str) -> tuple:
1139
  if not session_id:
1140
  return [], ""
1141
-
1142
  messages = get_session_messages(session_id, limit=50)
1143
- history = []
1144
- for msg in messages:
1145
- history.append({"role": msg["role"], "content": msg["content"]})
1146
-
1147
- return history, session_id
1148
 
 
1149
 
1150
- # ============== HWP 변환기 함수 ==============
1151
  def convert_to_odt_subprocess(input_path, output_dir):
1152
  output_path = os.path.join(output_dir, "output.odt")
1153
-
1154
  try:
1155
  result = subprocess.run(
1156
  [sys.executable, '-m', 'hwp5', 'odt', '--output', output_path, input_path],
1157
- capture_output=True,
1158
- text=True,
1159
- timeout=120
1160
  )
1161
-
1162
  if result.returncode == 0 and os.path.exists(output_path):
1163
  return output_path, None
1164
- except Exception as e:
1165
- print(f"python -m hwp5 odt 오류: {e}")
1166
-
1167
- return None, "hwp5odt 변환 실패"
1168
-
1169
 
1170
  def convert_to_xml_subprocess(input_path, output_dir):
1171
  output_path = os.path.join(output_dir, "output.xml")
1172
-
1173
- if PYHWP_AVAILABLE:
1174
- try:
1175
- from hwp5.xmlmodel import Hwp5File as XmlHwp5File
1176
- from hwp5.xmldump import xmldump_flat
1177
-
1178
- hwp5file = XmlHwp5File(input_path)
1179
-
1180
- with open(output_path, 'wb') as f:
1181
- xmldump_flat(hwp5file, f)
1182
-
1183
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
1184
- return output_path, None
1185
- except Exception as e:
1186
- print(f"xmldump_flat 오류: {e}")
1187
-
1188
  try:
1189
  result = subprocess.run(
1190
  [sys.executable, '-m', 'hwp5', 'xml', input_path],
1191
- capture_output=True,
1192
- timeout=120
1193
  )
1194
-
1195
  if result.returncode == 0 and result.stdout:
1196
  with open(output_path, 'wb') as f:
1197
  f.write(result.stdout)
1198
  return output_path, None
1199
- except Exception as e:
1200
- print(f"python -m hwp5 xml 오류: {e}")
1201
-
1202
- return None, "hwp5xml 변환 실패"
1203
-
1204
-
1205
- def convert_to_markdown_file(input_path, output_dir, progress_callback=None):
1206
- if progress_callback:
1207
- progress_callback(0.2, "변환 중...")
1208
-
1209
- markdown_content, error = convert_hwp_to_markdown(input_path)
1210
-
1211
- if markdown_content is None:
1212
- return None, error
1213
-
1214
- if progress_callback:
1215
- progress_callback(0.8, "파일 저장 중...")
1216
-
1217
- output_path = os.path.join(output_dir, "output.md")
1218
- with open(output_path, 'w', encoding='utf-8') as f:
1219
- f.write(markdown_content)
1220
-
1221
- return output_path, None
1222
-
1223
 
1224
  def convert_hwp(file, output_format, progress=gr.Progress()):
1225
- if file is None:
1226
  return None, "❌ 파일을 업로드해주세요.", ""
1227
 
1228
- if hasattr(file, 'name'):
1229
- input_file = file.name
1230
- else:
1231
- input_file = str(file)
1232
-
1233
  if not input_file.lower().endswith('.hwp'):
1234
  return None, "❌ HWP 파일만 지원됩니다.", ""
1235
 
1236
  progress(0.1, desc="파일 분석 중...")
1237
-
1238
  version, is_valid = check_hwp_version(input_file)
1239
  if not is_valid:
1240
- return None, f"❌ 지원하지 않는 파일 형식입니다: {version}", ""
1241
 
1242
  tmp_dir = tempfile.mkdtemp()
1243
 
@@ -1248,347 +807,158 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
1248
 
1249
  progress(0.3, desc=f"{output_format}로 변환 중...")
1250
 
1251
- output_path = None
1252
- error = None
1253
- ext = ""
1254
 
1255
  if output_format == "HTML":
1256
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
1257
  ext = ".html"
1258
-
1259
  if output_path and os.path.isdir(output_path):
1260
- zip_base = os.path.join(tmp_dir, "html_output")
1261
- zip_path = shutil.make_archive(zip_base, 'zip', output_path)
1262
- output_path = zip_path
1263
- ext = ".zip"
1264
- elif output_path and output_path == tmp_dir:
1265
- html_files = []
1266
- for item in os.listdir(tmp_dir):
1267
- if item.lower().endswith(('.html', '.htm', '.xhtml')):
1268
- html_files.append(item)
1269
- elif os.path.isdir(os.path.join(tmp_dir, item)):
1270
- for sub in os.listdir(os.path.join(tmp_dir, item)):
1271
- if sub.lower().endswith(('.html', '.htm', '.xhtml')):
1272
- html_files.append(os.path.join(item, sub))
1273
 
1274
- if html_files:
1275
- zip_base = os.path.join(tmp_dir, "html_output")
1276
- zip_path = shutil.make_archive(zip_base, 'zip', tmp_dir)
1277
- output_path = zip_path
1278
- ext = ".zip"
1279
-
1280
  elif output_format == "ODT (OpenDocument)":
1281
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
1282
  ext = ".odt"
1283
 
1284
  elif output_format == "TXT (텍스트)":
1285
- # 개선된 텍스트 추출 사용
1286
  text, error = extract_text_from_hwp(input_path)
1287
  if text:
1288
  output_path = os.path.join(tmp_dir, "output.txt")
1289
  with open(output_path, 'w', encoding='utf-8') as f:
1290
  f.write(text)
1291
- error = None
1292
  ext = ".txt"
1293
 
1294
  elif output_format == "Markdown":
1295
- def progress_callback(val, desc):
1296
- progress(0.3 + val * 0.5, desc=desc)
1297
-
1298
- output_path, error = convert_to_markdown_file(input_path, tmp_dir, progress_callback)
 
1299
  ext = ".md"
1300
-
1301
  elif output_format == "XML":
1302
  output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
1303
  ext = ".xml"
1304
-
1305
- else:
1306
- return None, f"❌ 지원하지 않는 형식: {output_format}", ""
1307
 
1308
- if output_path is None:
1309
- error_msg = error or "변환 실패했습니다."
1310
- return None, f"❌ {error_msg}", ""
1311
-
1312
- if os.path.isdir(output_path):
1313
- zip_base = os.path.join(tmp_dir, "output_archive")
1314
- zip_path = shutil.make_archive(zip_base, 'zip', output_path)
1315
- output_path = zip_path
1316
- ext = ".zip"
1317
 
1318
  if not os.path.exists(output_path):
1319
  return None, "❌ 변환된 파일을 찾을 수 없습니다.", ""
1320
 
1321
- progress(0.8, desc="파일 준비 중...")
1322
 
1323
  base_name = Path(input_filename).stem
1324
- final_filename = f"{base_name}{ext}"
1325
- final_output = os.path.join(tmp_dir, final_filename)
1326
-
1327
- if output_path != final_output and os.path.isfile(output_path):
1328
  shutil.copy2(output_path, final_output)
1329
- elif output_path == final_output:
1330
- pass
1331
- else:
1332
- final_output = output_path
1333
 
1334
  file_size = os.path.getsize(final_output)
1335
- size_str = f"{file_size / 1024:.1f} KB" if file_size > 1024 else f"{file_size} bytes"
1336
-
1337
- progress(1.0, desc="완료!")
1338
 
1339
  preview = ""
1340
-
1341
- if ext in ['.txt', '.md', '.xml'] and os.path.isfile(final_output):
1342
  try:
1343
  with open(final_output, 'r', encoding='utf-8', errors='ignore') as f:
1344
  preview = f.read(5000)
1345
  if len(preview) >= 5000:
1346
- preview += "\n\n... (미리보기 생략)"
1347
  except:
1348
  pass
1349
  elif ext == '.zip':
1350
- preview = "📦 HTML 변환 결과가 ZIP 파일로 압축되었습니다.\n다운로드 후 압축을 풀어서 HTML 파일을 확인하세요."
1351
 
1352
- return final_output, f"✅ 변환 완료: {final_filename} ({size_str})", preview
 
1353
 
1354
  except Exception as e:
1355
  import traceback
1356
  traceback.print_exc()
1357
- return None, f"❌ 오류 발생: {str(e)}", ""
1358
 
 
1359
 
1360
- # ============== CSS 스타일 ==============
1361
  css = """
1362
- .upload-box {
1363
- border: 2px dashed #6366f1 !important;
1364
- border-radius: 12px !important;
1365
- }
1366
- .download-box {
1367
- border: 2px solid #22c55e !important;
1368
- border-radius: 12px !important;
1369
- }
1370
- .preview-box {
1371
- max-height: 400px;
1372
- overflow-y: auto;
1373
- font-family: monospace;
1374
- white-space: pre-wrap;
1375
- background: #f8fafc;
1376
- padding: 16px;
1377
- border-radius: 8px;
1378
- }
1379
  """
1380
 
1381
- # ============== Gradio 인터페이스 ==============
1382
  with gr.Blocks(title="AI 문서 어시스턴트") as demo:
1383
-
1384
  session_state = gr.State("")
1385
 
1386
- gr.Markdown("""
1387
- # 🤖 AI 문서 어시스턴트
1388
- LLM 채팅 + HWP 문서 변환 통합 도구
1389
- """)
1390
 
1391
  with gr.Tabs():
1392
- # Tab 1: LLM 채팅
1393
- with gr.Tab("💬 AI 채팅", id="chat"):
1394
  with gr.Row():
1395
  with gr.Column(scale=1):
1396
  gr.Markdown("### ⚙️ 설정")
 
 
 
1397
 
1398
- with gr.Accordion("🔑 API 설정", open=True):
1399
- groq_key_input = gr.Textbox(
1400
- label="Groq API Key",
1401
- type="password",
1402
- placeholder="gsk_...",
1403
- value=GROQ_API_KEY
1404
- )
1405
- fireworks_key_input = gr.Textbox(
1406
- label="Fireworks API Key",
1407
- type="password",
1408
- placeholder="fw_...",
1409
- value=FIREWORKS_API_KEY
1410
- )
1411
-
1412
- gr.Markdown("### 📁 지원 파일")
1413
- gr.Markdown("""
1414
- - **이미지**: JPG, PNG, GIF, WebP
1415
- - **문서**: PDF, TXT, MD
1416
- - **한글**: HWP, HWPX ✨
1417
-
1418
- > HWP/HWPX 파일은 자동으로 텍스트 추출됩니다.
1419
- """)
1420
 
1421
- new_chat_btn = gr.Button("🆕 새 대화", variant="primary")
1422
-
1423
- with gr.Accordion("📜 대화 기록", open=False):
1424
- session_list = gr.Dataframe(
1425
- headers=["세션 ID", "제목", "업데이트"],
1426
- datatype=["str", "str", "str"],
1427
- interactive=False,
1428
- wrap=True
1429
- )
1430
- refresh_sessions_btn = gr.Button("🔄 새로고침", size="sm")
1431
 
1432
  with gr.Column(scale=3):
1433
- chatbot = gr.Chatbot(
1434
- label="대화",
1435
- height=500
1436
- )
1437
 
1438
  with gr.Row():
1439
- file_upload = gr.File(
1440
- label="📎 파일 첨부",
1441
- file_types=[".jpg", ".jpeg", ".png", ".gif", ".webp", ".pdf", ".txt", ".md", ".hwp", ".hwpx"],
1442
- file_count="single",
1443
- scale=1
1444
- )
1445
-
1446
- with gr.Column(scale=4):
1447
- msg_input = gr.Textbox(
1448
- label="메시지",
1449
- placeholder="메시지를 입력하세요... (Shift+Enter: 줄바꿈)",
1450
- lines=2,
1451
- max_lines=5,
1452
- show_label=False
1453
- )
1454
 
1455
  with gr.Row():
1456
  submit_btn = gr.Button("📤 전송", variant="primary", scale=3)
1457
  clear_btn = gr.Button("🗑️ 지우기", scale=1)
1458
 
1459
- # Tab 2: HWP 변환기
1460
- with gr.Tab("📄 HWP 변환기", id="converter"):
1461
- gr.Markdown("""
1462
- ### HWP 파일 변환기
1463
- 한글(HWP) 문서를 다양한 형식으로 변환합니다.
1464
- """)
1465
-
1466
  with gr.Row():
1467
- with gr.Column(scale=1):
1468
- gr.Markdown("#### 📤 파일 업로드")
1469
- hwp_file_input = gr.File(
1470
- label="HWP 파일 선택",
1471
- file_types=[".hwp"],
1472
- type="filepath",
1473
- elem_classes=["upload-box"]
1474
- )
1475
-
1476
- format_select = gr.Radio(
1477
- label="변환 형식",
1478
- choices=["HTML", "ODT (OpenDocument)", "TXT (텍스)", "Markdown", "XML"],
1479
- value="TXT (텍스트)",
1480
- info="원하는 출력 형식을 선택하세요"
1481
- )
1482
-
1483
- convert_btn = gr.Button("🔄 변환하기", variant="primary", size="lg")
1484
-
1485
- with gr.Column(scale=1):
1486
- gr.Markdown("#### 📥 변환 결과")
1487
- status_output = gr.Textbox(
1488
- label="상태",
1489
- interactive=False,
1490
- lines=2
1491
- )
1492
-
1493
- file_output = gr.File(
1494
- label="다운로드",
1495
- elem_classes=["download-box"]
1496
- )
1497
-
1498
- with gr.Accordion("📋 내용 미리보기", open=False):
1499
- preview_output = gr.Textbox(
1500
- label="",
1501
- lines=15,
1502
- max_lines=25,
1503
- interactive=False,
1504
- elem_classes=["preview-box"]
1505
- )
1506
-
1507
- gr.Markdown("""
1508
- ---
1509
- #### ℹ️ 안내사항
1510
-
1511
- | 형식 | 설명 | 용도 |
1512
- |------|------|------|
1513
- | **HTML** | 웹 페이지 형식 | 브라우저에서 보기 |
1514
- | **ODT** | OpenDocument | LibreOffice, Google Docs |
1515
- | **TXT** | 순수 텍스트 | 텍스트 추출 |
1516
- | **Markdown** | 마크다운 형식 | GitHub, 노션 |
1517
- | **XML** | 구조화 데이터 | 데이터 처리 |
1518
-
1519
- > ⚠️ HWP v5 형식(한글 2007+)만 지원 | 암호화 파일 불가
1520
- """)
1521
-
1522
- # ============== 이벤트 핸들러 ==============
1523
-
1524
- def on_submit(message, history, file, session_id, groq_key, fireworks_key):
1525
- if history is None:
1526
- history = []
1527
- for result in chat_response(message, history, file, session_id, groq_key, fireworks_key):
1528
- yield result[0], result[1], "", None
1529
-
1530
- submit_btn.click(
1531
- fn=on_submit,
1532
- inputs=[msg_input, chatbot, file_upload, session_state, groq_key_input, fireworks_key_input],
1533
- outputs=[chatbot, session_state, msg_input, file_upload]
1534
- )
1535
-
1536
- msg_input.submit(
1537
- fn=on_submit,
1538
- inputs=[msg_input, chatbot, file_upload, session_state, groq_key_input, fireworks_key_input],
1539
- outputs=[chatbot, session_state, msg_input, file_upload]
1540
- )
1541
-
1542
- def on_new_chat():
1543
- history, session_id, file = new_chat()
1544
- return history, session_id, None, ""
1545
-
1546
- new_chat_btn.click(
1547
- fn=on_new_chat,
1548
- outputs=[chatbot, session_state, file_upload, msg_input]
1549
- )
1550
-
1551
- clear_btn.click(
1552
- fn=lambda: ([], None, ""),
1553
- outputs=[chatbot, file_upload, msg_input]
1554
- )
1555
-
1556
- def refresh_sessions():
1557
  sessions = get_all_sessions()
1558
- data = [[s["session_id"][:8], s["title"] or "제목 없음", s["updated_at"][:16] if s["updated_at"] else ""] for s in sessions]
1559
- return data
1560
 
1561
- refresh_sessions_btn.click(
1562
- fn=refresh_sessions,
1563
- outputs=[session_list]
1564
- )
1565
 
1566
- def on_session_select(evt: gr.SelectData, data):
1567
  if evt.index[0] < len(data):
1568
- session_id_short = data[evt.index[0]][0]
1569
- sessions = get_all_sessions()
1570
- for s in sessions:
1571
- if s["session_id"].startswith(session_id_short):
1572
- history, session_id = load_session(s["session_id"])
1573
- return history, session_id
1574
  return [], ""
1575
 
1576
- session_list.select(
1577
- fn=on_session_select,
1578
- inputs=[session_list],
1579
- outputs=[chatbot, session_state]
1580
- )
1581
-
1582
- convert_btn.click(
1583
- fn=convert_hwp,
1584
- inputs=[hwp_file_input, format_select],
1585
- outputs=[file_output, status_output, preview_output]
1586
- )
1587
-
1588
- demo.load(
1589
- fn=refresh_sessions,
1590
- outputs=[session_list]
1591
- )
1592
 
1593
  if __name__ == "__main__":
1594
  demo.launch(css=css)
 
9
  import subprocess
10
  import shutil
11
  import sys
 
12
  import re
13
  import json
14
  import uuid
15
  import sqlite3
16
  import base64
17
  import requests
18
+ import zlib
19
  from pathlib import Path
20
  from datetime import datetime
21
+ from typing import Generator, List, Dict, Optional
22
 
23
  # ============== 환경 설정 ==============
24
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
30
  print(f"Added local pyhwp path: {PYHWP_PATH}")
31
 
32
  # ============== 모듈 임포트 ==============
 
 
 
 
 
 
 
 
33
  try:
34
  import olefile
35
  OLEFILE_AVAILABLE = True
36
  print("olefile loaded successfully")
37
  except ImportError:
38
  OLEFILE_AVAILABLE = False
39
+ print("olefile not available")
40
 
41
  try:
42
  from markdownify import markdownify as md
 
44
  print("markdownify loaded successfully")
45
  except ImportError:
46
  MARKDOWNIFY_AVAILABLE = False
 
47
 
48
  try:
49
  import html2text
 
51
  print("html2text loaded successfully")
52
  except ImportError:
53
  HTML2TEXT_AVAILABLE = False
 
54
 
55
  try:
56
  from bs4 import BeautifulSoup
 
80
  def init_database():
81
  conn = sqlite3.connect(DB_PATH)
82
  cursor = conn.cursor()
 
83
  cursor.execute('''
84
  CREATE TABLE IF NOT EXISTS sessions (
85
  session_id TEXT PRIMARY KEY,
 
88
  title TEXT
89
  )
90
  ''')
 
91
  cursor.execute('''
92
  CREATE TABLE IF NOT EXISTS messages (
93
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
99
  FOREIGN KEY (session_id) REFERENCES sessions(session_id)
100
  )
101
  ''')
 
102
  conn.commit()
103
  conn.close()
104
 
 
133
  cursor = conn.cursor()
134
  cursor.execute(
135
  """SELECT role, content, file_info, created_at
136
+ FROM messages WHERE session_id = ?
137
+ ORDER BY created_at DESC LIMIT ?""",
 
 
138
  (session_id, limit)
139
  )
140
  rows = cursor.fetchall()
141
  conn.close()
142
+ return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
 
 
 
 
 
 
 
 
 
143
 
144
def get_all_sessions() -> List[Dict]:
    """Return the 50 most recently updated chat sessions.

    Returns:
        A list of dicts with the keys ``session_id``, ``title``,
        ``created_at`` and ``updated_at``, ordered by ``updated_at``
        descending.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute(
            "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
        ).fetchall()
    finally:
        # Close even when the query raises so the handle is never leaked.
        conn.close()
    return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
 
 
 
 
153
 
154
def update_session_title(session_id: str, title: str):
    """Set the ``title`` column of the session row identified by ``session_id``.

    Args:
        session_id: Primary key of the row in the ``sessions`` table.
        title: New title text to store.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("UPDATE sessions SET title = ? WHERE session_id = ?", (title, session_id))
        conn.commit()
    finally:
        # Close even when the UPDATE or commit raises.
        conn.close()
160
 
161
  init_database()
162
 
163
+ # ============== 파일 유틸 ==============
164
  def extract_text_from_pdf(file_path: str) -> str:
165
  text_parts = []
 
166
  if PDFPLUMBER_AVAILABLE:
167
  try:
168
  with pdfplumber.open(file_path) as pdf:
 
187
  return "\n\n".join(text_parts)
188
  except Exception as e:
189
  print(f"PyPDF2 error: {e}")
 
190
  return None
191
 
192
def extract_text_from_txt(file_path: str) -> Optional[str]:
    """Read a text file, trying several encodings in turn.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content from the first encoding that works, or
        ``None`` when the file cannot be opened or no encoding decodes it.
    """
    for encoding in ('utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeError:
            # Wrong guess for this file; fall through to the next encoding.
            continue
        except OSError:
            # Missing/unreadable file: another encoding cannot help.
            return None
    return None
200
 
201
  def image_to_base64(file_path: str) -> str:
 
204
 
205
def get_image_mime_type(file_path: str) -> str:
    """Map an image path to a MIME type by its (case-insensitive) suffix.

    Unknown suffixes fall back to ``'image/jpeg'``.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    suffix = Path(file_path).suffix.lower()
    if suffix in mime_by_suffix:
        return mime_by_suffix[suffix]
    return 'image/jpeg'
 
 
 
 
 
 
 
209
 
210
def is_image_file(fp: str) -> bool:
    """Return True when the path's suffix is one of the handled image types."""
    suffix = Path(fp).suffix.lower()
    image_suffixes = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
    return suffix in image_suffixes
 
212
 
213
def is_hwp_file(fp: str) -> bool:
    """Return True when the path ends in ``.hwp`` or ``.hwpx`` (any case)."""
    suffix = Path(fp).suffix.lower()
    hwp_suffixes = ['.hwp', '.hwpx']
    return suffix in hwp_suffixes
 
215
 
216
def is_pdf_file(fp: str) -> bool:
    """Return True when the path ends in ``.pdf`` (any case)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.pdf'
218
 
219
def is_text_file(fp: str) -> bool:
    """Return True when the path's suffix is one of the plain-text types read as text."""
    suffix = Path(fp).suffix.lower()
    text_suffixes = ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
    return suffix in text_suffixes
 
221
 
222
+ # ============== HWP 텍스트 추출 (핵심 - 단순하고 안정적으로) ==============
223
 
224
def decompress_stream(data: bytes) -> bytes:
    """Best-effort zlib decompression of a stream.

    Tries a raw DEFLATE stream first (negative ``wbits``: no zlib header or
    checksum), then a regular zlib-wrapped stream.

    Args:
        data: Possibly-compressed bytes.

    Returns:
        The decompressed bytes, or ``data`` unchanged when neither form
        decodes.
    """
    for wbits in (-15, zlib.MAX_WBITS):
        try:
            return zlib.decompress(data, wbits)
        except zlib.error:
            # Not this container format; try the next one.
            continue
    return data
233
+
234
def extract_hwp_text_from_bodytext(ole) -> Optional[str]:
    """Extract text from every ``BodyText/Section*`` stream of an open OLE file.

    Args:
        ole: An open OLE container exposing ``listdir()`` and
            ``openstream()`` (as used with ``olefile.OleFileIO``).

    Returns:
        The text of all readable sections joined by blank lines, or ``None``
        when no section produced any text.
    """
    section_pattern = re.compile(r'BodyText/Section(\d+)')

    def section_order(entry):
        # Numbered sections sort numerically (Section2 before Section10);
        # anything else keeps its listing order after them (sort is stable).
        match = section_pattern.match('/'.join(entry))
        return (0, int(match.group(1))) if match else (1, 0)

    sections = [
        entry for entry in ole.listdir()
        if '/'.join(entry).startswith('BodyText/Section')
    ]
    sections.sort(key=section_order)

    text_parts = []
    for entry in sections:
        entry_path = '/'.join(entry)
        try:
            stream_data = ole.openstream(entry).read()
            # Streams may or may not be compressed; fall back to the raw bytes.
            decompressed = decompress_stream(stream_data)
            extracted = extract_text_from_hwp_records(decompressed)
            if extracted:
                text_parts.append(extracted)
        except Exception as e:
            # Best effort: one unreadable section must not lose the others.
            print(f" 섹션 읽기 오류 {entry_path}: {e}")
            continue

    return '\n\n'.join(text_parts) if text_parts else None
262
+
263
def extract_text_from_hwp_records(data: bytes) -> Optional[str]:
    """Walk a record stream and collect the text of paragraph-text records.

    Each record starts with a 4-byte little-endian header: the low 10 bits
    are the tag id and the top 12 bits the payload size. A size field of
    0xFFF means the real size follows as another 4-byte little-endian integer.

    Args:
        data: Decompressed bytes of one section stream.

    Returns:
        The text of all tag-67 records joined by newlines, or ``None`` when
        nothing was extracted.
    """
    para_text_tag = 67  # HWPTAG_PARA_TEXT (0x43)
    texts = []
    total = len(data)
    pos = 0

    while pos + 4 <= total:
        header = int.from_bytes(data[pos:pos + 4], 'little')
        tag_id = header & 0x3FF
        size = (header >> 20) & 0xFFF
        pos += 4

        # Extended size
        if size == 0xFFF:
            if pos + 4 > total:
                break
            size = int.from_bytes(data[pos:pos + 4], 'little')
            pos += 4

        # Truncated record: stop rather than read past the buffer.
        if pos + size > total:
            break

        record_data = data[pos:pos + size]
        pos += size

        if tag_id == para_text_tag and size > 0:
            try:
                text = extract_para_text(record_data)
            except Exception as e:
                # Skip only this record. The cursor already sits on the next
                # header, so it must not be nudged (that would misalign
                # every following record).
                print(f" 레코드 텍스트 추출 오류: {e}")
                continue
            if text:
                texts.append(text)

    return '\n'.join(texts) if texts else None
302
 
303
# Control codes (< 32) that occupy 8 UTF-16 code units (16 bytes) inside a
# PARA_TEXT record: the "inline" and "extended" controls of the HWP 5.0
# format. All other control codes are a single code unit.
_HWP_WIDE_CONTROLS = frozenset(
    (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)
)
_HWP_WIDE_CONTROL_BYTES = 16

# Control codes that stand for visible text.
_HWP_CONTROL_TEXT = {
    9: '\t',   # tab
    10: '\n',  # line break
    13: '\n',  # paragraph end
    24: '-',   # hyphen
    30: ' ',   # non-breaking space
    31: ' ',   # fixed-width space
}


def extract_para_text(data: bytes) -> Optional[str]:
    """Decode the text of one PARA_TEXT record.

    The payload is UTF-16LE text interleaved with control codes below 32.
    Wide controls are skipped together with their 14-byte payload so that
    payload bytes are never misread as characters; because of that, no
    character range has to be filtered out and Hanja is preserved.

    Args:
        data: Raw payload of a PARA_TEXT record.

    Returns:
        The stripped text, or ``None`` when fewer than two characters remain.
    """
    chars = []
    size = len(data)
    i = 0

    while i + 1 < size:
        code = data[i] | (data[i + 1] << 8)

        if code < 32:
            replacement = _HWP_CONTROL_TEXT.get(code)
            if replacement:
                chars.append(replacement)
            i += _HWP_WIDE_CONTROL_BYTES if code in _HWP_WIDE_CONTROLS else 2
            continue

        # Combine a UTF-16 surrogate pair into one code point.
        if 0xD800 <= code <= 0xDBFF and i + 3 < size:
            low = data[i + 2] | (data[i + 3] << 8)
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                i += 2

        char = chr(code)
        # Lone surrogates and other non-printable code points are dropped.
        if char.isprintable():
            chars.append(char)
        i += 2

    text = ''.join(chars).strip()
    # Filter out fragments too short to be meaningful.
    if len(text) < 2:
        return None
    return text
353
 
354
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract HWP text by parsing the OLE container directly with olefile.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
        Text of 10 characters or fewer (after stripping) counts as a failure.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    try:
        ole = olefile.OleFileIO(file_path)
        try:
            # Check for the file header stream
            if not ole.exists('FileHeader'):
                return None, "HWP 파일 헤더 없음"

            # Bit 0 of byte 36 is read as the "compressed" flag; it is only
            # reported here. When the header is too short, True is assumed.
            header_data = ole.openstream('FileHeader').read()
            is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
            print(f" HWP 압축 여부: {is_compressed}")

            # Extract text from the BodyText sections
            text = extract_hwp_text_from_bodytext(ole)
        finally:
            # Always release the file handle, including on errors.
            ole.close()

        if text and len(text.strip()) > 10:
            return text.strip(), None

        return None, "텍스트 추출 실패"

    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
 
384
 
385
def extract_text_with_hwp5txt(file_path: str) -> tuple:
    """Extract HWP text by running ``python -m hwp5 txt`` in a subprocess.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
        Output of 10 characters or fewer (after stripping) counts as a
        failure.
    """
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'hwp5', 'txt', file_path],
            capture_output=True,
            timeout=60
        )

        if result.returncode == 0 and result.stdout:
            # Try several encodings, but accept only the FIRST one that
            # decodes: re-decoding valid UTF-8 as cp949/euc-kr could yield
            # mojibake long enough to pass the length check.
            for enc in ('utf-8', 'cp949', 'euc-kr'):
                try:
                    text = result.stdout.decode(enc)
                except UnicodeDecodeError:
                    continue
                text = text.strip()
                if len(text) > 10:
                    return text, None
                break

        stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
        return None, f"hwp5txt 실패: {stderr[:100]}"

    except subprocess.TimeoutExpired:
        return None, "hwp5txt 타임아웃"
    except Exception as e:
        return None, f"hwp5txt 오류: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
def extract_text_from_hwp(file_path: str) -> tuple:
    """Extract text from an HWP file, trying each backend in order.

    The hwp5txt subprocess is tried first and the direct olefile parser
    second; progress is printed along the way.

    Returns:
        ``(text, None)`` from the first backend that succeeds, otherwise
        ``(None, error_message)``.
    """
    display_name = os.path.basename(file_path)
    print(f"\n[HWP 추출] 시작: {display_name}")

    # Method 1: the hwp5txt command
    print(" 방법 1: hwp5txt 명령어...")
    hwp5_text, hwp5_error = extract_text_with_hwp5txt(file_path)
    if not hwp5_text:
        print(f" ✗ hwp5txt 실패: {hwp5_error}")
    else:
        print(f" ✓ hwp5txt 성공: {len(hwp5_text)} 글자")
        return hwp5_text, None

    # Method 2: parse the OLE container directly
    print(" 방법 2: olefile 파싱...")
    ole_text, ole_error = extract_text_with_olefile(file_path)
    if not ole_text:
        print(f" ✗ olefile 실패: {ole_error}")
    else:
        print(f" ✓ olefile 성공: {len(ole_text)} 글자")
        return ole_text, None

    return None, "모든 추출 방법 실패"
 
 
 
 
 
 
 
 
 
433
 
434
+ # ============== HWP 변환 함수들 ==============
435
 
436
  def check_hwp_version(file_path):
437
  try:
 
446
  except Exception as e:
447
  return f"Error: {e}", False
448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  def convert_to_html_subprocess(input_path, output_dir):
450
+ """HTML 변환"""
451
+ output_path = os.path.join(output_dir, "output.html")
452
 
453
  try:
454
  result = subprocess.run(
 
459
  )
460
 
461
  if result.returncode == 0:
462
+ # 결과 파일/디렉토리 찾기
463
  if os.path.isfile(output_path):
464
  return output_path, None
465
  if os.path.isdir(output_path):
466
  return output_path, None
467
 
468
+ # 다른 위치 검색
469
  for item in os.listdir(output_dir):
470
  item_path = os.path.join(output_dir, item)
471
+ if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
472
  return item_path, None
473
  if os.path.isdir(item_path):
474
+ for sub in os.listdir(item_path):
475
+ if sub.lower().endswith(('.html', '.htm')):
476
  return item_path, None
 
477
  return output_dir, None
478
+
479
+ except subprocess.TimeoutExpired:
480
+ return None, "HTML 변환 타임아웃"
481
  except Exception as e:
482
+ return None, f"HTML 변환 오류: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
+ return None, "HTML 변환 실패"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
def convert_hwp_to_text(input_path: str) -> tuple:
    """Convert an HWP file to plain text.

    Thin alias that forwards to ``extract_text_from_hwp`` and returns its
    result unchanged.
    """
    extraction_result = extract_text_from_hwp(input_path)
    return extraction_result
489
 
490
def html_to_markdown(html_content):
    """Convert HTML to Markdown using the first converter that works.

    Tries markdownify, then html2text, then a plain-text dump through
    BeautifulSoup, skipping any library that is not installed. A converter
    that raises is logged and the next one is tried.

    Args:
        html_content: HTML source text.

    Returns:
        ``(markdown_text, None)`` on success, otherwise
        ``(None, error_message)``.
    """
    if MARKDOWNIFY_AVAILABLE:
        try:
            return md(html_content, heading_style="ATX", bullets="-"), None
        except Exception as e:
            print(f"markdownify error: {e}")

    if HTML2TEXT_AVAILABLE:
        try:
            h = html2text.HTML2Text()
            h.body_width = 0
            return h.handle(html_content), None
        except Exception as e:
            print(f"html2text error: {e}")

    # Last resort: plain text without Markdown structure
    if BS4_AVAILABLE:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator='\n'), None
        except Exception as e:
            print(f"BeautifulSoup error: {e}")

    return None, "Markdown 변환 실패"
 
515
 
516
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Convert an HWP file to Markdown.

    Plain-text extraction is tried first and its result returned as-is.
    Only when that fails is the file converted to HTML in a temporary
    directory and the HTML turned into Markdown.

    Args:
        input_path: Path of the HWP file.

    Returns:
        ``(markdown_text, None)`` on success, otherwise
        ``(None, error_message)``.
    """
    # Try plain-text extraction first
    text, error = extract_text_from_hwp(input_path)
    if text:
        return text, None

    # Fall back to HTML conversion followed by HTML -> Markdown
    tmp_dir = tempfile.mkdtemp()
    try:
        html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
        if html_output:
            # Collect candidate HTML files
            html_files = []
            if os.path.isfile(html_output):
                html_files = [html_output]
            elif os.path.isdir(html_output):
                for root, _dirs, files in os.walk(html_output):
                    # Sorted so the file picked is the same on every run.
                    for name in sorted(files):
                        if name.lower().endswith(('.html', '.htm')):
                            html_files.append(os.path.join(root, name))

            for html_file in html_files:
                content = None
                # Use the FIRST encoding that decodes; retrying others after
                # a successful decode could only produce mojibake.
                for enc in ('utf-8', 'cp949', 'euc-kr'):
                    try:
                        with open(html_file, 'r', encoding=enc) as f:
                            content = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                    except OSError as e:
                        print(f"HTML read error: {e}")
                        break
                if content is None:
                    continue

                md_text, _ = html_to_markdown(content)
                if md_text and len(md_text.strip()) > 10:
                    return md_text.strip(), None

        return None, error or "변환 실패"
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
552
 
553
+ # ============== LLM API ==============
554
 
 
555
def call_groq_api_stream(messages: List[Dict], api_key: str) -> Generator[str, None, None]:
    """Stream a chat completion from the Groq API.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        api_key: Groq API key; when empty, one error message is yielded.

    Yields:
        Content fragments as they arrive, or a single "❌ ..." message when
        the key is missing, the HTTP status is not 200, or the request fails.
    """
    if not api_key:
        yield "❌ Groq API 키가 설정되지 않았습니다."
        return

    try:
        # The context manager releases the streamed connection even when the
        # consumer stops iterating early. The timeout (connect, read) keeps
        # a stalled server from hanging the UI forever.
        with requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
            json={
                "model": "meta-llama/llama-4-scout-17b-16e-instruct",
                "messages": messages,
                "temperature": 0.7,
                "max_tokens": 8192,
                "stream": True
            },
            stream=True,
            timeout=(10, 120)
        ) as response:
            if response.status_code != 200:
                yield f"❌ Groq API 오류: {response.status_code}"
                return

            for line in response.iter_lines():
                if not line:
                    continue
                line = line.decode('utf-8')
                if not line.startswith('data: ') or line[6:] == '[DONE]':
                    continue
                # Parse only; the yield stays outside this try so that
                # GeneratorExit raised at the yield is never swallowed.
                try:
                    data = json.loads(line[6:])
                    content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
                except (ValueError, AttributeError, IndexError, KeyError, TypeError):
                    # Malformed or unexpected event: skip it.
                    continue
                if content:
                    yield content
    except Exception as e:
        yield f"❌ API 오류: {str(e)}"
 
591
 
592
  def call_fireworks_api_stream(messages: List[Dict], image_base64: str, mime_type: str, api_key: str) -> Generator[str, None, None]:
593
  if not api_key:
 
595
  return
596
 
597
  try:
598
+ formatted_messages = [{"role": m["role"], "content": m["content"]} for m in messages[:-1]]
 
 
 
 
 
 
 
 
 
599
  formatted_messages.append({
600
+ "role": messages[-1]["role"],
601
  "content": [
602
+ {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
603
+ {"type": "text", "text": messages[-1]["content"]}
 
 
 
 
 
 
 
 
604
  ]
605
  })
606
 
607
+ response = requests.post(
608
+ "https://api.fireworks.ai/inference/v1/chat/completions",
609
+ headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
610
+ json={
611
+ "model": "accounts/fireworks/models/qwen3-vl-235b-a22b-thinking",
612
+ "max_tokens": 4096,
613
+ "temperature": 0.6,
614
+ "messages": formatted_messages,
615
+ "stream": True
616
+ },
617
+ stream=True
618
+ )
 
 
 
 
 
 
 
619
 
620
  if response.status_code != 200:
621
+ yield f"❌ Fireworks API 오류: {response.status_code}"
622
  return
623
 
624
  for line in response.iter_lines():
625
  if line:
626
  line = line.decode('utf-8')
627
+ if line.startswith('data: ') and line[6:] != '[DONE]':
 
 
 
628
  try:
629
+ data = json.loads(line[6:])
630
+ content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
631
+ if content:
632
+ yield content
633
+ except:
 
 
634
  continue
 
635
  except Exception as e:
636
+ yield f"❌ API 오류: {str(e)}"
637
 
638
+ # ============== 채팅 처리 ==============
639
 
 
640
def process_file(file_path: str) -> tuple:
    """Classify an uploaded file and load its content for the chat pipeline.

    Returns:
        A ``(kind, content, mime)`` triple:
        ``(None, None, None)`` when no path is given;
        ``("image", base64_data, mime_type)`` for images;
        ``("text", labelled_text, None)`` for HWP, PDF and text files;
        ``("error", message, None)`` when extraction fails;
        ``("unsupported", message, None)`` for any other file.
    """
    if not file_path:
        return None, None, None

    filename = os.path.basename(file_path)

    if is_image_file(file_path):
        encoded = image_to_base64(file_path)
        mime = get_image_mime_type(file_path)
        return "image", encoded, mime

    if is_hwp_file(file_path):
        hwp_text, hwp_error = extract_text_from_hwp(file_path)
        if not hwp_text:
            return "error", f"HWP 추출 실패: {hwp_error}", None
        return "text", f"[HWP 문서: {filename}]\n\n{hwp_text}", None

    if is_pdf_file(file_path):
        pdf_text = extract_text_from_pdf(file_path)
        if not pdf_text:
            return "error", "PDF 추출 실패", None
        return "text", f"[PDF 문서: {filename}]\n\n{pdf_text}", None

    if is_text_file(file_path):
        plain_text = extract_text_from_txt(file_path)
        if not plain_text:
            return "error", "텍스트 읽기 실패", None
        return "text", f"[텍스트 파일: {filename}]\n\n{plain_text}", None

    return "unsupported", f"지원하지 않는 형식: {filename}", None
668
 
669
+ def chat_response(message: str, history: List[Dict], file: Optional[str],
670
+ session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
 
 
 
 
 
 
 
 
 
671
  if history is None:
672
  history = []
673
 
674
+ if not message.strip() and not file:
675
  yield history, session_id
676
  return
677
 
678
  if not session_id:
679
  session_id = create_session()
680
 
681
+ # 파일 처리
682
+ file_type, file_content, file_mime = None, None, None
 
683
  file_info = None
684
 
685
+ if file:
686
  file_type, file_content, file_mime = process_file(file)
687
+ file_info = json.dumps({"type": file_type, "filename": os.path.basename(file)})
 
 
 
688
 
689
  if file_type == "error":
690
  history = history + [
 
701
  yield history, session_id
702
  return
703
 
704
+ # 사용자 메시지
705
+ user_msg = message
706
+ if file:
707
  filename = os.path.basename(file)
708
+ user_msg = f"📎 {filename}\n\n{message}" if message else f"📎 {filename}"
709
 
710
+ history = history + [{"role": "user", "content": user_msg}, {"role": "assistant", "content": ""}]
 
 
 
711
  yield history, session_id
712
 
 
 
 
713
  # API 메시지 구성
714
+ db_messages = get_session_messages(session_id, limit=10)
715
+ api_messages = [{
 
716
  "role": "system",
717
+ "content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 분석하여 답변합니다."
718
+ }]
 
 
 
 
 
 
719
 
720
+ for m in db_messages:
721
+ api_messages.append({"role": m["role"], "content": m["content"]})
 
 
 
722
 
 
723
  current_content = message or ""
724
  if file_type == "text" and file_content:
725
+ current_content = f"{file_content}\n\n사용자 질문: {message}" if message else f"{file_content}\n\n위 문서 내용을 요약해주세요."
 
 
 
726
 
727
+ api_messages.append({"role": "user", "content": current_content})
 
 
 
728
 
729
+ # 응답 생성
730
  full_response = ""
 
731
  if file_type == "image":
732
+ for chunk in call_fireworks_api_stream(api_messages, file_content, file_mime, fireworks_key):
733
  full_response += chunk
734
  history[-1] = {"role": "assistant", "content": full_response}
735
  yield history, session_id
736
  else:
737
+ for chunk in call_groq_api_stream(api_messages, groq_key):
738
  full_response += chunk
739
  history[-1] = {"role": "assistant", "content": full_response}
740
  yield history, session_id
741
 
742
+ # 저장
743
  save_message(session_id, "user", current_content, file_info)
744
  save_message(session_id, "assistant", full_response)
745
 
746
  if len(db_messages) == 0 and message:
747
+ update_session_title(session_id, message[:50])
 
 
748
 
749
def new_chat():
    """Start a fresh conversation.

    Returns:
        ``(history, session_id, file)``: an empty history, the id returned
        by ``create_session()``, and ``None`` for the file slot.
    """
    empty_history = []
    session_id = create_session()
    return empty_history, session_id, None
 
 
751
 
752
def load_session(session_id: str) -> tuple:
    """Load up to 50 stored messages of a session as chat history.

    Returns:
        ``(history, session_id)`` where ``history`` is a list of
        ``{"role", "content"}`` dicts, or ``([], "")`` when no session id is
        given.
    """
    if session_id:
        history = []
        for record in get_session_messages(session_id, limit=50):
            history.append({"role": record["role"], "content": record["content"]})
        return history, session_id
    return [], ""
 
 
 
 
757
 
758
+ # ============== HWP 변환기 (Tab 2) ==============
759
 
 
760
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running ``python -m hwp5 odt``.

    Args:
        input_path: Path of the HWP file.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        ``(output_path, None)`` on success, otherwise
        ``(None, error_message)``. The message names the cause (timeout,
        launch error, or the start of the converter's stderr).
    """
    output_path = os.path.join(output_dir, "output.odt")
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'hwp5', 'odt', '--output', output_path, input_path],
            capture_output=True, timeout=120
        )
    except subprocess.TimeoutExpired:
        return None, "ODT 변환 타임아웃"
    except Exception as e:
        return None, f"ODT 변환 오류: {str(e)}"

    if result.returncode == 0 and os.path.exists(output_path):
        return output_path, None

    # Surface the converter's own diagnostics instead of discarding them.
    stderr = result.stderr.decode('utf-8', errors='ignore').strip() if result.stderr else ""
    return None, (f"ODT 변환 실패: {stderr[:100]}" if stderr else "ODT 변환 실패")
 
 
772
 
773
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running ``python -m hwp5 xml``.

    The converter's stdout is captured and written to ``output.xml``.

    Args:
        input_path: Path of the HWP file.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        ``(output_path, None)`` on success, otherwise
        ``(None, error_message)``. The message names the cause (timeout,
        launch/write error, or the start of the converter's stderr).
    """
    output_path = os.path.join(output_dir, "output.xml")
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'hwp5', 'xml', input_path],
            capture_output=True, timeout=120
        )
        if result.returncode == 0 and result.stdout:
            with open(output_path, 'wb') as f:
                f.write(result.stdout)
            return output_path, None
    except subprocess.TimeoutExpired:
        return None, "XML 변환 타임아웃"
    except Exception as e:
        return None, f"XML 변환 오류: {str(e)}"

    # Surface the converter's own diagnostics instead of discarding them.
    stderr = result.stderr.decode('utf-8', errors='ignore').strip() if result.stderr else ""
    return None, (f"XML 변환 실패: {stderr[:100]}" if stderr else "XML 변환 실패")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
 
788
  def convert_hwp(file, output_format, progress=gr.Progress()):
789
+ if not file:
790
  return None, "❌ 파일을 업로드해주세요.", ""
791
 
792
+ input_file = file.name if hasattr(file, 'name') else str(file)
 
 
 
 
793
  if not input_file.lower().endswith('.hwp'):
794
  return None, "❌ HWP 파일만 지원됩니다.", ""
795
 
796
  progress(0.1, desc="파일 분석 중...")
 
797
  version, is_valid = check_hwp_version(input_file)
798
  if not is_valid:
799
+ return None, f"❌ 지원하지 않는 파일: {version}", ""
800
 
801
  tmp_dir = tempfile.mkdtemp()
802
 
 
807
 
808
  progress(0.3, desc=f"{output_format}로 변환 중...")
809
 
810
+ output_path, error, ext = None, None, ""
 
 
811
 
812
  if output_format == "HTML":
813
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
814
  ext = ".html"
 
815
  if output_path and os.path.isdir(output_path):
816
+ zip_path = shutil.make_archive(os.path.join(tmp_dir, "html"), 'zip', output_path)
817
+ output_path, ext = zip_path, ".zip"
 
 
 
 
 
 
 
 
 
 
 
818
 
 
 
 
 
 
 
819
  elif output_format == "ODT (OpenDocument)":
820
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
821
  ext = ".odt"
822
 
823
  elif output_format == "TXT (텍스트)":
 
824
  text, error = extract_text_from_hwp(input_path)
825
  if text:
826
  output_path = os.path.join(tmp_dir, "output.txt")
827
  with open(output_path, 'w', encoding='utf-8') as f:
828
  f.write(text)
 
829
  ext = ".txt"
830
 
831
  elif output_format == "Markdown":
832
+ text, error = convert_hwp_to_markdown(input_path)
833
+ if text:
834
+ output_path = os.path.join(tmp_dir, "output.md")
835
+ with open(output_path, 'w', encoding='utf-8') as f:
836
+ f.write(text)
837
  ext = ".md"
838
+
839
  elif output_format == "XML":
840
  output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
841
  ext = ".xml"
 
 
 
842
 
843
+ if not output_path:
844
+ return None, f"❌ {error or '변환 실패'}", ""
 
 
 
 
 
 
 
845
 
846
  if not os.path.exists(output_path):
847
  return None, "❌ 변환된 파일을 찾을 수 없습니다.", ""
848
 
849
+ progress(0.8, desc="완료...")
850
 
851
  base_name = Path(input_filename).stem
852
+ final_output = os.path.join(tmp_dir, f"{base_name}{ext}")
853
+ if output_path != final_output:
 
 
854
  shutil.copy2(output_path, final_output)
 
 
 
 
855
 
856
  file_size = os.path.getsize(final_output)
857
+ size_str = f"{file_size/1024:.1f} KB" if file_size > 1024 else f"{file_size} bytes"
 
 
858
 
859
  preview = ""
860
+ if ext in ['.txt', '.md', '.xml']:
 
861
  try:
862
  with open(final_output, 'r', encoding='utf-8', errors='ignore') as f:
863
  preview = f.read(5000)
864
  if len(preview) >= 5000:
865
+ preview += "\n\n... (생략)"
866
  except:
867
  pass
868
  elif ext == '.zip':
869
+ preview = "📦 HTML ZIP로 압축되었습니다."
870
 
871
+ progress(1.0, desc="완료!")
872
+ return final_output, f"✅ 변환 완료: {base_name}{ext} ({size_str})", preview
873
 
874
  except Exception as e:
875
  import traceback
876
  traceback.print_exc()
877
+ return None, f"❌ 오류: {str(e)}", ""
878
 
879
+ # ============== Gradio UI ==============
880
 
 
881
  # Custom stylesheet: dashed border for the HWP upload box and solid green
  # border for the download box (matched via elem_classes in the converter tab).
  css = """
  .upload-box { border: 2px dashed #6366f1 !important; border-radius: 12px !important; }
  .download-box { border: 2px solid #22c55e !important; border-radius: 12px !important; }
  """
885
 
 
886
  with gr.Blocks(title="AI 문서 어시스턴트") as demo:
 
887
  session_state = gr.State("")
888
 
889
+ gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP 문서 변환")
 
 
 
890
 
891
  with gr.Tabs():
892
+ with gr.Tab("💬 AI 채팅"):
 
893
  with gr.Row():
894
  with gr.Column(scale=1):
895
  gr.Markdown("### ⚙️ 설정")
896
+ with gr.Accordion("🔑 API 키", open=True):
897
+ groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
898
+ fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
899
 
900
+ gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT, HWP ✨")
901
+ new_btn = gr.Button("🆕 새 대화", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
 
903
+ with gr.Accordion("📜 기록", open=False):
904
+ session_list = gr.Dataframe(headers=["ID", "제목", "시간"], interactive=False)
905
+ refresh_btn = gr.Button("🔄 새로고침", size="sm")
 
 
 
 
 
 
 
906
 
907
  with gr.Column(scale=3):
908
+ chatbot = gr.Chatbot(label="대화", height=500)
 
 
 
909
 
910
  with gr.Row():
911
+ file_upload = gr.File(label="📎 파일", file_types=[".jpg",".jpeg",".png",".gif",".webp",".pdf",".txt",".md",".hwp",".hwpx"], scale=1)
912
+ msg_input = gr.Textbox(placeholder="메시지 입력...", lines=2, show_label=False, scale=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
913
 
914
  with gr.Row():
915
  submit_btn = gr.Button("📤 전송", variant="primary", scale=3)
916
  clear_btn = gr.Button("🗑️ 지우기", scale=1)
917
 
918
+ with gr.Tab("📄 HWP 변환기"):
919
+ gr.Markdown("### HWP 파일 변환기")
 
 
 
 
 
920
  with gr.Row():
921
+ with gr.Column():
922
+ hwp_input = gr.File(label="HWP 파일", file_types=[".hwp"], elem_classes=["upload-box"])
923
+ format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
924
+ convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
925
+ with gr.Column():
926
+ status_out = gr.Textbox(label="상태", interactive=False)
927
+ file_out = gr.File(label="다운로드", elem_classes=["download-box"])
928
+
929
+ with gr.Accordion("📋 미리보기", open=False):
930
+ preview_out = gr.Textbox(lines=15, interactive=False)
931
+
932
+ # 이벤
933
+ def on_submit(msg, hist, f, sid, gk, fk):
934
+ if hist is None: hist = []
935
+ for r in chat_response(msg, hist, f, sid, gk, fk):
936
+ yield r[0], r[1], "", None
937
+
938
+ submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
939
+ [chatbot, session_state, msg_input, file_upload])
940
+ msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
941
+ [chatbot, session_state, msg_input, file_upload])
942
+
943
+ new_btn.click(lambda: ([], create_session(), None, ""), outputs=[chatbot, session_state, file_upload, msg_input])
944
+ clear_btn.click(lambda: ([], None, ""), outputs=[chatbot, file_upload, msg_input])
945
+
946
+ def refresh():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
947
  sessions = get_all_sessions()
948
+ return [[s["session_id"][:8], s["title"] or "제목없음", s["updated_at"][:16] if s["updated_at"] else ""] for s in sessions]
 
949
 
950
+ refresh_btn.click(refresh, outputs=[session_list])
 
 
 
951
 
952
+ def select_session(evt: gr.SelectData, data):
953
  if evt.index[0] < len(data):
954
+ for s in get_all_sessions():
955
+ if s["session_id"].startswith(data[evt.index[0]][0]):
956
+ return load_session(s["session_id"])
 
 
 
957
  return [], ""
958
 
959
+ session_list.select(select_session, [session_list], [chatbot, session_state])
960
+ convert_btn.click(convert_hwp, [hwp_input, format_select], [file_out, status_out, preview_out])
961
+ demo.load(refresh, outputs=[session_list])
 
 
 
 
 
 
 
 
 
 
 
 
 
962
 
963
  # Start the Gradio app only when this file is run as a script, passing the
  # custom stylesheet defined above.
  if __name__ == "__main__":
      demo.launch(css=css)