seawolf2357 committed on
Commit
e006e27
·
verified ·
1 Parent(s): 46e1b25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +382 -243
app.py CHANGED
@@ -16,9 +16,11 @@ import sqlite3
16
  import base64
17
  import requests
18
  import zlib
 
19
  from pathlib import Path
20
  from datetime import datetime
21
  from typing import Generator, List, Dict, Optional
 
22
 
23
  # ============== 환경 설정 ==============
24
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -27,7 +29,6 @@ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
27
 
28
  if os.path.exists(PYHWP_PATH):
29
  sys.path.insert(0, PYHWP_PATH)
30
- print(f"Added local pyhwp path: {PYHWP_PATH}")
31
 
32
  # ============== 모듈 임포트 ==============
33
  try:
@@ -72,6 +73,28 @@ try:
72
  except ImportError:
73
  PDFPLUMBER_AVAILABLE = False
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # ============== API 키 설정 ==============
76
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
77
  FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
@@ -106,10 +129,8 @@ def create_session() -> str:
106
  session_id = str(uuid.uuid4())
107
  conn = sqlite3.connect(DB_PATH)
108
  cursor = conn.cursor()
109
- cursor.execute(
110
- "INSERT INTO sessions (session_id, title) VALUES (?, ?)",
111
- (session_id, f"λŒ€ν™” {datetime.now().strftime('%Y-%m-%d %H:%M')}")
112
- )
113
  conn.commit()
114
  conn.close()
115
  return session_id
@@ -117,26 +138,17 @@ def create_session() -> str:
117
  def save_message(session_id: str, role: str, content: str, file_info: str = None):
118
  conn = sqlite3.connect(DB_PATH)
119
  cursor = conn.cursor()
120
- cursor.execute(
121
- "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
122
- (session_id, role, content, file_info)
123
- )
124
- cursor.execute(
125
- "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
126
- (session_id,)
127
- )
128
  conn.commit()
129
  conn.close()
130
 
131
  def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
132
  conn = sqlite3.connect(DB_PATH)
133
  cursor = conn.cursor()
134
- cursor.execute(
135
- """SELECT role, content, file_info, created_at
136
- FROM messages WHERE session_id = ?
137
- ORDER BY created_at DESC LIMIT ?""",
138
- (session_id, limit)
139
- )
140
  rows = cursor.fetchall()
141
  conn.close()
142
  return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
@@ -144,9 +156,7 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
144
  def get_all_sessions() -> List[Dict]:
145
  conn = sqlite3.connect(DB_PATH)
146
  cursor = conn.cursor()
147
- cursor.execute(
148
- "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
149
- )
150
  rows = cursor.fetchall()
151
  conn.close()
152
  return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
@@ -204,14 +214,17 @@ def image_to_base64(file_path: str) -> str:
204
 
205
  def get_image_mime_type(file_path: str) -> str:
206
  ext = Path(file_path).suffix.lower()
207
- return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
208
  '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
209
 
210
  def is_image_file(fp: str) -> bool:
211
  return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
212
 
213
  def is_hwp_file(fp: str) -> bool:
214
- return Path(fp).suffix.lower() in ['.hwp', '.hwpx']
 
 
 
215
 
216
  def is_pdf_file(fp: str) -> bool:
217
  return Path(fp).suffix.lower() == '.pdf'
@@ -219,57 +232,242 @@ def is_pdf_file(fp: str) -> bool:
219
  def is_text_file(fp: str) -> bool:
220
  return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
221
 
222
- # ============== HWP ν…μŠ€νŠΈ μΆ”μΆœ (핡심 - λ‹¨μˆœν•˜κ³  μ•ˆμ •μ μœΌλ‘œ) ==============
223
 
224
- def decompress_stream(data: bytes) -> bytes:
225
- """zlib μ••μΆ• ν•΄μ œ μ‹œλ„"""
226
  try:
227
- return zlib.decompress(data, -15)
228
- except:
229
- try:
230
- return zlib.decompress(data)
231
- except:
232
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- def extract_hwp_text_from_bodytext(ole) -> str:
235
- """BodyText μ„Ήμ…˜μ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ (HWP5 포맷)"""
236
- text_parts = []
 
237
 
238
- for entry in ole.listdir():
239
- entry_path = '/'.join(entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # BodyText/SectionX 슀트림 찾기
242
- if entry_path.startswith('BodyText/Section'):
243
- try:
244
- stream_data = ole.openstream(entry).read()
245
-
246
- # μ••μΆ• ν•΄μ œ μ‹œλ„
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  try:
248
- decompressed = zlib.decompress(stream_data, -15)
 
 
249
  except:
250
- decompressed = stream_data
251
-
252
- # HWP5 λ ˆμ½”λ“œμ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ
253
- extracted = extract_text_from_hwp_records(decompressed)
254
- if extracted:
255
- text_parts.append(extracted)
256
-
257
- except Exception as e:
258
- print(f" μ„Ήμ…˜ 읽기 였λ₯˜ {entry_path}: {e}")
259
- continue
260
 
261
- return '\n\n'.join(text_parts) if text_parts else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- def extract_text_from_hwp_records(data: bytes) -> str:
264
- """HWP5 λ ˆμ½”λ“œ κ΅¬μ‘°μ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ"""
265
  texts = []
266
  pos = 0
267
 
268
  while pos < len(data) - 4:
269
  try:
270
- # λ ˆμ½”λ“œ 헀더 (4λ°”μ΄νŠΈ)
271
  header = int.from_bytes(data[pos:pos+4], 'little')
272
  tag_id = header & 0x3FF
 
273
  size = (header >> 20) & 0xFFF
274
 
275
  pos += 4
@@ -287,44 +485,31 @@ def extract_text_from_hwp_records(data: bytes) -> str:
287
  record_data = data[pos:pos+size]
288
  pos += size
289
 
290
- # HWPTAG_PARA_TEXT = 67 (0x43)
291
  if tag_id == 67 and size > 0:
292
- # ν…μŠ€νŠΈ μΆ”μΆœ (컨트둀 문자 처리)
293
- text = extract_para_text(record_data)
294
  if text:
295
  texts.append(text)
296
 
297
- except Exception as e:
298
  pos += 1
299
  continue
300
 
301
  return '\n'.join(texts) if texts else None
302
 
303
- def extract_para_text(data: bytes) -> str:
304
- """PARA_TEXT λ ˆμ½”λ“œμ—μ„œ μ‹€μ œ ν…μŠ€νŠΈ μΆ”μΆœ"""
305
  result = []
306
  i = 0
307
 
308
  while i < len(data) - 1:
309
  code = int.from_bytes(data[i:i+2], 'little')
310
 
311
- # 일반 문자 (μœ λ‹ˆμ½”λ“œ)
312
- if code >= 32:
313
- try:
314
- char = chr(code)
315
- # ν•œκΈ€, 영문, 숫자, 일반 기호만 ν—ˆμš©
316
- if char.isprintable() and not (0x4E00 <= code <= 0x9FFF and code not in range(0xAC00, 0xD7A4)):
317
- result.append(char)
318
- elif 0xAC00 <= code <= 0xD7A3: # ν•œκΈ€ 음절
319
- result.append(char)
320
- except:
321
- pass
322
- # 컨트둀 문자 처리
323
- elif code == 0: # NULL
324
  pass
325
- elif code == 1: # μ˜ˆμ•½
326
- i += 14 # ν™•μž₯ 컨트둀 κ±΄λ„ˆλ›°κΈ°
327
- elif code == 2: # μ„Ήμ…˜/컬럼 μ •μ˜
328
  i += 14
329
  elif code == 3: # ν•„λ“œ μ‹œμž‘
330
  i += 14
@@ -338,99 +523,59 @@ def extract_para_text(data: bytes) -> str:
338
  result.append('\n')
339
  elif code == 24: # ν•˜μ΄ν”ˆ
340
  result.append('-')
341
- elif code == 30: # 묢음 빈칸
342
- result.append(' ')
343
- elif code == 31: # 고정폭 빈칸
344
  result.append(' ')
 
 
 
 
 
 
 
 
 
 
345
 
346
  i += 2
347
 
348
  text = ''.join(result).strip()
349
- # 의미 μ—†λŠ” ν…μŠ€νŠΈ 필터링
350
- if len(text) < 2:
351
- return None
352
- return text
353
-
354
- def extract_text_with_olefile(file_path: str) -> tuple:
355
- """olefile을 μ‚¬μš©ν•œ HWP ν…μŠ€νŠΈ μΆ”μΆœ"""
356
- if not OLEFILE_AVAILABLE:
357
- return None, "olefile λͺ¨λ“ˆ μ—†μŒ"
358
 
359
- try:
360
- ole = olefile.OleFileIO(file_path)
361
-
362
- # 파일 헀더 확인
363
- if not ole.exists('FileHeader'):
364
- ole.close()
365
- return None, "HWP 파일 헀더 μ—†μŒ"
366
-
367
- # μ••μΆ• μ—¬λΆ€ 확인
368
- header_data = ole.openstream('FileHeader').read()
369
- is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
370
- print(f" HWP μ••μΆ• μ—¬λΆ€: {is_compressed}")
371
-
372
- # BodyTextμ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ
373
- text = extract_hwp_text_from_bodytext(ole)
374
-
375
- ole.close()
376
-
377
- if text and len(text.strip()) > 10:
378
- return text.strip(), None
379
-
380
- return None, "ν…μŠ€νŠΈ μΆ”μΆœ μ‹€νŒ¨"
381
-
382
- except Exception as e:
383
- return None, f"olefile 였λ₯˜: {str(e)}"
384
-
385
- def extract_text_with_hwp5txt(file_path: str) -> tuple:
386
- """hwp5txt λͺ…λ Ήμ–΄λ‘œ ν…μŠ€νŠΈ μΆ”μΆœ"""
387
- try:
388
- result = subprocess.run(
389
- [sys.executable, '-m', 'hwp5', 'txt', file_path],
390
- capture_output=True,
391
- timeout=60
392
- )
393
-
394
- if result.returncode == 0 and result.stdout:
395
- # μ—¬λŸ¬ 인코딩 μ‹œλ„
396
- for enc in ['utf-8', 'cp949', 'euc-kr']:
397
- try:
398
- text = result.stdout.decode(enc)
399
- if text.strip() and len(text.strip()) > 10:
400
- return text.strip(), None
401
- except:
402
- continue
403
-
404
- stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
405
- return None, f"hwp5txt μ‹€νŒ¨: {stderr[:100]}"
406
-
407
- except subprocess.TimeoutExpired:
408
- return None, "hwp5txt νƒ€μž„μ•„μ›ƒ"
409
- except Exception as e:
410
- return None, f"hwp5txt 였λ₯˜: {str(e)}"
411
 
412
  def extract_text_from_hwp(file_path: str) -> tuple:
413
  """HWP νŒŒμΌμ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ (메인 ν•¨μˆ˜)"""
414
  print(f"\n[HWP μΆ”μΆœ] μ‹œμž‘: {os.path.basename(file_path)}")
415
 
416
- # 방법 1: hwp5txt λͺ…λ Ήμ–΄ (κ°€μž₯ μ•ˆμ •μ )
417
- print(" 방법 1: hwp5txt λͺ…λ Ήμ–΄...")
418
  text, error = extract_text_with_hwp5txt(file_path)
419
- if text:
420
  print(f" βœ“ hwp5txt 성곡: {len(text)} κΈ€μž")
421
  return text, None
422
  print(f" βœ— hwp5txt μ‹€νŒ¨: {error}")
423
 
424
- # 방법 2: olefile 직접 νŒŒμ‹±
425
  print(" 방법 2: olefile νŒŒμ‹±...")
426
  text, error = extract_text_with_olefile(file_path)
427
- if text:
428
  print(f" βœ“ olefile 성곡: {len(text)} κΈ€μž")
429
  return text, None
430
  print(f" βœ— olefile μ‹€νŒ¨: {error}")
431
 
432
  return None, "λͺ¨λ“  μΆ”μΆœ 방법 μ‹€νŒ¨"
433
 
 
 
 
 
 
 
 
 
434
  # ============== HWP 변환 함수들 ==============
435
 
436
  def check_hwp_version(file_path):
@@ -441,6 +586,8 @@ def check_hwp_version(file_path):
441
  return "HWP v5", True
442
  elif header[:4] == b'\xd0\xcf\x11\xe0':
443
  return "HWP v5 (OLE)", True
 
 
444
  else:
445
  return "Unknown", False
446
  except Exception as e:
@@ -451,41 +598,32 @@ def convert_to_html_subprocess(input_path, output_dir):
451
  output_path = os.path.join(output_dir, "output.html")
452
 
453
  try:
454
- result = subprocess.run(
455
- [sys.executable, '-m', 'hwp5', 'html', '--output', output_path, input_path],
456
- capture_output=True,
457
- text=True,
458
- timeout=120
459
- )
460
-
461
- if result.returncode == 0:
462
- # κ²°κ³Ό 파일/디렉토리 μ°ΎκΈ°
463
- if os.path.isfile(output_path):
464
- return output_path, None
465
- if os.path.isdir(output_path):
466
- return output_path, None
467
-
468
- # λ‹€λ₯Έ μœ„μΉ˜ 검색
469
- for item in os.listdir(output_dir):
470
- item_path = os.path.join(output_dir, item)
471
- if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
472
- return item_path, None
473
- if os.path.isdir(item_path):
474
- for sub in os.listdir(item_path):
475
- if sub.lower().endswith(('.html', '.htm')):
476
  return item_path, None
477
- return output_dir, None
478
-
479
- except subprocess.TimeoutExpired:
480
- return None, "HTML λ³€ν™˜ νƒ€μž„μ•„μ›ƒ"
 
481
  except Exception as e:
482
- return None, f"HTML λ³€ν™˜ 였λ₯˜: {str(e)}"
483
 
484
  return None, "HTML λ³€ν™˜ μ‹€νŒ¨"
485
 
486
  def convert_hwp_to_text(input_path: str) -> tuple:
487
- """HWPλ₯Ό ν…μŠ€νŠΈλ‘œ λ³€ν™˜"""
488
- return extract_text_from_hwp(input_path)
489
 
490
  def html_to_markdown(html_content):
491
  """HTML을 Markdown으둜 λ³€ν™˜"""
@@ -503,7 +641,6 @@ def html_to_markdown(html_content):
503
  except:
504
  pass
505
 
506
- # κΈ°λ³Έ λ³€ν™˜
507
  if BS4_AVAILABLE:
508
  try:
509
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -514,41 +651,12 @@ def html_to_markdown(html_content):
514
  return None, "Markdown λ³€ν™˜ μ‹€νŒ¨"
515
 
516
  def convert_hwp_to_markdown(input_path: str) -> tuple:
517
- """HWPλ₯Ό Markdown으둜 λ³€ν™˜"""
518
- # λ¨Όμ € ν…μŠ€νŠΈ μΆ”μΆœ μ‹œλ„
519
- text, error = extract_text_from_hwp(input_path)
520
  if text:
521
  return text, None
522
-
523
- # HTML λ³€ν™˜ ν›„ Markdown λ³€ν™˜
524
- tmp_dir = tempfile.mkdtemp()
525
- try:
526
- html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
527
- if html_output:
528
- # HTML 파일 읽기
529
- html_files = []
530
- if os.path.isfile(html_output):
531
- html_files = [html_output]
532
- elif os.path.isdir(html_output):
533
- for root, dirs, files in os.walk(html_output):
534
- for f in files:
535
- if f.lower().endswith(('.html', '.htm')):
536
- html_files.append(os.path.join(root, f))
537
-
538
- for html_file in html_files:
539
- for enc in ['utf-8', 'cp949', 'euc-kr']:
540
- try:
541
- with open(html_file, 'r', encoding=enc) as f:
542
- content = f.read()
543
- md_text, _ = html_to_markdown(content)
544
- if md_text and len(md_text.strip()) > 10:
545
- return md_text.strip(), None
546
- except:
547
- continue
548
-
549
- return None, error or "λ³€ν™˜ μ‹€νŒ¨"
550
- finally:
551
- shutil.rmtree(tmp_dir, ignore_errors=True)
552
 
553
  # ============== LLM API ==============
554
 
@@ -646,11 +754,11 @@ def process_file(file_path: str) -> tuple:
646
  if is_image_file(file_path):
647
  return "image", image_to_base64(file_path), get_image_mime_type(file_path)
648
 
649
- if is_hwp_file(file_path):
650
- text, error = extract_text_from_hwp(file_path)
651
- if text:
652
- return "text", f"[HWP λ¬Έμ„œ: {filename}]\n\n{text}", None
653
- return "error", f"HWP μΆ”μΆœ μ‹€νŒ¨: {error}", None
654
 
655
  if is_pdf_file(file_path):
656
  text = extract_text_from_pdf(file_path)
@@ -666,7 +774,7 @@ def process_file(file_path: str) -> tuple:
666
 
667
  return "unsupported", f"μ§€μ›ν•˜μ§€ μ•ŠλŠ” ν˜•μ‹: {filename}", None
668
 
669
- def chat_response(message: str, history: List[Dict], file: Optional[str],
670
  session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
671
  if history is None:
672
  history = []
@@ -714,7 +822,7 @@ def chat_response(message: str, history: List[Dict], file: Optional[str],
714
  db_messages = get_session_messages(session_id, limit=10)
715
  api_messages = [{
716
  "role": "system",
717
- "content": "당신은 도움이 λ˜λŠ” AI μ–΄μ‹œμŠ€ν„΄νŠΈμž…λ‹ˆλ‹€. ν•œκ΅­μ–΄λ‘œ μžμ—°μŠ€λŸ½κ²Œ λŒ€ν™”ν•˜λ©°, 파일이 μ²¨λΆ€λ˜λ©΄ λ‚΄μš©μ„ λΆ„μ„ν•˜μ—¬ λ‹΅λ³€ν•©λ‹ˆλ‹€."
718
  }]
719
 
720
  for m in db_messages:
@@ -760,12 +868,14 @@ def load_session(session_id: str) -> tuple:
760
  def convert_to_odt_subprocess(input_path, output_dir):
761
  output_path = os.path.join(output_dir, "output.odt")
762
  try:
763
- result = subprocess.run(
764
- [sys.executable, '-m', 'hwp5', 'odt', '--output', output_path, input_path],
765
- capture_output=True, timeout=120
766
- )
767
- if result.returncode == 0 and os.path.exists(output_path):
768
- return output_path, None
 
 
769
  except:
770
  pass
771
  return None, "ODT λ³€ν™˜ μ‹€νŒ¨"
@@ -773,14 +883,16 @@ def convert_to_odt_subprocess(input_path, output_dir):
773
  def convert_to_xml_subprocess(input_path, output_dir):
774
  output_path = os.path.join(output_dir, "output.xml")
775
  try:
776
- result = subprocess.run(
777
- [sys.executable, '-m', 'hwp5', 'xml', input_path],
778
- capture_output=True, timeout=120
779
- )
780
- if result.returncode == 0 and result.stdout:
781
- with open(output_path, 'wb') as f:
782
- f.write(result.stdout)
783
- return output_path, None
 
 
784
  except:
785
  pass
786
  return None, "XML λ³€ν™˜ μ‹€νŒ¨"
@@ -790,8 +902,10 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
790
  return None, "❌ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.", ""
791
 
792
  input_file = file.name if hasattr(file, 'name') else str(file)
793
- if not input_file.lower().endswith('.hwp'):
794
- return None, "❌ HWP 파일만 μ§€μ›λ©λ‹ˆλ‹€.", ""
 
 
795
 
796
  progress(0.1, desc="파일 뢄석 쀑...")
797
  version, is_valid = check_hwp_version(input_file)
@@ -810,6 +924,8 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
810
  output_path, error, ext = None, None, ""
811
 
812
  if output_format == "HTML":
 
 
813
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
814
  ext = ".html"
815
  if output_path and os.path.isdir(output_path):
@@ -817,11 +933,13 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
817
  output_path, ext = zip_path, ".zip"
818
 
819
  elif output_format == "ODT (OpenDocument)":
 
 
820
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
821
  ext = ".odt"
822
 
823
  elif output_format == "TXT (ν…μŠ€νŠΈ)":
824
- text, error = extract_text_from_hwp(input_path)
825
  if text:
826
  output_path = os.path.join(tmp_dir, "output.txt")
827
  with open(output_path, 'w', encoding='utf-8') as f:
@@ -837,7 +955,24 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
837
  ext = ".md"
838
 
839
  elif output_format == "XML":
840
- output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  ext = ".xml"
842
 
843
  if not output_path:
@@ -886,7 +1021,7 @@ css = """
886
  with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
887
  session_state = gr.State("")
888
 
889
- gr.Markdown("# πŸ€– AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ\nLLM μ±„νŒ… + HWP λ¬Έμ„œ λ³€ν™˜")
890
 
891
  with gr.Tabs():
892
  with gr.Tab("πŸ’¬ AI μ±„νŒ…"):
@@ -897,7 +1032,7 @@ with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
897
  groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
898
  fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
899
 
900
- gr.Markdown("### πŸ“ 지원 파일\n- 이미지: JPG, PNG\n- λ¬Έμ„œ: PDF, TXT, HWP ✨")
901
  new_btn = gr.Button("πŸ†• μƒˆ λŒ€ν™”", variant="primary")
902
 
903
  with gr.Accordion("πŸ“œ 기둝", open=False):
@@ -916,10 +1051,10 @@ with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
916
  clear_btn = gr.Button("πŸ—‘οΈ μ§€μš°κΈ°", scale=1)
917
 
918
  with gr.Tab("πŸ“„ HWP λ³€ν™˜κΈ°"):
919
- gr.Markdown("### HWP 파일 λ³€ν™˜κΈ°")
920
  with gr.Row():
921
  with gr.Column():
922
- hwp_input = gr.File(label="HWP 파일", file_types=[".hwp"], elem_classes=["upload-box"])
923
  format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (ν…μŠ€νŠΈ)", "Markdown", "XML"], value="TXT (ν…μŠ€νŠΈ)", label="ν˜•μ‹")
924
  convert_btn = gr.Button("πŸ”„ λ³€ν™˜", variant="primary", size="lg")
925
  with gr.Column():
@@ -928,6 +1063,10 @@ with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
928
 
929
  with gr.Accordion("πŸ“‹ 미리보기", open=False):
930
  preview_out = gr.Textbox(lines=15, interactive=False)
 
 
 
 
931
 
932
  # 이벀트
933
  def on_submit(msg, hist, f, sid, gk, fk):
@@ -935,7 +1074,7 @@ with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
935
  for r in chat_response(msg, hist, f, sid, gk, fk):
936
  yield r[0], r[1], "", None
937
 
938
- submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
939
  [chatbot, session_state, msg_input, file_upload])
940
  msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
941
  [chatbot, session_state, msg_input, file_upload])
 
16
  import base64
17
  import requests
18
  import zlib
19
+ import zipfile
20
  from pathlib import Path
21
  from datetime import datetime
22
  from typing import Generator, List, Dict, Optional
23
+ from xml.etree import ElementTree as ET
24
 
25
  # ============== 환경 설정 ==============
26
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
29
 
30
  if os.path.exists(PYHWP_PATH):
31
  sys.path.insert(0, PYHWP_PATH)
 
32
 
33
  # ============== 모듈 임포트 ==============
34
  try:
 
73
  except ImportError:
74
  PDFPLUMBER_AVAILABLE = False
75
 
76
# Probe once, at import time, whether pyhwp's text extractor is usable.
# HWP5TXT_AVAILABLE ends up True when either the `hwp5txt` executable answers
# `--help` with exit code 0, or `hwp5.hwp5txt` can be imported by a child
# interpreter.
HWP5TXT_AVAILABLE = False
try:
    result = subprocess.run(['hwp5txt', '--help'], capture_output=True, timeout=5)
    if result.returncode == 0:
        HWP5TXT_AVAILABLE = True
        print("hwp5txt command available")
except (OSError, subprocess.SubprocessError):
    # Missing executable (FileNotFoundError is an OSError) or a timeout:
    # fall through to the module probe instead of aborting the import.
    pass

if not HWP5TXT_AVAILABLE:
    try:
        # Import in a child process so a broken hwp5 install cannot crash
        # this module while it is being imported.
        result = subprocess.run(
            [sys.executable, '-c', 'from hwp5.hwp5txt import main; print("ok")'],
            capture_output=True,
            timeout=5,
        )
        if b'ok' in result.stdout:
            HWP5TXT_AVAILABLE = True
            print("hwp5txt module available")
    except (OSError, subprocess.SubprocessError):
        pass

print(f"HWP5TXT_AVAILABLE: {HWP5TXT_AVAILABLE}")
97
+
98
  # ============== API 키 설정 ==============
99
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
100
  FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
 
129
  session_id = str(uuid.uuid4())
130
  conn = sqlite3.connect(DB_PATH)
131
  cursor = conn.cursor()
132
+ cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
133
+ (session_id, f"λŒ€ν™” {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
 
 
134
  conn.commit()
135
  conn.close()
136
  return session_id
 
138
def save_message(session_id: str, role: str, content: str, file_info: Optional[str] = None):
    """Store one chat message and refresh the session's ``updated_at``.

    Args:
        session_id: Session the message belongs to.
        role: Value written to the ``role`` column.
        content: Message text.
        file_info: Optional value for the ``file_info`` column.

    Raises:
        sqlite3.Error: Propagated from SQLite; the connection is closed
            before the exception leaves this function.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
            (session_id, role, content, file_info),
        )
        cursor.execute(
            "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
            (session_id,),
        )
        conn.commit()
    finally:
        # Always release the handle, even when a statement fails.
        conn.close()
146
 
147
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
    """Return the newest messages of a session, oldest first.

    Args:
        session_id: Session whose messages are read.
        limit: Maximum number of most-recent messages to fetch.

    Returns:
        A list of dicts with the keys ``role``, ``content``, ``file_info``
        and ``created_at``. Rows are selected newest-first and then reversed.

    Raises:
        sqlite3.Error: Propagated from SQLite; the connection is closed
            before the exception leaves this function.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT role, content, file_info, created_at FROM messages WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
            (session_id, limit),
        )
        rows = cursor.fetchall()
    finally:
        # Always release the handle, even when the query fails.
        conn.close()
    return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
 
156
def get_all_sessions() -> List[Dict]:
    """Return up to 50 sessions, most recently updated first.

    Returns:
        A list of dicts with the keys ``session_id``, ``title``,
        ``created_at`` and ``updated_at``.

    Raises:
        sqlite3.Error: Propagated from SQLite; the connection is closed
            before the exception leaves this function.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
        )
        rows = cursor.fetchall()
    finally:
        # Always release the handle, even when the query fails.
        conn.close()
    return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
 
214
 
215
def get_image_mime_type(file_path: str) -> str:
    """Map a file's extension (case-insensitive) to an image MIME type.

    Extensions that are not in the table fall back to ``image/jpeg``.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    suffix = Path(file_path).suffix.lower()
    if suffix in mime_by_suffix:
        return mime_by_suffix[suffix]
    return 'image/jpeg'
219
 
220
def is_image_file(fp: str) -> bool:
    """Tell whether the path's extension is one of the supported image types."""
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    suffix = Path(fp).suffix.lower()
    return suffix in image_suffixes
222
 
223
def is_hwp_file(fp: str) -> bool:
    """Tell whether the path has a ``.hwp`` extension (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwp'
225
+
226
def is_hwpx_file(fp: str) -> bool:
    """Tell whether the path has a ``.hwpx`` extension (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwpx'
228
 
229
def is_pdf_file(fp: str) -> bool:
    """Tell whether the path has a ``.pdf`` extension (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.pdf'
 
232
def is_text_file(fp: str) -> bool:
    """Tell whether the path's extension is one of the plain-text types."""
    text_suffixes = ('.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py')
    suffix = Path(fp).suffix.lower()
    return suffix in text_suffixes
234
 
235
+ # ============== HWPX 텍스트 추출 (ZIP/XML 기반) ==============
236
 
237
def _hwpx_natural_key(name: str) -> list:
    """Sort key ordering embedded numbers numerically (section2 < section10)."""
    # re.split with a capture group alternates text, digits, text, ...
    return [int(part) if idx % 2 else part.lower()
            for idx, part in enumerate(re.split(r'(\d+)', name))]


def _hwpx_local_name(tag) -> str:
    """Return an element tag without its ``{namespace}`` prefix."""
    return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''


def _hwpx_regex_texts(markup: str) -> list:
    """Fallback: collect stripped text found between tags, longer than 1 char."""
    return [t.strip() for t in re.findall(r'>([^<]+)<', markup) if len(t.strip()) > 1]


def _hwpx_section_texts(xml_bytes: bytes) -> list:
    """Collect text runs from one section XML document.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    texts = []
    for elem in ET.fromstring(xml_bytes).iter():
        name = _hwpx_local_name(elem.tag)
        if name == 't':
            # Text following an inline child of <t> lives in that child's
            # ``tail``; include it so the run is not truncated.
            run = (elem.text or '') + ''.join(child.tail or '' for child in elem)
            if run:
                texts.append(run)
        elif elem.text and elem.text.strip():
            if any(key in name.lower() for key in ('text', 'run', 'para', 'char')):
                texts.append(elem.text.strip())
    return texts


def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract text from an HWPX file by parsing the XML inside its ZIP.

    Section documents (``Contents/section*.xml``, or any ``*section*.xml``
    when that folder has none) are read in numeric order. Up to five text
    snippets from each ``*header*.xml`` member are placed before them.

    Args:
        file_path: Path of the ``.hwpx`` file.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
        Sections are separated by a blank line.
    """
    try:
        text_parts = []

        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            print(f" HWPX 내부 파일: {file_list[:10]}...")

            section_files = [f for f in file_list
                             if f.startswith('Contents/section') and f.endswith('.xml')]
            if not section_files:
                # Different layout: accept any XML member that mentions "section".
                section_files = [f for f in file_list
                                 if 'section' in f.lower() and f.endswith('.xml')]
            # Numeric ordering; a plain sort would put section10 before section2.
            section_files.sort(key=_hwpx_natural_key)

            print(f" 섹션 파일: {section_files}")

            for section_file in section_files:
                try:
                    with zf.open(section_file) as sf:
                        content = sf.read()
                    try:
                        texts = _hwpx_section_texts(content)
                    except ET.ParseError as e:
                        print(f" XML 파싱 오류 {section_file}: {e}")
                        # Malformed XML: fall back to scraping text between tags.
                        texts = _hwpx_regex_texts(content.decode('utf-8', errors='ignore'))
                    if texts:
                        text_parts.append(' '.join(texts))
                except Exception as e:
                    print(f" 섹션 파일 읽기 오류 {section_file}: {e}")
                    continue

            for header_file in [f for f in file_list if 'header' in f.lower() and f.endswith('.xml')]:
                try:
                    with zf.open(header_file) as hf:
                        content = hf.read().decode('utf-8', errors='ignore')
                    clean_texts = _hwpx_regex_texts(content)
                    # Only the first few header snippets are kept.
                    if clean_texts:
                        text_parts.insert(0, ' '.join(clean_texts[:5]))
                except Exception as e:
                    # Header text is optional; report and carry on.
                    print(f" 헤더 파일 읽기 오류 {header_file}: {e}")

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse horizontal whitespace only, so the blank lines between
            # sections survive; then cap runs of newlines at one blank line.
            result = re.sub(r'[^\S\n]+', ' ', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None

        return None, "HWPX에서 텍스트를 찾을 수 없습니다"

    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
326
 
327
+ # ============== HWP 텍스트 추출 (OLE 기반) ==============
328
+
329
+ def extract_text_with_hwp5txt(file_path: str) -> tuple:
330
+ """hwp5txt둜 ν…μŠ€νŠΈ μΆ”μΆœ"""
331
 
332
+ # 방법 1: hwp5txt λͺ…λ Ήμ–΄ 직접 μ‹€ν–‰
333
+ try:
334
+ result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
335
+ if result.returncode == 0 and result.stdout:
336
+ for enc in ['utf-8', 'cp949', 'euc-kr']:
337
+ try:
338
+ text = result.stdout.decode(enc)
339
+ if text.strip() and len(text.strip()) > 10:
340
+ return text.strip(), None
341
+ except:
342
+ continue
343
+ except FileNotFoundError:
344
+ pass
345
+ except Exception as e:
346
+ print(f" hwp5txt λͺ…λ Ήμ–΄ 였λ₯˜: {e}")
347
+
348
+ # 방법 2: Python λͺ¨λ“ˆλ‘œ μ‹€ν–‰
349
+ try:
350
+ from hwp5.hwp5txt import main as hwp5txt_main
351
+ from hwp5.hwp5txt import extract_text
352
+ from hwp5.filestructure import Hwp5File
353
 
354
+ hwp5file = Hwp5File(file_path)
355
+ texts = []
356
+
357
+ for section_idx in hwp5file.bodytext.sections():
358
+ section = hwp5file.bodytext.section(section_idx)
359
+ for para in extract_text(section):
360
+ if para.strip():
361
+ texts.append(para.strip())
362
+
363
+ hwp5file.close()
364
+
365
+ if texts:
366
+ return '\n'.join(texts), None
367
+
368
+ except ImportError:
369
+ pass
370
+ except Exception as e:
371
+ print(f" hwp5txt λͺ¨λ“ˆ 였λ₯˜: {e}")
372
+
373
+ # 방법 3: μ„œλΈŒν”„λ‘œμ„ΈμŠ€λ‘œ Python μ½”λ“œ μ‹€ν–‰
374
+ try:
375
+ code = f'''
376
+ import sys
377
+ sys.path.insert(0, "{PYHWP_PATH}")
378
+ from hwp5.filestructure import Hwp5File
379
+ from hwp5.hwp5txt import extract_text
380
+ hwp = Hwp5File("{file_path}")
381
+ for idx in hwp.bodytext.sections():
382
+ section = hwp.bodytext.section(idx)
383
+ for para in extract_text(section):
384
+ if para.strip():
385
+ print(para.strip())
386
+ hwp.close()
387
+ '''
388
+ result = subprocess.run([sys.executable, '-c', code], capture_output=True, timeout=60)
389
+ if result.returncode == 0 and result.stdout:
390
+ for enc in ['utf-8', 'cp949', 'euc-kr']:
391
  try:
392
+ text = result.stdout.decode(enc)
393
+ if text.strip() and len(text.strip()) > 10:
394
+ return text.strip(), None
395
  except:
396
+ continue
397
+ except Exception as e:
398
+ print(f" hwp5txt μ„œλΈŒν”„λ‘œμ„ΈμŠ€ 였λ₯˜: {e}")
399
+
400
+ return None, "hwp5txt μ‹€νŒ¨"
401
+
402
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP file by reading its OLE streams directly.

    Bit 0 of byte 36 of the ``FileHeader`` stream is read as the compression
    flag (treated as set when the header is too short). Every
    ``BodyText/Section*`` stream is inflated when flagged and handed to
    ``extract_hwp_section_text``.

    Args:
        file_path: Path of the ``.hwp`` file.

    Returns:
        ``(text, None)`` with sections separated by a blank line, otherwise
        ``(None, error_message)``.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    try:
        ole = olefile.OleFileIO(file_path)
    except Exception as e:
        return None, f"olefile 오류: {str(e)}"

    try:
        if not ole.exists('FileHeader'):
            return None, "HWP 파일 헤더 없음"

        header_data = ole.openstream('FileHeader').read()
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
        print(f" HWP 압축 여부: {is_compressed}")

        all_texts = []

        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            if not entry_path.startswith('BodyText/Section'):
                continue

            try:
                stream_data = ole.openstream(entry).read()

                if is_compressed:
                    try:
                        # Raw deflate first (no zlib header) ...
                        stream_data = zlib.decompress(stream_data, -15)
                    except zlib.error:
                        try:
                            # ... then a regular zlib-wrapped stream.
                            stream_data = zlib.decompress(stream_data)
                        except zlib.error:
                            # Neither worked: parse the bytes as they are.
                            pass

                section_text = extract_hwp_section_text(stream_data)
                if section_text:
                    all_texts.append(section_text)

            except Exception as e:
                print(f" 섹션 처리 오류 {entry_path}: {e}")
                continue

        if all_texts:
            return '\n\n'.join(all_texts).strip(), None

        return None, "텍스트를 찾을 수 없습니다"

    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
    finally:
        # Release the file handle on every path, including errors.
        ole.close()
459
 
460
+ def extract_hwp_section_text(data: bytes) -> str:
461
+ """HWP μ„Ήμ…˜ λ°μ΄ν„°μ—μ„œ ν…μŠ€νŠΈ μΆ”μΆœ"""
462
  texts = []
463
  pos = 0
464
 
465
  while pos < len(data) - 4:
466
  try:
467
+ # λ ˆμ½”λ“œ 헀더 읽기
468
  header = int.from_bytes(data[pos:pos+4], 'little')
469
  tag_id = header & 0x3FF
470
+ level = (header >> 10) & 0x3FF
471
  size = (header >> 20) & 0xFFF
472
 
473
  pos += 4
 
485
  record_data = data[pos:pos+size]
486
  pos += size
487
 
488
+ # HWPTAG_PARA_TEXT = 67
489
  if tag_id == 67 and size > 0:
490
+ text = decode_para_text(record_data)
 
491
  if text:
492
  texts.append(text)
493
 
494
+ except:
495
  pos += 1
496
  continue
497
 
498
  return '\n'.join(texts) if texts else None
499
 
500
+ def decode_para_text(data: bytes) -> str:
501
+ """PARA_TEXT λ ˆμ½”λ“œ λ””μ½”λ”©"""
502
  result = []
503
  i = 0
504
 
505
  while i < len(data) - 1:
506
  code = int.from_bytes(data[i:i+2], 'little')
507
 
508
+ if code == 0:
 
 
 
 
 
 
 
 
 
 
 
 
509
  pass
510
+ elif code == 1: # ν™•μž₯ 컨트둀
511
+ i += 14
512
+ elif code == 2: # μ„Ήμ…˜ μ •μ˜
513
  i += 14
514
  elif code == 3: # ν•„λ“œ μ‹œμž‘
515
  i += 14
 
523
  result.append('\n')
524
  elif code == 24: # ν•˜μ΄ν”ˆ
525
  result.append('-')
526
+ elif code == 30 or code == 31: # 빈칸
 
 
527
  result.append(' ')
528
+ elif code < 32: # 기타 컨트둀 문자
529
+ pass
530
+ else:
531
+ # 일반 문자
532
+ try:
533
+ char = chr(code)
534
+ if char.isprintable() or char in '\n\t ':
535
+ result.append(char)
536
+ except:
537
+ pass
538
 
539
  i += 2
540
 
541
  text = ''.join(result).strip()
 
 
 
 
 
 
 
 
 
542
 
543
+ # 정리
544
+ text = re.sub(r'[ \t]+', ' ', text)
545
+ text = re.sub(r'\n{3,}', '\n\n', text)
546
+
547
+ return text if len(text) > 2 else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
def extract_text_from_hwp(file_path: str) -> tuple:
    """Extract text from an HWP file, trying each extraction method in turn.

    The hwp5txt-based extractor is attempted first and the olefile-based
    parser second. A method counts as successful only when it yields text
    whose stripped length exceeds 20 characters. Progress is reported on
    stdout.

    Returns:
        (text, None) from the first successful method, otherwise
        (None, message) once every method has been tried.
    """
    print(f"\n[HWP μΆ”μΆœ] μ‹œμž‘: {os.path.basename(file_path)}")

    # (banner printed before the attempt, label used in the result line, extractor)
    strategies = (
        (" 방법 1: hwp5txt...", "hwp5txt", extract_text_with_hwp5txt),
        (" 방법 2: olefile νŒŒμ‹±...", "olefile", extract_text_with_olefile),
    )

    for banner, label, extractor in strategies:
        print(banner)
        extracted, failure = extractor(file_path)
        if extracted and len(extracted.strip()) > 20:
            print(f" βœ“ {label} 성곡: {len(extracted)} κΈ€μž")
            return extracted, None
        print(f" βœ— {label} μ‹€νŒ¨: {failure}")

    return None, "λͺ¨λ“  μΆ”μΆœ 방법 μ‹€νŒ¨"
570
 
571
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Extract text from either an HWP or an HWPX file.

    Files that is_hwpx_file() recognises go to extract_text_from_hwpx()
    (after a start banner is printed); everything else is handled by
    extract_text_from_hwp(). The chosen extractor's result is returned
    unchanged.
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)

    print(f"\n[HWPX μΆ”μΆœ] μ‹œμž‘: {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
578
+
579
  # ============== HWP 변환 함수들 ==============
580
 
581
  def check_hwp_version(file_path):
 
586
  return "HWP v5", True
587
  elif header[:4] == b'\xd0\xcf\x11\xe0':
588
  return "HWP v5 (OLE)", True
589
+ elif header[:4] == b'PK\x03\x04': # ZIP 파일 (HWPX)
590
+ return "HWPX", True
591
  else:
592
  return "Unknown", False
593
  except Exception as e:
 
598
  output_path = os.path.join(output_dir, "output.html")
599
 
600
  try:
601
+ # hwp5html μ‹œλ„
602
+ for cmd in [['hwp5html', '--output', output_path, input_path],
603
+ [sys.executable, '-c', f'from hwp5.hwp5html import main; import sys; sys.argv=["hwp5html","--output","{output_path}","{input_path}"]; main()']]:
604
+ try:
605
+ result = subprocess.run(cmd, capture_output=True, timeout=120)
606
+ if result.returncode == 0:
607
+ if os.path.exists(output_path):
608
+ return output_path, None
609
+ # 디렉토리 검색
610
+ for item in os.listdir(output_dir):
611
+ item_path = os.path.join(output_dir, item)
612
+ if item.lower().endswith(('.html', '.htm')):
 
 
 
 
 
 
 
 
 
 
613
  return item_path, None
614
+ if os.path.isdir(item_path):
615
+ return item_path, None
616
+ except:
617
+ continue
618
+
619
  except Exception as e:
620
+ print(f"HTML λ³€ν™˜ 였λ₯˜: {e}")
621
 
622
  return None, "HTML λ³€ν™˜ μ‹€νŒ¨"
623
 
624
def convert_hwp_to_text(input_path: str) -> tuple:
    """Convert an HWP/HWPX document to plain text.

    Thin wrapper: the work is done by extract_text_from_hwp_or_hwpx(), whose
    result is handed back as-is.
    """
    extraction = extract_text_from_hwp_or_hwpx(input_path)
    return extraction
627
 
628
  def html_to_markdown(html_content):
629
  """HTML을 Markdown으둜 λ³€ν™˜"""
 
641
  except:
642
  pass
643
 
 
644
  if BS4_AVAILABLE:
645
  try:
646
  soup = BeautifulSoup(html_content, 'html.parser')
 
651
  return None, "Markdown λ³€ν™˜ μ‹€νŒ¨"
652
 
653
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Convert an HWP/HWPX document to Markdown.

    The document text is obtained from extract_text_from_hwp_or_hwpx() and
    returned without further formatting in this function.

    Returns:
        (text, None) when text was extracted, otherwise (None, error) with
        the error reported by the extractor.
    """
    extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
    return (extracted, None) if extracted else (None, failure)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # ============== LLM API ==============
662
 
 
754
  if is_image_file(file_path):
755
  return "image", image_to_base64(file_path), get_image_mime_type(file_path)
756
 
757
+ if is_hwp_file(file_path) or is_hwpx_file(file_path):
758
+ text, error = extract_text_from_hwp_or_hwpx(file_path)
759
+ if text and len(text.strip()) > 20:
760
+ return "text", f"[ν•œκΈ€ λ¬Έμ„œ: {filename}]\n\n{text}", None
761
+ return "error", f"ν•œκΈ€ λ¬Έμ„œ μΆ”μΆœ μ‹€νŒ¨: {error}", None
762
 
763
  if is_pdf_file(file_path):
764
  text = extract_text_from_pdf(file_path)
 
774
 
775
  return "unsupported", f"μ§€μ›ν•˜μ§€ μ•ŠλŠ” ν˜•μ‹: {filename}", None
776
 
777
+ def chat_response(message: str, history: List[Dict], file: Optional[str],
778
  session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
779
  if history is None:
780
  history = []
 
822
  db_messages = get_session_messages(session_id, limit=10)
823
  api_messages = [{
824
  "role": "system",
825
+ "content": "당신은 도움이 λ˜λŠ” AI μ–΄μ‹œμŠ€ν„΄νŠΈμž…λ‹ˆλ‹€. ν•œκ΅­μ–΄λ‘œ μžμ—°μŠ€λŸ½κ²Œ λŒ€ν™”ν•˜λ©°, 파일이 μ²¨λΆ€λ˜λ©΄ λ‚΄μš©μ„ μƒμ„Ένžˆ λΆ„μ„ν•˜μ—¬ λ‹΅λ³€ν•©λ‹ˆλ‹€."
826
  }]
827
 
828
  for m in db_messages:
 
868
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running pyhwp's hwp5odt in a subprocess.

    Two launchers are tried in order: the ``hwp5odt`` console script, then
    ``hwp5.hwp5odt.main`` through the current Python interpreter.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        (output_path, None) when a launcher exits with status 0 and the
        output file exists, otherwise (None, message).
    """
    output_path = os.path.join(output_dir, "output.odt")
    tool_args = ['--output', output_path, input_path]

    # The paths are passed as real argv entries instead of being spliced into
    # the ``-c`` source text: a quote or backslash in a path would otherwise
    # break (or inject into) the generated Python code.
    launcher = 'import sys; from hwp5.hwp5odt import main; sys.argv[0] = "hwp5odt"; main()'
    commands = (
        ['hwp5odt', *tool_args],
        [sys.executable, '-c', launcher, *tool_args],
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
        except (OSError, ValueError, subprocess.SubprocessError):
            # Launcher missing/not executable, invalid argument, or timeout:
            # fall through to the next launcher.
            continue
        if result.returncode == 0 and os.path.exists(output_path):
            return output_path, None

    return None, "ODT λ³€ν™˜ μ‹€νŒ¨"
 
883
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running pyhwp's hwp5xml in a subprocess.

    Two launchers are tried in order: the ``hwp5xml`` console script, then
    ``hwp5.hwp5xml.main`` through the current Python interpreter. The tool's
    stdout is captured and written to ``output.xml``.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        (output_path, None) when a launcher exits with status 0, produced
        non-empty stdout, and that output was saved; otherwise
        (None, message).
    """
    output_path = os.path.join(output_dir, "output.xml")

    # The input path is passed as a real argv entry instead of being spliced
    # into the ``-c`` source text: a quote or backslash in the path would
    # otherwise break (or inject into) the generated Python code.
    launcher = 'import sys; from hwp5.hwp5xml import main; sys.argv[0] = "hwp5xml"; main()'
    commands = (
        ['hwp5xml', input_path],
        [sys.executable, '-c', launcher, input_path],
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
        except (OSError, ValueError, subprocess.SubprocessError):
            # Launcher missing/not executable, invalid argument, or timeout:
            # fall through to the next launcher.
            continue
        if result.returncode != 0 or not result.stdout:
            continue
        try:
            with open(output_path, 'wb') as xml_file:
                xml_file.write(result.stdout)
        except OSError:
            # Could not save the captured XML; try the next launcher.
            continue
        return output_path, None

    return None, "XML λ³€ν™˜ μ‹€νŒ¨"
 
902
  return None, "❌ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.", ""
903
 
904
  input_file = file.name if hasattr(file, 'name') else str(file)
905
+ ext_lower = Path(input_file).suffix.lower()
906
+
907
+ if ext_lower not in ['.hwp', '.hwpx']:
908
+ return None, "❌ HWP λ˜λŠ” HWPX 파일만 μ§€μ›λ©λ‹ˆλ‹€.", ""
909
 
910
  progress(0.1, desc="파일 뢄석 쀑...")
911
  version, is_valid = check_hwp_version(input_file)
 
924
  output_path, error, ext = None, None, ""
925
 
926
  if output_format == "HTML":
927
+ if ext_lower == '.hwpx':
928
+ return None, "❌ HWPXλŠ” HTML λ³€ν™˜μ„ μ§€μ›ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€. TXTλ‚˜ Markdown을 μ‚¬μš©ν•˜μ„Έμš”.", ""
929
  output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
930
  ext = ".html"
931
  if output_path and os.path.isdir(output_path):
 
933
  output_path, ext = zip_path, ".zip"
934
 
935
  elif output_format == "ODT (OpenDocument)":
936
+ if ext_lower == '.hwpx':
937
+ return None, "❌ HWPXλŠ” ODT λ³€ν™˜μ„ μ§€μ›ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€. TXTλ‚˜ Markdown을 μ‚¬μš©ν•˜μ„Έμš”.", ""
938
  output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
939
  ext = ".odt"
940
 
941
  elif output_format == "TXT (ν…μŠ€νŠΈ)":
942
+ text, error = extract_text_from_hwp_or_hwpx(input_path)
943
  if text:
944
  output_path = os.path.join(tmp_dir, "output.txt")
945
  with open(output_path, 'w', encoding='utf-8') as f:
 
955
  ext = ".md"
956
 
957
  elif output_format == "XML":
958
+ if ext_lower == '.hwpx':
959
+ # HWPXλŠ” 이미 XML κΈ°λ°˜μ΄λ―€λ‘œ λ‚΄λΆ€ XML μΆ”μΆœ
960
+ try:
961
+ with zipfile.ZipFile(input_path, 'r') as zf:
962
+ # λͺ¨λ“  XML νŒŒμΌμ„ ν•˜λ‚˜λ‘œ ν•©μΉ¨
963
+ xml_contents = []
964
+ for name in zf.namelist():
965
+ if name.endswith('.xml'):
966
+ with zf.open(name) as f:
967
+ xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
968
+
969
+ output_path = os.path.join(tmp_dir, "output.xml")
970
+ with open(output_path, 'w', encoding='utf-8') as f:
971
+ f.write('\n\n'.join(xml_contents))
972
+ except Exception as e:
973
+ error = f"HWPX XML μΆ”μΆœ μ‹€νŒ¨: {e}"
974
+ else:
975
+ output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
976
  ext = ".xml"
977
 
978
  if not output_path:
 
1021
  with gr.Blocks(title="AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ") as demo:
1022
  session_state = gr.State("")
1023
 
1024
+ gr.Markdown("# πŸ€– AI λ¬Έμ„œ μ–΄μ‹œμŠ€ν„΄νŠΈ\nLLM μ±„νŒ… + HWP/HWPX λ¬Έμ„œ λ³€ν™˜")
1025
 
1026
  with gr.Tabs():
1027
  with gr.Tab("πŸ’¬ AI μ±„νŒ…"):
 
1032
  groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
1033
  fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
1034
 
1035
+ gr.Markdown("### πŸ“ 지원 파일\n- 이미지: JPG, PNG\n- λ¬Έμ„œ: PDF, TXT\n- ν•œκΈ€: HWP, HWPX ✨")
1036
  new_btn = gr.Button("πŸ†• μƒˆ λŒ€ν™”", variant="primary")
1037
 
1038
  with gr.Accordion("πŸ“œ 기둝", open=False):
 
1051
  clear_btn = gr.Button("πŸ—‘οΈ μ§€μš°κΈ°", scale=1)
1052
 
1053
  with gr.Tab("πŸ“„ HWP λ³€ν™˜κΈ°"):
1054
+ gr.Markdown("### HWP/HWPX 파일 λ³€ν™˜κΈ°")
1055
  with gr.Row():
1056
  with gr.Column():
1057
+ hwp_input = gr.File(label="HWP/HWPX 파일", file_types=[".hwp", ".hwpx"], elem_classes=["upload-box"])
1058
  format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (ν…μŠ€νŠΈ)", "Markdown", "XML"], value="TXT (ν…μŠ€νŠΈ)", label="ν˜•μ‹")
1059
  convert_btn = gr.Button("πŸ”„ λ³€ν™˜", variant="primary", size="lg")
1060
  with gr.Column():
 
1063
 
1064
  with gr.Accordion("πŸ“‹ 미리보기", open=False):
1065
  preview_out = gr.Textbox(lines=15, interactive=False)
1066
+
1067
+ gr.Markdown("""
1068
+ > **μ°Έκ³ **: HWPX νŒŒμΌμ€ TXT, Markdown, XML λ³€ν™˜λ§Œ μ§€μ›λ©λ‹ˆλ‹€.
1069
+ """)
1070
 
1071
  # 이벀트
1072
  def on_submit(msg, hist, f, sid, gk, fk):
 
1074
  for r in chat_response(msg, hist, f, sid, gk, fk):
1075
  yield r[0], r[1], "", None
1076
 
1077
+ submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
1078
  [chatbot, session_state, msg_input, file_upload])
1079
  msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
1080
  [chatbot, session_state, msg_input, file_upload])