Update app.py
Browse files
app.py
CHANGED
|
@@ -16,9 +16,11 @@ import sqlite3
|
|
| 16 |
import base64
|
| 17 |
import requests
|
| 18 |
import zlib
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
from datetime import datetime
|
| 21 |
from typing import Generator, List, Dict, Optional
|
|
|
|
| 22 |
|
| 23 |
# ============== 환경 설정 ==============
|
| 24 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
@@ -27,7 +29,6 @@ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
|
|
| 27 |
|
| 28 |
if os.path.exists(PYHWP_PATH):
|
| 29 |
sys.path.insert(0, PYHWP_PATH)
|
| 30 |
-
print(f"Added local pyhwp path: {PYHWP_PATH}")
|
| 31 |
|
| 32 |
# ============== 모듈 임포트 ==============
|
| 33 |
try:
|
|
@@ -72,6 +73,28 @@ try:
|
|
| 72 |
except ImportError:
|
| 73 |
PDFPLUMBER_AVAILABLE = False
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# ============== API 키 설정 ==============
|
| 76 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 77 |
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
|
|
@@ -106,10 +129,8 @@ def create_session() -> str:
|
|
| 106 |
session_id = str(uuid.uuid4())
|
| 107 |
conn = sqlite3.connect(DB_PATH)
|
| 108 |
cursor = conn.cursor()
|
| 109 |
-
cursor.execute(
|
| 110 |
-
|
| 111 |
-
(session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
| 112 |
-
)
|
| 113 |
conn.commit()
|
| 114 |
conn.close()
|
| 115 |
return session_id
|
|
@@ -117,26 +138,17 @@ def create_session() -> str:
|
|
| 117 |
def save_message(session_id: str, role: str, content: str, file_info: str = None):
|
| 118 |
conn = sqlite3.connect(DB_PATH)
|
| 119 |
cursor = conn.cursor()
|
| 120 |
-
cursor.execute(
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
)
|
| 124 |
-
cursor.execute(
|
| 125 |
-
"UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
|
| 126 |
-
(session_id,)
|
| 127 |
-
)
|
| 128 |
conn.commit()
|
| 129 |
conn.close()
|
| 130 |
|
| 131 |
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
|
| 132 |
conn = sqlite3.connect(DB_PATH)
|
| 133 |
cursor = conn.cursor()
|
| 134 |
-
cursor.execute(
|
| 135 |
-
|
| 136 |
-
FROM messages WHERE session_id = ?
|
| 137 |
-
ORDER BY created_at DESC LIMIT ?""",
|
| 138 |
-
(session_id, limit)
|
| 139 |
-
)
|
| 140 |
rows = cursor.fetchall()
|
| 141 |
conn.close()
|
| 142 |
return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
|
|
@@ -144,9 +156,7 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
|
|
| 144 |
def get_all_sessions() -> List[Dict]:
    """Return up to 50 sessions, most recently updated first.

    Returns:
        A list of dicts with the keys ``session_id``, ``title``,
        ``created_at`` and ``updated_at``, in the order the query yields them.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
        )
        rows = cursor.fetchall()
    finally:
        # Close the connection even when the query raises, so the handle is not leaked.
        conn.close()
    return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
|
|
@@ -204,14 +214,17 @@ def image_to_base64(file_path: str) -> str:
|
|
| 204 |
|
| 205 |
def get_image_mime_type(file_path: str) -> str:
    """Map an image file's extension to its MIME type.

    The extension is compared case-insensitively; anything not in the
    table falls back to ``image/jpeg``.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    suffix = Path(file_path).suffix.lower()
    return mime_by_suffix.get(suffix, 'image/jpeg')
|
| 209 |
|
| 210 |
def is_image_file(fp: str) -> bool:
    """Return True when the path's extension (case-insensitive) is a supported image type."""
    suffix = Path(fp).suffix.lower()
    image_suffixes = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
    return suffix in image_suffixes
|
| 212 |
|
| 213 |
def is_hwp_file(fp: str) -> bool:
|
| 214 |
-
return Path(fp).suffix.lower()
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
def is_pdf_file(fp: str) -> bool:
    """Return True when the path ends in ``.pdf`` (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.pdf'
|
|
@@ -219,57 +232,242 @@ def is_pdf_file(fp: str) -> bool:
|
|
| 219 |
def is_text_file(fp: str) -> bool:
    """Return True when the path's extension (case-insensitive) is one of the plain-text types."""
    suffix = Path(fp).suffix.lower()
    text_suffixes = ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
    return suffix in text_suffixes
|
| 221 |
|
| 222 |
-
# ==============
|
| 223 |
|
| 224 |
-
def
|
| 225 |
-
"""
|
| 226 |
try:
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
try:
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
except:
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
def
|
| 264 |
-
"""
|
| 265 |
texts = []
|
| 266 |
pos = 0
|
| 267 |
|
| 268 |
while pos < len(data) - 4:
|
| 269 |
try:
|
| 270 |
-
# 레코드 헤더
|
| 271 |
header = int.from_bytes(data[pos:pos+4], 'little')
|
| 272 |
tag_id = header & 0x3FF
|
|
|
|
| 273 |
size = (header >> 20) & 0xFFF
|
| 274 |
|
| 275 |
pos += 4
|
|
@@ -287,44 +485,31 @@ def extract_text_from_hwp_records(data: bytes) -> str:
|
|
| 287 |
record_data = data[pos:pos+size]
|
| 288 |
pos += size
|
| 289 |
|
| 290 |
-
# HWPTAG_PARA_TEXT = 67
|
| 291 |
if tag_id == 67 and size > 0:
|
| 292 |
-
|
| 293 |
-
text = extract_para_text(record_data)
|
| 294 |
if text:
|
| 295 |
texts.append(text)
|
| 296 |
|
| 297 |
-
except
|
| 298 |
pos += 1
|
| 299 |
continue
|
| 300 |
|
| 301 |
return '\n'.join(texts) if texts else None
|
| 302 |
|
| 303 |
-
def
|
| 304 |
-
"""PARA_TEXT 레코드
|
| 305 |
result = []
|
| 306 |
i = 0
|
| 307 |
|
| 308 |
while i < len(data) - 1:
|
| 309 |
code = int.from_bytes(data[i:i+2], 'little')
|
| 310 |
|
| 311 |
-
|
| 312 |
-
if code >= 32:
|
| 313 |
-
try:
|
| 314 |
-
char = chr(code)
|
| 315 |
-
# 한글, 영문, 숫자, 일반 기호만 허용
|
| 316 |
-
if char.isprintable() and not (0x4E00 <= code <= 0x9FFF and code not in range(0xAC00, 0xD7A4)):
|
| 317 |
-
result.append(char)
|
| 318 |
-
elif 0xAC00 <= code <= 0xD7A3: # 한글 음절
|
| 319 |
-
result.append(char)
|
| 320 |
-
except:
|
| 321 |
-
pass
|
| 322 |
-
# 컨트롤 문자 처리
|
| 323 |
-
elif code == 0: # NULL
|
| 324 |
pass
|
| 325 |
-
elif code == 1: #
|
| 326 |
-
i += 14
|
| 327 |
-
elif code == 2: # 섹션
|
| 328 |
i += 14
|
| 329 |
elif code == 3: # 필드 시작
|
| 330 |
i += 14
|
|
@@ -338,99 +523,59 @@ def extract_para_text(data: bytes) -> str:
|
|
| 338 |
result.append('\n')
|
| 339 |
elif code == 24: # 하이픈
|
| 340 |
result.append('-')
|
| 341 |
-
elif code == 30: #
|
| 342 |
-
result.append(' ')
|
| 343 |
-
elif code == 31: # 고정폭 빈칸
|
| 344 |
result.append(' ')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
i += 2
|
| 347 |
|
| 348 |
text = ''.join(result).strip()
|
| 349 |
-
# 의미 없는 텍스트 필터링
|
| 350 |
-
if len(text) < 2:
|
| 351 |
-
return None
|
| 352 |
-
return text
|
| 353 |
-
|
| 354 |
-
def extract_text_with_olefile(file_path: str) -> tuple:
|
| 355 |
-
"""olefile을 사용한 HWP 텍스트 추출"""
|
| 356 |
-
if not OLEFILE_AVAILABLE:
|
| 357 |
-
return None, "olefile 모듈 없음"
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
ole.close()
|
| 365 |
-
return None, "HWP 파일 헤더 없음"
|
| 366 |
-
|
| 367 |
-
# 압축 여부 확인
|
| 368 |
-
header_data = ole.openstream('FileHeader').read()
|
| 369 |
-
is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
|
| 370 |
-
print(f" HWP 압축 여부: {is_compressed}")
|
| 371 |
-
|
| 372 |
-
# BodyText에서 텍스트 추출
|
| 373 |
-
text = extract_hwp_text_from_bodytext(ole)
|
| 374 |
-
|
| 375 |
-
ole.close()
|
| 376 |
-
|
| 377 |
-
if text and len(text.strip()) > 10:
|
| 378 |
-
return text.strip(), None
|
| 379 |
-
|
| 380 |
-
return None, "텍스트 추출 실패"
|
| 381 |
-
|
| 382 |
-
except Exception as e:
|
| 383 |
-
return None, f"olefile 오류: {str(e)}"
|
| 384 |
-
|
| 385 |
-
def extract_text_with_hwp5txt(file_path: str) -> tuple:
    """Extract text from an HWP file by running ``python -m hwp5 txt`` in a subprocess.

    Args:
        file_path: Path of the HWP file handed to the converter.

    Returns:
        ``(text, None)`` when the converter exits with status 0 and its output
        decodes to more than 10 non-blank characters; otherwise
        ``(None, message)`` describing the failure.
    """
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'hwp5', 'txt', file_path],
            capture_output=True,
            timeout=60
        )

        if result.returncode == 0 and result.stdout:
            # The child's output encoding is not known up front, so try the likely ones in order.
            for enc in ['utf-8', 'cp949', 'euc-kr']:
                try:
                    text = result.stdout.decode(enc)
                except UnicodeDecodeError:
                    # Only a decoding failure should move on to the next codec.
                    continue
                if text.strip() and len(text.strip()) > 10:
                    return text.strip(), None

        stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
        return None, f"hwp5txt 실패: {stderr[:100]}"

    except subprocess.TimeoutExpired:
        return None, "hwp5txt 타임아웃"
    except Exception as e:
        return None, f"hwp5txt 오류: {str(e)}"
|
| 411 |
|
| 412 |
def extract_text_from_hwp(file_path: str) -> tuple:
|
| 413 |
"""HWP 파일에서 텍스트 추출 (메인 함수)"""
|
| 414 |
print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")
|
| 415 |
|
| 416 |
-
# 방법 1: hwp5txt
|
| 417 |
-
print(" 방법 1: hwp5txt
|
| 418 |
text, error = extract_text_with_hwp5txt(file_path)
|
| 419 |
-
if text:
|
| 420 |
print(f" ✓ hwp5txt 성공: {len(text)} 글자")
|
| 421 |
return text, None
|
| 422 |
print(f" ✗ hwp5txt 실패: {error}")
|
| 423 |
|
| 424 |
-
# 방법 2: olefile
|
| 425 |
print(" 방법 2: olefile 파싱...")
|
| 426 |
text, error = extract_text_with_olefile(file_path)
|
| 427 |
-
if text:
|
| 428 |
print(f" ✓ olefile 성공: {len(text)} 글자")
|
| 429 |
return text, None
|
| 430 |
print(f" ✗ olefile 실패: {error}")
|
| 431 |
|
| 432 |
return None, "모든 추출 방법 실패"
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
# ============== HWP 변환 함수들 ==============
|
| 435 |
|
| 436 |
def check_hwp_version(file_path):
|
|
@@ -441,6 +586,8 @@ def check_hwp_version(file_path):
|
|
| 441 |
return "HWP v5", True
|
| 442 |
elif header[:4] == b'\xd0\xcf\x11\xe0':
|
| 443 |
return "HWP v5 (OLE)", True
|
|
|
|
|
|
|
| 444 |
else:
|
| 445 |
return "Unknown", False
|
| 446 |
except Exception as e:
|
|
@@ -451,41 +598,32 @@ def convert_to_html_subprocess(input_path, output_dir):
|
|
| 451 |
output_path = os.path.join(output_dir, "output.html")
|
| 452 |
|
| 453 |
try:
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
return output_path, None
|
| 467 |
-
|
| 468 |
-
# 다른 위치 검색
|
| 469 |
-
for item in os.listdir(output_dir):
|
| 470 |
-
item_path = os.path.join(output_dir, item)
|
| 471 |
-
if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
|
| 472 |
-
return item_path, None
|
| 473 |
-
if os.path.isdir(item_path):
|
| 474 |
-
for sub in os.listdir(item_path):
|
| 475 |
-
if sub.lower().endswith(('.html', '.htm')):
|
| 476 |
return item_path, None
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
| 481 |
except Exception as e:
|
| 482 |
-
|
| 483 |
|
| 484 |
return None, "HTML 변환 실패"
|
| 485 |
|
| 486 |
def convert_hwp_to_text(input_path: str) -> tuple:
|
| 487 |
-
"""HWP를 텍스트로 변환"""
|
| 488 |
-
return
|
| 489 |
|
| 490 |
def html_to_markdown(html_content):
|
| 491 |
"""HTML을 Markdown으로 변환"""
|
|
@@ -503,7 +641,6 @@ def html_to_markdown(html_content):
|
|
| 503 |
except:
|
| 504 |
pass
|
| 505 |
|
| 506 |
-
# 기본 변환
|
| 507 |
if BS4_AVAILABLE:
|
| 508 |
try:
|
| 509 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
@@ -514,41 +651,12 @@ def html_to_markdown(html_content):
|
|
| 514 |
return None, "Markdown 변환 실패"
|
| 515 |
|
| 516 |
def convert_hwp_to_markdown(input_path: str) -> tuple:
|
| 517 |
-
"""HWP를 Markdown으로 변환"""
|
| 518 |
-
#
|
| 519 |
-
text, error =
|
| 520 |
if text:
|
| 521 |
return text, None
|
| 522 |
-
|
| 523 |
-
# HTML 변환 후 Markdown 변환
|
| 524 |
-
tmp_dir = tempfile.mkdtemp()
|
| 525 |
-
try:
|
| 526 |
-
html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 527 |
-
if html_output:
|
| 528 |
-
# HTML 파일 읽기
|
| 529 |
-
html_files = []
|
| 530 |
-
if os.path.isfile(html_output):
|
| 531 |
-
html_files = [html_output]
|
| 532 |
-
elif os.path.isdir(html_output):
|
| 533 |
-
for root, dirs, files in os.walk(html_output):
|
| 534 |
-
for f in files:
|
| 535 |
-
if f.lower().endswith(('.html', '.htm')):
|
| 536 |
-
html_files.append(os.path.join(root, f))
|
| 537 |
-
|
| 538 |
-
for html_file in html_files:
|
| 539 |
-
for enc in ['utf-8', 'cp949', 'euc-kr']:
|
| 540 |
-
try:
|
| 541 |
-
with open(html_file, 'r', encoding=enc) as f:
|
| 542 |
-
content = f.read()
|
| 543 |
-
md_text, _ = html_to_markdown(content)
|
| 544 |
-
if md_text and len(md_text.strip()) > 10:
|
| 545 |
-
return md_text.strip(), None
|
| 546 |
-
except:
|
| 547 |
-
continue
|
| 548 |
-
|
| 549 |
-
return None, error or "변환 실패"
|
| 550 |
-
finally:
|
| 551 |
-
shutil.rmtree(tmp_dir, ignore_errors=True)
|
| 552 |
|
| 553 |
# ============== LLM API ==============
|
| 554 |
|
|
@@ -646,11 +754,11 @@ def process_file(file_path: str) -> tuple:
|
|
| 646 |
if is_image_file(file_path):
|
| 647 |
return "image", image_to_base64(file_path), get_image_mime_type(file_path)
|
| 648 |
|
| 649 |
-
if is_hwp_file(file_path):
|
| 650 |
-
text, error =
|
| 651 |
-
if text:
|
| 652 |
-
return "text", f"[
|
| 653 |
-
return "error", f"
|
| 654 |
|
| 655 |
if is_pdf_file(file_path):
|
| 656 |
text = extract_text_from_pdf(file_path)
|
|
@@ -666,7 +774,7 @@ def process_file(file_path: str) -> tuple:
|
|
| 666 |
|
| 667 |
return "unsupported", f"지원하지 않는 형식: {filename}", None
|
| 668 |
|
| 669 |
-
def chat_response(message: str, history: List[Dict], file: Optional[str],
|
| 670 |
session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
|
| 671 |
if history is None:
|
| 672 |
history = []
|
|
@@ -714,7 +822,7 @@ def chat_response(message: str, history: List[Dict], file: Optional[str],
|
|
| 714 |
db_messages = get_session_messages(session_id, limit=10)
|
| 715 |
api_messages = [{
|
| 716 |
"role": "system",
|
| 717 |
-
"content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 분석하여 답변합니다."
|
| 718 |
}]
|
| 719 |
|
| 720 |
for m in db_messages:
|
|
@@ -760,12 +868,14 @@ def load_session(session_id: str) -> tuple:
|
|
| 760 |
def convert_to_odt_subprocess(input_path, output_dir):
|
| 761 |
output_path = os.path.join(output_dir, "output.odt")
|
| 762 |
try:
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
|
|
|
|
|
|
| 769 |
except:
|
| 770 |
pass
|
| 771 |
return None, "ODT 변환 실패"
|
|
@@ -773,14 +883,16 @@ def convert_to_odt_subprocess(input_path, output_dir):
|
|
| 773 |
def convert_to_xml_subprocess(input_path, output_dir):
|
| 774 |
output_path = os.path.join(output_dir, "output.xml")
|
| 775 |
try:
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
|
|
|
|
|
|
| 784 |
except:
|
| 785 |
pass
|
| 786 |
return None, "XML 변환 실패"
|
|
@@ -790,8 +902,10 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 790 |
return None, "❌ 파일을 업로드해주세요.", ""
|
| 791 |
|
| 792 |
input_file = file.name if hasattr(file, 'name') else str(file)
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
| 795 |
|
| 796 |
progress(0.1, desc="파일 분석 중...")
|
| 797 |
version, is_valid = check_hwp_version(input_file)
|
|
@@ -810,6 +924,8 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 810 |
output_path, error, ext = None, None, ""
|
| 811 |
|
| 812 |
if output_format == "HTML":
|
|
|
|
|
|
|
| 813 |
output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 814 |
ext = ".html"
|
| 815 |
if output_path and os.path.isdir(output_path):
|
|
@@ -817,11 +933,13 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 817 |
output_path, ext = zip_path, ".zip"
|
| 818 |
|
| 819 |
elif output_format == "ODT (OpenDocument)":
|
|
|
|
|
|
|
| 820 |
output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
|
| 821 |
ext = ".odt"
|
| 822 |
|
| 823 |
elif output_format == "TXT (텍스트)":
|
| 824 |
-
text, error =
|
| 825 |
if text:
|
| 826 |
output_path = os.path.join(tmp_dir, "output.txt")
|
| 827 |
with open(output_path, 'w', encoding='utf-8') as f:
|
|
@@ -837,7 +955,24 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 837 |
ext = ".md"
|
| 838 |
|
| 839 |
elif output_format == "XML":
|
| 840 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
ext = ".xml"
|
| 842 |
|
| 843 |
if not output_path:
|
|
@@ -886,7 +1021,7 @@ css = """
|
|
| 886 |
with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
| 887 |
session_state = gr.State("")
|
| 888 |
|
| 889 |
-
gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP 문서 변환")
|
| 890 |
|
| 891 |
with gr.Tabs():
|
| 892 |
with gr.Tab("💬 AI 채팅"):
|
|
@@ -897,7 +1032,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
|
| 897 |
groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
|
| 898 |
fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
|
| 899 |
|
| 900 |
-
gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT
|
| 901 |
new_btn = gr.Button("🆕 새 대화", variant="primary")
|
| 902 |
|
| 903 |
with gr.Accordion("📜 기록", open=False):
|
|
@@ -916,10 +1051,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
|
| 916 |
clear_btn = gr.Button("🗑️ 지우기", scale=1)
|
| 917 |
|
| 918 |
with gr.Tab("📄 HWP 변환기"):
|
| 919 |
-
gr.Markdown("### HWP 파일 변환기")
|
| 920 |
with gr.Row():
|
| 921 |
with gr.Column():
|
| 922 |
-
hwp_input = gr.File(label="HWP 파일", file_types=[".hwp"], elem_classes=["upload-box"])
|
| 923 |
format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
|
| 924 |
convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
|
| 925 |
with gr.Column():
|
|
@@ -928,6 +1063,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
|
| 928 |
|
| 929 |
with gr.Accordion("📋 미리보기", open=False):
|
| 930 |
preview_out = gr.Textbox(lines=15, interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
|
| 932 |
# 이벤트
|
| 933 |
def on_submit(msg, hist, f, sid, gk, fk):
|
|
@@ -935,7 +1074,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
|
| 935 |
for r in chat_response(msg, hist, f, sid, gk, fk):
|
| 936 |
yield r[0], r[1], "", None
|
| 937 |
|
| 938 |
-
submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 939 |
[chatbot, session_state, msg_input, file_upload])
|
| 940 |
msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 941 |
[chatbot, session_state, msg_input, file_upload])
|
|
|
|
| 16 |
import base64
|
| 17 |
import requests
|
| 18 |
import zlib
|
| 19 |
+
import zipfile
|
| 20 |
from pathlib import Path
|
| 21 |
from datetime import datetime
|
| 22 |
from typing import Generator, List, Dict, Optional
|
| 23 |
+
from xml.etree import ElementTree as ET
|
| 24 |
|
| 25 |
# ============== 환경 설정 ==============
|
| 26 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
| 29 |
|
| 30 |
if os.path.exists(PYHWP_PATH):
|
| 31 |
sys.path.insert(0, PYHWP_PATH)
|
|
|
|
| 32 |
|
| 33 |
# ============== 모듈 임포트 ==============
|
| 34 |
try:
|
|
|
|
| 73 |
except ImportError:
|
| 74 |
PDFPLUMBER_AVAILABLE = False
|
| 75 |
|
| 76 |
+
# Detect whether the hwp5txt converter can be used, and record it in HWP5TXT_AVAILABLE.
HWP5TXT_AVAILABLE = False
try:
    # First choice: the `hwp5txt` executable on PATH.
    result = subprocess.run(['hwp5txt', '--help'], capture_output=True, timeout=5)
    if result.returncode == 0:
        HWP5TXT_AVAILABLE = True
        print("hwp5txt command available")
except (OSError, subprocess.SubprocessError):
    # Missing executable (FileNotFoundError is an OSError) or a timeout:
    # fall through to the module probe. Ctrl-C and SystemExit still propagate.
    pass

if not HWP5TXT_AVAILABLE:
    try:
        # Second choice: check in a child interpreter that hwp5.hwp5txt is importable.
        result = subprocess.run([sys.executable, '-c', 'from hwp5.hwp5txt import main; print("ok")'],
                                capture_output=True, timeout=5)
        if b'ok' in result.stdout:
            HWP5TXT_AVAILABLE = True
            print("hwp5txt module available")
    except (OSError, subprocess.SubprocessError):
        pass

print(f"HWP5TXT_AVAILABLE: {HWP5TXT_AVAILABLE}")
|
| 97 |
+
|
| 98 |
# ============== API 키 설정 ==============
|
| 99 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 100 |
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
|
|
|
|
| 129 |
session_id = str(uuid.uuid4())
|
| 130 |
conn = sqlite3.connect(DB_PATH)
|
| 131 |
cursor = conn.cursor()
|
| 132 |
+
cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
|
| 133 |
+
(session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
|
|
|
|
|
|
|
| 134 |
conn.commit()
|
| 135 |
conn.close()
|
| 136 |
return session_id
|
|
|
|
| 138 |
def save_message(session_id: str, role: str, content: str, file_info: Optional[str] = None):
    """Store one chat message and bump the owning session's ``updated_at``.

    Args:
        session_id: Session the message belongs to.
        role: Value written to the ``role`` column.
        content: Message text.
        file_info: Optional value for the ``file_info`` column; stored as NULL when omitted.

    Both statements run on one connection and are committed together.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
                       (session_id, role, content, file_info))
        cursor.execute("UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
        conn.commit()
    finally:
        # Close even if a statement fails, so the handle is released.
        # The uncommitted INSERT is discarded along with the connection.
        conn.close()
|
| 146 |
|
| 147 |
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
    """Return the most recent messages of a session, oldest first.

    Args:
        session_id: Session whose messages are fetched.
        limit: Maximum number of messages to return.

    Returns:
        A list of dicts with the keys ``role``, ``content``, ``file_info``
        and ``created_at``. The newest ``limit`` rows are selected and then
        reversed into chronological order.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        # rowid breaks ties between rows that share a created_at value; without it
        # the relative order of such rows is unspecified.
        # NOTE(review): assumes `messages` is an ordinary rowid table (not WITHOUT ROWID) — confirm.
        cursor.execute(
            "SELECT role, content, file_info, created_at FROM messages "
            "WHERE session_id = ? ORDER BY created_at DESC, rowid DESC LIMIT ?",
            (session_id, limit))
        rows = cursor.fetchall()
    finally:
        # Release the handle even when the query raises.
        conn.close()
    return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
|
|
|
|
| 156 |
def get_all_sessions() -> List[Dict]:
    """Return up to 50 sessions, most recently updated first.

    Returns:
        A list of dicts with the keys ``session_id``, ``title``,
        ``created_at`` and ``updated_at``, in the order the query yields them.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50")
        rows = cursor.fetchall()
    finally:
        # Close the connection even when the query raises, so the handle is not leaked.
        conn.close()
    return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
|
|
|
|
| 214 |
|
| 215 |
def get_image_mime_type(file_path: str) -> str:
    """Map an image file's extension to its MIME type.

    The extension is compared case-insensitively; anything not in the
    table falls back to ``image/jpeg``.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    suffix = Path(file_path).suffix.lower()
    return mime_by_suffix.get(suffix, 'image/jpeg')
|
| 219 |
|
| 220 |
def is_image_file(fp: str) -> bool:
    """Return True when the path's extension (case-insensitive) is a supported image type."""
    suffix = Path(fp).suffix.lower()
    image_suffixes = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
    return suffix in image_suffixes
|
| 222 |
|
| 223 |
def is_hwp_file(fp: str) -> bool:
    """Return True when the path ends in ``.hwp`` (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwp'
|
| 225 |
+
|
| 226 |
+
def is_hwpx_file(fp: str) -> bool:
    """Return True when the path ends in ``.hwpx`` (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwpx'
|
| 228 |
|
| 229 |
def is_pdf_file(fp: str) -> bool:
    """Return True when the path ends in ``.pdf`` (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.pdf'
|
|
|
|
| 232 |
def is_text_file(fp: str) -> bool:
    """Return True when the path's extension (case-insensitive) is one of the plain-text types."""
    suffix = Path(fp).suffix.lower()
    text_suffixes = ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
    return suffix in text_suffixes
|
| 234 |
|
| 235 |
+
# ============== HWPX 텍스트 추출 (ZIP/XML 기반) ==============
|
| 236 |
|
| 237 |
+
def _hwpx_section_order(name: str) -> tuple:
    """Sort key that orders section files by their numeric suffix (section2 before section10)."""
    match = re.search(r'(\d+)\.xml$', name)
    return (int(match.group(1)) if match else -1, name)


def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract text from an HWPX file by parsing the XML inside the ZIP container.

    Args:
        file_path: Path of the ``.hwpx`` file.

    Returns:
        ``(text, None)`` on success, where the text of each section file is
        joined with single spaces and sections are separated by a blank line;
        ``(None, message)`` when the archive is invalid, unreadable, or holds
        no text.
    """
    try:
        text_parts = []

        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            print(f" HWPX 내부 파일: {file_list[:10]}...")

            # Prefer the Contents/section*.xml layout, in numeric (not lexicographic) order.
            section_files = sorted(
                (f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')),
                key=_hwpx_section_order)

            if not section_files:
                # Fall back to any XML member whose name mentions "section".
                section_files = sorted(
                    (f for f in file_list if 'section' in f.lower() and f.endswith('.xml')),
                    key=_hwpx_section_order)

            print(f" 섹션 파일: {section_files}")

            for section_file in section_files:
                try:
                    content = zf.read(section_file)

                    try:
                        # Parse the raw bytes and compare local tag names; this avoids
                        # rewriting the XML with regexes to strip namespace prefixes.
                        root = ET.fromstring(content)

                        texts = []
                        for elem in root.iter():
                            tag = elem.tag.rsplit('}', 1)[-1] if isinstance(elem.tag, str) else ''
                            if tag == 't':
                                # Text run element.
                                # NOTE(review): text following an inline child of <t>
                                # (the child's .tail) is not collected here.
                                if elem.text:
                                    texts.append(elem.text)
                            elif elem.text and elem.text.strip():
                                # Other elements count only when the tag name looks text-related.
                                if any(x in tag.lower() for x in ('text', 'run', 'para', 'char')):
                                    texts.append(elem.text.strip())

                        if texts:
                            text_parts.append(' '.join(texts))

                    except ET.ParseError as e:
                        print(f" XML 파싱 오류 {section_file}: {e}")
                        # Malformed XML: salvage whatever sits between tags.
                        text_matches = re.findall(r'>([^<]+)<', content.decode('utf-8', errors='ignore'))
                        clean_texts = [t.strip() for t in text_matches if t.strip() and len(t.strip()) > 1]
                        if clean_texts:
                            text_parts.append(' '.join(clean_texts))

                except Exception as e:
                    print(f" 섹션 파일 읽기 오류 {section_file}: {e}")
                    continue

            # Also try header XML members; only their first few text snippets are kept.
            for header_file in [f for f in file_list if 'header' in f.lower() and f.endswith('.xml')]:
                try:
                    with zf.open(header_file) as hf:
                        content = hf.read().decode('utf-8', errors='ignore')
                    text_matches = re.findall(r'>([^<]+)<', content)
                    clean_texts = [t.strip() for t in text_matches if t.strip() and len(t.strip()) > 1]
                    if clean_texts:
                        text_parts.insert(0, ' '.join(clean_texts[:5]))
                except Exception:
                    # Header text is optional; skip a member that cannot be read.
                    pass

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse runs of horizontal whitespace only, so the blank-line
            # separators between sections survive the clean-up.
            result = re.sub(r'[^\S\n]+', ' ', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None

        return None, "HWPX에서 텍스트를 찾을 수 없습니다"

    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
|
| 326 |
|
| 327 |
+
# ============== HWP 텍스트 추출 (OLE 기반) ==============
|
| 328 |
+
|
| 329 |
+
def _decode_hwp5_output(raw: bytes) -> Optional[str]:
    """Decode converter output, returning stripped text longer than 10 characters or None."""
    for enc in ('utf-8', 'cp949', 'euc-kr'):
        try:
            text = raw.decode(enc).strip()
        except UnicodeDecodeError:
            # Only a decoding failure should move on to the next codec.
            continue
        if len(text) > 10:
            return text
    return None


def extract_text_with_hwp5txt(file_path: str) -> tuple:
    """Extract text from an HWP file with hwp5txt, trying three strategies in turn.

    1. Run the ``hwp5txt`` executable.
    2. Import the ``hwp5`` package in this process.
    3. Run the same extraction in a child interpreter with ``PYHWP_PATH``
       put first on its ``sys.path``.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` from the first strategy that yields text, otherwise
        ``(None, "hwp5txt 실패")``.
    """
    # Strategy 1: the hwp5txt executable.
    try:
        result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
        if result.returncode == 0 and result.stdout:
            text = _decode_hwp5_output(result.stdout)
            if text:
                return text, None
    except FileNotFoundError:
        # Executable not installed; move on silently.
        pass
    except Exception as e:
        print(f" hwp5txt 명령어 오류: {e}")

    # Strategy 2: in-process via the hwp5 package.
    try:
        from hwp5.hwp5txt import extract_text
        from hwp5.filestructure import Hwp5File

        hwp5file = Hwp5File(file_path)
        try:
            texts = []
            for section_idx in hwp5file.bodytext.sections():
                section = hwp5file.bodytext.section(section_idx)
                for para in extract_text(section):
                    if para.strip():
                        texts.append(para.strip())
        finally:
            # Release the file even when extraction fails part-way.
            hwp5file.close()

        if texts:
            return '\n'.join(texts), None

    except ImportError:
        pass
    except Exception as e:
        print(f" hwp5txt 모듈 오류: {e}")

    # Strategy 3: the same extraction in a child interpreter.
    try:
        # Both paths travel as argv rather than being pasted into the source, so
        # quotes or backslashes in a file name cannot break or alter the child program.
        code = (
            "import sys\n"
            "sys.path.insert(0, sys.argv[1])\n"
            "from hwp5.filestructure import Hwp5File\n"
            "from hwp5.hwp5txt import extract_text\n"
            "hwp = Hwp5File(sys.argv[2])\n"
            "for idx in hwp.bodytext.sections():\n"
            "    section = hwp.bodytext.section(idx)\n"
            "    for para in extract_text(section):\n"
            "        if para.strip():\n"
            "            print(para.strip())\n"
            "hwp.close()\n"
        )
        result = subprocess.run([sys.executable, '-c', code, str(PYHWP_PATH), file_path],
                                capture_output=True, timeout=60)
        if result.returncode == 0 and result.stdout:
            text = _decode_hwp5_output(result.stdout)
            if text:
                return text, None
    except Exception as e:
        print(f" hwp5txt 서브프로세스 오류: {e}")

    return None, "hwp5txt 실패"
|
| 401 |
+
|
| 402 |
+
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP (OLE compound) file using olefile.

    Reads the ``FileHeader`` stream for the compression flag, then decodes
    every ``BodyText/Section*`` stream with ``extract_hwp_section_text``.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` with the section texts joined by blank lines, or
        ``(None, message)`` when olefile is unavailable, the header stream is
        missing, no text was found, or an error occurred.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    try:
        ole = olefile.OleFileIO(file_path)
        try:
            # An HWP container is expected to carry a FileHeader stream.
            if not ole.exists('FileHeader'):
                return None, "HWP 파일 헤더 없음"

            # Bit 0 of byte 36 is read as the "compressed" flag; a short header assumes compression.
            header_data = ole.openstream('FileHeader').read()
            is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
            print(f" HWP 압축 여부: {is_compressed}")

            def _section_index(entry):
                match = re.search(r'(\d+)$', entry[-1])
                return int(match.group(1)) if match else -1

            # Process sections in numeric order rather than directory-listing order.
            section_entries = sorted(
                (e for e in ole.listdir() if '/'.join(e).startswith('BodyText/Section')),
                key=_section_index)

            all_texts = []
            for entry in section_entries:
                entry_path = '/'.join(entry)
                try:
                    stream_data = ole.openstream(entry).read()

                    if is_compressed:
                        try:
                            # Raw deflate stream (no zlib header).
                            stream_data = zlib.decompress(stream_data, -15)
                        except zlib.error:
                            try:
                                stream_data = zlib.decompress(stream_data)
                            except zlib.error:
                                # Not compressed after all: parse the bytes as they are.
                                pass

                    # Pull the text out of the section's records.
                    section_text = extract_hwp_section_text(stream_data)
                    if section_text:
                        all_texts.append(section_text)

                except Exception as e:
                    print(f" 섹션 처리 오류 {entry_path}: {e}")
                    continue
        finally:
            # Always release the OLE handle, including on early return and on errors.
            ole.close()

        if all_texts:
            result = '\n\n'.join(all_texts)
            return result.strip(), None

        return None, "텍스트를 찾을 수 없습니다"

    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
|
| 459 |
|
| 460 |
+
def extract_hwp_section_text(data: bytes) -> str:
|
| 461 |
+
"""HWP 섹션 데이터에서 텍스트 추출"""
|
| 462 |
texts = []
|
| 463 |
pos = 0
|
| 464 |
|
| 465 |
while pos < len(data) - 4:
|
| 466 |
try:
|
| 467 |
+
# 레코드 헤더 읽기
|
| 468 |
header = int.from_bytes(data[pos:pos+4], 'little')
|
| 469 |
tag_id = header & 0x3FF
|
| 470 |
+
level = (header >> 10) & 0x3FF
|
| 471 |
size = (header >> 20) & 0xFFF
|
| 472 |
|
| 473 |
pos += 4
|
|
|
|
| 485 |
record_data = data[pos:pos+size]
|
| 486 |
pos += size
|
| 487 |
|
| 488 |
+
# HWPTAG_PARA_TEXT = 67
|
| 489 |
if tag_id == 67 and size > 0:
|
| 490 |
+
text = decode_para_text(record_data)
|
|
|
|
| 491 |
if text:
|
| 492 |
texts.append(text)
|
| 493 |
|
| 494 |
+
except:
|
| 495 |
pos += 1
|
| 496 |
continue
|
| 497 |
|
| 498 |
return '\n'.join(texts) if texts else None
|
| 499 |
|
| 500 |
+
def decode_para_text(data: bytes) -> str:
|
| 501 |
+
"""PARA_TEXT 레코드 디코딩"""
|
| 502 |
result = []
|
| 503 |
i = 0
|
| 504 |
|
| 505 |
while i < len(data) - 1:
|
| 506 |
code = int.from_bytes(data[i:i+2], 'little')
|
| 507 |
|
| 508 |
+
if code == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
pass
|
| 510 |
+
elif code == 1: # 확장 컨트롤
|
| 511 |
+
i += 14
|
| 512 |
+
elif code == 2: # 섹션 정의
|
| 513 |
i += 14
|
| 514 |
elif code == 3: # 필드 시작
|
| 515 |
i += 14
|
|
|
|
| 523 |
result.append('\n')
|
| 524 |
elif code == 24: # 하이픈
|
| 525 |
result.append('-')
|
| 526 |
+
elif code == 30 or code == 31: # 빈칸
|
|
|
|
|
|
|
| 527 |
result.append(' ')
|
| 528 |
+
elif code < 32: # 기타 컨트롤 문자
|
| 529 |
+
pass
|
| 530 |
+
else:
|
| 531 |
+
# 일반 문자
|
| 532 |
+
try:
|
| 533 |
+
char = chr(code)
|
| 534 |
+
if char.isprintable() or char in '\n\t ':
|
| 535 |
+
result.append(char)
|
| 536 |
+
except:
|
| 537 |
+
pass
|
| 538 |
|
| 539 |
i += 2
|
| 540 |
|
| 541 |
text = ''.join(result).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
+
# 정리
|
| 544 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 545 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 546 |
+
|
| 547 |
+
return text if len(text) > 2 else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
|
| 549 |
def extract_text_from_hwp(file_path: str) -> tuple:
    """Extract text from an HWP file (main entry point).

    Tries ``extract_text_with_hwp5txt`` first and falls back to
    ``extract_text_with_olefile``. A result is accepted only when its
    stripped length exceeds 20 characters.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` from the first extractor that produced enough text,
        otherwise ``(None, "모든 추출 방법 실패")``.
    """
    def _long_enough(candidate) -> bool:
        # Acceptance rule shared by both extraction attempts.
        return bool(candidate) and len(candidate.strip()) > 20

    print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")

    # Method 1: hwp5txt
    print(" 방법 1: hwp5txt...")
    hwp5_text, hwp5_error = extract_text_with_hwp5txt(file_path)
    if _long_enough(hwp5_text):
        print(f" ✓ hwp5txt 성공: {len(hwp5_text)} 글자")
        return hwp5_text, None
    print(f" ✗ hwp5txt 실패: {hwp5_error}")

    # Method 2: olefile
    print(" 방법 2: olefile 파싱...")
    ole_text, ole_error = extract_text_with_olefile(file_path)
    if _long_enough(ole_text):
        print(f" ✓ olefile 성공: {len(ole_text)} 글자")
        return ole_text, None
    print(f" ✗ olefile 실패: {ole_error}")

    return None, "모든 추출 방법 실패"
|
| 570 |
|
| 571 |
+
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Extract text from either an HWP or an HWPX file.

    Files recognised by ``is_hwpx_file`` go to ``extract_text_from_hwpx``;
    everything else goes to ``extract_text_from_hwp``.

    Args:
        file_path: Path of the document.

    Returns:
        Whatever the selected extractor returns.
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)

    print(f"\n[HWPX 추출] 시작: {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
|
| 578 |
+
|
| 579 |
# ============== HWP 변환 함수들 ==============
|
| 580 |
|
| 581 |
def check_hwp_version(file_path):
|
|
|
|
| 586 |
return "HWP v5", True
|
| 587 |
elif header[:4] == b'\xd0\xcf\x11\xe0':
|
| 588 |
return "HWP v5 (OLE)", True
|
| 589 |
+
elif header[:4] == b'PK\x03\x04': # ZIP 파일 (HWPX)
|
| 590 |
+
return "HWPX", True
|
| 591 |
else:
|
| 592 |
return "Unknown", False
|
| 593 |
except Exception as e:
|
|
|
|
| 598 |
output_path = os.path.join(output_dir, "output.html")
|
| 599 |
|
| 600 |
try:
|
| 601 |
+
# hwp5html 시도
|
| 602 |
+
for cmd in [['hwp5html', '--output', output_path, input_path],
|
| 603 |
+
[sys.executable, '-c', f'from hwp5.hwp5html import main; import sys; sys.argv=["hwp5html","--output","{output_path}","{input_path}"]; main()']]:
|
| 604 |
+
try:
|
| 605 |
+
result = subprocess.run(cmd, capture_output=True, timeout=120)
|
| 606 |
+
if result.returncode == 0:
|
| 607 |
+
if os.path.exists(output_path):
|
| 608 |
+
return output_path, None
|
| 609 |
+
# 디렉토리 검색
|
| 610 |
+
for item in os.listdir(output_dir):
|
| 611 |
+
item_path = os.path.join(output_dir, item)
|
| 612 |
+
if item.lower().endswith(('.html', '.htm')):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
return item_path, None
|
| 614 |
+
if os.path.isdir(item_path):
|
| 615 |
+
return item_path, None
|
| 616 |
+
except:
|
| 617 |
+
continue
|
| 618 |
+
|
| 619 |
except Exception as e:
|
| 620 |
+
print(f"HTML 변환 오류: {e}")
|
| 621 |
|
| 622 |
return None, "HTML 변환 실패"
|
| 623 |
|
| 624 |
def convert_hwp_to_text(input_path: str) -> tuple:
    """Convert an HWP/HWPX file to plain text.

    Thin wrapper that delegates to ``extract_text_from_hwp_or_hwpx`` and
    returns its result unchanged.
    """
    extraction_result = extract_text_from_hwp_or_hwpx(input_path)
    return extraction_result
|
| 627 |
|
| 628 |
def html_to_markdown(html_content):
|
| 629 |
"""HTML을 Markdown으로 변환"""
|
|
|
|
| 641 |
except:
|
| 642 |
pass
|
| 643 |
|
|
|
|
| 644 |
if BS4_AVAILABLE:
|
| 645 |
try:
|
| 646 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
| 651 |
return None, "Markdown 변환 실패"
|
| 652 |
|
| 653 |
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Convert an HWP/HWPX file to Markdown.

    The extracted text is returned as-is; no further Markdown formatting is
    applied in this function.

    Returns:
        ``(text, None)`` when extraction produced non-empty text, otherwise
        ``(None, error)`` with the extractor's error value.
    """
    # Extract the document text.
    extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
    return (extracted, None) if extracted else (None, failure)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# ============== LLM API ==============
|
| 662 |
|
|
|
|
| 754 |
if is_image_file(file_path):
|
| 755 |
return "image", image_to_base64(file_path), get_image_mime_type(file_path)
|
| 756 |
|
| 757 |
+
if is_hwp_file(file_path) or is_hwpx_file(file_path):
|
| 758 |
+
text, error = extract_text_from_hwp_or_hwpx(file_path)
|
| 759 |
+
if text and len(text.strip()) > 20:
|
| 760 |
+
return "text", f"[한글 문서: {filename}]\n\n{text}", None
|
| 761 |
+
return "error", f"한글 문서 추출 실패: {error}", None
|
| 762 |
|
| 763 |
if is_pdf_file(file_path):
|
| 764 |
text = extract_text_from_pdf(file_path)
|
|
|
|
| 774 |
|
| 775 |
return "unsupported", f"지원하지 않는 형식: {filename}", None
|
| 776 |
|
| 777 |
+
def chat_response(message: str, history: List[Dict], file: Optional[str],
|
| 778 |
session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
|
| 779 |
if history is None:
|
| 780 |
history = []
|
|
|
|
| 822 |
db_messages = get_session_messages(session_id, limit=10)
|
| 823 |
api_messages = [{
|
| 824 |
"role": "system",
|
| 825 |
+
"content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 상세히 분석하여 답변합니다."
|
| 826 |
}]
|
| 827 |
|
| 828 |
for m in db_messages:
|
|
|
|
| 868 |
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running hwp5odt in a subprocess.

    Two invocations are attempted in order: the ``hwp5odt`` executable, then
    ``hwp5.hwp5odt.main`` through the current Python interpreter.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        ``(output_path, None)`` when an invocation exits with status 0 and the
        output file exists, otherwise ``(None, "ODT 변환 실패")``.
    """
    output_path = os.path.join(output_dir, "output.odt")
    cli_args = ['--output', output_path, input_path]

    # Paths are passed as argv entries instead of being formatted into the
    # ``-c`` source: a quote or backslash in a path would otherwise break the
    # generated code or inject statements into it.
    launcher = 'import sys; from hwp5.hwp5odt import main; sys.argv[0] = "hwp5odt"; main()'
    commands = [
        ['hwp5odt', *cli_args],
        [sys.executable, '-c', launcher, *cli_args],
    ]

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
        except Exception:
            # Missing executable, timeout, etc.: try the next invocation.
            continue
        if result.returncode == 0 and os.path.exists(output_path):
            return output_path, None

    return None, "ODT 변환 실패"
|
|
|
|
| 883 |
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running hwp5xml in a subprocess.

    Two invocations are attempted in order: the ``hwp5xml`` executable, then
    ``hwp5.hwp5xml.main`` through the current Python interpreter. The XML is
    taken from the subprocess's standard output and written to
    ``output.xml``.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        ``(output_path, None)`` when an invocation exits with status 0 and
        produced output, otherwise ``(None, "XML 변환 실패")``.
    """
    output_path = os.path.join(output_dir, "output.xml")

    # The input path is passed as an argv entry instead of being formatted
    # into the ``-c`` source: a quote or backslash in the path would otherwise
    # break the generated code or inject statements into it.
    launcher = 'import sys; from hwp5.hwp5xml import main; sys.argv[0] = "hwp5xml"; main()'
    commands = [
        ['hwp5xml', input_path],
        [sys.executable, '-c', launcher, input_path],
    ]

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
            if result.returncode == 0 and result.stdout:
                with open(output_path, 'wb') as f:
                    f.write(result.stdout)
                return output_path, None
        except Exception:
            # Missing executable, timeout or write failure: try the next one.
            continue

    return None, "XML 변환 실패"
|
|
|
|
| 902 |
return None, "❌ 파일을 업로드해주세요.", ""
|
| 903 |
|
| 904 |
input_file = file.name if hasattr(file, 'name') else str(file)
|
| 905 |
+
ext_lower = Path(input_file).suffix.lower()
|
| 906 |
+
|
| 907 |
+
if ext_lower not in ['.hwp', '.hwpx']:
|
| 908 |
+
return None, "❌ HWP 또는 HWPX 파일만 지원됩니다.", ""
|
| 909 |
|
| 910 |
progress(0.1, desc="파일 분석 중...")
|
| 911 |
version, is_valid = check_hwp_version(input_file)
|
|
|
|
| 924 |
output_path, error, ext = None, None, ""
|
| 925 |
|
| 926 |
if output_format == "HTML":
|
| 927 |
+
if ext_lower == '.hwpx':
|
| 928 |
+
return None, "❌ HWPX는 HTML 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
|
| 929 |
output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 930 |
ext = ".html"
|
| 931 |
if output_path and os.path.isdir(output_path):
|
|
|
|
| 933 |
output_path, ext = zip_path, ".zip"
|
| 934 |
|
| 935 |
elif output_format == "ODT (OpenDocument)":
|
| 936 |
+
if ext_lower == '.hwpx':
|
| 937 |
+
return None, "❌ HWPX는 ODT 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
|
| 938 |
output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
|
| 939 |
ext = ".odt"
|
| 940 |
|
| 941 |
elif output_format == "TXT (텍스트)":
|
| 942 |
+
text, error = extract_text_from_hwp_or_hwpx(input_path)
|
| 943 |
if text:
|
| 944 |
output_path = os.path.join(tmp_dir, "output.txt")
|
| 945 |
with open(output_path, 'w', encoding='utf-8') as f:
|
|
|
|
| 955 |
ext = ".md"
|
| 956 |
|
| 957 |
elif output_format == "XML":
|
| 958 |
+
if ext_lower == '.hwpx':
|
| 959 |
+
# HWPX는 이미 XML 기반이므로 내부 XML 추출
|
| 960 |
+
try:
|
| 961 |
+
with zipfile.ZipFile(input_path, 'r') as zf:
|
| 962 |
+
# 모든 XML 파일을 하나로 합침
|
| 963 |
+
xml_contents = []
|
| 964 |
+
for name in zf.namelist():
|
| 965 |
+
if name.endswith('.xml'):
|
| 966 |
+
with zf.open(name) as f:
|
| 967 |
+
xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
|
| 968 |
+
|
| 969 |
+
output_path = os.path.join(tmp_dir, "output.xml")
|
| 970 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
| 971 |
+
f.write('\n\n'.join(xml_contents))
|
| 972 |
+
except Exception as e:
|
| 973 |
+
error = f"HWPX XML 추출 실패: {e}"
|
| 974 |
+
else:
|
| 975 |
+
output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
|
| 976 |
ext = ".xml"
|
| 977 |
|
| 978 |
if not output_path:
|
|
|
|
| 1021 |
with gr.Blocks(title="AI 문서 어시스턴트") as demo:
|
| 1022 |
session_state = gr.State("")
|
| 1023 |
|
| 1024 |
+
gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP/HWPX 문서 변환")
|
| 1025 |
|
| 1026 |
with gr.Tabs():
|
| 1027 |
with gr.Tab("💬 AI 채팅"):
|
|
|
|
| 1032 |
groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
|
| 1033 |
fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
|
| 1034 |
|
| 1035 |
+
gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT\n- 한글: HWP, HWPX ✨")
|
| 1036 |
new_btn = gr.Button("🆕 새 대화", variant="primary")
|
| 1037 |
|
| 1038 |
with gr.Accordion("📜 기록", open=False):
|
|
|
|
| 1051 |
clear_btn = gr.Button("🗑️ 지우기", scale=1)
|
| 1052 |
|
| 1053 |
with gr.Tab("📄 HWP 변환기"):
|
| 1054 |
+
gr.Markdown("### HWP/HWPX 파일 변환기")
|
| 1055 |
with gr.Row():
|
| 1056 |
with gr.Column():
|
| 1057 |
+
hwp_input = gr.File(label="HWP/HWPX 파일", file_types=[".hwp", ".hwpx"], elem_classes=["upload-box"])
|
| 1058 |
format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
|
| 1059 |
convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
|
| 1060 |
with gr.Column():
|
|
|
|
| 1063 |
|
| 1064 |
with gr.Accordion("📋 미리보기", open=False):
|
| 1065 |
preview_out = gr.Textbox(lines=15, interactive=False)
|
| 1066 |
+
|
| 1067 |
+
gr.Markdown("""
|
| 1068 |
+
> **참고**: HWPX 파일은 TXT, Markdown, XML 변환만 지원됩니다.
|
| 1069 |
+
""")
|
| 1070 |
|
| 1071 |
# 이벤트
|
| 1072 |
def on_submit(msg, hist, f, sid, gk, fk):
|
|
|
|
| 1074 |
for r in chat_response(msg, hist, f, sid, gk, fk):
|
| 1075 |
yield r[0], r[1], "", None
|
| 1076 |
|
| 1077 |
+
submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 1078 |
[chatbot, session_state, msg_input, file_upload])
|
| 1079 |
msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 1080 |
[chatbot, session_state, msg_input, file_upload])
|