Update app.py
Browse files
app.py
CHANGED
|
@@ -16,9 +16,11 @@ import sqlite3
|
|
| 16 |
import base64
|
| 17 |
import requests
|
| 18 |
import zlib
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
from datetime import datetime
|
| 21 |
from typing import Generator, List, Dict, Optional
|
|
|
|
| 22 |
|
| 23 |
# ============== νκ²½ μ€μ ==============
|
| 24 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
@@ -27,7 +29,6 @@ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
|
|
| 27 |
|
| 28 |
if os.path.exists(PYHWP_PATH):
|
| 29 |
sys.path.insert(0, PYHWP_PATH)
|
| 30 |
-
print(f"Added local pyhwp path: {PYHWP_PATH}")
|
| 31 |
|
| 32 |
# ============== λͺ¨λ μν¬νΈ ==============
|
| 33 |
try:
|
|
@@ -72,6 +73,28 @@ try:
|
|
| 72 |
except ImportError:
|
| 73 |
PDFPLUMBER_AVAILABLE = False
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# ============== API ν€ μ€μ ==============
|
| 76 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 77 |
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
|
|
@@ -106,10 +129,8 @@ def create_session() -> str:
|
|
| 106 |
session_id = str(uuid.uuid4())
|
| 107 |
conn = sqlite3.connect(DB_PATH)
|
| 108 |
cursor = conn.cursor()
|
| 109 |
-
cursor.execute(
|
| 110 |
-
|
| 111 |
-
(session_id, f"λν {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
| 112 |
-
)
|
| 113 |
conn.commit()
|
| 114 |
conn.close()
|
| 115 |
return session_id
|
|
@@ -117,26 +138,17 @@ def create_session() -> str:
|
|
| 117 |
def save_message(session_id: str, role: str, content: str, file_info: str = None):
|
| 118 |
conn = sqlite3.connect(DB_PATH)
|
| 119 |
cursor = conn.cursor()
|
| 120 |
-
cursor.execute(
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
)
|
| 124 |
-
cursor.execute(
|
| 125 |
-
"UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
|
| 126 |
-
(session_id,)
|
| 127 |
-
)
|
| 128 |
conn.commit()
|
| 129 |
conn.close()
|
| 130 |
|
| 131 |
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
|
| 132 |
conn = sqlite3.connect(DB_PATH)
|
| 133 |
cursor = conn.cursor()
|
| 134 |
-
cursor.execute(
|
| 135 |
-
|
| 136 |
-
FROM messages WHERE session_id = ?
|
| 137 |
-
ORDER BY created_at DESC LIMIT ?""",
|
| 138 |
-
(session_id, limit)
|
| 139 |
-
)
|
| 140 |
rows = cursor.fetchall()
|
| 141 |
conn.close()
|
| 142 |
return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
|
|
@@ -144,9 +156,7 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
|
|
| 144 |
def get_all_sessions() -> List[Dict]:
|
| 145 |
conn = sqlite3.connect(DB_PATH)
|
| 146 |
cursor = conn.cursor()
|
| 147 |
-
cursor.execute(
|
| 148 |
-
"SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
|
| 149 |
-
)
|
| 150 |
rows = cursor.fetchall()
|
| 151 |
conn.close()
|
| 152 |
return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
|
|
@@ -204,14 +214,17 @@ def image_to_base64(file_path: str) -> str:
|
|
| 204 |
|
| 205 |
def get_image_mime_type(file_path: str) -> str:
|
| 206 |
ext = Path(file_path).suffix.lower()
|
| 207 |
-
return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
|
| 208 |
'.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
|
| 209 |
|
| 210 |
def is_image_file(fp: str) -> bool:
|
| 211 |
return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
|
| 212 |
|
| 213 |
def is_hwp_file(fp: str) -> bool:
|
| 214 |
-
return Path(fp).suffix.lower()
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
def is_pdf_file(fp: str) -> bool:
|
| 217 |
return Path(fp).suffix.lower() == '.pdf'
|
|
@@ -219,57 +232,242 @@ def is_pdf_file(fp: str) -> bool:
|
|
| 219 |
def is_text_file(fp: str) -> bool:
|
| 220 |
return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
|
| 221 |
|
| 222 |
-
# ==============
|
| 223 |
|
| 224 |
-
def
|
| 225 |
-
"""
|
| 226 |
try:
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
try:
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
except:
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
def
|
| 264 |
-
"""
|
| 265 |
texts = []
|
| 266 |
pos = 0
|
| 267 |
|
| 268 |
while pos < len(data) - 4:
|
| 269 |
try:
|
| 270 |
-
# λ μ½λ ν€λ
|
| 271 |
header = int.from_bytes(data[pos:pos+4], 'little')
|
| 272 |
tag_id = header & 0x3FF
|
|
|
|
| 273 |
size = (header >> 20) & 0xFFF
|
| 274 |
|
| 275 |
pos += 4
|
|
@@ -287,44 +485,31 @@ def extract_text_from_hwp_records(data: bytes) -> str:
|
|
| 287 |
record_data = data[pos:pos+size]
|
| 288 |
pos += size
|
| 289 |
|
| 290 |
-
# HWPTAG_PARA_TEXT = 67
|
| 291 |
if tag_id == 67 and size > 0:
|
| 292 |
-
|
| 293 |
-
text = extract_para_text(record_data)
|
| 294 |
if text:
|
| 295 |
texts.append(text)
|
| 296 |
|
| 297 |
-
except
|
| 298 |
pos += 1
|
| 299 |
continue
|
| 300 |
|
| 301 |
return '\n'.join(texts) if texts else None
|
| 302 |
|
| 303 |
-
def
|
| 304 |
-
"""PARA_TEXT λ μ½λ
|
| 305 |
result = []
|
| 306 |
i = 0
|
| 307 |
|
| 308 |
while i < len(data) - 1:
|
| 309 |
code = int.from_bytes(data[i:i+2], 'little')
|
| 310 |
|
| 311 |
-
|
| 312 |
-
if code >= 32:
|
| 313 |
-
try:
|
| 314 |
-
char = chr(code)
|
| 315 |
-
# νκΈ, μλ¬Έ, μ«μ, μΌλ° κΈ°νΈλ§ νμ©
|
| 316 |
-
if char.isprintable() and not (0x4E00 <= code <= 0x9FFF and code not in range(0xAC00, 0xD7A4)):
|
| 317 |
-
result.append(char)
|
| 318 |
-
elif 0xAC00 <= code <= 0xD7A3: # νκΈ μμ
|
| 319 |
-
result.append(char)
|
| 320 |
-
except:
|
| 321 |
-
pass
|
| 322 |
-
# 컨νΈλ‘€ λ¬Έμ μ²λ¦¬
|
| 323 |
-
elif code == 0: # NULL
|
| 324 |
pass
|
| 325 |
-
elif code == 1: #
|
| 326 |
-
i += 14
|
| 327 |
-
elif code == 2: # μΉμ
|
| 328 |
i += 14
|
| 329 |
elif code == 3: # νλ μμ
|
| 330 |
i += 14
|
|
@@ -338,99 +523,59 @@ def extract_para_text(data: bytes) -> str:
|
|
| 338 |
result.append('\n')
|
| 339 |
elif code == 24: # νμ΄ν
|
| 340 |
result.append('-')
|
| 341 |
-
elif code == 30: #
|
| 342 |
-
result.append(' ')
|
| 343 |
-
elif code == 31: # κ³ μ ν λΉμΉΈ
|
| 344 |
result.append(' ')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
i += 2
|
| 347 |
|
| 348 |
text = ''.join(result).strip()
|
| 349 |
-
# μλ―Έ μλ ν
μ€νΈ νν°λ§
|
| 350 |
-
if len(text) < 2:
|
| 351 |
-
return None
|
| 352 |
-
return text
|
| 353 |
-
|
| 354 |
-
def extract_text_with_olefile(file_path: str) -> tuple:
|
| 355 |
-
"""olefileμ μ¬μ©ν HWP ν
μ€νΈ μΆμΆ"""
|
| 356 |
-
if not OLEFILE_AVAILABLE:
|
| 357 |
-
return None, "olefile λͺ¨λ μμ"
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
ole.close()
|
| 365 |
-
return None, "HWP νμΌ ν€λ μμ"
|
| 366 |
-
|
| 367 |
-
# μμΆ μ¬λΆ νμΈ
|
| 368 |
-
header_data = ole.openstream('FileHeader').read()
|
| 369 |
-
is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
|
| 370 |
-
print(f" HWP μμΆ μ¬λΆ: {is_compressed}")
|
| 371 |
-
|
| 372 |
-
# BodyTextμμ ν
μ€νΈ μΆμΆ
|
| 373 |
-
text = extract_hwp_text_from_bodytext(ole)
|
| 374 |
-
|
| 375 |
-
ole.close()
|
| 376 |
-
|
| 377 |
-
if text and len(text.strip()) > 10:
|
| 378 |
-
return text.strip(), None
|
| 379 |
-
|
| 380 |
-
return None, "ν
μ€νΈ μΆμΆ μ€ν¨"
|
| 381 |
-
|
| 382 |
-
except Exception as e:
|
| 383 |
-
return None, f"olefile μ€λ₯: {str(e)}"
|
| 384 |
-
|
| 385 |
-
def extract_text_with_hwp5txt(file_path: str) -> tuple:
|
| 386 |
-
"""hwp5txt λͺ
λ Ήμ΄λ‘ ν
μ€νΈ μΆμΆ"""
|
| 387 |
-
try:
|
| 388 |
-
result = subprocess.run(
|
| 389 |
-
[sys.executable, '-m', 'hwp5', 'txt', file_path],
|
| 390 |
-
capture_output=True,
|
| 391 |
-
timeout=60
|
| 392 |
-
)
|
| 393 |
-
|
| 394 |
-
if result.returncode == 0 and result.stdout:
|
| 395 |
-
# μ¬λ¬ μΈμ½λ© μλ
|
| 396 |
-
for enc in ['utf-8', 'cp949', 'euc-kr']:
|
| 397 |
-
try:
|
| 398 |
-
text = result.stdout.decode(enc)
|
| 399 |
-
if text.strip() and len(text.strip()) > 10:
|
| 400 |
-
return text.strip(), None
|
| 401 |
-
except:
|
| 402 |
-
continue
|
| 403 |
-
|
| 404 |
-
stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
|
| 405 |
-
return None, f"hwp5txt μ€ν¨: {stderr[:100]}"
|
| 406 |
-
|
| 407 |
-
except subprocess.TimeoutExpired:
|
| 408 |
-
return None, "hwp5txt νμμμ"
|
| 409 |
-
except Exception as e:
|
| 410 |
-
return None, f"hwp5txt μ€λ₯: {str(e)}"
|
| 411 |
|
| 412 |
def extract_text_from_hwp(file_path: str) -> tuple:
|
| 413 |
"""HWP νμΌμμ ν
μ€νΈ μΆμΆ (λ©μΈ ν¨μ)"""
|
| 414 |
print(f"\n[HWP μΆμΆ] μμ: {os.path.basename(file_path)}")
|
| 415 |
|
| 416 |
-
# λ°©λ² 1: hwp5txt
|
| 417 |
-
print(" λ°©λ² 1: hwp5txt
|
| 418 |
text, error = extract_text_with_hwp5txt(file_path)
|
| 419 |
-
if text:
|
| 420 |
print(f" β hwp5txt μ±κ³΅: {len(text)} κΈμ")
|
| 421 |
return text, None
|
| 422 |
print(f" β hwp5txt μ€ν¨: {error}")
|
| 423 |
|
| 424 |
-
# λ°©λ² 2: olefile
|
| 425 |
print(" λ°©λ² 2: olefile νμ±...")
|
| 426 |
text, error = extract_text_with_olefile(file_path)
|
| 427 |
-
if text:
|
| 428 |
print(f" β olefile μ±κ³΅: {len(text)} κΈμ")
|
| 429 |
return text, None
|
| 430 |
print(f" β olefile μ€ν¨: {error}")
|
| 431 |
|
| 432 |
return None, "λͺ¨λ μΆμΆ λ°©λ² μ€ν¨"
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
# ============== HWP λ³ν ν¨μλ€ ==============
|
| 435 |
|
| 436 |
def check_hwp_version(file_path):
|
|
@@ -441,6 +586,8 @@ def check_hwp_version(file_path):
|
|
| 441 |
return "HWP v5", True
|
| 442 |
elif header[:4] == b'\xd0\xcf\x11\xe0':
|
| 443 |
return "HWP v5 (OLE)", True
|
|
|
|
|
|
|
| 444 |
else:
|
| 445 |
return "Unknown", False
|
| 446 |
except Exception as e:
|
|
@@ -451,41 +598,32 @@ def convert_to_html_subprocess(input_path, output_dir):
|
|
| 451 |
output_path = os.path.join(output_dir, "output.html")
|
| 452 |
|
| 453 |
try:
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
return output_path, None
|
| 467 |
-
|
| 468 |
-
# λ€λ₯Έ μμΉ κ²μ
|
| 469 |
-
for item in os.listdir(output_dir):
|
| 470 |
-
item_path = os.path.join(output_dir, item)
|
| 471 |
-
if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
|
| 472 |
-
return item_path, None
|
| 473 |
-
if os.path.isdir(item_path):
|
| 474 |
-
for sub in os.listdir(item_path):
|
| 475 |
-
if sub.lower().endswith(('.html', '.htm')):
|
| 476 |
return item_path, None
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
| 481 |
except Exception as e:
|
| 482 |
-
|
| 483 |
|
| 484 |
return None, "HTML λ³ν μ€ν¨"
|
| 485 |
|
| 486 |
def convert_hwp_to_text(input_path: str) -> tuple:
|
| 487 |
-
"""HWPλ₯Ό ν
μ€νΈλ‘ λ³ν"""
|
| 488 |
-
return
|
| 489 |
|
| 490 |
def html_to_markdown(html_content):
|
| 491 |
"""HTMLμ MarkdownμΌλ‘ λ³ν"""
|
|
@@ -503,7 +641,6 @@ def html_to_markdown(html_content):
|
|
| 503 |
except:
|
| 504 |
pass
|
| 505 |
|
| 506 |
-
# κΈ°λ³Έ λ³ν
|
| 507 |
if BS4_AVAILABLE:
|
| 508 |
try:
|
| 509 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
@@ -514,41 +651,12 @@ def html_to_markdown(html_content):
|
|
| 514 |
return None, "Markdown λ³ν μ€ν¨"
|
| 515 |
|
| 516 |
def convert_hwp_to_markdown(input_path: str) -> tuple:
|
| 517 |
-
"""HWPλ₯Ό MarkdownμΌλ‘ λ³ν"""
|
| 518 |
-
#
|
| 519 |
-
text, error =
|
| 520 |
if text:
|
| 521 |
return text, None
|
| 522 |
-
|
| 523 |
-
# HTML λ³ν ν Markdown λ³ν
|
| 524 |
-
tmp_dir = tempfile.mkdtemp()
|
| 525 |
-
try:
|
| 526 |
-
html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 527 |
-
if html_output:
|
| 528 |
-
# HTML νμΌ μ½κΈ°
|
| 529 |
-
html_files = []
|
| 530 |
-
if os.path.isfile(html_output):
|
| 531 |
-
html_files = [html_output]
|
| 532 |
-
elif os.path.isdir(html_output):
|
| 533 |
-
for root, dirs, files in os.walk(html_output):
|
| 534 |
-
for f in files:
|
| 535 |
-
if f.lower().endswith(('.html', '.htm')):
|
| 536 |
-
html_files.append(os.path.join(root, f))
|
| 537 |
-
|
| 538 |
-
for html_file in html_files:
|
| 539 |
-
for enc in ['utf-8', 'cp949', 'euc-kr']:
|
| 540 |
-
try:
|
| 541 |
-
with open(html_file, 'r', encoding=enc) as f:
|
| 542 |
-
content = f.read()
|
| 543 |
-
md_text, _ = html_to_markdown(content)
|
| 544 |
-
if md_text and len(md_text.strip()) > 10:
|
| 545 |
-
return md_text.strip(), None
|
| 546 |
-
except:
|
| 547 |
-
continue
|
| 548 |
-
|
| 549 |
-
return None, error or "λ³ν μ€ν¨"
|
| 550 |
-
finally:
|
| 551 |
-
shutil.rmtree(tmp_dir, ignore_errors=True)
|
| 552 |
|
| 553 |
# ============== LLM API ==============
|
| 554 |
|
|
@@ -646,11 +754,11 @@ def process_file(file_path: str) -> tuple:
|
|
| 646 |
if is_image_file(file_path):
|
| 647 |
return "image", image_to_base64(file_path), get_image_mime_type(file_path)
|
| 648 |
|
| 649 |
-
if is_hwp_file(file_path):
|
| 650 |
-
text, error =
|
| 651 |
-
if text:
|
| 652 |
-
return "text", f"[
|
| 653 |
-
return "error", f"
|
| 654 |
|
| 655 |
if is_pdf_file(file_path):
|
| 656 |
text = extract_text_from_pdf(file_path)
|
|
@@ -666,7 +774,7 @@ def process_file(file_path: str) -> tuple:
|
|
| 666 |
|
| 667 |
return "unsupported", f"μ§μνμ§ μλ νμ: {filename}", None
|
| 668 |
|
| 669 |
-
def chat_response(message: str, history: List[Dict], file: Optional[str],
|
| 670 |
session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
|
| 671 |
if history is None:
|
| 672 |
history = []
|
|
@@ -714,7 +822,7 @@ def chat_response(message: str, history: List[Dict], file: Optional[str],
|
|
| 714 |
db_messages = get_session_messages(session_id, limit=10)
|
| 715 |
api_messages = [{
|
| 716 |
"role": "system",
|
| 717 |
-
"content": "λΉμ μ λμμ΄ λλ AI μ΄μμ€ν΄νΈμ
λλ€. νκ΅μ΄λ‘ μμ°μ€λ½κ² λννλ©°, νμΌμ΄ 첨λΆλλ©΄ λ΄μ©μ λΆμνμ¬ λ΅λ³ν©λλ€."
|
| 718 |
}]
|
| 719 |
|
| 720 |
for m in db_messages:
|
|
@@ -760,12 +868,14 @@ def load_session(session_id: str) -> tuple:
|
|
| 760 |
def convert_to_odt_subprocess(input_path, output_dir):
|
| 761 |
output_path = os.path.join(output_dir, "output.odt")
|
| 762 |
try:
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
|
|
|
|
|
|
| 769 |
except:
|
| 770 |
pass
|
| 771 |
return None, "ODT λ³ν μ€ν¨"
|
|
@@ -773,14 +883,16 @@ def convert_to_odt_subprocess(input_path, output_dir):
|
|
| 773 |
def convert_to_xml_subprocess(input_path, output_dir):
|
| 774 |
output_path = os.path.join(output_dir, "output.xml")
|
| 775 |
try:
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
|
|
|
|
|
|
| 784 |
except:
|
| 785 |
pass
|
| 786 |
return None, "XML λ³ν μ€ν¨"
|
|
@@ -790,8 +902,10 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 790 |
return None, "β νμΌμ μ
λ‘λν΄μ£ΌμΈμ.", ""
|
| 791 |
|
| 792 |
input_file = file.name if hasattr(file, 'name') else str(file)
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
| 795 |
|
| 796 |
progress(0.1, desc="νμΌ λΆμ μ€...")
|
| 797 |
version, is_valid = check_hwp_version(input_file)
|
|
@@ -810,6 +924,8 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 810 |
output_path, error, ext = None, None, ""
|
| 811 |
|
| 812 |
if output_format == "HTML":
|
|
|
|
|
|
|
| 813 |
output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 814 |
ext = ".html"
|
| 815 |
if output_path and os.path.isdir(output_path):
|
|
@@ -817,11 +933,13 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 817 |
output_path, ext = zip_path, ".zip"
|
| 818 |
|
| 819 |
elif output_format == "ODT (OpenDocument)":
|
|
|
|
|
|
|
| 820 |
output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
|
| 821 |
ext = ".odt"
|
| 822 |
|
| 823 |
elif output_format == "TXT (ν
μ€νΈ)":
|
| 824 |
-
text, error =
|
| 825 |
if text:
|
| 826 |
output_path = os.path.join(tmp_dir, "output.txt")
|
| 827 |
with open(output_path, 'w', encoding='utf-8') as f:
|
|
@@ -837,7 +955,24 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
|
|
| 837 |
ext = ".md"
|
| 838 |
|
| 839 |
elif output_format == "XML":
|
| 840 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
ext = ".xml"
|
| 842 |
|
| 843 |
if not output_path:
|
|
@@ -886,7 +1021,7 @@ css = """
|
|
| 886 |
with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
| 887 |
session_state = gr.State("")
|
| 888 |
|
| 889 |
-
gr.Markdown("# π€ AI λ¬Έμ μ΄μμ€ν΄νΈ\nLLM μ±ν
+ HWP λ¬Έμ λ³ν")
|
| 890 |
|
| 891 |
with gr.Tabs():
|
| 892 |
with gr.Tab("π¬ AI μ±ν
"):
|
|
@@ -897,7 +1032,7 @@ with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
|
| 897 |
groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
|
| 898 |
fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
|
| 899 |
|
| 900 |
-
gr.Markdown("### π μ§μ νμΌ\n- μ΄λ―Έμ§: JPG, PNG\n- λ¬Έμ: PDF, TXT
|
| 901 |
new_btn = gr.Button("π μ λν", variant="primary")
|
| 902 |
|
| 903 |
with gr.Accordion("π κΈ°λ‘", open=False):
|
|
@@ -916,10 +1051,10 @@ with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
|
| 916 |
clear_btn = gr.Button("ποΈ μ§μ°κΈ°", scale=1)
|
| 917 |
|
| 918 |
with gr.Tab("π HWP λ³νκΈ°"):
|
| 919 |
-
gr.Markdown("### HWP νμΌ λ³νκΈ°")
|
| 920 |
with gr.Row():
|
| 921 |
with gr.Column():
|
| 922 |
-
hwp_input = gr.File(label="HWP νμΌ", file_types=[".hwp"], elem_classes=["upload-box"])
|
| 923 |
format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (ν
μ€νΈ)", "Markdown", "XML"], value="TXT (ν
μ€νΈ)", label="νμ")
|
| 924 |
convert_btn = gr.Button("π λ³ν", variant="primary", size="lg")
|
| 925 |
with gr.Column():
|
|
@@ -928,6 +1063,10 @@ with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
|
| 928 |
|
| 929 |
with gr.Accordion("π 미리보기", open=False):
|
| 930 |
preview_out = gr.Textbox(lines=15, interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
|
| 932 |
# μ΄λ²€νΈ
|
| 933 |
def on_submit(msg, hist, f, sid, gk, fk):
|
|
@@ -935,7 +1074,7 @@ with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
|
| 935 |
for r in chat_response(msg, hist, f, sid, gk, fk):
|
| 936 |
yield r[0], r[1], "", None
|
| 937 |
|
| 938 |
-
submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 939 |
[chatbot, session_state, msg_input, file_upload])
|
| 940 |
msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 941 |
[chatbot, session_state, msg_input, file_upload])
|
|
|
|
| 16 |
import base64
|
| 17 |
import requests
|
| 18 |
import zlib
|
| 19 |
+
import zipfile
|
| 20 |
from pathlib import Path
|
| 21 |
from datetime import datetime
|
| 22 |
from typing import Generator, List, Dict, Optional
|
| 23 |
+
from xml.etree import ElementTree as ET
|
| 24 |
|
| 25 |
# ============== Environment settings ==============
|
| 26 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
| 29 |
|
| 30 |
if os.path.exists(PYHWP_PATH):
|
| 31 |
sys.path.insert(0, PYHWP_PATH)
|
|
|
|
| 32 |
|
| 33 |
# ============== Module imports ==============
|
| 34 |
try:
|
|
|
|
| 73 |
except ImportError:
|
| 74 |
PDFPLUMBER_AVAILABLE = False
|
| 75 |
|
| 76 |
+
# Probe once, at import time, whether pyhwp's text extractor can be used.
# Two probes are tried in order:
#   1. the `hwp5txt` console script on PATH;
#   2. the `hwp5.hwp5txt` module, imported in a child interpreter.
HWP5TXT_AVAILABLE = False
try:
    result = subprocess.run(['hwp5txt', '--help'], capture_output=True, timeout=5)
    if result.returncode == 0:
        HWP5TXT_AVAILABLE = True
        print("hwp5txt command available")
except (OSError, subprocess.SubprocessError):
    # OSError covers a missing executable; SubprocessError covers the timeout.
    # Either way the command is simply not usable, which is not fatal.
    pass

if not HWP5TXT_AVAILABLE:
    try:
        result = subprocess.run(
            [sys.executable, '-c', 'from hwp5.hwp5txt import main; print("ok")'],
            capture_output=True,
            timeout=5,
        )
        # The child prints "ok" only when the import succeeded.
        if b'ok' in result.stdout:
            HWP5TXT_AVAILABLE = True
            print("hwp5txt module available")
    except (OSError, subprocess.SubprocessError):
        pass

print(f"HWP5TXT_AVAILABLE: {HWP5TXT_AVAILABLE}")
|
| 97 |
+
|
| 98 |
# ============== API key settings ==============
|
| 99 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 100 |
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
|
|
|
|
| 129 |
session_id = str(uuid.uuid4())
|
| 130 |
conn = sqlite3.connect(DB_PATH)
|
| 131 |
cursor = conn.cursor()
|
| 132 |
+
cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
|
| 133 |
+
(session_id, f"λν {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
|
|
|
|
|
|
|
| 134 |
conn.commit()
|
| 135 |
conn.close()
|
| 136 |
return session_id
|
|
|
|
| 138 |
def save_message(session_id: str, role: str, content: str, file_info: str = None):
    """Store one chat message and refresh the session's ``updated_at``.

    Args:
        session_id: Value written to ``messages.session_id`` and used to
            select the ``sessions`` row to touch.
        role: Value written to ``messages.role``.
        content: Value written to ``messages.content``.
        file_info: Optional value written to ``messages.file_info``.

    Raises:
        sqlite3.Error: If either statement fails; nothing is committed then.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
            (session_id, role, content, file_info),
        )
        cursor.execute(
            "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
            (session_id,),
        )
        conn.commit()
    finally:
        # Close even when a statement raises so the handle is never leaked.
        conn.close()
|
| 146 |
|
| 147 |
def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
    """Return the most recent messages of a session, oldest first.

    The query selects the newest ``limit`` rows (``ORDER BY created_at DESC``)
    and the result is reversed so callers receive chronological order.

    Args:
        session_id: Session whose messages are fetched.
        limit: Maximum number of messages to return.

    Returns:
        A list of dicts with keys ``role``, ``content``, ``file_info`` and
        ``created_at``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT role, content, file_info, created_at FROM messages WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
            (session_id, limit),
        )
        rows = cursor.fetchall()
    finally:
        # Close even when the query raises so the handle is never leaked.
        conn.close()
    return [
        {"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]}
        for r in reversed(rows)
    ]
|
|
|
|
| 156 |
def get_all_sessions() -> List[Dict]:
    """Return up to 50 sessions, most recently updated first.

    Returns:
        A list of dicts with keys ``session_id``, ``title``, ``created_at``
        and ``updated_at``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
        )
        rows = cursor.fetchall()
    finally:
        # Close even when the query raises so the handle is never leaked.
        conn.close()
    return [
        {"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]}
        for r in rows
    ]
|
|
|
|
| 214 |
|
| 215 |
def get_image_mime_type(file_path: str) -> str:
    """Map an image file's extension to its MIME type.

    The extension is compared case-insensitively. Unknown extensions fall
    back to ``image/jpeg``.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    suffix = Path(file_path).suffix.lower()
    return mime_by_suffix.get(suffix, 'image/jpeg')
|
| 219 |
|
| 220 |
def is_image_file(fp: str) -> bool:
    """Return True when the path has a supported image extension (case-insensitive)."""
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    return Path(fp).suffix.lower() in image_suffixes
|
| 222 |
|
| 223 |
def is_hwp_file(fp: str) -> bool:
    """Return True when the path ends in ``.hwp`` (case-insensitive); ``.hwpx`` does not match."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.hwp'
|
| 225 |
+
|
| 226 |
+
def is_hwpx_file(fp: str) -> bool:
    """Return True when the path ends in ``.hwpx`` (case-insensitive)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.hwpx'
|
| 228 |
|
| 229 |
def is_pdf_file(fp: str) -> bool:
    """Return True when the path ends in ``.pdf`` (case-insensitive)."""
    suffix = Path(fp).suffix.lower()
    return suffix == '.pdf'
|
|
|
|
| 232 |
def is_text_file(fp: str) -> bool:
    """Return True when the path has one of the plain-text extensions handled as text."""
    text_suffixes = ('.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py')
    return Path(fp).suffix.lower() in text_suffixes
|
| 234 |
|
| 235 |
+
# ============== HWPX text extraction (ZIP/XML based) ==============
|
| 236 |
|
| 237 |
+
# Substrings that mark a non-<t> element as text-bearing (same list as before).
_HWPX_TEXTUAL_TAG_HINTS = ('text', 'run', 'para', 'char')


def _hwpx_local_name(tag) -> str:
    """Return an element tag without its ``{namespace}`` prefix."""
    return tag.rpartition('}')[2] if isinstance(tag, str) else ''


def _hwpx_section_sort_key(name: str) -> tuple:
    """Order archive members by their trailing number, so section10 follows section9."""
    numbers = re.findall(r'\d+', name)
    return (int(numbers[-1]) if numbers else -1, name)


def _hwpx_text_run(t_elem) -> str:
    """Return the full text of a ``<t>`` element, including text after inline children."""
    pieces = [t_elem.text or '']
    for child in t_elem:
        # An inline child (line break, tab, ...) separates words; never glue them.
        pieces.append(' ')
        pieces.extend(child.itertext())
        pieces.append(child.tail or '')
    return ''.join(pieces)


def _hwpx_section_texts(root) -> list:
    """Collect text snippets from a parsed section tree, in document order."""
    texts = []
    pending = [root]
    while pending:
        elem = pending.pop()
        name = _hwpx_local_name(elem.tag)
        if name == 't':
            run = _hwpx_text_run(elem)
            if run:
                texts.append(run)
            # Children were already consumed by _hwpx_text_run.
            continue
        own_text = (elem.text or '').strip()
        if own_text and any(hint in name.lower() for hint in _HWPX_TEXTUAL_TAG_HINTS):
            texts.append(own_text)
        # Reverse so the stack pops children in document order.
        pending.extend(reversed(list(elem)))
    return texts


def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract text from an HWPX file by parsing the XML parts inside the ZIP.

    Section parts (``Contents/section*.xml``, or any ``*section*.xml`` as a
    fallback) are read in numeric order. Each section is parsed with
    ElementTree; if parsing fails, text between tags is scraped with a regex.
    Up to five text snippets from each header part are prepended.

    Args:
        file_path: Path to the ``.hwpx`` file.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when the
        archive is invalid, unreadable, or contains no text.
    """
    try:
        text_parts = []

        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            print(f" HWPX 내부 파일: {file_list[:10]}...")

            section_files = [
                f for f in file_list
                if f.startswith('Contents/section') and f.endswith('.xml')
            ]
            if not section_files:
                # Fall back to any XML part whose name mentions "section".
                section_files = [
                    f for f in file_list
                    if 'section' in f.lower() and f.endswith('.xml')
                ]
            section_files.sort(key=_hwpx_section_sort_key)

            print(f" 섹션 파일: {section_files}")

            for section_file in section_files:
                try:
                    with zf.open(section_file) as sf:
                        content = sf.read()

                    try:
                        # Parse the raw bytes with namespaces intact; tags are
                        # compared by local name instead of regex-stripping
                        # prefixes from the markup.
                        root = ET.fromstring(content)
                        texts = _hwpx_section_texts(root)
                        if texts:
                            text_parts.append(' '.join(texts))

                    except ET.ParseError as e:
                        print(f" XML 파싱 오류 {section_file}: {e}")
                        # Malformed XML: scrape whatever sits between tags.
                        text_matches = re.findall(
                            r'>([^<]+)<', content.decode('utf-8', errors='ignore')
                        )
                        clean_texts = [t.strip() for t in text_matches if len(t.strip()) > 1]
                        if clean_texts:
                            text_parts.append(' '.join(clean_texts))

                except Exception as e:
                    print(f" 섹션 파일 읽기 오류 {section_file}: {e}")
                    continue

            # Header parts are optional; only a few short snippets are kept.
            for header_file in [f for f in file_list if 'header' in f.lower() and f.endswith('.xml')]:
                try:
                    with zf.open(header_file) as hf:
                        content = hf.read().decode('utf-8', errors='ignore')
                    text_matches = re.findall(r'>([^<]+)<', content)
                    clean_texts = [t.strip() for t in text_matches if len(t.strip()) > 1]
                    if clean_texts:
                        text_parts.insert(0, ' '.join(clean_texts[:5]))
                except Exception as e:
                    # Best effort: a broken header must not discard the body text.
                    print(f" 헤더 파일 읽기 오류 {header_file}: {e}")

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse runs of spaces/tabs but keep newlines, so the blank
            # line between parts survives; then cap consecutive newlines.
            result = re.sub(r'[^\S\n]+', ' ', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None

        return None, "HWPX에서 텍스트를 찾을 수 없습니다"

    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
|
| 326 |
|
| 327 |
+
# ============== HWP text extraction (OLE based) ==============
|
| 328 |
+
|
| 329 |
+
def extract_text_with_hwp5txt(file_path: str) -> tuple:
|
| 330 |
+
"""hwp5txtλ‘ ν
μ€νΈ μΆμΆ"""
|
| 331 |
|
| 332 |
+
# λ°©λ² 1: hwp5txt λͺ
λ Ήμ΄ μ§μ μ€ν
|
| 333 |
+
try:
|
| 334 |
+
result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
|
| 335 |
+
if result.returncode == 0 and result.stdout:
|
| 336 |
+
for enc in ['utf-8', 'cp949', 'euc-kr']:
|
| 337 |
+
try:
|
| 338 |
+
text = result.stdout.decode(enc)
|
| 339 |
+
if text.strip() and len(text.strip()) > 10:
|
| 340 |
+
return text.strip(), None
|
| 341 |
+
except:
|
| 342 |
+
continue
|
| 343 |
+
except FileNotFoundError:
|
| 344 |
+
pass
|
| 345 |
+
except Exception as e:
|
| 346 |
+
print(f" hwp5txt λͺ
λ Ήμ΄ μ€λ₯: {e}")
|
| 347 |
+
|
| 348 |
+
# λ°©λ² 2: Python λͺ¨λλ‘ μ€ν
|
| 349 |
+
try:
|
| 350 |
+
from hwp5.hwp5txt import main as hwp5txt_main
|
| 351 |
+
from hwp5.hwp5txt import extract_text
|
| 352 |
+
from hwp5.filestructure import Hwp5File
|
| 353 |
|
| 354 |
+
hwp5file = Hwp5File(file_path)
|
| 355 |
+
texts = []
|
| 356 |
+
|
| 357 |
+
for section_idx in hwp5file.bodytext.sections():
|
| 358 |
+
section = hwp5file.bodytext.section(section_idx)
|
| 359 |
+
for para in extract_text(section):
|
| 360 |
+
if para.strip():
|
| 361 |
+
texts.append(para.strip())
|
| 362 |
+
|
| 363 |
+
hwp5file.close()
|
| 364 |
+
|
| 365 |
+
if texts:
|
| 366 |
+
return '\n'.join(texts), None
|
| 367 |
+
|
| 368 |
+
except ImportError:
|
| 369 |
+
pass
|
| 370 |
+
except Exception as e:
|
| 371 |
+
print(f" hwp5txt λͺ¨λ μ€λ₯: {e}")
|
| 372 |
+
|
| 373 |
+
# λ°©λ² 3: μλΈνλ‘μΈμ€λ‘ Python μ½λ μ€ν
|
| 374 |
+
try:
|
| 375 |
+
code = f'''
|
| 376 |
+
import sys
|
| 377 |
+
sys.path.insert(0, "{PYHWP_PATH}")
|
| 378 |
+
from hwp5.filestructure import Hwp5File
|
| 379 |
+
from hwp5.hwp5txt import extract_text
|
| 380 |
+
hwp = Hwp5File("{file_path}")
|
| 381 |
+
for idx in hwp.bodytext.sections():
|
| 382 |
+
section = hwp.bodytext.section(idx)
|
| 383 |
+
for para in extract_text(section):
|
| 384 |
+
if para.strip():
|
| 385 |
+
print(para.strip())
|
| 386 |
+
hwp.close()
|
| 387 |
+
'''
|
| 388 |
+
result = subprocess.run([sys.executable, '-c', code], capture_output=True, timeout=60)
|
| 389 |
+
if result.returncode == 0 and result.stdout:
|
| 390 |
+
for enc in ['utf-8', 'cp949', 'euc-kr']:
|
| 391 |
try:
|
| 392 |
+
text = result.stdout.decode(enc)
|
| 393 |
+
if text.strip() and len(text.strip()) > 10:
|
| 394 |
+
return text.strip(), None
|
| 395 |
except:
|
| 396 |
+
continue
|
| 397 |
+
except Exception as e:
|
| 398 |
+
print(f" hwp5txt μλΈνλ‘μΈμ€ μ€λ₯: {e}")
|
| 399 |
+
|
| 400 |
+
return None, "hwp5txt μ€ν¨"
|
| 401 |
+
|
| 402 |
+
def _hwp_section_number(entry) -> int:
    """Return the numeric suffix of a ``BodyText/SectionN`` entry, or -1 if absent."""
    suffix = entry[-1][len('Section'):]
    return int(suffix) if suffix.isdigit() else -1


def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP file by reading its OLE streams with olefile.

    Every ``BodyText/Section*`` stream is read in numeric order, inflated
    when the ``FileHeader`` marks the document as compressed, and passed to
    ``extract_hwp_section_text``.

    Args:
        file_path: Path to the ``.hwp`` file.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when olefile
        is unavailable, the header is missing, or no text is found.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"

    try:
        ole = olefile.OleFileIO(file_path)
        try:
            if not ole.exists('FileHeader'):
                return None, "HWP 파일 헤더 없음"

            # Bit 0 of byte 36 is read as the "compressed" flag; a header too
            # short to contain it is treated as compressed.
            header_data = ole.openstream('FileHeader').read()
            is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
            print(f" HWP 압축 여부: {is_compressed}")

            section_entries = [
                entry for entry in ole.listdir()
                if '/'.join(entry).startswith('BodyText/Section')
            ]
            # Keep the document's section order regardless of directory order.
            section_entries.sort(key=_hwp_section_number)

            all_texts = []
            for entry in section_entries:
                entry_path = '/'.join(entry)
                try:
                    stream_data = ole.openstream(entry).read()

                    if is_compressed:
                        try:
                            # Raw deflate (no zlib header) first.
                            stream_data = zlib.decompress(stream_data, -15)
                        except zlib.error:
                            try:
                                stream_data = zlib.decompress(stream_data)
                            except zlib.error:
                                # Neither form inflates: parse the bytes as-is,
                                # in case the flag was wrong.
                                pass

                    section_text = extract_hwp_section_text(stream_data)
                    if section_text:
                        all_texts.append(section_text)

                except Exception as e:
                    print(f" 섹션 처리 오류 {entry_path}: {e}")
                    continue
        finally:
            # Close on every path, including early returns and exceptions.
            ole.close()

        if all_texts:
            result = '\n\n'.join(all_texts)
            return result.strip(), None

        return None, "텍스트를 찾을 수 없습니다"

    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
|
| 459 |
|
| 460 |
+
def extract_hwp_section_text(data: bytes) -> str:
|
| 461 |
+
"""HWP μΉμ
λ°μ΄ν°μμ ν
μ€νΈ μΆμΆ"""
|
| 462 |
texts = []
|
| 463 |
pos = 0
|
| 464 |
|
| 465 |
while pos < len(data) - 4:
|
| 466 |
try:
|
| 467 |
+
# λ μ½λ ν€λ μ½κΈ°
|
| 468 |
header = int.from_bytes(data[pos:pos+4], 'little')
|
| 469 |
tag_id = header & 0x3FF
|
| 470 |
+
level = (header >> 10) & 0x3FF
|
| 471 |
size = (header >> 20) & 0xFFF
|
| 472 |
|
| 473 |
pos += 4
|
|
|
|
| 485 |
record_data = data[pos:pos+size]
|
| 486 |
pos += size
|
| 487 |
|
| 488 |
+
# HWPTAG_PARA_TEXT = 67
|
| 489 |
if tag_id == 67 and size > 0:
|
| 490 |
+
text = decode_para_text(record_data)
|
|
|
|
| 491 |
if text:
|
| 492 |
texts.append(text)
|
| 493 |
|
| 494 |
+
except:
|
| 495 |
pos += 1
|
| 496 |
continue
|
| 497 |
|
| 498 |
return '\n'.join(texts) if texts else None
|
| 499 |
|
| 500 |
+
def decode_para_text(data: bytes) -> str:
|
| 501 |
+
"""PARA_TEXT λ μ½λ λμ½λ©"""
|
| 502 |
result = []
|
| 503 |
i = 0
|
| 504 |
|
| 505 |
while i < len(data) - 1:
|
| 506 |
code = int.from_bytes(data[i:i+2], 'little')
|
| 507 |
|
| 508 |
+
if code == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
pass
|
| 510 |
+
elif code == 1: # νμ₯ 컨νΈλ‘€
|
| 511 |
+
i += 14
|
| 512 |
+
elif code == 2: # μΉμ
μ μ
|
| 513 |
i += 14
|
| 514 |
elif code == 3: # νλ μμ
|
| 515 |
i += 14
|
|
|
|
| 523 |
result.append('\n')
|
| 524 |
elif code == 24: # νμ΄ν
|
| 525 |
result.append('-')
|
| 526 |
+
elif code == 30 or code == 31: # λΉμΉΈ
|
|
|
|
|
|
|
| 527 |
result.append(' ')
|
| 528 |
+
elif code < 32: # κΈ°ν 컨νΈλ‘€ λ¬Έμ
|
| 529 |
+
pass
|
| 530 |
+
else:
|
| 531 |
+
# μΌλ° λ¬Έμ
|
| 532 |
+
try:
|
| 533 |
+
char = chr(code)
|
| 534 |
+
if char.isprintable() or char in '\n\t ':
|
| 535 |
+
result.append(char)
|
| 536 |
+
except:
|
| 537 |
+
pass
|
| 538 |
|
| 539 |
i += 2
|
| 540 |
|
| 541 |
text = ''.join(result).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
+
# μ 리
|
| 544 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 545 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 546 |
+
|
| 547 |
+
return text if len(text) > 2 else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
|
| 549 |
def extract_text_from_hwp(file_path: str) -> tuple:
    """Extract text from an HWP file, trying each available method in order.

    ``extract_text_with_hwp5txt`` is tried first, then
    ``extract_text_with_olefile``. A method counts as successful only when
    it returns more than 20 non-whitespace-trimmed characters. Progress is
    reported with ``print``.

    Args:
        file_path: Path of the HWP file.

    Returns:
        ``(text, None)`` from the first successful method, otherwise
        ``(None, error_message)``.
    """
    print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")

    # (label used in result lines, progress banner, extractor), in priority order.
    methods = (
        ("hwp5txt", " 방법 1: hwp5txt...", extract_text_with_hwp5txt),
        ("olefile", " 방법 2: olefile 파싱...", extract_text_with_olefile),
    )

    for label, banner, extractor in methods:
        print(banner)
        text, error = extractor(file_path)
        # Very short results are treated as failed extractions.
        if text and len(text.strip()) > 20:
            print(f" ✓ {label} 성공: {len(text)} 글자")
            return text, None
        print(f" ✗ {label} 실패: {error}")

    return None, "모든 추출 방법 실패"
|
| 570 |
|
| 571 |
+
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Extract text from either an HWP or an HWPX file.

    The file is routed to ``extract_text_from_hwpx`` when ``is_hwpx_file``
    says it is HWPX, and to ``extract_text_from_hwp`` otherwise.

    Args:
        file_path: Path of the document.

    Returns:
        Whatever the selected extractor returns (a ``(text, error)`` pair in
        the callers visible here).
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)

    print(f"\n[HWPX 추출] 시작: {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
|
| 578 |
+
|
| 579 |
# ============== HWP λ³ν ν¨μλ€ ==============
|
| 580 |
|
| 581 |
def check_hwp_version(file_path):
|
|
|
|
| 586 |
return "HWP v5", True
|
| 587 |
elif header[:4] == b'\xd0\xcf\x11\xe0':
|
| 588 |
return "HWP v5 (OLE)", True
|
| 589 |
+
elif header[:4] == b'PK\x03\x04': # ZIP νμΌ (HWPX)
|
| 590 |
+
return "HWPX", True
|
| 591 |
else:
|
| 592 |
return "Unknown", False
|
| 593 |
except Exception as e:
|
|
|
|
| 598 |
output_path = os.path.join(output_dir, "output.html")
|
| 599 |
|
| 600 |
try:
|
| 601 |
+
# hwp5html μλ
|
| 602 |
+
for cmd in [['hwp5html', '--output', output_path, input_path],
|
| 603 |
+
[sys.executable, '-c', f'from hwp5.hwp5html import main; import sys; sys.argv=["hwp5html","--output","{output_path}","{input_path}"]; main()']]:
|
| 604 |
+
try:
|
| 605 |
+
result = subprocess.run(cmd, capture_output=True, timeout=120)
|
| 606 |
+
if result.returncode == 0:
|
| 607 |
+
if os.path.exists(output_path):
|
| 608 |
+
return output_path, None
|
| 609 |
+
# λλ ν 리 κ²μ
|
| 610 |
+
for item in os.listdir(output_dir):
|
| 611 |
+
item_path = os.path.join(output_dir, item)
|
| 612 |
+
if item.lower().endswith(('.html', '.htm')):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
return item_path, None
|
| 614 |
+
if os.path.isdir(item_path):
|
| 615 |
+
return item_path, None
|
| 616 |
+
except:
|
| 617 |
+
continue
|
| 618 |
+
|
| 619 |
except Exception as e:
|
| 620 |
+
print(f"HTML λ³ν μ€λ₯: {e}")
|
| 621 |
|
| 622 |
return None, "HTML λ³ν μ€ν¨"
|
| 623 |
|
| 624 |
def convert_hwp_to_text(input_path: str) -> tuple:
    """Convert an HWP/HWPX document to plain text.

    Thin wrapper: the work is delegated to ``extract_text_from_hwp_or_hwpx``
    and its result is returned unchanged.
    """
    extraction = extract_text_from_hwp_or_hwpx(input_path)
    return extraction
|
| 627 |
|
| 628 |
def html_to_markdown(html_content):
|
| 629 |
"""HTMLμ MarkdownμΌλ‘ λ³ν"""
|
|
|
|
| 641 |
except:
|
| 642 |
pass
|
| 643 |
|
|
|
|
| 644 |
if BS4_AVAILABLE:
|
| 645 |
try:
|
| 646 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
| 651 |
return None, "Markdown λ³ν μ€ν¨"
|
| 652 |
|
| 653 |
def convert_hwp_to_markdown(input_path: str) -> tuple:
    """Convert an HWP/HWPX document for Markdown output.

    The extracted text is returned as-is; no Markdown markup is added here.

    Returns:
        ``(text, None)`` when extraction produced non-empty text, otherwise
        ``(None, error)`` with the extractor's error value.
    """
    extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
    return (extracted, None) if extracted else (None, failure)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# ============== LLM API ==============
|
| 662 |
|
|
|
|
| 754 |
if is_image_file(file_path):
|
| 755 |
return "image", image_to_base64(file_path), get_image_mime_type(file_path)
|
| 756 |
|
| 757 |
+
if is_hwp_file(file_path) or is_hwpx_file(file_path):
|
| 758 |
+
text, error = extract_text_from_hwp_or_hwpx(file_path)
|
| 759 |
+
if text and len(text.strip()) > 20:
|
| 760 |
+
return "text", f"[νκΈ λ¬Έμ: {filename}]\n\n{text}", None
|
| 761 |
+
return "error", f"νκΈ λ¬Έμ μΆμΆ μ€ν¨: {error}", None
|
| 762 |
|
| 763 |
if is_pdf_file(file_path):
|
| 764 |
text = extract_text_from_pdf(file_path)
|
|
|
|
| 774 |
|
| 775 |
return "unsupported", f"μ§μνμ§ μλ νμ: {filename}", None
|
| 776 |
|
| 777 |
+
def chat_response(message: str, history: List[Dict], file: Optional[str],
|
| 778 |
session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
|
| 779 |
if history is None:
|
| 780 |
history = []
|
|
|
|
| 822 |
db_messages = get_session_messages(session_id, limit=10)
|
| 823 |
api_messages = [{
|
| 824 |
"role": "system",
|
| 825 |
+
"content": "λΉμ μ λμμ΄ λλ AI μ΄μμ€ν΄νΈμ
λλ€. νκ΅μ΄λ‘ μμ°μ€λ½κ² λννλ©°, νμΌμ΄ 첨λΆλλ©΄ λ΄μ©μ μμΈν λΆμνμ¬ λ΅λ³ν©λλ€."
|
| 826 |
}]
|
| 827 |
|
| 828 |
for m in db_messages:
|
|
|
|
| 868 |
def convert_to_odt_subprocess(input_path, output_dir):
    """Convert an HWP file to ODT by running hwp5odt in a subprocess.

    The ``hwp5odt`` executable is tried first; if that does not produce the
    output file, the same entry point is run through the current Python
    interpreter (``hwp5.hwp5odt.main``).

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.odt`` is written.

    Returns:
        ``(output_path, None)`` on success, otherwise
        ``(None, error_message)``.
    """
    output_path = os.path.join(output_dir, "output.odt")
    cli_args = ['--output', output_path, input_path]

    # The paths are passed as real argv entries rather than being formatted
    # into the ``-c`` source text, so quotes or backslashes in a path can
    # neither break nor inject code. With ``-c``, sys.argv[0] is "-c", so it
    # is rewritten to the program name the tool expects.
    fallback_code = (
        'import sys; from hwp5.hwp5odt import main; '
        'sys.argv[0] = "hwp5odt"; main()'
    )
    commands = (
        ['hwp5odt'] + cli_args,
        [sys.executable, '-c', fallback_code] + cli_args,
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
        except Exception:
            # Best effort: a missing executable or a timeout just means the
            # next command is tried. KeyboardInterrupt/SystemExit propagate.
            continue
        if result.returncode == 0 and os.path.exists(output_path):
            return output_path, None

    return None, "ODT 변환 실패"
|
|
|
|
| 883 |
def convert_to_xml_subprocess(input_path, output_dir):
    """Convert an HWP file to XML by running hwp5xml in a subprocess.

    The ``hwp5xml`` executable is tried first, then the same entry point
    through the current Python interpreter (``hwp5.hwp5xml.main``). The
    tool's standard output is captured and written to ``output.xml``.

    Args:
        input_path: Path of the HWP file to convert.
        output_dir: Directory in which ``output.xml`` is written.

    Returns:
        ``(output_path, None)`` on success, otherwise
        ``(None, error_message)``.
    """
    output_path = os.path.join(output_dir, "output.xml")

    # The input path is passed as a real argv entry rather than being
    # formatted into the ``-c`` source text, so quotes or backslashes in the
    # path can neither break nor inject code. With ``-c``, sys.argv[0] is
    # "-c", so it is rewritten to the program name the tool expects.
    fallback_code = (
        'import sys; from hwp5.hwp5xml import main; '
        'sys.argv[0] = "hwp5xml"; main()'
    )
    commands = (
        ['hwp5xml', input_path],
        [sys.executable, '-c', fallback_code, input_path],
    )

    for cmd in commands:
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120)
            if result.returncode == 0 and result.stdout:
                with open(output_path, 'wb') as f:
                    f.write(result.stdout)
                return output_path, None
        except Exception:
            # Best effort: a missing executable, a timeout or an unwritable
            # output file just means the next command is tried.
            # KeyboardInterrupt/SystemExit propagate.
            continue

    return None, "XML 변환 실패"
|
|
|
|
| 902 |
return None, "β νμΌμ μ
λ‘λν΄μ£ΌμΈμ.", ""
|
| 903 |
|
| 904 |
input_file = file.name if hasattr(file, 'name') else str(file)
|
| 905 |
+
ext_lower = Path(input_file).suffix.lower()
|
| 906 |
+
|
| 907 |
+
if ext_lower not in ['.hwp', '.hwpx']:
|
| 908 |
+
return None, "β HWP λλ HWPX νμΌλ§ μ§μλ©λλ€.", ""
|
| 909 |
|
| 910 |
progress(0.1, desc="νμΌ λΆμ μ€...")
|
| 911 |
version, is_valid = check_hwp_version(input_file)
|
|
|
|
| 924 |
output_path, error, ext = None, None, ""
|
| 925 |
|
| 926 |
if output_format == "HTML":
|
| 927 |
+
if ext_lower == '.hwpx':
|
| 928 |
+
return None, "β HWPXλ HTML λ³νμ μ§μνμ§ μμ΅λλ€. TXTλ Markdownμ μ¬μ©νμΈμ.", ""
|
| 929 |
output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
|
| 930 |
ext = ".html"
|
| 931 |
if output_path and os.path.isdir(output_path):
|
|
|
|
| 933 |
output_path, ext = zip_path, ".zip"
|
| 934 |
|
| 935 |
elif output_format == "ODT (OpenDocument)":
|
| 936 |
+
if ext_lower == '.hwpx':
|
| 937 |
+
return None, "β HWPXλ ODT λ³νμ μ§μνμ§ μμ΅λλ€. TXTλ Markdownμ μ¬μ©νμΈμ.", ""
|
| 938 |
output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
|
| 939 |
ext = ".odt"
|
| 940 |
|
| 941 |
elif output_format == "TXT (ν
μ€νΈ)":
|
| 942 |
+
text, error = extract_text_from_hwp_or_hwpx(input_path)
|
| 943 |
if text:
|
| 944 |
output_path = os.path.join(tmp_dir, "output.txt")
|
| 945 |
with open(output_path, 'w', encoding='utf-8') as f:
|
|
|
|
| 955 |
ext = ".md"
|
| 956 |
|
| 957 |
elif output_format == "XML":
|
| 958 |
+
if ext_lower == '.hwpx':
|
| 959 |
+
# HWPXλ μ΄λ―Έ XML κΈ°λ°μ΄λ―λ‘ λ΄λΆ XML μΆμΆ
|
| 960 |
+
try:
|
| 961 |
+
with zipfile.ZipFile(input_path, 'r') as zf:
|
| 962 |
+
# λͺ¨λ XML νμΌμ νλλ‘ ν©μΉ¨
|
| 963 |
+
xml_contents = []
|
| 964 |
+
for name in zf.namelist():
|
| 965 |
+
if name.endswith('.xml'):
|
| 966 |
+
with zf.open(name) as f:
|
| 967 |
+
xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
|
| 968 |
+
|
| 969 |
+
output_path = os.path.join(tmp_dir, "output.xml")
|
| 970 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
| 971 |
+
f.write('\n\n'.join(xml_contents))
|
| 972 |
+
except Exception as e:
|
| 973 |
+
error = f"HWPX XML μΆμΆ μ€ν¨: {e}"
|
| 974 |
+
else:
|
| 975 |
+
output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
|
| 976 |
ext = ".xml"
|
| 977 |
|
| 978 |
if not output_path:
|
|
|
|
| 1021 |
with gr.Blocks(title="AI λ¬Έμ μ΄μμ€ν΄νΈ") as demo:
|
| 1022 |
session_state = gr.State("")
|
| 1023 |
|
| 1024 |
+
gr.Markdown("# π€ AI λ¬Έμ μ΄μμ€ν΄νΈ\nLLM μ±ν
+ HWP/HWPX λ¬Έμ λ³ν")
|
| 1025 |
|
| 1026 |
with gr.Tabs():
|
| 1027 |
with gr.Tab("π¬ AI μ±ν
"):
|
|
|
|
| 1032 |
groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
|
| 1033 |
fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
|
| 1034 |
|
| 1035 |
+
gr.Markdown("### π μ§μ νμΌ\n- μ΄λ―Έμ§: JPG, PNG\n- λ¬Έμ: PDF, TXT\n- νκΈ: HWP, HWPX β¨")
|
| 1036 |
new_btn = gr.Button("π μ λν", variant="primary")
|
| 1037 |
|
| 1038 |
with gr.Accordion("π κΈ°λ‘", open=False):
|
|
|
|
| 1051 |
clear_btn = gr.Button("ποΈ μ§μ°κΈ°", scale=1)
|
| 1052 |
|
| 1053 |
with gr.Tab("π HWP λ³νκΈ°"):
|
| 1054 |
+
gr.Markdown("### HWP/HWPX νμΌ λ³νκΈ°")
|
| 1055 |
with gr.Row():
|
| 1056 |
with gr.Column():
|
| 1057 |
+
hwp_input = gr.File(label="HWP/HWPX νμΌ", file_types=[".hwp", ".hwpx"], elem_classes=["upload-box"])
|
| 1058 |
format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (ν
μ€νΈ)", "Markdown", "XML"], value="TXT (ν
μ€νΈ)", label="νμ")
|
| 1059 |
convert_btn = gr.Button("π λ³ν", variant="primary", size="lg")
|
| 1060 |
with gr.Column():
|
|
|
|
| 1063 |
|
| 1064 |
with gr.Accordion("π 미리보기", open=False):
|
| 1065 |
preview_out = gr.Textbox(lines=15, interactive=False)
|
| 1066 |
+
|
| 1067 |
+
gr.Markdown("""
|
| 1068 |
+
> **μ°Έκ³ **: HWPX νμΌμ TXT, Markdown, XML λ³νλ§ μ§μλ©λλ€.
|
| 1069 |
+
""")
|
| 1070 |
|
| 1071 |
# μ΄λ²€νΈ
|
| 1072 |
def on_submit(msg, hist, f, sid, gk, fk):
|
|
|
|
| 1074 |
for r in chat_response(msg, hist, f, sid, gk, fk):
|
| 1075 |
yield r[0], r[1], "", None
|
| 1076 |
|
| 1077 |
+
submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 1078 |
[chatbot, session_state, msg_input, file_upload])
|
| 1079 |
msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
|
| 1080 |
[chatbot, session_state, msg_input, file_upload])
|