Spaces:

Heartsync
/

HWPower

Running

App Files Files Community

seawolf2357 committed on Jan 15

Commit

e006e27

verified ·

1 Parent(s): 46e1b25

Update app.py

Browse files

Files changed (1) hide show

app.py +382 -243

app.py CHANGED Viewed

@@ -16,9 +16,11 @@ import sqlite3
 import base64
 import requests
 import zlib
 from pathlib import Path
 from datetime import datetime
 from typing import Generator, List, Dict, Optional
 # ============== 환경 설정 ==============
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -27,7 +29,6 @@ DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
 if os.path.exists(PYHWP_PATH):
     sys.path.insert(0, PYHWP_PATH)
-    print(f"Added local pyhwp path: {PYHWP_PATH}")
 # ============== 모듈 임포트 ==============
 try:
@@ -72,6 +73,28 @@ try:
 except ImportError:
     PDFPLUMBER_AVAILABLE = False
 # ============== API 키 설정 ==============
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
 FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
@@ -106,10 +129,8 @@ def create_session() -> str:
     session_id = str(uuid.uuid4())
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
-    cursor.execute(
-        "INSERT INTO sessions (session_id, title) VALUES (?, ?)",
-        (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}")
-    )
     conn.commit()
     conn.close()
     return session_id
@@ -117,26 +138,17 @@ def create_session() -> str:
 def save_message(session_id: str, role: str, content: str, file_info: str = None):
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
-    cursor.execute(
-        "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
-        (session_id, role, content, file_info)
-    )
-    cursor.execute(
-        "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
-        (session_id,)
-    )
     conn.commit()
     conn.close()
 def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
-    cursor.execute(
-        """SELECT role, content, file_info, created_at
-           FROM messages WHERE session_id = ?
-           ORDER BY created_at DESC LIMIT ?""",
-        (session_id, limit)
-    )
     rows = cursor.fetchall()
     conn.close()
     return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
@@ -144,9 +156,7 @@ def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
 def get_all_sessions() -> List[Dict]:
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
-    cursor.execute(
-        "SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50"
-    )
     rows = cursor.fetchall()
     conn.close()
     return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
@@ -204,14 +214,17 @@ def image_to_base64(file_path: str) -> str:
 def get_image_mime_type(file_path: str) -> str:
     ext = Path(file_path).suffix.lower()
-    return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
             '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
 def is_image_file(fp: str) -> bool:
     return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
 def is_hwp_file(fp: str) -> bool:
-    return Path(fp).suffix.lower() in ['.hwp', '.hwpx']
 def is_pdf_file(fp: str) -> bool:
     return Path(fp).suffix.lower() == '.pdf'
@@ -219,57 +232,242 @@ def is_pdf_file(fp: str) -> bool:
 def is_text_file(fp: str) -> bool:
     return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
-# ============== HWP 텍스트 추출 (핵심 - 단순하고 안정적으로) ==============
-def decompress_stream(data: bytes) -> bytes:
-    """zlib 압축 해제 시도"""
     try:
-        return zlib.decompress(data, -15)
-    except:
-        try:
-            return zlib.decompress(data)
-        except:
-            return data
-def extract_hwp_text_from_bodytext(ole) -> str:
-    """BodyText 섹션에서 텍스트 추출 (HWP5 포맷)"""
-    text_parts = []
-    for entry in ole.listdir():
-        entry_path = '/'.join(entry)
-        # BodyText/SectionX 스트림 찾기
-        if entry_path.startswith('BodyText/Section'):
-            try:
-                stream_data = ole.openstream(entry).read()
-                # 압축 해제 시도
                 try:
-                    decompressed = zlib.decompress(stream_data, -15)
                 except:
-                    decompressed = stream_data
-                # HWP5 레코드에서 텍스트 추출
-                extracted = extract_text_from_hwp_records(decompressed)
-                if extracted:
-                    text_parts.append(extracted)
-            except Exception as e:
-                print(f"  섹션 읽기 오류 {entry_path}: {e}")
-                continue
-    return '\n\n'.join(text_parts) if text_parts else None
-def extract_text_from_hwp_records(data: bytes) -> str:
-    """HWP5 레코드 구조에서 텍스트 추출"""
     texts = []
     pos = 0
     while pos < len(data) - 4:
         try:
-            # 레코드 헤더 (4바이트)
             header = int.from_bytes(data[pos:pos+4], 'little')
             tag_id = header & 0x3FF
             size = (header >> 20) & 0xFFF
             pos += 4
@@ -287,44 +485,31 @@ def extract_text_from_hwp_records(data: bytes) -> str:
             record_data = data[pos:pos+size]
             pos += size
-            # HWPTAG_PARA_TEXT = 67 (0x43)
             if tag_id == 67 and size > 0:
-                # 텍스트 추출 (컨트롤 문자 처리)
-                text = extract_para_text(record_data)
                 if text:
                     texts.append(text)
-        except Exception as e:
             pos += 1
             continue
     return '\n'.join(texts) if texts else None
-def extract_para_text(data: bytes) -> str:
-    """PARA_TEXT 레코드에서 실제 텍스트 추출"""
     result = []
     i = 0
     while i < len(data) - 1:
         code = int.from_bytes(data[i:i+2], 'little')
-        # 일반 문자 (유니코드)
-        if code >= 32:
-            try:
-                char = chr(code)
-                # 한글, 영문, 숫자, 일반 기호만 허용
-                if char.isprintable() and not (0x4E00 <= code <= 0x9FFF and code not in range(0xAC00, 0xD7A4)):
-                    result.append(char)
-                elif 0xAC00 <= code <= 0xD7A3:  # 한글 음절
-                    result.append(char)
-            except:
-                pass
-        # 컨트롤 문자 처리
-        elif code == 0:  # NULL
             pass
-        elif code == 1:  # 예약
-            i += 14  # 확장 컨트롤 건너뛰기
-        elif code == 2:  # 섹션/컬럼 정의
             i += 14
         elif code == 3:  # 필드 시작
             i += 14
@@ -338,99 +523,59 @@ def extract_para_text(data: bytes) -> str:
             result.append('\n')
         elif code == 24:  # 하이픈
             result.append('-')
-        elif code == 30:  # 묶음 빈칸
-            result.append(' ')
-        elif code == 31:  # 고정폭 빈칸
             result.append(' ')
         i += 2
     text = ''.join(result).strip()
-    # 의미 없는 텍스트 필터링
-    if len(text) < 2:
-        return None
-    return text
-def extract_text_with_olefile(file_path: str) -> tuple:
-    """olefile을 사용한 HWP 텍스트 추출"""
-    if not OLEFILE_AVAILABLE:
-        return None, "olefile 모듈 없음"
-    try:
-        ole = olefile.OleFileIO(file_path)
-        # 파일 헤더 확인
-        if not ole.exists('FileHeader'):
-            ole.close()
-            return None, "HWP 파일 헤더 없음"
-        # 압축 여부 확인
-        header_data = ole.openstream('FileHeader').read()
-        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
-        print(f"  HWP 압축 여부: {is_compressed}")
-        # BodyText에서 텍스트 추출
-        text = extract_hwp_text_from_bodytext(ole)
-        ole.close()
-        if text and len(text.strip()) > 10:
-            return text.strip(), None
-        return None, "텍스트 추출 실패"
-    except Exception as e:
-        return None, f"olefile 오류: {str(e)}"
-def extract_text_with_hwp5txt(file_path: str) -> tuple:
-    """hwp5txt 명령어로 텍스트 추출"""
-    try:
-        result = subprocess.run(
-            [sys.executable, '-m', 'hwp5', 'txt', file_path],
-            capture_output=True,
-            timeout=60
-        )
-        if result.returncode == 0 and result.stdout:
-            # 여러 인코딩 시도
-            for enc in ['utf-8', 'cp949', 'euc-kr']:
-                try:
-                    text = result.stdout.decode(enc)
-                    if text.strip() and len(text.strip()) > 10:
-                        return text.strip(), None
-                except:
-                    continue
-        stderr = result.stderr.decode('utf-8', errors='ignore') if result.stderr else ""
-        return None, f"hwp5txt 실패: {stderr[:100]}"
-    except subprocess.TimeoutExpired:
-        return None, "hwp5txt 타임아웃"
-    except Exception as e:
-        return None, f"hwp5txt 오류: {str(e)}"
 def extract_text_from_hwp(file_path: str) -> tuple:
     """HWP 파일에서 텍스트 추출 (메인 함수)"""
     print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")
-    # 방법 1: hwp5txt 명령어 (가장 안정적)
-    print("  방법 1: hwp5txt 명령어...")
     text, error = extract_text_with_hwp5txt(file_path)
-    if text:
         print(f"  ✓ hwp5txt 성공: {len(text)} 글자")
         return text, None
     print(f"  ✗ hwp5txt 실패: {error}")
-    # 방법 2: olefile 직접 파싱
     print("  방법 2: olefile 파싱...")
     text, error = extract_text_with_olefile(file_path)
-    if text:
         print(f"  ✓ olefile 성공: {len(text)} 글자")
         return text, None
     print(f"  ✗ olefile 실패: {error}")
     return None, "모든 추출 방법 실패"
 # ============== HWP 변환 함수들 ==============
 def check_hwp_version(file_path):
@@ -441,6 +586,8 @@ def check_hwp_version(file_path):
                 return "HWP v5", True
             elif header[:4] == b'\xd0\xcf\x11\xe0':
                 return "HWP v5 (OLE)", True
             else:
                 return "Unknown", False
     except Exception as e:
@@ -451,41 +598,32 @@ def convert_to_html_subprocess(input_path, output_dir):
     output_path = os.path.join(output_dir, "output.html")
     try:
-        result = subprocess.run(
-            [sys.executable, '-m', 'hwp5', 'html', '--output', output_path, input_path],
-            capture_output=True,
-            text=True,
-            timeout=120
-        )
-        if result.returncode == 0:
-            # 결과 파일/디렉토리 찾기
-            if os.path.isfile(output_path):
-                return output_path, None
-            if os.path.isdir(output_path):
-                return output_path, None
-            # 다른 위치 검색
-            for item in os.listdir(output_dir):
-                item_path = os.path.join(output_dir, item)
-                if item.lower().endswith(('.html', '.htm')) and os.path.isfile(item_path):
-                    return item_path, None
-                if os.path.isdir(item_path):
-                    for sub in os.listdir(item_path):
-                        if sub.lower().endswith(('.html', '.htm')):
                             return item_path, None
-            return output_dir, None
-    except subprocess.TimeoutExpired:
-        return None, "HTML 변환 타임아웃"
     except Exception as e:
-        return None, f"HTML 변환 오류: {str(e)}"
     return None, "HTML 변환 실패"
 def convert_hwp_to_text(input_path: str) -> tuple:
-    """HWP를 텍스트로 변환"""
-    return extract_text_from_hwp(input_path)
 def html_to_markdown(html_content):
     """HTML을 Markdown으로 변환"""
@@ -503,7 +641,6 @@ def html_to_markdown(html_content):
         except:
             pass
-    # 기본 변환
     if BS4_AVAILABLE:
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
@@ -514,41 +651,12 @@ def html_to_markdown(html_content):
     return None, "Markdown 변환 실패"
 def convert_hwp_to_markdown(input_path: str) -> tuple:
-    """HWP를 Markdown으로 변환"""
-    # 먼저 텍스트 추출 시도
-    text, error = extract_text_from_hwp(input_path)
     if text:
         return text, None
-    # HTML 변환 후 Markdown 변환
-    tmp_dir = tempfile.mkdtemp()
-    try:
-        html_output, error = convert_to_html_subprocess(input_path, tmp_dir)
-        if html_output:
-            # HTML 파일 읽기
-            html_files = []
-            if os.path.isfile(html_output):
-                html_files = [html_output]
-            elif os.path.isdir(html_output):
-                for root, dirs, files in os.walk(html_output):
-                    for f in files:
-                        if f.lower().endswith(('.html', '.htm')):
-                            html_files.append(os.path.join(root, f))
-            for html_file in html_files:
-                for enc in ['utf-8', 'cp949', 'euc-kr']:
-                    try:
-                        with open(html_file, 'r', encoding=enc) as f:
-                            content = f.read()
-                        md_text, _ = html_to_markdown(content)
-                        if md_text and len(md_text.strip()) > 10:
-                            return md_text.strip(), None
-                    except:
-                        continue
-        return None, error or "변환 실패"
-    finally:
-        shutil.rmtree(tmp_dir, ignore_errors=True)
 # ============== LLM API ==============
@@ -646,11 +754,11 @@ def process_file(file_path: str) -> tuple:
     if is_image_file(file_path):
         return "image", image_to_base64(file_path), get_image_mime_type(file_path)
-    if is_hwp_file(file_path):
-        text, error = extract_text_from_hwp(file_path)
-        if text:
-            return "text", f"[HWP 문서: {filename}]\n\n{text}", None
-        return "error", f"HWP 추출 실패: {error}", None
     if is_pdf_file(file_path):
         text = extract_text_from_pdf(file_path)
@@ -666,7 +774,7 @@ def process_file(file_path: str) -> tuple:
     return "unsupported", f"지원하지 않는 형식: {filename}", None
-def chat_response(message: str, history: List[Dict], file: Optional[str],
                   session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
     if history is None:
         history = []
@@ -714,7 +822,7 @@ def chat_response(message: str, history: List[Dict], file: Optional[str],
     db_messages = get_session_messages(session_id, limit=10)
     api_messages = [{
         "role": "system",
-        "content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 분석하여 답변합니다."
     }]
     for m in db_messages:
@@ -760,12 +868,14 @@ def load_session(session_id: str) -> tuple:
 def convert_to_odt_subprocess(input_path, output_dir):
     output_path = os.path.join(output_dir, "output.odt")
     try:
-        result = subprocess.run(
-            [sys.executable, '-m', 'hwp5', 'odt', '--output', output_path, input_path],
-            capture_output=True, timeout=120
-        )
-        if result.returncode == 0 and os.path.exists(output_path):
-            return output_path, None
     except:
         pass
     return None, "ODT 변환 실패"
@@ -773,14 +883,16 @@ def convert_to_odt_subprocess(input_path, output_dir):
 def convert_to_xml_subprocess(input_path, output_dir):
     output_path = os.path.join(output_dir, "output.xml")
     try:
-        result = subprocess.run(
-            [sys.executable, '-m', 'hwp5', 'xml', input_path],
-            capture_output=True, timeout=120
-        )
-        if result.returncode == 0 and result.stdout:
-            with open(output_path, 'wb') as f:
-                f.write(result.stdout)
-            return output_path, None
     except:
         pass
     return None, "XML 변환 실패"
@@ -790,8 +902,10 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
         return None, "❌ 파일을 업로드해주세요.", ""
     input_file = file.name if hasattr(file, 'name') else str(file)
-    if not input_file.lower().endswith('.hwp'):
-        return None, "❌ HWP 파일만 지원됩니다.", ""
     progress(0.1, desc="파일 분석 중...")
     version, is_valid = check_hwp_version(input_file)
@@ -810,6 +924,8 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
         output_path, error, ext = None, None, ""
         if output_format == "HTML":
             output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
             ext = ".html"
             if output_path and os.path.isdir(output_path):
@@ -817,11 +933,13 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
                 output_path, ext = zip_path, ".zip"
         elif output_format == "ODT (OpenDocument)":
             output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
             ext = ".odt"
         elif output_format == "TXT (텍스트)":
-            text, error = extract_text_from_hwp(input_path)
             if text:
                 output_path = os.path.join(tmp_dir, "output.txt")
                 with open(output_path, 'w', encoding='utf-8') as f:
@@ -837,7 +955,24 @@ def convert_hwp(file, output_format, progress=gr.Progress()):
             ext = ".md"
         elif output_format == "XML":
-            output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
             ext = ".xml"
         if not output_path:
@@ -886,7 +1021,7 @@ css = """
 with gr.Blocks(title="AI 문서 어시스턴트") as demo:
     session_state = gr.State("")
-    gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP 문서 변환")
     with gr.Tabs():
         with gr.Tab("💬 AI 채팅"):
@@ -897,7 +1032,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
                         groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
                         fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
-                    gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT, HWP ✨")
                     new_btn = gr.Button("🆕 새 대화", variant="primary")
                     with gr.Accordion("📜 기록", open=False):
@@ -916,10 +1051,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
                         clear_btn = gr.Button("🗑️ 지우기", scale=1)
         with gr.Tab("📄 HWP 변환기"):
-            gr.Markdown("### HWP 파일 변환기")
             with gr.Row():
                 with gr.Column():
-                    hwp_input = gr.File(label="HWP 파일", file_types=[".hwp"], elem_classes=["upload-box"])
                     format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
                     convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
                 with gr.Column():
@@ -928,6 +1063,10 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
             with gr.Accordion("📋 미리보기", open=False):
                 preview_out = gr.Textbox(lines=15, interactive=False)
     # 이벤트
     def on_submit(msg, hist, f, sid, gk, fk):
@@ -935,7 +1074,7 @@ with gr.Blocks(title="AI 문서 어시스턴트") as demo:
         for r in chat_response(msg, hist, f, sid, gk, fk):
             yield r[0], r[1], "", None
-    submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
                      [chatbot, session_state, msg_input, file_upload])
     msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
                      [chatbot, session_state, msg_input, file_upload])

 import base64
 import requests
 import zlib
+import zipfile
 from pathlib import Path
 from datetime import datetime
 from typing import Generator, List, Dict, Optional
+from xml.etree import ElementTree as ET
 # ============== 환경 설정 ==============
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 if os.path.exists(PYHWP_PATH):
     sys.path.insert(0, PYHWP_PATH)
 # ============== 모듈 임포트 ==============
 try:
 except ImportError:
     PDFPLUMBER_AVAILABLE = False
# Probe once, at import time, whether a hwp5txt (pyhwp) text extractor is usable.
# HWP5TXT_AVAILABLE ends up True if either the console script or the Python
# module responds; both probes are best-effort and never raise.
HWP5TXT_AVAILABLE = False
try:
    # Probe 1: the `hwp5txt` console script on PATH.
    result = subprocess.run(['hwp5txt', '--help'], capture_output=True, timeout=5)
    if result.returncode == 0:
        HWP5TXT_AVAILABLE = True
        print("hwp5txt command available")
except (OSError, subprocess.SubprocessError):
    # Executable missing / not runnable, or the probe timed out. Only these
    # are swallowed so that Ctrl-C and SystemExit still propagate.
    pass
if not HWP5TXT_AVAILABLE:
    try:
        # Probe 2: the module, imported in a child interpreter so that a
        # broken install cannot disturb this process.
        result = subprocess.run(
            [sys.executable, '-c', 'from hwp5.hwp5txt import main; print("ok")'],
            capture_output=True,
            timeout=5,
        )
        if b'ok' in result.stdout:
            HWP5TXT_AVAILABLE = True
            print("hwp5txt module available")
    except (OSError, subprocess.SubprocessError):
        pass
print(f"HWP5TXT_AVAILABLE: {HWP5TXT_AVAILABLE}")
 # ============== API 키 설정 ==============
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
 FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
     session_id = str(uuid.uuid4())
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
+    cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
+                   (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
     conn.commit()
     conn.close()
     return session_id
 def save_message(session_id: str, role: str, content: str, file_info: str = None):
     """Store one chat message and refresh the session's ``updated_at``.

     Args:
         session_id: Value written to ``messages.session_id`` and used to
             select the ``sessions`` row to touch.
         role: Value written to ``messages.role``.
         content: Value written to ``messages.content``.
         file_info: Optional value written to ``messages.file_info``.

     Raises:
         sqlite3.Error: If the database cannot be opened or a statement fails.
     """
     conn = sqlite3.connect(DB_PATH)
     try:
         cursor = conn.cursor()
         cursor.execute(
             "INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
             (session_id, role, content, file_info),
         )
         cursor.execute(
             "UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?",
             (session_id,),
         )
         conn.commit()
     finally:
         # Close even when a statement raises so the handle is never leaked.
         conn.close()
 def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
     """Return the most recent messages of a session, oldest first.

     The query selects the newest ``limit`` rows (``created_at DESC``) and the
     result is reversed, so callers receive them in chronological order.

     Args:
         session_id: Session whose messages are fetched.
         limit: Maximum number of rows to return.

     Returns:
         A list of dicts with keys ``role``, ``content``, ``file_info`` and
         ``created_at``.

     Raises:
         sqlite3.Error: If the database cannot be opened or the query fails.
     """
     conn = sqlite3.connect(DB_PATH)
     try:
         cursor = conn.cursor()
         cursor.execute(
             "SELECT role, content, file_info, created_at FROM messages WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
             (session_id, limit),
         )
         rows = cursor.fetchall()
     finally:
         # Close even when the query raises so the handle is never leaked.
         conn.close()
     return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
 def get_all_sessions() -> List[Dict]:
     """Return up to 50 sessions, most recently updated first.

     Returns:
         A list of dicts with keys ``session_id``, ``title``, ``created_at``
         and ``updated_at``.

     Raises:
         sqlite3.Error: If the database cannot be opened or the query fails.
     """
     conn = sqlite3.connect(DB_PATH)
     try:
         cursor = conn.cursor()
         cursor.execute("SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50")
         rows = cursor.fetchall()
     finally:
         # Close even when the query raises so the handle is never leaked.
         conn.close()
     return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
 def get_image_mime_type(file_path: str) -> str:
     """Map an image path to a MIME type by its (case-insensitive) suffix.

     Any suffix missing from the table is reported as 'image/jpeg'.
     """
     mime_by_suffix = {
         '.jpg': 'image/jpeg',
         '.jpeg': 'image/jpeg',
         '.png': 'image/png',
         '.gif': 'image/gif',
         '.webp': 'image/webp',
         '.bmp': 'image/bmp',
     }
     suffix = Path(file_path).suffix.lower()
     return mime_by_suffix.get(suffix, 'image/jpeg')
 def is_image_file(fp: str) -> bool:
     """Tell whether the path's suffix (case-insensitive) is a known image type."""
     image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
     suffix = Path(fp).suffix.lower()
     return suffix in image_suffixes
 def is_hwp_file(fp: str) -> bool:
     """Tell whether the path ends in '.hwp' (case-insensitive); '.hwpx' does not count."""
     suffix = Path(fp).suffix
     return suffix.lower() == '.hwp'
def is_hwpx_file(fp: str) -> bool:
    """Tell whether the path ends in '.hwpx' (case-insensitive)."""
    suffix = Path(fp).suffix
    return suffix.lower() == '.hwpx'
 def is_pdf_file(fp: str) -> bool:
     """Tell whether the path ends in '.pdf' (case-insensitive)."""
     suffix = Path(fp).suffix
     return suffix.lower() == '.pdf'
 def is_text_file(fp: str) -> bool:
     """Tell whether the path's suffix (case-insensitive) is a plain-text type."""
     text_suffixes = ('.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py')
     suffix = Path(fp).suffix.lower()
     return suffix in text_suffixes
+# ============== HWPX 텍스트 추출 (ZIP/XML 기반) ==============
def _hwpx_local_name(tag) -> str:
    """Return an element tag without its '{namespace}' qualifier."""
    if not isinstance(tag, str):
        return ''
    return tag.rsplit('}', 1)[-1]


def _hwpx_section_order(name: str) -> tuple:
    """Sort key ordering section parts numerically (section2 before section10)."""
    match = re.search(r'(\d+)\.xml$', name)
    return (int(match.group(1)) if match else -1, name)


def _hwpx_section_text(raw: bytes, section_name: str) -> str:
    """Return the space-joined text of one section XML part ('' if none)."""
    try:
        # Parse with namespaces intact and match on local names; rewriting the
        # markup with regexes fails on prefixed attributes.
        root = ET.fromstring(raw)
    except ET.ParseError as e:
        print(f"  XML 파싱 오류 {section_name}: {e}")
        # Malformed XML: fall back to grabbing whatever sits between tags.
        chunks = re.findall(r'>([^<]+)<', raw.decode('utf-8', errors='ignore'))
        return ' '.join(c.strip() for c in chunks if len(c.strip()) > 1)

    texts = []
    for elem in root.iter():
        name = _hwpx_local_name(elem.tag)
        if name.endswith('t'):
            # Text-run elements (e.g. <t>).
            if elem.text:
                texts.append(elem.text)
        elif elem.text and elem.text.strip():
            # Other elements whose name suggests they carry text.
            if any(key in name.lower() for key in ('text', 'run', 'para', 'char')):
                texts.append(elem.text.strip())
    return ' '.join(texts)


def extract_text_from_hwpx(file_path: str) -> tuple:
    """Extract text from an HWPX file by parsing the XML parts inside the ZIP.

    Section parts ('Contents/section*.xml', or any '*section*.xml' when none
    are found there) are read in numeric order. Up to five text chunks from
    each '*header*.xml' part are placed in front of the section text.

    Args:
        file_path: Path to the .hwpx file.

    Returns:
        ``(text, None)`` on success, with parts separated by a blank line and
        runs of spaces/tabs collapsed; ``(None, message)`` when nothing could
        be extracted or the file could not be processed.
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            print(f"  HWPX 내부 파일: {file_list[:10]}...")

            section_files = [f for f in file_list
                             if f.startswith('Contents/section') and f.endswith('.xml')]
            if not section_files:
                # Non-standard layout: accept any XML part named like a section.
                section_files = [f for f in file_list
                                 if 'section' in f.lower() and f.endswith('.xml')]
            section_files.sort(key=_hwpx_section_order)
            print(f"  섹션 파일: {section_files}")

            for section_file in section_files:
                try:
                    section_text = _hwpx_section_text(zf.read(section_file), section_file)
                except Exception as e:
                    print(f"  섹션 파일 읽기 오류 {section_file}: {e}")
                    continue
                if section_text:
                    text_parts.append(section_text)

            # Header parts: keep only the first few chunks.
            for header_file in (f for f in file_list
                                if 'header' in f.lower() and f.endswith('.xml')):
                try:
                    content = zf.read(header_file).decode('utf-8', errors='ignore')
                except Exception as e:
                    print(f"  헤더 파일 읽기 오류 {header_file}: {e}")
                    continue
                chunks = re.findall(r'>([^<]+)<', content)
                clean_texts = [c.strip() for c in chunks if len(c.strip()) > 1]
                if clean_texts:
                    text_parts.insert(0, ' '.join(clean_texts[:5]))

        if text_parts:
            result = '\n\n'.join(text_parts)
            # Collapse horizontal whitespace only; r'\s+' would also erase the
            # blank lines that separate the parts.
            result = re.sub(r'[^\S\n]+', ' ', result)
            result = re.sub(r' ?\n ?', '\n', result)
            result = re.sub(r'\n{3,}', '\n\n', result)
            return result.strip(), None
        return None, "HWPX에서 텍스트를 찾을 수 없습니다"
    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX 파일"
    except Exception as e:
        return None, f"HWPX 처리 오류: {str(e)}"
+# ============== HWP 텍스트 추출 (OLE 기반) ==============
+def extract_text_with_hwp5txt(file_path: str) -> tuple:
+    """hwp5txt로 텍스트 추출"""
+    # 방법 1: hwp5txt 명령어 직접 실행
+    try:
+        result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
+        if result.returncode == 0 and result.stdout:
+            for enc in ['utf-8', 'cp949', 'euc-kr']:
+                try:
+                    text = result.stdout.decode(enc)
+                    if text.strip() and len(text.strip()) > 10:
+                        return text.strip(), None
+                except:
+                    continue
+    except FileNotFoundError:
+        pass
+    except Exception as e:
+        print(f"  hwp5txt 명령어 오류: {e}")
+    # 방법 2: Python 모듈로 실행
+    try:
+        from hwp5.hwp5txt import main as hwp5txt_main
+        from hwp5.hwp5txt import extract_text
+        from hwp5.filestructure import Hwp5File
+        hwp5file = Hwp5File(file_path)
+        texts = []
+        for section_idx in hwp5file.bodytext.sections():
+            section = hwp5file.bodytext.section(section_idx)
+            for para in extract_text(section):
+                if para.strip():
+                    texts.append(para.strip())
+        hwp5file.close()
+        if texts:
+            return '\n'.join(texts), None
+    except ImportError:
+        pass
+    except Exception as e:
+        print(f"  hwp5txt 모듈 오류: {e}")
+    # 방법 3: 서브프로세스로 Python 코드 실행
+    try:
+        code = f'''
+import sys
+sys.path.insert(0, "{PYHWP_PATH}")
+from hwp5.filestructure import Hwp5File
+from hwp5.hwp5txt import extract_text
+hwp = Hwp5File("{file_path}")
+for idx in hwp.bodytext.sections():
+    section = hwp.bodytext.section(idx)
+    for para in extract_text(section):
+        if para.strip():
+            print(para.strip())
+hwp.close()
+'''
+        result = subprocess.run([sys.executable, '-c', code], capture_output=True, timeout=60)
+        if result.returncode == 0 and result.stdout:
+            for enc in ['utf-8', 'cp949', 'euc-kr']:
                 try:
+                    text = result.stdout.decode(enc)
+                    if text.strip() and len(text.strip()) > 10:
+                        return text.strip(), None
                 except:
+                    continue
+    except Exception as e:
+        print(f"  hwp5txt 서브프로세스 오류: {e}")
+    return None, "hwp5txt 실패"
def extract_text_with_olefile(file_path: str) -> tuple:
    """Extract text from an HWP (OLE compound) file using olefile.

    Every 'BodyText/Section*' stream is read, inflated when the file header
    marks the document as compressed, and passed to
    ``extract_hwp_section_text``.

    Args:
        file_path: Path to the .hwp file.

    Returns:
        ``(text, None)`` on success, with sections separated by a blank line;
        ``(None, message)`` when olefile is unavailable, the file has no
        'FileHeader' stream, no text was found, or an error occurred.
    """
    if not OLEFILE_AVAILABLE:
        return None, "olefile 모듈 없음"
    try:
        ole = olefile.OleFileIO(file_path)
        try:
            # Check that the file header stream exists.
            if not ole.exists('FileHeader'):
                return None, "HWP 파일 헤더 없음"

            # Bit 0 of byte 36 is read as the compression flag; a header too
            # short to hold that byte is treated as compressed.
            header_data = ole.openstream('FileHeader').read()
            is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
            print(f"  HWP 압축 여부: {is_compressed}")

            all_texts = []
            # Process each BodyText section stream.
            for entry in ole.listdir():
                entry_path = '/'.join(entry)
                if not entry_path.startswith('BodyText/Section'):
                    continue
                try:
                    stream_data = ole.openstream(entry).read()
                    if is_compressed:
                        try:
                            # Raw deflate first (wbits=-15) ...
                            stream_data = zlib.decompress(stream_data, -15)
                        except zlib.error:
                            try:
                                # ... then zlib-wrapped; otherwise keep the bytes as-is.
                                stream_data = zlib.decompress(stream_data)
                            except zlib.error:
                                pass
                    # Extract text from the section's records.
                    section_text = extract_hwp_section_text(stream_data)
                    if section_text:
                        all_texts.append(section_text)
                except Exception as e:
                    print(f"  섹션 처리 오류 {entry_path}: {e}")
                    continue
        finally:
            # Always release the OLE handle, including on early return or error.
            ole.close()

        if all_texts:
            result = '\n\n'.join(all_texts)
            return result.strip(), None
        return None, "텍스트를 찾을 수 없습니다"
    except Exception as e:
        return None, f"olefile 오류: {str(e)}"
+def extract_hwp_section_text(data: bytes) -> str:
+    """HWP 섹션 데이터에서 텍스트 추출"""
     texts = []
     pos = 0
     while pos < len(data) - 4:
         try:
+            # 레코드 헤더 읽기
             header = int.from_bytes(data[pos:pos+4], 'little')
             tag_id = header & 0x3FF
+            level = (header >> 10) & 0x3FF
             size = (header >> 20) & 0xFFF
             pos += 4
             record_data = data[pos:pos+size]
             pos += size
+            # HWPTAG_PARA_TEXT = 67
             if tag_id == 67 and size > 0:
+                text = decode_para_text(record_data)
                 if text:
                     texts.append(text)
+        except:
             pos += 1
             continue
     return '\n'.join(texts) if texts else None
def decode_para_text(data: bytes) -> Optional[str]:
    """Decode the payload of a PARA_TEXT record into plain text.

    The payload is read as little-endian 16-bit code units. Values below 32
    are HWP control codes. Per the HWP 5.0 control-character table, the
    "inline" and "extended" controls occupy eight code units (16 bytes), and
    that whole span has to be skipped; otherwise the control's parameters
    are decoded as if they were characters.

    Handling of individual codes:
        * 3 (field start): skipped, and a newline is emitted (behaviour kept
          from the previous implementation).
        * 9 (tab): skipped as a 16-byte control, emitted as a tab (later
          collapsed to a single space by the whitespace clean-up).
        * 10 (forced line break): emitted as a newline.
        * 24: emitted as ``-``; 30 and 31: emitted as a space.
        * every other control code produces no text.
    UTF-16 surrogate pairs are combined into a single character; unpaired
    surrogates and other non-printable characters are dropped.

    Args:
        data: Raw bytes of the record payload.

    Returns:
        The cleaned-up text, or ``None`` when the result is two characters
        or shorter.
    """
    # Control codes that take 8 code units: 2 bytes of code + 14 more bytes.
    wide_controls = frozenset(
        (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)
    )
    single_unit_output = {10: '\n', 24: '-', 30: ' ', 31: ' '}
    wide_output = {3: '\n', 9: '\t'}

    result = []
    end = len(data) - 1   # a trailing odd byte can never form a code unit
    i = 0
    while i < end:
        code = int.from_bytes(data[i:i + 2], 'little')
        i += 2

        if code >= 32:
            # Combine a high surrogate with the low surrogate that follows.
            if 0xD800 <= code <= 0xDBFF and i < end:
                low = int.from_bytes(data[i:i + 2], 'little')
                if 0xDC00 <= low <= 0xDFFF:
                    code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                    i += 2
            char = chr(code)
            if char.isprintable():
                result.append(char)
            continue

        if code in wide_controls:
            i += 14
            emitted = wide_output.get(code)
        else:
            emitted = single_unit_output.get(code)
        if emitted:
            result.append(emitted)

    text = ''.join(result).strip()
    # Collapse runs of blanks and limit consecutive empty lines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text if len(text) > 2 else None
 def extract_text_from_hwp(file_path: str) -> tuple:
     """Extract text from an HWP file (main entry point).

     Tries ``extract_text_with_hwp5txt`` first and falls back to
     ``extract_text_with_olefile``. A result only counts as a success when
     its stripped length exceeds 20 characters. Progress is reported with
     ``print``.

     Returns:
         ``(text, None)`` from the first extractor that succeeds, otherwise
         ``(None, "모든 추출 방법 실패")``.
     """
     def _long_enough(candidate) -> bool:
         # Same acceptance rule for both extractors.
         return bool(candidate) and len(candidate.strip()) > 20

     print(f"\n[HWP 추출] 시작: {os.path.basename(file_path)}")

     # Method 1: hwp5txt
     print("  방법 1: hwp5txt...")
     primary_text, primary_error = extract_text_with_hwp5txt(file_path)
     if _long_enough(primary_text):
         print(f"  ✓ hwp5txt 성공: {len(primary_text)} 글자")
         return primary_text, None
     print(f"  ✗ hwp5txt 실패: {primary_error}")

     # Method 2: olefile
     print("  방법 2: olefile 파싱...")
     fallback_text, fallback_error = extract_text_with_olefile(file_path)
     if not _long_enough(fallback_text):
         print(f"  ✗ olefile 실패: {fallback_error}")
         return None, "모든 추출 방법 실패"
     print(f"  ✓ olefile 성공: {len(fallback_text)} 글자")
     return fallback_text, None
def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
    """Extract text from either an HWP or an HWPX file.

    Files for which ``is_hwpx_file`` is false go to
    ``extract_text_from_hwp``; the rest are announced with a log line and
    handed to ``extract_text_from_hwpx``. The chosen extractor's result is
    returned unchanged.
    """
    if not is_hwpx_file(file_path):
        return extract_text_from_hwp(file_path)

    print(f"\n[HWPX 추출] 시작: {os.path.basename(file_path)}")
    return extract_text_from_hwpx(file_path)
 # ============== HWP 변환 함수들 ==============
 def check_hwp_version(file_path):
                 return "HWP v5", True
             elif header[:4] == b'\xd0\xcf\x11\xe0':
                 return "HWP v5 (OLE)", True
+            elif header[:4] == b'PK\x03\x04':  # ZIP 파일 (HWPX)
+                return "HWPX", True
             else:
                 return "Unknown", False
     except Exception as e:
     output_path = os.path.join(output_dir, "output.html")
     try:
+        # hwp5html 시도
+        for cmd in [['hwp5html', '--output', output_path, input_path],
+                    [sys.executable, '-c', f'from hwp5.hwp5html import main; import sys; sys.argv=["hwp5html","--output","{output_path}","{input_path}"]; main()']]:
+            try:
+                result = subprocess.run(cmd, capture_output=True, timeout=120)
+                if result.returncode == 0:
+                    if os.path.exists(output_path):
+                        return output_path, None
+                    # 디렉토리 검색
+                    for item in os.listdir(output_dir):
+                        item_path = os.path.join(output_dir, item)
+                        if item.lower().endswith(('.html', '.htm')):
                             return item_path, None
+                        if os.path.isdir(item_path):
+                            return item_path, None
+            except:
+                continue
     except Exception as e:
+        print(f"HTML 변환 오류: {e}")
     return None, "HTML 변환 실패"
 def convert_hwp_to_text(input_path: str) -> tuple:
     """Convert an HWP/HWPX file to text.

     Thin wrapper: forwards ``input_path`` to
     ``extract_text_from_hwp_or_hwpx`` and returns its result unchanged.
     """
     extraction = extract_text_from_hwp_or_hwpx(input_path)
     return extraction
 def html_to_markdown(html_content):
     """HTML을 Markdown으로 변환"""
         except:
             pass
     if BS4_AVAILABLE:
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
     return None, "Markdown 변환 실패"
 def convert_hwp_to_markdown(input_path: str) -> tuple:
     """Convert an HWP/HWPX file to Markdown.

     The text produced by ``extract_text_from_hwp_or_hwpx`` is returned
     as-is as the Markdown output.

     Returns:
         ``(text, None)`` when extraction yields non-empty text, otherwise
         ``(None, error)`` with the extractor's error value.
     """
     extracted, failure = extract_text_from_hwp_or_hwpx(input_path)
     return (extracted, None) if extracted else (None, failure)
 # ============== LLM API ==============
     if is_image_file(file_path):
         return "image", image_to_base64(file_path), get_image_mime_type(file_path)
+    if is_hwp_file(file_path) or is_hwpx_file(file_path):
+        text, error = extract_text_from_hwp_or_hwpx(file_path)
+        if text and len(text.strip()) > 20:
+            return "text", f"[한글 문서: {filename}]\n\n{text}", None
+        return "error", f"한글 문서 추출 실패: {error}", None
     if is_pdf_file(file_path):
         text = extract_text_from_pdf(file_path)
     return "unsupported", f"지원하지 않는 형식: {filename}", None
+def chat_response(message: str, history: List[Dict], file: Optional[str],
                   session_id: str, groq_key: str, fireworks_key: str) -> Generator[tuple, None, None]:
     if history is None:
         history = []
     db_messages = get_session_messages(session_id, limit=10)
     api_messages = [{
         "role": "system",
+        "content": "당신은 도움이 되는 AI 어시스턴트입니다. 한국어로 자연스럽게 대화하며, 파일이 첨부되면 내용을 상세히 분석하여 답변합니다."
     }]
     for m in db_messages:
 def convert_to_odt_subprocess(input_path, output_dir):
     """Convert an HWP file to ODT by running hwp5odt in a subprocess.

     Two invocations are tried in order: the ``hwp5odt`` executable, then the
     ``hwp5.hwp5odt`` module run through the current interpreter. The paths
     are passed as separate argv entries rather than interpolated into the
     ``-c`` source text, so quotes or backslashes in a path cannot break or
     inject into the generated Python code.

     Args:
         input_path: Path of the HWP file to convert.
         output_dir: Directory in which ``output.odt`` is created.

     Returns:
         ``(output_path, None)`` on success, otherwise
         ``(None, "ODT 변환 실패")``.
     """
     output_path = os.path.join(output_dir, "output.odt")
     cli_args = ['--output', output_path, input_path]
     # With ``-c`` the interpreter sets sys.argv[0] to "-c" and appends the
     # remaining arguments; argv[0] is rewritten so the tool sees its usual
     # program name.
     module_runner = (
         'import sys; from hwp5.hwp5odt import main; '
         'sys.argv[0] = "hwp5odt"; main()'
     )
     commands = (
         ['hwp5odt', *cli_args],
         [sys.executable, '-c', module_runner, *cli_args],
     )
     for cmd in commands:
         try:
             result = subprocess.run(cmd, capture_output=True, timeout=120)
         except (OSError, ValueError, subprocess.SubprocessError):
             # Executable missing, invalid argument, or timeout: try the
             # next invocation.
             continue
         if result.returncode == 0 and os.path.exists(output_path):
             return output_path, None
     return None, "ODT 변환 실패"
 def convert_to_xml_subprocess(input_path, output_dir):
     """Convert an HWP file to XML by running hwp5xml in a subprocess.

     Two invocations are tried in order: the ``hwp5xml`` executable, then the
     ``hwp5.hwp5xml`` module run through the current interpreter. The input
     path is passed as its own argv entry rather than interpolated into the
     ``-c`` source text, so quotes or backslashes in the path cannot break or
     inject into the generated Python code. The tool's standard output is
     written to ``output.xml``.

     Args:
         input_path: Path of the HWP file to convert.
         output_dir: Directory in which ``output.xml`` is created.

     Returns:
         ``(output_path, None)`` on success, otherwise
         ``(None, "XML 변환 실패")``.
     """
     output_path = os.path.join(output_dir, "output.xml")
     # With ``-c`` the interpreter sets sys.argv[0] to "-c" and appends the
     # remaining arguments; argv[0] is rewritten so the tool sees its usual
     # program name.
     module_runner = (
         'import sys; from hwp5.hwp5xml import main; '
         'sys.argv[0] = "hwp5xml"; main()'
     )
     commands = (
         ['hwp5xml', input_path],
         [sys.executable, '-c', module_runner, input_path],
     )
     for cmd in commands:
         try:
             result = subprocess.run(cmd, capture_output=True, timeout=120)
         except (OSError, ValueError, subprocess.SubprocessError):
             # Executable missing, invalid argument, or timeout: try the
             # next invocation.
             continue
         if result.returncode != 0 or not result.stdout:
             continue
         try:
             with open(output_path, 'wb') as f:
                 f.write(result.stdout)
         except OSError:
             # Could not write the output file; fall through to the next
             # invocation, as the previous implementation did.
             continue
         return output_path, None
     return None, "XML 변환 실패"
         return None, "❌ 파일을 업로드해주세요.", ""
     input_file = file.name if hasattr(file, 'name') else str(file)
+    ext_lower = Path(input_file).suffix.lower()
+    if ext_lower not in ['.hwp', '.hwpx']:
+        return None, "❌ HWP 또는 HWPX 파일만 지원됩니다.", ""
     progress(0.1, desc="파일 분석 중...")
     version, is_valid = check_hwp_version(input_file)
         output_path, error, ext = None, None, ""
         if output_format == "HTML":
+            if ext_lower == '.hwpx':
+                return None, "❌ HWPX는 HTML 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
             output_path, error = convert_to_html_subprocess(input_path, tmp_dir)
             ext = ".html"
             if output_path and os.path.isdir(output_path):
                 output_path, ext = zip_path, ".zip"
         elif output_format == "ODT (OpenDocument)":
+            if ext_lower == '.hwpx':
+                return None, "❌ HWPX는 ODT 변환을 지원하지 않습니다. TXT나 Markdown을 사용하세요.", ""
             output_path, error = convert_to_odt_subprocess(input_path, tmp_dir)
             ext = ".odt"
         elif output_format == "TXT (텍스트)":
+            text, error = extract_text_from_hwp_or_hwpx(input_path)
             if text:
                 output_path = os.path.join(tmp_dir, "output.txt")
                 with open(output_path, 'w', encoding='utf-8') as f:
             ext = ".md"
         elif output_format == "XML":
+            if ext_lower == '.hwpx':
+                # HWPX는 이미 XML 기반이므로 내부 XML 추출
+                try:
+                    with zipfile.ZipFile(input_path, 'r') as zf:
+                        # 모든 XML 파일을 하나로 합침
+                        xml_contents = []
+                        for name in zf.namelist():
+                            if name.endswith('.xml'):
+                                with zf.open(name) as f:
+                                    xml_contents.append(f"<!-- {name} -->\n{f.read().decode('utf-8', errors='ignore')}")
+                        output_path = os.path.join(tmp_dir, "output.xml")
+                        with open(output_path, 'w', encoding='utf-8') as f:
+                            f.write('\n\n'.join(xml_contents))
+                except Exception as e:
+                    error = f"HWPX XML 추출 실패: {e}"
+            else:
+                output_path, error = convert_to_xml_subprocess(input_path, tmp_dir)
             ext = ".xml"
         if not output_path:
 with gr.Blocks(title="AI 문서 어시스턴트") as demo:
     session_state = gr.State("")
+    gr.Markdown("# 🤖 AI 문서 어시스턴트\nLLM 채팅 + HWP/HWPX 문서 변환")
     with gr.Tabs():
         with gr.Tab("💬 AI 채팅"):
                         groq_key = gr.Textbox(label="Groq API Key", type="password", value=GROQ_API_KEY)
                         fireworks_key = gr.Textbox(label="Fireworks API Key", type="password", value=FIREWORKS_API_KEY)
+                    gr.Markdown("### 📁 지원 파일\n- 이미지: JPG, PNG\n- 문서: PDF, TXT\n- 한글: HWP, HWPX ✨")
                     new_btn = gr.Button("🆕 새 대화", variant="primary")
                     with gr.Accordion("📜 기록", open=False):
                         clear_btn = gr.Button("🗑️ 지우기", scale=1)
         with gr.Tab("📄 HWP 변환기"):
+            gr.Markdown("### HWP/HWPX 파일 변환기")
             with gr.Row():
                 with gr.Column():
+                    hwp_input = gr.File(label="HWP/HWPX 파일", file_types=[".hwp", ".hwpx"], elem_classes=["upload-box"])
                     format_select = gr.Radio(["HTML", "ODT (OpenDocument)", "TXT (텍스트)", "Markdown", "XML"], value="TXT (텍스트)", label="형식")
                     convert_btn = gr.Button("🔄 변환", variant="primary", size="lg")
                 with gr.Column():
             with gr.Accordion("📋 미리보기", open=False):
                 preview_out = gr.Textbox(lines=15, interactive=False)
+            gr.Markdown("""
+            > **참고**: HWPX 파일은 TXT, Markdown, XML 변환만 지원됩니다.
+            """)
     # 이벤트
     def on_submit(msg, hist, f, sid, gk, fk):
         for r in chat_response(msg, hist, f, sid, gk, fk):
             yield r[0], r[1], "", None
+    submit_btn.click(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
                      [chatbot, session_state, msg_input, file_upload])
     msg_input.submit(on_submit, [msg_input, chatbot, file_upload, session_state, groq_key, fireworks_key],
                      [chatbot, session_state, msg_input, file_upload])