diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,602 +1,118 @@
"""
-HWP AI 어시스턴트 - Gradio 웹 앱
-AI가 HWP 파일을 읽고, 보고, 말하며, 생각하고 기억합니다.
-- Tab 1: LLM 채팅 (스트리밍, 파일 첨부 지원)
-- Tab 2: HWP 변환기
+AI 글 판별기 v4.0 — AI 탐지 + 품질 + AI→인간 변환 + 표절 검사 + 문서 분석
+═══════════════════════════════════════════════════════════════════════════
+5축 AI 탐지 | 6항목 품질 | LLM 교차검증 (GPT-OSS-120B · Qwen3-32B · Kimi-K2)
+★ AI→인간: Adversarial Humanizer v2 (반복 자기대전 루프)
+★ 표절: Brave Search 병렬(최대20) + KCI/RISS/ARXIV + Gemini + CopyKiller 보고서
+★ 문서: PDF·DOCX·HWP·HWPX·TXT 업로드 → 섹션별 히트맵 + PDF 보고서
"""
import gradio as gr
-import tempfile
-import os
-import subprocess
-import shutil
-import sys
-import re
-import json
-import uuid
-import sqlite3
-import base64
-import requests
-import zlib
-import zipfile
-from pathlib import Path
+import math, re, os, json, random, time, hashlib, zlib, zipfile, tempfile
+from collections import Counter
from datetime import datetime
-from typing import Generator, List, Dict, Optional
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
from xml.etree import ElementTree as ET
+from kiwipiepy import Kiwi
-# Groq 라이브러리 임포트
+KIWI = Kiwi()
try:
- from groq import Groq
- GROQ_AVAILABLE = True
- print("✅ Groq library loaded")
+ import httpx; HAS_HTTPX = True
except ImportError:
- GROQ_AVAILABLE = False
- print("❌ Groq library not available - pip install groq")
-
-# ============== Comic Style CSS ==============
-COMIC_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Bangers&family=Comic+Neue:wght@400;700&display=swap');
-
-.gradio-container {
- background-color: #FEF9C3 !important;
- background-image: radial-gradient(#1F2937 1px, transparent 1px) !important;
- background-size: 20px 20px !important;
- min-height: 100vh !important;
- font-family: 'Comic Neue', cursive, sans-serif !important;
-}
-
-footer, .footer, .gradio-container footer, .built-with, [class*="footer"], .gradio-footer, a[href*="gradio.app"] {
- display: none !important;
- visibility: hidden !important;
- height: 0 !important;
-}
-
-
-/* HOME Button Style */
-.home-button-container {
- display: flex;
- justify-content: center;
- align-items: center;
- gap: 15px;
- margin-bottom: 15px;
- padding: 12px 20px;
- background: linear-gradient(135deg, #10B981 0%, #059669 100%);
- border: 4px solid #1F2937;
- border-radius: 12px;
- box-shadow: 6px 6px 0 #1F2937;
-}
-
-.home-button {
- display: inline-flex;
- align-items: center;
- gap: 8px;
- padding: 10px 25px;
- background: linear-gradient(135deg, #FACC15 0%, #F59E0B 100%);
- color: #1F2937;
- font-family: 'Bangers', cursive;
- font-size: 1.4rem;
- letter-spacing: 2px;
- text-decoration: none;
- border: 3px solid #1F2937;
- border-radius: 8px;
- box-shadow: 4px 4px 0 #1F2937;
- transition: all 0.2s ease;
-}
-
-.home-button:hover {
- background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%);
- transform: translate(-2px, -2px);
- box-shadow: 6px 6px 0 #1F2937;
-}
-
-.home-button:active {
- transform: translate(2px, 2px);
- box-shadow: 2px 2px 0 #1F2937;
-}
-
-.url-display {
- font-family: 'Comic Neue', cursive;
- font-size: 1.1rem;
- font-weight: 700;
- color: #FFF;
- background: rgba(0,0,0,0.3);
- padding: 8px 16px;
- border-radius: 6px;
- border: 2px solid rgba(255,255,255,0.3);
-}
-
-.header-container {
- text-align: center;
- padding: 25px 20px;
- background: linear-gradient(135deg, #3B82F6 0%, #8B5CF6 100%);
- border: 4px solid #1F2937;
- border-radius: 12px;
- margin-bottom: 20px;
- box-shadow: 8px 8px 0 #1F2937;
- position: relative;
-}
-
-.header-title {
- font-family: 'Bangers', cursive !important;
- color: #FFF !important;
- font-size: 2.8rem !important;
- text-shadow: 3px 3px 0 #1F2937 !important;
- letter-spacing: 3px !important;
- margin: 0 !important;
-}
-
-.header-subtitle {
- font-family: 'Comic Neue', cursive !important;
- font-size: 1.1rem !important;
- color: #FEF9C3 !important;
- margin-top: 8px !important;
- font-weight: 700 !important;
-}
-
-.stats-badge {
- display: inline-block;
- background: #FACC15;
- color: #1F2937;
- padding: 6px 14px;
- border-radius: 20px;
- font-size: 0.9rem;
- margin: 3px;
- font-weight: 700;
- border: 2px solid #1F2937;
- box-shadow: 2px 2px 0 #1F2937;
-}
-
-/* 무료 서비스 안내 박스 */
-.free-service-notice {
- text-align: center;
- padding: 10px 15px;
- background: linear-gradient(135deg, #FEE2E2 0%, #FECACA 100%);
- border: 3px solid #1F2937;
- border-radius: 8px;
- margin: 10px 0;
- box-shadow: 4px 4px 0 #1F2937;
- font-family: 'Comic Neue', cursive;
- font-weight: 700;
- color: #991B1B;
-}
-
-.free-service-notice a {
- color: #1D4ED8;
- text-decoration: none;
- font-weight: 700;
-}
-
-.free-service-notice a:hover {
- text-decoration: underline;
-}
-
-.gr-panel, .gr-box, .gr-form, .block, .gr-group {
- background: #FFF !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- box-shadow: 5px 5px 0 #1F2937 !important;
-}
-
-.gr-button-primary, button.primary, .gr-button.primary {
- background: linear-gradient(135deg, #EF4444 0%, #F97316 100%) !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- color: #FFF !important;
- font-family: 'Bangers', cursive !important;
- font-size: 1.3rem !important;
- letter-spacing: 2px !important;
- padding: 12px 24px !important;
- box-shadow: 4px 4px 0 #1F2937 !important;
- text-shadow: 1px 1px 0 #1F2937 !important;
- transition: all 0.2s ease !important;
-}
-
-.gr-button-primary:hover, button.primary:hover {
- background: linear-gradient(135deg, #DC2626 0%, #EA580C 100%) !important;
- transform: translate(-2px, -2px) !important;
- box-shadow: 6px 6px 0 #1F2937 !important;
-}
-
-.gr-button-primary:active, button.primary:active {
- transform: translate(2px, 2px) !important;
- box-shadow: 2px 2px 0 #1F2937 !important;
-}
-
-textarea, input[type="text"], input[type="number"] {
- background: #FFF !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- color: #1F2937 !important;
- font-family: 'Comic Neue', cursive !important;
- font-weight: 700 !important;
-}
-
-textarea:focus, input[type="text"]:focus {
- border-color: #3B82F6 !important;
- box-shadow: 3px 3px 0 #3B82F6 !important;
-}
-
-.info-box {
- background: linear-gradient(135deg, #FACC15 0%, #FDE047 100%) !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- padding: 12px 15px !important;
- margin: 10px 0 !important;
- box-shadow: 4px 4px 0 #1F2937 !important;
- font-family: 'Comic Neue', cursive !important;
- font-weight: 700 !important;
- color: #1F2937 !important;
-}
-
-.feature-box {
- background: linear-gradient(135deg, #E0F2FE 0%, #BAE6FD 100%) !important;
- border: 3px solid #1F2937 !important;
- border-radius: 12px !important;
- padding: 20px !important;
- margin: 15px 0 !important;
- box-shadow: 5px 5px 0 #1F2937 !important;
-}
-
-.feature-title {
- font-family: 'Bangers', cursive !important;
- font-size: 1.5rem !important;
- color: #1F2937 !important;
- margin-bottom: 10px !important;
- text-shadow: 1px 1px 0 #FFF !important;
-}
-
-.feature-item {
- display: flex;
- align-items: center;
- gap: 10px;
- padding: 8px 0;
- font-family: 'Comic Neue', cursive !important;
- font-weight: 700 !important;
- font-size: 1rem !important;
- color: #1F2937 !important;
-}
-
-.feature-icon {
- font-size: 1.5rem;
-}
-
-/* Markdown 강조 박스 */
-.markdown-highlight-box {
- background: linear-gradient(135deg, #EC4899 0%, #F472B6 100%) !important;
- border: 4px solid #1F2937 !important;
- border-radius: 12px !important;
- padding: 20px !important;
- margin: 15px 0 !important;
- box-shadow: 6px 6px 0 #1F2937 !important;
- animation: pulse-glow 2s ease-in-out infinite;
-}
-
-@keyframes pulse-glow {
- 0%, 100% { box-shadow: 6px 6px 0 #1F2937; }
- 50% { box-shadow: 8px 8px 0 #1F2937, 0 0 20px rgba(236, 72, 153, 0.5); }
-}
-
-.markdown-title {
- font-family: 'Bangers', cursive !important;
- font-size: 2rem !important;
- color: #FFF !important;
- text-shadow: 3px 3px 0 #1F2937 !important;
- letter-spacing: 2px !important;
- margin-bottom: 15px !important;
- text-align: center !important;
-}
-
-.markdown-benefits {
- display: grid;
- grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
- gap: 12px;
- margin-top: 10px;
-}
-
-.markdown-benefit-item {
- background: rgba(255,255,255,0.95) !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- padding: 12px !important;
- box-shadow: 3px 3px 0 #1F2937 !important;
- font-family: 'Comic Neue', cursive !important;
- font-weight: 700 !important;
- font-size: 0.95rem !important;
- color: #1F2937 !important;
- text-align: center !important;
-}
-
-.markdown-benefit-icon {
- font-size: 1.8rem !important;
- display: block !important;
- margin-bottom: 5px !important;
-}
-
-label, .gr-input-label, .gr-block-label {
- color: #1F2937 !important;
- font-family: 'Comic Neue', cursive !important;
- font-weight: 700 !important;
-}
-
-.gr-accordion {
- background: #E0F2FE !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px !important;
- box-shadow: 4px 4px 0 #1F2937 !important;
-}
-
-.footer-comic {
- text-align: center;
- padding: 20px;
- background: linear-gradient(135deg, #3B82F6 0%, #8B5CF6 100%);
- border: 4px solid #1F2937;
- border-radius: 12px;
- margin-top: 20px;
- box-shadow: 6px 6px 0 #1F2937;
-}
-
-.footer-comic p {
- font-family: 'Comic Neue', cursive !important;
- color: #FFF !important;
- margin: 5px 0 !important;
- font-weight: 700 !important;
-}
-
-::-webkit-scrollbar {
- width: 12px;
- height: 12px;
-}
-
-::-webkit-scrollbar-track {
- background: #FEF9C3;
- border: 2px solid #1F2937;
-}
-
-::-webkit-scrollbar-thumb {
- background: #3B82F6;
- border: 2px solid #1F2937;
- border-radius: 6px;
-}
-
-::-webkit-scrollbar-thumb:hover {
- background: #EF4444;
-}
-
-::selection {
- background: #FACC15;
- color: #1F2937;
-}
-
-/* Chatbot Styling */
-.gr-chatbot {
- border: 3px solid #1F2937 !important;
- border-radius: 12px !important;
- box-shadow: 5px 5px 0 #1F2937 !important;
-}
-
-/* Tab Styling */
-.gr-tab-nav {
- background: linear-gradient(135deg, #F59E0B 0%, #FACC15 100%) !important;
- border: 3px solid #1F2937 !important;
- border-radius: 8px 8px 0 0 !important;
-}
-
-.gr-tab-nav button {
- font-family: 'Bangers', cursive !important;
- font-size: 1.2rem !important;
- letter-spacing: 1px !important;
- color: #1F2937 !important;
-}
-
-.gr-tab-nav button.selected {
- background: #FFF !important;
- border-bottom: 3px solid #FFF !important;
-}
-
-/* File Upload Box */
-.upload-box {
- border: 3px dashed #3B82F6 !important;
- border-radius: 12px !important;
- background: linear-gradient(135deg, #EFF6FF 0%, #DBEAFE 100%) !important;
- box-shadow: 4px 4px 0 #1F2937 !important;
-}
-
-.download-box {
- border: 3px solid #10B981 !important;
- border-radius: 12px !important;
- background: linear-gradient(135deg, #ECFDF5 0%, #D1FAE5 100%) !important;
- box-shadow: 4px 4px 0 #1F2937 !important;
-}
-"""
-
-# ============== 환경 설정 ==============
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-PYHWP_PATH = os.path.join(SCRIPT_DIR, 'pyhwp')
-DB_PATH = os.path.join(SCRIPT_DIR, 'chat_history.db')
-
-if os.path.exists(PYHWP_PATH):
- sys.path.insert(0, PYHWP_PATH)
-
-# ============== 모듈 임포트 ==============
+ HAS_HTTPX = False
try:
- import olefile
- OLEFILE_AVAILABLE = True
- print("✅ olefile loaded")
+ from google import genai
+ from google.genai import types as gtypes
+ HAS_GENAI = True
except ImportError:
- OLEFILE_AVAILABLE = False
+ HAS_GENAI = False
+# ── 문서 추출 라이브러리 ──
try:
- from markdownify import markdownify as md
- MARKDOWNIFY_AVAILABLE = True
- print("✅ markdownify loaded")
+ import olefile; HAS_OLEFILE = True
except ImportError:
- MARKDOWNIFY_AVAILABLE = False
-
+ HAS_OLEFILE = False
try:
- import html2text
- HTML2TEXT_AVAILABLE = True
- print("✅ html2text loaded")
+ import pdfplumber; HAS_PDFPLUMBER = True
except ImportError:
- HTML2TEXT_AVAILABLE = False
-
+ HAS_PDFPLUMBER = False
try:
- from bs4 import BeautifulSoup
- BS4_AVAILABLE = True
+ import PyPDF2; HAS_PYPDF2 = True
except ImportError:
- BS4_AVAILABLE = False
-
+ HAS_PYPDF2 = False
try:
- import PyPDF2
- PYPDF2_AVAILABLE = True
- print("✅ PyPDF2 loaded")
+ from docx import Document as DocxDocument; HAS_DOCX = True
except ImportError:
- PYPDF2_AVAILABLE = False
+ HAS_DOCX = False
-try:
- import pdfplumber
- PDFPLUMBER_AVAILABLE = True
- print("✅ pdfplumber loaded")
-except ImportError:
- PDFPLUMBER_AVAILABLE = False
-
-# ============== API 키 설정 ==============
-GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
-FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")
-
-# ============== SQLite 데이터베이스 ==============
-def init_database():
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS sessions (
- session_id TEXT PRIMARY KEY,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- title TEXT
- )
- ''')
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS messages (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- session_id TEXT,
- role TEXT,
- content TEXT,
- file_info TEXT,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- FOREIGN KEY (session_id) REFERENCES sessions(session_id)
- )
- ''')
- conn.commit()
- conn.close()
-
-def create_session() -> str:
- session_id = str(uuid.uuid4())
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute("INSERT INTO sessions (session_id, title) VALUES (?, ?)",
- (session_id, f"대화 {datetime.now().strftime('%Y-%m-%d %H:%M')}"))
- conn.commit()
- conn.close()
- return session_id
-
-def save_message(session_id: str, role: str, content: str, file_info: str = None):
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute("INSERT INTO messages (session_id, role, content, file_info) VALUES (?, ?, ?, ?)",
- (session_id, role, content, file_info))
- cursor.execute("UPDATE sessions SET updated_at = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
- conn.commit()
- conn.close()
-
-def get_session_messages(session_id: str, limit: int = 20) -> List[Dict]:
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute("SELECT role, content, file_info, created_at FROM messages WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
- (session_id, limit))
- rows = cursor.fetchall()
- conn.close()
- return [{"role": r[0], "content": r[1], "file_info": r[2], "created_at": r[3]} for r in reversed(rows)]
-
-def get_all_sessions() -> List[Dict]:
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute("SELECT session_id, title, created_at, updated_at FROM sessions ORDER BY updated_at DESC LIMIT 50")
- rows = cursor.fetchall()
- conn.close()
- return [{"session_id": r[0], "title": r[1], "created_at": r[2], "updated_at": r[3]} for r in rows]
-
-def update_session_title(session_id: str, title: str):
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
- cursor.execute("UPDATE sessions SET title = ? WHERE session_id = ?", (title, session_id))
- conn.commit()
- conn.close()
-
-init_database()
-
-# ============== 파일 유틸리티 ==============
-def extract_text_from_pdf(file_path: str) -> str:
- text_parts = []
- if PDFPLUMBER_AVAILABLE:
+GROQ_KEY = os.getenv("GROQ_API_KEY", "")
+GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
+BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
+
+# ═══════════════════════════════════════════════
+# 문서 텍스트 추출 엔진
+# ═══════════════════════════════════════════════
+
+def extract_text_from_pdf(file_path):
+ """PDF → 텍스트 (페이지별 분리)"""
+ pages = []
+ if HAS_PDFPLUMBER:
try:
with pdfplumber.open(file_path) as pdf:
- for page in pdf.pages:
- text = page.extract_text()
- if text:
- text_parts.append(text)
- if text_parts:
- return "\n\n".join(text_parts)
+ for p in pdf.pages:
+ t = p.extract_text()
+ if t: pages.append(t)
+ if pages: return pages, None
except Exception as e:
- print(f"pdfplumber error: {e}")
-
- if PYPDF2_AVAILABLE:
+ print(f"pdfplumber: {e}")
+ if HAS_PYPDF2:
try:
with open(file_path, 'rb') as f:
reader = PyPDF2.PdfReader(f)
- for page in reader.pages:
- text = page.extract_text()
- if text:
- text_parts.append(text)
- if text_parts:
- return "\n\n".join(text_parts)
+ for p in reader.pages:
+ t = p.extract_text()
+ if t: pages.append(t)
+ if pages: return pages, None
except Exception as e:
- print(f"PyPDF2 error: {e}")
- return None
-
-def extract_text_from_txt(file_path: str) -> str:
- for encoding in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
- try:
- with open(file_path, 'r', encoding=encoding) as f:
- return f.read()
- except:
- continue
- return None
-
-def image_to_base64(file_path: str) -> str:
- with open(file_path, 'rb') as f:
- return base64.b64encode(f.read()).decode('utf-8')
-
-def get_image_mime_type(file_path: str) -> str:
- ext = Path(file_path).suffix.lower()
- return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
- '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp'}.get(ext, 'image/jpeg')
-
-def is_image_file(fp: str) -> bool:
- return Path(fp).suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']
+ print(f"PyPDF2: {e}")
+ return None, "PDF 추출 실패 (pdfplumber, PyPDF2 없음)"
-def is_hwp_file(fp: str) -> bool:
- return Path(fp).suffix.lower() == '.hwp'
-
-def is_hwpx_file(fp: str) -> bool:
- return Path(fp).suffix.lower() == '.hwpx'
-
-def is_pdf_file(fp: str) -> bool:
- return Path(fp).suffix.lower() == '.pdf'
-
-def is_text_file(fp: str) -> bool:
- return Path(fp).suffix.lower() in ['.txt', '.md', '.json', '.csv', '.xml', '.html', '.css', '.js', '.py']
+def extract_text_from_docx(file_path):
+ """DOCX → 텍스트 (문단별 분리)"""
+ if not HAS_DOCX: return None, "python-docx 없음"
+ try:
+ doc = DocxDocument(file_path)
+ sections = []
+ current = []
+ for para in doc.paragraphs:
+ txt = para.text.strip()
+ if not txt:
+ if current:
+ sections.append('\n'.join(current))
+ current = []
+ else:
+ current.append(txt)
+ if current: sections.append('\n'.join(current))
+ if sections: return sections, None
+ return None, "DOCX 텍스트 없음"
+ except Exception as e:
+ return None, f"DOCX 오류: {e}"
-# ============== HWPX 텍스트 추출 ==============
-def extract_text_from_hwpx(file_path: str) -> tuple:
+def extract_text_from_txt(file_path):
+ """TXT/MD/CSV 등 → 텍스트"""
+ for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
+ try:
+ with open(file_path, 'r', encoding=enc) as f:
+ text = f.read()
+ if text.strip():
+ # 빈 줄 기준으로 섹션 분리
+ sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
+ return sections if sections else [text], None
+ except: continue
+ return None, "텍스트 인코딩 감지 실패"
+
+def extract_text_from_hwpx(file_path):
+ """HWPX (ZIP 기반) → 텍스트"""
try:
text_parts = []
with zipfile.ZipFile(file_path, 'r') as zf:
@@ -604,134 +120,60 @@ def extract_text_from_hwpx(file_path: str) -> tuple:
section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
if not section_files:
section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
-
- for section_file in section_files:
+ for sf_name in section_files:
try:
- with zf.open(section_file) as sf:
- content = sf.read()
- content_str = content.decode('utf-8')
- content_str = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content_str)
- content_str = re.sub(r'<[a-zA-Z]+:', '<', content_str)
- content_str = re.sub(r'[a-zA-Z]+:', '', content_str)
-
+ with zf.open(sf_name) as sf:
+ content = sf.read().decode('utf-8', errors='ignore')
+ content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
+ content = re.sub(r'<[a-zA-Z]+:', '<', content)
+ content = re.sub(r'[a-zA-Z]+:', '', content)
try:
- root = ET.fromstring(content_str)
+ root = ET.fromstring(content)
texts = []
for elem in root.iter():
if elem.tag.endswith('t') or elem.tag == 't':
- if elem.text:
- texts.append(elem.text)
+ if elem.text: texts.append(elem.text)
elif elem.text and elem.text.strip():
if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
texts.append(elem.text.strip())
- if texts:
- text_parts.append(' '.join(texts))
+ if texts: text_parts.append(' '.join(texts))
except ET.ParseError:
- text_matches = re.findall(r'>([^<]+)<', content.decode('utf-8', errors='ignore'))
- clean_texts = [t.strip() for t in text_matches if t.strip() and len(t.strip()) > 1]
- if clean_texts:
- text_parts.append(' '.join(clean_texts))
- except:
- continue
-
+ matches = re.findall(r'>([^<]+)<', content)
+ clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
+ if clean: text_parts.append(' '.join(clean))
+ except: continue
if text_parts:
- result = '\n\n'.join(text_parts)
- result = re.sub(r'\s+', ' ', result)
- result = re.sub(r'\n{3,}', '\n\n', result)
- return result.strip(), None
- return None, "HWPX에서 텍스트를 찾을 수 없습니다"
+ return text_parts, None
+ return None, "HWPX 텍스트 없음"
except zipfile.BadZipFile:
- return None, "유효하지 않은 HWPX 파일"
+ return None, "유효하지 않은 HWPX"
except Exception as e:
- return None, f"HWPX 처리 오류: {str(e)}"
+ return None, f"HWPX 오류: {e}"
-# ============== HWP 텍스트 추출 ==============
-def extract_text_with_hwp5txt(file_path: str) -> tuple:
- try:
- result = subprocess.run(['hwp5txt', file_path], capture_output=True, timeout=60)
- if result.returncode == 0 and result.stdout:
- for enc in ['utf-8', 'cp949', 'euc-kr']:
- try:
- text = result.stdout.decode(enc)
- if text.strip() and len(text.strip()) > 10:
- return text.strip(), None
- except:
- continue
- except FileNotFoundError:
- pass
- except Exception as e:
- print(f"hwp5txt error: {e}")
-
- try:
- code = f'''
-import sys
-sys.path.insert(0, "{PYHWP_PATH}")
-from hwp5.filestructure import Hwp5File
-from hwp5.hwp5txt import extract_text
-hwp = Hwp5File("{file_path}")
-for idx in hwp.bodytext.sections():
- section = hwp.bodytext.section(idx)
- for para in extract_text(section):
- if para.strip():
- print(para.strip())
-hwp.close()
-'''
- result = subprocess.run([sys.executable, '-c', code], capture_output=True, timeout=60)
- if result.returncode == 0 and result.stdout:
- for enc in ['utf-8', 'cp949', 'euc-kr']:
- try:
- text = result.stdout.decode(enc)
- if text.strip() and len(text.strip()) > 10:
- return text.strip(), None
- except:
- continue
- except Exception as e:
- print(f"hwp5txt subprocess error: {e}")
-
- return None, "hwp5txt 실패"
-
-def extract_text_with_olefile(file_path: str) -> tuple:
- if not OLEFILE_AVAILABLE:
- return None, "olefile 모듈 없음"
-
- try:
- ole = olefile.OleFileIO(file_path)
- if not ole.exists('FileHeader'):
- ole.close()
- return None, "HWP 파일 헤더 없음"
-
- header_data = ole.openstream('FileHeader').read()
- is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
-
- all_texts = []
- for entry in ole.listdir():
- entry_path = '/'.join(entry)
- if entry_path.startswith('BodyText/Section'):
- try:
- stream_data = ole.openstream(entry).read()
- if is_compressed:
- try:
- stream_data = zlib.decompress(stream_data, -15)
- except:
- try:
- stream_data = zlib.decompress(stream_data)
- except:
- pass
-
- section_text = extract_hwp_section_text(stream_data)
- if section_text:
- all_texts.append(section_text)
- except:
- continue
-
- ole.close()
- if all_texts:
- return '\n\n'.join(all_texts).strip(), None
- return None, "텍스트를 찾을 수 없습니다"
- except Exception as e:
- return None, f"olefile 오류: {str(e)}"
+def _decode_hwp_para(data):
+ """HWP 바이너리 → 문단 텍스트"""
+ result = []
+ i = 0
+ while i < len(data) - 1:
+ code = int.from_bytes(data[i:i+2], 'little')
+ if code in (1,2,3): i += 14
+ elif code == 9: result.append('\t')
+ elif code in (10,13): result.append('\n')
+ elif code == 24: result.append('-')
+ elif code in (30,31): result.append(' ')
+ elif code >= 32:
+ try:
+ ch = chr(code)
+ if ch.isprintable() or ch in '\n\t ': result.append(ch)
+ except: pass
+ i += 2
+ text = ''.join(result).strip()
+ text = re.sub(r'[ \t]+', ' ', text)
+ text = re.sub(r'\n{3,}', '\n\n', text)
+ return text if len(text) > 2 else None
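+# Sanity sketch for the decoder above: HWP paragraph text is a sequence of
+# UTF-16LE code units, so plain Korean (all codes >= 32) round-trips unchanged.
+# Synthetic bytes for illustration, not a real BodyText stream.
+assert _decode_hwp_para('안녕하세요'.encode('utf-16-le')) == '안녕하세요'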
-def extract_hwp_section_text(data: bytes) -> str:
+def _extract_hwp_section(data):
+ """HWP 섹션 바이너리 → 텍스트"""
texts = []
pos = 0
while pos < len(data) - 4:
@@ -741,836 +183,1660 @@ def extract_hwp_section_text(data: bytes) -> str:
size = (header >> 20) & 0xFFF
pos += 4
if size == 0xFFF:
- if pos + 4 > len(data):
- break
+ if pos + 4 > len(data): break
size = int.from_bytes(data[pos:pos+4], 'little')
pos += 4
- if pos + size > len(data):
- break
+ if pos + size > len(data): break
record_data = data[pos:pos+size]
pos += size
if tag_id == 67 and size > 0:
- text = decode_para_text(record_data)
- if text:
- texts.append(text)
+ t = _decode_hwp_para(record_data)
+ if t: texts.append(t)
except:
pos += 1
- continue
return '\n'.join(texts) if texts else None
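+# Minimal sketch of the record framing parsed above, assuming the 32-bit
+# little-endian header layout tag(10b) | level(10b) | size(12b), with tag 67
+# carrying paragraph text. One synthetic single-record stream:
+_hdr = 67 | (0 << 10) | (len('테스트'.encode('utf-16-le')) << 20)
+assert _extract_hwp_section(_hdr.to_bytes(4, 'little') + '테스트'.encode('utf-16-le')) == '테스트'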
-def decode_para_text(data: bytes) -> str:
- result = []
- i = 0
- while i < len(data) - 1:
- code = int.from_bytes(data[i:i+2], 'little')
- if code == 0:
- pass
- elif code == 1:
- i += 14
- elif code == 2:
- i += 14
- elif code == 3:
- i += 14
- elif code == 4:
- pass
- elif code == 9:
- result.append('\t')
- elif code == 10:
- result.append('\n')
- elif code == 13:
- result.append('\n')
- elif code == 24:
- result.append('-')
- elif code == 30 or code == 31:
- result.append(' ')
- elif code < 32:
- pass
- else:
- try:
- char = chr(code)
- if char.isprintable() or char in '\n\t ':
- result.append(char)
- except:
- pass
- i += 2
- text = ''.join(result).strip()
- text = re.sub(r'[ \t]+', ' ', text)
- text = re.sub(r'\n{3,}', '\n\n', text)
- return text if len(text) > 2 else None
-
-def extract_text_from_hwp(file_path: str) -> tuple:
- print(f"\n📖 [HWP 읽기] {os.path.basename(file_path)}")
- text, error = extract_text_with_hwp5txt(file_path)
- if text and len(text.strip()) > 20:
- print(f" ✅ 성공: {len(text)} 글자")
- return text, None
- text, error = extract_text_with_olefile(file_path)
- if text and len(text.strip()) > 20:
- print(f" ✅ 성공: {len(text)} 글자")
- return text, None
- print(f" ❌ 실패: {error}")
- return None, "모든 추출 방법 실패"
-
-def extract_text_from_hwp_or_hwpx(file_path: str) -> tuple:
- if is_hwpx_file(file_path):
- print(f"\n📖 [HWPX 읽기] {os.path.basename(file_path)}")
- return extract_text_from_hwpx(file_path)
- else:
- return extract_text_from_hwp(file_path)
-
-# ============== HWP 변환 함수들 ==============
-def check_hwp_version(file_path):
+def extract_text_from_hwp(file_path):
+ """HWP (OLE 기반) → 텍스트"""
+ if not HAS_OLEFILE: return None, "olefile 없음"
try:
- with open(file_path, 'rb') as f:
- header = f.read(32)
- if b'HWP Document File' in header:
- return "HWP v5", True
- elif header[:4] == b'\xd0\xcf\x11\xe0':
- return "HWP v5 (OLE)", True
- elif header[:4] == b'PK\x03\x04':
- return "HWPX", True
- else:
- return "Unknown", False
+ ole = olefile.OleFileIO(file_path)
+ if not ole.exists('FileHeader'):
+ ole.close(); return None, "HWP 헤더 없음"
+ header_data = ole.openstream('FileHeader').read()
+ is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
+ all_texts = []
+ for entry in ole.listdir():
+ entry_path = '/'.join(entry)
+ if entry_path.startswith('BodyText/Section'):
+ try:
+ stream = ole.openstream(entry).read()
+ if is_compressed:
+ try: stream = zlib.decompress(stream, -15)
+ except:
+ try: stream = zlib.decompress(stream)
+ except: pass
+ section_text = _extract_hwp_section(stream)
+ if section_text: all_texts.append(section_text)
+ except: continue
+ ole.close()
+ if all_texts: return all_texts, None
+ return None, "HWP 텍스트 없음"
except Exception as e:
- return f"Error: {e}", False
+ return None, f"HWP 오류: {e}"
+
+def extract_text_from_file(file_path):
+ """
+ 만능 문서 추출: PDF/DOCX/HWP/HWPX/TXT → (sections_list, full_text, error)
+ sections_list: 페이지/섹션별 텍스트 리스트
+ full_text: 전체 합친 텍스트
+ """
+ if not file_path or not os.path.exists(file_path):
+ return None, None, "파일 없음"
+ ext = Path(file_path).suffix.lower()
+ sections, error = None, None
+
+ if ext == '.pdf':
+ sections, error = extract_text_from_pdf(file_path)
+ elif ext == '.docx':
+ sections, error = extract_text_from_docx(file_path)
+ elif ext == '.hwpx':
+ sections, error = extract_text_from_hwpx(file_path)
+ elif ext == '.hwp':
+ sections, error = extract_text_from_hwp(file_path)
+ elif ext in ('.txt', '.md', '.csv', '.json', '.xml', '.html'):
+ sections, error = extract_text_from_txt(file_path)
+ else:
+ return None, None, f"지원하지 않는 형식: {ext}"
-def convert_to_html_subprocess(input_path, output_dir):
- output_path = os.path.join(output_dir, "output.html")
- try:
- for cmd in [['hwp5html', '--output', output_path, input_path]]:
- try:
- result = subprocess.run(cmd, capture_output=True, timeout=120)
- if result.returncode == 0:
- if os.path.exists(output_path):
- return output_path, None
- for item in os.listdir(output_dir):
- item_path = os.path.join(output_dir, item)
- if item.lower().endswith(('.html', '.htm')):
- return item_path, None
- if os.path.isdir(item_path):
- return item_path, None
- except:
- continue
- except Exception as e:
- print(f"HTML 변환 오류: {e}")
- return None, "HTML 변환 실패"
+ if sections:
+ full = '\n\n'.join(sections)
+ return sections, full, None
+ return None, None, error or "텍스트 추출 실패"
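+# Usage sketch for the dispatcher above ("sample.pdf" is a hypothetical path):
+# sections, full, err = extract_text_from_file("sample.pdf")
+# if err:  print("추출 실패:", err)
+# else:    print(f"{len(sections)} sections / {len(full)} chars")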
-def html_to_markdown(html_content):
- if MARKDOWNIFY_AVAILABLE:
- try:
- return md(html_content, heading_style="ATX", bullets="-"), None
- except:
- pass
- if HTML2TEXT_AVAILABLE:
- try:
- h = html2text.HTML2Text()
- h.body_width = 0
- return h.handle(html_content), None
- except:
- pass
- if BS4_AVAILABLE:
- try:
- soup = BeautifulSoup(html_content, 'html.parser')
- return soup.get_text(separator='\n'), None
- except:
- pass
- return None, "Markdown 변환 실패"
+# ═══════════════════════════════════════════════
+# 유틸리티
+# ═══════════════════════════════════════════════
+def split_sentences(text):
+ try:
+ s = [x.text.strip() for x in KIWI.split_into_sents(text) if x.text.strip()]
+ if s: return s
+ except: pass
+ return [x.strip() for x in re.split(r'(?<=[.!?。])\s+', text) if x.strip()]
-def convert_hwp_to_markdown(input_path: str) -> tuple:
- text, error = extract_text_from_hwp_or_hwpx(input_path)
- if text:
- return text, None
- return None, error
+def split_words(text):
+ return [w for w in re.findall(r'[가-힣a-zA-Z0-9]+', text) if w]
-def convert_to_odt_subprocess(input_path, output_dir):
- output_path = os.path.join(output_dir, "output.odt")
+def get_morphemes(text):
try:
- result = subprocess.run(['hwp5odt', '--output', output_path, input_path], capture_output=True, timeout=120)
- if result.returncode == 0 and os.path.exists(output_path):
- return output_path, None
- except:
- pass
- return None, "ODT 변환 실패"
+ r = KIWI.analyze(text)
+ if r and r[0]: return [(m.form, m.tag) for m in r[0][0]]
+ except: pass
+ return []
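+# Quick sketch of the Kiwi helpers (results depend on the bundled kiwipiepy
+# model, so the values shown are typical rather than guaranteed):
+# split_sentences("안녕하세요. 반갑습니다!")  # ['안녕하세요.', '반갑습니다!']
+# get_morphemes("하늘이 맑다")[:2]            # [('하늘', 'NNG'), ('이', 'JKS')]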
-def convert_to_xml_subprocess(input_path, output_dir):
- output_path = os.path.join(output_dir, "output.xml")
+def http_get(url, headers=None, timeout=15):
try:
- result = subprocess.run(['hwp5xml', input_path], capture_output=True, timeout=120)
- if result.returncode == 0 and result.stdout:
- with open(output_path, 'wb') as f:
- f.write(result.stdout)
- return output_path, None
- except:
- pass
- return None, "XML 변환 실패"
-
-# ============== LLM API (Groq 라이브러리 사용) ==============
-def call_groq_api_stream(messages: List[Dict]) -> Generator[str, None, None]:
- """Groq API 스트리밍 호출"""
- if not GROQ_AVAILABLE:
- yield "❌ Groq 라이브러리가 설치되지 않았습니다. pip install groq"
- return
-
- if not GROQ_API_KEY:
- yield "❌ GROQ_API_KEY 환경변수가 설정되지 않았습니다."
- return
-
+ if HAS_HTTPX:
+ r = httpx.get(url, headers=headers or {}, timeout=timeout, follow_redirects=True)
+ return r.text if r.status_code == 200 else None
+ else:
+ import urllib.request
+ req = urllib.request.Request(url, headers=headers or {})
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
+ return resp.read().decode('utf-8', errors='replace')
+ except: return None
+
+def http_post_json(url, body, headers=None, timeout=30):
try:
- client = Groq(api_key=GROQ_API_KEY)
-
- completion = client.chat.completions.create(
- model="openai/gpt-oss-120b",
- messages=messages,
- temperature=1,
- max_completion_tokens=8192,
- top_p=1,
- reasoning_effort="medium",
- stream=True,
- stop=None
- )
-
- for chunk in completion:
- if chunk.choices[0].delta.content:
- yield chunk.choices[0].delta.content
-
- except Exception as e:
- error_msg = str(e)
- print(f"❌ Groq API 오류: {error_msg}")
- yield f"❌ API 오류: {error_msg}"
-
-def call_fireworks_api_stream(messages: List[Dict], image_base64: str, mime_type: str) -> Generator[str, None, None]:
- """Fireworks API 스트리밍 호출 (이미지 분석용)"""
- if not FIREWORKS_API_KEY:
- yield "❌ FIREWORKS_API_KEY 환경변수가 설정되지 않았습니다."
- return
-
+ h = headers or {}
+ h["Content-Type"] = "application/json"
+ if HAS_HTTPX:
+ r = httpx.post(url, json=body, headers=h, timeout=timeout)
+ if r.status_code == 200: return r.json()
+ return None
+ else:
+ import urllib.request, ssl
+ req = urllib.request.Request(url, json.dumps(body).encode(), h)
+ with urllib.request.urlopen(req, timeout=timeout, context=ssl.create_default_context()) as resp:
+ return json.loads(resp.read())
+ except: return None
+
+def call_groq(model, prompt, max_tokens=800, temperature=0.1):
+ if not GROQ_KEY: return None, "NO_KEY"
+ url = "https://api.groq.com/openai/v1/chat/completions"
+ h = {"Authorization": f"Bearer {GROQ_KEY}", "Content-Type": "application/json"}
+ b = {"model": model, "messages": [{"role":"user","content":prompt}], "max_tokens": max_tokens, "temperature": temperature}
try:
- formatted_messages = [{"role": m["role"], "content": m["content"]} for m in messages[:-1]]
- formatted_messages.append({
- "role": messages[-1]["role"],
- "content": [
- {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
- {"type": "text", "text": messages[-1]["content"]}
- ]
- })
-
- response = requests.post(
- "https://api.fireworks.ai/inference/v1/chat/completions",
- headers={"Authorization": f"Bearer {FIREWORKS_API_KEY}", "Content-Type": "application/json"},
- json={
- "model": "accounts/fireworks/models/qwen3-vl-235b-a22b-thinking",
- "max_tokens": 4096,
- "temperature": 0.6,
- "messages": formatted_messages,
- "stream": True
- },
- stream=True
- )
-
- if response.status_code != 200:
- yield f"❌ Fireworks API 오류: {response.status_code}"
- return
-
- for line in response.iter_lines():
- if line:
- line = line.decode('utf-8')
- if line.startswith('data: ') and line[6:] != '[DONE]':
- try:
- data = json.loads(line[6:])
- content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
- if content:
- yield content
- except:
- continue
- except Exception as e:
- yield f"❌ API 오류: {str(e)}"
-
-# ============== 채팅 처리 ==============
-def process_file(file_path: str) -> tuple:
- if not file_path:
- return None, None, None
- filename = os.path.basename(file_path)
-
- if is_image_file(file_path):
- return "image", image_to_base64(file_path), get_image_mime_type(file_path)
-
- if is_hwp_file(file_path) or is_hwpx_file(file_path):
- text, error = extract_text_from_hwp_or_hwpx(file_path)
- if text and len(text.strip()) > 20:
- print(f"📄 [문서 내용 추출 완료] {len(text)} 글자")
- print(f"📄 [문서 미리보기] {text[:500]}...")
- return "text", text, None
- return "error", f"한글 문서 추출 실패: {error}", None
-
- if is_pdf_file(file_path):
- text = extract_text_from_pdf(file_path)
- if text:
- print(f"📄 [PDF 내용 추출 완료] {len(text)} 글자")
- return "text", text, None
- return "error", "PDF 추출 실패", None
-
- if is_text_file(file_path):
- text = extract_text_from_txt(file_path)
- if text:
- return "text", text, None
- return "error", "텍스트 읽기 실패", None
-
- return "unsupported", f"지원하지 않는 형식: {filename}", None
-
-def chat_response(message: str, history: List[Dict], file: Optional[str],
- session_id: str) -> Generator[tuple, None, None]:
- if history is None:
- history = []
- if not message.strip() and not file:
- yield history, session_id
- return
- if not session_id:
- session_id = create_session()
-
- file_type, file_content, file_mime = None, None, None
- file_info = None
- filename = None
-
- if file:
- filename = os.path.basename(file)
- file_type, file_content, file_mime = process_file(file)
- file_info = json.dumps({"type": file_type, "filename": filename})
-
- if file_type == "error":
- history = history + [
- {"role": "user", "content": message or "파일 업로드"},
- {"role": "assistant", "content": f"❌ {file_content}"}
- ]
- yield history, session_id
- return
- elif file_type == "unsupported":
- history = history + [
- {"role": "user", "content": message or "파일 업로드"},
- {"role": "assistant", "content": f"⚠️ {file_content}"}
- ]
- yield history, session_id
- return
-
- # 사용자 메시지 표시
- user_msg = message
- if file:
- user_msg = f"📎 {filename}\n\n{message}" if message else f"📎 {filename}"
-
- history = history + [{"role": "user", "content": user_msg}, {"role": "assistant", "content": ""}]
- yield history, session_id
-
- # 이전 대화 불러오기
- db_messages = get_session_messages(session_id, limit=10)
-
- # 시스템 프롬프트 - 문서 분석 강화
- system_prompt = """당신은 문서 분석 전문 AI 어시스턴트입니다.
-
-## 핵심 역할
-- 사용자가 업로드한 문서의 내용을 **정확하게 분석**하고 **구체적으로 답변**합니다.
-- 문서에 있는 **실제 내용**을 기반으로만 답변합니다.
-- 문서에 없는 내용은 추측하지 않습니다.
-
-## 문서 분석 방법
-1. **문서가 제공되면**: 문서 전체 내용을 꼼꼼히 읽고 핵심 정보를 파악합니다.
-2. **요약 요청 시**: 문서의 주제, 목적, 핵심 내용, 주요 항목을 구조화하여 요약합니다.
-3. **질문 응답 시**: 문서에서 관련 내용을 찾아 **직접 인용하거나 구체적으로 설명**합니다.
-
-## 답변 형식
-- 한국어로 자연스럽고 명확하게 답변합니다.
-- 문서 내용을 인용할 때는 구체적으로 언급합니다.
-- 긴 문서는 섹션별로 나누어 정리합니다.
-
-## 주의사항
-- 문서에 **실제로 있는 내용만** 답변에 포함합니다.
-- 불확실한 내용은 "문서에서 확인되지 않습니다"라고 명시합니다."""
-
- api_messages = [{"role": "system", "content": system_prompt}]
-
- # 이전 대화 추가
- for m in db_messages:
- api_messages.append({"role": m["role"], "content": m["content"]})
-
- # 현재 메시지 구성 - 문서 내용을 명확하게 구분
- if file_type == "text" and file_content:
- if message:
- current_content = f"""## 📄 업로드된 문서 내용 ({filename})
-
-다음은 사용자가 업로드한 문서의 전체 내용입니다:
-
----
-{file_content}
----
-
-## 💬 사용자 질문
-{message}
-
-위 문서 내용을 바탕으로 사용자의 질문에 **구체적이고 정확하게** 답변해주세요."""
+ if HAS_HTTPX:
+ r = httpx.post(url, json=b, headers=h, timeout=45)
+ if r.status_code == 200: return r.json()["choices"][0]["message"]["content"], None
+ return None, f"HTTP {r.status_code}"
else:
- current_content = f"""## 📄 업로드된 문서 내용 ({filename})
+ import urllib.request, ssl
+ req = urllib.request.Request(url, json.dumps(b).encode(), h)
+ with urllib.request.urlopen(req, timeout=45, context=ssl.create_default_context()) as resp:
+ return json.loads(resp.read())["choices"][0]["message"]["content"], None
+ except Exception as e: return None, str(e)[:150]
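+# Usage sketch (requires GROQ_API_KEY; model ids as exposed by Groq's API):
+# resp, err = call_groq("qwen/qwen3-32b", "다음을 한 문장으로 요약: ...", max_tokens=100)
+# print(resp if resp else f"실패: {err}")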
+
+# ═══════════════════════════════════════════════
+# ★ 통합 문장 점수 (탭1 + 탭2 공유)
+# ═══════════════════════════════════════════════
+AI_ENDINGS = ['합니다','입니다','됩니다','습니다','있습니다','했습니다','겠습니다']
+AI_CONNS = ['또한','따라서','그러므로','이에 따라','한편','더불어','아울러','뿐만 아니라','이를 통해','이에','결과적으로','궁극적으로','특히','나아가','이러한']
+AI_FILLER = ['것으로 보','것으로 나타','것으로 예상','할 수 있','볼 수 있','주목할 만','중요한 역할','중요한 의미','긍정적인 영향','부정적인 영향','필요합니다','필요하다','중요합니다','중요하다','역할을 하','영향을 미','기대된다','예상됩니다','부각되고','대두되고','다양한 분야','다양한 산업','눈부신 성과','획기적인 변화','혁신적인','점에서','측면에서','관점에서']
+HUMAN_MARKERS = {
+ 'ㅋㅎㅠ': re.compile(r'([ㅋㅎㅠㅜㄷㄱ])\1{2,}'),
+ '이모티콘': re.compile(r'[;:]-?[)(DPp]|\^[_\-]?\^|ㅡㅡ|;;'),
+ '줄임': re.compile(r'ㄹㅇ|ㅇㅇ|ㄴㄴ|ㅇㅋ'),
+ '느낌표': re.compile(r'[!?]{2,}'),
+ '비격식': re.compile(r'(거든|잖아|인데|인걸|같음|느낌|아님|대박|미쳤)'),
+}
+FP = {
+ "GPT": {"m":['물론이죠','도움이 되셨기를','설명해 드리겠습니다','추가 질문','도움이 필요하시면'],"e":['습니다','드리겠습니다'],"lp":re.compile(r'^\d+\.\s|^[-•]\s',re.M)},
+ "Claude": {"m":['말씀하신','살펴보겠습니다','균형 잡힌','맥락에서','한 가지 주의할','뉘앙스'],"e":['네요','거예요'],"lp":re.compile(r'^\*\*.*\*\*|^#+\s',re.M)},
+ "Gemini": {"m":['다음과 같습니다','정리해 드리겠습니다','핵심 내용을','더 알고 싶으시면'],"e":['겠습니다','보세요'],"lp":re.compile(r'^\*\s|^-\s\*\*',re.M)},
+ "Perplexity": {"m":['검색 결과에 따르면','보도에 따르면','연구에 따르면','밝혔다','전했다'],"e":['밝혔다','나타났다'],"lp":re.compile(r'\[\d+\]',re.M)},
+}
-다음은 사용자가 업로드한 문서의 전체 내용입니다:
+def score_sentence(sent):
+ """단일 문장 AI 점수 (0~100). 탭1·탭2 공유."""
+ sc = 0; reasons = []
+ # 격식 종결어미
+ for e in AI_ENDINGS:
+ if sent.rstrip('.').endswith(e): sc += 25; reasons.append(f"격식어미(-{e})"); break
+ # 문두 접속사
+ for c in AI_CONNS:
+ if sent.strip().startswith(c): sc += 20; reasons.append(f"AI접속사({c})"); break
+ # 상투적 표현
+ filler_found = 0
+ for f in AI_FILLER:
+ if f in sent: filler_found += 1
+ if filler_found >= 2: sc += 25; reasons.append(f"상투표현×{filler_found}")
+ elif filler_found == 1: sc += 15; reasons.append("상투표현×1")
+ # 모델 지문
+ for mn, fp in FP.items():
+ for m in fp["m"]:
+ if m in sent: sc += 10; reasons.append(f"{mn}지문({m})"); break
+ # 인간 마커 (감점)
+ for n, p in HUMAN_MARKERS.items():
+ if p.search(sent): sc -= 30; reasons.append(f"인간마커({n})")
+ return max(0, min(100, sc)), reasons
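+# Example of the shared scorer: a formal ending (+25), a sentence-initial AI
+# connective (+20) and stacked stock phrases (+25) put the first sentence well
+# into AI territory, while the human markers zero out the second.
+assert score_sentence("또한 인공지능은 다양한 분야에서 중요한 역할을 하고 있습니다.")[0] >= 60
+assert score_sentence("ㅋㅋㅋ 이거 진짜 대박인데??")[0] == 0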
+
+# ═══════════════════════════════════════════════
+# 축① 통계
+# ═══════════════════════════════════════════════
+def analyze_statistics(text, sentences, words):
+ sl = [len(s) for s in sentences]
+ if len(sl) < 2: return {"score":50}
+ avg = sum(sl)/len(sl); std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
+ cv = std/avg if avg > 0 else 0
+ burst = 90 if cv<0.25 else 70 if cv<0.35 else 50 if cv<0.50 else 30 if cv<0.65 else 15
+ wf = Counter(words); t = len(words)
+ ne = 0
+ if t > 0:
+ ent = -sum((c/t)*math.log2(c/t) for c in wf.values() if c>0)
+ mx = math.log2(len(wf)) if len(wf)>1 else 1
+ ne = ent/mx if mx>0 else 0
+ es = 75 if ne>0.92 else 55 if ne>0.85 else 30
+ ttr = len(wf)/t if t>0 else 0
+ vs = 70 if ttr<0.45 else 50 if ttr<0.55 else 25
+ se = []
+ for s in sentences:
+ sw = split_words(s)
+ if len(sw)<3: continue
+ sf = Counter(sw); st = len(sw)
+ se.append(-sum((c/st)*math.log2(c/st) for c in sf.values() if c>0))
+ ps = 50
+ if len(se)>=2:
+ sa = sum(se)/len(se); scv = math.sqrt(sum((e-sa)**2 for e in se)/len(se))/(sa if sa else 1)
+ ps = 80 if scv<0.15 else 55 if scv<0.25 else 25
+ return {"score":int(burst*0.35+es*0.2+vs*0.2+ps*0.25),"cv":round(cv,3),"ttr":round(ttr,3)}
+
+# ═══════════════════════════════════════════════
+# 축② 문체
+# ═══════════════════════════════════════════════
+def analyze_korean_style(text, sentences, morphemes):
+ fc = sum(1 for s in sentences if any(s.rstrip('.').endswith(e) for e in AI_ENDINGS))
+ fr = fc/len(sentences) if sentences else 0
+ es = 80 if fr>0.8 else 60 if fr>0.6 else 40 if fr>0.4 else 20
+ cc = sum(1 for c in AI_CONNS if c in text); cd = cc/len(sentences) if sentences else 0
+ cs = 85 if cd>0.5 else 65 if cd>0.3 else 40 if cd>0.15 else 15
+ flc = sum(1 for f in AI_FILLER if f in text)
+ fs = 90 if flc>=5 else 70 if flc>=3 else 45 if flc>=1 else 10
+ hs = sum(len(p.findall(text)) for p in HUMAN_MARKERS.values())
+ hp = min(30, hs*10)
+ ps = 50
+ if morphemes:
+ pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
+ nr = sum(pc.get(t,0) for t in ['NNG','NNP','NNB','NR','NP'])/tm if tm else 0
+ ps = 70 if nr>0.45 else 55 if nr>0.40 else 30
+ return {"score":max(5,int(es*0.25+cs*0.25+fs*0.25+ps*0.25)-hp),"formal":f"{fr:.0%}","conn":f"{cd:.2f}","filler":flc,"human":hs}
+
+# ═══════════════════════════════════════════════
+# 축③ 반복
+# ═══════════════════════════════════════════════
+def analyze_repetition(text, sentences, words):
+ tr = 0
+ if len(words)>=3:
+ tg = Counter(tuple(words[i:i+3]) for i in range(len(words)-2))
+ tr = sum(1 for c in tg.values() if c>1)/len(tg) if tg else 0
+ ns = 75 if tr>0.15 else 55 if tr>0.08 else 25
+ fws = 50
+ if len(sentences)>=3:
+ fw = [split_words(s)[0] for s in sentences if split_words(s)]
+ if fw: r = Counter(fw).most_common(1)[0][1]/len(fw); fws = 70 if r>0.4 else 50 if r>0.25 else 20
+ csl = AI_CONNS + ['그러나','하지만','그래서','그런데','물론']
+ cr = sum(1 for s in sentences if any(s.strip().startswith(c) for c in csl))
+ crr = cr/len(sentences) if sentences else 0
+ css = 85 if crr>0.4 else 60 if crr>0.25 else 35 if crr>0.1 else 15
+ return {"score":int(ns*0.3+fws*0.3+css*0.4)}
+
+# ═══════════════════════════════════════════════
+# 축④ 구조
+# ═══════════════════════════════════════════════
+def analyze_structure(text, sentences):
+ paras = [p.strip() for p in text.split('\n\n') if p.strip()]
+ psc = 40
+ if len(paras)>1:
+ pl = [len(split_sentences(p)) for p in paras]; ap = sum(pl)/len(pl)
+ sp = math.sqrt(sum((l-ap)**2 for l in pl)/len(pl)) if len(pl)>1 else 0
+ cv = sp/ap if ap>0 else 0
+ psc = 75 if cv<0.2 and len(paras)>=3 else 50 if cv<0.4 else 25
+ lt = len(re.findall(r'^\d+[.)]\s',text,re.M))+len(re.findall(r'^[-•*]\s',text,re.M))+len(re.findall(r'^#+\s',text,re.M))+len(re.findall(r'\*\*[^*]+\*\*',text))
+ lsc = 85 if lt>=5 else 60 if lt>=2 else 15
+ return {"score":int(psc*0.4+lsc*0.6)}
+
+# ═══════════════════════════════════════════════
+# 축⑤ 지문
+# ═══════════════════════════════════════════════
+def analyze_model_fingerprint(text, sentences):
+ ms = {}
+ for mn, fp in FP.items():
+ sc = sum(min(15,text.count(m)*5) for m in fp["m"] if text.count(m)>0)
+ lm = fp["lp"].findall(text)
+ if lm: sc += min(20,len(lm)*3)
+ em = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in fp.get("e",[])))
+ if sentences: sc += int((em/len(sentences))*20)
+ ms[mn] = min(100,sc)
+ mx = max(ms.values()) if ms else 0
+ return {"score":85 if mx>=50 else 65 if mx>=30 else 40 if mx>=15 else 15,"model_scores":ms}
+
+# ═══════════════════════════════════════════════
+# 품질
+# ═══════════════════════════════════════════════
+def analyze_quality(text, sentences, words, morphemes):
+ qs = {}; sl = [len(s) for s in sentences]; tw = len(words)
+ ideal = sum(1 for l in sl if 15<=l<=70)/len(sentences) if sentences else 0
+ qs["가독성"] = min(100,int(ideal*70+(1-sum(1 for l in sl if l>100)/max(1,len(sentences)))*30))
+ wf = Counter(words); uw = len(wf)
+ mattr = (sum(len(set(words[i:i+50]))/50 for i in range(max(1,tw-50)))/max(1,tw-50)) if tw>=100 else (uw/tw if tw>0 else 0.5)
+ hr = sum(1 for c in wf.values() if c==1)/tw if tw>0 else 0
+ qs["어휘풍부도"] = min(100,int(mattr*80+hr*40))
+ lc = {'순접':['그래서','따라서'],'역접':['그러나','하지만','다만'],'첨가':['또한','그리고','게다가'],'전환':['한편'],'예시':['예를 들어'],'요약':['결국','결론적으로']}
+ ut = sum(1 for cw in lc.values() if any(w in text for w in cw))
+ qs["논리구조"] = min(100,int(ut/len(lc)*60+min(40,ut*10)))
+ si = sum(1 for p in [re.compile(r'됬'),re.compile(r'몇일'),re.compile(r'금새')] if p.search(text))
+ spi = sum(1 for p in [re.compile(r'할수있'),re.compile(r'것같')] if p.search(text))
+ qs["정확성"] = max(0,100-(si+spi)*15)
+ ar=0;vv=0
+ if morphemes:
+ pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
+ ar = sum(pc.get(t,0) for t in ['VA','MAG','MAJ'])/tm if tm else 0
+ vv = len(set(f for f,t in morphemes if t in ['VV','VA']))/max(1,sum(1 for _,t in morphemes if t in ['VV','VA']))
+ qs["표현풍부성"] = min(100,int(ar*200+vv*30))
+ cr = 0.5
+ if morphemes:
+ ct={'NNG','NNP','VV','VA','MAG'}; ft={'JKS','JKC','JKG','JKO','JX','JC','EP','EF','EC','ETN','ETM'}
+ cc=sum(1 for _,t in morphemes if t in ct); fc=sum(1 for _,t in morphemes if t in ft)
+ cr = cc/(cc+fc) if (cc+fc)>0 else 0.5
+ qs["정보밀도"] = min(100,int(cr*80))
+ wq = {"가독성":.20,"어휘풍부도":.18,"논리구조":.18,"정확성":.18,"표현풍부성":.13,"정보밀도":.13}
+ total = int(sum(qs[k]*wq[k] for k in wq))
+ grade = "S" if total>=85 else "A" if total>=72 else "B" if total>=58 else "C" if total>=42 else "D" if total>=28 else "F"
+ return {"score":total,"grade":grade,"sub_scores":qs}
+
+# ═══════════════════════════════════════════════
+# LLM 교차검증
+# ═══════════════════════════════════════════════
+LLM_JUDGES = [("openai/gpt-oss-120b","GPT-OSS 120B"),("qwen/qwen3-32b","Qwen3 32B"),("moonshotai/kimi-k2-instruct-0905","Kimi-K2")]
+
+def llm_cross_check(text):
+ if not GROQ_KEY: return {"score":-1,"detail":{}}
+ prompt = f"AI 텍스트 탐지 전문가로서 분석. 1) AI vs 사람+근거3 2) 마지막줄: \"AI확률: XX%\"\n\n[텍스트]\n{text[:2000]}"
+ votes=[]; rpt={}
+ for mid,mn in LLM_JUDGES:
+ resp,err = call_groq(mid,prompt)
+ if resp:
+ pm = re.search(r'AI\s*확률[:\s]*(\d+)',resp)
+ if pm: p=int(pm.group(1)); votes.append(p); rpt[mn]=f"{p}%"
+ else: rpt[mn]="파싱실패"
+ else: rpt[mn]=f"ERR"
+ if votes: return {"score":int(sum(votes)/len(votes)),"detail":rpt}
+ return {"score":-1,"detail":rpt}
+
+# ═══════════════════════════════════════════════
+# 종합 판정 (일관된 기준)
+# ═══════════════════════════════════════════════
+def compute_verdict(scores, llm_score=-1):
+ w={"통계":.25,"문체":.30,"반복성":.15,"구조":.15,"지문":.15}
+ ws=sum(scores[k]*w[k] for k in w)
+ hi=sum(1 for v in scores.values() if v>=50)
+ if hi>=4: ws+=12
+ elif hi>=3: ws+=8
+ elif hi>=2: ws+=4
+ if sum(1 for v in scores.values() if v<25)>=3: ws-=8
+ if llm_score>=0: ws=ws*0.70+llm_score*0.30
+ fs=max(0,min(100,int(ws)))
+ if fs>=75: return fs,"AI 작성 확신","ai_high"
+ if fs>=60: return fs,"AI 의심 높음","ai_medium"
+ if fs>=45: return fs,"AI 의심 중간","ai_low"
+ if fs>=30: return fs,"판단 유보","uncertain"
+ return fs,"인간 작성 추정","human"
+
+def quick_score(text):
+ sents=split_sentences(text); words=split_words(text); morphs=get_morphemes(text)
+ sc={"통계":analyze_statistics(text,sents,words)["score"],"문체":analyze_korean_style(text,sents,morphs)["score"],
+ "반복성":analyze_repetition(text,sents,words)["score"],"구조":analyze_structure(text,sents)["score"],
+ "지문":analyze_model_fingerprint(text,sents)["score"]}
+ fs,v,lv=compute_verdict(sc); return fs,v,lv,sc
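+# Worked example of the verdict fusion: four axes >= 50 trigger the +12
+# consensus bump, then an LLM vote of 80 is blended in at 30%.
+assert compute_verdict({"통계":64,"문체":70,"반복성":55,"구조":60,"지문":40}, llm_score=80) \
+    == (74, "AI 의심 높음", "ai_medium")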
+
+# ═══════════════════════════════════════════════
+# ★ AI→인간 변환 (대폭 강화)
+# ═══════════════════════════════════════════════
+CONN_MAP = {'또한':['그리고','이 밖에도'],'따라서':['그래서','이런 이유로'],'이에 따라':['그래서'],
+ '한편':['반면'],'더불어':['함께'],'결과적으로':['결국'],'궁극적으로':['결국'],
+ '나아가':['더 나아가면'],'이러한':['이런'],'특히':['그중에서도','무엇보다'],
+ '뿐만 아니라':['거기에다'],'이를 통해':['덕분에'],'이에':['그래서'],
+ '아울러':['그리고'],'그러므로':['그래서']}
+FILL_MAP = {
+ '중요한 역할을 하고':'큰 역할을 하고','중요한 의미를 가지':'큰 의미를 가지',
+ '긍정적인 영향을 미치고':'좋은 영향을 주고','부정적인 영향을':'나쁜 영향을',
+ '눈부신 성과를 거두':'대단한 성과를 내','괄목할 만한':'눈에 띄는',
+ '획기적인 변화':'큰 변화','혁신적인':'새로운',
+ '다양한 분야':'여러 분야','다양한 산업 분야':'여러 산업','다양한 산업':'여러 산업',
+ '다양한 창작':'여러 창작','다양한 측면':'여러 면',
+ '부각되고 있습니다':'두드러지고 있다','부각되고':'두드러지고',
+ '대두되고':'떠오르고','활용할 수 있게':'쓸 수 있게',
+ '활발히 진행되고 있습니다':'활발하게 이뤄지고 있다',
+ '것으로 예상됩니다':'것 같다','것으로 보입니다':'것 같다',
+ '것으로 판단됩니다':'것으로 보인다','것으로 분석됩니다':'것으로 보인다',
+}
+INLINE_CONN = {'이를 통해 ':'이걸로 ','이에 대한 ':'이 문제에 대한 ','따라서 ':'그래서 ','결과적으로 ':'결국 '}
+END_RULES = [
+ ('활발히 진행되고 있습니다','활발하게 이뤄지고 있다'),
+ ('거두고 있습니다','거두고 있다'),('변화하고 있습니다','바뀌고 있다'),
+ ('있게 되었습니다','있게 됐다'),('하고 있습니다','하고 있다'),
+ ('되고 있습니다','되고 있다'),('할 수 있습니다','할 수 있다'),
+ ('미치고 있으며','주고 있고'),('가능해졌으며','가능해졌고'),
+ ('필요합니다','필요하다'),('중요합니다','중요하다'),
+ ('있습니다','있다'),('됩니다','된다'),('했습니다','했다'),
+ ('겠습니다','것이다'),('입니다','이다'),
+ ('가지며','가지고'),('이루는 것이','이루는 게'),
+]
+# 문장 재구성용 패턴
+RESTRUCTURE = [
+ (r'(\S+)은 (\S+)에서 (.+)', lambda m: f"{m.group(2)}에서 {m.group(1)}은 {m.group(3)}" if random.random()<0.3 else m.group()),
+ (r'(.+)하고 있다\.', lambda m: f"{m.group(1)}하는 중이다." if random.random()<0.3 else m.group()),
+]
+
+def rule_humanize(text):
+ r=text; ch=[]
+ # 1. 문두 접속사
+ for ac,alts in CONN_MAP.items():
+ pat=re.compile(r'(?:^|\n)(\s*)('+re.escape(ac)+r')(\s)',re.M)
+ for m in reversed(list(pat.finditer(r))):
+ if random.random()<0.5:
+ alt=random.choice(alts); r=r[:m.start(2)]+alt+r[m.end(2):]; ch.append(f"접속사 '{ac}'→'{alt}'")
+ else: r=r[:m.start(2)]+r[m.end(2):]; ch.append(f"접속사제거 '{ac}'")
+ # 2. 문장 내 접속사
+ for ai,hu in INLINE_CONN.items():
+ if ai in r: r=r.replace(ai,hu,1); ch.append(f"내부접속 '{ai.strip()}'")
+ # 3. 상투
+ for ai in sorted(FILL_MAP.keys(), key=len, reverse=True):
+ hu = FILL_MAP[ai]
+ if ai in r: r=r.replace(ai,hu,1); ch.append(f"상투 '{ai}'")
+ # 4. 종결어미 전면 변환
+ for ai,hu in END_RULES:
+ cnt=r.count(ai)
+ if cnt>0: r=r.replace(ai,hu); ch.append(f"종결 '{ai}'→'{hu}' ×{cnt}")
+ # 5. 마크다운 제거
+ r=re.sub(r'^\d+\.\s+','',r,flags=re.M)
+ r=re.sub(r'^[-•*]\s+','',r,flags=re.M)
+ r=re.sub(r'\*\*([^*]+)\*\*',r'\1',r)
+ r=re.sub(r'^#+\s+','',r,flags=re.M)
+ # 6. 문장 재구성 (일부)
+ sents = split_sentences(r)
+ rebuilt = []
+ for i, s in enumerate(sents):
+ ns = s
+ for pat, repl in RESTRUCTURE:
+ ns = re.sub(pat, repl, ns)
+ rebuilt.append(ns)
+ # 7. 균일도 개선: 짧은 감상 삽입
+ if len(rebuilt) >= 4:
+ sl = [len(s) for s in rebuilt]
+ avg = sum(sl)/len(sl)
+ cv = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))/(avg if avg>0 else 1)
+ if cv < 0.30:
+ inserts = ["사실 이건 꽤 중요한 변화다.","솔직히 놀라운 부분이다.","이 부분이 핵심이다.","생각보다 파급력이 크다."]
+ mid = len(rebuilt)//2
+ rebuilt.insert(mid, random.choice(inserts))
+ ch.append("감상 삽입 (균일도 개선)")
+ # 8. 가끔 문장 합치기
+ final = []
+ i = 0
+ while i < len(rebuilt):
+ if i < len(rebuilt)-1 and len(rebuilt[i]) < 25 and len(rebuilt[i+1]) < 40 and random.random() < 0.3:
+            # join the two short sentences with a comma, keeping the second intact
+            merged = rebuilt[i].rstrip('.') + ', ' + rebuilt[i+1]
+ final.append(merged); ch.append("문장 합병"); i += 2
+ else:
+ final.append(rebuilt[i]); i += 1
+ r = ' '.join(final)
+ return r.strip(), ch
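+# Usage sketch (output varies between runs because replacements are sampled):
+# out, ch = rule_humanize("또한 인공지능은 중요한 역할을 하고 있습니다.")
+# out  # e.g. "그리고 인공지능은 큰 역할을 하고 있다."
+# ch   # e.g. ["접속사 '또한'→'그리고'", "상투 '중요한 역할을 하고'", "종결 '하고 있습니다'→'하고 있다' ×1"]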
+
+def llm_humanize(text):
+ if not GROQ_KEY: return text, ["LLM: API키 없음"]
+ prompt = f"""당신은 AI 텍스트 탐지 우회 전문 리라이터입니다. 아래 텍스트를 자연스러운 사람이 쓴 글로 완전히 재작성하세요.
+
+[필수 규칙 - 모두 지킬 것]
+1. 격식체(-합니다)와 비격식체(-다/-이야)를 60:40 비율로 섞어라
+2. AI접속사(또한/따라서/나아가/이러한/이에) 전부 제거하거나 구어체로 대체
+3. 상투표현(중요한 역할/긍정적인 영향/다양한 분야/획기적인) 전부 일상어로
+4. 문장 길이를 확실히 불규칙하게: 짧은 문장(10자)과 긴 문장(70자) 반드시 혼합
+5. 반드시 2-3개 개인 의견/감상 삽입 ("솔직히", "사실", "근데 생각해보면", "좀 놀라운 건")
+6. 원래 의미·정보 100% 보존
+7. 마크다운/리스트 전부 제거, 완전한 산문
+8. 문장 순서를 일부 변경해도 됨
+9. 능동태 위주로, 피동 표현 줄여라
+10. "~것으로 보인다/예상된다" 같은 회피적 표현 → "~일 것이다/~할 것 같다"
+
+[원문]
+{text[:2500]}
+
+[변환 결과만 출력 - 설명 없이]"""
+ resp, err = call_groq("qwen/qwen3-32b", prompt, max_tokens=2000, temperature=0.8)
+ if resp:
+        # qwen3 wraps its reasoning in <think> tags; strip them from the rewrite
+        # (assumed output format for qwen/qwen3-32b on Groq)
+        cleaned = re.sub(r'<think>.*?</think>', '', resp, flags=re.DOTALL).strip()