Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
AI 글 판별기 v5.1 — 5축+Perplexity+Humanizer+모델추정 + 품질 + LLM교차검증 + 표절
|
| 3 |
-
═══════════════════════════════════════════════════════════════════════════
|
| 4 |
-
★ v5.1 3대 킬러: Perplexity 확률분석 · Humanizer/Bypasser 탐지 · AI 모델 추정
|
| 5 |
-
★ 5축 AI 탐지 | 6항목 품질 | LLM 교차검증 (GPT-OSS-120B · Qwen3-32B · Kimi-K2)
|
| 6 |
-
★ 표절: Brave Search 병렬(최대20) + KCI/RISS/ARXIV + Gemini + CopyKiller 보고서
|
| 7 |
-
★ 문서: PDF·DOCX·HWP·HWPX·TXT 업로드 → 섹션별 히트맵 + PDF 보고서
|
| 8 |
-
"""
|
| 9 |
import gradio as gr
|
| 10 |
import math, re, os, json, random, time, hashlib, zlib, zipfile, tempfile
|
| 11 |
from collections import Counter
|
|
@@ -14,7 +6,6 @@ from pathlib import Path
|
|
| 14 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
from xml.etree import ElementTree as ET
|
| 16 |
from kiwipiepy import Kiwi
|
| 17 |
-
|
| 18 |
KIWI = Kiwi()
|
| 19 |
try:
|
| 20 |
import httpx; HAS_HTTPX = True
|
|
@@ -26,8 +17,6 @@ try:
|
|
| 26 |
HAS_GENAI = True
|
| 27 |
except ImportError:
|
| 28 |
HAS_GENAI = False
|
| 29 |
-
|
| 30 |
-
# ── 문서 추출 라이브러리 ──
|
| 31 |
try:
|
| 32 |
import olefile; HAS_OLEFILE = True
|
| 33 |
except ImportError:
|
|
@@ -44,15 +33,9 @@ try:
|
|
| 44 |
from docx import Document as DocxDocument; HAS_DOCX = True
|
| 45 |
except ImportError:
|
| 46 |
HAS_DOCX = False
|
| 47 |
-
|
| 48 |
GROQ_KEY = os.getenv("GROQ_API_KEY", "")
|
| 49 |
GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
|
| 50 |
BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
|
| 51 |
-
|
| 52 |
-
# ═══════════════════════════════════════════════
|
| 53 |
-
# 문서 텍스트 추출 엔진
|
| 54 |
-
# ═══════════════════════════════════════════════
|
| 55 |
-
|
| 56 |
def extract_text_from_pdf(file_path):
|
| 57 |
"""PDF → 텍스트 (페이지별 분리)"""
|
| 58 |
pages = []
|
|
@@ -76,7 +59,6 @@ def extract_text_from_pdf(file_path):
|
|
| 76 |
except Exception as e:
|
| 77 |
print(f"PyPDF2: {e}")
|
| 78 |
return None, "PDF 추출 실패 (pdfplumber, PyPDF2 없음)"
|
| 79 |
-
|
| 80 |
def extract_text_from_docx(file_path):
|
| 81 |
"""DOCX → 텍스트 (문단별 분리)"""
|
| 82 |
if not HAS_DOCX: return None, "python-docx 없음"
|
|
@@ -97,7 +79,6 @@ def extract_text_from_docx(file_path):
|
|
| 97 |
return None, "DOCX 텍스트 없음"
|
| 98 |
except Exception as e:
|
| 99 |
return None, f"DOCX 오류: {e}"
|
| 100 |
-
|
| 101 |
def extract_text_from_txt(file_path):
|
| 102 |
"""TXT/MD/CSV 등 → 텍스트"""
|
| 103 |
for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
|
|
@@ -105,12 +86,10 @@ def extract_text_from_txt(file_path):
|
|
| 105 |
with open(file_path, 'r', encoding=enc) as f:
|
| 106 |
text = f.read()
|
| 107 |
if text.strip():
|
| 108 |
-
# 빈 줄 기준으로 섹션 분리
|
| 109 |
sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
|
| 110 |
return sections if sections else [text], None
|
| 111 |
except: continue
|
| 112 |
return None, "텍스트 인코딩 감지 실패"
|
| 113 |
-
|
| 114 |
def extract_text_from_hwpx(file_path):
|
| 115 |
"""HWPX (ZIP 기반) → 텍스트"""
|
| 116 |
try:
|
|
@@ -149,7 +128,6 @@ def extract_text_from_hwpx(file_path):
|
|
| 149 |
return None, "유효하지 않은 HWPX"
|
| 150 |
except Exception as e:
|
| 151 |
return None, f"HWPX 오류: {e}"
|
| 152 |
-
|
| 153 |
def _decode_hwp_para(data):
|
| 154 |
"""HWP 바이너리 → 문단 텍스트"""
|
| 155 |
result = []
|
|
@@ -171,7 +149,6 @@ def _decode_hwp_para(data):
|
|
| 171 |
text = re.sub(r'[ \t]+', ' ', text)
|
| 172 |
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 173 |
return text if len(text) > 2 else None
|
| 174 |
-
|
| 175 |
def _extract_hwp_section(data):
|
| 176 |
"""HWP 섹션 바이너리 → 텍스트"""
|
| 177 |
texts = []
|
|
@@ -195,7 +172,6 @@ def _extract_hwp_section(data):
|
|
| 195 |
except:
|
| 196 |
pos += 1
|
| 197 |
return '\n'.join(texts) if texts else None
|
| 198 |
-
|
| 199 |
def extract_text_from_hwp(file_path):
|
| 200 |
"""HWP (OLE 기반) → 텍스트"""
|
| 201 |
if not HAS_OLEFILE: return None, "olefile 없음"
|
|
@@ -224,7 +200,6 @@ def extract_text_from_hwp(file_path):
|
|
| 224 |
return None, "HWP 텍스트 없음"
|
| 225 |
except Exception as e:
|
| 226 |
return None, f"HWP 오류: {e}"
|
| 227 |
-
|
| 228 |
def extract_text_from_file(file_path):
|
| 229 |
"""
|
| 230 |
만능 문서 추출: PDF/DOCX/HWP/HWPX/TXT → (sections_list, full_text, error)
|
|
@@ -235,7 +210,6 @@ def extract_text_from_file(file_path):
|
|
| 235 |
return None, None, "파일 없음"
|
| 236 |
ext = Path(file_path).suffix.lower()
|
| 237 |
sections, error = None, None
|
| 238 |
-
|
| 239 |
if ext == '.pdf':
|
| 240 |
sections, error = extract_text_from_pdf(file_path)
|
| 241 |
elif ext == '.docx':
|
|
@@ -248,32 +222,24 @@ def extract_text_from_file(file_path):
|
|
| 248 |
sections, error = extract_text_from_txt(file_path)
|
| 249 |
else:
|
| 250 |
return None, None, f"지원하지 않는 형식: {ext}"
|
| 251 |
-
|
| 252 |
if sections:
|
| 253 |
full = '\n\n'.join(sections)
|
| 254 |
return sections, full, None
|
| 255 |
return None, None, error or "텍스트 추출 실패"
|
| 256 |
-
|
| 257 |
-
# ════════════════════════════════════════���══════
|
| 258 |
-
# 유틸리티
|
| 259 |
-
# ═══════════════════════════════════════════════
|
| 260 |
def split_sentences(text):
|
| 261 |
try:
|
| 262 |
s = [x.text.strip() for x in KIWI.split_into_sents(text) if x.text.strip()]
|
| 263 |
if s: return s
|
| 264 |
except: pass
|
| 265 |
return [x.strip() for x in re.split(r'(?<=[.!?。])\s+', text) if x.strip()]
|
| 266 |
-
|
| 267 |
def split_words(text):
|
| 268 |
return [w for w in re.findall(r'[가-힣a-zA-Z0-9]+', text) if w]
|
| 269 |
-
|
| 270 |
def get_morphemes(text):
|
| 271 |
try:
|
| 272 |
r = KIWI.analyze(text)
|
| 273 |
if r and r[0]: return [(m.form, m.tag) for m in r[0][0]]
|
| 274 |
except: pass
|
| 275 |
return []
|
| 276 |
-
|
| 277 |
def http_get(url, headers=None, timeout=15):
|
| 278 |
try:
|
| 279 |
if HAS_HTTPX:
|
|
@@ -285,7 +251,6 @@ def http_get(url, headers=None, timeout=15):
|
|
| 285 |
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
| 286 |
return resp.read().decode('utf-8', errors='replace')
|
| 287 |
except: return None
|
| 288 |
-
|
| 289 |
def http_post_json(url, body, headers=None, timeout=30):
|
| 290 |
try:
|
| 291 |
h = headers or {}
|
|
@@ -300,7 +265,6 @@ def http_post_json(url, body, headers=None, timeout=30):
|
|
| 300 |
with urllib.request.urlopen(req, timeout=timeout, context=ssl.create_default_context()) as resp:
|
| 301 |
return json.loads(resp.read())
|
| 302 |
except: return None
|
| 303 |
-
|
| 304 |
def call_groq(model, prompt, max_tokens=800, temperature=0.1):
|
| 305 |
if not GROQ_KEY: return None, "NO_KEY"
|
| 306 |
url = "https://api.groq.com/openai/v1/chat/completions"
|
|
@@ -317,32 +281,23 @@ def call_groq(model, prompt, max_tokens=800, temperature=0.1):
|
|
| 317 |
with urllib.request.urlopen(req, timeout=45, context=ssl.create_default_context()) as resp:
|
| 318 |
return json.loads(resp.read())["choices"][0]["message"]["content"], None
|
| 319 |
except Exception as e: return None, str(e)[:150]
|
| 320 |
-
|
| 321 |
-
# ═══════════════════════════════════════════════
|
| 322 |
-
# ★ 통합 문장 점수 (탭1 + 탭2 공유) — v5.0 대폭 강화
|
| 323 |
-
# ═══════════════════════════════════════════════
|
| 324 |
AI_ENDINGS = ['합니다','입니다','됩니다','습니다','있습니다','했습니다','겠습니다']
|
| 325 |
-
# 비격식이지만 AI적인 종결 패턴
|
| 326 |
AI_CASUAL_ENDINGS = ['라고 할 수 있다','라고 볼 수 있다','다고 생각한다','다고 판단된다',
|
| 327 |
'인 셈이다','인 것이다','는 것이다','는 셈이다','ㄹ 것이다','을 것이다',
|
| 328 |
'라 할 수 있다','로 보인다','로 판단된다','고 있다','는 추세다','는 상황이다',
|
| 329 |
'지 않을 수 없다','라 하겠다','음을 알 수 있다','할 필요가 있다']
|
| 330 |
AI_CONNS = ['또한','따라서','그러므로','이에 따라','한편','더불어','아울러','뿐만 아니라',
|
| 331 |
'이를 통해','이에','결과적으로','궁극적으로','특히','나아가','이러한']
|
| 332 |
-
# 자연어에서도 쓰이지만 AI가 과도하게 쓰는 접속사
|
| 333 |
AI_SOFT_CONNS = ['물론','그러나','하지만','이처럼','이와 같이','이를 바탕으로']
|
| 334 |
AI_FILLER = ['것으로 보','것으로 나타','것으로 예상','할 수 있','볼 수 있','주목할 만',
|
| 335 |
'중요한 역할','중요한 의미','긍정적인 영향','부정적인 영향','필요합니다','필요하다',
|
| 336 |
'중요합니다','중요하다','역할을 하','영향을 미','기대된다','예상됩니다','부각되고',
|
| 337 |
'대두되고','다양한 분야','다양한 산업','눈부신 성과','획기적인 변화','혁신적인',
|
| 338 |
'점에서','측면에서','관점에서']
|
| 339 |
-
# 비격식 AI 상투표현 (비격식체로 써도 AI적)
|
| 340 |
AI_CASUAL_FILLER = ['무궁무진하다','무궁무진한','과언이 아니','돌파구가 될','전환점이 될',
|
| 341 |
'기반으로','발판으로','원동력이','초석이 될','가속화되','급부상','패러다임',
|
| 342 |
'지평을 열','새로운 장을','대전환','본격화되','고도화','이정표']
|
| 343 |
-
# 양보-주장 패턴 (AI 특유: "물론 X하지만, Y" 구조)
|
| 344 |
AI_CONCESSION = re.compile(r'물론.{2,20}(하지만|그러나|그렇지만|다만)|.{2,15}(이긴 하지만|기는 하지만|수 있지만|수는 있지만)')
|
| 345 |
-
# 영어 AI 패턴
|
| 346 |
EN_AI_MARKERS = ['furthermore','additionally','moreover','it is worth noting','in conclusion',
|
| 347 |
'it is important to','plays a crucial role','significant impact','various aspects',
|
| 348 |
'in this regard','consequently','nevertheless','integral part of','led to remarkable',
|
|
@@ -369,23 +324,16 @@ FP = {
|
|
| 369 |
"Perplexity": {"m":['검색 결과에 따르면','보도에 따르면','연구에 따르면','밝혔다','전했다',
|
| 370 |
'것으로 나타났다','것으로 조사됐다','것으로 집계됐다','발표했다'],"e":['밝혔다','나타났다','전했다'],"lp":re.compile(r'\[\d+\]',re.M)},
|
| 371 |
}
|
| 372 |
-
|
| 373 |
def score_sentence(sent):
|
| 374 |
"""단일 문장 AI 점수 (0~100). 탭1·탭2 공유. v5.0 대폭 강화."""
|
| 375 |
sc = 0; reasons = []
|
| 376 |
sl = sent.lower().strip()
|
| 377 |
sr = sent.rstrip('.!?。')
|
| 378 |
-
|
| 379 |
-
# ── 격식 종결어미 ──
|
| 380 |
for e in AI_ENDINGS:
|
| 381 |
if sr.endswith(e): sc += 22; reasons.append(f"격식어미(-{e})"); break
|
| 382 |
-
|
| 383 |
-
# ── 비격식 AI 종결 패턴 ──
|
| 384 |
if sc == 0: # 격식이 아닌 경우만
|
| 385 |
for e in AI_CASUAL_ENDINGS:
|
| 386 |
if sr.endswith(e): sc += 15; reasons.append(f"비격식AI(-{e})"); break
|
| 387 |
-
|
| 388 |
-
# ── 문두 접속사 ──
|
| 389 |
stripped = sent.strip()
|
| 390 |
for c in AI_CONNS:
|
| 391 |
if stripped.startswith(c):
|
|
@@ -393,36 +341,24 @@ def score_sentence(sent):
|
|
| 393 |
else:
|
| 394 |
for c in AI_SOFT_CONNS:
|
| 395 |
if stripped.startswith(c): sc += 8; reasons.append(f"약한접속사({c})"); break
|
| 396 |
-
|
| 397 |
-
# ── 상투적 표현 (클래식 + 비격식) ──
|
| 398 |
filler_found = sum(1 for f in AI_FILLER if f in sent)
|
| 399 |
casual_filler = sum(1 for f in AI_CASUAL_FILLER if f in sent)
|
| 400 |
total_filler = filler_found + casual_filler
|
| 401 |
if total_filler >= 3: sc += 25; reasons.append(f"상투표현×{total_filler}")
|
| 402 |
elif total_filler == 2: sc += 18; reasons.append(f"상투표현×2")
|
| 403 |
elif total_filler == 1: sc += 10; reasons.append(f"상투표현×1")
|
| 404 |
-
|
| 405 |
-
# ── 양보-주장 패턴 (Claude/GPT 특유) ──
|
| 406 |
if AI_CONCESSION.search(sent): sc += 10; reasons.append("양보패턴")
|
| 407 |
-
|
| 408 |
-
# ── 모델 지문 ──
|
| 409 |
for mn, fp in FP.items():
|
| 410 |
for m in fp["m"]:
|
| 411 |
if m in sent: sc += 8; reasons.append(f"{mn}지문"); break
|
| 412 |
-
|
| 413 |
-
# ── 영어 AI 패턴 (복수 누적) ──
|
| 414 |
en_count = sum(1 for em in EN_AI_MARKERS if em in sl)
|
| 415 |
if en_count >= 3: sc += 25; reasons.append(f"영어AI×{en_count}")
|
| 416 |
elif en_count >= 2: sc += 18; reasons.append(f"영어AI×{en_count}")
|
| 417 |
elif en_count >= 1: sc += 12; reasons.append(f"영어AI×1")
|
| 418 |
-
|
| 419 |
-
# ── 복합 보너스: 격식+상투+접속 동시 → 거의 확실 AI ──
|
| 420 |
has_formal = any(sr.endswith(e) for e in AI_ENDINGS)
|
| 421 |
has_conn = any(stripped.startswith(c) for c in AI_CONNS)
|
| 422 |
if has_formal and total_filler >= 1 and has_conn: sc += 8; reasons.append("복합AI")
|
| 423 |
elif has_formal and total_filler >= 2: sc += 5; reasons.append("격식+상투")
|
| 424 |
-
|
| 425 |
-
# ── 인간 마커 (감점) — 세분화 ──
|
| 426 |
for n, p in HUMAN_MARKERS.items():
|
| 427 |
matches = p.findall(sent)
|
| 428 |
if matches:
|
|
@@ -430,22 +366,13 @@ def score_sentence(sent):
|
|
| 430 |
elif n in ('비격식종결','구어축약'): sc -= 18; reasons.append(f"구어체({n})")
|
| 431 |
elif n == '맞춤법오류': sc -= 12; reasons.append("맞춤법오류")
|
| 432 |
elif n in ('느낌표','말줄임표'): sc -= 10; reasons.append(f"인간({n})")
|
| 433 |
-
|
| 434 |
return max(0, min(100, sc)), reasons
|
| 435 |
-
|
| 436 |
-
# ═══════════════════════════════════════════════
|
| 437 |
-
# 축① 통계 — v5.0: Burstiness + 조건부 엔트로피 추가
|
| 438 |
-
# ═══════════════════════════════════════════════
|
| 439 |
def analyze_statistics(text, sentences, words):
|
| 440 |
sl = [len(s) for s in sentences]
|
| 441 |
if len(sl) < 2: return {"score":50}
|
| 442 |
-
|
| 443 |
-
# 1. CV (문장 길이 변동 계수)
|
| 444 |
avg = sum(sl)/len(sl); std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
|
| 445 |
cv = std/avg if avg > 0 else 0
|
| 446 |
cv_score = 85 if cv<0.20 else 70 if cv<0.30 else 50 if cv<0.45 else 30 if cv<0.60 else 15
|
| 447 |
-
|
| 448 |
-
# 2. Burstiness — 연속 문장 길이차 변동
|
| 449 |
diffs = [abs(sl[i]-sl[i-1]) for i in range(1,len(sl))]
|
| 450 |
burst_score = 50
|
| 451 |
if diffs:
|
|
@@ -453,21 +380,14 @@ def analyze_statistics(text, sentences, words):
|
|
| 453 |
max_d = max(diffs)
|
| 454 |
burst_ratio = max_d / (avg_d + 1)
|
| 455 |
burst_score = 85 if burst_ratio < 1.8 else 65 if burst_ratio < 2.5 else 40 if burst_ratio < 3.5 else 20
|
| 456 |
-
|
| 457 |
-
# 3. ★ 표준 길이 비율 — AI는 대부분 25~60자, 인간은 극단 outlier 있음
|
| 458 |
standard_ratio = sum(1 for l in sl if 20 <= l <= 60) / len(sl)
|
| 459 |
std_score = 80 if standard_ratio > 0.8 else 60 if standard_ratio > 0.6 else 40 if standard_ratio > 0.4 else 20
|
| 460 |
-
# 극단 문장(10자 미만 or 80자 초과) 있으면 인간적
|
| 461 |
extreme = sum(1 for l in sl if l < 10 or l > 80)
|
| 462 |
if extreme >= 2: std_score = max(10, std_score - 20)
|
| 463 |
elif extreme >= 1: std_score = max(15, std_score - 10)
|
| 464 |
-
|
| 465 |
-
# 4. 어휘 다양성
|
| 466 |
wf = Counter(words); t = len(words)
|
| 467 |
ttr = len(wf)/t if t>0 else 0
|
| 468 |
vocab_score = 70 if ttr<0.45 else 55 if ttr<0.55 else 35 if ttr<0.65 else 20
|
| 469 |
-
|
| 470 |
-
# 5. 문장 복잡도 균일성
|
| 471 |
wpc = [len(split_words(s)) for s in sentences]
|
| 472 |
complex_score = 50
|
| 473 |
if len(wpc) >= 3:
|
|
@@ -475,108 +395,72 @@ def analyze_statistics(text, sentences, words):
|
|
| 475 |
wpc_std = math.sqrt(sum((w-wpc_avg)**2 for w in wpc)/len(wpc))
|
| 476 |
wpc_cv = wpc_std/wpc_avg if wpc_avg > 0 else 0
|
| 477 |
complex_score = 80 if wpc_cv < 0.20 else 60 if wpc_cv < 0.35 else 35 if wpc_cv < 0.50 else 15
|
| 478 |
-
|
| 479 |
final = int(cv_score*0.20 + burst_score*0.20 + std_score*0.25 + vocab_score*0.15 + complex_score*0.20)
|
| 480 |
return {"score":final,"cv":round(cv,3),"ttr":round(ttr,3)}
|
| 481 |
-
|
| 482 |
-
# ═══════════════════════════════════════════════
|
| 483 |
-
# 축② 문체 — v5.0: 양보패턴 + 비격식AI + 접속사 위치패턴
|
| 484 |
-
# ═══════════════════════════════════════════════
|
| 485 |
def analyze_korean_style(text, sentences, morphemes):
|
| 486 |
if not sentences: return {"score":50}
|
| 487 |
-
|
| 488 |
-
# 1. 종결어미 분석 (격식 + 비격식AI)
|
| 489 |
formal_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_ENDINGS))
|
| 490 |
casual_ai = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
|
| 491 |
fr = formal_cnt/len(sentences)
|
| 492 |
car = casual_ai/len(sentences)
|
| 493 |
-
# 격식 비율 높으면 AI적, 비격식AI도 가산
|
| 494 |
ending_score = 85 if fr>0.7 else 65 if fr>0.5 else 45 if fr>0.3 else 25 if fr>0.1 else 10
|
| 495 |
ending_score = min(90, ending_score + int(car * 25)) # 비격식AI 보너스
|
| 496 |
-
|
| 497 |
-
# 2. 접속사 밀도 + ★ 위치 패턴
|
| 498 |
conn_positions = []
|
| 499 |
for i, s in enumerate(sentences):
|
| 500 |
for c in AI_CONNS:
|
| 501 |
if s.strip().startswith(c): conn_positions.append(i); break
|
| 502 |
conn_density = len(conn_positions)/len(sentences) if sentences else 0
|
| 503 |
conn_score = 85 if conn_density>0.4 else 65 if conn_density>0.25 else 40 if conn_density>0.1 else 15
|
| 504 |
-
# AI는 접속사를 규칙적 간격으로 배치 (2-3문장마다)
|
| 505 |
if len(conn_positions) >= 2:
|
| 506 |
gaps = [conn_positions[i]-conn_positions[i-1] for i in range(1,len(conn_positions))]
|
| 507 |
gap_cv = (math.sqrt(sum((g-sum(gaps)/len(gaps))**2 for g in gaps)/len(gaps))/(sum(gaps)/len(gaps)+0.01))
|
| 508 |
if gap_cv < 0.5: conn_score = min(90, conn_score + 10) # 매우 규칙적 → AI 보너스
|
| 509 |
-
|
| 510 |
-
# 3. 상투표현 (클래식 + 비격식)
|
| 511 |
filler_cnt = sum(1 for f in AI_FILLER if f in text) + sum(1 for f in AI_CASUAL_FILLER if f in text)
|
| 512 |
filler_score = 90 if filler_cnt>=6 else 75 if filler_cnt>=4 else 55 if filler_cnt>=2 else 30 if filler_cnt>=1 else 10
|
| 513 |
-
|
| 514 |
-
# 4. ★ 양보-주장 구문 (AI 특유: "물론~하지만~" 패턴)
|
| 515 |
concession_cnt = len(AI_CONCESSION.findall(text))
|
| 516 |
conc_score = 80 if concession_cnt >= 2 else 55 if concession_cnt >= 1 else 20
|
| 517 |
-
|
| 518 |
-
# 5. 인간 마커 감점
|
| 519 |
human_count = sum(len(p.findall(text)) for p in HUMAN_MARKERS.values())
|
| 520 |
human_penalty = min(35, human_count * 8)
|
| 521 |
-
|
| 522 |
-
# 6. 품사 비율 (명사 과다 = AI적)
|
| 523 |
pos_score = 45
|
| 524 |
if morphemes:
|
| 525 |
pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
|
| 526 |
noun_r = sum(pc.get(t,0) for t in ['NNG','NNP','NNB','NR'])/tm if tm else 0
|
| 527 |
pos_score = 70 if noun_r>0.42 else 55 if noun_r>0.38 else 35 if noun_r>0.32 else 20
|
| 528 |
-
|
| 529 |
final = max(5, int(ending_score*0.25 + conn_score*0.20 + filler_score*0.20 +
|
| 530 |
conc_score*0.10 + pos_score*0.15 + 10*0.10) - human_penalty)
|
| 531 |
return {"score":final,"formal":f"{fr:.0%}","conn":f"{conn_density:.2f}","filler":filler_cnt,"human":human_count}
|
| 532 |
-
|
| 533 |
-
# ═══════════════════════════════════════════════
|
| 534 |
-
# 축③ 반복 — v5.0: 문두 N-어절 + 구문 템플릿 + 의미 반복
|
| 535 |
-
# ═══════════════════════════════════════════════
|
| 536 |
def analyze_repetition(text, sentences, words):
|
| 537 |
if not sentences or len(sentences) < 2: return {"score":35}
|
| 538 |
-
|
| 539 |
-
# 1. N-gram 반복 (3-gram)
|
| 540 |
tr = 0
|
| 541 |
if len(words)>=5:
|
| 542 |
tg = Counter(tuple(words[i:i+3]) for i in range(len(words)-2))
|
| 543 |
tr = sum(1 for c in tg.values() if c>1)/len(tg) if tg else 0
|
| 544 |
ngram_score = 80 if tr>0.15 else 60 if tr>0.08 else 35 if tr>0.03 else 15
|
| 545 |
-
|
| 546 |
-
# 2. ★ 문두 2-3어절 다양성 (첫 단어만이 아닌 첫 2-3어절)
|
| 547 |
openers_2 = []
|
| 548 |
openers_3 = []
|
| 549 |
for s in sentences:
|
| 550 |
ws = split_words(s)
|
| 551 |
if len(ws) >= 2: openers_2.append(tuple(ws[:2]))
|
| 552 |
if len(ws) >= 3: openers_3.append(tuple(ws[:3]))
|
| 553 |
-
|
| 554 |
opener2_score = 50
|
| 555 |
if openers_2:
|
| 556 |
unique2 = len(set(openers_2))/len(openers_2)
|
| 557 |
opener2_score = 80 if unique2 < 0.5 else 60 if unique2 < 0.7 else 35 if unique2 < 0.85 else 15
|
| 558 |
-
|
| 559 |
-
# 3. AI 접속사 문두 반복
|
| 560 |
ai_only_conns = ['또한','따라서','그러므로','이에 따라','더불어','아울러','뿐만 아니라',
|
| 561 |
'이를 통해','이에','결과적으로','궁극적으로','나아가','이러한']
|
| 562 |
cr = sum(1 for s in sentences if any(s.strip().startswith(c) for c in ai_only_conns))
|
| 563 |
crr = cr/len(sentences) if sentences else 0
|
| 564 |
ai_conn_score = 85 if crr>0.35 else 65 if crr>0.2 else 40 if crr>0.08 else 15
|
| 565 |
-
|
| 566 |
-
# 4. ★ 구문 템플릿 반복 (주어+조사+...+종결 패턴)
|
| 567 |
templates = []
|
| 568 |
for s in sentences:
|
| 569 |
ws = split_words(s)
|
| 570 |
if len(ws) >= 4:
|
| 571 |
-
# 첫 어절 + 마지막 어절 패턴
|
| 572 |
templates.append((ws[0], ws[-1]))
|
| 573 |
template_rep = 0
|
| 574 |
if templates:
|
| 575 |
tc = Counter(templates)
|
| 576 |
template_rep = sum(1 for c in tc.values() if c > 1) / len(tc) if tc else 0
|
| 577 |
template_score = 80 if template_rep > 0.3 else 55 if template_rep > 0.1 else 25
|
| 578 |
-
|
| 579 |
-
# 5. ★ 종결어미 다양성 (AI는 같은 종결어미 반복)
|
| 580 |
endings = []
|
| 581 |
for s in sentences:
|
| 582 |
sr = s.rstrip('.!?。')
|
|
@@ -586,36 +470,23 @@ def analyze_repetition(text, sentences, words):
|
|
| 586 |
if endings:
|
| 587 |
unique_e = len(set(endings))/len(endings)
|
| 588 |
ending_div = 80 if unique_e < 0.3 else 60 if unique_e < 0.5 else 35 if unique_e < 0.7 else 15
|
| 589 |
-
|
| 590 |
final = int(ngram_score*0.15 + opener2_score*0.20 + ai_conn_score*0.25 +
|
| 591 |
template_score*0.15 + ending_div*0.25)
|
| 592 |
return {"score":final}
|
| 593 |
-
|
| 594 |
-
# ═══════════════════════════════════════════════
|
| 595 |
-
# 축④ 구조 — v5.0: 추상성/구체성 + 문장다양성 + 구두점
|
| 596 |
-
# ═══════════════════════════════════════════════
|
| 597 |
-
# AI 추상 수식어 (구체 명사 없이 쓰이는 AI적 형용사)
|
| 598 |
AI_VAGUE = re.compile(r'다양한|중요한|긍정적인|부정적인|획기적인|혁신적인|효율적인|체계적인|종합적인|전반적인|지속적인|적극적인|상당한|주요한')
|
| 599 |
-
# 구체성 지표 (인간적 — 고유명사, 숫자+단위, 인용/출처)
|
| 600 |
CONCRETE_PROPER = re.compile(r'삼성|LG|현대|SK|카카오|네이버|넷플릭스|구글|애플|테슬라|아마존|마이크로소프트|[가-힣]{2,}대학|[가-힣]{2,}병원|[가-힣]{1,3}시[는을이가]|[가-힣]{1,3}구[는을이가]|[가-힣]{2,}동[에서]')
|
| 601 |
CONCRETE_NUMBER = re.compile(r'\d{2,}[만억조원달러%개년월일위등호]|\d+\.\d+%|\d{4}년|\d{1,2}월')
|
| 602 |
CONCRETE_QUOTE = re.compile(r'에 따르면|발표했|밝혔다|보도했|전했다|라고 말|인터뷰|설문|조사|통계청|보고서')
|
| 603 |
-
|
| 604 |
def analyze_structure(text, sentences):
|
| 605 |
if not sentences: return {"score":35}
|
| 606 |
-
|
| 607 |
-
# 1. 마크다운/리스트
|
| 608 |
lt = (len(re.findall(r'^\d+[.)]\s',text,re.M)) + len(re.findall(r'^[-•*]\s',text,re.M)) +
|
| 609 |
len(re.findall(r'^#+\s',text,re.M)) + len(re.findall(r'\*\*[^*]+\*\*',text)))
|
| 610 |
list_score = 90 if lt>=5 else 70 if lt>=3 else 45 if lt>=1 else 10
|
| 611 |
-
|
| 612 |
-
# 2. ★ 추상성 vs 구체성 (핵심 판별 — 40% 가중치)
|
| 613 |
vague_cnt = len(AI_VAGUE.findall(text))
|
| 614 |
proper_cnt = len(CONCRETE_PROPER.findall(text))
|
| 615 |
number_cnt = len(CONCRETE_NUMBER.findall(text))
|
| 616 |
quote_cnt = len(CONCRETE_QUOTE.findall(text))
|
| 617 |
concrete_total = proper_cnt + number_cnt + quote_cnt
|
| 618 |
-
|
| 619 |
if vague_cnt >= 3 and concrete_total == 0: abstract_score = 90
|
| 620 |
elif vague_cnt >= 2 and concrete_total <= 1: abstract_score = 70
|
| 621 |
elif vague_cnt >= 1 and concrete_total == 0: abstract_score = 55
|
|
@@ -623,20 +494,14 @@ def analyze_structure(text, sentences):
|
|
| 623 |
elif concrete_total >= 2: abstract_score = 20
|
| 624 |
elif concrete_total >= 1: abstract_score = 30
|
| 625 |
else: abstract_score = 45
|
| 626 |
-
|
| 627 |
-
# 3. 문장 유형 다양성 (AI=서술문만, 인간=의문/감탄 혼용)
|
| 628 |
has_question = any(s.strip().endswith('?') for s in sentences)
|
| 629 |
has_exclaim = any(s.strip().endswith('!') for s in sentences)
|
| 630 |
has_ellipsis = any('...' in s or '…' in s for s in sentences)
|
| 631 |
variety = sum([has_question, has_exclaim, has_ellipsis])
|
| 632 |
type_score = 15 if variety >= 2 else 40 if variety >= 1 else 65
|
| 633 |
-
|
| 634 |
-
# 4. 구두점 단조로움
|
| 635 |
puncts = re.findall(r'[!?,;:…—\-~]', text)
|
| 636 |
unique_punct = len(set(puncts))
|
| 637 |
punct_score = 65 if unique_punct <= 1 else 45 if unique_punct <= 3 else 20
|
| 638 |
-
|
| 639 |
-
# 5. 문단 구조 (다문단인 경우)
|
| 640 |
paras = [p.strip() for p in text.split('\n\n') if p.strip()]
|
| 641 |
para_score = 35
|
| 642 |
if len(paras) >= 2:
|
|
@@ -647,13 +512,8 @@ def analyze_structure(text, sentences):
|
|
| 647 |
para_score = 75 if pcv < 0.2 else 55 if pcv < 0.35 else 30
|
| 648 |
if len(paras) >= 3 and pl[0] < avg_p and pl[-1] < avg_p:
|
| 649 |
para_score = min(85, para_score + 10)
|
| 650 |
-
|
| 651 |
final = int(list_score*0.10 + abstract_score*0.40 + type_score*0.20 + punct_score*0.10 + para_score*0.20)
|
| 652 |
return {"score":final}
|
| 653 |
-
|
| 654 |
-
# ═══════════════════════════════════════════════
|
| 655 |
-
# 축⑤ 지문 — v5.0: Perplexity + 비격식AI + 영어 패턴
|
| 656 |
-
# ═══════════════════════════════════════════════
|
| 657 |
def analyze_model_fingerprint(text, sentences):
|
| 658 |
ms = {}
|
| 659 |
sl = text.lower()
|
|
@@ -664,39 +524,22 @@ def analyze_model_fingerprint(text, sentences):
|
|
| 664 |
em = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in fp.get("e",[])))
|
| 665 |
if sentences: sc += int((em/len(sentences))*20)
|
| 666 |
ms[mn] = min(100,sc)
|
| 667 |
-
|
| 668 |
-
# ★ 비격식 AI 일반 지문 (특정 모델 불문)
|
| 669 |
general_ai = 0
|
| 670 |
-
# 비격식 AI 상투
|
| 671 |
general_ai += sum(5 for f in AI_CASUAL_FILLER if f in text)
|
| 672 |
-
# 비격식 AI 종결
|
| 673 |
casual_end_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
|
| 674 |
general_ai += casual_end_cnt * 5
|
| 675 |
-
# 양보 패턴
|
| 676 |
general_ai += len(AI_CONCESSION.findall(text)) * 8
|
| 677 |
ms["비격식AI"] = min(100, general_ai)
|
| 678 |
-
|
| 679 |
-
# ★ 영어 AI 지문
|
| 680 |
en_score = sum(5 for em in EN_AI_MARKERS if em in sl)
|
| 681 |
ms["영어AI"] = min(100, en_score)
|
| 682 |
-
|
| 683 |
mx = max(ms.values()) if ms else 0
|
| 684 |
-
# 복수 모델에서 점수가 나오면 더 AI적
|
| 685 |
multi = sum(1 for v in ms.values() if v >= 10)
|
| 686 |
multi_bonus = 10 if multi >= 3 else 5 if multi >= 2 else 0
|
| 687 |
-
|
| 688 |
base = 85 if mx>=50 else 65 if mx>=35 else 45 if mx>=20 else 25 if mx>=10 else 10
|
| 689 |
return {"score":min(95, base + multi_bonus),"model_scores":{k:v for k,v in ms.items() if k not in ("비격식AI","영어AI") or v > 0}}
|
| 690 |
-
|
| 691 |
-
# ═══════════════════════════════════════════════
|
| 692 |
-
# ★★★ 킬러 기능 ① — Perplexity 기반 AI 확률 (v5.1)
|
| 693 |
-
# ═══════════════════════════════════════════════
|
| 694 |
-
# AI 텍스트는 예측 가능도가 높음 (낮은 Perplexity) → 문자/형태소 n-gram 기반
|
| 695 |
def analyze_perplexity(text, sentences, morphemes):
|
| 696 |
"""한국어 특화 Perplexity + Burstiness — 문자 엔트로피 보정"""
|
| 697 |
if len(sentences) < 2: return {"score": 40, "entropy": 0, "variance": 0, "order": 0, "zipf": 0}
|
| 698 |
-
|
| 699 |
-
# === 1. 문자 바이그램 엔트로피 (형태소보다 안정적) ===
|
| 700 |
chars = [c for c in text if c.strip()]
|
| 701 |
char_score = 45
|
| 702 |
if len(chars) >= 30:
|
|
@@ -704,14 +547,11 @@ def analyze_perplexity(text, sentences, morphemes):
|
|
| 704 |
cb_freq = Counter(cbigrams)
|
| 705 |
total_cb = len(cbigrams)
|
| 706 |
char_entropy = -sum((cnt/total_cb)*math.log2(cnt/total_cb) for cnt in cb_freq.values())
|
| 707 |
-
# AI 한국어: ~7~9비트, 인간: ~9~12비트
|
| 708 |
if char_entropy < 7.5: char_score = 78
|
| 709 |
elif char_entropy < 8.5: char_score = 62
|
| 710 |
elif char_entropy < 9.5: char_score = 42
|
| 711 |
elif char_entropy < 10.5: char_score = 25
|
| 712 |
else: char_score = 12
|
| 713 |
-
|
| 714 |
-
# === 2. 문장 길이 Burstiness (CV) ===
|
| 715 |
sl = [len(s) for s in sentences]
|
| 716 |
burst_score = 45
|
| 717 |
if len(sl) >= 3:
|
|
@@ -723,8 +563,6 @@ def analyze_perplexity(text, sentences, morphemes):
|
|
| 723 |
elif cv < 0.40: burst_score = 38
|
| 724 |
elif cv < 0.60: burst_score = 20
|
| 725 |
else: burst_score = 8
|
| 726 |
-
|
| 727 |
-
# === 3. 문장간 어휘밀도(TTR) 편차 ===
|
| 728 |
sent_ttr = []
|
| 729 |
for s in sentences:
|
| 730 |
sw = split_words(s)
|
|
@@ -738,8 +576,6 @@ def analyze_perplexity(text, sentences, morphemes):
|
|
| 738 |
elif std_ttr < 0.08: ttr_score = 55
|
| 739 |
elif std_ttr < 0.15: ttr_score = 35
|
| 740 |
else: ttr_score = 15
|
| 741 |
-
|
| 742 |
-
# === 4. 종결어미 엔트로피 ===
|
| 743 |
endings = [s.rstrip('.!?\u2026')[-3:] for s in sentences if len(s) >= 5]
|
| 744 |
end_score = 40
|
| 745 |
if len(endings) >= 3:
|
|
@@ -751,55 +587,32 @@ def analyze_perplexity(text, sentences, morphemes):
|
|
| 751 |
elif norm_ent < 0.7: end_score = 50
|
| 752 |
elif norm_ent < 0.85: end_score = 32
|
| 753 |
else: end_score = 15
|
| 754 |
-
|
| 755 |
final = int(char_score * 0.30 + burst_score * 0.30 + ttr_score * 0.20 + end_score * 0.20)
|
| 756 |
return {"score": final, "entropy": char_score, "variance": burst_score, "order": ttr_score, "zipf": end_score}
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
# ═══════════════════════════════════════════════
|
| 760 |
-
# ★★★ 킬러 기능 ② — Humanizer/Bypasser 탐지 (v5.1)
|
| 761 |
-
# ═══════════════════════════════════════════════
|
| 762 |
-
# 패러프레이즈 도구(QuillBot 등)로 수정된 AI 글의 잔존 흔적 탐지
|
| 763 |
-
|
| 764 |
-
# Humanizer 특유 패턴
|
| 765 |
HUMANIZER_OVERSUBST = re.compile(r'활용하다|이용하다|사용하다|적용하다|도입하다|채택하다|수행하다|진행하다|실시하다|실행하다')
|
| 766 |
HUMANIZER_AWKWARD = re.compile(r'그것은|이것은|저것은|해당 사항|앞서 언급한|전술한|상기한|기술된')
|
| 767 |
HUMANIZER_PASSIVE = re.compile(r'되어지[고는며]|하게 되었[다습]|수행되[었어]|진행되[었어]|실시되[었어]|활용되[었어]')
|
| 768 |
-
|
| 769 |
def analyze_humanizer(text, sentences, words, morphemes):
|
| 770 |
"""Humanizer/Bypasser 탐지 — AI 원문 패러프레이즈 흔적 분석"""
|
| 771 |
if len(sentences) < 2: return {"score": 20, "signals": []}
|
| 772 |
signals = []
|
| 773 |
-
|
| 774 |
-
# === 1. 동의어 과다 치환 패턴 ===
|
| 775 |
-
# Humanizer는 같은 의미를 다양한 동의어로 바꿈 → 비자연적 어휘 분산
|
| 776 |
content_words = [f for f, t in morphemes if t in ('NNG', 'NNP', 'VV', 'VA')]
|
| 777 |
if len(content_words) >= 10:
|
| 778 |
cw_freq = Counter(content_words)
|
| 779 |
-
# Hapax ratio 극단적으로 높으면 동의어 치환 의심
|
| 780 |
hapax = sum(1 for c in cw_freq.values() if c == 1)
|
| 781 |
hapax_ratio = hapax / len(cw_freq) if cw_freq else 0
|
| 782 |
-
# 자연어: 0.4~0.7, Humanizer: 0.8+ (모든 단어를 다 바꿔서)
|
| 783 |
if hapax_ratio > 0.95 and len(content_words) >= 30:
|
| 784 |
signals.append(("동의어과다치환", 20, "핵심 어휘가 과도하게 분산"))
|
| 785 |
elif hapax_ratio > 0.90 and len(content_words) >= 25:
|
| 786 |
signals.append(("동의어치환의심", 12, "어휘 반복 회피 패턴"))
|
| 787 |
-
|
| 788 |
-
# === 2. 구조 보존 + 어휘만 변경 패턴 ===
|
| 789 |
-
# 원문 AI의 문장 구조(길이, 어순)는 유지되면서 단어만 바뀜
|
| 790 |
sl = [len(s) for s in sentences]
|
| 791 |
if len(sl) >= 4:
|
| 792 |
avg = sum(sl) / len(sl)
|
| 793 |
cv = math.sqrt(sum((l - avg)**2 for l in sl) / len(sl)) / (avg + 1e-10)
|
| 794 |
-
# 문장 길이 균일 + 어휘 다양 = Humanizer 패턴
|
| 795 |
unique_ratio = len(set(words)) / len(words) if words else 0
|
| 796 |
if cv < 0.20 and unique_ratio > 0.80 and len(sentences) >= 5:
|
| 797 |
signals.append(("구조보존어휘변경", 18, "문장 구조 균일 + 비정상적 어휘 다양성"))
|
| 798 |
-
|
| 799 |
-
# === 3. 잔존 AI 패턴 ===
|
| 800 |
-
# Humanizer가 놓치기 쉬운 AI 흔적
|
| 801 |
residual = 0
|
| 802 |
-
# 접속사 위치 규칙성 (Humanizer는 접속사를 잘 안 바꿈)
|
| 803 |
conn_positions = []
|
| 804 |
for i, s in enumerate(sentences):
|
| 805 |
stripped = s.strip()
|
|
@@ -808,13 +621,10 @@ def analyze_humanizer(text, sentences, words, morphemes):
|
|
| 808 |
conn_positions.append(i)
|
| 809 |
break
|
| 810 |
if len(conn_positions) >= 2:
|
| 811 |
-
# 등간격 접속사 = AI 원문 구조 잔존
|
| 812 |
gaps = [conn_positions[i] - conn_positions[i-1] for i in range(1, len(conn_positions))]
|
| 813 |
if gaps and max(gaps) - min(gaps) <= 1: # 거의 등간격
|
| 814 |
signals.append(("접속사등간격잔존", 15, "접속사 배치가 규칙적 (AI 원문 구조 잔존)"))
|
| 815 |
residual += 15
|
| 816 |
-
|
| 817 |
-
# === 4. 부자연스러운 대체 표현 ===
|
| 818 |
oversubst = len(HUMANIZER_OVERSUBST.findall(text))
|
| 819 |
awkward = len(HUMANIZER_AWKWARD.findall(text))
|
| 820 |
passive = len(HUMANIZER_PASSIVE.findall(text))
|
|
@@ -824,11 +634,6 @@ def analyze_humanizer(text, sentences, words, morphemes):
|
|
| 824 |
signals.append(("어색한지시어", 10, f"해당/전술/상기 등 {awkward}개"))
|
| 825 |
if passive >= 3:
|
| 826 |
signals.append(("이중피동과다", 15, f"되어지/수행되 등 {passive}개"))
|
| 827 |
-
|
| 828 |
-
# === 5. 문장 유형 단조 + 어미 다양 = Humanizer 시그니처 ===
|
| 829 |
-
# AI 원문: 문장유형 단조 + 어미 단조
|
| 830 |
-
# 인간: 문장유형 다양 + 어미 다양
|
| 831 |
-
# Humanizer: 문장유형 단조(바꿀 수 없음) + 어미 다양(바꿈) → 부조화
|
| 832 |
endings = [s.rstrip('.!?')[-2:] for s in sentences if len(s) >= 4]
|
| 833 |
end_types = len(set(endings)) / len(endings) if endings else 0
|
| 834 |
has_question = any(s.strip().endswith('?') for s in sentences)
|
|
@@ -836,29 +641,17 @@ def analyze_humanizer(text, sentences, words, morphemes):
|
|
| 836 |
sent_type_variety = sum([has_question, has_exclaim])
|
| 837 |
if sent_type_variety == 0 and end_types > 0.85 and len(sentences) >= 6:
|
| 838 |
signals.append(("유형단조어미다양", 12, "서술문만 + 종결어미 과다 다양 = Humanizer 패턴"))
|
| 839 |
-
|
| 840 |
-
# === 6. 문장 시작 패턴 불일치 ===
|
| 841 |
-
# Humanizer는 문두를 다양하게 바꾸려 하나, 한국어에서는 부자연스러움 유발
|
| 842 |
starters = [s.strip()[:3] for s in sentences if len(s) >= 6]
|
| 843 |
starter_unique = len(set(starters)) / len(starters) if starters else 0
|
| 844 |
if starter_unique >= 0.98 and len(sentences) >= 7:
|
| 845 |
signals.append(("문두과다다양", 8, "모든 문장 시작이 다름 (자연스럽지 않은 다양성)"))
|
| 846 |
-
|
| 847 |
total = sum(s[1] for s in signals)
|
| 848 |
-
# 점수화
|
| 849 |
if total >= 45: score = 85
|
| 850 |
elif total >= 30: score = 68
|
| 851 |
elif total >= 20: score = 52
|
| 852 |
elif total >= 10: score = 35
|
| 853 |
else: score = 15
|
| 854 |
-
|
| 855 |
return {"score": score, "signals": signals, "total_evidence": total}
|
| 856 |
-
|
| 857 |
-
# ═══════════════════════════════════════════════
|
| 858 |
-
# ★★★ 킬러 기능 ③ — AI 모델 추정 (v5.1)
|
| 859 |
-
# ═══════════════════════════════════════════════
|
| 860 |
-
# 모델별 고유 특성으로 작성 모델 추정
|
| 861 |
-
|
| 862 |
MODEL_PROFILES = {
|
| 863 |
"GPT": {
|
| 864 |
"style": ["격식체 ~습니다", "또한/특히 접속사", "~에 대해", "~것으로 예상됩니다"],
|
|
@@ -889,27 +682,19 @@ MODEL_PROFILES = {
|
|
| 889 |
"connectors": ["한편", "또한", "이에"],
|
| 890 |
},
|
| 891 |
}
|
| 892 |
-
|
| 893 |
def estimate_model(text, sentences, morphemes, model_scores):
|
| 894 |
"""AI 모델 추정 — 복합 증거 기반"""
|
| 895 |
evidence = {m: {"score": 0, "reasons": []} for m in MODEL_PROFILES}
|
| 896 |
-
|
| 897 |
sl = text.lower()
|
| 898 |
-
|
| 899 |
for model, profile in MODEL_PROFILES.items():
|
| 900 |
-
# 1. FP 점수 반영 (기존 지문 분석)
|
| 901 |
fp_score = model_scores.get(model, 0)
|
| 902 |
evidence[model]["score"] += fp_score * 0.4
|
| 903 |
if fp_score >= 20:
|
| 904 |
evidence[model]["reasons"].append(f"지문 매칭 {fp_score}점")
|
| 905 |
-
|
| 906 |
-
# 2. 마커 매칭
|
| 907 |
marker_cnt = sum(1 for m in profile["markers"] if m in text)
|
| 908 |
if marker_cnt >= 2:
|
| 909 |
evidence[model]["score"] += marker_cnt * 8
|
| 910 |
evidence[model]["reasons"].append(f"특유 표현 {marker_cnt}개")
|
| 911 |
-
|
| 912 |
-
# 3. 종결어미 패턴
|
| 913 |
end_match = 0
|
| 914 |
for s in sentences:
|
| 915 |
for e in profile["endings"]:
|
|
@@ -920,34 +705,23 @@ def estimate_model(text, sentences, morphemes, model_scores):
|
|
| 920 |
if end_ratio > 0.7:
|
| 921 |
evidence[model]["score"] += 12
|
| 922 |
evidence[model]["reasons"].append(f"종결어미 {end_ratio:.0%} 일치")
|
| 923 |
-
|
| 924 |
-
# 4. 접속사 패턴
|
| 925 |
conn_match = sum(1 for s in sentences if any(s.strip().startswith(c) for c in profile["connectors"]))
|
| 926 |
if conn_match >= 2:
|
| 927 |
evidence[model]["score"] += conn_match * 4
|
| 928 |
evidence[model]["reasons"].append(f"접속사 패턴 {conn_match}회")
|
| 929 |
-
|
| 930 |
-
# Perplexity 특화: 수치 + 출처 인용
|
| 931 |
number_citations = len(re.findall(r'\d+[%만억조]|에 따르면|것으로 나타났|발표했', text))
|
| 932 |
if number_citations >= 3:
|
| 933 |
evidence["Perplexity"]["score"] += number_citations * 5
|
| 934 |
evidence["Perplexity"]["reasons"].append(f"수치/인용 {number_citations}회")
|
| 935 |
-
|
| 936 |
-
# Claude 특화: 양보-주장 구문
|
| 937 |
concession_cnt = len(AI_CONCESSION.findall(text))
|
| 938 |
if concession_cnt >= 1:
|
| 939 |
evidence["Claude"]["score"] += concession_cnt * 10
|
| 940 |
evidence["Claude"]["reasons"].append(f"양보-주장 구문 {concession_cnt}회")
|
| 941 |
-
|
| 942 |
-
# 정렬 및 판정
|
| 943 |
ranked = sorted(evidence.items(), key=lambda x: x[1]["score"], reverse=True)
|
| 944 |
top = ranked[0]
|
| 945 |
second = ranked[1] if len(ranked) > 1 else None
|
| 946 |
-
|
| 947 |
if top[1]["score"] < 10:
|
| 948 |
return {"model": "특정 불가", "confidence": "낮음", "detail": evidence, "ranked": ranked}
|
| 949 |
-
|
| 950 |
-
# 신뢰도 계산
|
| 951 |
gap = top[1]["score"] - (second[1]["score"] if second else 0)
|
| 952 |
if gap >= 20 and top[1]["score"] >= 30:
|
| 953 |
conf = "높음"
|
|
@@ -955,7 +729,6 @@ def estimate_model(text, sentences, morphemes, model_scores):
|
|
| 955 |
conf = "중간"
|
| 956 |
else:
|
| 957 |
conf = "낮음"
|
| 958 |
-
|
| 959 |
return {
|
| 960 |
"model": top[0],
|
| 961 |
"confidence": conf,
|
|
@@ -964,10 +737,6 @@ def estimate_model(text, sentences, morphemes, model_scores):
|
|
| 964 |
"detail": evidence,
|
| 965 |
"ranked": ranked
|
| 966 |
}
|
| 967 |
-
|
| 968 |
-
# ═══════════════════════════════════════════════
|
| 969 |
-
# 품질
|
| 970 |
-
# ═══════════════════════════════════════════════
|
| 971 |
def analyze_quality(text, sentences, words, morphemes):
|
| 972 |
qs = {}; sl = [len(s) for s in sentences]; tw = len(words)
|
| 973 |
ideal = sum(1 for l in sl if 15<=l<=70)/len(sentences) if sentences else 0
|
|
@@ -998,24 +767,16 @@ def analyze_quality(text, sentences, words, morphemes):
|
|
| 998 |
total = int(sum(qs[k]*wq[k] for k in wq))
|
| 999 |
grade = "S" if total>=85 else "A" if total>=72 else "B" if total>=58 else "C" if total>=42 else "D" if total>=28 else "F"
|
| 1000 |
return {"score":total,"grade":grade,"sub_scores":qs}
|
| 1001 |
-
|
| 1002 |
-
# ═══════════════════════════════════════════════
|
| 1003 |
-
# LLM 교차검증
|
| 1004 |
-
# ═══════════════════════════════════════════════
|
| 1005 |
LLM_JUDGES = [("openai/gpt-oss-120b","GPT-OSS 120B"),("qwen/qwen3-32b","Qwen3 32B"),("moonshotai/kimi-k2-instruct-0905","Kimi-K2")]
|
| 1006 |
-
|
| 1007 |
def _parse_ai_probability(raw_resp):
|
| 1008 |
"""LLM 응답에서 AI 확률(0~100)을 추출. 한국어/영어 다양한 형식 대응."""
|
| 1009 |
if not raw_resp: return -1
|
| 1010 |
-
# 1. <think> 태그 분리
|
| 1011 |
think_content = ''
|
| 1012 |
think_m = re.search(r'<think>(.*?)</think>', raw_resp, flags=re.S)
|
| 1013 |
if think_m: think_content = think_m.group(1)
|
| 1014 |
resp = re.sub(r'<think>.*?</think>', '', raw_resp, flags=re.S).strip()
|
| 1015 |
if not resp or len(resp) < 5:
|
| 1016 |
resp = raw_resp # think만 있으면 원본 포함
|
| 1017 |
-
|
| 1018 |
-
# 2. 특정 키워드 패턴 (높은 우선순위 — 첫 매칭)
|
| 1019 |
specific_patterns = [
|
| 1020 |
r'AI\s*확률\s*[::]\s*(?:약\s*)?(\d+)\s*%?',
|
| 1021 |
r'AI\s*[Pp]robability\s*[::]\s*(?:about|approximately?\s*)?(\d+)\s*%?',
|
|
@@ -1031,11 +792,8 @@ def _parse_ai_probability(raw_resp):
|
|
| 1031 |
if m:
|
| 1032 |
v = int(m.group(1))
|
| 1033 |
if 0 <= v <= 100: return v
|
| 1034 |
-
|
| 1035 |
-
# 3. 범용 패턴 — 마지막 5줄에서만 검색 (통계 수치 오탐 방지)
|
| 1036 |
lines = [l.strip() for l in resp.strip().split('\n') if l.strip()]
|
| 1037 |
for line in reversed(lines[-5:]):
|
| 1038 |
-
# 라인에 AI/확률/probability 키워드가 있으면 우선
|
| 1039 |
if re.search(r'AI|확률|[Pp]robab|신뢰|판[정단]', line):
|
| 1040 |
nums = re.findall(r'(\d+)\s*%', line)
|
| 1041 |
if nums:
|
|
@@ -1045,54 +803,39 @@ def _parse_ai_probability(raw_resp):
|
|
| 1045 |
if nums:
|
| 1046 |
v = int(nums[-1])
|
| 1047 |
if 0 <= v <= 100: return v
|
| 1048 |
-
|
| 1049 |
-
# 4. 전체 텍스트에서 마지막 XX% (단, AI/확률 근처만)
|
| 1050 |
all_pcts = list(re.finditer(r'(\d+)\s*(?:%|퍼센트|percent)', resp, re.I))
|
| 1051 |
for m in reversed(all_pcts):
|
| 1052 |
v = int(m.group(1))
|
| 1053 |
-
# 주변 50자 내에 AI/확률 키워드 있는지
|
| 1054 |
ctx_start = max(0, m.start()-50)
|
| 1055 |
ctx = resp[ctx_start:m.end()+20]
|
| 1056 |
if re.search(r'AI|확률|[Pp]robab|신뢰|판[정단]|가능성|likelihood', ctx, re.I):
|
| 1057 |
if 0 <= v <= 100: return v
|
| 1058 |
-
|
| 1059 |
-
# 5. 최후 수단: 전체에서 마지막 XX%
|
| 1060 |
if all_pcts:
|
| 1061 |
v = int(all_pcts[-1].group(1))
|
| 1062 |
if 5 <= v <= 99: return v # 100% 제외 (통계 수치 오탐 방지)
|
| 1063 |
-
|
| 1064 |
-
# 6. think 내부 폴백 (본문 파싱 실패 시)
|
| 1065 |
if think_content:
|
| 1066 |
for pat in specific_patterns:
|
| 1067 |
m = re.search(pat, think_content, re.I)
|
| 1068 |
if m:
|
| 1069 |
v = int(m.group(1))
|
| 1070 |
if 0 <= v <= 100: return v
|
| 1071 |
-
# think 내부 마지막 XX%
|
| 1072 |
think_pcts = re.findall(r'(\d+)\s*%', think_content)
|
| 1073 |
if think_pcts:
|
| 1074 |
v = int(think_pcts[-1])
|
| 1075 |
if 5 <= v <= 99: return v
|
| 1076 |
-
|
| 1077 |
return -1
|
| 1078 |
-
|
| 1079 |
def llm_cross_check(text):
|
| 1080 |
if not GROQ_KEY: return {"score":-1,"detail":{}}
|
| 1081 |
-
# 한국어+영어 병행 프롬프트 (GPT-OSS는 영어 모델이므로)
|
| 1082 |
prompt = f"""Analyze whether this text was written by AI.
|
| 1083 |
-
|
| 1084 |
[Instructions]
|
| 1085 |
1. Determine AI vs Human with 3 brief reasons
|
| 1086 |
2. IMPORTANT - Your LAST line MUST be exactly this format:
|
| 1087 |
AI확률: XX%
|
| 1088 |
(Replace XX with your estimated probability 0-100)
|
| 1089 |
-
|
| 1090 |
Example of correct last line:
|
| 1091 |
AI확률: 75%
|
| 1092 |
-
|
| 1093 |
[Text to analyze]
|
| 1094 |
{text[:2000]}"""
|
| 1095 |
-
|
| 1096 |
votes=[]; rpt={}
|
| 1097 |
for mid,mn in LLM_JUDGES:
|
| 1098 |
resp,err = call_groq(mid,prompt)
|
|
@@ -1101,52 +844,34 @@ AI확률: 75%
|
|
| 1101 |
if p >= 0:
|
| 1102 |
votes.append(p); rpt[mn]=f"{p}%"
|
| 1103 |
else:
|
| 1104 |
-
# 디버그: think 제거 후 응답 끝부분
|
| 1105 |
cleaned = re.sub(r'<think>.*?</think>', '', resp, flags=re.S).strip()
|
| 1106 |
tail = cleaned[-60:].replace('\n',' ') if len(cleaned) > 60 else cleaned.replace('\n',' ')
|
| 1107 |
rpt[mn]=f"파싱실패({tail[:40]})"
|
| 1108 |
else: rpt[mn]=f"ERR:{err[:30] if err else '?'}"
|
| 1109 |
if votes: return {"score":int(sum(votes)/len(votes)),"detail":rpt}
|
| 1110 |
return {"score":-1,"detail":rpt}
|
| 1111 |
-
|
| 1112 |
-
# ═══════════════════════════════════════════════
|
| 1113 |
-
# 종합 판정 (일관된 기준)
|
| 1114 |
-
# ═══════════════════════════════════════════════
|
| 1115 |
def compute_verdict(scores, llm_score=-1, sent_avg=-1, ppx_score=-1, hum_score=-1):
|
| 1116 |
w={"통계":.06,"문체":.25,"반복성":.10,"구조":.12,"지문":.30}
|
| 1117 |
ws=sum(scores[k]*w[k] for k in w)
|
| 1118 |
-
|
| 1119 |
-
# ★ Perplexity 축 통합 (17%)
|
| 1120 |
if ppx_score >= 0: ws += ppx_score * 0.17
|
| 1121 |
-
|
| 1122 |
-
# ★ 교차 신호 부스트
|
| 1123 |
style=scores["문체"]; fp=scores["지문"]; rep=scores["반복성"]; struct=scores["구조"]
|
| 1124 |
if style>=35 and fp>=35: ws+=8
|
| 1125 |
elif style>=30 and fp>=25: ws+=4
|
| 1126 |
if style>=30 and rep>=25 and fp>=20: ws+=4
|
| 1127 |
if fp>=45: ws+=3
|
| 1128 |
if struct>=50 and style>=30: ws+=3
|
| 1129 |
-
# Perplexity + 지문 동시 부스트
|
| 1130 |
if ppx_score>=55 and fp>=35: ws+=5
|
| 1131 |
if ppx_score>=65 and style>=35: ws+=3
|
| 1132 |
-
|
| 1133 |
-
# ★ Humanizer 탐지 시 특별 부스트
|
| 1134 |
if hum_score>=50:
|
| 1135 |
ws=max(ws, 45) # Humanizer 확인 → 최소 AI 의심 중간
|
| 1136 |
ws += (hum_score-50)*0.15
|
| 1137 |
-
|
| 1138 |
-
# ★ 문장 수준 부스트
|
| 1139 |
if sent_avg>=0 and sent_avg>ws: ws=ws*0.80+sent_avg*0.20
|
| 1140 |
-
|
| 1141 |
hi=sum(1 for v in scores.values() if v>=50)
|
| 1142 |
if hi>=4: ws+=8
|
| 1143 |
elif hi>=3: ws+=5
|
| 1144 |
elif hi>=2: ws+=2
|
| 1145 |
-
|
| 1146 |
-
# ★ 인간 격식문 할인
|
| 1147 |
if style<40 and fp<=20 and rep<22 and struct<35 and (ppx_score<0 or ppx_score<40):
|
| 1148 |
ws-=5
|
| 1149 |
-
|
| 1150 |
lo=sum(1 for v in scores.values() if v<20)
|
| 1151 |
if lo>=3: ws-=8
|
| 1152 |
elif lo>=2: ws-=3
|
|
@@ -1157,7 +882,6 @@ def compute_verdict(scores, llm_score=-1, sent_avg=-1, ppx_score=-1, hum_score=-
|
|
| 1157 |
if fs>=45: return fs,"AI 의심 중간","ai_low"
|
| 1158 |
if fs>=30: return fs,"판단 유보","uncertain"
|
| 1159 |
return fs,"인간 작성 추정","human"
|
| 1160 |
-
|
| 1161 |
def quick_score(text):
|
| 1162 |
sents=split_sentences(text); words=split_words(text); morphs=get_morphemes(text)
|
| 1163 |
sc={"통계":analyze_statistics(text,sents,words)["score"],"문체":analyze_korean_style(text,sents,morphs)["score"],
|
|
@@ -1169,10 +893,6 @@ def quick_score(text):
|
|
| 1169 |
hum=analyze_humanizer(text,sents,words,morphs)
|
| 1170 |
fs,v,lv=compute_verdict(sc, sent_avg=sent_avg, ppx_score=ppx["score"], hum_score=hum["score"])
|
| 1171 |
return fs,v,lv,sc,ppx,hum
|
| 1172 |
-
|
| 1173 |
-
# ═══════════════════════════════════════════════
|
| 1174 |
-
# ═══════════════════════════════════════════════
|
| 1175 |
-
# ★ 표절 검사 (Brave Search 병렬 + KCI/RISS/ARXIV + Gemini)
|
| 1176 |
def brave_search(query, count=5):
|
| 1177 |
"""Brave Search API — 단일 쿼리"""
|
| 1178 |
if not BRAVE_KEY: return []
|
|
@@ -1188,7 +908,6 @@ def brave_search(query, count=5):
|
|
| 1188 |
return results
|
| 1189 |
except: pass
|
| 1190 |
return []
|
| 1191 |
-
|
| 1192 |
def search_kci(query):
|
| 1193 |
"""KCI(한국학술지인용색인) 검색"""
|
| 1194 |
try:
|
|
@@ -1201,7 +920,6 @@ def search_kci(query):
|
|
| 1201 |
return results[:3]
|
| 1202 |
except: pass
|
| 1203 |
return []
|
| 1204 |
-
|
| 1205 |
def search_riss(query):
|
| 1206 |
"""RISS(학술연구정보서비스) — 간접 검색"""
|
| 1207 |
results = []
|
|
@@ -1215,7 +933,6 @@ def search_riss(query):
|
|
| 1215 |
results.append({"title": title, "url": "https://www.riss.kr" + m.group(1), "snippet": "", "source": "RISS"})
|
| 1216 |
except: pass
|
| 1217 |
return results[:3]
|
| 1218 |
-
|
| 1219 |
def search_arxiv(query):
|
| 1220 |
"""arXiv API 검색"""
|
| 1221 |
results = []
|
|
@@ -1231,7 +948,6 @@ def search_arxiv(query):
|
|
| 1231 |
except Exception as e:
|
| 1232 |
pass
|
| 1233 |
return results[:3]
|
| 1234 |
-
|
| 1235 |
def gemini_plagiarism_check(text_chunk):
|
| 1236 |
"""Gemini + Google Search Grounding으로 표절 검사"""
|
| 1237 |
if not HAS_GENAI or not GEMINI_KEY: return None
|
|
@@ -1241,7 +957,6 @@ def gemini_plagiarism_check(text_chunk):
|
|
| 1241 |
prompt = f"""다음 텍스트가 인터넷에 존재하는지 Google Search로 확인하세요.
|
| 1242 |
유사한 문장이 발견되면 출처 URL과 유사도(%)를 보고하세요.
|
| 1243 |
마지막 줄에 "유사도: XX%" 형식으로 작성.
|
| 1244 |
-
|
| 1245 |
[텍스트]
|
| 1246 |
{text_chunk[:1000]}"""
|
| 1247 |
resp = client.models.generate_content(
|
|
@@ -1262,7 +977,6 @@ def gemini_plagiarism_check(text_chunk):
|
|
| 1262 |
return {"pct": pct, "response": text_resp, "sources": sources}
|
| 1263 |
except Exception as e:
|
| 1264 |
return {"pct": 0, "response": str(e)[:100], "sources": []}
|
| 1265 |
-
|
| 1266 |
def parallel_brave_search(queries, max_workers=10):
|
| 1267 |
"""Brave Search 병렬 실행 (최대 20개)"""
|
| 1268 |
all_results = {}
|
|
@@ -1275,7 +989,6 @@ def parallel_brave_search(queries, max_workers=10):
|
|
| 1275 |
all_results[q] = results
|
| 1276 |
except: all_results[q] = []
|
| 1277 |
return all_results
|
| 1278 |
-
|
| 1279 |
def duckduckgo_search(query, max_results=5):
|
| 1280 |
"""DuckDuckGo HTML 스크래핑 — API 키 불필요 폴백"""
|
| 1281 |
results = []
|
|
@@ -1290,7 +1003,6 @@ def duckduckgo_search(query, max_results=5):
|
|
| 1290 |
href = m.group(1)
|
| 1291 |
title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
|
| 1292 |
snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
|
| 1293 |
-
# DuckDuckGo redirect URL 파싱
|
| 1294 |
real_url = href
|
| 1295 |
if 'uddg=' in href:
|
| 1296 |
um = re.search(r'uddg=([^&]+)', href)
|
|
@@ -1300,41 +1012,31 @@ def duckduckgo_search(query, max_results=5):
|
|
| 1300 |
if len(results) >= max_results: break
|
| 1301 |
except: pass
|
| 1302 |
return results
|
| 1303 |
-
|
| 1304 |
def self_crawl_search(query, max_results=3):
|
| 1305 |
"""httpx 기반 자체 크롤링 (DuckDuckGo + 학술 사이트)"""
|
| 1306 |
all_results = []
|
| 1307 |
-
# DuckDuckGo
|
| 1308 |
all_results.extend(duckduckgo_search(query, max_results))
|
| 1309 |
-
# 학술 키워드 추가 검색
|
| 1310 |
if '논문' not in query and 'paper' not in query.lower():
|
| 1311 |
all_results.extend(duckduckgo_search(f"{query} 논문 학술", 2))
|
| 1312 |
return all_results
|
| 1313 |
-
|
| 1314 |
def run_plagiarism(text, progress=gr.Progress()):
|
| 1315 |
if not text or len(text.strip())<50:
|
| 1316 |
return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""
|
| 1317 |
text = text.strip()
|
| 1318 |
sents = split_sentences(text)
|
| 1319 |
now = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 1320 |
-
|
| 1321 |
has_brave = bool(BRAVE_KEY)
|
| 1322 |
has_gemini = bool(HAS_GENAI and GEMINI_KEY)
|
| 1323 |
-
|
| 1324 |
progress(0.05, "문장 분리...")
|
| 1325 |
-
# 문장을 3~5문장 단위로 블록화
|
| 1326 |
blocks = []
|
| 1327 |
for i in range(0, len(sents), 4):
|
| 1328 |
block = ' '.join(sents[i:i+4])
|
| 1329 |
if len(block) > 20:
|
| 1330 |
blocks.append({"text": block, "sent_indices": list(range(i, min(i+4, len(sents))))})
|
| 1331 |
-
|
| 1332 |
all_sources = []
|
| 1333 |
sent_matches = {i: [] for i in range(len(sents))} # 문장별 매칭 정보
|
| 1334 |
block_results = []
|
| 1335 |
log_lines = []
|
| 1336 |
-
|
| 1337 |
-
# Phase 1: 웹 검색 (Brave Search 병렬 or 자체 크롤링)
|
| 1338 |
if has_brave:
|
| 1339 |
progress(0.15, f"Brave Search 병렬 검색 ({len(blocks)}블록)...")
|
| 1340 |
queries = []
|
|
@@ -1351,7 +1053,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1351 |
sent_matches[si].append({"source": r["title"], "url": r["url"], "type": "Brave"})
|
| 1352 |
log_lines.append(f"Brave Search: {len(queries)}쿼리 → {sum(len(v) for v in brave_results.values())}건")
|
| 1353 |
else:
|
| 1354 |
-
# 자체 크롤링 폴백 (DuckDuckGo + 병렬)
|
| 1355 |
progress(0.15, f"자체 웹 검색 ({len(blocks)}블록)...")
|
| 1356 |
crawl_queries = []
|
| 1357 |
for b in blocks[:10]: # 최대 10블록
|
|
@@ -1369,15 +1070,11 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1369 |
sent_matches[si].append({"source": r["title"], "url": r["url"], "type": r.get("source","Web")})
|
| 1370 |
except: pass
|
| 1371 |
log_lines.append(f"자체 웹검색: {len(crawl_queries)}쿼리 (DuckDuckGo)")
|
| 1372 |
-
|
| 1373 |
-
# Phase 2: 학술 DB (KCI, RISS, arXiv) — 키워드 추출 후 검색
|
| 1374 |
progress(0.40, "학술 DB 검색 (KCI/RISS/arXiv)...")
|
| 1375 |
-
# 핵심 키워드 추출
|
| 1376 |
words = split_words(text)
|
| 1377 |
wf = Counter(words)
|
| 1378 |
keywords = [w for w, c in wf.most_common(20) if len(w) >= 2 and c >= 2][:5]
|
| 1379 |
kw_query = ' '.join(keywords[:3])
|
| 1380 |
-
|
| 1381 |
academic_results = []
|
| 1382 |
with ThreadPoolExecutor(max_workers=3) as executor:
|
| 1383 |
futures = [
|
|
@@ -1392,8 +1089,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1392 |
all_sources.extend(results)
|
| 1393 |
except: pass
|
| 1394 |
log_lines.append(f"학술DB: KCI/RISS/arXiv → {len(academic_results)}건")
|
| 1395 |
-
|
| 1396 |
-
# Phase 3: Gemini Google Search Grounding
|
| 1397 |
gemini_results = []
|
| 1398 |
if has_gemini:
|
| 1399 |
progress(0.60, "Gemini + Google Search...")
|
|
@@ -1406,22 +1101,15 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1406 |
for si in b["sent_indices"]:
|
| 1407 |
sent_matches[si].append({"source": src.get("title",""), "url": src.get("url",""), "type": "Google"})
|
| 1408 |
log_lines.append(f"Gemini: {len(blocks[:5])}블록 → {sum(len(r.get('sources',[])) for r in gemini_results)}출처")
|
| 1409 |
-
|
| 1410 |
progress(0.80, "보고서 생성...")
|
| 1411 |
-
|
| 1412 |
-
# 유사도 계산
|
| 1413 |
matched_sents = sum(1 for si, matches in sent_matches.items() if matches)
|
| 1414 |
total_sents = len(sents)
|
| 1415 |
plag_pct = int(matched_sents / total_sents * 100) if total_sents > 0 else 0
|
| 1416 |
-
|
| 1417 |
-
# Gemini 유사도도 반영
|
| 1418 |
if gemini_results:
|
| 1419 |
gemini_pcts = [r["pct"] for r in gemini_results if r["pct"] > 0]
|
| 1420 |
if gemini_pcts:
|
| 1421 |
gemini_avg = sum(gemini_pcts) / len(gemini_pcts)
|
| 1422 |
plag_pct = int(plag_pct * 0.5 + gemini_avg * 0.5)
|
| 1423 |
-
|
| 1424 |
-
# 출처 중복 제거
|
| 1425 |
seen_urls = set()
|
| 1426 |
unique_sources = []
|
| 1427 |
for s in all_sources:
|
|
@@ -1429,16 +1117,11 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1429 |
if url and url not in seen_urls:
|
| 1430 |
seen_urls.add(url)
|
| 1431 |
unique_sources.append(s)
|
| 1432 |
-
|
| 1433 |
-
# 등급
|
| 1434 |
if plag_pct >= 50: grade, grade_color, grade_bg = "표절 의심", "#FF4444", "#FFE0E0"
|
| 1435 |
elif plag_pct >= 30: grade, grade_color, grade_bg = "주의 필요", "#FF8800", "#FFF0DD"
|
| 1436 |
elif plag_pct >= 15: grade, grade_color, grade_bg = "유사 표현 일부", "#DDAA00", "#FFFBE0"
|
| 1437 |
elif plag_pct >= 5: grade, grade_color, grade_bg = "양호", "#4ECDC4", "#E0FFF8"
|
| 1438 |
else: grade, grade_color, grade_bg = "우수 (원본성 높음)", "#22AA44", "#E0FFE8"
|
| 1439 |
-
|
| 1440 |
-
|
| 1441 |
-
# ═══ CopyKiller 정밀 재현 보고서 HTML ═══
|
| 1442 |
sent_analysis = []
|
| 1443 |
for i, s in enumerate(sents):
|
| 1444 |
matches = sent_matches.get(i, [])
|
|
@@ -1448,8 +1131,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1448 |
else:
|
| 1449 |
sent_analysis.append({"idx":i, "text":s, "matched":False})
|
| 1450 |
sim_sents = [s for s in sent_analysis if s["matched"]]
|
| 1451 |
-
|
| 1452 |
-
# 출처 그룹핑
|
| 1453 |
src_groups = {}
|
| 1454 |
for src in unique_sources:
|
| 1455 |
key = src.get("url","")[:80]
|
|
@@ -1457,29 +1138,23 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1457 |
src_groups[key] = {"title":src.get("title",""), "url":src.get("url",""), "source":src.get("source",""), "count":0}
|
| 1458 |
src_groups[key]["count"] += 1
|
| 1459 |
src_list = sorted(src_groups.values(), key=lambda x: -x["count"])
|
| 1460 |
-
|
| 1461 |
methods_used = []
|
| 1462 |
if has_brave: methods_used.append("Brave Search(병렬)")
|
| 1463 |
elif all_sources: methods_used.append("DuckDuckGo(자체크롤링)")
|
| 1464 |
methods_used.append("KCI · RISS · arXiv")
|
| 1465 |
if has_gemini: methods_used.append("Gemini+Google Search")
|
| 1466 |
method_str = " + ".join(methods_used)
|
| 1467 |
-
|
| 1468 |
gc = grade_color
|
| 1469 |
word_count = len(split_words(text))
|
| 1470 |
char_count = len(text)
|
| 1471 |
doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
|
| 1472 |
similarity_pct = plag_pct
|
| 1473 |
citation_pct = 0
|
| 1474 |
-
|
| 1475 |
-
# 문장 카테고리 분류 (CopyKiller 스타일)
|
| 1476 |
cat_suspect = len(sim_sents) # 의심
|
| 1477 |
cat_cited = 0 # 인용 (형식적 인용 감지)
|
| 1478 |
cat_normal = total_sents - cat_suspect - cat_cited # 일반
|
| 1479 |
cat_suspect_pct = int(cat_suspect / max(1, total_sents) * 100)
|
| 1480 |
cat_normal_pct = 100 - cat_suspect_pct
|
| 1481 |
-
|
| 1482 |
-
# 출처 유형 아이콘
|
| 1483 |
def src_icon(s):
|
| 1484 |
src = s.get("source","").lower()
|
| 1485 |
if "kci" in src: return "📚", "KCI"
|
|
@@ -1488,8 +1163,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1488 |
if "google" in src: return "🔍", "Google"
|
| 1489 |
if "brave" in src: return "🌐", "Brave"
|
| 1490 |
return "🌐", "Web"
|
| 1491 |
-
|
| 1492 |
-
# 출처 테이블 행
|
| 1493 |
src_rows = ""
|
| 1494 |
for i, sg in enumerate(src_list[:15]):
|
| 1495 |
pct = min(100, int(sg["count"] / max(1, total_sents) * 100 * 3))
|
|
@@ -1503,8 +1176,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1503 |
<td style="padding:6px 8px;font-size:12px;text-align:center;border:1px solid #D5D5D5;font-weight:800;color:#D63031;">{pct}%</td>
|
| 1504 |
<td style="padding:6px 10px;border:1px solid #D5D5D5;"><div style="background:#EDEDED;height:14px;border-radius:2px;overflow:hidden;"><div style="background:linear-gradient(90deg,#D63031,#FF7675);height:100%;width:{max(3,pct)}%;border-radius:2px;"></div></div></td>
|
| 1505 |
</tr>"""
|
| 1506 |
-
|
| 1507 |
-
# 의심 문장 대비 행
|
| 1508 |
suspect_rows = ""
|
| 1509 |
for i, sa in enumerate(sim_sents[:15]):
|
| 1510 |
suspect_rows += f"""<tr>
|
|
@@ -1513,8 +1184,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1513 |
<td style="padding:8px;font-size:10px;line-height:1.6;border:1px solid #D5D5D5;vertical-align:top;color:#555;"><span style="background:#FFE0E0;border-bottom:2px solid #E74C3C;padding:1px 3px;">{sa["text"][:70]}...</span></td>
|
| 1514 |
<td style="padding:8px;font-size:10px;border:1px solid #D5D5D5;vertical-align:top;"><a href="{sa.get('url','#')}" target="_blank" style="color:#2E86C1;text-decoration:none;font-weight:600;">{sa["source"][:28]}</a><br><span style="font-size:8px;color:#AAA;">{sa.get('type','')}</span></td>
|
| 1515 |
</tr>"""
|
| 1516 |
-
|
| 1517 |
-
# 전체 텍스트 하이라이트 (CopyKiller 스타일 - 문장번호 + 색상)
|
| 1518 |
full_hl = ""
|
| 1519 |
for sa in sent_analysis:
|
| 1520 |
sidx = sa["idx"] + 1
|
|
@@ -1522,21 +1191,15 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1522 |
full_hl += f'<span style="background:#FFD6D6;border-bottom:2px solid #E74C3C;padding:1px 2px;cursor:pointer;" title="[의심 #{sidx}] 출처: {sa.get("source","")}">{sa["text"]}</span> '
|
| 1523 |
else:
|
| 1524 |
full_hl += f'<span style="color:#333;">{sa["text"]}</span> '
|
| 1525 |
-
|
| 1526 |
-
# 카테고리 바 너비
|
| 1527 |
bar_suspect_w = max(2, cat_suspect_pct) if cat_suspect > 0 else 0
|
| 1528 |
bar_normal_w = 100 - bar_suspect_w
|
| 1529 |
-
|
| 1530 |
-
# CSS 상수
|
| 1531 |
HDR_BG = '#3B7DD8'
|
| 1532 |
HDR_BG2 = '#4A8DE0'
|
| 1533 |
TH = 'padding:8px 10px;font-size:10px;font-weight:700;color:#fff;background:{};text-align:center;border:1px solid {};'.format(HDR_BG, HDR_BG)
|
| 1534 |
TL = 'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
|
| 1535 |
TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
|
| 1536 |
SEC = 'font-size:13px;font-weight:800;color:#1A3C6E;margin:0 0 10px 0;padding:8px 12px;background:#EDF2FA;border-left:4px solid {};border-bottom:1px solid #D5D5D5;'.format(HDR_BG)
|
| 1537 |
-
|
| 1538 |
html = f"""<div style="font-family:'Noto Sans KR','Malgun Gothic','Apple SD Gothic Neo',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid #3B7DD8;box-shadow:0 2px 12px rgba(0,0,0,0.08);">
|
| 1539 |
-
|
| 1540 |
<!-- ═══════ 헤더 (CopyKiller 스타일) ═══════ -->
|
| 1541 |
<div style="background:linear-gradient(135deg,{HDR_BG},{HDR_BG2});padding:18px 24px;color:#fff;">
|
| 1542 |
<table style="width:100%;"><tr>
|
|
@@ -1550,7 +1213,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1550 |
</td>
|
| 1551 |
</tr></table>
|
| 1552 |
</div>
|
| 1553 |
-
|
| 1554 |
<!-- ═══════ 제출 정보 테이블 ═══════ -->
|
| 1555 |
<div style="padding:16px 24px 0;">
|
| 1556 |
<div style="{SEC}">📋 검사 정보</div>
|
|
@@ -1575,7 +1237,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1575 |
</tr>
|
| 1576 |
</table>
|
| 1577 |
</div>
|
| 1578 |
-
|
| 1579 |
<!-- ═══════ 검사 결과 (도넛 + 바) ═══════ -->
|
| 1580 |
<div style="padding:18px 24px 0;">
|
| 1581 |
<div style="{SEC}">📊 검사 결과</div>
|
|
@@ -1597,7 +1258,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1597 |
<span style="font-size:11px;font-weight:800;color:{gc};">{grade}</span>
|
| 1598 |
</div>
|
| 1599 |
</td>
|
| 1600 |
-
|
| 1601 |
<!-- 상세 결과 -->
|
| 1602 |
<td style="padding:0;border:1px solid #D5D5D5;vertical-align:top;">
|
| 1603 |
<!-- 카테고리 바 (CopyKiller 핵심) -->
|
|
@@ -1613,7 +1273,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1613 |
<span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E8ECF0;border:1px solid #CCC;border-radius:2px;"></span> 일반 <b>{cat_normal}</b>건</span>
|
| 1614 |
</div>
|
| 1615 |
</div>
|
| 1616 |
-
|
| 1617 |
<!-- 비율 바 3줄 -->
|
| 1618 |
<div style="padding:12px 18px;">
|
| 1619 |
<div style="margin-bottom:10px;">
|
|
@@ -1629,7 +1288,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1629 |
<div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#3498DB,#85C1E9);height:100%;width:{max(1,citation_pct)}%;transition:width 0.5s;"></div></div>
|
| 1630 |
</div>
|
| 1631 |
</div>
|
| 1632 |
-
|
| 1633 |
<!-- 요약 수치 -->
|
| 1634 |
<div style="padding:8px 18px;background:#F8F9FB;border-top:1px solid #E8E8E8;">
|
| 1635 |
<table style="width:100%;border-collapse:collapse;">
|
|
@@ -1647,7 +1305,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1647 |
</tr>
|
| 1648 |
</table>
|
| 1649 |
</div>
|
| 1650 |
-
|
| 1651 |
<!-- ═══════ 전체 텍스트 분석 ═══════ -->
|
| 1652 |
<div style="padding:18px 24px 0;">
|
| 1653 |
<div style="{SEC}">📝 전체 텍스트 분석</div>
|
|
@@ -1660,7 +1317,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1660 |
</div>
|
| 1661 |
<div style="padding:14px;background:#FAFBFC;border:1px solid #D5D5D5;line-height:2.1;font-size:13px;max-height:300px;overflow-y:auto;">{full_hl}</div>
|
| 1662 |
</div>
|
| 1663 |
-
|
| 1664 |
<!-- ═══════ 표절 의심 출처 ═══════ -->
|
| 1665 |
<div style="padding:18px 24px 0;">
|
| 1666 |
<div style="{SEC}">🔗 표절 의심 출처 ({len(src_list)}건)</div>
|
|
@@ -1675,7 +1331,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1675 |
{src_rows if src_rows else '<tr><td colspan="5" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">발견된 유사 출처가 없습니다.</td></tr>'}
|
| 1676 |
</table>
|
| 1677 |
</div>
|
| 1678 |
-
|
| 1679 |
<!-- ═══════ 의심 문장 비교 ═══════ -->
|
| 1680 |
<div style="padding:18px 24px 0;">
|
| 1681 |
<div style="{SEC}">⚠️ 의심 문장 비교 ({len(sim_sents)}건)</div>
|
|
@@ -1689,7 +1344,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1689 |
{suspect_rows if suspect_rows else '<tr><td colspan="4" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">유사 의심 문장이 발견되지 않았습니다.</td></tr>'}
|
| 1690 |
</table>
|
| 1691 |
</div>
|
| 1692 |
-
|
| 1693 |
<!-- ═══════ 검사 안내 ═══════ -->
|
| 1694 |
<div style="margin:18px 24px;padding:12px 14px;background:#F5F8FC;border:1px solid #D0DAEA;border-radius:4px;font-size:9px;color:#555;line-height:1.8;">
|
| 1695 |
<b style="color:#333;">📌 검사 안내</b><br>
|
|
@@ -1698,7 +1352,6 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1698 |
· 유사도는 문장 단위 매칭 기반이며, 최종 판정은 교수자/검토자의 확인이 필요합니다.<br>
|
| 1699 |
· 인용 표기(따옴표, 각주 등)가 포함된 문장은 인용으로 분류될 수 있습니다.
|
| 1700 |
</div>
|
| 1701 |
-
|
| 1702 |
<!-- ═══════ 푸터 (CopyKiller 스타일) ═══════ -->
|
| 1703 |
<div style="padding:10px 24px;background:#F0F3F8;border-top:2px solid {HDR_BG};display:flex;justify-content:space-between;align-items:center;">
|
| 1704 |
<div>
|
|
@@ -1711,13 +1364,8 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 1711 |
</div>
|
| 1712 |
</div>
|
| 1713 |
</div>"""
|
| 1714 |
-
|
| 1715 |
log = '\n'.join(log_lines) + f"\n\n종합: {plag_pct}% {grade} | 출처 {len(unique_sources)}건 | 유사문장 {matched_sents}/{total_sents}"
|
| 1716 |
return html, log
|
| 1717 |
-
|
| 1718 |
-
# ═══════════════════════════════════════════════
|
| 1719 |
-
# 탭1: 분석 (명확한 출력)
|
| 1720 |
-
# ═══════════════════════════════════════════════
|
| 1721 |
def run_detection(text, progress=gr.Progress()):
|
| 1722 |
if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""
|
| 1723 |
text=text.strip()
|
|
@@ -1734,17 +1382,12 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1734 |
sc={"통계":s1["score"],"문체":s2["score"],"반복성":s3["score"],"구조":s4["score"],"지문":s5["score"]}
|
| 1735 |
sent_scores=[score_sentence(s)[0] for s in sents]
|
| 1736 |
sent_avg=sum(sent_scores)/len(sent_scores) if sent_scores else -1
|
| 1737 |
-
|
| 1738 |
-
# ★ 모델 추정
|
| 1739 |
ms_raw=s5.get("model_scores",{})
|
| 1740 |
model_est=estimate_model(text,sents,morphs,ms_raw)
|
| 1741 |
-
|
| 1742 |
fs,verdict,level=compute_verdict(sc,lr["score"],sent_avg=sent_avg,ppx_score=ppx["score"],hum_score=hum["score"])
|
| 1743 |
progress(0.95)
|
| 1744 |
cm={"ai_high":("#FF4444","#FFE0E0","높음"),"ai_medium":("#FF8800","#FFF0DD","중간~높음"),"ai_low":("#DDAA00","#FFFBE0","중간"),"uncertain":("#888","#F0F0F0","낮음"),"human":("#22AA44","#E0FFE8","매우 낮음")}
|
| 1745 |
fg,bg,conf=cm.get(level,("#888","#F0F0F0","?"))
|
| 1746 |
-
|
| 1747 |
-
# 모델 추정 표시
|
| 1748 |
est_model=model_est.get("model","특정 불가")
|
| 1749 |
est_conf=model_est.get("confidence","낮음")
|
| 1750 |
est_reasons=model_est.get("reasons",[])
|
|
@@ -1754,41 +1397,30 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1754 |
mt=f"{est_model} (참고)"
|
| 1755 |
else:
|
| 1756 |
mt="특정 불가"
|
| 1757 |
-
|
| 1758 |
ai_sents=sum(1 for s in sent_scores if s>=40)
|
| 1759 |
human_sents=sum(1 for s in sent_scores if s<20)
|
| 1760 |
-
|
| 1761 |
def gb(l,s,w="",desc=""):
|
| 1762 |
c="#FF4444" if s>=70 else "#FF8800" if s>=50 else "#DDAA00" if s>=35 else "#22AA44"
|
| 1763 |
wt=f"<span style='color:#999;font-size:9px;'> ×{w}</span>" if w else ""
|
| 1764 |
dt=f"<div style='font-size:9px;color:#888;margin-top:1px;'>{desc}</div>" if desc else ""
|
| 1765 |
return f"<div style='margin:4px 0;'><div style='display:flex;justify-content:space-between;'><span style='font-size:11px;font-weight:600;'>{l}{wt}</span><span style='font-size:11px;font-weight:700;color:{c};'>{s}</span></div><div style='background:#E8E8E8;border-radius:4px;height:7px;'><div style='background:{c};height:100%;width:{s}%;border-radius:4px;'></div></div>{dt}</div>"
|
| 1766 |
-
|
| 1767 |
-
# 모델 지문 바
|
| 1768 |
mb=""
|
| 1769 |
for mn in ["GPT","Claude","Gemini","Perplexity"]:
|
| 1770 |
s=ms_raw.get(mn,0); mc="#FF4444" if s>=40 else "#FF8800" if s>=20 else "#CCC"
|
| 1771 |
-
# 추정 모델 표시
|
| 1772 |
tag=""
|
| 1773 |
if mn==est_model and est_conf!="낮음":
|
| 1774 |
tag=f" <span style='background:#FF4444;color:white;font-size:7px;padding:0 3px;border-radius:3px;'>추정</span>"
|
| 1775 |
mb+=f"<div style='display:flex;align-items:center;gap:4px;margin:2px 0;'><span style='width:66px;font-size:10px;font-weight:600;'>{mn}{tag}</span><div style='flex:1;background:#E8E8E8;border-radius:3px;height:5px;'><div style='background:{mc};height:100%;width:{s}%;'></div></div><span style='font-size:9px;width:18px;text-align:right;color:{mc};'>{s}</span></div>"
|
| 1776 |
-
|
| 1777 |
-
# LLM 섹션
|
| 1778 |
ls=""
|
| 1779 |
if lr["score"]>=0:
|
| 1780 |
lsc=lr["score"]
|
| 1781 |
lr_rows="".join(f"<div style='font-size:9px;color:#555;'>{mn}: {lr['detail'].get(mn,'—')}</div>" for _,mn in LLM_JUDGES)
|
| 1782 |
ls=f"<div style='margin-top:8px;padding:8px;background:#F8F8FF;border-radius:6px;border:1px solid #E0E0FF;'><div style='font-size:10px;font-weight:700;margin-bottom:3px;'>🤖 LLM 교차검증 (평균 {lsc}%)</div>{lr_rows}</div>"
|
| 1783 |
else: ls="<div style='margin-top:6px;padding:4px 8px;background:#F5F5F5;border-radius:4px;color:#999;font-size:9px;'>🤖 GROQ_API_KEY 미설정</div>"
|
| 1784 |
-
|
| 1785 |
-
# 품질
|
| 1786 |
qs=qr["sub_scores"]; gc={"S":"#FF6B6B","A":"#4ECDC4","B":"#45B7D1","C":"#DDAA00","D":"#FF8800","F":"#FF4444"}.get(qr["grade"],"#888")
|
| 1787 |
def qgb(l, s):
    """Render one labeled mini progress bar (label *l*, sub-score *s* 0-100) as HTML."""
    # Bucket the score into a bar color: green / teal / amber / orange.
    if s >= 70:
        c = "#22AA44"
    elif s >= 55:
        c = "#4ECDC4"
    elif s >= 40:
        c = "#DDAA00"
    else:
        c = "#FF8800"
    label_span = f"<span style='width:50px;font-size:10px;'>{l}</span>"
    bar_div = f"<div style='flex:1;background:#E8E8E8;border-radius:3px;height:5px;'><div style='background:{c};height:100%;width:{s}%;'></div></div>"
    score_span = f"<span style='font-size:9px;color:{c};width:18px;text-align:right;'>{s}</span>"
    return f"<div style='margin:2px 0;display:flex;align-items:center;gap:4px;'>{label_span}{bar_div}{score_span}</div>"
|
| 1790 |
-
|
| 1791 |
-
# ★ 판정 이유 (3대 킬러 통합)
|
| 1792 |
reasons=[]
|
| 1793 |
if sc["문체"]>=70: reasons.append("격식체 종결어미가 대부분, AI형 접속사·상투표현 다수 감지")
|
| 1794 |
elif sc["문체"]>=50: reasons.append("격식체와 AI형 표현이 혼재")
|
|
@@ -1805,8 +1437,6 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1805 |
reasons.append(f"🔍 추정 모델: <b>{est_model}</b> ({est_why})")
|
| 1806 |
if not reasons: reasons.append("인간적 표현이 우세하며 AI 패턴이 약함")
|
| 1807 |
reason_html='<br>'.join(f"• {r}" for r in reasons)
|
| 1808 |
-
|
| 1809 |
-
# ★ Perplexity 카드
|
| 1810 |
ppx_c="#FF4444" if ppx["score"]>=65 else "#FF8800" if ppx["score"]>=50 else "#DDAA00" if ppx["score"]>=35 else "#22AA44"
|
| 1811 |
ppx_html=f"""<div style='margin-top:8px;padding:8px;background:linear-gradient(135deg,#FFF8F0,#FFF0FF);border-radius:6px;border:1px solid #E8D0FF;'>
|
| 1812 |
<div style='font-size:10px;font-weight:700;margin-bottom:4px;'>🧠 Perplexity 분석 <span style='color:{ppx_c};font-size:12px;font-weight:900;'>{ppx["score"]}점</span></div>
|
|
@@ -1817,8 +1447,6 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1817 |
<span style='font-size:9px;color:#777;'>Zipf적합: {ppx.get("zipf",0)}</span>
|
| 1818 |
</div>
|
| 1819 |
</div>"""
|
| 1820 |
-
|
| 1821 |
-
# ★ Humanizer 탐지 카드
|
| 1822 |
hum_html=""
|
| 1823 |
if hum["score"]>=30:
|
| 1824 |
hc="#FF4444" if hum["score"]>=65 else "#FF8800" if hum["score"]>=50 else "#DDAA00"
|
|
@@ -1827,8 +1455,6 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1827 |
<div style='font-size:10px;font-weight:700;margin-bottom:3px;'>🛡️ Humanizer 탐지 <span style='color:{hc};font-size:12px;font-weight:900;'>{hum["score"]}점</span></div>
|
| 1828 |
{sig_rows}
|
| 1829 |
</div>"""
|
| 1830 |
-
|
| 1831 |
-
# ★ 모델 추정 카드
|
| 1832 |
est_html=""
|
| 1833 |
if est_model!="특정 불가":
|
| 1834 |
ec="#FF4444" if est_conf=="높음" else "#FF8800" if est_conf=="중간" else "#DDAA00"
|
|
@@ -1842,7 +1468,6 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1842 |
{ranked_html}
|
| 1843 |
<div style='font-size:8px;color:#999;margin-top:2px;'>근거: {", ".join(est_reasons[:3]) if est_reasons else "복합 지표"}</div>
|
| 1844 |
</div>"""
|
| 1845 |
-
|
| 1846 |
html=f"""<div style="font-family:'Pretendard','Noto Sans KR',sans-serif;max-width:720px;margin:0 auto;">
|
| 1847 |
<div style="background:{bg};border:2px solid {fg};border-radius:14px;padding:20px;margin-bottom:12px;">
|
| 1848 |
<div style="display:flex;align-items:center;gap:16px;">
|
|
@@ -1864,7 +1489,6 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1864 |
<div style="font-size:11px;color:#555;line-height:1.7;">{reason_html}</div>
|
| 1865 |
</div>
|
| 1866 |
</div>
|
| 1867 |
-
|
| 1868 |
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
|
| 1869 |
<div style="background:#FAFAFA;border-radius:8px;padding:10px;">
|
| 1870 |
<div style="font-size:10px;font-weight:700;margin-bottom:4px;">📊 AI 탐지 5축 + Perplexity</div>
|
|
@@ -1891,23 +1515,17 @@ def run_detection(text, progress=gr.Progress()):
|
|
| 1891 |
</div>"""
|
| 1892 |
log=f"AI:{fs}점 [{verdict}] 신뢰:{conf} | 모델:{mt} | PPX:{ppx['score']} HUM:{hum['score']} | 품질:{qr['grade']}({qr['score']})\n축: 통계{sc['통계']} 문체{sc['문체']} 반복{sc['반복성']} 구조{sc['구조']} 지문{sc['지문']} PPX{ppx['score']} HUM{hum['score']}"
|
| 1893 |
return html, log
|
| 1894 |
-
|
| 1895 |
-
# ═══════════════════════════════════════════════
|
| 1896 |
-
# 탭2: 하이라이트 (탭1과 동일 기준)
|
| 1897 |
-
# ═══════════════════════════════════════════════
|
| 1898 |
def run_highlight(text):
|
| 1899 |
if not text or len(text.strip())<30: return "<div style='color:#888;'>텍스트 필요</div>"
|
| 1900 |
sents=split_sentences(text)
|
| 1901 |
hl=[]
|
| 1902 |
for s in sents:
|
| 1903 |
sc, reasons = score_sentence(s)
|
| 1904 |
-
# 5단계 색상
|
| 1905 |
if sc >= 60: bg="rgba(220,38,38,0.35)"; level="AI확신"
|
| 1906 |
elif sc >= 40: bg="rgba(249,115,22,0.30)"; level="AI의심"
|
| 1907 |
elif sc >= 25: bg="rgba(234,179,8,0.25)"; level="주의"
|
| 1908 |
elif sc >= 10: bg="rgba(132,204,22,0.15)"; level="인간추정"
|
| 1909 |
else: bg="rgba(34,197,94,0.20)"; level="인간"
|
| 1910 |
-
# 근거 상세
|
| 1911 |
detail_parts = []
|
| 1912 |
for r in reasons:
|
| 1913 |
if '격식' in r or '비격식AI' in r: detail_parts.append(f"🔤 {r}")
|
|
@@ -1918,13 +1536,11 @@ def run_highlight(text):
|
|
| 1918 |
else: detail_parts.append(r)
|
| 1919 |
tt = ' | '.join(detail_parts) if detail_parts else '특이 패턴 없음'
|
| 1920 |
hl.append(f'<span style="background:{bg};padding:2px 4px;border-radius:4px;display:inline;line-height:2.2;border-bottom:2px solid {"#DC2626" if sc>=60 else "#F97316" if sc>=40 else "#EAB308" if sc>=25 else "#84CC16" if sc>=10 else "#22C55E"};" title="[{level}] {tt} ({sc}점)">{s}</span>')
|
| 1921 |
-
|
| 1922 |
total_scores = [score_sentence(s)[0] for s in sents]
|
| 1923 |
avg_sc = sum(total_scores)/len(total_scores) if total_scores else 0
|
| 1924 |
ai_high = sum(1 for s in total_scores if s >= 60)
|
| 1925 |
ai_mid = sum(1 for s in total_scores if 40 <= s < 60)
|
| 1926 |
human_cnt = sum(1 for s in total_scores if s < 25)
|
| 1927 |
-
|
| 1928 |
return f"""<div style='font-family:Pretendard,sans-serif;'>
|
| 1929 |
<div style='margin-bottom:10px;padding:10px;background:#F8F8FF;border-radius:8px;'>
|
| 1930 |
<div style='display:flex;gap:8px;align-items:center;font-size:11px;margin-bottom:6px;flex-wrap:wrap;'>
|
|
@@ -1938,30 +1554,16 @@ def run_highlight(text):
|
|
| 1938 |
</div>
|
| 1939 |
<div style='line-height:2.4;font-size:14px;'>{' '.join(hl)}</div>
|
| 1940 |
</div>"""
|
| 1941 |
-
|
| 1942 |
-
# ═══════════════════════════════════════════════
|
| 1943 |
-
# GRADIO UI
|
| 1944 |
-
# ═══════════════════════════════════════════════
|
| 1945 |
SAMPLE_AI = """인공지능 기술은 현대 사회에서 매우 중요한 역할을 하고 있습니다. 특히 자연어 처리 분야에서의 발전은 눈부신 성과를 거두고 있습니다. 이러한 기술의 발전은 다양한 산업 분야에 긍정적인 영향을 미치고 있으며, 향후 더욱 발전할 것으로 예상됩니다.
|
| 1946 |
-
|
| 1947 |
또한 생성형 AI의 등장으로 콘텐츠 제작 방식이 크게 변화하고 있습니다. 이를 통해 기업들은 효율적인 콘텐츠 생산이 가능해졌으며, 개인 사용자들도 다양한 창작 활동에 AI를 활용할 수 있게 되었습니다. 따라서 AI 리터러시의 중요성이 더욱 부각되고 있습니다.
|
| 1948 |
-
|
| 1949 |
나아가 AI 윤리와 규제에 대한 논의도 활발히 진행되고 있습니다. 특히 AI가 생성한 콘텐츠의 저작권 문제는 중요한 의미를 가지며, 이에 대한 법적 프레임워크 구축이 필요합니다. 결과적으로 기술 발전과 함께 사회적 합의를 이루는 것이 중요합니다."""
|
| 1950 |
-
|
| 1951 |
SAMPLE_HUMAN = """아 진짜 요즘 AI 때문에 머리 아프다ㅋㅋㅋ 어제 chatgpt한테 레포트 써달라고 했는데 완전 교과서 같은 글만 써줘서 그냥 내가 다시 썼음;;
|
| 1952 |
-
|
| 1953 |
근데 생각해보면 AI가 쓴 글이랑 사람이 쓴 글이 확실히 다르긴 해. 뭔가... 너무 깔끔하달까? 사람은 이렇게 횡설수설도 하고 맞춤법도 틀리고 그러잖아.
|
| 1954 |
-
|
| 1955 |
교수님이 AI 탐지기 돌린다고 해서 좀 무서운데 ㅠㅠ 나는 진짜 직접 쓴 건데 혹시 오탐 나면 어쩌지... 걱정된다 진심으로."""
|
| 1956 |
-
|
| 1957 |
-
# ═══════════════════════════════════════════════
|
| 1958 |
-
# 탭5: 문서 업로드 → 섹션별 히트맵 분석 + PDF 보고서
|
| 1959 |
-
# ═══════════════════════════════════════════════
|
| 1960 |
def run_document_analysis(file, progress=gr.Progress()):
|
| 1961 |
"""문서 파일 업로드 → 섹션별 AI 탐지 히트맵 + PDF 보고서 생성"""
|
| 1962 |
if file is None:
|
| 1963 |
return "<div style='padding:20px;text-align:center;color:#888;'>📄 파일을 업로드하세요 (PDF, DOCX, HWP, HWPX, TXT)</div>", "", None
|
| 1964 |
-
|
| 1965 |
if hasattr(file, 'path'):
|
| 1966 |
file_path = file.path
|
| 1967 |
elif hasattr(file, 'name'):
|
|
@@ -1970,30 +1572,23 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 1970 |
file_path = str(file)
|
| 1971 |
fname = os.path.basename(file_path)
|
| 1972 |
progress(0.05, f"📄 {fname} 읽는 중...")
|
| 1973 |
-
|
| 1974 |
sections, full_text, error = extract_text_from_file(file_path)
|
| 1975 |
if error:
|
| 1976 |
return f"<div style='padding:20px;text-align:center;color:#E44;'>⚠️ {error}</div>", "", None
|
| 1977 |
if not sections or not full_text or len(full_text.strip()) < 50:
|
| 1978 |
return "<div style='padding:20px;text-align:center;color:#E44;'>⚠️ 텍스트가 충분하지 않습니다 (50자 미만)</div>", "", None
|
| 1979 |
-
|
| 1980 |
progress(0.15, "전체 텍스트 분석...")
|
| 1981 |
-
# 전체 분석
|
| 1982 |
sents_all = split_sentences(full_text)
|
| 1983 |
words_all = split_words(full_text)
|
| 1984 |
morphs_all = get_morphemes(full_text)
|
| 1985 |
total_score, total_verdict, total_level, total_axes, total_ppx, total_hum = quick_score(full_text)
|
| 1986 |
quality = analyze_quality(full_text, sents_all, words_all, morphs_all)
|
| 1987 |
-
|
| 1988 |
-
# LLM 교차검증 (전체)
|
| 1989 |
progress(0.30, "LLM 교차검증...")
|
| 1990 |
llm_result = llm_cross_check(full_text[:3000])
|
| 1991 |
if llm_result["score"] >= 0:
|
| 1992 |
_sent_scores = [score_sentence(s)[0] for s in sents_all]
|
| 1993 |
_sent_avg = sum(_sent_scores)/len(_sent_scores) if _sent_scores else -1
|
| 1994 |
total_score, total_verdict, total_level = compute_verdict(total_axes, llm_result["score"], sent_avg=_sent_avg, ppx_score=total_ppx["score"], hum_score=total_hum["score"])
|
| 1995 |
-
|
| 1996 |
-
# 섹션별 분석
|
| 1997 |
progress(0.45, f"{len(sections)}개 섹션 분석...")
|
| 1998 |
section_results = []
|
| 1999 |
for i, sec in enumerate(sections):
|
|
@@ -2001,7 +1596,6 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2001 |
section_results.append({"idx": i+1, "text": sec, "score": -1, "verdict": "너무 짧음", "skipped": True})
|
| 2002 |
continue
|
| 2003 |
s_score, s_verdict, s_level, s_axes, _, _ = quick_score(sec)
|
| 2004 |
-
# 문장별 하이라이트
|
| 2005 |
sec_sents = split_sentences(sec)
|
| 2006 |
sent_scores = []
|
| 2007 |
for sent in sec_sents:
|
|
@@ -2014,15 +1608,11 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2014 |
})
|
| 2015 |
pct = 0.45 + (i / max(len(sections), 1)) * 0.30
|
| 2016 |
progress(pct, f"섹션 {i+1}/{len(sections)}")
|
| 2017 |
-
|
| 2018 |
-
# ═══ HTML 히트맵 보고서 ═══
|
| 2019 |
now = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 2020 |
ext = Path(file_path).suffix.upper()
|
| 2021 |
cm_map = {"ai_high": ("#FF4444", "#FFE0E0"), "ai_medium": ("#FF8800", "#FFF0DD"),
|
| 2022 |
"ai_low": ("#DDAA00", "#FFFBE0"), "uncertain": ("#888", "#F5F5F5"), "human": ("#22AA44", "#E0FFE8")}
|
| 2023 |
tc, tbg = cm_map.get(total_level, ("#888", "#F5F5F5"))
|
| 2024 |
-
|
| 2025 |
-
# 섹션별 히트맵 바 HTML
|
| 2026 |
heatmap_cells = []
|
| 2027 |
for sr in section_results:
|
| 2028 |
sidx = sr["idx"]
|
|
@@ -2033,13 +1623,10 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2033 |
ssc = sr["score"]; svd = sr["verdict"]
|
| 2034 |
heatmap_cells.append(f"<div style='flex:1;min-width:18px;height:28px;background:{sc};border-radius:3px;opacity:0.8;cursor:pointer;' title='섹션{sidx}: AI {ssc}점 ({svd})'></div>")
|
| 2035 |
heatmap_bar = f"<div style='display:flex;gap:3px;margin:12px 0;'>" + ''.join(heatmap_cells) + "</div>"
|
| 2036 |
-
|
| 2037 |
-
# 섹션 상세 카드
|
| 2038 |
section_cards = []
|
| 2039 |
for sr in section_results:
|
| 2040 |
if sr["skipped"]: continue
|
| 2041 |
sc, sbg = cm_map.get(sr.get("level", "uncertain"), ("#888", "#F5F5F5"))
|
| 2042 |
-
# 문장 하이라이트 (score_sentence 기반)
|
| 2043 |
sent_html = ""
|
| 2044 |
for ss in sr.get("sent_scores", []):
|
| 2045 |
s = ss["score"]
|
|
@@ -2048,14 +1635,12 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2048 |
elif s >= 25: sclr = "background:rgba(221,170,0,0.08);border-bottom:1px solid #DDAA00;"
|
| 2049 |
else: sclr = ""
|
| 2050 |
sent_html += f"<span style='{sclr}padding:1px 2px;border-radius:2px;' title='AI {s}점'>{ss['text']}</span> "
|
| 2051 |
-
|
| 2052 |
axes_html = ""
|
| 2053 |
if "axes" in sr:
|
| 2054 |
ax = sr["axes"]
|
| 2055 |
for k, v in ax.items():
|
| 2056 |
axc = "#FF4444" if v >= 50 else "#FF8800" if v >= 30 else "#22AA44"
|
| 2057 |
axes_html += f"<span style='display:inline-block;margin:1px 3px;padding:2px 8px;background:{axc}22;border:1px solid {axc}44;border-radius:10px;font-size:9px;color:{axc};'>{k} {v}</span>"
|
| 2058 |
-
|
| 2059 |
section_cards.append(f"""
|
| 2060 |
<div style="border:1px solid #E0E0E0;border-left:4px solid {sc};border-radius:8px;padding:14px;margin-bottom:10px;background:white;">
|
| 2061 |
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:8px;">
|
|
@@ -2065,19 +1650,14 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2065 |
<div style="margin-bottom:6px;">{axes_html}</div>
|
| 2066 |
<div style="font-size:12px;line-height:1.8;color:#333;">{sent_html}</div>
|
| 2067 |
</div>""")
|
| 2068 |
-
|
| 2069 |
-
# AI 비율 분포
|
| 2070 |
ai_high = sum(1 for s in section_results if not s["skipped"] and s["score"] >= 60)
|
| 2071 |
ai_med = sum(1 for s in section_results if not s["skipped"] and 35 <= s["score"] < 60)
|
| 2072 |
ai_low = sum(1 for s in section_results if not s["skipped"] and s["score"] < 35)
|
| 2073 |
valid_sections = [s for s in section_results if not s["skipped"]]
|
| 2074 |
-
|
| 2075 |
-
# LLM 교차검증 정보
|
| 2076 |
llm_info = ""
|
| 2077 |
if llm_result["score"] >= 0:
|
| 2078 |
llm_rows = ''.join(f"<span style='margin-right:8px;font-size:10px;color:#555;'>{mn}: {llm_result['detail'].get(mn,'—')}</span>" for _, mn in LLM_JUDGES)
|
| 2079 |
llm_info = f"<div style='margin-top:8px;padding:8px 12px;background:#F8F8FF;border-radius:6px;border:1px solid #E0E0FF;font-size:10px;'><b>🤖 LLM 교차검증:</b> 평균 {llm_result['score']}% | {llm_rows}</div>"
|
| 2080 |
-
|
| 2081 |
html = f"""<div style="font-family:'Pretendard','Noto Sans KR',sans-serif;max-width:800px;margin:0 auto;background:#FAFBFC;border-radius:12px;border:1px solid #E0E0E0;overflow:hidden;">
|
| 2082 |
<!-- 헤더 -->
|
| 2083 |
<div style="background:linear-gradient(135deg,#1A1F36,#2D3561);padding:20px 24px;color:white;">
|
|
@@ -2092,7 +1672,6 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2092 |
</div>
|
| 2093 |
</div>
|
| 2094 |
</div>
|
| 2095 |
-
|
| 2096 |
<!-- 요약 -->
|
| 2097 |
<div style="padding:16px 24px;">
|
| 2098 |
<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:10px;margin-bottom:12px;">
|
|
@@ -2113,7 +1692,6 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2113 |
<div style="font-size:10px;color:#888;">품질 등급</div>
|
| 2114 |
</div>
|
| 2115 |
</div>
|
| 2116 |
-
|
| 2117 |
<!-- 히트맵 바 -->
|
| 2118 |
<div style="background:white;border:1px solid #E0E0E0;border-radius:8px;padding:12px;">
|
| 2119 |
<div style="font-size:11px;font-weight:700;margin-bottom:6px;">🗺️ 섹션별 AI 히트맵 (빨강=AI의심, 초록=인간)</div>
|
|
@@ -2127,25 +1705,20 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2127 |
</div>
|
| 2128 |
{llm_info}
|
| 2129 |
</div>
|
| 2130 |
-
|
| 2131 |
<!-- 섹션 상세 -->
|
| 2132 |
<div style="padding:0 24px 20px;">
|
| 2133 |
<div style="font-size:13px;font-weight:700;margin-bottom:10px;">📊 섹션별 상세 분석 ({len(valid_sections)}개)</div>
|
| 2134 |
{''.join(section_cards)}
|
| 2135 |
</div>
|
| 2136 |
-
|
| 2137 |
<!-- 푸터 -->
|
| 2138 |
<div style="padding:10px 24px;background:#F0F3F8;border-top:1px solid #E0E0E0;display:flex;justify-content:space-between;">
|
| 2139 |
<span style="font-size:11px;font-weight:800;color:#1A1F36;">AI Detector v4.0</span>
|
| 2140 |
<span style="font-size:9px;color:#AAA;">{now} · 5축 앙상블 + LLM 교차검증</span>
|
| 2141 |
</div>
|
| 2142 |
</div>"""
|
| 2143 |
-
|
| 2144 |
-
# ═══ PDF 보고서 생성 ═══
|
| 2145 |
progress(0.90, "PDF 보고서 생성...")
|
| 2146 |
pdf_path = _generate_pdf_report(fname, total_score, total_verdict, total_level,
|
| 2147 |
total_axes, quality, section_results, llm_result, now)
|
| 2148 |
-
|
| 2149 |
log = f"파일: {fname} ({ext})\n"
|
| 2150 |
log += f"섹션: {len(sections)}개 | 전체: {len(full_text)}자\n"
|
| 2151 |
log += f"총점: {total_score} ({total_verdict})\n"
|
|
@@ -2154,23 +1727,17 @@ def run_document_analysis(file, progress=gr.Progress()):
|
|
| 2154 |
if llm_result["score"] >= 0:
|
| 2155 |
log += f"LLM 교차검증: {llm_result['score']}%\n"
|
| 2156 |
return html, log, pdf_path
|
| 2157 |
-
|
| 2158 |
-
|
| 2159 |
def _generate_pdf_report(fname, score, verdict, level, axes, quality, sections, llm_result, now):
|
| 2160 |
"""HTML → PDF 변환으로 보고서 생성"""
|
| 2161 |
try:
|
| 2162 |
cm = {"ai_high":"#FF4444","ai_medium":"#FF8800","ai_low":"#DDAA00","uncertain":"#888","human":"#22AA44"}
|
| 2163 |
tc = cm.get(level, "#888")
|
| 2164 |
-
|
| 2165 |
-
# 섹션 테이블 행
|
| 2166 |
sec_rows = ""
|
| 2167 |
for sr in sections:
|
| 2168 |
if sr["skipped"]: continue
|
| 2169 |
sc = cm.get(sr.get("level","uncertain"),"#888")
|
| 2170 |
sec_rows += f"<tr><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;'>{sr['idx']}</td><td style='padding:6px 10px;border:1px solid #DDD;'>{sr['text'][:80]}...</td><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;color:{sc};font-weight:700;'>{sr['score']}</td><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;'>{sr['verdict']}</td></tr>"
|
| 2171 |
-
|
| 2172 |
ax_rows = ''.join(f"<tr><td style='padding:4px 10px;border:1px solid #DDD;'>{k}</td><td style='padding:4px 10px;border:1px solid #DDD;text-align:center;font-weight:700;'>{v}/100</td></tr>" for k, v in axes.items())
|
| 2173 |
-
|
| 2174 |
html_content = f"""<!DOCTYPE html><html><head><meta charset="UTF-8">
|
| 2175 |
<style>
|
| 2176 |
body{{font-family:sans-serif;margin:30px;font-size:12px;color:#333;}}
|
|
@@ -2183,28 +1750,22 @@ def _generate_pdf_report(fname, score, verdict, level, axes, quality, sections,
|
|
| 2183 |
</style></head><body>
|
| 2184 |
<h1>📄 AI 글 판별 보고서</h1>
|
| 2185 |
<p><b>파일:</b> {fname} | <b>생성:</b> {now} | <b>엔진:</b> AI Detector v4.0</p>
|
| 2186 |
-
|
| 2187 |
<h2>종합 결과</h2>
|
| 2188 |
<div class="score-box">{score}점</div>
|
| 2189 |
<span style="margin-left:12px;font-size:16px;font-weight:700;">{verdict}</span>
|
| 2190 |
<p><b>품질:</b> {quality['grade']} ({quality['score']}점)</p>
|
| 2191 |
-
|
| 2192 |
<h2>5축 분석</h2>
|
| 2193 |
<table><tr><th>축</th><th>점수</th></tr>{ax_rows}</table>
|
| 2194 |
-
|
| 2195 |
<h2>섹션별 분석 ({len([s for s in sections if not s['skipped']])}개)</h2>
|
| 2196 |
<table>
|
| 2197 |
<tr><th>No</th><th>내용 (발췌)</th><th>AI 점수</th><th>판정</th></tr>
|
| 2198 |
{sec_rows}
|
| 2199 |
</table>
|
| 2200 |
-
|
| 2201 |
<div class="footer">
|
| 2202 |
AI Detector v4.0 — 5축 앙상블 + LLM 교차검증 | {now}<br>
|
| 2203 |
본 보고서는 AI 판별 참고 자료이며, 최종 판단은 검토자의 확인이 필요합니다.
|
| 2204 |
</div>
|
| 2205 |
</body></html>"""
|
| 2206 |
-
|
| 2207 |
-
# HTML 파일 저장 → 다운로드용
|
| 2208 |
report_dir = tempfile.mkdtemp()
|
| 2209 |
html_path = os.path.join(report_dir, f"AI_Report_{fname}.html")
|
| 2210 |
with open(html_path, 'w', encoding='utf-8') as f:
|
|
@@ -2213,13 +1774,10 @@ def _generate_pdf_report(fname, score, verdict, level, axes, quality, sections,
|
|
| 2213 |
except Exception as e:
|
| 2214 |
print(f"PDF 보고서 생성 오류: {e}")
|
| 2215 |
return None
|
| 2216 |
-
|
| 2217 |
-
|
| 2218 |
def extract_file_text_api(file):
|
| 2219 |
"""파일 업로드 → 텍스트 추출 API (탭1~4용)"""
|
| 2220 |
if file is None:
|
| 2221 |
return "⚠️ 파일이 없습니다."
|
| 2222 |
-
# Gradio 6.x: FileData 객체 / NamedString / str 모두 처리
|
| 2223 |
if hasattr(file, 'path'):
|
| 2224 |
file_path = file.path
|
| 2225 |
elif hasattr(file, 'name'):
|
|
@@ -2235,8 +1793,6 @@ def extract_file_text_api(file):
|
|
| 2235 |
if not full_text or len(full_text.strip()) < 10:
|
| 2236 |
return f"⚠️ {fname}: 텍스트 추출 결과 없음"
|
| 2237 |
return full_text.strip()
|
| 2238 |
-
|
| 2239 |
-
|
| 2240 |
with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
| 2241 |
gr.Markdown("# 🔎 AI 글 판별기 v5.1\n**5축+Perplexity+Humanizer탐지+모델추정 · 품질 측정 · LLM 교차검증 · 표절 검사**")
|
| 2242 |
with gr.Tab("🏆 경쟁력 비교"):
|
|
@@ -2252,9 +1808,7 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2252 |
@keyframes glow { 0%,100% { box-shadow:0 0 8px rgba(99,102,241,0.3); } 50% { box-shadow:0 0 20px rgba(99,102,241,0.6); } }
|
| 2253 |
@keyframes borderPulse { 0%,100% { border-color:rgba(99,102,241,0.3); } 50% { border-color:rgba(99,102,241,0.8); } }
|
| 2254 |
@keyframes checkPop { from { transform:scale(0) rotate(-180deg); opacity:0; } to { transform:scale(1) rotate(0deg); opacity:1; } }
|
| 2255 |
-
|
| 2256 |
.comp-wrap { font-family:'Pretendard','Noto Sans KR',system-ui,sans-serif; max-width:860px; margin:0 auto; padding:8px; }
|
| 2257 |
-
|
| 2258 |
/* 히어로 배너 */
|
| 2259 |
.hero-banner {
|
| 2260 |
background: linear-gradient(135deg, #1e1b4b 0%, #312e81 30%, #4338ca 60%, #6366f1 100%);
|
|
@@ -2273,7 +1827,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2273 |
font-size:11px; font-weight:800; padding:4px 12px; border-radius:20px; margin-top:10px;
|
| 2274 |
animation: pulse 2s ease-in-out infinite; position:relative;
|
| 2275 |
}
|
| 2276 |
-
|
| 2277 |
/* 핵심 수치 */
|
| 2278 |
.stats-row { display:grid; grid-template-columns:repeat(4,1fr); gap:10px; margin-bottom:18px; animation:fadeInUp 1s ease-out 0.2s both; }
|
| 2279 |
.stat-card {
|
|
@@ -2283,7 +1836,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2283 |
.stat-card:hover { transform:translateY(-4px); border-color:#6366f1; box-shadow:0 8px 25px rgba(99,102,241,0.15); }
|
| 2284 |
.stat-num { font-size:32px; font-weight:900; background:linear-gradient(135deg,#6366f1,#8b5cf6); -webkit-background-clip:text; -webkit-text-fill-color:transparent; animation:countUp 0.6s ease-out; }
|
| 2285 |
.stat-label { font-size:10px; color:#64748b; margin-top:2px; font-weight:600; }
|
| 2286 |
-
|
| 2287 |
/* 비교 테이블 */
|
| 2288 |
.comp-table-wrap { animation:fadeInUp 1.1s ease-out 0.4s both; margin-bottom:18px; }
|
| 2289 |
.comp-table {
|
|
@@ -2304,13 +1856,11 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2304 |
.comp-table tbody td:first-child { text-align:left; padding-left:14px; font-weight:600; color:#1e293b; }
|
| 2305 |
.comp-table tbody td:nth-child(2) { background:rgba(239,68,68,0.04); font-weight:700; }
|
| 2306 |
.comp-table tbody tr:last-child td { border-bottom:none; }
|
| 2307 |
-
|
| 2308 |
/* 체크/엑스 아이콘 */
|
| 2309 |
.ck { color:#22c55e; font-weight:800; font-size:15px; animation:checkPop 0.4s ease-out; }
|
| 2310 |
.cx { color:#ef4444; font-weight:700; font-size:13px; }
|
| 2311 |
.cp { color:#f59e0b; font-weight:700; font-size:12px; }
|
| 2312 |
.cstar { color:#6366f1; font-weight:900; font-size:14px; }
|
| 2313 |
-
|
| 2314 |
/* 차별화 카드 */
|
| 2315 |
.diff-grid { display:grid; grid-template-columns:1fr 1fr; gap:10px; margin-bottom:18px; }
|
| 2316 |
.diff-card {
|
|
@@ -2329,7 +1879,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2329 |
.diff-title { font-size:13px; font-weight:800; color:#1e293b; margin-bottom:4px; }
|
| 2330 |
.diff-desc { font-size:10px; color:#64748b; line-height:1.5; }
|
| 2331 |
.diff-vs { display:inline-block; background:#fef2f2; color:#dc2626; font-size:9px; font-weight:700; padding:2px 6px; border-radius:4px; margin-top:4px; }
|
| 2332 |
-
|
| 2333 |
/* 경쟁사 공략 바 */
|
| 2334 |
.attack-section { animation:fadeInUp 1.3s ease-out 0.8s both; margin-bottom:14px; }
|
| 2335 |
.attack-title { font-size:13px; font-weight:800; color:#1e293b; margin-bottom:10px; }
|
|
@@ -2339,7 +1888,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2339 |
.attack-weak { font-size:9px; color:#ef4444; font-weight:600; }
|
| 2340 |
.attack-bar { height:8px; background:#f1f5f9; border-radius:4px; overflow:hidden; }
|
| 2341 |
.attack-fill { height:100%; border-radius:4px; animation:barGrow 1.5s ease-out; }
|
| 2342 |
-
|
| 2343 |
/* 슬로건 */
|
| 2344 |
.slogan-box {
|
| 2345 |
background:linear-gradient(135deg,#faf5ff,#ede9fe,#e0e7ff); border-radius:16px;
|
|
@@ -2349,7 +1897,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2349 |
.slogan-main { font-size:16px; font-weight:900; color:#3730a3; margin-bottom:6px; }
|
| 2350 |
.slogan-sub { font-size:11px; color:#6366f1; line-height:1.7; }
|
| 2351 |
</style>
|
| 2352 |
-
|
| 2353 |
<div class="comp-wrap">
|
| 2354 |
<!-- 히어로 배너 -->
|
| 2355 |
<div class="hero-banner">
|
|
@@ -2360,7 +1907,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2360 |
</div>
|
| 2361 |
<div class="hero-badge">🔬 v5.1 — 3대 킬러 기능 탑재</div>
|
| 2362 |
</div>
|
| 2363 |
-
|
| 2364 |
<!-- 핵심 수치 -->
|
| 2365 |
<div class="stats-row">
|
| 2366 |
<div class="stat-card"><div class="stat-num">8축</div><div class="stat-label">탐지 차원<br>(업계 최다)</div></div>
|
|
@@ -2368,7 +1914,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2368 |
<div class="stat-card"><div class="stat-num">무료</div><div class="stat-label">완전 무료<br>(횟수 무제한)</div></div>
|
| 2369 |
<div class="stat-card"><div class="stat-num">4종</div><div class="stat-label">모델 추정<br>(GPT/Claude/Gemini/PPX)</div></div>
|
| 2370 |
</div>
|
| 2371 |
-
|
| 2372 |
<!-- 비교 테이블 -->
|
| 2373 |
<div class="comp-table-wrap">
|
| 2374 |
<table class="comp-table">
|
|
@@ -2513,7 +2058,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2513 |
</tbody>
|
| 2514 |
</table>
|
| 2515 |
</div>
|
| 2516 |
-
|
| 2517 |
<!-- 차별화 4대 카드 -->
|
| 2518 |
<div class="diff-grid" style="animation:fadeInUp 1.2s ease-out 0.6s both;">
|
| 2519 |
<div class="diff-card">
|
|
@@ -2541,7 +2085,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2541 |
<div class="diff-vs">vs 전체 경쟁사: LLM 검증 미보유</div>
|
| 2542 |
</div>
|
| 2543 |
</div>
|
| 2544 |
-
|
| 2545 |
<!-- 경쟁사 취약점 공략 -->
|
| 2546 |
<div class="attack-section">
|
| 2547 |
<div class="attack-title">⚔️ 경쟁사 취약점 vs 우리의 강점</div>
|
|
@@ -2566,7 +2109,6 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2566 |
<div class="attack-bar"><div class="attack-fill" style="width:94%;background:linear-gradient(90deg,#a855f7,#ec4899);"></div></div>
|
| 2567 |
</div>
|
| 2568 |
</div>
|
| 2569 |
-
|
| 2570 |
<!-- 슬로건 -->
|
| 2571 |
<div class="slogan-box">
|
| 2572 |
<div class="slogan-main">🇰🇷 한국어 AI 글을 가장 정확하게, 가장 투명하게, 완전 무료로 판별합니다</div>
|
|
@@ -2603,59 +2145,42 @@ with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
|
| 2603 |
btn_ps.click(lambda:SAMPLE_AI,outputs=[inp_plag])
|
| 2604 |
with gr.Tab("📖 설명"):
|
| 2605 |
gr.Markdown("""
|
| 2606 |
-
### 아키텍처 v5.1 — 3대 킬러 기능 탑재
|
| 2607 |
- **탐지 5축:** 통계(6%)·문체(25%)·반복(10%)·구조(12%)·지문(30%)
|
| 2608 |
- **★ Perplexity 확률분석(17%):** 문자 엔트로피·Burstiness·TTR편차·종결엔트로피
|
| 2609 |
- **★ Humanizer 탐지:** 동의어과다치환·구조보존·접속사잔존·이중피동 등 6시그널
|
| 2610 |
- **★ 모델 추정:** GPT·Claude·Gemini·Perplexity 4모델 증거기반 추정
|
| 2611 |
- **품질 6항목:** 가독성·어휘·논리·정확성·표현·정보밀도
|
| 2612 |
- **LLM 교차검증:** GPT-OSS-120B·Qwen3-32B·Kimi-K2 (GROQ)
|
| 2613 |
-
|
| 2614 |
-
### 표절 검사
|
| 2615 |
- **Brave Search**: 병렬 20개 동시 웹검색
|
| 2616 |
- **학술 DB**: KCI(한국학술지인용색인), RISS(학술연구정보), arXiv
|
| 2617 |
- **Gemini**: Google Search Grounding
|
| 2618 |
- **보고서**: CopyKiller 스타일 — 유사도%, 출처표, 문장별 하이라이트
|
| 2619 |
-
|
| 2620 |
-
### 환경변수
|
| 2621 |
- `GROQ_API_KEY` — LLM 교차검증
|
| 2622 |
- `GEMINI_API_KEY` — 표절 검사 (Google Search Grounding)
|
| 2623 |
- `BRAVE_API_KEY` — 표절 검사 (Brave Search 병렬)
|
| 2624 |
""")
|
| 2625 |
-
# ── 숨겨진 파일 텍스트 추출 API (탭1~4 파일 업로드용) ──
|
| 2626 |
with gr.Row(visible=False):
|
| 2627 |
_file_in = gr.File(label="hidden_file")
|
| 2628 |
_text_out = gr.Textbox(label="hidden_text")
|
| 2629 |
_file_in.change(extract_file_text_api, [_file_in], [_text_out], api_name="extract_file_text")
|
| 2630 |
-
|
| 2631 |
-
# ═══ 정적 파일 준비 ═══
|
| 2632 |
import shutil, pathlib
|
| 2633 |
static_dir = pathlib.Path("static")
|
| 2634 |
static_dir.mkdir(exist_ok=True)
|
| 2635 |
if pathlib.Path("index.html").exists():
|
| 2636 |
shutil.copy("index.html", static_dir / "index.html")
|
| 2637 |
-
|
| 2638 |
-
# ═══ FastAPI — index.html을 루트(/)로 서빙 ═══
|
| 2639 |
from fastapi import FastAPI
|
| 2640 |
from fastapi.responses import HTMLResponse, FileResponse
|
| 2641 |
from fastapi.staticfiles import StaticFiles
|
| 2642 |
-
|
| 2643 |
server = FastAPI()
|
| 2644 |
-
|
| 2645 |
@server.get("/", response_class=HTMLResponse)
async def serve_root():
    """Serve the premium static index.html at the root URL."""
    index_file = pathlib.Path("static/index.html")
    if not index_file.exists():
        # No index.html present — fall back to the mounted Gradio UI.
        from fastapi.responses import RedirectResponse
        return RedirectResponse("/gradio/")
    return HTMLResponse(index_file.read_text(encoding="utf-8"))
|
| 2654 |
-
|
| 2655 |
-
# Gradio를 /gradio 경로에 마운트 — API는 /gradio/gradio_api/call/... 에서 작동
|
| 2656 |
app = gr.mount_gradio_app(server, demo, path="/gradio", allowed_paths=["static"])
|
| 2657 |
-
|
| 2658 |
if __name__ == "__main__":
|
| 2659 |
import uvicorn
|
| 2660 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 2661 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import math, re, os, json, random, time, hashlib, zlib, zipfile, tempfile
|
| 3 |
from collections import Counter
|
|
|
|
| 6 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
from xml.etree import ElementTree as ET
|
| 8 |
from kiwipiepy import Kiwi
|
|
|
|
| 9 |
KIWI = Kiwi()
|
| 10 |
try:
|
| 11 |
import httpx; HAS_HTTPX = True
|
|
|
|
| 17 |
HAS_GENAI = True
|
| 18 |
except ImportError:
|
| 19 |
HAS_GENAI = False
|
|
|
|
|
|
|
| 20 |
try:
|
| 21 |
import olefile; HAS_OLEFILE = True
|
| 22 |
except ImportError:
|
|
|
|
| 33 |
from docx import Document as DocxDocument; HAS_DOCX = True
|
| 34 |
except ImportError:
|
| 35 |
HAS_DOCX = False
|
|
|
|
| 36 |
GROQ_KEY = os.getenv("GROQ_API_KEY", "")
|
| 37 |
GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
|
| 38 |
BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def extract_text_from_pdf(file_path):
|
| 40 |
"""PDF → 텍스트 (페이지별 분리)"""
|
| 41 |
pages = []
|
|
|
|
| 59 |
except Exception as e:
|
| 60 |
print(f"PyPDF2: {e}")
|
| 61 |
return None, "PDF 추출 실패 (pdfplumber, PyPDF2 없음)"
|
|
|
|
| 62 |
def extract_text_from_docx(file_path):
|
| 63 |
"""DOCX → 텍스트 (문단별 분리)"""
|
| 64 |
if not HAS_DOCX: return None, "python-docx 없음"
|
|
|
|
| 79 |
return None, "DOCX 텍스트 없음"
|
| 80 |
except Exception as e:
|
| 81 |
return None, f"DOCX 오류: {e}"
|
|
|
|
| 82 |
def extract_text_from_txt(file_path):
|
| 83 |
"""TXT/MD/CSV 등 → 텍스트"""
|
| 84 |
for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
|
|
|
|
| 86 |
with open(file_path, 'r', encoding=enc) as f:
|
| 87 |
text = f.read()
|
| 88 |
if text.strip():
|
|
|
|
| 89 |
sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
|
| 90 |
return sections if sections else [text], None
|
| 91 |
except: continue
|
| 92 |
return None, "텍스트 인코딩 감지 실패"
|
|
|
|
| 93 |
def extract_text_from_hwpx(file_path):
|
| 94 |
"""HWPX (ZIP 기반) → 텍스트"""
|
| 95 |
try:
|
|
|
|
| 128 |
return None, "유효하지 않은 HWPX"
|
| 129 |
except Exception as e:
|
| 130 |
return None, f"HWPX 오류: {e}"
|
|
|
|
| 131 |
def _decode_hwp_para(data):
|
| 132 |
"""HWP 바이너리 → 문단 텍스트"""
|
| 133 |
result = []
|
|
|
|
| 149 |
text = re.sub(r'[ \t]+', ' ', text)
|
| 150 |
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 151 |
return text if len(text) > 2 else None
|
|
|
|
| 152 |
def _extract_hwp_section(data):
|
| 153 |
"""HWP 섹션 바이너리 → 텍스트"""
|
| 154 |
texts = []
|
|
|
|
| 172 |
except:
|
| 173 |
pos += 1
|
| 174 |
return '\n'.join(texts) if texts else None
|
|
|
|
| 175 |
def extract_text_from_hwp(file_path):
|
| 176 |
"""HWP (OLE 기반) → 텍스트"""
|
| 177 |
if not HAS_OLEFILE: return None, "olefile 없음"
|
|
|
|
| 200 |
return None, "HWP 텍스트 없음"
|
| 201 |
except Exception as e:
|
| 202 |
return None, f"HWP 오류: {e}"
|
|
|
|
| 203 |
def extract_text_from_file(file_path):
|
| 204 |
"""
|
| 205 |
만능 문서 추출: PDF/DOCX/HWP/HWPX/TXT → (sections_list, full_text, error)
|
|
|
|
| 210 |
return None, None, "파일 없음"
|
| 211 |
ext = Path(file_path).suffix.lower()
|
| 212 |
sections, error = None, None
|
|
|
|
| 213 |
if ext == '.pdf':
|
| 214 |
sections, error = extract_text_from_pdf(file_path)
|
| 215 |
elif ext == '.docx':
|
|
|
|
| 222 |
sections, error = extract_text_from_txt(file_path)
|
| 223 |
else:
|
| 224 |
return None, None, f"지원하지 않는 형식: {ext}"
|
|
|
|
| 225 |
if sections:
|
| 226 |
full = '\n\n'.join(sections)
|
| 227 |
return sections, full, None
|
| 228 |
return None, None, error or "텍스트 추출 실패"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
def split_sentences(text):
    """Split *text* into sentences.

    Tries the Kiwi morphological analyzer first; if it fails for any reason
    (including Kiwi being unavailable), falls back to a regex split after
    terminal punctuation (., !, ?, 。) followed by whitespace.

    Args:
        text: Input string (Korean and/or English).

    Returns:
        List of non-empty, stripped sentence strings (possibly empty).
    """
    try:
        s = [x.text.strip() for x in KIWI.split_into_sents(text) if x.text.strip()]
        if s: return s
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; any analyzer failure still falls through.
        pass
    return [x.strip() for x in re.split(r'(?<=[.!?。])\s+', text) if x.strip()]
|
|
|
|
| 235 |
def split_words(text):
    """Tokenize *text* into maximal runs of Hangul syllables, ASCII letters, or digits."""
    tokens = re.findall(r'[가-힣a-zA-Z0-9]+', text)
    return [tok for tok in tokens if tok]
|
|
|
|
| 237 |
def get_morphemes(text):
    """Run Kiwi morphological analysis on *text*.

    Args:
        text: Input string.

    Returns:
        List of (surface form, POS tag) tuples from the best analysis
        candidate, or an empty list if analysis fails or Kiwi is unavailable.
    """
    try:
        r = KIWI.analyze(text)
        if r and r[0]: return [(m.form, m.tag) for m in r[0][0]]
    except Exception:
        # Narrowed from a bare `except:` — best-effort behavior is kept,
        # but SystemExit/KeyboardInterrupt are no longer swallowed.
        pass
    return []
|
|
|
|
| 243 |
def http_get(url, headers=None, timeout=15):
|
| 244 |
try:
|
| 245 |
if HAS_HTTPX:
|
|
|
|
| 251 |
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
| 252 |
return resp.read().decode('utf-8', errors='replace')
|
| 253 |
except: return None
|
|
|
|
| 254 |
def http_post_json(url, body, headers=None, timeout=30):
|
| 255 |
try:
|
| 256 |
h = headers or {}
|
|
|
|
| 265 |
with urllib.request.urlopen(req, timeout=timeout, context=ssl.create_default_context()) as resp:
|
| 266 |
return json.loads(resp.read())
|
| 267 |
except: return None
|
|
|
|
| 268 |
def call_groq(model, prompt, max_tokens=800, temperature=0.1):
|
| 269 |
if not GROQ_KEY: return None, "NO_KEY"
|
| 270 |
url = "https://api.groq.com/openai/v1/chat/completions"
|
|
|
|
| 281 |
with urllib.request.urlopen(req, timeout=45, context=ssl.create_default_context()) as resp:
|
| 282 |
return json.loads(resp.read())["choices"][0]["message"]["content"], None
|
| 283 |
except Exception as e: return None, str(e)[:150]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
# ── Korean AI-style lexical resources (shared by the sentence/style scorers) ──
# Formal declarative endings (-습니다 family) typical of AI prose.
AI_ENDINGS = ['합니다','입니다','됩니다','습니다','있습니다','했습니다','겠습니다']
# Plain-register sentence endings that still pattern like AI output.
AI_CASUAL_ENDINGS = ['라고 할 수 있다','라고 볼 수 있다','다고 생각한다','다고 판단된다',
                     '인 셈이다','인 것이다','는 것이다','는 셈이다','ㄹ 것이다','을 것이다',
                     '라 할 수 있다','로 보인다','로 판단된다','고 있다','는 추세다','는 상황이다',
                     '지 않을 수 없다','라 하겠다','음을 알 수 있다','할 필요가 있다']
# Sentence-initial connectives heavily favored by AI text (prefix-matched).
AI_CONNS = ['또한','따라서','그러므로','이에 따라','한편','더불어','아울러','뿐만 아니라',
            '이를 통해','이에','결과적으로','궁극적으로','특히','나아가','이러한']
# Weaker connectives kept separate from AI_CONNS and weighted differently.
AI_SOFT_CONNS = ['물론','그러나','하지만','이처럼','이와 같이','이를 바탕으로']
# Stock filler phrases (substring-matched against the full text).
AI_FILLER = ['것으로 보','것으로 나타','것으로 예상','할 수 있','볼 수 있','주목할 만',
             '중요한 역할','중요한 의미','긍정적인 영향','부정적인 영향','필요합니다','필요하다',
             '중요합니다','중요하다','역할을 하','영향을 미','기대된다','예상됩니다','부각되고',
             '대두되고','다양한 분야','다양한 산업','눈부신 성과','획기적인 변화','혁신적인',
             '점에서','측면에서','관점에서']
# Grandiose/buzzword fillers common in casual-register AI writing.
AI_CASUAL_FILLER = ['무궁무진하다','무궁무진한','과언이 아니','돌파구가 될','전환점이 될',
                    '기반으로','발판으로','원동력이','초석이 될','가속화되','급부상','패러다임',
                    '지평을 열','새로운 장을','대전환','본격화되','고도화','이정표']
# Concession-then-assertion rhetorical pattern ("물론 ... 하지만 ...").
AI_CONCESSION = re.compile(r'물론.{2,20}(하지만|그러나|그렇지만|다만)|.{2,15}(이긴 하지만|기는 하지만|수 있지만|수는 있지만)')
|
|
|
|
| 301 |
EN_AI_MARKERS = ['furthermore','additionally','moreover','it is worth noting','in conclusion',
|
| 302 |
'it is important to','plays a crucial role','significant impact','various aspects',
|
| 303 |
'in this regard','consequently','nevertheless','integral part of','led to remarkable',
|
|
|
|
| 324 |
"Perplexity": {"m":['검색 결과에 따르면','보도에 따르면','연구에 따르면','밝혔다','전했다',
|
| 325 |
'것으로 나타났다','것으로 조사됐다','것으로 집계됐다','발표했다'],"e":['밝혔다','나타났다','전했다'],"lp":re.compile(r'\[\d+\]',re.M)},
|
| 326 |
}
|
|
|
|
| 327 |
def score_sentence(sent):
|
| 328 |
"""단일 문장 AI 점수 (0~100). 탭1·탭2 공유. v5.0 대폭 강화."""
|
| 329 |
sc = 0; reasons = []
|
| 330 |
sl = sent.lower().strip()
|
| 331 |
sr = sent.rstrip('.!?。')
|
|
|
|
|
|
|
| 332 |
for e in AI_ENDINGS:
|
| 333 |
if sr.endswith(e): sc += 22; reasons.append(f"격식어미(-{e})"); break
|
|
|
|
|
|
|
| 334 |
if sc == 0: # 격식이 아닌 경우만
|
| 335 |
for e in AI_CASUAL_ENDINGS:
|
| 336 |
if sr.endswith(e): sc += 15; reasons.append(f"비격식AI(-{e})"); break
|
|
|
|
|
|
|
| 337 |
stripped = sent.strip()
|
| 338 |
for c in AI_CONNS:
|
| 339 |
if stripped.startswith(c):
|
|
|
|
| 341 |
else:
|
| 342 |
for c in AI_SOFT_CONNS:
|
| 343 |
if stripped.startswith(c): sc += 8; reasons.append(f"약한접속사({c})"); break
|
|
|
|
|
|
|
| 344 |
filler_found = sum(1 for f in AI_FILLER if f in sent)
|
| 345 |
casual_filler = sum(1 for f in AI_CASUAL_FILLER if f in sent)
|
| 346 |
total_filler = filler_found + casual_filler
|
| 347 |
if total_filler >= 3: sc += 25; reasons.append(f"상투표현×{total_filler}")
|
| 348 |
elif total_filler == 2: sc += 18; reasons.append(f"상투표현×2")
|
| 349 |
elif total_filler == 1: sc += 10; reasons.append(f"상투표현×1")
|
|
|
|
|
|
|
| 350 |
if AI_CONCESSION.search(sent): sc += 10; reasons.append("양보패턴")
|
|
|
|
|
|
|
| 351 |
for mn, fp in FP.items():
|
| 352 |
for m in fp["m"]:
|
| 353 |
if m in sent: sc += 8; reasons.append(f"{mn}지문"); break
|
|
|
|
|
|
|
| 354 |
en_count = sum(1 for em in EN_AI_MARKERS if em in sl)
|
| 355 |
if en_count >= 3: sc += 25; reasons.append(f"영어AI×{en_count}")
|
| 356 |
elif en_count >= 2: sc += 18; reasons.append(f"영어AI×{en_count}")
|
| 357 |
elif en_count >= 1: sc += 12; reasons.append(f"영어AI×1")
|
|
|
|
|
|
|
| 358 |
has_formal = any(sr.endswith(e) for e in AI_ENDINGS)
|
| 359 |
has_conn = any(stripped.startswith(c) for c in AI_CONNS)
|
| 360 |
if has_formal and total_filler >= 1 and has_conn: sc += 8; reasons.append("복합AI")
|
| 361 |
elif has_formal and total_filler >= 2: sc += 5; reasons.append("격식+상투")
|
|
|
|
|
|
|
| 362 |
for n, p in HUMAN_MARKERS.items():
|
| 363 |
matches = p.findall(sent)
|
| 364 |
if matches:
|
|
|
|
| 366 |
elif n in ('비격식종결','구어축약'): sc -= 18; reasons.append(f"구어체({n})")
|
| 367 |
elif n == '맞춤법오류': sc -= 12; reasons.append("맞춤법오류")
|
| 368 |
elif n in ('느낌표','말줄임표'): sc -= 10; reasons.append(f"인간({n})")
|
|
|
|
| 369 |
return max(0, min(100, sc)), reasons
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
def analyze_statistics(text, sentences, words):
|
| 371 |
sl = [len(s) for s in sentences]
|
| 372 |
if len(sl) < 2: return {"score":50}
|
|
|
|
|
|
|
| 373 |
avg = sum(sl)/len(sl); std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
|
| 374 |
cv = std/avg if avg > 0 else 0
|
| 375 |
cv_score = 85 if cv<0.20 else 70 if cv<0.30 else 50 if cv<0.45 else 30 if cv<0.60 else 15
|
|
|
|
|
|
|
| 376 |
diffs = [abs(sl[i]-sl[i-1]) for i in range(1,len(sl))]
|
| 377 |
burst_score = 50
|
| 378 |
if diffs:
|
|
|
|
| 380 |
max_d = max(diffs)
|
| 381 |
burst_ratio = max_d / (avg_d + 1)
|
| 382 |
burst_score = 85 if burst_ratio < 1.8 else 65 if burst_ratio < 2.5 else 40 if burst_ratio < 3.5 else 20
|
|
|
|
|
|
|
| 383 |
standard_ratio = sum(1 for l in sl if 20 <= l <= 60) / len(sl)
|
| 384 |
std_score = 80 if standard_ratio > 0.8 else 60 if standard_ratio > 0.6 else 40 if standard_ratio > 0.4 else 20
|
|
|
|
| 385 |
extreme = sum(1 for l in sl if l < 10 or l > 80)
|
| 386 |
if extreme >= 2: std_score = max(10, std_score - 20)
|
| 387 |
elif extreme >= 1: std_score = max(15, std_score - 10)
|
|
|
|
|
|
|
| 388 |
wf = Counter(words); t = len(words)
|
| 389 |
ttr = len(wf)/t if t>0 else 0
|
| 390 |
vocab_score = 70 if ttr<0.45 else 55 if ttr<0.55 else 35 if ttr<0.65 else 20
|
|
|
|
|
|
|
| 391 |
wpc = [len(split_words(s)) for s in sentences]
|
| 392 |
complex_score = 50
|
| 393 |
if len(wpc) >= 3:
|
|
|
|
| 395 |
wpc_std = math.sqrt(sum((w-wpc_avg)**2 for w in wpc)/len(wpc))
|
| 396 |
wpc_cv = wpc_std/wpc_avg if wpc_avg > 0 else 0
|
| 397 |
complex_score = 80 if wpc_cv < 0.20 else 60 if wpc_cv < 0.35 else 35 if wpc_cv < 0.50 else 15
|
|
|
|
| 398 |
final = int(cv_score*0.20 + burst_score*0.20 + std_score*0.25 + vocab_score*0.15 + complex_score*0.20)
|
| 399 |
return {"score":final,"cv":round(cv,3),"ttr":round(ttr,3)}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
def analyze_korean_style(text, sentences, morphemes):
    """Score how strongly the Korean writing style resembles AI output (0-100).

    Blends five weighted sub-scores — sentence-ending formality, connective
    density/regularity, filler-phrase count, concession-pattern count, and a
    POS-based noun ratio — then subtracts a penalty for human-style markers.

    Args:
        text: Full input text.
        sentences: Pre-split sentence list (see split_sentences).
        morphemes: (form, tag) pairs from get_morphemes; may be empty.

    Returns:
        dict with "score" plus diagnostic fields: formal-ending ratio,
        connective density, filler count, and human-marker count.
    """
    if not sentences: return {"score":50}
    # Ratio of sentences ending in formal (-습니다 family) vs. casual-AI endings.
    formal_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_ENDINGS))
    casual_ai = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
    fr = formal_cnt/len(sentences)
    car = casual_ai/len(sentences)
    ending_score = 85 if fr>0.7 else 65 if fr>0.5 else 45 if fr>0.3 else 25 if fr>0.1 else 10
    ending_score = min(90, ending_score + int(car * 25))  # bonus for casual-AI endings
    # Indices of sentences that begin with an AI-typical connective.
    conn_positions = []
    for i, s in enumerate(sentences):
        for c in AI_CONNS:
            if s.strip().startswith(c): conn_positions.append(i); break
    conn_density = len(conn_positions)/len(sentences) if sentences else 0
    conn_score = 85 if conn_density>0.4 else 65 if conn_density>0.25 else 40 if conn_density>0.1 else 15
    if len(conn_positions) >= 2:
        # Coefficient of variation of the gaps between connective sentences;
        # the +0.01 guards against division by zero for identical positions.
        gaps = [conn_positions[i]-conn_positions[i-1] for i in range(1,len(conn_positions))]
        gap_cv = (math.sqrt(sum((g-sum(gaps)/len(gaps))**2 for g in gaps)/len(gaps))/(sum(gaps)/len(gaps)+0.01))
        if gap_cv < 0.5: conn_score = min(90, conn_score + 10)  # very regular spacing → AI bonus
    # Filler phrases: each listed phrase counts at most once (substring match).
    filler_cnt = sum(1 for f in AI_FILLER if f in text) + sum(1 for f in AI_CASUAL_FILLER if f in text)
    filler_score = 90 if filler_cnt>=6 else 75 if filler_cnt>=4 else 55 if filler_cnt>=2 else 30 if filler_cnt>=1 else 10
    # Concession-then-assertion rhetorical pattern (AI_CONCESSION regex).
    concession_cnt = len(AI_CONCESSION.findall(text))
    conc_score = 80 if concession_cnt >= 2 else 55 if concession_cnt >= 1 else 20
    # Human-style markers (HUMAN_MARKERS is a name→compiled-regex dict defined
    # elsewhere in this module) lower the final score by up to 35 points.
    human_count = sum(len(p.findall(text)) for p in HUMAN_MARKERS.values())
    human_penalty = min(35, human_count * 8)
    # Noun ratio from POS tags; high noun density reads as formal/AI-like prose.
    pos_score = 45
    if morphemes:
        pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
        noun_r = sum(pc.get(t,0) for t in ['NNG','NNP','NNB','NR'])/tm if tm else 0
        pos_score = 70 if noun_r>0.42 else 55 if noun_r>0.38 else 35 if noun_r>0.32 else 20
    # Weighted blend (weights sum to 1.0, incl. a constant 10-point base term),
    # floored at 5 after the human-marker penalty.
    final = max(5, int(ending_score*0.25 + conn_score*0.20 + filler_score*0.20 +
                       conc_score*0.10 + pos_score*0.15 + 10*0.10) - human_penalty)
    return {"score":final,"formal":f"{fr:.0%}","conn":f"{conn_density:.2f}","filler":filler_cnt,"human":human_count}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
def analyze_repetition(text, sentences, words):
|
| 433 |
if not sentences or len(sentences) < 2: return {"score":35}
|
|
|
|
|
|
|
| 434 |
tr = 0
|
| 435 |
if len(words)>=5:
|
| 436 |
tg = Counter(tuple(words[i:i+3]) for i in range(len(words)-2))
|
| 437 |
tr = sum(1 for c in tg.values() if c>1)/len(tg) if tg else 0
|
| 438 |
ngram_score = 80 if tr>0.15 else 60 if tr>0.08 else 35 if tr>0.03 else 15
|
|
|
|
|
|
|
| 439 |
openers_2 = []
|
| 440 |
openers_3 = []
|
| 441 |
for s in sentences:
|
| 442 |
ws = split_words(s)
|
| 443 |
if len(ws) >= 2: openers_2.append(tuple(ws[:2]))
|
| 444 |
if len(ws) >= 3: openers_3.append(tuple(ws[:3]))
|
|
|
|
| 445 |
opener2_score = 50
|
| 446 |
if openers_2:
|
| 447 |
unique2 = len(set(openers_2))/len(openers_2)
|
| 448 |
opener2_score = 80 if unique2 < 0.5 else 60 if unique2 < 0.7 else 35 if unique2 < 0.85 else 15
|
|
|
|
|
|
|
| 449 |
ai_only_conns = ['또한','따라서','그러므로','이에 따라','더불어','아울러','뿐만 아니라',
|
| 450 |
'이를 통해','이에','결과적으로','궁극적으로','나아가','이러한']
|
| 451 |
cr = sum(1 for s in sentences if any(s.strip().startswith(c) for c in ai_only_conns))
|
| 452 |
crr = cr/len(sentences) if sentences else 0
|
| 453 |
ai_conn_score = 85 if crr>0.35 else 65 if crr>0.2 else 40 if crr>0.08 else 15
|
|
|
|
|
|
|
| 454 |
templates = []
|
| 455 |
for s in sentences:
|
| 456 |
ws = split_words(s)
|
| 457 |
if len(ws) >= 4:
|
|
|
|
| 458 |
templates.append((ws[0], ws[-1]))
|
| 459 |
template_rep = 0
|
| 460 |
if templates:
|
| 461 |
tc = Counter(templates)
|
| 462 |
template_rep = sum(1 for c in tc.values() if c > 1) / len(tc) if tc else 0
|
| 463 |
template_score = 80 if template_rep > 0.3 else 55 if template_rep > 0.1 else 25
|
|
|
|
|
|
|
| 464 |
endings = []
|
| 465 |
for s in sentences:
|
| 466 |
sr = s.rstrip('.!?。')
|
|
|
|
| 470 |
if endings:
|
| 471 |
unique_e = len(set(endings))/len(endings)
|
| 472 |
ending_div = 80 if unique_e < 0.3 else 60 if unique_e < 0.5 else 35 if unique_e < 0.7 else 15
|
|
|
|
| 473 |
final = int(ngram_score*0.15 + opener2_score*0.20 + ai_conn_score*0.25 +
|
| 474 |
template_score*0.15 + ending_div*0.25)
|
| 475 |
return {"score":final}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
# Vague abstract modifiers with no concrete referent (AI "hedging" adjectives).
AI_VAGUE = re.compile(r'다양한|중요한|긍정적인|부정적인|획기적인|혁신적인|효율적인|체계적인|종합적인|전반적인|지속적인|적극적인|상당한|주요한')
# Concreteness signals used as counter-evidence: well-known proper nouns plus
# Korean patterns for universities, hospitals, and administrative place names.
CONCRETE_PROPER = re.compile(r'삼성|LG|현대|SK|카카오|네이버|넷플릭스|구글|애플|테슬라|아마존|마이크로소프트|[가-힣]{2,}대학|[가-힣]{2,}병원|[가-힣]{1,3}시[는을이가]|[가-힣]{1,3}구[는을이가]|[가-힣]{2,}동[에서]')
# Concreteness signals: multi-digit figures with units, percentages, dates.
CONCRETE_NUMBER = re.compile(r'\d{2,}[만억조원달러%개년월일위등호]|\d+\.\d+%|\d{4}년|\d{1,2}월')
# Concreteness signals: citation/attribution phrasing (reports, surveys, quotes).
CONCRETE_QUOTE = re.compile(r'에 따르면|발표했|밝혔다|보도했|전했다|라고 말|인터뷰|설문|조사|통계청|보고서')
|
|
|
|
| 480 |
def analyze_structure(text, sentences):
|
| 481 |
if not sentences: return {"score":35}
|
|
|
|
|
|
|
| 482 |
lt = (len(re.findall(r'^\d+[.)]\s',text,re.M)) + len(re.findall(r'^[-•*]\s',text,re.M)) +
|
| 483 |
len(re.findall(r'^#+\s',text,re.M)) + len(re.findall(r'\*\*[^*]+\*\*',text)))
|
| 484 |
list_score = 90 if lt>=5 else 70 if lt>=3 else 45 if lt>=1 else 10
|
|
|
|
|
|
|
| 485 |
vague_cnt = len(AI_VAGUE.findall(text))
|
| 486 |
proper_cnt = len(CONCRETE_PROPER.findall(text))
|
| 487 |
number_cnt = len(CONCRETE_NUMBER.findall(text))
|
| 488 |
quote_cnt = len(CONCRETE_QUOTE.findall(text))
|
| 489 |
concrete_total = proper_cnt + number_cnt + quote_cnt
|
|
|
|
| 490 |
if vague_cnt >= 3 and concrete_total == 0: abstract_score = 90
|
| 491 |
elif vague_cnt >= 2 and concrete_total <= 1: abstract_score = 70
|
| 492 |
elif vague_cnt >= 1 and concrete_total == 0: abstract_score = 55
|
|
|
|
| 494 |
elif concrete_total >= 2: abstract_score = 20
|
| 495 |
elif concrete_total >= 1: abstract_score = 30
|
| 496 |
else: abstract_score = 45
|
|
|
|
|
|
|
| 497 |
has_question = any(s.strip().endswith('?') for s in sentences)
|
| 498 |
has_exclaim = any(s.strip().endswith('!') for s in sentences)
|
| 499 |
has_ellipsis = any('...' in s or '…' in s for s in sentences)
|
| 500 |
variety = sum([has_question, has_exclaim, has_ellipsis])
|
| 501 |
type_score = 15 if variety >= 2 else 40 if variety >= 1 else 65
|
|
|
|
|
|
|
| 502 |
puncts = re.findall(r'[!?,;:…—\-~]', text)
|
| 503 |
unique_punct = len(set(puncts))
|
| 504 |
punct_score = 65 if unique_punct <= 1 else 45 if unique_punct <= 3 else 20
|
|
|
|
|
|
|
| 505 |
paras = [p.strip() for p in text.split('\n\n') if p.strip()]
|
| 506 |
para_score = 35
|
| 507 |
if len(paras) >= 2:
|
|
|
|
| 512 |
para_score = 75 if pcv < 0.2 else 55 if pcv < 0.35 else 30
|
| 513 |
if len(paras) >= 3 and pl[0] < avg_p and pl[-1] < avg_p:
|
| 514 |
para_score = min(85, para_score + 10)
|
|
|
|
| 515 |
final = int(list_score*0.10 + abstract_score*0.40 + type_score*0.20 + punct_score*0.10 + para_score*0.20)
|
| 516 |
return {"score":final}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
def analyze_model_fingerprint(text, sentences):
|
| 518 |
ms = {}
|
| 519 |
sl = text.lower()
|
|
|
|
| 524 |
em = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in fp.get("e",[])))
|
| 525 |
if sentences: sc += int((em/len(sentences))*20)
|
| 526 |
ms[mn] = min(100,sc)
|
|
|
|
|
|
|
| 527 |
general_ai = 0
|
|
|
|
| 528 |
general_ai += sum(5 for f in AI_CASUAL_FILLER if f in text)
|
|
|
|
| 529 |
casual_end_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
|
| 530 |
general_ai += casual_end_cnt * 5
|
|
|
|
| 531 |
general_ai += len(AI_CONCESSION.findall(text)) * 8
|
| 532 |
ms["비격식AI"] = min(100, general_ai)
|
|
|
|
|
|
|
| 533 |
en_score = sum(5 for em in EN_AI_MARKERS if em in sl)
|
| 534 |
ms["영어AI"] = min(100, en_score)
|
|
|
|
| 535 |
mx = max(ms.values()) if ms else 0
|
|
|
|
| 536 |
multi = sum(1 for v in ms.values() if v >= 10)
|
| 537 |
multi_bonus = 10 if multi >= 3 else 5 if multi >= 2 else 0
|
|
|
|
| 538 |
base = 85 if mx>=50 else 65 if mx>=35 else 45 if mx>=20 else 25 if mx>=10 else 10
|
| 539 |
return {"score":min(95, base + multi_bonus),"model_scores":{k:v for k,v in ms.items() if k not in ("비격식AI","영어AI") or v > 0}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
def analyze_perplexity(text, sentences, morphemes):
|
| 541 |
"""한국어 특화 Perplexity + Burstiness — 문자 엔트로피 보정"""
|
| 542 |
if len(sentences) < 2: return {"score": 40, "entropy": 0, "variance": 0, "order": 0, "zipf": 0}
|
|
|
|
|
|
|
| 543 |
chars = [c for c in text if c.strip()]
|
| 544 |
char_score = 45
|
| 545 |
if len(chars) >= 30:
|
|
|
|
| 547 |
cb_freq = Counter(cbigrams)
|
| 548 |
total_cb = len(cbigrams)
|
| 549 |
char_entropy = -sum((cnt/total_cb)*math.log2(cnt/total_cb) for cnt in cb_freq.values())
|
|
|
|
| 550 |
if char_entropy < 7.5: char_score = 78
|
| 551 |
elif char_entropy < 8.5: char_score = 62
|
| 552 |
elif char_entropy < 9.5: char_score = 42
|
| 553 |
elif char_entropy < 10.5: char_score = 25
|
| 554 |
else: char_score = 12
|
|
|
|
|
|
|
| 555 |
sl = [len(s) for s in sentences]
|
| 556 |
burst_score = 45
|
| 557 |
if len(sl) >= 3:
|
|
|
|
| 563 |
elif cv < 0.40: burst_score = 38
|
| 564 |
elif cv < 0.60: burst_score = 20
|
| 565 |
else: burst_score = 8
|
|
|
|
|
|
|
| 566 |
sent_ttr = []
|
| 567 |
for s in sentences:
|
| 568 |
sw = split_words(s)
|
|
|
|
| 576 |
elif std_ttr < 0.08: ttr_score = 55
|
| 577 |
elif std_ttr < 0.15: ttr_score = 35
|
| 578 |
else: ttr_score = 15
|
|
|
|
|
|
|
| 579 |
endings = [s.rstrip('.!?\u2026')[-3:] for s in sentences if len(s) >= 5]
|
| 580 |
end_score = 40
|
| 581 |
if len(endings) >= 3:
|
|
|
|
| 587 |
elif norm_ent < 0.7: end_score = 50
|
| 588 |
elif norm_ent < 0.85: end_score = 32
|
| 589 |
else: end_score = 15
|
|
|
|
| 590 |
final = int(char_score * 0.30 + burst_score * 0.30 + ttr_score * 0.20 + end_score * 0.20)
|
| 591 |
return {"score": final, "entropy": char_score, "variance": burst_score, "order": ttr_score, "zipf": end_score}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
# Humanizer/paraphraser residue patterns used by analyze_humanizer:
# near-synonymous "utilize/apply/perform"-style verbs over-substituted by
# rewriting tools.
HUMANIZER_OVERSUBST = re.compile(r'활용하다|이용하다|사용하다|적용하다|도입하다|채택하다|수행하다|진행하다|실시하다|실행하다')
# Awkward demonstratives/anaphora ("the aforementioned", "the said item").
HUMANIZER_AWKWARD = re.compile(r'그것은|이것은|저것은|해당 사항|앞서 언급한|전술한|상기한|기술된')
# Double-passive constructions (되어지-, -되었-) typical of machine rewriting.
HUMANIZER_PASSIVE = re.compile(r'되어지[고는며]|하게 되었[다습]|수행되[었어]|진행되[었어]|실시되[었어]|활용되[었어]')
|
|
|
|
| 595 |
def analyze_humanizer(text, sentences, words, morphemes):
|
| 596 |
"""Humanizer/Bypasser 탐지 — AI 원문 패러프레이즈 흔적 분석"""
|
| 597 |
if len(sentences) < 2: return {"score": 20, "signals": []}
|
| 598 |
signals = []
|
|
|
|
|
|
|
|
|
|
| 599 |
content_words = [f for f, t in morphemes if t in ('NNG', 'NNP', 'VV', 'VA')]
|
| 600 |
if len(content_words) >= 10:
|
| 601 |
cw_freq = Counter(content_words)
|
|
|
|
| 602 |
hapax = sum(1 for c in cw_freq.values() if c == 1)
|
| 603 |
hapax_ratio = hapax / len(cw_freq) if cw_freq else 0
|
|
|
|
| 604 |
if hapax_ratio > 0.95 and len(content_words) >= 30:
|
| 605 |
signals.append(("동의어과다치환", 20, "핵심 어휘가 과도하게 분산"))
|
| 606 |
elif hapax_ratio > 0.90 and len(content_words) >= 25:
|
| 607 |
signals.append(("동의어치환의심", 12, "어휘 반복 회피 패턴"))
|
|
|
|
|
|
|
|
|
|
| 608 |
sl = [len(s) for s in sentences]
|
| 609 |
if len(sl) >= 4:
|
| 610 |
avg = sum(sl) / len(sl)
|
| 611 |
cv = math.sqrt(sum((l - avg)**2 for l in sl) / len(sl)) / (avg + 1e-10)
|
|
|
|
| 612 |
unique_ratio = len(set(words)) / len(words) if words else 0
|
| 613 |
if cv < 0.20 and unique_ratio > 0.80 and len(sentences) >= 5:
|
| 614 |
signals.append(("구조보존어휘변경", 18, "문장 구조 균일 + 비정상적 어휘 다양성"))
|
|
|
|
|
|
|
|
|
|
| 615 |
residual = 0
|
|
|
|
| 616 |
conn_positions = []
|
| 617 |
for i, s in enumerate(sentences):
|
| 618 |
stripped = s.strip()
|
|
|
|
| 621 |
conn_positions.append(i)
|
| 622 |
break
|
| 623 |
if len(conn_positions) >= 2:
|
|
|
|
| 624 |
gaps = [conn_positions[i] - conn_positions[i-1] for i in range(1, len(conn_positions))]
|
| 625 |
if gaps and max(gaps) - min(gaps) <= 1: # 거의 등간격
|
| 626 |
signals.append(("접속사등간격잔존", 15, "접속사 배치가 규칙적 (AI 원문 구조 잔존)"))
|
| 627 |
residual += 15
|
|
|
|
|
|
|
| 628 |
oversubst = len(HUMANIZER_OVERSUBST.findall(text))
|
| 629 |
awkward = len(HUMANIZER_AWKWARD.findall(text))
|
| 630 |
passive = len(HUMANIZER_PASSIVE.findall(text))
|
|
|
|
| 634 |
signals.append(("어색한지시어", 10, f"해당/전술/상기 등 {awkward}개"))
|
| 635 |
if passive >= 3:
|
| 636 |
signals.append(("이중피동과다", 15, f"되어지/수행되 등 {passive}개"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
endings = [s.rstrip('.!?')[-2:] for s in sentences if len(s) >= 4]
|
| 638 |
end_types = len(set(endings)) / len(endings) if endings else 0
|
| 639 |
has_question = any(s.strip().endswith('?') for s in sentences)
|
|
|
|
| 641 |
sent_type_variety = sum([has_question, has_exclaim])
|
| 642 |
if sent_type_variety == 0 and end_types > 0.85 and len(sentences) >= 6:
|
| 643 |
signals.append(("유형단조어미다양", 12, "서술문만 + 종결어미 과다 다양 = Humanizer 패턴"))
|
|
|
|
|
|
|
|
|
|
| 644 |
starters = [s.strip()[:3] for s in sentences if len(s) >= 6]
|
| 645 |
starter_unique = len(set(starters)) / len(starters) if starters else 0
|
| 646 |
if starter_unique >= 0.98 and len(sentences) >= 7:
|
| 647 |
signals.append(("문두과다다양", 8, "모든 문장 시작이 다름 (자연스럽지 않은 다양성)"))
|
|
|
|
| 648 |
total = sum(s[1] for s in signals)
|
|
|
|
| 649 |
if total >= 45: score = 85
|
| 650 |
elif total >= 30: score = 68
|
| 651 |
elif total >= 20: score = 52
|
| 652 |
elif total >= 10: score = 35
|
| 653 |
else: score = 15
|
|
|
|
| 654 |
return {"score": score, "signals": signals, "total_evidence": total}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
MODEL_PROFILES = {
|
| 656 |
"GPT": {
|
| 657 |
"style": ["격식체 ~습니다", "또한/특히 접속사", "~에 대해", "~것으로 예상됩니다"],
|
|
|
|
| 682 |
"connectors": ["한편", "또한", "이에"],
|
| 683 |
},
|
| 684 |
}
|
|
|
|
| 685 |
def estimate_model(text, sentences, morphemes, model_scores):
|
| 686 |
"""AI 모델 추정 — 복합 증거 기반"""
|
| 687 |
evidence = {m: {"score": 0, "reasons": []} for m in MODEL_PROFILES}
|
|
|
|
| 688 |
sl = text.lower()
|
|
|
|
| 689 |
for model, profile in MODEL_PROFILES.items():
|
|
|
|
| 690 |
fp_score = model_scores.get(model, 0)
|
| 691 |
evidence[model]["score"] += fp_score * 0.4
|
| 692 |
if fp_score >= 20:
|
| 693 |
evidence[model]["reasons"].append(f"지문 매칭 {fp_score}점")
|
|
|
|
|
|
|
| 694 |
marker_cnt = sum(1 for m in profile["markers"] if m in text)
|
| 695 |
if marker_cnt >= 2:
|
| 696 |
evidence[model]["score"] += marker_cnt * 8
|
| 697 |
evidence[model]["reasons"].append(f"특유 표현 {marker_cnt}개")
|
|
|
|
|
|
|
| 698 |
end_match = 0
|
| 699 |
for s in sentences:
|
| 700 |
for e in profile["endings"]:
|
|
|
|
| 705 |
if end_ratio > 0.7:
|
| 706 |
evidence[model]["score"] += 12
|
| 707 |
evidence[model]["reasons"].append(f"종결어미 {end_ratio:.0%} 일치")
|
|
|
|
|
|
|
| 708 |
conn_match = sum(1 for s in sentences if any(s.strip().startswith(c) for c in profile["connectors"]))
|
| 709 |
if conn_match >= 2:
|
| 710 |
evidence[model]["score"] += conn_match * 4
|
| 711 |
evidence[model]["reasons"].append(f"접속사 패턴 {conn_match}회")
|
|
|
|
|
|
|
| 712 |
number_citations = len(re.findall(r'\d+[%만억조]|에 따르면|것으로 나타났|발표했', text))
|
| 713 |
if number_citations >= 3:
|
| 714 |
evidence["Perplexity"]["score"] += number_citations * 5
|
| 715 |
evidence["Perplexity"]["reasons"].append(f"수치/인용 {number_citations}회")
|
|
|
|
|
|
|
| 716 |
concession_cnt = len(AI_CONCESSION.findall(text))
|
| 717 |
if concession_cnt >= 1:
|
| 718 |
evidence["Claude"]["score"] += concession_cnt * 10
|
| 719 |
evidence["Claude"]["reasons"].append(f"양보-주장 구문 {concession_cnt}회")
|
|
|
|
|
|
|
| 720 |
ranked = sorted(evidence.items(), key=lambda x: x[1]["score"], reverse=True)
|
| 721 |
top = ranked[0]
|
| 722 |
second = ranked[1] if len(ranked) > 1 else None
|
|
|
|
| 723 |
if top[1]["score"] < 10:
|
| 724 |
return {"model": "특정 불가", "confidence": "낮음", "detail": evidence, "ranked": ranked}
|
|
|
|
|
|
|
| 725 |
gap = top[1]["score"] - (second[1]["score"] if second else 0)
|
| 726 |
if gap >= 20 and top[1]["score"] >= 30:
|
| 727 |
conf = "높음"
|
|
|
|
| 729 |
conf = "중간"
|
| 730 |
else:
|
| 731 |
conf = "낮음"
|
|
|
|
| 732 |
return {
|
| 733 |
"model": top[0],
|
| 734 |
"confidence": conf,
|
|
|
|
| 737 |
"detail": evidence,
|
| 738 |
"ranked": ranked
|
| 739 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
def analyze_quality(text, sentences, words, morphemes):
|
| 741 |
qs = {}; sl = [len(s) for s in sentences]; tw = len(words)
|
| 742 |
ideal = sum(1 for l in sl if 15<=l<=70)/len(sentences) if sentences else 0
|
|
|
|
| 767 |
total = int(sum(qs[k]*wq[k] for k in wq))
|
| 768 |
grade = "S" if total>=85 else "A" if total>=72 else "B" if total>=58 else "C" if total>=42 else "D" if total>=28 else "F"
|
| 769 |
return {"score":total,"grade":grade,"sub_scores":qs}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
# (Groq model id, display name) pairs queried in turn by llm_cross_check.
LLM_JUDGES = [("openai/gpt-oss-120b","GPT-OSS 120B"),("qwen/qwen3-32b","Qwen3 32B"),("moonshotai/kimi-k2-instruct-0905","Kimi-K2")]
|
|
|
|
| 771 |
def _parse_ai_probability(raw_resp):
|
| 772 |
"""LLM 응답에서 AI 확률(0~100)을 추출. 한국어/영어 다양한 형식 대응."""
|
| 773 |
if not raw_resp: return -1
|
|
|
|
| 774 |
think_content = ''
|
| 775 |
think_m = re.search(r'<think>(.*?)</think>', raw_resp, flags=re.S)
|
| 776 |
if think_m: think_content = think_m.group(1)
|
| 777 |
resp = re.sub(r'<think>.*?</think>', '', raw_resp, flags=re.S).strip()
|
| 778 |
if not resp or len(resp) < 5:
|
| 779 |
resp = raw_resp # think만 있으면 원본 포함
|
|
|
|
|
|
|
| 780 |
specific_patterns = [
|
| 781 |
r'AI\s*확률\s*[::]\s*(?:약\s*)?(\d+)\s*%?',
|
| 782 |
r'AI\s*[Pp]robability\s*[::]\s*(?:about|approximately?\s*)?(\d+)\s*%?',
|
|
|
|
| 792 |
if m:
|
| 793 |
v = int(m.group(1))
|
| 794 |
if 0 <= v <= 100: return v
|
|
|
|
|
|
|
| 795 |
lines = [l.strip() for l in resp.strip().split('\n') if l.strip()]
|
| 796 |
for line in reversed(lines[-5:]):
|
|
|
|
| 797 |
if re.search(r'AI|확률|[Pp]robab|신뢰|판[정단]', line):
|
| 798 |
nums = re.findall(r'(\d+)\s*%', line)
|
| 799 |
if nums:
|
|
|
|
| 803 |
if nums:
|
| 804 |
v = int(nums[-1])
|
| 805 |
if 0 <= v <= 100: return v
|
|
|
|
|
|
|
| 806 |
all_pcts = list(re.finditer(r'(\d+)\s*(?:%|퍼센트|percent)', resp, re.I))
|
| 807 |
for m in reversed(all_pcts):
|
| 808 |
v = int(m.group(1))
|
|
|
|
| 809 |
ctx_start = max(0, m.start()-50)
|
| 810 |
ctx = resp[ctx_start:m.end()+20]
|
| 811 |
if re.search(r'AI|확률|[Pp]robab|신뢰|판[정단]|가능성|likelihood', ctx, re.I):
|
| 812 |
if 0 <= v <= 100: return v
|
|
|
|
|
|
|
| 813 |
if all_pcts:
|
| 814 |
v = int(all_pcts[-1].group(1))
|
| 815 |
if 5 <= v <= 99: return v # 100% 제외 (통계 수치 오탐 방지)
|
|
|
|
|
|
|
| 816 |
if think_content:
|
| 817 |
for pat in specific_patterns:
|
| 818 |
m = re.search(pat, think_content, re.I)
|
| 819 |
if m:
|
| 820 |
v = int(m.group(1))
|
| 821 |
if 0 <= v <= 100: return v
|
|
|
|
| 822 |
think_pcts = re.findall(r'(\d+)\s*%', think_content)
|
| 823 |
if think_pcts:
|
| 824 |
v = int(think_pcts[-1])
|
| 825 |
if 5 <= v <= 99: return v
|
|
|
|
| 826 |
return -1
|
|
|
|
| 827 |
def llm_cross_check(text):
|
| 828 |
if not GROQ_KEY: return {"score":-1,"detail":{}}
|
|
|
|
| 829 |
prompt = f"""Analyze whether this text was written by AI.
|
|
|
|
| 830 |
[Instructions]
|
| 831 |
1. Determine AI vs Human with 3 brief reasons
|
| 832 |
2. IMPORTANT - Your LAST line MUST be exactly this format:
|
| 833 |
AI확률: XX%
|
| 834 |
(Replace XX with your estimated probability 0-100)
|
|
|
|
| 835 |
Example of correct last line:
|
| 836 |
AI확률: 75%
|
|
|
|
| 837 |
[Text to analyze]
|
| 838 |
{text[:2000]}"""
|
|
|
|
| 839 |
votes=[]; rpt={}
|
| 840 |
for mid,mn in LLM_JUDGES:
|
| 841 |
resp,err = call_groq(mid,prompt)
|
|
|
|
| 844 |
if p >= 0:
|
| 845 |
votes.append(p); rpt[mn]=f"{p}%"
|
| 846 |
else:
|
|
|
|
| 847 |
cleaned = re.sub(r'<think>.*?</think>', '', resp, flags=re.S).strip()
|
| 848 |
tail = cleaned[-60:].replace('\n',' ') if len(cleaned) > 60 else cleaned.replace('\n',' ')
|
| 849 |
rpt[mn]=f"파싱실패({tail[:40]})"
|
| 850 |
else: rpt[mn]=f"ERR:{err[:30] if err else '?'}"
|
| 851 |
if votes: return {"score":int(sum(votes)/len(votes)),"detail":rpt}
|
| 852 |
return {"score":-1,"detail":rpt}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
def compute_verdict(scores, llm_score=-1, sent_avg=-1, ppx_score=-1, hum_score=-1):
|
| 854 |
w={"통계":.06,"문체":.25,"반복성":.10,"구조":.12,"지문":.30}
|
| 855 |
ws=sum(scores[k]*w[k] for k in w)
|
|
|
|
|
|
|
| 856 |
if ppx_score >= 0: ws += ppx_score * 0.17
|
|
|
|
|
|
|
| 857 |
style=scores["문체"]; fp=scores["지문"]; rep=scores["반복성"]; struct=scores["구조"]
|
| 858 |
if style>=35 and fp>=35: ws+=8
|
| 859 |
elif style>=30 and fp>=25: ws+=4
|
| 860 |
if style>=30 and rep>=25 and fp>=20: ws+=4
|
| 861 |
if fp>=45: ws+=3
|
| 862 |
if struct>=50 and style>=30: ws+=3
|
|
|
|
| 863 |
if ppx_score>=55 and fp>=35: ws+=5
|
| 864 |
if ppx_score>=65 and style>=35: ws+=3
|
|
|
|
|
|
|
| 865 |
if hum_score>=50:
|
| 866 |
ws=max(ws, 45) # Humanizer 확인 → 최소 AI 의심 중간
|
| 867 |
ws += (hum_score-50)*0.15
|
|
|
|
|
|
|
| 868 |
if sent_avg>=0 and sent_avg>ws: ws=ws*0.80+sent_avg*0.20
|
|
|
|
| 869 |
hi=sum(1 for v in scores.values() if v>=50)
|
| 870 |
if hi>=4: ws+=8
|
| 871 |
elif hi>=3: ws+=5
|
| 872 |
elif hi>=2: ws+=2
|
|
|
|
|
|
|
| 873 |
if style<40 and fp<=20 and rep<22 and struct<35 and (ppx_score<0 or ppx_score<40):
|
| 874 |
ws-=5
|
|
|
|
| 875 |
lo=sum(1 for v in scores.values() if v<20)
|
| 876 |
if lo>=3: ws-=8
|
| 877 |
elif lo>=2: ws-=3
|
|
|
|
| 882 |
if fs>=45: return fs,"AI 의심 중간","ai_low"
|
| 883 |
if fs>=30: return fs,"판단 유보","uncertain"
|
| 884 |
return fs,"인간 작성 추정","human"
|
|
|
|
| 885 |
def quick_score(text):
|
| 886 |
sents=split_sentences(text); words=split_words(text); morphs=get_morphemes(text)
|
| 887 |
sc={"통계":analyze_statistics(text,sents,words)["score"],"문체":analyze_korean_style(text,sents,morphs)["score"],
|
|
|
|
| 893 |
hum=analyze_humanizer(text,sents,words,morphs)
|
| 894 |
fs,v,lv=compute_verdict(sc, sent_avg=sent_avg, ppx_score=ppx["score"], hum_score=hum["score"])
|
| 895 |
return fs,v,lv,sc,ppx,hum
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
def brave_search(query, count=5):
|
| 897 |
"""Brave Search API — 단일 쿼리"""
|
| 898 |
if not BRAVE_KEY: return []
|
|
|
|
| 908 |
return results
|
| 909 |
except: pass
|
| 910 |
return []
|
|
|
|
| 911 |
def search_kci(query):
|
| 912 |
"""KCI(한국학술지인용색인) 검색"""
|
| 913 |
try:
|
|
|
|
| 920 |
return results[:3]
|
| 921 |
except: pass
|
| 922 |
return []
|
|
|
|
| 923 |
def search_riss(query):
|
| 924 |
"""RISS(학술연구정보서비스) — 간접 검색"""
|
| 925 |
results = []
|
|
|
|
| 933 |
results.append({"title": title, "url": "https://www.riss.kr" + m.group(1), "snippet": "", "source": "RISS"})
|
| 934 |
except: pass
|
| 935 |
return results[:3]
|
|
|
|
| 936 |
def search_arxiv(query):
|
| 937 |
"""arXiv API 검색"""
|
| 938 |
results = []
|
|
|
|
| 948 |
except Exception as e:
|
| 949 |
pass
|
| 950 |
return results[:3]
|
|
|
|
| 951 |
def gemini_plagiarism_check(text_chunk):
|
| 952 |
"""Gemini + Google Search Grounding으로 표절 검사"""
|
| 953 |
if not HAS_GENAI or not GEMINI_KEY: return None
|
|
|
|
| 957 |
prompt = f"""다음 텍스트가 인터넷에 존재하는지 Google Search로 확인하세요.
|
| 958 |
유사한 문장이 발견되면 출처 URL과 유사도(%)를 보고하세요.
|
| 959 |
마지막 줄에 "유사도: XX%" 형식으로 작성.
|
|
|
|
| 960 |
[텍스트]
|
| 961 |
{text_chunk[:1000]}"""
|
| 962 |
resp = client.models.generate_content(
|
|
|
|
| 977 |
return {"pct": pct, "response": text_resp, "sources": sources}
|
| 978 |
except Exception as e:
|
| 979 |
return {"pct": 0, "response": str(e)[:100], "sources": []}
|
|
|
|
| 980 |
def parallel_brave_search(queries, max_workers=10):
|
| 981 |
"""Brave Search 병렬 실행 (최대 20개)"""
|
| 982 |
all_results = {}
|
|
|
|
| 989 |
all_results[q] = results
|
| 990 |
except: all_results[q] = []
|
| 991 |
return all_results
|
|
|
|
| 992 |
def duckduckgo_search(query, max_results=5):
|
| 993 |
"""DuckDuckGo HTML 스크래핑 — API 키 불필요 폴백"""
|
| 994 |
results = []
|
|
|
|
| 1003 |
href = m.group(1)
|
| 1004 |
title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
|
| 1005 |
snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
|
|
|
|
| 1006 |
real_url = href
|
| 1007 |
if 'uddg=' in href:
|
| 1008 |
um = re.search(r'uddg=([^&]+)', href)
|
|
|
|
| 1012 |
if len(results) >= max_results: break
|
| 1013 |
except: pass
|
| 1014 |
return results
|
|
|
|
| 1015 |
def self_crawl_search(query, max_results=3):
|
| 1016 |
"""httpx 기반 자체 크롤링 (DuckDuckGo + 학술 사이트)"""
|
| 1017 |
all_results = []
|
|
|
|
| 1018 |
all_results.extend(duckduckgo_search(query, max_results))
|
|
|
|
| 1019 |
if '논문' not in query and 'paper' not in query.lower():
|
| 1020 |
all_results.extend(duckduckgo_search(f"{query} 논문 학술", 2))
|
| 1021 |
return all_results
|
|
|
|
| 1022 |
def run_plagiarism(text, progress=gr.Progress()):
|
| 1023 |
if not text or len(text.strip())<50:
|
| 1024 |
return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""
|
| 1025 |
text = text.strip()
|
| 1026 |
sents = split_sentences(text)
|
| 1027 |
now = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
|
|
| 1028 |
has_brave = bool(BRAVE_KEY)
|
| 1029 |
has_gemini = bool(HAS_GENAI and GEMINI_KEY)
|
|
|
|
| 1030 |
progress(0.05, "문장 분리...")
|
|
|
|
| 1031 |
blocks = []
|
| 1032 |
for i in range(0, len(sents), 4):
|
| 1033 |
block = ' '.join(sents[i:i+4])
|
| 1034 |
if len(block) > 20:
|
| 1035 |
blocks.append({"text": block, "sent_indices": list(range(i, min(i+4, len(sents))))})
|
|
|
|
| 1036 |
all_sources = []
|
| 1037 |
sent_matches = {i: [] for i in range(len(sents))} # 문장별 매칭 정보
|
| 1038 |
block_results = []
|
| 1039 |
log_lines = []
|
|
|
|
|
|
|
| 1040 |
if has_brave:
|
| 1041 |
progress(0.15, f"Brave Search 병렬 검색 ({len(blocks)}블록)...")
|
| 1042 |
queries = []
|
|
|
|
| 1053 |
sent_matches[si].append({"source": r["title"], "url": r["url"], "type": "Brave"})
|
| 1054 |
log_lines.append(f"Brave Search: {len(queries)}쿼리 → {sum(len(v) for v in brave_results.values())}건")
|
| 1055 |
else:
|
|
|
|
| 1056 |
progress(0.15, f"자체 웹 검색 ({len(blocks)}블록)...")
|
| 1057 |
crawl_queries = []
|
| 1058 |
for b in blocks[:10]: # 최대 10블록
|
|
|
|
| 1070 |
sent_matches[si].append({"source": r["title"], "url": r["url"], "type": r.get("source","Web")})
|
| 1071 |
except: pass
|
| 1072 |
log_lines.append(f"자체 웹검색: {len(crawl_queries)}쿼리 (DuckDuckGo)")
|
|
|
|
|
|
|
| 1073 |
progress(0.40, "학술 DB 검색 (KCI/RISS/arXiv)...")
|
|
|
|
| 1074 |
words = split_words(text)
|
| 1075 |
wf = Counter(words)
|
| 1076 |
keywords = [w for w, c in wf.most_common(20) if len(w) >= 2 and c >= 2][:5]
|
| 1077 |
kw_query = ' '.join(keywords[:3])
|
|
|
|
| 1078 |
academic_results = []
|
| 1079 |
with ThreadPoolExecutor(max_workers=3) as executor:
|
| 1080 |
futures = [
|
|
|
|
| 1089 |
all_sources.extend(results)
|
| 1090 |
except: pass
|
| 1091 |
log_lines.append(f"학술DB: KCI/RISS/arXiv → {len(academic_results)}건")
|
|
|
|
|
|
|
| 1092 |
gemini_results = []
|
| 1093 |
if has_gemini:
|
| 1094 |
progress(0.60, "Gemini + Google Search...")
|
|
|
|
| 1101 |
for si in b["sent_indices"]:
|
| 1102 |
sent_matches[si].append({"source": src.get("title",""), "url": src.get("url",""), "type": "Google"})
|
| 1103 |
log_lines.append(f"Gemini: {len(blocks[:5])}블록 → {sum(len(r.get('sources',[])) for r in gemini_results)}출처")
|
|
|
|
| 1104 |
progress(0.80, "보고서 생성...")
|
|
|
|
|
|
|
| 1105 |
matched_sents = sum(1 for si, matches in sent_matches.items() if matches)
|
| 1106 |
total_sents = len(sents)
|
| 1107 |
plag_pct = int(matched_sents / total_sents * 100) if total_sents > 0 else 0
|
|
|
|
|
|
|
| 1108 |
if gemini_results:
|
| 1109 |
gemini_pcts = [r["pct"] for r in gemini_results if r["pct"] > 0]
|
| 1110 |
if gemini_pcts:
|
| 1111 |
gemini_avg = sum(gemini_pcts) / len(gemini_pcts)
|
| 1112 |
plag_pct = int(plag_pct * 0.5 + gemini_avg * 0.5)
|
|
|
|
|
|
|
| 1113 |
seen_urls = set()
|
| 1114 |
unique_sources = []
|
| 1115 |
for s in all_sources:
|
|
|
|
| 1117 |
if url and url not in seen_urls:
|
| 1118 |
seen_urls.add(url)
|
| 1119 |
unique_sources.append(s)
|
|
|
|
|
|
|
| 1120 |
if plag_pct >= 50: grade, grade_color, grade_bg = "표절 의심", "#FF4444", "#FFE0E0"
|
| 1121 |
elif plag_pct >= 30: grade, grade_color, grade_bg = "주의 필요", "#FF8800", "#FFF0DD"
|
| 1122 |
elif plag_pct >= 15: grade, grade_color, grade_bg = "유사 표현 일부", "#DDAA00", "#FFFBE0"
|
| 1123 |
elif plag_pct >= 5: grade, grade_color, grade_bg = "양호", "#4ECDC4", "#E0FFF8"
|
| 1124 |
else: grade, grade_color, grade_bg = "우수 (원본성 높음)", "#22AA44", "#E0FFE8"
|
|
|
|
|
|
|
|
|
|
| 1125 |
sent_analysis = []
|
| 1126 |
for i, s in enumerate(sents):
|
| 1127 |
matches = sent_matches.get(i, [])
|
|
|
|
| 1131 |
else:
|
| 1132 |
sent_analysis.append({"idx":i, "text":s, "matched":False})
|
| 1133 |
sim_sents = [s for s in sent_analysis if s["matched"]]
|
|
|
|
|
|
|
| 1134 |
src_groups = {}
|
| 1135 |
for src in unique_sources:
|
| 1136 |
key = src.get("url","")[:80]
|
|
|
|
| 1138 |
src_groups[key] = {"title":src.get("title",""), "url":src.get("url",""), "source":src.get("source",""), "count":0}
|
| 1139 |
src_groups[key]["count"] += 1
|
| 1140 |
src_list = sorted(src_groups.values(), key=lambda x: -x["count"])
|
|
|
|
| 1141 |
methods_used = []
|
| 1142 |
if has_brave: methods_used.append("Brave Search(병렬)")
|
| 1143 |
elif all_sources: methods_used.append("DuckDuckGo(자체크롤링)")
|
| 1144 |
methods_used.append("KCI · RISS · arXiv")
|
| 1145 |
if has_gemini: methods_used.append("Gemini+Google Search")
|
| 1146 |
method_str = " + ".join(methods_used)
|
|
|
|
| 1147 |
gc = grade_color
|
| 1148 |
word_count = len(split_words(text))
|
| 1149 |
char_count = len(text)
|
| 1150 |
doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
|
| 1151 |
similarity_pct = plag_pct
|
| 1152 |
citation_pct = 0
|
|
|
|
|
|
|
| 1153 |
cat_suspect = len(sim_sents) # 의심
|
| 1154 |
cat_cited = 0 # 인용 (형식적 인용 감지)
|
| 1155 |
cat_normal = total_sents - cat_suspect - cat_cited # 일반
|
| 1156 |
cat_suspect_pct = int(cat_suspect / max(1, total_sents) * 100)
|
| 1157 |
cat_normal_pct = 100 - cat_suspect_pct
|
|
|
|
|
|
|
| 1158 |
def src_icon(s):
|
| 1159 |
src = s.get("source","").lower()
|
| 1160 |
if "kci" in src: return "📚", "KCI"
|
|
|
|
| 1163 |
if "google" in src: return "🔍", "Google"
|
| 1164 |
if "brave" in src: return "🌐", "Brave"
|
| 1165 |
return "🌐", "Web"
|
|
|
|
|
|
|
| 1166 |
src_rows = ""
|
| 1167 |
for i, sg in enumerate(src_list[:15]):
|
| 1168 |
pct = min(100, int(sg["count"] / max(1, total_sents) * 100 * 3))
|
|
|
|
| 1176 |
<td style="padding:6px 8px;font-size:12px;text-align:center;border:1px solid #D5D5D5;font-weight:800;color:#D63031;">{pct}%</td>
|
| 1177 |
<td style="padding:6px 10px;border:1px solid #D5D5D5;"><div style="background:#EDEDED;height:14px;border-radius:2px;overflow:hidden;"><div style="background:linear-gradient(90deg,#D63031,#FF7675);height:100%;width:{max(3,pct)}%;border-radius:2px;"></div></div></td>
|
| 1178 |
</tr>"""
|
|
|
|
|
|
|
| 1179 |
suspect_rows = ""
|
| 1180 |
for i, sa in enumerate(sim_sents[:15]):
|
| 1181 |
suspect_rows += f"""<tr>
|
|
|
|
| 1184 |
<td style="padding:8px;font-size:10px;line-height:1.6;border:1px solid #D5D5D5;vertical-align:top;color:#555;"><span style="background:#FFE0E0;border-bottom:2px solid #E74C3C;padding:1px 3px;">{sa["text"][:70]}...</span></td>
|
| 1185 |
<td style="padding:8px;font-size:10px;border:1px solid #D5D5D5;vertical-align:top;"><a href="{sa.get('url','#')}" target="_blank" style="color:#2E86C1;text-decoration:none;font-weight:600;">{sa["source"][:28]}</a><br><span style="font-size:8px;color:#AAA;">{sa.get('type','')}</span></td>
|
| 1186 |
</tr>"""
|
|
|
|
|
|
|
| 1187 |
full_hl = ""
|
| 1188 |
for sa in sent_analysis:
|
| 1189 |
sidx = sa["idx"] + 1
|
|
|
|
| 1191 |
full_hl += f'<span style="background:#FFD6D6;border-bottom:2px solid #E74C3C;padding:1px 2px;cursor:pointer;" title="[의심 #{sidx}] 출처: {sa.get("source","")}">{sa["text"]}</span> '
|
| 1192 |
else:
|
| 1193 |
full_hl += f'<span style="color:#333;">{sa["text"]}</span> '
|
|
|
|
|
|
|
| 1194 |
bar_suspect_w = max(2, cat_suspect_pct) if cat_suspect > 0 else 0
|
| 1195 |
bar_normal_w = 100 - bar_suspect_w
|
|
|
|
|
|
|
| 1196 |
HDR_BG = '#3B7DD8'
|
| 1197 |
HDR_BG2 = '#4A8DE0'
|
| 1198 |
TH = 'padding:8px 10px;font-size:10px;font-weight:700;color:#fff;background:{};text-align:center;border:1px solid {};'.format(HDR_BG, HDR_BG)
|
| 1199 |
TL = 'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
|
| 1200 |
TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
|
| 1201 |
SEC = 'font-size:13px;font-weight:800;color:#1A3C6E;margin:0 0 10px 0;padding:8px 12px;background:#EDF2FA;border-left:4px solid {};border-bottom:1px solid #D5D5D5;'.format(HDR_BG)
|
|
|
|
| 1202 |
html = f"""<div style="font-family:'Noto Sans KR','Malgun Gothic','Apple SD Gothic Neo',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid #3B7DD8;box-shadow:0 2px 12px rgba(0,0,0,0.08);">
|
|
|
|
| 1203 |
<!-- ═══════ 헤더 (CopyKiller 스타일) ═══════ -->
|
| 1204 |
<div style="background:linear-gradient(135deg,{HDR_BG},{HDR_BG2});padding:18px 24px;color:#fff;">
|
| 1205 |
<table style="width:100%;"><tr>
|
|
|
|
| 1213 |
</td>
|
| 1214 |
</tr></table>
|
| 1215 |
</div>
|
|
|
|
| 1216 |
<!-- ═══════ 제출 정보 테이블 ═══════ -->
|
| 1217 |
<div style="padding:16px 24px 0;">
|
| 1218 |
<div style="{SEC}">📋 검사 정보</div>
|
|
|
|
| 1237 |
</tr>
|
| 1238 |
</table>
|
| 1239 |
</div>
|
|
|
|
| 1240 |
<!-- ═══════ 검사 결과 (도넛 + 바) ═══════ -->
|
| 1241 |
<div style="padding:18px 24px 0;">
|
| 1242 |
<div style="{SEC}">📊 검사 결과</div>
|
|
|
|
| 1258 |
<span style="font-size:11px;font-weight:800;color:{gc};">{grade}</span>
|
| 1259 |
</div>
|
| 1260 |
</td>
|
|
|
|
| 1261 |
<!-- 상세 결과 -->
|
| 1262 |
<td style="padding:0;border:1px solid #D5D5D5;vertical-align:top;">
|
| 1263 |
<!-- 카테고리 바 (CopyKiller 핵심) -->
|
|
|
|
| 1273 |
<span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E8ECF0;border:1px solid #CCC;border-radius:2px;"></span> 일반 <b>{cat_normal}</b>건</span>
|
| 1274 |
</div>
|
| 1275 |
</div>
|
|
|
|
| 1276 |
<!-- 비율 바 3줄 -->
|
| 1277 |
<div style="padding:12px 18px;">
|
| 1278 |
<div style="margin-bottom:10px;">
|
|
|
|
| 1288 |
<div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#3498DB,#85C1E9);height:100%;width:{max(1,citation_pct)}%;transition:width 0.5s;"></div></div>
|
| 1289 |
</div>
|
| 1290 |
</div>
|
|
|
|
| 1291 |
<!-- 요약 수치 -->
|
| 1292 |
<div style="padding:8px 18px;background:#F8F9FB;border-top:1px solid #E8E8E8;">
|
| 1293 |
<table style="width:100%;border-collapse:collapse;">
|
|
|
|
| 1305 |
</tr>
|
| 1306 |
</table>
|
| 1307 |
</div>
|
|
|
|
| 1308 |
<!-- ═══════ 전체 텍스트 분석 ═══════ -->
|
| 1309 |
<div style="padding:18px 24px 0;">
|
| 1310 |
<div style="{SEC}">📝 전체 텍스트 분석</div>
|
|
|
|
| 1317 |
</div>
|
| 1318 |
<div style="padding:14px;background:#FAFBFC;border:1px solid #D5D5D5;line-height:2.1;font-size:13px;max-height:300px;overflow-y:auto;">{full_hl}</div>
|
| 1319 |
</div>
|
|
|
|
| 1320 |
<!-- ═══════ 표절 의심 출처 ═══════ -->
|
| 1321 |
<div style="padding:18px 24px 0;">
|
| 1322 |
<div style="{SEC}">🔗 표절 의심 출처 ({len(src_list)}건)</div>
|
|
|
|
| 1331 |
{src_rows if src_rows else '<tr><td colspan="5" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">발견된 유사 출처가 없습니다.</td></tr>'}
|
| 1332 |
</table>
|
| 1333 |
</div>
|
|
|
|
| 1334 |
<!-- ═══════ 의심 문장 비교 ═══════ -->
|
| 1335 |
<div style="padding:18px 24px 0;">
|
| 1336 |
<div style="{SEC}">⚠️ 의심 문장 비교 ({len(sim_sents)}건)</div>
|
|
|
|
| 1344 |
{suspect_rows if suspect_rows else '<tr><td colspan="4" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">유사 의심 문장이 발견되지 않았습니다.</td></tr>'}
|
| 1345 |
</table>
|
| 1346 |
</div>
|
|
|
|
| 1347 |
<!-- ═══════ 검사 안내 ═══════ -->
|
| 1348 |
<div style="margin:18px 24px;padding:12px 14px;background:#F5F8FC;border:1px solid #D0DAEA;border-radius:4px;font-size:9px;color:#555;line-height:1.8;">
|
| 1349 |
<b style="color:#333;">📌 검사 안내</b><br>
|
|
|
|
| 1352 |
· 유사도는 문장 단위 매칭 기반이며, 최종 판정은 교수자/검토자의 확인이 필요합니다.<br>
|
| 1353 |
· 인용 표기(따옴표, 각주 등)가 포함된 문장은 인용으로 분류될 수 있습니다.
|
| 1354 |
</div>
|
|
|
|
| 1355 |
<!-- ═══════ 푸터 (CopyKiller 스타일) ═══════ -->
|
| 1356 |
<div style="padding:10px 24px;background:#F0F3F8;border-top:2px solid {HDR_BG};display:flex;justify-content:space-between;align-items:center;">
|
| 1357 |
<div>
|
|
|
|
| 1364 |
</div>
|
| 1365 |
</div>
|
| 1366 |
</div>"""
|
|
|
|
| 1367 |
log = '\n'.join(log_lines) + f"\n\n종합: {plag_pct}% {grade} | 출처 {len(unique_sources)}건 | 유사문장 {matched_sents}/{total_sents}"
|
| 1368 |
return html, log
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1369 |
def run_detection(text, progress=gr.Progress()):
|
| 1370 |
if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""
|
| 1371 |
text=text.strip()
|
|
|
|
| 1382 |
sc={"통계":s1["score"],"문체":s2["score"],"반복성":s3["score"],"구조":s4["score"],"지문":s5["score"]}
|
| 1383 |
sent_scores=[score_sentence(s)[0] for s in sents]
|
| 1384 |
sent_avg=sum(sent_scores)/len(sent_scores) if sent_scores else -1
|
|
|
|
|
|
|
| 1385 |
ms_raw=s5.get("model_scores",{})
|
| 1386 |
model_est=estimate_model(text,sents,morphs,ms_raw)
|
|
|
|
| 1387 |
fs,verdict,level=compute_verdict(sc,lr["score"],sent_avg=sent_avg,ppx_score=ppx["score"],hum_score=hum["score"])
|
| 1388 |
progress(0.95)
|
| 1389 |
cm={"ai_high":("#FF4444","#FFE0E0","높음"),"ai_medium":("#FF8800","#FFF0DD","중간~높음"),"ai_low":("#DDAA00","#FFFBE0","중간"),"uncertain":("#888","#F0F0F0","낮음"),"human":("#22AA44","#E0FFE8","매우 낮음")}
|
| 1390 |
fg,bg,conf=cm.get(level,("#888","#F0F0F0","?"))
|
|
|
|
|
|
|
| 1391 |
est_model=model_est.get("model","특정 불가")
|
| 1392 |
est_conf=model_est.get("confidence","낮음")
|
| 1393 |
est_reasons=model_est.get("reasons",[])
|
|
|
|
| 1397 |
mt=f"{est_model} (참고)"
|
| 1398 |
else:
|
| 1399 |
mt="특정 불가"
|
|
|
|
| 1400 |
ai_sents=sum(1 for s in sent_scores if s>=40)
|
| 1401 |
human_sents=sum(1 for s in sent_scores if s<20)
|
|
|
|
| 1402 |
def gb(l,s,w="",desc=""):
|
| 1403 |
c="#FF4444" if s>=70 else "#FF8800" if s>=50 else "#DDAA00" if s>=35 else "#22AA44"
|
| 1404 |
wt=f"<span style='color:#999;font-size:9px;'> ×{w}</span>" if w else ""
|
| 1405 |
dt=f"<div style='font-size:9px;color:#888;margin-top:1px;'>{desc}</div>" if desc else ""
|
| 1406 |
return f"<div style='margin:4px 0;'><div style='display:flex;justify-content:space-between;'><span style='font-size:11px;font-weight:600;'>{l}{wt}</span><span style='font-size:11px;font-weight:700;color:{c};'>{s}</span></div><div style='background:#E8E8E8;border-radius:4px;height:7px;'><div style='background:{c};height:100%;width:{s}%;border-radius:4px;'></div></div>{dt}</div>"
|
|
|
|
|
|
|
| 1407 |
mb=""
|
| 1408 |
for mn in ["GPT","Claude","Gemini","Perplexity"]:
|
| 1409 |
s=ms_raw.get(mn,0); mc="#FF4444" if s>=40 else "#FF8800" if s>=20 else "#CCC"
|
|
|
|
| 1410 |
tag=""
|
| 1411 |
if mn==est_model and est_conf!="낮음":
|
| 1412 |
tag=f" <span style='background:#FF4444;color:white;font-size:7px;padding:0 3px;border-radius:3px;'>추정</span>"
|
| 1413 |
mb+=f"<div style='display:flex;align-items:center;gap:4px;margin:2px 0;'><span style='width:66px;font-size:10px;font-weight:600;'>{mn}{tag}</span><div style='flex:1;background:#E8E8E8;border-radius:3px;height:5px;'><div style='background:{mc};height:100%;width:{s}%;'></div></div><span style='font-size:9px;width:18px;text-align:right;color:{mc};'>{s}</span></div>"
|
|
|
|
|
|
|
| 1414 |
ls=""
|
| 1415 |
if lr["score"]>=0:
|
| 1416 |
lsc=lr["score"]
|
| 1417 |
lr_rows="".join(f"<div style='font-size:9px;color:#555;'>{mn}: {lr['detail'].get(mn,'—')}</div>" for _,mn in LLM_JUDGES)
|
| 1418 |
ls=f"<div style='margin-top:8px;padding:8px;background:#F8F8FF;border-radius:6px;border:1px solid #E0E0FF;'><div style='font-size:10px;font-weight:700;margin-bottom:3px;'>🤖 LLM 교차검증 (평균 {lsc}%)</div>{lr_rows}</div>"
|
| 1419 |
else: ls="<div style='margin-top:6px;padding:4px 8px;background:#F5F5F5;border-radius:4px;color:#999;font-size:9px;'>🤖 GROQ_API_KEY 미설정</div>"
|
|
|
|
|
|
|
| 1420 |
qs=qr["sub_scores"]; gc={"S":"#FF6B6B","A":"#4ECDC4","B":"#45B7D1","C":"#DDAA00","D":"#FF8800","F":"#FF4444"}.get(qr["grade"],"#888")
|
| 1421 |
def qgb(l,s):
|
| 1422 |
c="#22AA44" if s>=70 else "#4ECDC4" if s>=55 else "#DDAA00" if s>=40 else "#FF8800"
|
| 1423 |
return f"<div style='margin:2px 0;display:flex;align-items:center;gap:4px;'><span style='width:50px;font-size:10px;'>{l}</span><div style='flex:1;background:#E8E8E8;border-radius:3px;height:5px;'><div style='background:{c};height:100%;width:{s}%;'></div></div><span style='font-size:9px;color:{c};width:18px;text-align:right;'>{s}</span></div>"
|
|
|
|
|
|
|
| 1424 |
reasons=[]
|
| 1425 |
if sc["문체"]>=70: reasons.append("격식체 종결어미가 대부분, AI형 접속사·상투표현 다수 감지")
|
| 1426 |
elif sc["문체"]>=50: reasons.append("격식체와 AI형 표현이 혼재")
|
|
|
|
| 1437 |
reasons.append(f"🔍 추정 모델: <b>{est_model}</b> ({est_why})")
|
| 1438 |
if not reasons: reasons.append("인간적 표현이 우세하며 AI 패턴이 약함")
|
| 1439 |
reason_html='<br>'.join(f"• {r}" for r in reasons)
|
|
|
|
|
|
|
| 1440 |
ppx_c="#FF4444" if ppx["score"]>=65 else "#FF8800" if ppx["score"]>=50 else "#DDAA00" if ppx["score"]>=35 else "#22AA44"
|
| 1441 |
ppx_html=f"""<div style='margin-top:8px;padding:8px;background:linear-gradient(135deg,#FFF8F0,#FFF0FF);border-radius:6px;border:1px solid #E8D0FF;'>
|
| 1442 |
<div style='font-size:10px;font-weight:700;margin-bottom:4px;'>🧠 Perplexity 분석 <span style='color:{ppx_c};font-size:12px;font-weight:900;'>{ppx["score"]}점</span></div>
|
|
|
|
| 1447 |
<span style='font-size:9px;color:#777;'>Zipf적합: {ppx.get("zipf",0)}</span>
|
| 1448 |
</div>
|
| 1449 |
</div>"""
|
|
|
|
|
|
|
| 1450 |
hum_html=""
|
| 1451 |
if hum["score"]>=30:
|
| 1452 |
hc="#FF4444" if hum["score"]>=65 else "#FF8800" if hum["score"]>=50 else "#DDAA00"
|
|
|
|
| 1455 |
<div style='font-size:10px;font-weight:700;margin-bottom:3px;'>🛡️ Humanizer 탐지 <span style='color:{hc};font-size:12px;font-weight:900;'>{hum["score"]}점</span></div>
|
| 1456 |
{sig_rows}
|
| 1457 |
</div>"""
|
|
|
|
|
|
|
| 1458 |
est_html=""
|
| 1459 |
if est_model!="특정 불가":
|
| 1460 |
ec="#FF4444" if est_conf=="높음" else "#FF8800" if est_conf=="중간" else "#DDAA00"
|
|
|
|
| 1468 |
{ranked_html}
|
| 1469 |
<div style='font-size:8px;color:#999;margin-top:2px;'>근거: {", ".join(est_reasons[:3]) if est_reasons else "복합 지표"}</div>
|
| 1470 |
</div>"""
|
|
|
|
| 1471 |
html=f"""<div style="font-family:'Pretendard','Noto Sans KR',sans-serif;max-width:720px;margin:0 auto;">
|
| 1472 |
<div style="background:{bg};border:2px solid {fg};border-radius:14px;padding:20px;margin-bottom:12px;">
|
| 1473 |
<div style="display:flex;align-items:center;gap:16px;">
|
|
|
|
| 1489 |
<div style="font-size:11px;color:#555;line-height:1.7;">{reason_html}</div>
|
| 1490 |
</div>
|
| 1491 |
</div>
|
|
|
|
| 1492 |
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
|
| 1493 |
<div style="background:#FAFAFA;border-radius:8px;padding:10px;">
|
| 1494 |
<div style="font-size:10px;font-weight:700;margin-bottom:4px;">📊 AI 탐지 5축 + Perplexity</div>
|
|
|
|
| 1515 |
</div>"""
|
| 1516 |
log=f"AI:{fs}점 [{verdict}] 신뢰:{conf} | 모델:{mt} | PPX:{ppx['score']} HUM:{hum['score']} | 품질:{qr['grade']}({qr['score']})\n축: 통계{sc['통계']} 문체{sc['문체']} 반복{sc['반복성']} 구조{sc['구조']} 지문{sc['지문']} PPX{ppx['score']} HUM{hum['score']}"
|
| 1517 |
return html, log
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1518 |
def run_highlight(text):
|
| 1519 |
if not text or len(text.strip())<30: return "<div style='color:#888;'>텍스트 필요</div>"
|
| 1520 |
sents=split_sentences(text)
|
| 1521 |
hl=[]
|
| 1522 |
for s in sents:
|
| 1523 |
sc, reasons = score_sentence(s)
|
|
|
|
| 1524 |
if sc >= 60: bg="rgba(220,38,38,0.35)"; level="AI확신"
|
| 1525 |
elif sc >= 40: bg="rgba(249,115,22,0.30)"; level="AI의심"
|
| 1526 |
elif sc >= 25: bg="rgba(234,179,8,0.25)"; level="주의"
|
| 1527 |
elif sc >= 10: bg="rgba(132,204,22,0.15)"; level="인간추정"
|
| 1528 |
else: bg="rgba(34,197,94,0.20)"; level="인간"
|
|
|
|
| 1529 |
detail_parts = []
|
| 1530 |
for r in reasons:
|
| 1531 |
if '격식' in r or '비격식AI' in r: detail_parts.append(f"🔤 {r}")
|
|
|
|
| 1536 |
else: detail_parts.append(r)
|
| 1537 |
tt = ' | '.join(detail_parts) if detail_parts else '특이 패턴 없음'
|
| 1538 |
hl.append(f'<span style="background:{bg};padding:2px 4px;border-radius:4px;display:inline;line-height:2.2;border-bottom:2px solid {"#DC2626" if sc>=60 else "#F97316" if sc>=40 else "#EAB308" if sc>=25 else "#84CC16" if sc>=10 else "#22C55E"};" title="[{level}] {tt} ({sc}점)">{s}</span>')
|
|
|
|
| 1539 |
total_scores = [score_sentence(s)[0] for s in sents]
|
| 1540 |
avg_sc = sum(total_scores)/len(total_scores) if total_scores else 0
|
| 1541 |
ai_high = sum(1 for s in total_scores if s >= 60)
|
| 1542 |
ai_mid = sum(1 for s in total_scores if 40 <= s < 60)
|
| 1543 |
human_cnt = sum(1 for s in total_scores if s < 25)
|
|
|
|
| 1544 |
return f"""<div style='font-family:Pretendard,sans-serif;'>
|
| 1545 |
<div style='margin-bottom:10px;padding:10px;background:#F8F8FF;border-radius:8px;'>
|
| 1546 |
<div style='display:flex;gap:8px;align-items:center;font-size:11px;margin-bottom:6px;flex-wrap:wrap;'>
|
|
|
|
| 1554 |
</div>
|
| 1555 |
<div style='line-height:2.4;font-size:14px;'>{' '.join(hl)}</div>
|
| 1556 |
</div>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1557 |
SAMPLE_AI = """인공지능 기술은 현대 사회에서 매우 중요한 역할을 하고 있습니다. 특히 자연어 처리 분야에서의 발전은 눈부신 성과를 거두고 있습니다. 이러한 기술의 발전은 다양한 산업 분야에 긍정적인 영향을 미치고 있으며, 향후 더욱 발전할 것으로 예상됩니다.
|
|
|
|
| 1558 |
또한 생성형 AI의 등장으로 콘텐츠 제작 방식이 크게 변화하고 있습니다. 이를 통해 기업들은 효율적인 콘텐츠 생산이 가능해졌으며, 개인 사용자들도 다양한 창작 활동에 AI를 활용할 수 있게 되었습니다. 따라서 AI 리터러시의 중요성이 더욱 부각되고 있습니다.
|
|
|
|
| 1559 |
나아가 AI 윤리와 규제에 대한 논의도 활발히 진행되고 있습니다. 특히 AI가 생성한 콘텐츠의 저작권 문제는 중요한 의미를 가지며, 이에 대한 법적 프레임워크 구축이 필요합니다. 결과적으로 기술 발전과 함께 사회적 합의를 이루는 것이 중요합니다."""
|
|
|
|
| 1560 |
SAMPLE_HUMAN = """아 진짜 요즘 AI 때문에 머리 아프다ㅋㅋㅋ 어제 chatgpt한테 레포트 써달라고 했는데 완전 교과서 같은 글만 써줘서 그냥 내가 다시 썼음;;
|
|
|
|
| 1561 |
근데 생각해보면 AI가 쓴 글이랑 사람이 쓴 글이 확실히 다르긴 해. 뭔가... 너무 깔끔하달까? 사람은 이렇게 횡설수설도 하고 맞춤법도 틀리고 그러잖아.
|
|
|
|
| 1562 |
교수님이 AI 탐지기 돌린다고 해서 좀 무서운데 ㅠㅠ 나는 진짜 직접 쓴 건데 혹시 오탐 나면 어쩌지... 걱정된다 진심으로."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1563 |
def run_document_analysis(file, progress=gr.Progress()):
|
| 1564 |
"""문서 파일 업로드 → 섹션별 AI 탐지 히트맵 + PDF 보고서 생성"""
|
| 1565 |
if file is None:
|
| 1566 |
return "<div style='padding:20px;text-align:center;color:#888;'>📄 파일을 업로드하세요 (PDF, DOCX, HWP, HWPX, TXT)</div>", "", None
|
|
|
|
| 1567 |
if hasattr(file, 'path'):
|
| 1568 |
file_path = file.path
|
| 1569 |
elif hasattr(file, 'name'):
|
|
|
|
| 1572 |
file_path = str(file)
|
| 1573 |
fname = os.path.basename(file_path)
|
| 1574 |
progress(0.05, f"📄 {fname} 읽는 중...")
|
|
|
|
| 1575 |
sections, full_text, error = extract_text_from_file(file_path)
|
| 1576 |
if error:
|
| 1577 |
return f"<div style='padding:20px;text-align:center;color:#E44;'>⚠️ {error}</div>", "", None
|
| 1578 |
if not sections or not full_text or len(full_text.strip()) < 50:
|
| 1579 |
return "<div style='padding:20px;text-align:center;color:#E44;'>⚠️ 텍스트가 충분하지 않습니다 (50자 미만)</div>", "", None
|
|
|
|
| 1580 |
progress(0.15, "전체 텍스트 분석...")
|
|
|
|
| 1581 |
sents_all = split_sentences(full_text)
|
| 1582 |
words_all = split_words(full_text)
|
| 1583 |
morphs_all = get_morphemes(full_text)
|
| 1584 |
total_score, total_verdict, total_level, total_axes, total_ppx, total_hum = quick_score(full_text)
|
| 1585 |
quality = analyze_quality(full_text, sents_all, words_all, morphs_all)
|
|
|
|
|
|
|
| 1586 |
progress(0.30, "LLM 교차검증...")
|
| 1587 |
llm_result = llm_cross_check(full_text[:3000])
|
| 1588 |
if llm_result["score"] >= 0:
|
| 1589 |
_sent_scores = [score_sentence(s)[0] for s in sents_all]
|
| 1590 |
_sent_avg = sum(_sent_scores)/len(_sent_scores) if _sent_scores else -1
|
| 1591 |
total_score, total_verdict, total_level = compute_verdict(total_axes, llm_result["score"], sent_avg=_sent_avg, ppx_score=total_ppx["score"], hum_score=total_hum["score"])
|
|
|
|
|
|
|
| 1592 |
progress(0.45, f"{len(sections)}개 섹션 분석...")
|
| 1593 |
section_results = []
|
| 1594 |
for i, sec in enumerate(sections):
|
|
|
|
| 1596 |
section_results.append({"idx": i+1, "text": sec, "score": -1, "verdict": "너무 짧음", "skipped": True})
|
| 1597 |
continue
|
| 1598 |
s_score, s_verdict, s_level, s_axes, _, _ = quick_score(sec)
|
|
|
|
| 1599 |
sec_sents = split_sentences(sec)
|
| 1600 |
sent_scores = []
|
| 1601 |
for sent in sec_sents:
|
|
|
|
| 1608 |
})
|
| 1609 |
pct = 0.45 + (i / max(len(sections), 1)) * 0.30
|
| 1610 |
progress(pct, f"섹션 {i+1}/{len(sections)}")
|
|
|
|
|
|
|
| 1611 |
now = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 1612 |
ext = Path(file_path).suffix.upper()
|
| 1613 |
cm_map = {"ai_high": ("#FF4444", "#FFE0E0"), "ai_medium": ("#FF8800", "#FFF0DD"),
|
| 1614 |
"ai_low": ("#DDAA00", "#FFFBE0"), "uncertain": ("#888", "#F5F5F5"), "human": ("#22AA44", "#E0FFE8")}
|
| 1615 |
tc, tbg = cm_map.get(total_level, ("#888", "#F5F5F5"))
|
|
|
|
|
|
|
| 1616 |
heatmap_cells = []
|
| 1617 |
for sr in section_results:
|
| 1618 |
sidx = sr["idx"]
|
|
|
|
| 1623 |
ssc = sr["score"]; svd = sr["verdict"]
|
| 1624 |
heatmap_cells.append(f"<div style='flex:1;min-width:18px;height:28px;background:{sc};border-radius:3px;opacity:0.8;cursor:pointer;' title='섹션{sidx}: AI {ssc}점 ({svd})'></div>")
|
| 1625 |
heatmap_bar = f"<div style='display:flex;gap:3px;margin:12px 0;'>" + ''.join(heatmap_cells) + "</div>"
|
|
|
|
|
|
|
| 1626 |
section_cards = []
|
| 1627 |
for sr in section_results:
|
| 1628 |
if sr["skipped"]: continue
|
| 1629 |
sc, sbg = cm_map.get(sr.get("level", "uncertain"), ("#888", "#F5F5F5"))
|
|
|
|
| 1630 |
sent_html = ""
|
| 1631 |
for ss in sr.get("sent_scores", []):
|
| 1632 |
s = ss["score"]
|
|
|
|
| 1635 |
elif s >= 25: sclr = "background:rgba(221,170,0,0.08);border-bottom:1px solid #DDAA00;"
|
| 1636 |
else: sclr = ""
|
| 1637 |
sent_html += f"<span style='{sclr}padding:1px 2px;border-radius:2px;' title='AI {s}점'>{ss['text']}</span> "
|
|
|
|
| 1638 |
axes_html = ""
|
| 1639 |
if "axes" in sr:
|
| 1640 |
ax = sr["axes"]
|
| 1641 |
for k, v in ax.items():
|
| 1642 |
axc = "#FF4444" if v >= 50 else "#FF8800" if v >= 30 else "#22AA44"
|
| 1643 |
axes_html += f"<span style='display:inline-block;margin:1px 3px;padding:2px 8px;background:{axc}22;border:1px solid {axc}44;border-radius:10px;font-size:9px;color:{axc};'>{k} {v}</span>"
|
|
|
|
| 1644 |
section_cards.append(f"""
|
| 1645 |
<div style="border:1px solid #E0E0E0;border-left:4px solid {sc};border-radius:8px;padding:14px;margin-bottom:10px;background:white;">
|
| 1646 |
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:8px;">
|
|
|
|
| 1650 |
<div style="margin-bottom:6px;">{axes_html}</div>
|
| 1651 |
<div style="font-size:12px;line-height:1.8;color:#333;">{sent_html}</div>
|
| 1652 |
</div>""")
|
|
|
|
|
|
|
| 1653 |
ai_high = sum(1 for s in section_results if not s["skipped"] and s["score"] >= 60)
|
| 1654 |
ai_med = sum(1 for s in section_results if not s["skipped"] and 35 <= s["score"] < 60)
|
| 1655 |
ai_low = sum(1 for s in section_results if not s["skipped"] and s["score"] < 35)
|
| 1656 |
valid_sections = [s for s in section_results if not s["skipped"]]
|
|
|
|
|
|
|
| 1657 |
llm_info = ""
|
| 1658 |
if llm_result["score"] >= 0:
|
| 1659 |
llm_rows = ''.join(f"<span style='margin-right:8px;font-size:10px;color:#555;'>{mn}: {llm_result['detail'].get(mn,'—')}</span>" for _, mn in LLM_JUDGES)
|
| 1660 |
llm_info = f"<div style='margin-top:8px;padding:8px 12px;background:#F8F8FF;border-radius:6px;border:1px solid #E0E0FF;font-size:10px;'><b>🤖 LLM 교차검증:</b> 평균 {llm_result['score']}% | {llm_rows}</div>"
|
|
|
|
| 1661 |
html = f"""<div style="font-family:'Pretendard','Noto Sans KR',sans-serif;max-width:800px;margin:0 auto;background:#FAFBFC;border-radius:12px;border:1px solid #E0E0E0;overflow:hidden;">
|
| 1662 |
<!-- 헤더 -->
|
| 1663 |
<div style="background:linear-gradient(135deg,#1A1F36,#2D3561);padding:20px 24px;color:white;">
|
|
|
|
| 1672 |
</div>
|
| 1673 |
</div>
|
| 1674 |
</div>
|
|
|
|
| 1675 |
<!-- 요약 -->
|
| 1676 |
<div style="padding:16px 24px;">
|
| 1677 |
<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:10px;margin-bottom:12px;">
|
|
|
|
| 1692 |
<div style="font-size:10px;color:#888;">품질 등급</div>
|
| 1693 |
</div>
|
| 1694 |
</div>
|
|
|
|
| 1695 |
<!-- 히트맵 바 -->
|
| 1696 |
<div style="background:white;border:1px solid #E0E0E0;border-radius:8px;padding:12px;">
|
| 1697 |
<div style="font-size:11px;font-weight:700;margin-bottom:6px;">🗺️ 섹션별 AI 히트맵 (빨강=AI의심, 초록=인간)</div>
|
|
|
|
| 1705 |
</div>
|
| 1706 |
{llm_info}
|
| 1707 |
</div>
|
|
|
|
| 1708 |
<!-- 섹션 상세 -->
|
| 1709 |
<div style="padding:0 24px 20px;">
|
| 1710 |
<div style="font-size:13px;font-weight:700;margin-bottom:10px;">📊 섹션별 상세 분석 ({len(valid_sections)}개)</div>
|
| 1711 |
{''.join(section_cards)}
|
| 1712 |
</div>
|
|
|
|
| 1713 |
<!-- 푸터 -->
|
| 1714 |
<div style="padding:10px 24px;background:#F0F3F8;border-top:1px solid #E0E0E0;display:flex;justify-content:space-between;">
|
| 1715 |
<span style="font-size:11px;font-weight:800;color:#1A1F36;">AI Detector v4.0</span>
|
| 1716 |
<span style="font-size:9px;color:#AAA;">{now} · 5축 앙상블 + LLM 교차검증</span>
|
| 1717 |
</div>
|
| 1718 |
</div>"""
|
|
|
|
|
|
|
| 1719 |
progress(0.90, "PDF 보고서 생성...")
|
| 1720 |
pdf_path = _generate_pdf_report(fname, total_score, total_verdict, total_level,
|
| 1721 |
total_axes, quality, section_results, llm_result, now)
|
|
|
|
| 1722 |
log = f"파일: {fname} ({ext})\n"
|
| 1723 |
log += f"섹션: {len(sections)}개 | 전체: {len(full_text)}자\n"
|
| 1724 |
log += f"총점: {total_score} ({total_verdict})\n"
|
|
|
|
| 1727 |
if llm_result["score"] >= 0:
|
| 1728 |
log += f"LLM 교차검증: {llm_result['score']}%\n"
|
| 1729 |
return html, log, pdf_path
|
|
|
|
|
|
|
| 1730 |
def _generate_pdf_report(fname, score, verdict, level, axes, quality, sections, llm_result, now):
|
| 1731 |
"""HTML → PDF 변환으로 보고서 생성"""
|
| 1732 |
try:
|
| 1733 |
cm = {"ai_high":"#FF4444","ai_medium":"#FF8800","ai_low":"#DDAA00","uncertain":"#888","human":"#22AA44"}
|
| 1734 |
tc = cm.get(level, "#888")
|
|
|
|
|
|
|
| 1735 |
sec_rows = ""
|
| 1736 |
for sr in sections:
|
| 1737 |
if sr["skipped"]: continue
|
| 1738 |
sc = cm.get(sr.get("level","uncertain"),"#888")
|
| 1739 |
sec_rows += f"<tr><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;'>{sr['idx']}</td><td style='padding:6px 10px;border:1px solid #DDD;'>{sr['text'][:80]}...</td><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;color:{sc};font-weight:700;'>{sr['score']}</td><td style='padding:6px 10px;border:1px solid #DDD;text-align:center;'>{sr['verdict']}</td></tr>"
|
|
|
|
| 1740 |
ax_rows = ''.join(f"<tr><td style='padding:4px 10px;border:1px solid #DDD;'>{k}</td><td style='padding:4px 10px;border:1px solid #DDD;text-align:center;font-weight:700;'>{v}/100</td></tr>" for k, v in axes.items())
|
|
|
|
| 1741 |
html_content = f"""<!DOCTYPE html><html><head><meta charset="UTF-8">
|
| 1742 |
<style>
|
| 1743 |
body{{font-family:sans-serif;margin:30px;font-size:12px;color:#333;}}
|
|
|
|
| 1750 |
</style></head><body>
|
| 1751 |
<h1>📄 AI 글 판별 보고서</h1>
|
| 1752 |
<p><b>파일:</b> {fname} | <b>생성:</b> {now} | <b>엔진:</b> AI Detector v4.0</p>
|
|
|
|
| 1753 |
<h2>종합 결과</h2>
|
| 1754 |
<div class="score-box">{score}점</div>
|
| 1755 |
<span style="margin-left:12px;font-size:16px;font-weight:700;">{verdict}</span>
|
| 1756 |
<p><b>품질:</b> {quality['grade']} ({quality['score']}점)</p>
|
|
|
|
| 1757 |
<h2>5축 분석</h2>
|
| 1758 |
<table><tr><th>축</th><th>점수</th></tr>{ax_rows}</table>
|
|
|
|
| 1759 |
<h2>섹션별 분석 ({len([s for s in sections if not s['skipped']])}개)</h2>
|
| 1760 |
<table>
|
| 1761 |
<tr><th>No</th><th>내용 (발췌)</th><th>AI 점수</th><th>판정</th></tr>
|
| 1762 |
{sec_rows}
|
| 1763 |
</table>
|
|
|
|
| 1764 |
<div class="footer">
|
| 1765 |
AI Detector v4.0 — 5축 앙상블 + LLM 교차검증 | {now}<br>
|
| 1766 |
본 보고서는 AI 판별 참고 자료이며, 최종 판단은 검토자의 확인이 필요합니다.
|
| 1767 |
</div>
|
| 1768 |
</body></html>"""
|
|
|
|
|
|
|
| 1769 |
report_dir = tempfile.mkdtemp()
|
| 1770 |
html_path = os.path.join(report_dir, f"AI_Report_{fname}.html")
|
| 1771 |
with open(html_path, 'w', encoding='utf-8') as f:
|
|
|
|
| 1774 |
except Exception as e:
|
| 1775 |
print(f"PDF 보고서 생성 오류: {e}")
|
| 1776 |
return None
|
|
|
|
|
|
|
| 1777 |
def extract_file_text_api(file):
|
| 1778 |
"""파일 업로드 → 텍스트 추출 API (탭1~4용)"""
|
| 1779 |
if file is None:
|
| 1780 |
return "⚠️ 파일이 없습니다."
|
|
|
|
| 1781 |
if hasattr(file, 'path'):
|
| 1782 |
file_path = file.path
|
| 1783 |
elif hasattr(file, 'name'):
|
|
|
|
| 1793 |
if not full_text or len(full_text.strip()) < 10:
|
| 1794 |
return f"⚠️ {fname}: 텍스트 추출 결과 없음"
|
| 1795 |
return full_text.strip()
|
|
|
|
|
|
|
| 1796 |
with gr.Blocks(title="AI 글 판별기 v5.1") as demo:
|
| 1797 |
gr.Markdown("# 🔎 AI 글 판별기 v5.1\n**5축+Perplexity+Humanizer탐지+모델추정 · 품질 측정 · LLM 교차검증 · 표절 검사**")
|
| 1798 |
with gr.Tab("🏆 경쟁력 비교"):
|
|
|
|
| 1808 |
@keyframes glow { 0%,100% { box-shadow:0 0 8px rgba(99,102,241,0.3); } 50% { box-shadow:0 0 20px rgba(99,102,241,0.6); } }
|
| 1809 |
@keyframes borderPulse { 0%,100% { border-color:rgba(99,102,241,0.3); } 50% { border-color:rgba(99,102,241,0.8); } }
|
| 1810 |
@keyframes checkPop { from { transform:scale(0) rotate(-180deg); opacity:0; } to { transform:scale(1) rotate(0deg); opacity:1; } }
|
|
|
|
| 1811 |
.comp-wrap { font-family:'Pretendard','Noto Sans KR',system-ui,sans-serif; max-width:860px; margin:0 auto; padding:8px; }
|
|
|
|
| 1812 |
/* 히어로 배너 */
|
| 1813 |
.hero-banner {
|
| 1814 |
background: linear-gradient(135deg, #1e1b4b 0%, #312e81 30%, #4338ca 60%, #6366f1 100%);
|
|
|
|
| 1827 |
font-size:11px; font-weight:800; padding:4px 12px; border-radius:20px; margin-top:10px;
|
| 1828 |
animation: pulse 2s ease-in-out infinite; position:relative;
|
| 1829 |
}
|
|
|
|
| 1830 |
/* 핵심 수치 */
|
| 1831 |
.stats-row { display:grid; grid-template-columns:repeat(4,1fr); gap:10px; margin-bottom:18px; animation:fadeInUp 1s ease-out 0.2s both; }
|
| 1832 |
.stat-card {
|
|
|
|
| 1836 |
.stat-card:hover { transform:translateY(-4px); border-color:#6366f1; box-shadow:0 8px 25px rgba(99,102,241,0.15); }
|
| 1837 |
.stat-num { font-size:32px; font-weight:900; background:linear-gradient(135deg,#6366f1,#8b5cf6); -webkit-background-clip:text; -webkit-text-fill-color:transparent; animation:countUp 0.6s ease-out; }
|
| 1838 |
.stat-label { font-size:10px; color:#64748b; margin-top:2px; font-weight:600; }
|
|
|
|
| 1839 |
/* 비교 테이블 */
|
| 1840 |
.comp-table-wrap { animation:fadeInUp 1.1s ease-out 0.4s both; margin-bottom:18px; }
|
| 1841 |
.comp-table {
|
|
|
|
| 1856 |
.comp-table tbody td:first-child { text-align:left; padding-left:14px; font-weight:600; color:#1e293b; }
|
| 1857 |
.comp-table tbody td:nth-child(2) { background:rgba(239,68,68,0.04); font-weight:700; }
|
| 1858 |
.comp-table tbody tr:last-child td { border-bottom:none; }
|
|
|
|
| 1859 |
/* 체크/엑스 아이콘 */
|
| 1860 |
.ck { color:#22c55e; font-weight:800; font-size:15px; animation:checkPop 0.4s ease-out; }
|
| 1861 |
.cx { color:#ef4444; font-weight:700; font-size:13px; }
|
| 1862 |
.cp { color:#f59e0b; font-weight:700; font-size:12px; }
|
| 1863 |
.cstar { color:#6366f1; font-weight:900; font-size:14px; }
|
|
|
|
| 1864 |
/* 차별화 카드 */
|
| 1865 |
.diff-grid { display:grid; grid-template-columns:1fr 1fr; gap:10px; margin-bottom:18px; }
|
| 1866 |
.diff-card {
|
|
|
|
| 1879 |
.diff-title { font-size:13px; font-weight:800; color:#1e293b; margin-bottom:4px; }
|
| 1880 |
.diff-desc { font-size:10px; color:#64748b; line-height:1.5; }
|
| 1881 |
.diff-vs { display:inline-block; background:#fef2f2; color:#dc2626; font-size:9px; font-weight:700; padding:2px 6px; border-radius:4px; margin-top:4px; }
|
|
|
|
| 1882 |
/* 경쟁사 공략 바 */
|
| 1883 |
.attack-section { animation:fadeInUp 1.3s ease-out 0.8s both; margin-bottom:14px; }
|
| 1884 |
.attack-title { font-size:13px; font-weight:800; color:#1e293b; margin-bottom:10px; }
|
|
|
|
| 1888 |
.attack-weak { font-size:9px; color:#ef4444; font-weight:600; }
|
| 1889 |
.attack-bar { height:8px; background:#f1f5f9; border-radius:4px; overflow:hidden; }
|
| 1890 |
.attack-fill { height:100%; border-radius:4px; animation:barGrow 1.5s ease-out; }
|
|
|
|
| 1891 |
/* 슬로건 */
|
| 1892 |
.slogan-box {
|
| 1893 |
background:linear-gradient(135deg,#faf5ff,#ede9fe,#e0e7ff); border-radius:16px;
|
|
|
|
| 1897 |
.slogan-main { font-size:16px; font-weight:900; color:#3730a3; margin-bottom:6px; }
|
| 1898 |
.slogan-sub { font-size:11px; color:#6366f1; line-height:1.7; }
|
| 1899 |
</style>
|
|
|
|
| 1900 |
<div class="comp-wrap">
|
| 1901 |
<!-- 히어로 배너 -->
|
| 1902 |
<div class="hero-banner">
|
|
|
|
| 1907 |
</div>
|
| 1908 |
<div class="hero-badge">🔬 v5.1 — 3대 킬러 기능 탑재</div>
|
| 1909 |
</div>
|
|
|
|
| 1910 |
<!-- 핵심 수치 -->
|
| 1911 |
<div class="stats-row">
|
| 1912 |
<div class="stat-card"><div class="stat-num">8축</div><div class="stat-label">탐지 차원<br>(업계 최다)</div></div>
|
|
|
|
| 1914 |
<div class="stat-card"><div class="stat-num">무료</div><div class="stat-label">완전 무료<br>(횟수 무제한)</div></div>
|
| 1915 |
<div class="stat-card"><div class="stat-num">4종</div><div class="stat-label">모델 추정<br>(GPT/Claude/Gemini/PPX)</div></div>
|
| 1916 |
</div>
|
|
|
|
| 1917 |
<!-- 비교 테이블 -->
|
| 1918 |
<div class="comp-table-wrap">
|
| 1919 |
<table class="comp-table">
|
|
|
|
| 2058 |
</tbody>
|
| 2059 |
</table>
|
| 2060 |
</div>
|
|
|
|
| 2061 |
<!-- 차별화 4대 카드 -->
|
| 2062 |
<div class="diff-grid" style="animation:fadeInUp 1.2s ease-out 0.6s both;">
|
| 2063 |
<div class="diff-card">
|
|
|
|
| 2085 |
<div class="diff-vs">vs 전체 경쟁사: LLM 검증 미보유</div>
|
| 2086 |
</div>
|
| 2087 |
</div>
|
|
|
|
| 2088 |
<!-- 경쟁사 취약점 공략 -->
|
| 2089 |
<div class="attack-section">
|
| 2090 |
<div class="attack-title">⚔️ 경쟁사 취약점 vs 우리의 강점</div>
|
|
|
|
| 2109 |
<div class="attack-bar"><div class="attack-fill" style="width:94%;background:linear-gradient(90deg,#a855f7,#ec4899);"></div></div>
|
| 2110 |
</div>
|
| 2111 |
</div>
|
|
|
|
| 2112 |
<!-- 슬로건 -->
|
| 2113 |
<div class="slogan-box">
|
| 2114 |
<div class="slogan-main">🇰🇷 한국어 AI 글을 가장 정확하게, 가장 투명하게, 완전 무료로 판별합니다</div>
|
|
|
|
| 2145 |
btn_ps.click(lambda:SAMPLE_AI,outputs=[inp_plag])
|
| 2146 |
with gr.Tab("📖 설명"):
|
| 2147 |
gr.Markdown("""
|
|
|
|
| 2148 |
- **탐지 5축:** 통계(6%)·문체(25%)·반복(10%)·구조(12%)·지문(30%)
|
| 2149 |
- **★ Perplexity 확률분석(17%):** 문자 엔트로피·Burstiness·TTR편차·종결엔트로피
|
| 2150 |
- **★ Humanizer 탐지:** 동의어과다치환·구조보존·접속사잔존·이중피동 등 6시그널
|
| 2151 |
- **★ 모델 추정:** GPT·Claude·Gemini·Perplexity 4모델 증거기반 추정
|
| 2152 |
- **품질 6항목:** 가독성·어휘·논리·정확성·표현·정보밀도
|
| 2153 |
- **LLM 교차검증:** GPT-OSS-120B·Qwen3-32B·Kimi-K2 (GROQ)
|
|
|
|
|
|
|
| 2154 |
- **Brave Search**: 병렬 20개 동시 웹검색
|
| 2155 |
- **학술 DB**: KCI(한국학술지인용색인), RISS(학술연구정보), arXiv
|
| 2156 |
- **Gemini**: Google Search Grounding
|
| 2157 |
- **보고서**: CopyKiller 스타일 — 유사도%, 출처표, 문장별 하이라이트
|
|
|
|
|
|
|
| 2158 |
- `GROQ_API_KEY` — LLM 교차검증
|
| 2159 |
- `GEMINI_API_KEY` — 표절 검사 (Google Search Grounding)
|
| 2160 |
- `BRAVE_API_KEY` — 표절 검사 (Brave Search 병렬)
|
| 2161 |
""")
|
|
|
|
| 2162 |
with gr.Row(visible=False):
|
| 2163 |
_file_in = gr.File(label="hidden_file")
|
| 2164 |
_text_out = gr.Textbox(label="hidden_text")
|
| 2165 |
_file_in.change(extract_file_text_api, [_file_in], [_text_out], api_name="extract_file_text")
|
|
|
|
|
|
|
# Publish the landing page: if a root-level index.html exists, copy it
# into ./static so the FastAPI root route (and allowed_paths) can serve it.
import shutil, pathlib

static_dir = pathlib.Path("static")
static_dir.mkdir(exist_ok=True)
index_src = pathlib.Path("index.html")
if index_src.exists():
    shutil.copy(index_src, static_dir / "index.html")
|
|
|
|
|
|
|
# Outer ASGI app: a plain FastAPI instance. The Gradio UI is mounted
# under /gradio further down; the root path serves the static landing page.
from fastapi import FastAPI
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles

server = FastAPI()
|
|
|
|
| 2175 |
@server.get("/", response_class=HTMLResponse)
|
| 2176 |
async def serve_root():
|
| 2177 |
"""루트 URL에서 프리미엄 index.html 서빙"""
|
| 2178 |
fp = pathlib.Path("static/index.html")
|
| 2179 |
if fp.exists():
|
| 2180 |
return HTMLResponse(fp.read_text(encoding="utf-8"))
|
|
|
|
| 2181 |
from fastapi.responses import RedirectResponse
|
| 2182 |
return RedirectResponse("/gradio/")
|
|
|
|
|
|
|
| 2183 |
# Mount the Gradio Blocks UI under /gradio on the outer FastAPI server;
# allowed_paths lets Gradio serve files from ./static (landing-page assets).
app = gr.mount_gradio_app(server, demo, path="/gradio", allowed_paths=["static"])
|
|
|
|
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running as a script, not
    # when the module is imported by an external ASGI runner.
    import uvicorn

    # Fixed: a stray "+" diff artifact between the import and this call
    # would have been a syntax error. Port 7860 is the HF Spaces default.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|