import gradio as gr
import math, re, os, json, random, time, hashlib, zlib, zipfile, tempfile
from collections import Counter
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from xml.etree import ElementTree as ET
from kiwipiepy import Kiwi
KIWI = Kiwi()
try:
import httpx; HAS_HTTPX = True
except ImportError:
HAS_HTTPX = False
try:
from google import genai
from google.genai import types as gtypes
HAS_GENAI = True
except ImportError:
HAS_GENAI = False
try:
import olefile; HAS_OLEFILE = True
except ImportError:
HAS_OLEFILE = False
try:
import pdfplumber; HAS_PDFPLUMBER = True
except ImportError:
HAS_PDFPLUMBER = False
try:
import PyPDF2; HAS_PYPDF2 = True
except ImportError:
HAS_PYPDF2 = False
try:
from docx import Document as DocxDocument; HAS_DOCX = True
except ImportError:
HAS_DOCX = False
GROQ_KEY = os.getenv("GROQ_API_KEY", "")
GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, one list entry per page.

    Tries pdfplumber first, then falls back to PyPDF2. Returns
    (pages, None) on success or (None, error_message) on failure.
    """
    collected = []
    if HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(file_path) as pdf:
                for pg in pdf.pages:
                    extracted = pg.extract_text()
                    if extracted:
                        collected.append(extracted)
            if collected:
                return collected, None
        except Exception as e:
            print(f"pdfplumber: {e}")
    if HAS_PYPDF2:
        try:
            with open(file_path, 'rb') as fh:
                for pg in PyPDF2.PdfReader(fh).pages:
                    extracted = pg.extract_text()
                    if extracted:
                        collected.append(extracted)
            if collected:
                return collected, None
        except Exception as e:
            print(f"PyPDF2: {e}")
    return None, "PDF 추출 실패 (pdfplumber, PyPDF2 없음)"
def extract_text_from_docx(file_path):
    """Extract text from a DOCX file, split into sections at blank paragraphs.

    Returns (sections, None) on success or (None, error_message) on failure.
    """
    if not HAS_DOCX:
        return None, "python-docx 없음"
    try:
        sections, buffer = [], []
        for para in DocxDocument(file_path).paragraphs:
            line = para.text.strip()
            if line:
                buffer.append(line)
            elif buffer:
                # Blank paragraph closes the current section.
                sections.append('\n'.join(buffer))
                buffer = []
        if buffer:
            sections.append('\n'.join(buffer))
        if not sections:
            return None, "DOCX 텍스트 없음"
        return sections, None
    except Exception as e:
        return None, f"DOCX 오류: {e}"
def extract_text_from_txt(file_path):
    """Extract text from a plain-text file (TXT/MD/CSV/JSON/...).

    Tries a list of encodings in order; the first one that decodes to
    non-empty text wins. Text is split into sections on blank lines.

    Returns (sections, None) on success, or (None, error_message) when
    the file cannot be read or no encoding yields non-empty text.
    """
    for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                text = f.read()
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # and real programming errors are no longer silently swallowed.
        except (OSError, UnicodeError, ValueError):
            continue
        if text.strip():
            # Split on runs of 2+ newlines (blank lines) into sections.
            sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
            return sections if sections else [text], None
    return None, "텍스트 인코딩 감지 실패"
def extract_text_from_hwpx(file_path):
    """Extract text from an HWPX document (a ZIP container of XML parts).

    Locates the Contents/section*.xml parts, strips XML namespace
    declarations and prefixes, then collects text nodes. When the XML
    does not parse, falls back to a regex scrape of inter-tag content.

    Returns (text_parts, None) on success or (None, error_message).
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            # Preferred layout: Contents/section0.xml, section1.xml, ...
            section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
            if not section_files:
                # Fallback: any XML part whose name mentions "section".
                section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
            for sf_name in section_files:
                try:
                    with zf.open(sf_name) as sf:
                        content = sf.read().decode('utf-8', errors='ignore')
                        # Remove namespace declarations and prefixes so tag
                        # names can be compared without qualification.
                        content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
                        content = re.sub(r'<[a-zA-Z]+:', '<', content)
                        content = re.sub(r'[a-zA-Z]+:', '', content)
                        try:
                            root = ET.fromstring(content)
                            texts = []
                            for elem in root.iter():
                                # <t> elements carry the actual run text in HWPX.
                                if elem.tag.endswith('t') or elem.tag == 't':
                                    if elem.text: texts.append(elem.text)
                                elif elem.text and elem.text.strip():
                                    if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                        texts.append(elem.text.strip())
                            if texts: text_parts.append(' '.join(texts))
                        except ET.ParseError:
                            # Malformed XML: scrape anything between tags.
                            matches = re.findall(r'>([^<]+)<', content)
                            clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
                            if clean: text_parts.append(' '.join(clean))
                except: continue  # best-effort: skip unreadable sections
        if text_parts:
            return text_parts, None
        return None, "HWPX 텍스트 없음"
    except zipfile.BadZipFile:
        return None, "유효하지 않은 HWPX"
    except Exception as e:
        return None, f"HWPX 오류: {e}"
def _decode_hwp_para(data):
"""HWP 바이너리 → 문단 텍스트"""
result = []
i = 0
while i < len(data) - 1:
code = int.from_bytes(data[i:i+2], 'little')
if code in (1,2,3): i += 14
elif code == 9: result.append('\t')
elif code in (10,13): result.append('\n')
elif code == 24: result.append('-')
elif code in (30,31): result.append(' ')
elif code >= 32:
try:
ch = chr(code)
if ch.isprintable() or ch in '\n\t ': result.append(ch)
except: pass
i += 2
text = ''.join(result).strip()
text = re.sub(r'[ \t]+', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)
return text if len(text) > 2 else None
def _extract_hwp_section(data):
    """Walk an HWP record stream and join decoded paragraph text.

    Each record starts with a 32-bit little-endian header: tag id in
    bits 0-9, payload size in bits 20-31 (0xFFF means the real size
    follows as an extra 32-bit word). Only tag 67 (paragraph text)
    records are decoded. Returns joined text or None if nothing decoded.
    """
    paragraphs = []
    offset = 0
    total = len(data)
    while offset < total - 4:
        try:
            header = int.from_bytes(data[offset:offset + 4], 'little')
            tag = header & 0x3FF
            length = (header >> 20) & 0xFFF
            offset += 4
            if length == 0xFFF:
                # Extended size: stored in the next 4 bytes.
                if offset + 4 > total:
                    break
                length = int.from_bytes(data[offset:offset + 4], 'little')
                offset += 4
            if offset + length > total:
                break
            payload = data[offset:offset + length]
            offset += length
            if tag == 67 and length > 0:
                decoded = _decode_hwp_para(payload)
                if decoded:
                    paragraphs.append(decoded)
        except:
            # Corrupt record: resynchronize one byte at a time.
            offset += 1
    return '\n'.join(paragraphs) if paragraphs else None
def extract_text_from_hwp(file_path):
    """Extract text from a binary HWP (v5, OLE compound) document.

    Reads the FileHeader stream to learn whether body streams are
    zlib-compressed, then decodes every BodyText/Section* stream via
    _extract_hwp_section. Returns (section_texts, None) on success or
    (None, error_message) on failure.
    """
    if not HAS_OLEFILE: return None, "olefile 없음"
    try:
        ole = olefile.OleFileIO(file_path)
        if not ole.exists('FileHeader'):
            ole.close(); return None, "HWP 헤더 없음"
        header_data = ole.openstream('FileHeader').read()
        # Bit 0 of header byte 36 flags compression; assume compressed
        # when the header is too short to tell.
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
        all_texts = []
        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            if entry_path.startswith('BodyText/Section'):
                try:
                    stream = ole.openstream(entry).read()
                    if is_compressed:
                        # Try raw deflate first (HWP convention), then
                        # a zlib-wrapped stream, else leave as-is.
                        try: stream = zlib.decompress(stream, -15)
                        except:
                            try: stream = zlib.decompress(stream)
                            except: pass
                    section_text = _extract_hwp_section(stream)
                    if section_text: all_texts.append(section_text)
                except: continue  # best-effort: skip broken sections
        ole.close()
        if all_texts: return all_texts, None
        return None, "HWP 텍스트 없음"
    except Exception as e:
        return None, f"HWP 오류: {e}"
def extract_text_from_file(file_path):
    """Universal document extractor: PDF/DOCX/HWP/HWPX/TXT.

    Returns (sections_list, full_text, error): sections_list holds the
    per-page/per-section text, full_text is everything joined with blank
    lines, and on failure the first two are None with an error message.
    """
    if not file_path or not os.path.exists(file_path):
        return None, None, "파일 없음"
    ext = Path(file_path).suffix.lower()
    dispatch = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.hwpx': extract_text_from_hwpx,
        '.hwp': extract_text_from_hwp,
    }
    if ext in dispatch:
        sections, error = dispatch[ext](file_path)
    elif ext in ('.txt', '.md', '.csv', '.json', '.xml', '.html'):
        sections, error = extract_text_from_txt(file_path)
    else:
        return None, None, f"지원하지 않는 형식: {ext}"
    if not sections:
        return None, None, error or "텍스트 추출 실패"
    return sections, '\n\n'.join(sections), None
def split_sentences(text):
    """Split *text* into sentences.

    Uses the Kiwi analyzer when it produces a non-empty result; otherwise
    falls back to a regex split after sentence-final punctuation.
    The bare `except:` was narrowed to `except Exception:` so that
    SystemExit/KeyboardInterrupt are no longer swallowed (a missing or
    broken KIWI still triggers the fallback).
    """
    try:
        s = [x.text.strip() for x in KIWI.split_into_sents(text) if x.text.strip()]
        if s: return s
    except Exception:
        pass
    return [x.strip() for x in re.split(r'(?<=[.!?。])\s+', text) if x.strip()]
def split_words(text):
    """Return every Korean/Latin/digit word token found in *text*."""
    return list(re.findall(r'[가-힣a-zA-Z0-9]+', text))
def get_morphemes(text):
    """Return (surface_form, POS_tag) pairs for *text* via Kiwi.

    Returns [] when Kiwi is unavailable or analysis fails. The bare
    `except:` was narrowed to `except Exception:` so process-control
    exceptions propagate.
    """
    try:
        r = KIWI.analyze(text)
        # analyze() returns candidate parses; take the token list of the
        # top-ranked one.
        if r and r[0]: return [(m.form, m.tag) for m in r[0][0]]
    except Exception:
        pass
    return []
def http_get(url, headers=None, timeout=15):
    """Best-effort HTTP GET; returns the body text or None on any failure.

    Uses httpx (following redirects) when installed, else urllib.
    The bare `except:` was narrowed to `except Exception:` so that
    SystemExit/KeyboardInterrupt are not swallowed.
    """
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers=headers or {}, timeout=timeout, follow_redirects=True)
            return r.text if r.status_code == 200 else None
        else:
            import urllib.request
            req = urllib.request.Request(url, headers=headers or {})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.read().decode('utf-8', errors='replace')
    except Exception:
        return None
def http_post_json(url, body, headers=None, timeout=30):
    """Best-effort JSON POST; returns the parsed JSON response or None.

    Fixes: the caller-supplied *headers* dict is no longer mutated (the
    Content-Type is added to a copy), and the bare `except:` is narrowed
    to `except Exception:`.
    """
    try:
        h = dict(headers) if headers else {}
        h["Content-Type"] = "application/json"
        if HAS_HTTPX:
            r = httpx.post(url, json=body, headers=h, timeout=timeout)
            if r.status_code == 200: return r.json()
            return None
        else:
            import urllib.request, ssl
            req = urllib.request.Request(url, json.dumps(body).encode(), h)
            with urllib.request.urlopen(req, timeout=timeout, context=ssl.create_default_context()) as resp:
                return json.loads(resp.read())
    except Exception:
        return None
def call_groq(model, prompt, max_tokens=800, temperature=0.1):
    """Call the Groq OpenAI-compatible chat-completions API.

    Returns (content, None) on success, or (None, error) where error is
    "NO_KEY" (no GROQ_API_KEY configured), an "HTTP <status>" string, or
    a truncated exception message.
    """
    if not GROQ_KEY: return None, "NO_KEY"
    url = "https://api.groq.com/openai/v1/chat/completions"
    h = {"Authorization": f"Bearer {GROQ_KEY}", "Content-Type": "application/json"}
    b = {"model": model, "messages": [{"role":"user","content":prompt}], "max_tokens": max_tokens, "temperature": temperature}
    try:
        if HAS_HTTPX:
            r = httpx.post(url, json=b, headers=h, timeout=45)
            if r.status_code == 200: return r.json()["choices"][0]["message"]["content"], None
            return None, f"HTTP {r.status_code}"
        else:
            # urllib fallback when httpx is not installed.
            import urllib.request, ssl
            req = urllib.request.Request(url, json.dumps(b).encode(), h)
            with urllib.request.urlopen(req, timeout=45, context=ssl.create_default_context()) as resp:
                return json.loads(resp.read())["choices"][0]["message"]["content"], None
    except Exception as e: return None, str(e)[:150]
# --- Heuristic marker tables shared by the detection axes below ---
# Formal sentence endings ("-supnida" style) typical of AI-generated Korean.
AI_ENDINGS = ['합니다','입니다','됩니다','습니다','있습니다','했습니다','겠습니다']
# Plain-style ("-da") ending phrases that AI models also overuse.
AI_CASUAL_ENDINGS = ['라고 할 수 있다','라고 볼 수 있다','다고 생각한다','다고 판단된다',
    '인 셈이다','인 것이다','는 것이다','는 셈이다','ㄹ 것이다','을 것이다',
    '라 할 수 있다','로 보인다','로 판단된다','고 있다','는 추세다','는 상황이다',
    '지 않을 수 없다','라 하겠다','음을 알 수 있다','할 필요가 있다']
# Sentence-initial connectives AI text leans on heavily.
AI_CONNS = ['또한','따라서','그러므로','이에 따라','한편','더불어','아울러','뿐만 아니라',
    '이를 통해','이에','결과적으로','궁극적으로','특히','나아가','이러한']
# Weaker connectives: contribute a smaller score.
AI_SOFT_CONNS = ['물론','그러나','하지만','이처럼','이와 같이','이를 바탕으로']
# Stock filler phrases common in AI output (substring match).
AI_FILLER = ['것으로 보','것으로 나타','것으로 예상','할 수 있','볼 수 있','주목할 만',
    '중요한 역할','중요한 의미','긍정적인 영향','부정적인 영향','필요합니다','필요하다',
    '중요합니다','중요하다','역할을 하','영향을 미','기대된다','예상됩니다','부각되고',
    '대두되고','다양한 분야','다양한 산업','눈부신 성과','획기적인 변화','혁신적인',
    '점에서','측면에서','관점에서']
# Grandiose/buzzword fillers seen in casual AI prose.
AI_CASUAL_FILLER = ['무궁무진하다','무궁무진한','과언이 아니','돌파구가 될','전환점이 될',
    '기반으로','발판으로','원동력이','초석이 될','가속화되','급부상','패러다임',
    '지평을 열','새로운 장을','대전환','본격화되','고도화','이정표']
# Concession-then-pivot pattern ("of course X, but Y") favoured by AI.
AI_CONCESSION = re.compile(r'물론.{2,20}(하지만|그러나|그렇지만|다만)|.{2,15}(이긴 하지만|기는 하지만|수 있지만|수는 있지만)')
# English-language AI cliches; matched against the lowercased sentence.
EN_AI_MARKERS = ['furthermore','additionally','moreover','it is worth noting','in conclusion',
    'it is important to','plays a crucial role','significant impact','various aspects',
    'in this regard','consequently','nevertheless','integral part of','led to remarkable',
    'fundamentally transformed','has become increasingly','it should be noted',
    'in the context of','paradigm shift','landscape of','methodologies',
    'transformative impact','unprecedented','in various domains']
# Signals of human-authored informal Korean: emoticons, chat slang,
# common typos, emphatic punctuation. These SUBTRACT from the AI score.
HUMAN_MARKERS = {
    'ㅋㅎㅠ': re.compile(r'([ㅋㅎㅠㅜㄷㄱ])\1{1,}'),
    '이모티콘': re.compile(r'[;:]-?[)(DPp]|\^[_\-]?\^|ㅡㅡ|;;'),
    '줄임': re.compile(r'ㄹㅇ|ㅇㅇ|ㄴㄴ|ㅇㅋ|ㄷㄷ|ㅂㅂ'),
    '느낌표': re.compile(r'[!?]{2,}'),
    '비격식종결': re.compile(r'(거든|잖아|인데|인걸|같음|느낌|아님|대박|미쳤|헐|ㅋ$|ㅎ$|임$|음$|듯$)'),
    '구어축약': re.compile(r'(걍|좀|막|완전|진짜|레알|존나|개|졸라|존맛|겁나)'),
    '맞춤법오류': re.compile(r'됬|몇일|금새|할수있|것같[은다]|되가|되서|안됀|햇다'),
    '말줄임표': re.compile(r'\.{3,}|…'),
}
# Per-model fingerprints: "m" = marker phrases, "e" = typical endings,
# "lp" = layout pattern (lists/markdown/citation style).
FP = {
    "GPT": {"m":['물론이죠','도움이 되셨기를','설명해 드리겠습니다','추가 질문','도움이 필요하시면',
        '요약하자면','간략히 정리하면','핵심은'],"e":['습니다','드리겠습니다'],"lp":re.compile(r'^\d+\.\s|^[-•]\s',re.M)},
    "Claude": {"m":['말씀하신','살펴보겠습니다','균형 잡힌','맥락에서','한 가지 주의할','뉘앙스',
        '흥미로운 질문','복잡한 주제'],"e":['네요','거예요'],"lp":re.compile(r'^\*\*.*\*\*|^#+\s',re.M)},
    "Gemini": {"m":['다음과 같습니다','정리해 드리겠습니다','핵심 내용을','더 알고 싶으시면',
        '알아보겠습니다'],"e":['겠습니다','보세요'],"lp":re.compile(r'^\*\s|^-\s\*\*',re.M)},
    "Perplexity": {"m":['검색 결과에 따르면','보도에 따르면','연구에 따르면','밝혔다','전했다',
        '것으로 나타났다','것으로 조사됐다','것으로 집계됐다','발표했다'],"e":['밝혔다','나타났다','전했다'],"lp":re.compile(r'\[\d+\]',re.M)},
}
def score_sentence(sent):
    """Score a single sentence for AI-likeness (0-100) with reasons.

    Shared by UI tab 1 and tab 2 (v5.0 weights). Positive weight comes
    from formal/AI-typical endings, sentence-initial connectives, stock
    fillers, per-model fingerprints and English AI cliches; human-style
    markers (slang, emoticons, typos, emphatic punctuation) subtract.
    Returns (score clamped to 0..100, list of reason strings).
    """
    sc = 0; reasons = []
    sl = sent.lower().strip()
    sr = sent.rstrip('.!?。')
    for e in AI_ENDINGS:
        if sr.endswith(e): sc += 22; reasons.append(f"격식어미(-{e})"); break
    if sc == 0: # only when no formal ending matched
        for e in AI_CASUAL_ENDINGS:
            if sr.endswith(e): sc += 15; reasons.append(f"비격식AI(-{e})"); break
    stripped = sent.strip()
    for c in AI_CONNS:
        if stripped.startswith(c):
            sc += 18; reasons.append(f"AI접속사({c})"); break
    else:
        # for-else: soft connectives only checked when no strong one hit.
        for c in AI_SOFT_CONNS:
            if stripped.startswith(c): sc += 8; reasons.append(f"약한접속사({c})"); break
    filler_found = sum(1 for f in AI_FILLER if f in sent)
    casual_filler = sum(1 for f in AI_CASUAL_FILLER if f in sent)
    total_filler = filler_found + casual_filler
    if total_filler >= 3: sc += 25; reasons.append(f"상투표현×{total_filler}")
    elif total_filler == 2: sc += 18; reasons.append(f"상투표현×2")
    elif total_filler == 1: sc += 10; reasons.append(f"상투표현×1")
    if AI_CONCESSION.search(sent): sc += 10; reasons.append("양보패턴")
    # Per-model fingerprint phrases: at most one hit per model counts.
    for mn, fp in FP.items():
        for m in fp["m"]:
            if m in sent: sc += 8; reasons.append(f"{mn}지문"); break
    en_count = sum(1 for em in EN_AI_MARKERS if em in sl)
    if en_count >= 3: sc += 25; reasons.append(f"영어AI×{en_count}")
    elif en_count >= 2: sc += 18; reasons.append(f"영어AI×{en_count}")
    elif en_count >= 1: sc += 12; reasons.append(f"영어AI×1")
    # Compound-evidence bonuses for co-occurring signals.
    has_formal = any(sr.endswith(e) for e in AI_ENDINGS)
    has_conn = any(stripped.startswith(c) for c in AI_CONNS)
    if has_formal and total_filler >= 1 and has_conn: sc += 8; reasons.append("복합AI")
    elif has_formal and total_filler >= 2: sc += 5; reasons.append("격식+상투")
    # Human-authorship markers subtract, weighted by marker strength.
    for n, p in HUMAN_MARKERS.items():
        matches = p.findall(sent)
        if matches:
            if n in ('ㅋㅎㅠ','이모티콘','줄임'): sc -= 25; reasons.append(f"인간({n})")
            elif n in ('비격식종결','구어축약'): sc -= 18; reasons.append(f"구어체({n})")
            elif n == '맞춤법오류': sc -= 12; reasons.append("맞춤법오류")
            elif n in ('느낌표','말줄임표'): sc -= 10; reasons.append(f"인간({n})")
    return max(0, min(100, sc)), reasons
def analyze_statistics(text, sentences, words):
    """Statistical axis: sentence-length uniformity, burstiness, vocabulary.

    Each sub-score maps a statistic onto 0-100 where HIGHER means more
    AI-like (uniform lengths, low burstiness, mid-range type/token ratio).
    Returns {"score", "cv", "ttr"}; a neutral {"score": 50} for <2 sentences.
    """
    sl = [len(s) for s in sentences]
    if len(sl) < 2: return {"score":50}
    avg = sum(sl)/len(sl); std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
    cv = std/avg if avg > 0 else 0
    # Low coefficient of variation = very uniform sentence lengths (AI-like).
    cv_score = 85 if cv<0.20 else 70 if cv<0.30 else 50 if cv<0.45 else 30 if cv<0.60 else 15
    diffs = [abs(sl[i]-sl[i-1]) for i in range(1,len(sl))]
    burst_score = 50
    if diffs:
        avg_d = sum(diffs)/len(diffs)
        max_d = max(diffs)
        # Human text is "bursty": a large max jump vs the average lowers the score.
        burst_ratio = max_d / (avg_d + 1)
        burst_score = 85 if burst_ratio < 1.8 else 65 if burst_ratio < 2.5 else 40 if burst_ratio < 3.5 else 20
    # Share of sentences in the 20-60 char "standard" band.
    standard_ratio = sum(1 for l in sl if 20 <= l <= 60) / len(sl)
    std_score = 80 if standard_ratio > 0.8 else 60 if standard_ratio > 0.6 else 40 if standard_ratio > 0.4 else 20
    # Very short/long sentences read as human; discount for them.
    extreme = sum(1 for l in sl if l < 10 or l > 80)
    if extreme >= 2: std_score = max(10, std_score - 20)
    elif extreme >= 1: std_score = max(15, std_score - 10)
    wf = Counter(words); t = len(words)
    ttr = len(wf)/t if t>0 else 0
    vocab_score = 70 if ttr<0.45 else 55 if ttr<0.55 else 35 if ttr<0.65 else 20
    # Words-per-sentence uniformity (syntactic complexity spread).
    wpc = [len(split_words(s)) for s in sentences]
    complex_score = 50
    if len(wpc) >= 3:
        wpc_avg = sum(wpc)/len(wpc)
        wpc_std = math.sqrt(sum((w-wpc_avg)**2 for w in wpc)/len(wpc))
        wpc_cv = wpc_std/wpc_avg if wpc_avg > 0 else 0
        complex_score = 80 if wpc_cv < 0.20 else 60 if wpc_cv < 0.35 else 35 if wpc_cv < 0.50 else 15
    final = int(cv_score*0.20 + burst_score*0.20 + std_score*0.25 + vocab_score*0.15 + complex_score*0.20)
    return {"score":final,"cv":round(cv,3),"ttr":round(ttr,3)}
def analyze_korean_style(text, sentences, morphemes):
    """Korean style axis: endings, connectives, fillers, POS balance.

    Higher = more AI-like. Human informal markers subtract a capped
    penalty. Returns the score plus a few diagnostic ratios.
    """
    if not sentences: return {"score":50}
    formal_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_ENDINGS))
    casual_ai = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
    fr = formal_cnt/len(sentences)
    car = casual_ai/len(sentences)
    ending_score = 85 if fr>0.7 else 65 if fr>0.5 else 45 if fr>0.3 else 25 if fr>0.1 else 10
    ending_score = min(90, ending_score + int(car * 25)) # bonus for casual-AI endings
    conn_positions = []
    for i, s in enumerate(sentences):
        for c in AI_CONNS:
            if s.strip().startswith(c): conn_positions.append(i); break
    conn_density = len(conn_positions)/len(sentences) if sentences else 0
    conn_score = 85 if conn_density>0.4 else 65 if conn_density>0.25 else 40 if conn_density>0.1 else 15
    if len(conn_positions) >= 2:
        # Coefficient of variation of the gaps between connective sentences.
        gaps = [conn_positions[i]-conn_positions[i-1] for i in range(1,len(conn_positions))]
        gap_cv = (math.sqrt(sum((g-sum(gaps)/len(gaps))**2 for g in gaps)/len(gaps))/(sum(gaps)/len(gaps)+0.01))
        if gap_cv < 0.5: conn_score = min(90, conn_score + 10) # very regular spacing -> AI bonus
    filler_cnt = sum(1 for f in AI_FILLER if f in text) + sum(1 for f in AI_CASUAL_FILLER if f in text)
    filler_score = 90 if filler_cnt>=6 else 75 if filler_cnt>=4 else 55 if filler_cnt>=2 else 30 if filler_cnt>=1 else 10
    concession_cnt = len(AI_CONCESSION.findall(text))
    conc_score = 80 if concession_cnt >= 2 else 55 if concession_cnt >= 1 else 20
    human_count = sum(len(p.findall(text)) for p in HUMAN_MARKERS.values())
    human_penalty = min(35, human_count * 8)
    pos_score = 45
    if morphemes:
        # High noun ratio is characteristic of formal/AI prose.
        pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
        noun_r = sum(pc.get(t,0) for t in ['NNG','NNP','NNB','NR'])/tm if tm else 0
        pos_score = 70 if noun_r>0.42 else 55 if noun_r>0.38 else 35 if noun_r>0.32 else 20
    final = max(5, int(ending_score*0.25 + conn_score*0.20 + filler_score*0.20 +
        conc_score*0.10 + pos_score*0.15 + 10*0.10) - human_penalty)
    return {"score":final,"formal":f"{fr:.0%}","conn":f"{conn_density:.2f}","filler":filler_cnt,"human":human_count}
def analyze_repetition(text, sentences, words):
    """Repetition axis: n-gram reuse, sentence openers, endings, templates.

    Higher = more AI-like (repeated trigrams, AI connective openers, low
    ending diversity). Returns {"score": int}; 35 for <2 sentences.
    """
    if not sentences or len(sentences) < 2: return {"score":35}
    tr = 0
    if len(words)>=5:
        # Fraction of word trigrams that occur more than once.
        tg = Counter(tuple(words[i:i+3]) for i in range(len(words)-2))
        tr = sum(1 for c in tg.values() if c>1)/len(tg) if tg else 0
    ngram_score = 80 if tr>0.15 else 60 if tr>0.08 else 35 if tr>0.03 else 15
    # Two- and three-word sentence openers (uniqueness = human-like).
    openers_2 = []
    openers_3 = []
    for s in sentences:
        ws = split_words(s)
        if len(ws) >= 2: openers_2.append(tuple(ws[:2]))
        if len(ws) >= 3: openers_3.append(tuple(ws[:3]))
    opener2_score = 50
    if openers_2:
        unique2 = len(set(openers_2))/len(openers_2)
        opener2_score = 80 if unique2 < 0.5 else 60 if unique2 < 0.7 else 35 if unique2 < 0.85 else 15
    # Subset of AI_CONNS without the soft/ambiguous ones.
    ai_only_conns = ['또한','따라서','그러므로','이에 따라','더불어','아울러','뿐만 아니라',
        '이를 통해','이에','결과적으로','궁극적으로','나아가','이러한']
    cr = sum(1 for s in sentences if any(s.strip().startswith(c) for c in ai_only_conns))
    crr = cr/len(sentences) if sentences else 0
    ai_conn_score = 85 if crr>0.35 else 65 if crr>0.2 else 40 if crr>0.08 else 15
    # (first word, last word) pairs approximate a sentence "template".
    templates = []
    for s in sentences:
        ws = split_words(s)
        if len(ws) >= 4:
            templates.append((ws[0], ws[-1]))
    template_rep = 0
    if templates:
        tc = Counter(templates)
        template_rep = sum(1 for c in tc.values() if c > 1) / len(tc) if tc else 0
    template_score = 80 if template_rep > 0.3 else 55 if template_rep > 0.1 else 25
    # Ending-form diversity: few distinct endings = AI-like monotony.
    endings = []
    for s in sentences:
        sr = s.rstrip('.!?。')
        for e in AI_ENDINGS + ['있다','했다','된다','한다','이다','는다']:
            if sr.endswith(e): endings.append(e); break
    ending_div = 50
    if endings:
        unique_e = len(set(endings))/len(endings)
        ending_div = 80 if unique_e < 0.3 else 60 if unique_e < 0.5 else 35 if unique_e < 0.7 else 15
    final = int(ngram_score*0.15 + opener2_score*0.20 + ai_conn_score*0.25 +
        template_score*0.15 + ending_div*0.25)
    return {"score":final}
# Vague adjectives that signal abstract, unspecific AI prose.
AI_VAGUE = re.compile(r'다양한|중요한|긍정적인|부정적인|획기적인|혁신적인|효율적인|체계적인|종합적인|전반적인|지속적인|적극적인|상당한|주요한')
# Concreteness signals: known proper nouns / place-name particles, ...
CONCRETE_PROPER = re.compile(r'삼성|LG|현대|SK|카카오|네이버|넷플릭스|구글|애플|테슬라|아마존|마이크로소프트|[가-힣]{2,}대학|[가-힣]{2,}병원|[가-힣]{1,3}시[는을이가]|[가-힣]{1,3}구[는을이가]|[가-힣]{2,}동[에서]')
# ... numeric figures/dates, ...
CONCRETE_NUMBER = re.compile(r'\d{2,}[만억조원달러%개년월일위등호]|\d+\.\d+%|\d{4}년|\d{1,2}월')
# ... and quotation/citation phrasing.
CONCRETE_QUOTE = re.compile(r'에 따르면|발표했|밝혔다|보도했|전했다|라고 말|인터뷰|설문|조사|통계청|보고서')
def analyze_structure(text, sentences):
    """Structure axis: list formatting, abstractness, sentence-type variety.

    Higher = more AI-like (markdown lists, vague wording without concrete
    facts, uniform declarative sentences, uniform paragraph sizes).
    Returns {"score": int}; 35 when there are no sentences.
    """
    if not sentences: return {"score":35}
    # Count numbered lists, bullets, headers and bold spans (markdown habits).
    lt = (len(re.findall(r'^\d+[.)]\s',text,re.M)) + len(re.findall(r'^[-•*]\s',text,re.M)) +
        len(re.findall(r'^#+\s',text,re.M)) + len(re.findall(r'\*\*[^*]+\*\*',text)))
    list_score = 90 if lt>=5 else 70 if lt>=3 else 45 if lt>=1 else 10
    # Vague wording vs concrete facts (names, numbers, citations).
    vague_cnt = len(AI_VAGUE.findall(text))
    proper_cnt = len(CONCRETE_PROPER.findall(text))
    number_cnt = len(CONCRETE_NUMBER.findall(text))
    quote_cnt = len(CONCRETE_QUOTE.findall(text))
    concrete_total = proper_cnt + number_cnt + quote_cnt
    if vague_cnt >= 3 and concrete_total == 0: abstract_score = 90
    elif vague_cnt >= 2 and concrete_total <= 1: abstract_score = 70
    elif vague_cnt >= 1 and concrete_total == 0: abstract_score = 55
    elif concrete_total >= 3: abstract_score = 10
    elif concrete_total >= 2: abstract_score = 20
    elif concrete_total >= 1: abstract_score = 30
    else: abstract_score = 45
    # Sentence-type variety: questions/exclamations/ellipses read human.
    has_question = any(s.strip().endswith('?') for s in sentences)
    has_exclaim = any(s.strip().endswith('!') for s in sentences)
    has_ellipsis = any('...' in s or '…' in s for s in sentences)
    variety = sum([has_question, has_exclaim, has_ellipsis])
    type_score = 15 if variety >= 2 else 40 if variety >= 1 else 65
    puncts = re.findall(r'[!?,;:…—\-~]', text)
    unique_punct = len(set(puncts))
    punct_score = 65 if unique_punct <= 1 else 45 if unique_punct <= 3 else 20
    # Paragraph-size uniformity (sentences per paragraph).
    paras = [p.strip() for p in text.split('\n\n') if p.strip()]
    para_score = 35
    if len(paras) >= 2:
        pl = [len(split_sentences(p)) for p in paras]
        avg_p = sum(pl)/len(pl)
        if avg_p > 0:
            pcv = math.sqrt(sum((l-avg_p)**2 for l in pl)/len(pl))/avg_p
            para_score = 75 if pcv < 0.2 else 55 if pcv < 0.35 else 30
        # Short intro and outro paragraphs: classic AI essay shape.
        if len(paras) >= 3 and pl[0] < avg_p and pl[-1] < avg_p:
            para_score = min(85, para_score + 10)
    final = int(list_score*0.10 + abstract_score*0.40 + type_score*0.20 + punct_score*0.10 + para_score*0.20)
    return {"score":final}
def analyze_model_fingerprint(text, sentences):
    """Fingerprint axis: match per-model phrase/layout/ending signatures.

    Builds a 0-100 score per model in FP, plus generic "casual AI" and
    "English AI" buckets, then converts the strongest match (with a
    multi-model bonus) into the axis score. Returns the axis score and
    the per-model sub-scores (generic buckets only when non-zero).
    """
    ms = {}
    sl = text.lower()
    for mn, fp in FP.items():
        # Marker phrases: 5 points per occurrence, capped at 15 per phrase.
        sc = sum(min(15,text.count(m)*5) for m in fp["m"] if text.count(m)>0)
        lm = fp["lp"].findall(text)
        if lm: sc += min(20,len(lm)*3)
        # Share of sentences with that model's typical endings.
        em = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in fp.get("e",[])))
        if sentences: sc += int((em/len(sentences))*20)
        ms[mn] = min(100,sc)
    # Generic casual-AI signals not tied to one model.
    general_ai = 0
    general_ai += sum(5 for f in AI_CASUAL_FILLER if f in text)
    casual_end_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
    general_ai += casual_end_cnt * 5
    general_ai += len(AI_CONCESSION.findall(text)) * 8
    ms["비격식AI"] = min(100, general_ai)
    en_score = sum(5 for em in EN_AI_MARKERS if em in sl)
    ms["영어AI"] = min(100, en_score)
    mx = max(ms.values()) if ms else 0
    # Bonus when several fingerprints fire at once.
    multi = sum(1 for v in ms.values() if v >= 10)
    multi_bonus = 10 if multi >= 3 else 5 if multi >= 2 else 0
    base = 85 if mx>=50 else 65 if mx>=35 else 45 if mx>=20 else 25 if mx>=10 else 10
    return {"score":min(95, base + multi_bonus),"model_scores":{k:v for k,v in ms.items() if k not in ("비격식AI","영어AI") or v > 0}}
def analyze_perplexity(text, sentences, morphemes):
    """Korean-adapted perplexity + burstiness axis (character-entropy based).

    Four sub-scores, each 0-100 with higher = more AI-like: character
    bigram entropy, sentence-length burstiness, per-sentence TTR spread,
    and ending-trigram entropy. Returns the weighted score plus the four
    sub-scores under "entropy"/"variance"/"order"/"zipf".
    """
    if len(sentences) < 2: return {"score": 40, "entropy": 0, "variance": 0, "order": 0, "zipf": 0}
    # Character-bigram Shannon entropy over non-whitespace characters:
    # low entropy = predictable character sequences (AI-like).
    chars = [c for c in text if c.strip()]
    char_score = 45
    if len(chars) >= 30:
        cbigrams = [(chars[i], chars[i+1]) for i in range(len(chars)-1)]
        cb_freq = Counter(cbigrams)
        total_cb = len(cbigrams)
        char_entropy = -sum((cnt/total_cb)*math.log2(cnt/total_cb) for cnt in cb_freq.values())
        if char_entropy < 7.5: char_score = 78
        elif char_entropy < 8.5: char_score = 62
        elif char_entropy < 9.5: char_score = 42
        elif char_entropy < 10.5: char_score = 25
        else: char_score = 12
    # Burstiness: low CV of sentence lengths = AI-like uniformity.
    sl = [len(s) for s in sentences]
    burst_score = 45
    if len(sl) >= 3:
        avg = sum(sl)/len(sl)
        std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
        cv = std/(avg+1e-10)
        if cv < 0.15: burst_score = 82
        elif cv < 0.25: burst_score = 62
        elif cv < 0.40: burst_score = 38
        elif cv < 0.60: burst_score = 20
        else: burst_score = 8
    # Spread of per-sentence type/token ratios: a narrow spread means
    # every sentence has the same lexical richness (AI-like).
    sent_ttr = []
    for s in sentences:
        sw = split_words(s)
        if len(sw) >= 3:
            sent_ttr.append(len(set(sw))/len(sw))
    ttr_score = 42
    if len(sent_ttr) >= 3:
        avg_ttr = sum(sent_ttr)/len(sent_ttr)
        std_ttr = math.sqrt(sum((t-avg_ttr)**2 for t in sent_ttr)/len(sent_ttr))
        if std_ttr < 0.04: ttr_score = 75
        elif std_ttr < 0.08: ttr_score = 55
        elif std_ttr < 0.15: ttr_score = 35
        else: ttr_score = 15
    # Normalized entropy of the last-3-character endings: low entropy
    # means the same endings repeat (AI-like monotony).
    endings = [s.rstrip('.!?\u2026')[-3:] for s in sentences if len(s) >= 5]
    end_score = 40
    if len(endings) >= 3:
        ef = Counter(endings)
        end_ent = -sum((c/len(endings))*math.log2(c/len(endings)) for c in ef.values())
        max_ent = math.log2(len(ef)) if len(ef) > 1 else 1
        norm_ent = end_ent / (max_ent + 1e-10)
        if norm_ent < 0.5: end_score = 72
        elif norm_ent < 0.7: end_score = 50
        elif norm_ent < 0.85: end_score = 32
        else: end_score = 15
    final = int(char_score * 0.30 + burst_score * 0.30 + ttr_score * 0.20 + end_score * 0.20)
    return {"score": final, "entropy": char_score, "variance": burst_score, "order": ttr_score, "zipf": end_score}
# Paraphrase-tool ("humanizer") residue patterns:
# near-synonym verbs over-substituted for each other, ...
HUMANIZER_OVERSUBST = re.compile(r'활용하다|이용하다|사용하다|적용하다|도입하다|채택하다|수행하다|진행하다|실시하다|실행하다')
# ... stilted demonstratives / bureaucratic back-references, ...
HUMANIZER_AWKWARD = re.compile(r'그것은|이것은|저것은|해당 사항|앞서 언급한|전술한|상기한|기술된')
# ... and double-passive constructions introduced by rewriting.
HUMANIZER_PASSIVE = re.compile(r'되어지[고는며]|하게 되었[다습]|수행되[었어]|진행되[었어]|실시되[었어]|활용되[었어]')
def analyze_humanizer(text, sentences, words, morphemes):
    """Humanizer/bypasser axis: detect paraphrased-AI residue.

    Accumulates weighted (name, weight, description) signals — excessive
    synonym substitution, structure-preserving lexical swaps, residual
    regular connective spacing, awkward substitutions, over-diverse
    openings — and maps the evidence total to a 0-100 score.
    Returns {"score", "signals"[, "total_evidence"]}.
    """
    if len(sentences) < 2: return {"score": 20, "signals": []}
    signals = []
    # Hapax ratio of content words: near-total uniqueness suggests a
    # synonym-substitution pass that avoided any word repetition.
    content_words = [f for f, t in morphemes if t in ('NNG', 'NNP', 'VV', 'VA')]
    if len(content_words) >= 10:
        cw_freq = Counter(content_words)
        hapax = sum(1 for c in cw_freq.values() if c == 1)
        hapax_ratio = hapax / len(cw_freq) if cw_freq else 0
        if hapax_ratio > 0.95 and len(content_words) >= 30:
            signals.append(("동의어과다치환", 20, "핵심 어휘가 과도하게 분산"))
        elif hapax_ratio > 0.90 and len(content_words) >= 25:
            signals.append(("동의어치환의심", 12, "어휘 반복 회피 패턴"))
    # Uniform sentence lengths + abnormally diverse vocabulary:
    # structure kept, words swapped.
    sl = [len(s) for s in sentences]
    if len(sl) >= 4:
        avg = sum(sl) / len(sl)
        cv = math.sqrt(sum((l - avg)**2 for l in sl) / len(sl)) / (avg + 1e-10)
        unique_ratio = len(set(words)) / len(words) if words else 0
        if cv < 0.20 and unique_ratio > 0.80 and len(sentences) >= 5:
            signals.append(("구조보존어휘변경", 18, "문장 구조 균일 + 비정상적 어휘 다양성"))
    residual = 0
    conn_positions = []
    for i, s in enumerate(sentences):
        stripped = s.strip()
        for c in ['또한','특히','한편','더불어','아울러','나아가','이에','게다가','반면','결국']:
            if stripped.startswith(c):
                conn_positions.append(i)
                break
    if len(conn_positions) >= 2:
        gaps = [conn_positions[i] - conn_positions[i-1] for i in range(1, len(conn_positions))]
        if gaps and max(gaps) - min(gaps) <= 1: # nearly equal spacing
            signals.append(("접속사등간격잔존", 15, "접속사 배치가 규칙적 (AI 원문 구조 잔존)"))
            residual += 15
    # Regex-based substitution artifacts.
    oversubst = len(HUMANIZER_OVERSUBST.findall(text))
    awkward = len(HUMANIZER_AWKWARD.findall(text))
    passive = len(HUMANIZER_PASSIVE.findall(text))
    if oversubst >= 3:
        signals.append(("유사동사난무", 12, f"활용/이용/사용/적용 등 {oversubst}개"))
    if awkward >= 2:
        signals.append(("어색한지시어", 10, f"해당/전술/상기 등 {awkward}개"))
    if passive >= 3:
        signals.append(("이중피동과다", 15, f"되어지/수행되 등 {passive}개"))
    # Declarative-only text with oddly diverse endings, or every single
    # sentence starting differently — both unnatural over-diversity.
    endings = [s.rstrip('.!?')[-2:] for s in sentences if len(s) >= 4]
    end_types = len(set(endings)) / len(endings) if endings else 0
    has_question = any(s.strip().endswith('?') for s in sentences)
    has_exclaim = any(s.strip().endswith('!') for s in sentences)
    sent_type_variety = sum([has_question, has_exclaim])
    if sent_type_variety == 0 and end_types > 0.85 and len(sentences) >= 6:
        signals.append(("유형단조어미다양", 12, "서술문만 + 종결어미 과다 다양 = Humanizer 패턴"))
    starters = [s.strip()[:3] for s in sentences if len(s) >= 6]
    starter_unique = len(set(starters)) / len(starters) if starters else 0
    if starter_unique >= 0.98 and len(sentences) >= 7:
        signals.append(("문두과다다양", 8, "모든 문장 시작이 다름 (자연스럽지 않은 다양성)"))
    total = sum(s[1] for s in signals)
    if total >= 45: score = 85
    elif total >= 30: score = 68
    elif total >= 20: score = 52
    elif total >= 10: score = 35
    else: score = 15
    return {"score": score, "signals": signals, "total_evidence": total}
# Per-model stylistic profiles used by estimate_model():
#   style      - descriptive notes (display only),
#   markers    - phrases matched as substrings of the text,
#   structure  - display-only layout description,
#   endings    - sentence endings matched with endswith,
#   connectors - sentence-initial connectives matched with startswith.
MODEL_PROFILES = {
    "GPT": {
        "style": ["격식체 ~습니다", "또한/특히 접속사", "~에 대해", "~것으로 예상됩니다"],
        "markers": ["다양한", "중요한 역할", "긍정적인", "눈부신", "주목할 만한", "살펴보겠습니다"],
        "structure": "균일한 문단, 서론-본론-결론 구조, 마크다운 선호",
        "endings": ["습니다", "있습니다", "됩니다", "입니다"],
        "connectors": ["또한", "특히", "한편", "이처럼"],
    },
    "Claude": {
        "style": ["맥락 제시", "균형 잡힌", "사려 깊은 어조", "양보 후 주장"],
        "markers": ["흥미로운 질문", "복잡한 주제", "맥락에서", "균형 잡힌", "살펴보겠습니다", "한 가지 주의할"],
        "structure": "자연스러운 흐름, 양보-주장 구문 선호, 부드러운 전환",
        "endings": ["네요", "입니다", "있습니다", "습니다"],
        "connectors": ["한편", "물론", "다만", "이와 관련해"],
    },
    "Gemini": {
        "style": ["정보 나열형", "~에 대해 알아보겠습니다", "검색 결과 기반"],
        "markers": ["알아보겠습니다", "도움이 되셨으면", "추가 질문", "참고로"],
        "structure": "리스트/번호 매기기 선호, 헤더 활용, 정보 밀도 높음",
        "endings": ["습니다", "있습니다", "세요"],
        "connectors": ["또한", "그리고", "참고로"],
    },
    "Perplexity": {
        "style": ["출처 인용형", "~에 따르면", "수치 제시", "것으로 나타났다"],
        "markers": ["에 따르면", "것으로 나타났", "것으로 조사됐", "것으로 집계됐", "발표했", "보도에 따르면"],
        "structure": "팩트 중심, 수치 인용 다수, 출처 명시 스타일",
        "endings": ["습니다", "나타났다", "밝혔다", "전했다"],
        "connectors": ["한편", "또한", "이에"],
    },
}
def estimate_model(text, sentences, morphemes, model_scores):
    """Estimate which AI model produced the text, from combined evidence.

    Combines the fingerprint scores (model_scores, from
    analyze_model_fingerprint) with marker phrases, ending ratios and
    connective patterns per MODEL_PROFILES, plus model-specific bonuses
    (citations -> Perplexity, concession pattern -> Claude). Returns the
    top model with a confidence label, its reasons, the full evidence
    dict, and the ranked list; "특정 불가" when evidence is too weak.
    """
    evidence = {m: {"score": 0, "reasons": []} for m in MODEL_PROFILES}
    sl = text.lower()
    for model, profile in MODEL_PROFILES.items():
        # Carry over the fingerprint score at reduced weight.
        fp_score = model_scores.get(model, 0)
        evidence[model]["score"] += fp_score * 0.4
        if fp_score >= 20:
            evidence[model]["reasons"].append(f"지문 매칭 {fp_score}점")
        marker_cnt = sum(1 for m in profile["markers"] if m in text)
        if marker_cnt >= 2:
            evidence[model]["score"] += marker_cnt * 8
            evidence[model]["reasons"].append(f"특유 표현 {marker_cnt}개")
        end_match = 0
        for s in sentences:
            for e in profile["endings"]:
                if s.rstrip('.!?').endswith(e):
                    end_match += 1; break
        if sentences:
            end_ratio = end_match / len(sentences)
            if end_ratio > 0.7:
                evidence[model]["score"] += 12
                evidence[model]["reasons"].append(f"종결어미 {end_ratio:.0%} 일치")
        conn_match = sum(1 for s in sentences if any(s.strip().startswith(c) for c in profile["connectors"]))
        if conn_match >= 2:
            evidence[model]["score"] += conn_match * 4
            evidence[model]["reasons"].append(f"접속사 패턴 {conn_match}회")
    # Model-specific extra evidence: figures/citations favour Perplexity.
    number_citations = len(re.findall(r'\d+[%만억조]|에 따르면|것으로 나타났|발표했', text))
    if number_citations >= 3:
        evidence["Perplexity"]["score"] += number_citations * 5
        evidence["Perplexity"]["reasons"].append(f"수치/인용 {number_citations}회")
    # Concession-then-pivot phrasing favours Claude.
    concession_cnt = len(AI_CONCESSION.findall(text))
    if concession_cnt >= 1:
        evidence["Claude"]["score"] += concession_cnt * 10
        evidence["Claude"]["reasons"].append(f"양보-주장 구문 {concession_cnt}회")
    ranked = sorted(evidence.items(), key=lambda x: x[1]["score"], reverse=True)
    top = ranked[0]
    second = ranked[1] if len(ranked) > 1 else None
    if top[1]["score"] < 10:
        return {"model": "특정 불가", "confidence": "낮음", "detail": evidence, "ranked": ranked}
    # Confidence from both the absolute score and the lead over #2.
    gap = top[1]["score"] - (second[1]["score"] if second else 0)
    if gap >= 20 and top[1]["score"] >= 30:
        conf = "높음"
    elif gap >= 10 and top[1]["score"] >= 20:
        conf = "중간"
    else:
        conf = "낮음"
    return {
        "model": top[0],
        "confidence": conf,
        "score": top[1]["score"],
        "reasons": top[1]["reasons"],
        "detail": evidence,
        "ranked": ranked
    }
def analyze_quality(text, sentences, words, morphemes):
    """Writing-quality rating (independent of AI detection).

    Six sub-scores — readability, vocabulary richness, logical structure,
    correctness (spelling/spacing), expressiveness, information density —
    combined into a weighted total and an S..F letter grade.
    Returns {"score", "grade", "sub_scores"}.
    """
    qs = {}; sl = [len(s) for s in sentences]; tw = len(words)
    # Readability: share of 15-70 char sentences, penalized by >100 char runs.
    ideal = sum(1 for l in sl if 15<=l<=70)/len(sentences) if sentences else 0
    qs["가독성"] = min(100,int(ideal*70+(1-sum(1 for l in sl if l>100)/max(1,len(sentences)))*30))
    # Vocabulary: moving-average TTR over 50-word windows + hapax ratio.
    wf = Counter(words); uw = len(wf)
    mattr = (sum(len(set(words[i:i+50]))/50 for i in range(max(1,tw-50)))/max(1,tw-50)) if tw>=100 else (uw/tw if tw>0 else 0.5)
    hr = sum(1 for c in wf.values() if c==1)/tw if tw>0 else 0
    qs["어휘풍부도"] = min(100,int(mattr*80+hr*40))
    # Logic: how many connective categories are represented in the text.
    lc = {'순접':['그래서','따라서'],'역접':['그러나','하지만','다만'],'첨가':['또한','그리고','게다가'],'전환':['한편'],'예시':['예를 들어'],'요약':['결국','결론적으로']}
    ut = sum(1 for cw in lc.values() if any(w in text for w in cw))
    qs["논리구조"] = min(100,int(ut/len(lc)*60+min(40,ut*10)))
    # Correctness: common misspellings and spacing errors deduct points.
    si = sum(1 for p in [re.compile(r'됬'),re.compile(r'몇일'),re.compile(r'금새')] if p.search(text))
    spi = sum(1 for p in [re.compile(r'할수있'),re.compile(r'것같')] if p.search(text))
    qs["정확성"] = max(0,100-(si+spi)*15)
    # Expressiveness: adjective/adverb ratio + verb/adjective lemma variety.
    ar=0;vv=0
    if morphemes:
        pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
        ar = sum(pc.get(t,0) for t in ['VA','MAG','MAJ'])/tm if tm else 0
        vv = len(set(f for f,t in morphemes if t in ['VV','VA']))/max(1,sum(1 for _,t in morphemes if t in ['VV','VA']))
    qs["표현풍부성"] = min(100,int(ar*200+vv*30))
    # Information density: content morphemes vs functional morphemes.
    cr = 0.5
    if morphemes:
        ct={'NNG','NNP','VV','VA','MAG'}; ft={'JKS','JKC','JKG','JKO','JX','JC','EP','EF','EC','ETN','ETM'}
        cc=sum(1 for _,t in morphemes if t in ct); fc=sum(1 for _,t in morphemes if t in ft)
        cr = cc/(cc+fc) if (cc+fc)>0 else 0.5
    qs["정보밀도"] = min(100,int(cr*80))
    wq = {"가독성":.20,"어휘풍부도":.18,"논리구조":.18,"정확성":.18,"표현풍부성":.13,"정보밀도":.13}
    total = int(sum(qs[k]*wq[k] for k in wq))
    grade = "S" if total>=85 else "A" if total>=72 else "B" if total>=58 else "C" if total>=42 else "D" if total>=28 else "F"
    return {"score":total,"grade":grade,"sub_scores":qs}
# (Groq model id, display name) pairs used for LLM cross-validation judging.
LLM_JUDGES = [("openai/gpt-oss-120b","GPT-OSS 120B"),("qwen/qwen3-32b","Qwen3 32B"),("moonshotai/kimi-k2-instruct-0905","Kimi-K2")]
def _parse_ai_probability(raw_resp):
"""LLM 응답에서 AI 확률(0~100)을 추출. 한국어/영어 다양한 형식 대응."""
if not raw_resp: return -1
think_content = ''
think_m = re.search(r'
파일: {fname} | 생성: {now} | 엔진: AI Detector v4.0
품질: {quality['grade']} ({quality['score']}점)
| 축 | 점수 |
|---|
| No | 내용 (발췌) | AI 점수 | 판정 |
|---|
| 기능 | 🔎 AI 판별기 v5.1 |
📋 카피킬러 GPT킬러 |
🅹 JustDone .com |
🔍 GPTZero | 🅾 Originality .ai |
🅃 Turnitin |
|---|---|---|---|---|---|---|
| 🎯 한국어 형태소 분석 | ★ 전용 | ✓ | ✗ | ✗ | ✗ | △ |
| 📊 탐지 축 수 | 8축 | 1~2축 | 1축 | 2축 | 1축 | 1축 |
| 🧠 Perplexity 확률분석 | ✓ 4중 | ✗ | ✗ | ✓ | ✗ | ✗ |
| 🛡️ Humanizer/Bypasser 탐지 | ✓ 6시그널 | ✗ | ✗ | ✗ | ✓ | ✓ |
| 🎯 AI 모델 추정 | ★ 4모델 | ✗ | ✗ | △ | ✗ | ✗ |
| 🤖 LLM 교차검증 | ★ 3모델 | ✗ | ✗ | ✗ | ✗ | ✗ |
| 📝 판정 근거 투명 공개 | ★ 8축별 | ✗ | ✗ | △ | △ | ✗ |
| 🎨 문장별 하이라이트 | ✓ 5단계 | ✗ | ✗ | ✓ | ✓ | ✓ |
| 📄 HWP/HWPX 문서 | ✓ | ✓ | ✗ | ✗ | ✗ | ✗ |
| 📝 글 품질 측정 | ✓ 6항목 | ✗ | △ | ✗ | ✓ | ✗ |
| 🔍 표절 검사 | ✓ 5소스 | ✓ DB | ✓ | ✗ | ✗ | ✓ DB |
| 💰 가격 | 완전 무료 | 9,900원/건 | $7.99~/월 | $8.33~/월 | $14.95/월 | 기관 전용 |
| 🎯 대상 모델 | 전 AI 모델 | GPT만 | 범용 | 범용 | 범용 | 범용 |
| ⚠️ 핵심 약점 | 규칙 기반 (ML 미적용) |
GPT만·오탐률 89% 불만 |
점수 30%+ 편차 "Hello"→92%AI |
혼합텍스트 41% 편집AI 35% 오류 |
$14.95/월 비영어 약함 |
기관만 사용 학생 접근 불가 |