openfree committed on
Commit
ada4ae5
·
verified ·
1 Parent(s): 0d5c726

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +936 -247
app.py CHANGED
@@ -6,27 +6,6 @@ from pathlib import Path
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from xml.etree import ElementTree as ET
8
  from kiwipiepy import Kiwi
9
-
10
- # βœ… text_utilsμ—μ„œ ν•„μš”ν•œ ν•¨μˆ˜λ“€ import
11
- from text_utils import (
12
- extract_text_from_pdf,
13
- extract_text_from_docx,
14
- extract_text_from_txt,
15
- extract_text_from_hwpx,
16
- extract_text_from_hwp,
17
- extract_file_text_api,
18
- split_sentences,
19
- split_words,
20
- http_get,
21
- brave_search,
22
- search_kci,
23
- search_riss,
24
- search_arxiv,
25
- duckduckgo_search,
26
- self_crawl_search,
27
- parallel_brave_search,
28
- )
29
-
30
  KIWI = Kiwi()
31
  try:
32
  import httpx; HAS_HTTPX = True
@@ -36,6 +15,618 @@ try:
36
  from google import genai
37
  from google.genai import types as gtypes
38
  HAS_GENAI = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  passive = len(HUMANIZER_PASSIVE.findall(text))
40
  if oversubst >= 3:
41
  signals.append(("μœ μ‚¬λ™μ‚¬λ‚œλ¬΄", 12, f"ν™œμš©/이용/μ‚¬μš©/적용 λ“± {oversubst}개"))
@@ -358,72 +949,34 @@ def search_arxiv(query):
358
  pass
359
  return results[:3]
360
  def gemini_plagiarism_check(text_chunk):
361
- """βœ… κ°œμ„ : 전체 ν…μŠ€νŠΈλ‘œ ν‘œμ ˆ 검사 (블둝 μ œν•œ μ—†μŒ)"""
362
  if not HAS_GENAI or not GEMINI_KEY: return None
363
  try:
364
  client = genai.Client(api_key=GEMINI_KEY)
365
  tool = gtypes.Tool(google_search=gtypes.GoogleSearch())
366
- # βœ… κ°œμ„ : 전체 ν…μŠ€νŠΈλ₯Ό 전달 ([:1000] 제거!)
367
- prompt = f"""당신은 κ³ κΈ‰ ν‘œμ ˆ 검사 μ „λ¬Έκ°€μž…λ‹ˆλ‹€.
368
-
369
- λ‹€μŒ ν…μŠ€νŠΈλ₯Ό Google Search둜 κ²€μƒ‰ν•˜μ—¬ μΈν„°λ„·μ—μ„œ λ™μΌν•˜κ±°λ‚˜ 맀우 μœ μ‚¬ν•œ λ‚΄μš©μ΄ μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”.
370
-
371
- 검사 방법:
372
- 1. 핡심 λ¬Έμž₯듀을 Google Search둜 검색
373
- 2. 발견된 μΆœμ²˜μ™€ μœ μ‚¬λ„(%) 정리
374
- 3. ν•œκ΅­μ–΄/μ˜μ–΄ λͺ¨λ‘ 검색
375
-
376
- [검사 λŒ€μƒ ν…μŠ€νŠΈ]
377
- {text_chunk}
378
-
379
- 응닡 ν˜•μ‹:
380
- 발견된 ν‘œμ ˆ:
381
- - 좜처 1: [제λͺ©] (μœ μ‚¬λ„: XX%)
382
- - 좜처 2: [제λͺ©] (μœ μ‚¬λ„: XX%)
383
- ...
384
-
385
- 평가:
386
- 전체 μœ μ‚¬λ„: XX%
387
- (κ°€μž₯ 높은 μœ μ‚¬λ„ κΈ°μ€€)
388
-
389
- μ°Έκ³ : λΈ”λ‘œκ·Έ, λ‰΄μŠ€, ν•™μˆ μ§€, SNS λ“± λͺ¨λ“  좜처 검색"""
390
  resp = client.models.generate_content(
391
  model="gemini-2.0-flash-lite",
392
  contents=prompt,
393
- config=gtypes.GenerateContentConfig(tools=[tool], temperature=0.1, max_output_tokens=2000) # 더 κΈ΄ 응닡
394
  )
395
  text_resp = resp.text if resp.text else ""
396
  sources = []
397
-
398
- # Grounding λ©”νƒ€λ°μ΄ν„°μ—μ„œ 좜처 μΆ”μΆœ
399
  if hasattr(resp, 'candidates') and resp.candidates:
400
  gc = resp.candidates[0].grounding_metadata
401
  if gc and hasattr(gc, 'grounding_chunks'):
402
  for chunk in gc.grounding_chunks:
403
  if hasattr(chunk, 'web') and chunk.web:
404
- sources.append({
405
- "title": chunk.web.title or "",
406
- "url": chunk.web.uri or "",
407
- "source": "Google"
408
- })
409
-
410
- # μ‘λ‹΅μ—μ„œ μœ μ‚¬λ„ μΆ”μΆœ
411
- pm = re.search(r'(?:전체\s)?μœ μ‚¬λ„[:\s]*(\d+)', text_resp)
412
  pct = int(pm.group(1)) if pm else 0
413
-
414
- return {
415
- "pct": pct,
416
- "response": text_resp,
417
- "sources": sources,
418
- "full_analysis": True # 전체 ν…μŠ€νŠΈ 검사 μ™„λ£Œ ν‘œμ‹œ
419
- }
420
  except Exception as e:
421
- return {
422
- "pct": 0,
423
- "response": str(e)[:100],
424
- "sources": [],
425
- "full_analysis": False
426
- }
427
  def parallel_brave_search(queries, max_workers=10):
428
  """Brave Search 병렬 μ‹€ν–‰ (μ΅œλŒ€ 20개)"""
429
  all_results = {}
@@ -467,142 +1020,103 @@ def self_crawl_search(query, max_results=3):
467
  all_results.extend(duckduckgo_search(f"{query} λ…Όλ¬Έ ν•™μˆ ", 2))
468
  return all_results
469
  def run_plagiarism(text, progress=gr.Progress()):
470
- """βœ… Gemini Google Search 90% (메인) + Brave/arXiv 10% (보쑰)"""
471
-
472
  if not text or len(text.strip())<50:
473
  return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ μ΅œμ†Œ 50자 이상</div>", ""
474
-
475
  text = text.strip()
476
  sents = split_sentences(text)
477
  now = datetime.now().strftime("%Y-%m-%d %H:%M")
478
-
 
479
  progress(0.05, "λ¬Έμž₯ 뢄리...")
480
-
 
 
 
 
481
  all_sources = []
 
 
482
  log_lines = []
483
-
484
- # ============================================
485
- # 메인 (90%): Gemini Google Search
486
- # ============================================
487
- gemini_pct = 0
488
- gemini_sources = []
489
-
490
- if HAS_GENAI and GEMINI_KEY:
491
- progress(0.20, "Gemini + Google Search (메인 90% ν‘œμ ˆ 검사)...")
492
-
493
- try:
494
- client = genai.Client(api_key=GEMINI_KEY)
495
-
496
- prompt = f"""당신은 κ³ κΈ‰ ν‘œμ ˆ 검사 μ „λ¬Έκ°€μž…λ‹ˆλ‹€.
497
-
498
- μ•„λž˜ ν…μŠ€νŠΈλ₯Ό Google Search둜 μ² μ €νžˆ κ²€μƒ‰ν•˜μ—¬ μœ μ‚¬ν•œ λ‚΄μš©μ„ λͺ¨λ‘ μ°ΎμœΌμ„Έμš”.
499
-
500
- [검사 ν…μŠ€νŠΈ]
501
- {text}
502
-
503
- 응닡 ν˜•μ‹:
504
- 발견된 μœ μ‚¬ λ‚΄μš©:
505
- - [제λͺ© 1] (μœ μ‚¬λ„: XX%)
506
- - [제λͺ© 2] (μœ μ‚¬λ„: XX%)
507
-
508
- μ΅œμ’… ν‘œμ ˆμœ¨: XX%"""
509
-
510
- contents = [
511
- types.Content(
512
- role="user",
513
- parts=[
514
- types.Part.from_text(text=prompt),
515
- ],
516
- )
517
- ]
518
-
519
- tools = [
520
- types.Tool(googleSearch=types.GoogleSearch()),
521
- ]
522
-
523
- config = types.GenerateContentConfig(
524
- thinking_config=types.ThinkingConfig(thinking_budget=0),
525
- tools=tools,
526
- temperature=0.3,
527
- max_output_tokens=3000,
528
- )
529
-
530
- # 슀트리밍으둜 μˆ˜μ§‘
531
- full_response = ""
532
- for chunk in client.models.generate_content_stream(
533
- model="gemini-2.0-flash-lite-latest",
534
- contents=contents,
535
- config=config,
536
- ):
537
- if chunk.text:
538
- full_response += chunk.text
539
-
540
- # μœ μ‚¬λ„ μΆ”μΆœ
541
- pm = re.search(r'(?:ν‘œμ ˆ|μœ μ‚¬λ„)[:\s]*(\d+)', full_response)
542
- if pm:
543
- gemini_pct = int(pm.group(1))
544
-
545
- # 좜처 μΆ”μΆœ
546
- for m in re.finditer(r'https?://[^\s\)]+', full_response):
547
- url = m.group(0)
548
- if len(url) > 10:
549
- gemini_sources.append({
550
- "title": url.split('/')[2] if '/' in url else url,
551
- "url": url,
552
- "source": "Google",
553
- })
554
- all_sources.append({
555
- "title": url.split('/')[2] if '/' in url else url,
556
- "url": url,
557
- "source": "Google",
558
- })
559
-
560
- log_lines.append(f"βœ… Gemini: {gemini_pct}% β†’ {len(gemini_sources)}개 좜처")
561
-
562
- except Exception as e:
563
- log_lines.append(f"⚠️ Gemini: {str(e)[:80]}")
564
-
565
- # ============================================
566
- # 보쑰 (10%): Brave + arXiv
567
- # ============================================
568
- brave_pct = 0
569
- arxiv_pct = 0
570
-
571
- progress(0.60, "보쑰 검사 (Brave + arXiv)...")
572
-
573
- try:
574
- if BRAVE_KEY:
575
- words = split_words(text)
576
- key_query = ' '.join(words[:5])
577
- brave_results = brave_search(key_query, 3)
578
- if brave_results:
579
- brave_pct = 30
580
- all_sources.extend(brave_results)
581
- log_lines.append(f"Brave: {len(brave_results)}건")
582
-
583
- words = split_words(text)
584
- wf = Counter(words)
585
- keywords = [w for w, c in wf.most_common(10) if len(w) >= 3][:3]
586
- arxiv_results = search_arxiv(' '.join(keywords))
587
- if arxiv_results:
588
- arxiv_pct = 20
589
- all_sources.extend(arxiv_results)
590
- log_lines.append(f"arXiv: {len(arxiv_results)}건")
591
- except:
592
- pass
593
-
594
- # ============================================
595
- # μ΅œμ’…: Gemini 90% + 보쑰 10%
596
- # ============================================
597
  progress(0.80, "λ³΄κ³ μ„œ 생성...")
 
 
598
 
599
- helper_pct = max(brave_pct, arxiv_pct)
600
- plag_pct = int(gemini_pct * 0.9 + helper_pct * 0.1) if gemini_pct > 0 else helper_pct
601
-
602
- # ============================================
603
- # HTML λ³΄κ³ μ„œ
604
- # ============================================
605
 
 
 
 
 
 
 
 
 
 
606
  seen_urls = set()
607
  unique_sources = []
608
  for s in all_sources:
@@ -610,79 +1124,254 @@ def run_plagiarism(text, progress=gr.Progress()):
610
  if url and url not in seen_urls:
611
  seen_urls.add(url)
612
  unique_sources.append(s)
613
-
614
- if plag_pct >= 50: grade, gc = "ν‘œμ ˆ μ˜μ‹¬", "#FF4444"
615
- elif plag_pct >= 30: grade, gc = "주의 ν•„μš”", "#FF8800"
616
- elif plag_pct >= 15: grade, gc = "μœ μ‚¬ ν‘œν˜„ 일뢀", "#DDAA00"
617
- elif plag_pct >= 5: grade, gc = "μ–‘ν˜Έ", "#4ECDC4"
618
- else: grade, gc = "우수", "#22AA44"
619
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  word_count = len(split_words(text))
621
  char_count = len(text)
622
  doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
623
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  src_rows = ""
625
- for i, s in enumerate(unique_sources[:15]):
626
- ico = "πŸ”" if "google" in s.get("source","").lower() else "🌐"
 
 
 
627
  src_rows += f"""<tr>
628
- <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;">{i+1}</td>
629
- <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;">{ico}</td>
630
- <td style="padding:6px 8px;border:1px solid #D5D5D5;font-size:11px;color:#1A3C6E;"><strong>{s['title'][:40]}</strong></td>
631
- <td style="padding:6px 8px;border:1px solid #D5D5D5;font-size:9px;color:#666;word-break:break-all;">{s['url'][:50]}</td>
 
632
  </tr>"""
633
-
634
- HDR = '#3B7DD8'
635
- TL = f'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
  TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
637
-
638
- html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid {HDR};box-shadow:0 2px 12px rgba(0,0,0,0.08);">
639
- <div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:18px 24px;color:#fff;">
640
- <div style="font-size:22px;font-weight:900;">ν‘œμ ˆ 검사 κ²°κ³Ό</div>
641
- <div style="font-size:10px;opacity:0.8;">λ¬Έμ„œ: {doc_id} | {now}</div>
 
 
 
 
 
 
 
 
 
642
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  <div style="padding:18px 24px 0;">
 
644
  <table style="width:100%;border-collapse:collapse;">
645
  <tr>
646
- <td style="width:200px;padding:20px;text-align:center;border:1px solid #D5D5D5;background:#FAFBFE;">
647
- <div style="font-size:48px;font-weight:900;color:{gc};">{plag_pct}%</div>
648
- <div style="font-size:12px;color:#666;margin-top:4px;">ν‘œμ ˆμœ¨</div>
 
 
 
 
 
 
 
 
 
 
 
 
649
  </td>
650
- <td style="padding:20px;border:1px solid #D5D5D5;">
651
- <div style="font-size:16px;font-weight:900;color:{gc};margin-bottom:12px;">{grade}</div>
652
- <div style="font-size:11px;color:#666;line-height:1.8;">
653
- <div>κΈ€μžμˆ˜: <b>{char_count:,}</b></div>
654
- <div>λ‹¨μ–΄μˆ˜: <b>{word_count:,}</b></div>
655
- <div>좜처: <b>{len(unique_sources)}</b>개</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  </div>
657
  </td>
658
  </tr>
659
  </table>
660
  </div>
661
- <div style="padding:16px 24px;">
662
- <div style="font-size:13px;font-weight:800;color:#1A3C6E;margin-bottom:10px;padding-bottom:8px;border-bottom:2px solid {HDR};">πŸ” 발견된 좜처</div>
663
- <table style="width:100%;border-collapse:collapse;font-size:10px;">
664
- <thead>
665
- <tr style="background:{HDR};color:white;">
666
- <th style="padding:8px;border:1px solid #D5D5D5;width:30px;">μˆœμœ„</th>
667
- <th style="padding:8px;border:1px solid #D5D5D5;width:30px;">좜처</th>
668
- <th style="padding:8px;border:1px solid #D5D5D5;">제λͺ©</th>
669
- <th style="padding:8px;border:1px solid #D5D5D5;">URL</th>
670
- </tr>
671
- </thead>
672
- <tbody>
673
- {src_rows if src_rows else '<tr><td colspan="4" style="padding:16px;text-align:center;color:#999;">발견된 좜처 μ—†μŒ</td></tr>'}
674
- </tbody>
 
 
 
 
 
 
 
 
 
 
675
  </table>
676
  </div>
677
- <div style="padding:16px 24px;background:#FFF8E1;border-top:1px solid #D5D5D5;font-size:10px;color:#666;line-height:1.6;">
678
- <strong>뢄석:</strong> <strong style="color:#D63031;">Gemini Google Search 90% (메인)</strong> + Brave/arXiv 10% (보쑰)<br>
679
- <strong>방법:</strong> Gemini의 AI 뢄석 + Google Search Grounding으둜 μ‹€μ‹œκ°„ μ›Ή 검색
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
  </div>
681
  </div>"""
682
-
683
- log = "\n".join(log_lines)
684
- progress(0.95, "μ™„λ£Œ...")
685
-
686
  return html, log
687
  def run_detection(text, progress=gr.Progress()):
688
  if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ μ΅œμ†Œ 50자</div>",""
 
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from xml.etree import ElementTree as ET
8
  from kiwipiepy import Kiwi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  KIWI = Kiwi()
10
  try:
11
  import httpx; HAS_HTTPX = True
 
15
  from google import genai
16
  from google.genai import types as gtypes
17
  HAS_GENAI = True
18
+ except ImportError:
19
+ HAS_GENAI = False
20
+ try:
21
+ import olefile; HAS_OLEFILE = True
22
+ except ImportError:
23
+ HAS_OLEFILE = False
24
+ try:
25
+ import pdfplumber; HAS_PDFPLUMBER = True
26
+ except ImportError:
27
+ HAS_PDFPLUMBER = False
28
+ try:
29
+ import PyPDF2; HAS_PYPDF2 = True
30
+ except ImportError:
31
+ HAS_PYPDF2 = False
32
+ try:
33
+ from docx import Document as DocxDocument; HAS_DOCX = True
34
+ except ImportError:
35
+ HAS_DOCX = False
36
+ GROQ_KEY = os.getenv("GROQ_API_KEY", "")
37
+ GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
38
+ BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
39
def extract_text_from_pdf(file_path):
    """Extract text from a PDF file, split per page.

    Tries pdfplumber first and falls back to PyPDF2. Pages collected before
    a backend raises are kept, so a partial pdfplumber run can be finished
    by PyPDF2. Returns (list_of_page_texts, None) on success, otherwise
    (None, error_message).
    """
    collected = []
    if HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        collected.append(page_text)
            if collected:
                return collected, None
        except Exception as exc:
            print(f"pdfplumber: {exc}")
    if HAS_PYPDF2:
        try:
            with open(file_path, 'rb') as handle:
                for page in PyPDF2.PdfReader(handle).pages:
                    page_text = page.extract_text()
                    if page_text:
                        collected.append(page_text)
            if collected:
                return collected, None
        except Exception as exc:
            print(f"PyPDF2: {exc}")
    return None, "PDF μΆ”μΆœ μ‹€νŒ¨ (pdfplumber, PyPDF2 μ—†μŒ)"
62
def extract_text_from_docx(file_path):
    """Extract text from a DOCX file, grouped into sections.

    Consecutive non-empty paragraphs are joined with newlines; blank
    paragraphs act as section separators. Returns (sections, None) on
    success, otherwise (None, error_message).
    """
    if not HAS_DOCX:
        return None, "python-docx μ—†μŒ"
    try:
        document = DocxDocument(file_path)
        sections = []
        buffer = []
        for paragraph in document.paragraphs:
            line = paragraph.text.strip()
            if line:
                buffer.append(line)
            elif buffer:
                # Blank paragraph closes the current section.
                sections.append('\n'.join(buffer))
                buffer = []
        if buffer:
            sections.append('\n'.join(buffer))
        if sections:
            return sections, None
        return None, "DOCX ν…μŠ€νŠΈ μ—†μŒ"
    except Exception as e:
        return None, f"DOCX 였λ₯˜: {e}"
82
def extract_text_from_txt(file_path):
    """Read a plain-text file (TXT/MD/CSV/...) with encoding auto-detection.

    Tries a fixed list of encodings in order; on success the text is split
    into sections on blank lines. Returns (sections, None) on success,
    otherwise (None, error_message).

    Note: 'latin-1' never fails to decode, so any readable file is accepted
    by the final attempt even if the decoding is lossy.
    """
    for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                text = f.read()
        # Narrowed from a bare `except:` — only decode/IO problems should
        # advance to the next encoding; KeyboardInterrupt etc. propagate.
        except (UnicodeError, OSError, ValueError):
            continue
        if text.strip():
            sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
            return sections if sections else [text], None
    return None, "ν…μŠ€νŠΈ 인코딩 감지 μ‹€νŒ¨"
93
def extract_text_from_hwpx(file_path):
    """HWPX (ZIP container) -> text.

    Extracts text from an HWPX document: a ZIP archive whose body lives in
    Contents/section*.xml files. Returns (list_of_section_texts, None) on
    success, otherwise (None, error_message).
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            # Section XML files normally live under Contents/section*.xml.
            section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
            if not section_files:
                # Fallback: any XML whose name mentions "section".
                section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
            for sf_name in section_files:
                try:
                    with zf.open(sf_name) as sf:
                        content = sf.read().decode('utf-8', errors='ignore')
                        # Strip namespace declarations and tag prefixes so
                        # the XML parses without namespace handling.
                        content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
                        content = re.sub(r'<[a-zA-Z]+:', '<', content)
                        content = re.sub(r'</[a-zA-Z]+:', '</', content)
                        try:
                            root = ET.fromstring(content)
                            texts = []
                            for elem in root.iter():
                                # Elements whose tag ends in 't' carry run text.
                                if elem.tag.endswith('t') or elem.tag == 't':
                                    if elem.text: texts.append(elem.text)
                                elif elem.text and elem.text.strip():
                                    # Heuristic: also keep text from other
                                    # text-like elements.
                                    if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                        texts.append(elem.text.strip())
                            if texts: text_parts.append(' '.join(texts))
                        except ET.ParseError:
                            # Malformed XML: scrape text between tags instead.
                            matches = re.findall(r'>([^<]+)<', content)
                            clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
                            if clean: text_parts.append(' '.join(clean))
                except: continue
        if text_parts:
            return text_parts, None
        return None, "HWPX ν…μŠ€νŠΈ μ—†μŒ"
    except zipfile.BadZipFile:
        return None, "μœ νš¨ν•˜μ§€ μ•Šμ€ HWPX"
    except Exception as e:
        return None, f"HWPX 였λ₯˜: {e}"
131
+ def _decode_hwp_para(data):
132
+ """HWP λ°”μ΄λ„ˆλ¦¬ β†’ 문단 ν…μŠ€νŠΈ"""
133
+ result = []
134
+ i = 0
135
+ while i < len(data) - 1:
136
+ code = int.from_bytes(data[i:i+2], 'little')
137
+ if code in (1,2,3): i += 14
138
+ elif code == 9: result.append('\t')
139
+ elif code in (10,13): result.append('\n')
140
+ elif code == 24: result.append('-')
141
+ elif code in (30,31): result.append(' ')
142
+ elif code >= 32:
143
+ try:
144
+ ch = chr(code)
145
+ if ch.isprintable() or ch in '\n\t ': result.append(ch)
146
+ except: pass
147
+ i += 2
148
+ text = ''.join(result).strip()
149
+ text = re.sub(r'[ \t]+', ' ', text)
150
+ text = re.sub(r'\n{3,}', '\n\n', text)
151
+ return text if len(text) > 2 else None
152
+ def _extract_hwp_section(data):
153
+ """HWP μ„Ήμ…˜ λ°”μ΄λ„ˆλ¦¬ β†’ ν…μŠ€νŠΈ"""
154
+ texts = []
155
+ pos = 0
156
+ while pos < len(data) - 4:
157
+ try:
158
+ header = int.from_bytes(data[pos:pos+4], 'little')
159
+ tag_id = header & 0x3FF
160
+ size = (header >> 20) & 0xFFF
161
+ pos += 4
162
+ if size == 0xFFF:
163
+ if pos + 4 > len(data): break
164
+ size = int.from_bytes(data[pos:pos+4], 'little')
165
+ pos += 4
166
+ if pos + size > len(data): break
167
+ record_data = data[pos:pos+size]
168
+ pos += size
169
+ if tag_id == 67 and size > 0:
170
+ t = _decode_hwp_para(record_data)
171
+ if t: texts.append(t)
172
+ except:
173
+ pos += 1
174
+ return '\n'.join(texts) if texts else None
175
def extract_text_from_hwp(file_path):
    """Extract text from a legacy HWP (OLE compound) document.

    Reads the FileHeader to detect zlib-compressed body streams, then
    decodes every BodyText/Section* stream. Returns (section_texts, None)
    on success, otherwise (None, error_message).
    """
    if not HAS_OLEFILE:
        return None, "olefile μ—†μŒ"
    try:
        ole = olefile.OleFileIO(file_path)
        if not ole.exists('FileHeader'):
            ole.close()
            return None, "HWP 헀더 μ—†μŒ"
        header = ole.openstream('FileHeader').read()
        # Bit 0 of byte 36 flags compressed body streams; assume compressed
        # when the header is too short to tell.
        compressed = (header[36] & 1) == 1 if len(header) > 36 else True
        sections = []
        for entry in ole.listdir():
            if not '/'.join(entry).startswith('BodyText/Section'):
                continue
            try:
                raw = ole.openstream(entry).read()
                if compressed:
                    # Raw deflate first (wbits=-15), then zlib-wrapped.
                    try:
                        raw = zlib.decompress(raw, -15)
                    except:
                        try:
                            raw = zlib.decompress(raw)
                        except: pass
                decoded = _extract_hwp_section(raw)
                if decoded:
                    sections.append(decoded)
            except: continue
        ole.close()
        if sections:
            return sections, None
        return None, "HWP ν…μŠ€νŠΈ μ—†μŒ"
    except Exception as e:
        return None, f"HWP 였λ₯˜: {e}"
203
def extract_text_from_file(file_path):
    """Dispatch document text extraction by file extension.

    Supports PDF/DOCX/HWPX/HWP and plain-text formats. Returns a triple
    (sections_list, full_text, error): per-page/section texts, the joined
    full text, and an error message (None on success).
    """
    if not file_path or not os.path.exists(file_path):
        return None, None, "파일 μ—†μŒ"
    ext = Path(file_path).suffix.lower()
    # Extension -> extractor dispatch table.
    handlers = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.hwpx': extract_text_from_hwpx,
        '.hwp': extract_text_from_hwp,
    }
    if ext in handlers:
        sections, error = handlers[ext](file_path)
    elif ext in ('.txt', '.md', '.csv', '.json', '.xml', '.html'):
        sections, error = extract_text_from_txt(file_path)
    else:
        return None, None, f"μ§€μ›ν•˜μ§€ μ•ŠλŠ” ν˜•μ‹: {ext}"
    if sections:
        return sections, '\n\n'.join(sections), None
    return None, None, error or "ν…μŠ€νŠΈ μΆ”μΆœ μ‹€νŒ¨"
229
def split_sentences(text):
    """Split text into sentences via Kiwi, with a regex fallback.

    Falls back to splitting after sentence-final punctuation whenever the
    Kiwi analyzer fails or returns nothing.
    """
    try:
        kiwi_sents = []
        for sent in KIWI.split_into_sents(text):
            cleaned = sent.text.strip()
            if cleaned:
                kiwi_sents.append(cleaned)
        if kiwi_sents:
            return kiwi_sents
    except: pass
    parts = re.split(r'(?<=[.!?。])\s+', text)
    return [p.strip() for p in parts if p.strip()]
235
def split_words(text):
    """Return Korean/Latin/digit word tokens from *text*.

    The pattern never produces empty matches, so findall's result is
    already the full token list.
    """
    return re.findall(r'[κ°€-힣a-zA-Z0-9]+', text)
237
def get_morphemes(text):
    """Return (surface_form, POS_tag) pairs from Kiwi's best analysis.

    Yields [] when the analyzer is unavailable or produces no candidates.
    """
    try:
        analyzed = KIWI.analyze(text)
        if analyzed and analyzed[0]:
            best_candidate = analyzed[0][0]
            return [(token.form, token.tag) for token in best_candidate]
    except: pass
    return []
243
def http_get(url, headers=None, timeout=15):
    """HTTP GET returning the response body text, or None on any failure.

    Uses httpx (following redirects) when available, otherwise urllib.
    Only a 200 response yields text; errors map to None by design, since
    callers treat None as "no data".
    """
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers=headers or {}, timeout=timeout, follow_redirects=True)
            return r.text if r.status_code == 200 else None
        import urllib.request
        req = urllib.request.Request(url, headers=headers or {})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8', errors='replace')
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # not swallowed; any network/decoding error still maps to None.
    except Exception:
        return None
254
def http_post_json(url, body, headers=None, timeout=30):
    """HTTP POST of a JSON body; returns the parsed JSON response or None.

    Adds a Content-Type header, uses httpx when available and urllib
    otherwise. Non-200 responses and all errors yield None.
    """
    try:
        # Copy instead of mutating: the original wrote "Content-Type" into
        # the caller's dict when `headers` was passed in.
        h = dict(headers) if headers else {}
        h["Content-Type"] = "application/json"
        if HAS_HTTPX:
            r = httpx.post(url, json=body, headers=h, timeout=timeout)
            if r.status_code == 200:
                return r.json()
            return None
        import urllib.request, ssl
        req = urllib.request.Request(url, json.dumps(body).encode(), h)
        with urllib.request.urlopen(req, timeout=timeout, context=ssl.create_default_context()) as resp:
            return json.loads(resp.read())
    # Narrowed from a bare `except:`; see http_get.
    except Exception:
        return None
268
def call_groq(model, prompt, max_tokens=800, temperature=0.1):
    """Call the Groq chat-completions API and return (content, error).

    Exactly one element of the pair is None: the assistant text on success,
    otherwise an error string ("NO_KEY", "HTTP <code>", or a truncated
    exception message).
    """
    if not GROQ_KEY:
        return None, "NO_KEY"
    endpoint = "https://api.groq.com/openai/v1/chat/completions"
    auth_headers = {"Authorization": f"Bearer {GROQ_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    try:
        if HAS_HTTPX:
            resp = httpx.post(endpoint, json=payload, headers=auth_headers, timeout=45)
            if resp.status_code != 200:
                return None, f"HTTP {resp.status_code}"
            return resp.json()["choices"][0]["message"]["content"], None
        import urllib.request, ssl
        req = urllib.request.Request(endpoint, json.dumps(payload).encode(), auth_headers)
        with urllib.request.urlopen(req, timeout=45, context=ssl.create_default_context()) as raw:
            return json.loads(raw.read())["choices"][0]["message"]["content"], None
    except Exception as e:
        return None, str(e)[:150]
284
# Formal Korean sentence endings typical of AI-generated prose.
AI_ENDINGS = ['ν•©λ‹ˆλ‹€','μž…λ‹ˆλ‹€','λ©λ‹ˆλ‹€','μŠ΅λ‹ˆλ‹€','μžˆμŠ΅λ‹ˆλ‹€','ν–ˆμŠ΅λ‹ˆλ‹€','κ² μŠ΅λ‹ˆλ‹€']
# Plain-style (non-formal) endings that still read as AI-written.
AI_CASUAL_ENDINGS = ['라고 ν•  수 μžˆλ‹€','라고 λ³Ό 수 μžˆλ‹€','λ‹€κ³  μƒκ°ν•œλ‹€','λ‹€κ³  νŒλ‹¨λœλ‹€',
                     '인 μ…ˆμ΄λ‹€','인 것이닀','λŠ” 것이닀','λŠ” μ…ˆμ΄λ‹€','γ„Ή 것이닀','을 것이닀',
                     '라 ν•  수 μžˆλ‹€','둜 보인닀','둜 νŒλ‹¨λœλ‹€','κ³  μžˆλ‹€','λŠ” μΆ”μ„Έλ‹€','λŠ” 상황이닀',
                     'μ§€ μ•Šμ„ 수 μ—†λ‹€','라 ν•˜κ² λ‹€','μŒμ„ μ•Œ 수 μžˆλ‹€','ν•  ν•„μš”κ°€ μžˆλ‹€']
# Sentence-initial connectives strongly associated with AI output.
AI_CONNS = ['λ˜ν•œ','λ”°λΌμ„œ','κ·ΈλŸ¬λ―€λ‘œ','이에 따라','ν•œνŽΈ','λ”λΆˆμ–΄','μ•„μšΈλŸ¬','뿐만 μ•„λ‹ˆλΌ',
            '이λ₯Ό 톡해','이에','결과적으둜','ꢁ극적으둜','특히','λ‚˜μ•„κ°€','μ΄λŸ¬ν•œ']
# Weaker connectives: common in human prose too, scored lower.
AI_SOFT_CONNS = ['λ¬Όλ‘ ','κ·ΈλŸ¬λ‚˜','ν•˜μ§€λ§Œ','이처럼','이와 같이','이λ₯Ό λ°”νƒ•μœΌλ‘œ']
# Stock filler phrases frequently produced by LLMs.
AI_FILLER = ['κ²ƒμœΌλ‘œ 보','κ²ƒμœΌλ‘œ λ‚˜νƒ€','κ²ƒμœΌλ‘œ μ˜ˆμƒ','ν•  수 있','λ³Ό 수 있','μ£Όλͺ©ν•  만',
             'μ€‘μš”ν•œ μ—­ν• ','μ€‘μš”ν•œ 의미','긍정적인 영ν–₯','뢀정적인 영ν–₯','ν•„μš”ν•©λ‹ˆλ‹€','ν•„μš”ν•˜λ‹€',
             'μ€‘μš”ν•©λ‹ˆλ‹€','μ€‘μš”ν•˜λ‹€','역할을 ν•˜','영ν–₯을 λ―Έ','κΈ°λŒ€λœλ‹€','μ˜ˆμƒλ©λ‹ˆλ‹€','λΆ€κ°λ˜κ³ ',
             'λŒ€λ‘λ˜κ³ ','λ‹€μ–‘ν•œ λΆ„μ•Ό','λ‹€μ–‘ν•œ μ‚°μ—…','λˆˆλΆ€μ‹  μ„±κ³Ό','획기적인 λ³€ν™”','ν˜μ‹ μ μΈ',
             'μ μ—μ„œ','μΈ‘λ©΄μ—μ„œ','κ΄€μ μ—μ„œ']
# Grandiose/buzzword fillers seen in casual-register AI text.
AI_CASUAL_FILLER = ['λ¬΄κΆλ¬΄μ§„ν•˜λ‹€','λ¬΄κΆλ¬΄μ§„ν•œ','과언이 μ•„λ‹ˆ','λŒνŒŒκ΅¬κ°€ 될','μ „ν™˜μ μ΄ 될',
                    '기반으둜','발판으둜','원동λ ₯이','μ΄ˆμ„μ΄ 될','κ°€μ†ν™”λ˜','급뢀상','νŒ¨λŸ¬λ‹€μž„',
                    '지평을 μ—΄','μƒˆλ‘œμš΄ μž₯을','λŒ€μ „ν™˜','λ³Έκ²©ν™”λ˜','고도화','μ΄μ •ν‘œ']
# "Concede then pivot" rhetorical pattern (e.g. "of course ... however").
AI_CONCESSION = re.compile(r'λ¬Όλ‘ .{2,20}(ν•˜μ§€λ§Œ|κ·ΈλŸ¬λ‚˜|κ·Έλ ‡μ§€λ§Œ|λ‹€λ§Œ)|.{2,15}(이긴 ν•˜μ§€λ§Œ|κΈ°λŠ” ν•˜μ§€λ§Œ|수 μžˆμ§€λ§Œ|μˆ˜λŠ” μžˆμ§€λ§Œ)')
# English-language phrases characteristic of LLM writing.
EN_AI_MARKERS = ['furthermore','additionally','moreover','it is worth noting','in conclusion',
                 'it is important to','plays a crucial role','significant impact','various aspects',
                 'in this regard','consequently','nevertheless','integral part of','led to remarkable',
                 'fundamentally transformed','has become increasingly','it should be noted',
                 'in the context of','paradigm shift','landscape of','methodologies',
                 'transformative impact','unprecedented','in various domains']
# Signals of human (informal) writing; each maps a label to a regex.
HUMAN_MARKERS = {
    'γ…‹γ…Žγ… ': re.compile(r'([γ…‹γ…Žγ… γ…œγ„·γ„±])\1{1,}'),          # repeated laughter/crying jamo
    '이λͺ¨ν‹°μ½˜': re.compile(r'[;:]-?[)(DPp]|\^[_\-]?\^|γ…‘γ…‘|;;'),  # ASCII emoticons
    'μ€„μž„': re.compile(r'γ„Ήγ…‡|γ…‡γ…‡|γ„΄γ„΄|γ…‡γ…‹|γ„·γ„·|γ…‚γ…‚'),          # jamo abbreviations
    'λŠλ‚Œν‘œ': re.compile(r'[!?]{2,}'),                         # stacked punctuation
    '비격식쒅결': re.compile(r'(κ±°λ“ |μž–μ•„|인데|인걸|κ°™μŒ|λŠλ‚Œ|μ•„λ‹˜|λŒ€λ°•|λ―Έμ³€|헐|γ…‹$|γ…Ž$|μž„$|음$|λ“―$)'),  # informal endings
    'ꡬ어좕약': re.compile(r'(걍|μ’€|막|μ™„μ „|μ§„μ§œ|λ ˆμ•Œ|μ‘΄λ‚˜|개|쑸라|μ‘΄λ§›|κ²λ‚˜)'),  # spoken-style contractions
    'λ§žμΆ€λ²•μ˜€λ₯˜': re.compile(r'됬|λͺ‡μΌ|κΈˆμƒˆ|ν• μˆ˜μžˆ|것같[은닀]|λ˜κ°€|λ˜μ„œ|μ•ˆλ€|ν–‡λ‹€'),  # common misspellings
    'λ§μ€„μž„ν‘œ': re.compile(r'\.{3,}|…'),                       # ellipses
}
# Per-model fingerprints: "m" = signature phrases, "e" = typical endings,
# "lp" = layout pattern (lists, markdown headers, citation brackets).
FP = {
    "GPT": {"m":['물둠이죠','도움이 λ˜μ…¨κΈ°λ₯Ό','μ„€λͺ…ν•΄ λ“œλ¦¬κ² μŠ΅λ‹ˆλ‹€','μΆ”κ°€ 질문','도움이 ν•„μš”ν•˜μ‹œλ©΄',
            'μš”μ•½ν•˜μžλ©΄','κ°„λž΅νžˆ μ •λ¦¬ν•˜λ©΄','핡심은'],"e":['μŠ΅λ‹ˆλ‹€','λ“œλ¦¬κ² μŠ΅λ‹ˆλ‹€'],"lp":re.compile(r'^\d+\.\s|^[-β€’]\s',re.M)},
    "Claude": {"m":['λ§μ”€ν•˜μ‹ ','μ‚΄νŽ΄λ³΄κ² μŠ΅λ‹ˆλ‹€','κ· ν˜• 작힌','λ§₯λ½μ—μ„œ','ν•œ κ°€μ§€ μ£Όμ˜ν• ','λ‰˜μ•™μŠ€',
            'ν₯미둜운 질문','λ³΅μž‘ν•œ 주제'],"e":['λ„€μš”','κ±°μ˜ˆμš”'],"lp":re.compile(r'^\*\*.*\*\*|^#+\s',re.M)},
    "Gemini": {"m":['λ‹€μŒκ³Ό κ°™μŠ΅λ‹ˆλ‹€','정리해 λ“œλ¦¬κ² μŠ΅λ‹ˆλ‹€','핡심 λ‚΄μš©μ„','더 μ•Œκ³  μ‹ΆμœΌμ‹œλ©΄',
            'μ•Œμ•„λ³΄κ² μŠ΅λ‹ˆλ‹€'],"e":['κ² μŠ΅λ‹ˆλ‹€','λ³΄μ„Έμš”'],"lp":re.compile(r'^\*\s|^-\s\*\*',re.M)},
    "Perplexity": {"m":['검색 결과에 λ”°λ₯΄λ©΄','보도에 λ”°λ₯΄λ©΄','연ꡬ에 λ”°λ₯΄λ©΄','λ°ν˜”λ‹€','μ „ν–ˆλ‹€',
            'κ²ƒμœΌλ‘œ λ‚˜νƒ€λ‚¬λ‹€','κ²ƒμœΌλ‘œ 쑰사됐닀','κ²ƒμœΌλ‘œ 집계됐닀','λ°œν‘œν–ˆλ‹€'],"e":['λ°ν˜”λ‹€','λ‚˜νƒ€λ‚¬λ‹€','μ „ν–ˆλ‹€'],"lp":re.compile(r'\[\d+\]',re.M)},
}
327
def score_sentence(sent):
    """Score a single sentence for AI-likeness (0-100). Shared by tab 1 and tab 2.

    Adds points for AI signals (formal endings, connectives, filler,
    concession patterns, model fingerprints, English AI phrases) and
    subtracts points for human signals (emoticons, slang, typos).
    Returns (clamped_score, reasons_list).
    """
    sc = 0; reasons = []
    sl = sent.lower().strip()          # lowercased for English marker matching
    sr = sent.rstrip('.!?。')          # ending stripped of final punctuation
    # Formal ending: strongest single ending signal.
    for e in AI_ENDINGS:
        if sr.endswith(e): sc += 22; reasons.append(f"격식어미(-{e})"); break
    if sc == 0:  # only when no formal ending matched
        for e in AI_CASUAL_ENDINGS:
            if sr.endswith(e): sc += 15; reasons.append(f"비격식AI(-{e})"); break
    stripped = sent.strip()
    # Sentence-initial connective; soft connectives checked only if no
    # strong one matched (for/else fires when the loop did not break).
    for c in AI_CONNS:
        if stripped.startswith(c):
            sc += 18; reasons.append(f"AI접속사({c})"); break
    else:
        for c in AI_SOFT_CONNS:
            if stripped.startswith(c): sc += 8; reasons.append(f"μ•½ν•œμ ‘μ†μ‚¬({c})"); break
    # Filler phrase density (standard + casual pooled).
    filler_found = sum(1 for f in AI_FILLER if f in sent)
    casual_filler = sum(1 for f in AI_CASUAL_FILLER if f in sent)
    total_filler = filler_found + casual_filler
    if total_filler >= 3: sc += 25; reasons.append(f"μƒνˆ¬ν‘œν˜„Γ—{total_filler}")
    elif total_filler == 2: sc += 18; reasons.append(f"μƒνˆ¬ν‘œν˜„Γ—2")
    elif total_filler == 1: sc += 10; reasons.append(f"μƒνˆ¬ν‘œν˜„Γ—1")
    # "Concede then pivot" rhetorical pattern.
    if AI_CONCESSION.search(sent): sc += 10; reasons.append("μ–‘λ³΄νŒ¨ν„΄")
    # Per-model fingerprint phrases (at most one hit per model).
    for mn, fp in FP.items():
        for m in fp["m"]:
            if m in sent: sc += 8; reasons.append(f"{mn}μ§€λ¬Έ"); break
    # English AI marker phrases, graded by count.
    en_count = sum(1 for em in EN_AI_MARKERS if em in sl)
    if en_count >= 3: sc += 25; reasons.append(f"μ˜μ–΄AIΓ—{en_count}")
    elif en_count >= 2: sc += 18; reasons.append(f"μ˜μ–΄AIΓ—{en_count}")
    elif en_count >= 1: sc += 12; reasons.append(f"μ˜μ–΄AIΓ—1")
    # Combination bonuses when several signal classes co-occur.
    has_formal = any(sr.endswith(e) for e in AI_ENDINGS)
    has_conn = any(stripped.startswith(c) for c in AI_CONNS)
    if has_formal and total_filler >= 1 and has_conn: sc += 8; reasons.append("볡합AI")
    elif has_formal and total_filler >= 2: sc += 5; reasons.append("격식+μƒνˆ¬")
    # Human-writing markers subtract, weighted by marker class.
    for n, p in HUMAN_MARKERS.items():
        matches = p.findall(sent)
        if matches:
            if n in ('γ…‹γ…Žγ… ','이λͺ¨ν‹°μ½˜','μ€„μž„'): sc -= 25; reasons.append(f"인간({n})")
            elif n in ('비격식쒅결','ꡬ어좕약'): sc -= 18; reasons.append(f"ꡬ어체({n})")
            elif n == 'λ§žμΆ€λ²•μ˜€λ₯˜': sc -= 12; reasons.append("λ§žμΆ€λ²•μ˜€λ₯˜")
            elif n in ('λŠλ‚Œν‘œ','λ§μ€„μž„ν‘œ'): sc -= 10; reasons.append(f"인간({n})")
    return max(0, min(100, sc)), reasons
370
def analyze_statistics(text, sentences, words):
    """Statistical AI-likeness score from sentence/word distributions.

    Combines five sub-scores (higher = more AI-like): sentence-length
    uniformity (CV), burstiness, "standard length" ratio, vocabulary
    diversity (type-token ratio), and words-per-sentence uniformity.
    Returns {"score": int, "cv": float, "ttr": float}; with fewer than two
    sentences returns the neutral {"score": 50}.
    """
    sl = [len(s) for s in sentences]
    if len(sl) < 2: return {"score":50}
    # Coefficient of variation of sentence length: low CV = uniform = AI-like.
    avg = sum(sl)/len(sl); std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
    cv = std/avg if avg > 0 else 0
    cv_score = 85 if cv<0.20 else 70 if cv<0.30 else 50 if cv<0.45 else 30 if cv<0.60 else 15
    # Burstiness: humans alternate short/long sentences more sharply.
    diffs = [abs(sl[i]-sl[i-1]) for i in range(1,len(sl))]
    burst_score = 50
    if diffs:
        avg_d = sum(diffs)/len(diffs)
        max_d = max(diffs)
        burst_ratio = max_d / (avg_d + 1)  # +1 guards against division by zero
        burst_score = 85 if burst_ratio < 1.8 else 65 if burst_ratio < 2.5 else 40 if burst_ratio < 3.5 else 20
    # Share of sentences in the 20-60 char "standard" band.
    standard_ratio = sum(1 for l in sl if 20 <= l <= 60) / len(sl)
    std_score = 80 if standard_ratio > 0.8 else 60 if standard_ratio > 0.6 else 40 if standard_ratio > 0.4 else 20
    # Very short/long outliers push toward "human".
    extreme = sum(1 for l in sl if l < 10 or l > 80)
    if extreme >= 2: std_score = max(10, std_score - 20)
    elif extreme >= 1: std_score = max(15, std_score - 10)
    # Type-token ratio: low diversity reads as AI-like.
    wf = Counter(words); t = len(words)
    ttr = len(wf)/t if t>0 else 0
    vocab_score = 70 if ttr<0.45 else 55 if ttr<0.55 else 35 if ttr<0.65 else 20
    # Words-per-sentence uniformity (needs at least 3 sentences).
    wpc = [len(split_words(s)) for s in sentences]
    complex_score = 50
    if len(wpc) >= 3:
        wpc_avg = sum(wpc)/len(wpc)
        wpc_std = math.sqrt(sum((w-wpc_avg)**2 for w in wpc)/len(wpc))
        wpc_cv = wpc_std/wpc_avg if wpc_avg > 0 else 0
        complex_score = 80 if wpc_cv < 0.20 else 60 if wpc_cv < 0.35 else 35 if wpc_cv < 0.50 else 15
    # Weighted blend of the five sub-scores.
    final = int(cv_score*0.20 + burst_score*0.20 + std_score*0.25 + vocab_score*0.15 + complex_score*0.20)
    return {"score":final,"cv":round(cv,3),"ttr":round(ttr,3)}
400
def analyze_korean_style(text, sentences, morphemes):
    """Korean style-based AI-likeness score (0-100, higher = more AI-like).

    Signals: formal/casual AI sentence endings, sentence-initial connective
    density and spacing regularity, AI filler phrases, concession patterns,
    and the noun ratio of the POS distribution. Human-writing markers
    subtract from the total.
    """
    if not sentences: return {"score":50}
    # Count sentences ending with typical formal-AI vs casual-AI endings.
    formal_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_ENDINGS))
    casual_ai = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
    fr = formal_cnt/len(sentences)
    car = casual_ai/len(sentences)
    ending_score = 85 if fr>0.7 else 65 if fr>0.5 else 45 if fr>0.3 else 25 if fr>0.1 else 10
    ending_score = min(90, ending_score + int(car * 25))  # bonus for casual-AI endings
    # Indices of sentences that open with an AI-typical connective.
    conn_positions = []
    for i, s in enumerate(sentences):
        for c in AI_CONNS:
            if s.strip().startswith(c): conn_positions.append(i); break
    conn_density = len(conn_positions)/len(sentences) if sentences else 0
    conn_score = 85 if conn_density>0.4 else 65 if conn_density>0.25 else 40 if conn_density>0.1 else 15
    if len(conn_positions) >= 2:
        # CV of the gaps between connective-opening sentences.
        gaps = [conn_positions[i]-conn_positions[i-1] for i in range(1,len(conn_positions))]
        gap_cv = (math.sqrt(sum((g-sum(gaps)/len(gaps))**2 for g in gaps)/len(gaps))/(sum(gaps)/len(gaps)+0.01))
        if gap_cv < 0.5: conn_score = min(90, conn_score + 10)  # very regular spacing -> AI bonus
    # Filler phrases (formal + casual) anywhere in the text.
    filler_cnt = sum(1 for f in AI_FILLER if f in text) + sum(1 for f in AI_CASUAL_FILLER if f in text)
    filler_score = 90 if filler_cnt>=6 else 75 if filler_cnt>=4 else 55 if filler_cnt>=2 else 30 if filler_cnt>=1 else 10
    # Concession ("although ... but ...") patterns typical of AI hedging.
    concession_cnt = len(AI_CONCESSION.findall(text))
    conc_score = 80 if concession_cnt >= 2 else 55 if concession_cnt >= 1 else 20
    # Human-writing markers reduce the final score.
    human_count = sum(len(p.findall(text)) for p in HUMAN_MARKERS.values())
    human_penalty = min(35, human_count * 8)
    pos_score = 45
    if morphemes:
        # morphemes: (form, tag) pairs; compute the noun-tag ratio.
        pc = Counter(t for _,t in morphemes); tm = sum(pc.values())
        noun_r = sum(pc.get(t,0) for t in ['NNG','NNP','NNB','NR'])/tm if tm else 0
        pos_score = 70 if noun_r>0.42 else 55 if noun_r>0.38 else 35 if noun_r>0.32 else 20
    # Weighted blend (weights sum to 1.0; the 10*0.10 term is a fixed offset),
    # minus the human penalty, floored at 5.
    final = max(5, int(ending_score*0.25 + conn_score*0.20 + filler_score*0.20 +
                       conc_score*0.10 + pos_score*0.15 + 10*0.10) - human_penalty)
    return {"score":final,"formal":f"{fr:.0%}","conn":f"{conn_density:.2f}","filler":filler_cnt,"human":human_count}
432
def analyze_repetition(text, sentences, words):
    """Repetition-pattern score (0-100, higher = more AI-like).

    Combines word-trigram repetition, repeated two-word sentence openers,
    AI-typical connective openers, sentence frame (first word, last word)
    reuse, and sentence-ending diversity.
    """
    if not sentences or len(sentences) < 2: return {"score":35}
    tr = 0
    if len(words)>=5:
        # Fraction of distinct word trigrams that occur more than once.
        tg = Counter(tuple(words[i:i+3]) for i in range(len(words)-2))
        tr = sum(1 for c in tg.values() if c>1)/len(tg) if tg else 0
    ngram_score = 80 if tr>0.15 else 60 if tr>0.08 else 35 if tr>0.03 else 15
    # First two / first three words of each sentence.
    # NOTE(review): openers_3 is collected but never used below — confirm intent.
    openers_2 = []
    openers_3 = []
    for s in sentences:
        ws = split_words(s)
        if len(ws) >= 2: openers_2.append(tuple(ws[:2]))
        if len(ws) >= 3: openers_3.append(tuple(ws[:3]))
    opener2_score = 50
    if openers_2:
        # Low uniqueness of two-word openers = repetitive = AI-like.
        unique2 = len(set(openers_2))/len(openers_2)
        opener2_score = 80 if unique2 < 0.5 else 60 if unique2 < 0.7 else 35 if unique2 < 0.85 else 15
    # Connectives that AI text characteristically starts sentences with.
    ai_only_conns = ['λ˜ν•œ','λ”°λΌμ„œ','κ·ΈλŸ¬λ―€λ‘œ','이에 따라','λ”λΆˆμ–΄','μ•„μšΈλŸ¬','뿐만 μ•„λ‹ˆλΌ',
                     '이λ₯Ό 톡해','이에','결과적으둜','ꢁ극적으둜','λ‚˜μ•„κ°€','μ΄λŸ¬ν•œ']
    cr = sum(1 for s in sentences if any(s.strip().startswith(c) for c in ai_only_conns))
    crr = cr/len(sentences) if sentences else 0
    ai_conn_score = 85 if crr>0.35 else 65 if crr>0.2 else 40 if crr>0.08 else 15
    # Sentence "frames": (first word, last word) pairs reused across sentences.
    templates = []
    for s in sentences:
        ws = split_words(s)
        if len(ws) >= 4:
            templates.append((ws[0], ws[-1]))
    template_rep = 0
    if templates:
        tc = Counter(templates)
        template_rep = sum(1 for c in tc.values() if c > 1) / len(tc) if tc else 0
    template_score = 80 if template_rep > 0.3 else 55 if template_rep > 0.1 else 25
    # Diversity of sentence endings (first matching ending per sentence).
    endings = []
    for s in sentences:
        sr = s.rstrip('.!?。')
        for e in AI_ENDINGS + ['μžˆλ‹€','ν–ˆλ‹€','λœλ‹€','ν•œλ‹€','이닀','λŠ”λ‹€']:
            if sr.endswith(e): endings.append(e); break
    ending_div = 50
    if endings:
        # Low ending variety = monotone = AI-like.
        unique_e = len(set(endings))/len(endings)
        ending_div = 80 if unique_e < 0.3 else 60 if unique_e < 0.5 else 35 if unique_e < 0.7 else 15
    final = int(ngram_score*0.15 + opener2_score*0.20 + ai_conn_score*0.25 +
                template_score*0.15 + ending_div*0.25)
    return {"score":final}
476
# Vague "filler" adjectives (diverse, important, innovative, ...) that AI text
# tends to use without concrete backing.
AI_VAGUE = re.compile(r'λ‹€μ–‘ν•œ|μ€‘μš”ν•œ|긍정적인|뢀정적인|획기적인|ν˜μ‹ μ μΈ|효율적인|체계적인|쒅합적인|μ „λ°˜μ μΈ|지속적인|적극적인|μƒλ‹Ήν•œ|μ£Όμš”ν•œ')
# Concrete proper nouns: well-known companies, universities, hospitals,
# and Korean place-name patterns (city/district/neighborhood + particle).
CONCRETE_PROPER = re.compile(r'μ‚Όμ„±|LG|ν˜„λŒ€|SK|카카였|넀이버|λ„·ν”Œλ¦­μŠ€|ꡬ글|μ• ν”Œ|ν…ŒμŠ¬λΌ|μ•„λ§ˆμ‘΄|λ§ˆμ΄ν¬λ‘œμ†Œν”„νŠΈ|[κ°€-힣]{2,}λŒ€ν•™|[κ°€-힣]{2,}병원|[κ°€-힣]{1,3}μ‹œ[λŠ”μ„μ΄κ°€]|[κ°€-힣]{1,3}ꡬ[λŠ”μ„μ΄κ°€]|[κ°€-힣]{2,}동[μ—μ„œ]')
# Concrete numeric facts: amounts with units, percentages, years, months.
CONCRETE_NUMBER = re.compile(r'\d{2,}[λ§Œμ–΅μ‘°μ›λ‹¬λŸ¬%κ°œλ…„μ›”μΌμœ„λ“±ν˜Έ]|\d+\.\d+%|\d{4}λ…„|\d{1,2}μ›”')
# Attribution markers: "according to", announcements, interviews, surveys,
# statistics-office references, reports.
CONCRETE_QUOTE = re.compile(r'에 λ”°λ₯΄λ©΄|λ°œν‘œν–ˆ|λ°ν˜”λ‹€|λ³΄λ„ν–ˆ|μ „ν–ˆλ‹€|라고 말|인터뷰|μ„€λ¬Έ|쑰사|톡계청|λ³΄κ³ μ„œ')
480
def analyze_structure(text, sentences):
    """Structural AI-likeness score (0-100, higher = more AI-like).

    Looks at markdown-style formatting, vague-vs-concrete wording,
    sentence-type variety, punctuation diversity, and paragraph-length
    regularity.
    """
    if not sentences:
        return {"score": 35}

    # Markdown-ish markers: numbered lists, bullets, headings, bold spans.
    line_patterns = (r'^\d+[.)]\s', r'^[-β€’*]\s', r'^#+\s')
    marker_total = sum(len(re.findall(p, text, re.M)) for p in line_patterns)
    marker_total += len(re.findall(r'\*\*[^*]+\*\*', text))
    if marker_total >= 5:
        list_score = 90
    elif marker_total >= 3:
        list_score = 70
    elif marker_total >= 1:
        list_score = 45
    else:
        list_score = 10

    # Vague adjectives vs concrete evidence (names + numbers + quotes).
    vague = len(AI_VAGUE.findall(text))
    concrete = (len(CONCRETE_PROPER.findall(text))
                + len(CONCRETE_NUMBER.findall(text))
                + len(CONCRETE_QUOTE.findall(text)))
    if vague >= 3 and concrete == 0:
        abstract_score = 90
    elif vague >= 2 and concrete <= 1:
        abstract_score = 70
    elif vague >= 1 and concrete == 0:
        abstract_score = 55
    elif concrete >= 3:
        abstract_score = 10
    elif concrete >= 2:
        abstract_score = 20
    elif concrete >= 1:
        abstract_score = 30
    else:
        abstract_score = 45

    # Sentence-type variety: questions/exclamations/ellipses suggest a human.
    variety = sum([
        any(s.strip().endswith('?') for s in sentences),
        any(s.strip().endswith('!') for s in sentences),
        any('...' in s or '…' in s for s in sentences),
    ])
    type_score = 15 if variety >= 2 else (40 if variety >= 1 else 65)

    # Punctuation diversity: few distinct marks = flat = AI-like.
    distinct_punct = len(set(re.findall(r'[!?,;:…—\-~]', text)))
    punct_score = 65 if distinct_punct <= 1 else (45 if distinct_punct <= 3 else 20)

    # Paragraph regularity: CV of sentences-per-paragraph.
    paras = [p.strip() for p in text.split('\n\n') if p.strip()]
    para_score = 35
    if len(paras) >= 2:
        counts = [len(split_sentences(p)) for p in paras]
        mean_c = sum(counts) / len(counts)
        if mean_c > 0:
            para_cv = math.sqrt(sum((c - mean_c) ** 2 for c in counts) / len(counts)) / mean_c
            para_score = 75 if para_cv < 0.2 else (55 if para_cv < 0.35 else 30)
            # Classic intro/body/outro shape: short first and last paragraphs.
            if len(paras) >= 3 and counts[0] < mean_c and counts[-1] < mean_c:
                para_score = min(85, para_score + 10)

    final = int(list_score * 0.10 + abstract_score * 0.40 + type_score * 0.20
                + punct_score * 0.10 + para_score * 0.20)
    return {"score": final}
517
def analyze_model_fingerprint(text, sentences):
    """Match the text against per-model fingerprint marker sets in FP.

    Returns {"score": int 0-95, "model_scores": dict} where model_scores
    holds each model's 0-100 match score; the generic "비격식AI"/"μ˜μ–΄AI"
    buckets are included only when non-zero.
    """
    ms = {}
    lowered = text.lower()
    for mn, fp in FP.items():
        sc = 0
        # Marker phrases: 5 points per occurrence, capped at 15 per marker.
        # Fix: count each marker once instead of scanning the text twice
        # (the original called text.count(m) in both the filter and the value).
        for m in fp["m"]:
            cnt = text.count(m)
            if cnt > 0:
                sc += min(15, cnt * 5)
        # Lexical-pattern regex hits: 3 points each, capped at 20.
        lm = fp["lp"].findall(text)
        if lm:
            sc += min(20, len(lm) * 3)
        # Share of sentences ending with this model's typical endings (up to 20).
        em = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in fp.get("e", [])))
        if sentences:
            sc += int((em / len(sentences)) * 20)
        ms[mn] = min(100, sc)
    # Generic casual-AI bucket: fillers, casual endings, concession patterns.
    general_ai = sum(5 for f in AI_CASUAL_FILLER if f in text)
    casual_end_cnt = sum(1 for s in sentences if any(s.rstrip('.!?').endswith(e) for e in AI_CASUAL_ENDINGS))
    general_ai += casual_end_cnt * 5
    general_ai += len(AI_CONCESSION.findall(text)) * 8
    ms["비격식AI"] = min(100, general_ai)
    # English AI marker bucket (case-insensitive match on the lowered text).
    ms["μ˜μ–΄AI"] = min(100, sum(5 for marker in EN_AI_MARKERS if marker in lowered))
    mx = max(ms.values()) if ms else 0
    # Small bonus when several distinct buckets show at least weak signals.
    multi = sum(1 for v in ms.values() if v >= 10)
    multi_bonus = 10 if multi >= 3 else 5 if multi >= 2 else 0
    base = 85 if mx >= 50 else 65 if mx >= 35 else 45 if mx >= 20 else 25 if mx >= 10 else 10
    return {"score": min(95, base + multi_bonus),
            "model_scores": {k: v for k, v in ms.items() if k not in ("비격식AI", "μ˜μ–΄AI") or v > 0}}
540
def analyze_perplexity(text, sentences, morphemes):
    """Korean-specific perplexity/burstiness proxy with character-entropy
    correction (0-100, higher = more AI-like).

    No language model is used: the "entropy"/"variance"/"order"/"zipf" keys
    in the result hold the four component sub-scores (character-bigram
    entropy, sentence-length burstiness, per-sentence TTR variance,
    sentence-ending entropy), not raw statistics.
    """
    if len(sentences) < 2: return {"score": 40, "entropy": 0, "variance": 0, "order": 0, "zipf": 0}
    # Character-bigram Shannon entropy over non-whitespace characters.
    chars = [c for c in text if c.strip()]
    char_score = 45
    if len(chars) >= 30:
        cbigrams = [(chars[i], chars[i+1]) for i in range(len(chars)-1)]
        cb_freq = Counter(cbigrams)
        total_cb = len(cbigrams)
        char_entropy = -sum((cnt/total_cb)*math.log2(cnt/total_cb) for cnt in cb_freq.values())
        # Lower entropy = more predictable character stream = AI-like.
        if char_entropy < 7.5: char_score = 78
        elif char_entropy < 8.5: char_score = 62
        elif char_entropy < 9.5: char_score = 42
        elif char_entropy < 10.5: char_score = 25
        else: char_score = 12
    # Burstiness: CV of sentence lengths (human text varies more).
    sl = [len(s) for s in sentences]
    burst_score = 45
    if len(sl) >= 3:
        avg = sum(sl)/len(sl)
        std = math.sqrt(sum((l-avg)**2 for l in sl)/len(sl))
        cv = std/(avg+1e-10)
        if cv < 0.15: burst_score = 82
        elif cv < 0.25: burst_score = 62
        elif cv < 0.40: burst_score = 38
        elif cv < 0.60: burst_score = 20
        else: burst_score = 8
    # Standard deviation of per-sentence type/token ratio: uniform lexical
    # richness across sentences is AI-like.
    sent_ttr = []
    for s in sentences:
        sw = split_words(s)
        if len(sw) >= 3:
            sent_ttr.append(len(set(sw))/len(sw))
    ttr_score = 42
    if len(sent_ttr) >= 3:
        avg_ttr = sum(sent_ttr)/len(sent_ttr)
        std_ttr = math.sqrt(sum((t-avg_ttr)**2 for t in sent_ttr)/len(sent_ttr))
        if std_ttr < 0.04: ttr_score = 75
        elif std_ttr < 0.08: ttr_score = 55
        elif std_ttr < 0.15: ttr_score = 35
        else: ttr_score = 15
    # Normalized entropy of the last 3 characters of each sentence:
    # low normalized entropy = monotonous endings = AI-like.
    endings = [s.rstrip('.!?\u2026')[-3:] for s in sentences if len(s) >= 5]
    end_score = 40
    if len(endings) >= 3:
        ef = Counter(endings)
        end_ent = -sum((c/len(endings))*math.log2(c/len(endings)) for c in ef.values())
        max_ent = math.log2(len(ef)) if len(ef) > 1 else 1
        norm_ent = end_ent / (max_ent + 1e-10)
        if norm_ent < 0.5: end_score = 72
        elif norm_ent < 0.7: end_score = 50
        elif norm_ent < 0.85: end_score = 32
        else: end_score = 15
    final = int(char_score * 0.30 + burst_score * 0.30 + ttr_score * 0.20 + end_score * 0.20)
    return {"score": final, "entropy": char_score, "variance": burst_score, "order": ttr_score, "zipf": end_score}
592
# Formal "do"-style verb substitutions (utilize/use/apply/perform/...) that
# paraphrasing/humanizer tools over-insert into Korean text.
HUMANIZER_OVERSUBST = re.compile(r'ν™œμš©ν•˜λ‹€|μ΄μš©ν•˜λ‹€|μ‚¬μš©ν•˜λ‹€|μ μš©ν•˜λ‹€|λ„μž…ν•˜λ‹€|μ±„νƒν•˜λ‹€|μˆ˜ν–‰ν•˜λ‹€|μ§„ν–‰ν•˜λ‹€|μ‹€μ‹œν•˜λ‹€|μ‹€ν–‰ν•˜λ‹€')
# Stiff anaphora / document-style back-references ("the aforementioned",
# "as described above") that read awkwardly after paraphrasing.
HUMANIZER_AWKWARD = re.compile(r'그것은|이것은|저것은|ν•΄λ‹Ή 사항|μ•žμ„œ μ–ΈκΈ‰ν•œ|μ „μˆ ν•œ|μƒκΈ°ν•œ|기술된')
# Over-used passive constructions ("was performed/conducted/utilized")
# characteristic of machine-rewritten prose.
HUMANIZER_PASSIVE = re.compile(r'λ˜μ–΄μ§€[κ³ λŠ”λ©°]|ν•˜κ²Œ λ˜μ—ˆ[λ‹€μŠ΅]|μˆ˜ν–‰λ˜[μ—ˆμ–΄]|μ§„ν–‰λ˜[μ—ˆμ–΄]|μ‹€μ‹œλ˜[μ—ˆμ–΄]|ν™œμš©λ˜[μ—ˆμ–΄]')
595
+ def analyze_humanizer(text, sentences, words, morphemes):
596
+ """Humanizer/Bypasser 탐지 β€” AI 원문 νŒ¨λŸ¬ν”„λ ˆμ΄μ¦ˆ 흔적 뢄석"""
597
+ if len(sentences) < 2: return {"score": 20, "signals": []}
598
+ signals = []
599
+ content_words = [f for f, t in morphemes if t in ('NNG', 'NNP', 'VV', 'VA')]
600
+ if len(content_words) >= 10:
601
+ cw_freq = Counter(content_words)
602
+ hapax = sum(1 for c in cw_freq.values() if c == 1)
603
+ hapax_ratio = hapax / len(cw_freq) if cw_freq else 0
604
+ if hapax_ratio > 0.95 and len(content_words) >= 30:
605
+ signals.append(("λ™μ˜μ–΄κ³Όλ‹€μΉ˜ν™˜", 20, "핡심 μ–΄νœ˜κ°€ κ³Όλ„ν•˜κ²Œ λΆ„μ‚°"))
606
+ elif hapax_ratio > 0.90 and len(content_words) >= 25:
607
+ signals.append(("λ™μ˜μ–΄μΉ˜ν™˜μ˜μ‹¬", 12, "μ–΄νœ˜ 반볡 νšŒν”Ό νŒ¨ν„΄"))
608
+ sl = [len(s) for s in sentences]
609
+ if len(sl) >= 4:
610
+ avg = sum(sl) / len(sl)
611
+ cv = math.sqrt(sum((l - avg)**2 for l in sl) / len(sl)) / (avg + 1e-10)
612
+ unique_ratio = len(set(words)) / len(words) if words else 0
613
+ if cv < 0.20 and unique_ratio > 0.80 and len(sentences) >= 5:
614
+ signals.append(("κ΅¬μ‘°λ³΄μ‘΄μ–΄νœ˜λ³€κ²½", 18, "λ¬Έμž₯ ꡬ쑰 균일 + 비정상적 μ–΄νœ˜ λ‹€μ–‘μ„±"))
615
+ residual = 0
616
+ conn_positions = []
617
+ for i, s in enumerate(sentences):
618
+ stripped = s.strip()
619
+ for c in ['λ˜ν•œ','특히','ν•œνŽΈ','λ”λΆˆμ–΄','μ•„μšΈλŸ¬','λ‚˜μ•„κ°€','이에','κ²Œλ‹€κ°€','반면','κ²°κ΅­']:
620
+ if stripped.startswith(c):
621
+ conn_positions.append(i)
622
+ break
623
+ if len(conn_positions) >= 2:
624
+ gaps = [conn_positions[i] - conn_positions[i-1] for i in range(1, len(conn_positions))]
625
+ if gaps and max(gaps) - min(gaps) <= 1: # 거의 등간격
626
+ signals.append(("μ ‘μ†μ‚¬λ“±κ°„κ²©μž”μ‘΄", 15, "접속사 λ°°μΉ˜κ°€ κ·œμΉ™μ  (AI 원문 ꡬ쑰 μž”μ‘΄)"))
627
+ residual += 15
628
+ oversubst = len(HUMANIZER_OVERSUBST.findall(text))
629
+ awkward = len(HUMANIZER_AWKWARD.findall(text))
630
  passive = len(HUMANIZER_PASSIVE.findall(text))
631
  if oversubst >= 3:
632
  signals.append(("μœ μ‚¬λ™μ‚¬λ‚œλ¬΄", 12, f"ν™œμš©/이용/μ‚¬μš©/적용 λ“± {oversubst}개"))
 
949
  pass
950
  return results[:3]
951
def gemini_plagiarism_check(text_chunk):
    """Plagiarism check for one text chunk via Gemini + Google Search grounding.

    Returns None when the Gemini SDK or API key is unavailable; otherwise a
    dict {"pct": int, "response": str, "sources": list}. On any API/network
    failure it degrades to pct=0 with the truncated error text in "response".
    """
    if not HAS_GENAI or not GEMINI_KEY: return None
    try:
        client = genai.Client(api_key=GEMINI_KEY)
        # Enable Google Search grounding so the model can check the web.
        tool = gtypes.Tool(google_search=gtypes.GoogleSearch())
        # Prompt (Korean): "check whether the following text exists on the
        # internet; report source URLs and a similarity percentage; write
        # 'μœ μ‚¬λ„: XX%' on the last line". Only the first 1000 chars are sent.
        prompt = f"""λ‹€μŒ ν…μŠ€νŠΈκ°€ 인터넷에 μ‘΄μž¬ν•˜λŠ”μ§€ Google Search둜 ν™•μΈν•˜μ„Έμš”.
μœ μ‚¬ν•œ λ¬Έμž₯이 발견되면 좜처 URLκ³Ό μœ μ‚¬λ„(%)λ₯Ό λ³΄κ³ ν•˜μ„Έμš”.
λ§ˆμ§€λ§‰ 쀄에 "μœ μ‚¬λ„: XX%" ν˜•μ‹μœΌλ‘œ μž‘μ„±.
[ν…μŠ€νŠΈ]
{text_chunk[:1000]}"""
        resp = client.models.generate_content(
            model="gemini-2.0-flash-lite",
            contents=prompt,
            config=gtypes.GenerateContentConfig(tools=[tool], temperature=0.1, max_output_tokens=600)
        )
        text_resp = resp.text if resp.text else ""
        sources = []
        # Collect grounded web sources from the first candidate's metadata.
        if hasattr(resp, 'candidates') and resp.candidates:
            gc = resp.candidates[0].grounding_metadata
            if gc and hasattr(gc, 'grounding_chunks'):
                for chunk in gc.grounding_chunks:
                    if hasattr(chunk, 'web') and chunk.web:
                        sources.append({"title": chunk.web.title or "", "url": chunk.web.uri or "", "source": "Google"})
        # Parse the similarity percentage ("μœ μ‚¬λ„: NN") from the answer.
        pm = re.search(r'μœ μ‚¬λ„[:\s]*(\d+)', text_resp)
        pct = int(pm.group(1)) if pm else 0
        return {"pct": pct, "response": text_resp, "sources": sources}
    except Exception as e:
        # Best-effort: API errors must not abort the overall report.
        return {"pct": 0, "response": str(e)[:100], "sources": []}
 
 
 
 
 
980
  def parallel_brave_search(queries, max_workers=10):
981
  """Brave Search 병렬 μ‹€ν–‰ (μ΅œλŒ€ 20개)"""
982
  all_results = {}
 
1020
  all_results.extend(duckduckgo_search(f"{query} λ…Όλ¬Έ ν•™μˆ ", 2))
1021
  return all_results
1022
  def run_plagiarism(text, progress=gr.Progress()):
 
 
1023
  if not text or len(text.strip())<50:
1024
  return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ μ΅œμ†Œ 50자 이상</div>", ""
 
1025
  text = text.strip()
1026
  sents = split_sentences(text)
1027
  now = datetime.now().strftime("%Y-%m-%d %H:%M")
1028
+ has_brave = bool(BRAVE_KEY)
1029
+ has_gemini = bool(HAS_GENAI and GEMINI_KEY)
1030
  progress(0.05, "λ¬Έμž₯ 뢄리...")
1031
+ blocks = []
1032
+ for i in range(0, len(sents), 4):
1033
+ block = ' '.join(sents[i:i+4])
1034
+ if len(block) > 20:
1035
+ blocks.append({"text": block, "sent_indices": list(range(i, min(i+4, len(sents))))})
1036
  all_sources = []
1037
+ sent_matches = {i: [] for i in range(len(sents))} # λ¬Έμž₯별 λ§€μΉ­ 정보
1038
+ block_results = []
1039
  log_lines = []
1040
+ if has_brave:
1041
+ progress(0.15, f"Brave Search 병렬 검색 ({len(blocks)}블둝)...")
1042
+ queries = []
1043
+ for b in blocks:
1044
+ key_phrase = b["text"][:60].strip()
1045
+ queries.append(f'"{key_phrase}"')
1046
+ brave_results = parallel_brave_search(queries[:20])
1047
+ for q, results in brave_results.items():
1048
+ for r in results:
1049
+ all_sources.append(r)
1050
+ for b in blocks:
1051
+ if q.strip('"') in b["text"][:60]:
1052
+ for si in b["sent_indices"]:
1053
+ sent_matches[si].append({"source": r["title"], "url": r["url"], "type": "Brave"})
1054
+ log_lines.append(f"Brave Search: {len(queries)}쿼리 β†’ {sum(len(v) for v in brave_results.values())}건")
1055
+ else:
1056
+ progress(0.15, f"자체 μ›Ή 검색 ({len(blocks)}블둝)...")
1057
+ crawl_queries = []
1058
+ for b in blocks[:10]: # μ΅œλŒ€ 10블둝
1059
+ key_phrase = b["text"][:50].strip()
1060
+ crawl_queries.append((key_phrase, b))
1061
+ with ThreadPoolExecutor(max_workers=5) as executor:
1062
+ futures = {executor.submit(self_crawl_search, q, 3): (q, b) for q, b in crawl_queries}
1063
+ for future in as_completed(futures):
1064
+ q, b = futures[future]
1065
+ try:
1066
+ results = future.result()
1067
+ for r in results:
1068
+ all_sources.append(r)
1069
+ for si in b["sent_indices"]:
1070
+ sent_matches[si].append({"source": r["title"], "url": r["url"], "type": r.get("source","Web")})
1071
+ except: pass
1072
+ log_lines.append(f"자체 웹검색: {len(crawl_queries)}쿼리 (DuckDuckGo)")
1073
+ progress(0.40, "ν•™μˆ  DB 검색 (KCI/RISS/arXiv)...")
1074
+ words = split_words(text)
1075
+ wf = Counter(words)
1076
+ keywords = [w for w, c in wf.most_common(20) if len(w) >= 2 and c >= 2][:5]
1077
+ kw_query = ' '.join(keywords[:3])
1078
+ academic_results = []
1079
+ with ThreadPoolExecutor(max_workers=3) as executor:
1080
+ futures = [
1081
+ executor.submit(search_kci, kw_query),
1082
+ executor.submit(search_riss, kw_query),
1083
+ executor.submit(search_arxiv, kw_query),
1084
+ ]
1085
+ for future in as_completed(futures):
1086
+ try:
1087
+ results = future.result()
1088
+ academic_results.extend(results)
1089
+ all_sources.extend(results)
1090
+ except: pass
1091
+ log_lines.append(f"ν•™μˆ DB: KCI/RISS/arXiv β†’ {len(academic_results)}건")
1092
+ gemini_results = []
1093
+ if has_gemini:
1094
+ progress(0.60, "Gemini + Google Search...")
1095
+ for i, b in enumerate(blocks[:5]): # μ΅œλŒ€ 5블둝
1096
+ gr_result = gemini_plagiarism_check(b["text"])
1097
+ if gr_result:
1098
+ gemini_results.append(gr_result)
1099
+ for src in gr_result.get("sources", []):
1100
+ all_sources.append(src)
1101
+ for si in b["sent_indices"]:
1102
+ sent_matches[si].append({"source": src.get("title",""), "url": src.get("url",""), "type": "Google"})
1103
+ log_lines.append(f"Gemini: {len(blocks[:5])}블둝 β†’ {sum(len(r.get('sources',[])) for r in gemini_results)}좜처")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1104
  progress(0.80, "λ³΄κ³ μ„œ 생성...")
1105
+ matched_sents = sum(1 for si, matches in sent_matches.items() if matches)
1106
+ total_sents = len(sents)
1107
 
1108
+ # βœ… Gemini 90% (메인) + Brave/ν•™μˆ DB 10% (보쑰)
1109
+ brave_pct = int(matched_sents / total_sents * 100) if total_sents > 0 else 0
 
 
 
 
1110
 
1111
+ if gemini_results:
1112
+ gemini_pcts = [r["pct"] for r in gemini_results if r["pct"] > 0]
1113
+ if gemini_pcts:
1114
+ gemini_avg = sum(gemini_pcts) / len(gemini_pcts)
1115
+ plag_pct = int(gemini_avg * 0.9 + brave_pct * 0.1)
1116
+ else:
1117
+ plag_pct = brave_pct
1118
+ else:
1119
+ plag_pct = brave_pct
1120
  seen_urls = set()
1121
  unique_sources = []
1122
  for s in all_sources:
 
1124
  if url and url not in seen_urls:
1125
  seen_urls.add(url)
1126
  unique_sources.append(s)
1127
+ if plag_pct >= 50: grade, grade_color, grade_bg = "ν‘œμ ˆ μ˜μ‹¬", "#FF4444", "#FFE0E0"
1128
+ elif plag_pct >= 30: grade, grade_color, grade_bg = "주의 ν•„μš”", "#FF8800", "#FFF0DD"
1129
+ elif plag_pct >= 15: grade, grade_color, grade_bg = "μœ μ‚¬ ν‘œν˜„ 일뢀", "#DDAA00", "#FFFBE0"
1130
+ elif plag_pct >= 5: grade, grade_color, grade_bg = "μ–‘ν˜Έ", "#4ECDC4", "#E0FFF8"
1131
+ else: grade, grade_color, grade_bg = "우수 (원본성 λ†’μŒ)", "#22AA44", "#E0FFE8"
1132
+ sent_analysis = []
1133
+ for i, s in enumerate(sents):
1134
+ matches = sent_matches.get(i, [])
1135
+ if matches:
1136
+ best = matches[0]
1137
+ sent_analysis.append({"idx":i, "text":s, "matched":True, "source":best.get("source","")[:40], "url":best.get("url",""), "type":best.get("type","")})
1138
+ else:
1139
+ sent_analysis.append({"idx":i, "text":s, "matched":False})
1140
+ sim_sents = [s for s in sent_analysis if s["matched"]]
1141
+ src_groups = {}
1142
+ for src in unique_sources:
1143
+ key = src.get("url","")[:80]
1144
+ if key not in src_groups:
1145
+ src_groups[key] = {"title":src.get("title",""), "url":src.get("url",""), "source":src.get("source",""), "count":0}
1146
+ src_groups[key]["count"] += 1
1147
+ src_list = sorted(src_groups.values(), key=lambda x: -x["count"])
1148
+ methods_used = []
1149
+ if has_brave: methods_used.append("Brave Search(병렬)")
1150
+ elif all_sources: methods_used.append("DuckDuckGo(자체크둀링)")
1151
+ methods_used.append("KCI Β· RISS Β· arXiv")
1152
+ if has_gemini: methods_used.append("Gemini+Google Search")
1153
+ method_str = " + ".join(methods_used)
1154
+ gc = grade_color
1155
  word_count = len(split_words(text))
1156
  char_count = len(text)
1157
  doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
1158
+ similarity_pct = plag_pct
1159
+ citation_pct = 0
1160
+ cat_suspect = len(sim_sents) # μ˜μ‹¬
1161
+ cat_cited = 0 # 인용 (ν˜•μ‹μ  인용 감지)
1162
+ cat_normal = total_sents - cat_suspect - cat_cited # 일반
1163
+ cat_suspect_pct = int(cat_suspect / max(1, total_sents) * 100)
1164
+ cat_normal_pct = 100 - cat_suspect_pct
1165
+ def src_icon(s):
1166
+ src = s.get("source","").lower()
1167
+ if "kci" in src: return "πŸ“š", "KCI"
1168
+ if "riss" in src: return "πŸ“–", "RISS"
1169
+ if "arxiv" in src: return "πŸ“„", "arXiv"
1170
+ if "google" in src: return "πŸ”", "Google"
1171
+ if "brave" in src: return "🌐", "Brave"
1172
+ return "🌐", "Web"
1173
  src_rows = ""
1174
+ for i, sg in enumerate(src_list[:15]):
1175
+ pct = min(100, int(sg["count"] / max(1, total_sents) * 100 * 3))
1176
+ ico, stype = src_icon(sg)
1177
+ title_short = sg["title"][:50] or "(제λͺ© μ—†μŒ)"
1178
+ url_short = sg["url"][:60]
1179
  src_rows += f"""<tr>
1180
+ <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;color:#555;">{i+1}</td>
1181
+ <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;"><span style="font-size:14px;">{ico}</span><br><span style="font-size:9px;color:#666;">{stype}</span></td>
1182
+ <td style="padding:6px 8px;border:1px solid #D5D5D5;"><div style="font-size:11px;font-weight:600;color:#1A3C6E;margin-bottom:2px;">{title_short}</div><div style="font-size:9px;color:#888;word-break:break-all;">{url_short}</div></td>
1183
+ <td style="padding:6px 8px;font-size:12px;text-align:center;border:1px solid #D5D5D5;font-weight:800;color:#D63031;">{pct}%</td>
1184
+ <td style="padding:6px 10px;border:1px solid #D5D5D5;"><div style="background:#EDEDED;height:14px;border-radius:2px;overflow:hidden;"><div style="background:linear-gradient(90deg,#D63031,#FF7675);height:100%;width:{max(3,pct)}%;border-radius:2px;"></div></div></td>
1185
  </tr>"""
1186
+ suspect_rows = ""
1187
+ for i, sa in enumerate(sim_sents[:15]):
1188
+ suspect_rows += f"""<tr>
1189
+ <td style="padding:8px;font-size:11px;text-align:center;color:#888;border:1px solid #D5D5D5;vertical-align:top;">{i+1}</td>
1190
+ <td style="padding:8px;font-size:11px;line-height:1.7;border:1px solid #D5D5D5;vertical-align:top;"><span style="background:#FFF3CD;border-bottom:2px solid #FFD43B;padding:1px 3px;">{sa["text"][:90]}</span></td>
1191
+ <td style="padding:8px;font-size:10px;line-height:1.6;border:1px solid #D5D5D5;vertical-align:top;color:#555;"><span style="background:#FFE0E0;border-bottom:2px solid #E74C3C;padding:1px 3px;">{sa["text"][:70]}...</span></td>
1192
+ <td style="padding:8px;font-size:10px;border:1px solid #D5D5D5;vertical-align:top;"><a href="{sa.get('url','#')}" target="_blank" style="color:#2E86C1;text-decoration:none;font-weight:600;">{sa["source"][:28]}</a><br><span style="font-size:8px;color:#AAA;">{sa.get('type','')}</span></td>
1193
+ </tr>"""
1194
+ full_hl = ""
1195
+ for sa in sent_analysis:
1196
+ sidx = sa["idx"] + 1
1197
+ if sa["matched"]:
1198
+ full_hl += f'<span style="background:#FFD6D6;border-bottom:2px solid #E74C3C;padding:1px 2px;cursor:pointer;" title="[μ˜μ‹¬ #{sidx}] 좜처: {sa.get("source","")}">{sa["text"]}</span> '
1199
+ else:
1200
+ full_hl += f'<span style="color:#333;">{sa["text"]}</span> '
1201
+ bar_suspect_w = max(2, cat_suspect_pct) if cat_suspect > 0 else 0
1202
+ bar_normal_w = 100 - bar_suspect_w
1203
+ HDR_BG = '#3B7DD8'
1204
+ HDR_BG2 = '#4A8DE0'
1205
+ TH = 'padding:8px 10px;font-size:10px;font-weight:700;color:#fff;background:{};text-align:center;border:1px solid {};'.format(HDR_BG, HDR_BG)
1206
+ TL = 'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
1207
  TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
1208
+ SEC = 'font-size:13px;font-weight:800;color:#1A3C6E;margin:0 0 10px 0;padding:8px 12px;background:#EDF2FA;border-left:4px solid {};border-bottom:1px solid #D5D5D5;'.format(HDR_BG)
1209
+ html = f"""<div style="font-family:'Noto Sans KR','Malgun Gothic','Apple SD Gothic Neo',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid #3B7DD8;box-shadow:0 2px 12px rgba(0,0,0,0.08);">
1210
+ <!-- ═══════ 헀더 (CopyKiller μŠ€νƒ€μΌ) ═══════ -->
1211
+ <div style="background:linear-gradient(135deg,{HDR_BG},{HDR_BG2});padding:18px 24px;color:#fff;">
1212
+ <table style="width:100%;"><tr>
1213
+ <td>
1214
+ <div style="font-size:10px;opacity:0.8;letter-spacing:1px;margin-bottom:4px;">AI TEXT DETECTOR Β· PLAGIARISM REPORT</div>
1215
+ <div style="font-size:22px;font-weight:900;letter-spacing:-0.5px;">ν‘œμ ˆ 검사 κ²°κ³Ό ν™•μΈμ„œ</div>
1216
+ </td>
1217
+ <td style="text-align:right;vertical-align:bottom;">
1218
+ <div style="font-size:10px;opacity:0.7;">λ¬Έμ„œλ²ˆν˜Έ {doc_id}</div>
1219
+ <div style="font-size:10px;opacity:0.7;">{now}</div>
1220
+ </td>
1221
+ </tr></table>
1222
  </div>
1223
+ <!-- ═══════ 제좜 정보 ν…Œμ΄λΈ” ═══════ -->
1224
+ <div style="padding:16px 24px 0;">
1225
+ <div style="{SEC}">πŸ“‹ 검사 정보</div>
1226
+ <table style="width:100%;border-collapse:collapse;">
1227
+ <tr>
1228
+ <td style="{TL}width:90px;">검사 μΌμ‹œ</td>
1229
+ <td style="{TV}">{now}</td>
1230
+ <td style="{TL}width:90px;">λ¬Έμ„œλ²ˆν˜Έ</td>
1231
+ <td style="{TV}">{doc_id}</td>
1232
+ </tr>
1233
+ <tr>
1234
+ <td style="{TL}">검사 방법</td>
1235
+ <td style="{TV}font-size:10px;" colspan="3">{method_str}</td>
1236
+ </tr>
1237
+ <tr>
1238
+ <td style="{TL}">전체 λΆ„λŸ‰</td>
1239
+ <td style="{TV}" colspan="3">κΈ€μžμˆ˜ <b>{char_count:,}</b> Β· μ–΄μ ˆμˆ˜ <b>{word_count:,}</b> Β· λ¬Έμž₯수 <b>{total_sents}</b></td>
1240
+ </tr>
1241
+ <tr>
1242
+ <td style="{TL}">검색 λ²”μœ„</td>
1243
+ <td style="{TV}" colspan="3">인터넷(μ›Ή), ν•™μˆ λ…Όλ¬Έ(KCIΒ·RISS), ν•΄μ™Έλ…Όλ¬Έ(arXiv), Google Scholar</td>
1244
+ </tr>
1245
+ </table>
1246
+ </div>
1247
+ <!-- ═══════ 검사 κ²°κ³Ό (도넛 + λ°”) ═══════ -->
1248
  <div style="padding:18px 24px 0;">
1249
+ <div style="{SEC}">πŸ“Š 검사 κ²°κ³Ό</div>
1250
  <table style="width:100%;border-collapse:collapse;">
1251
  <tr>
1252
+ <!-- 도넛 차트 -->
1253
+ <td style="width:200px;padding:20px;text-align:center;border:1px solid #D5D5D5;vertical-align:middle;background:#FAFBFE;">
1254
+ <div style="position:relative;width:140px;height:140px;margin:0 auto;">
1255
+ <svg viewBox="0 0 36 36" style="width:140px;height:140px;transform:rotate(-90deg);">
1256
+ <circle cx="18" cy="18" r="15.9" fill="none" stroke="#E8ECF0" stroke-width="2.8"/>
1257
+ <circle cx="18" cy="18" r="15.9" fill="none" stroke="{gc}" stroke-width="2.8" stroke-dasharray="{plag_pct} {100-plag_pct}" stroke-linecap="round"/>
1258
+ </svg>
1259
+ <div style="position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center;">
1260
+ <div style="font-size:9px;color:#888;margin-bottom:2px;">ν‘œμ ˆλ₯ </div>
1261
+ <div style="font-size:32px;font-weight:900;color:{gc};line-height:1;">{plag_pct}<span style="font-size:14px;font-weight:700;">%</span></div>
1262
+ </div>
1263
+ </div>
1264
+ <div style="margin-top:8px;padding:4px 12px;background:{grade_bg};border:1px solid {gc};border-radius:20px;display:inline-block;">
1265
+ <span style="font-size:11px;font-weight:800;color:{gc};">{grade}</span>
1266
+ </div>
1267
  </td>
1268
+ <!-- 상세 κ²°κ³Ό -->
1269
+ <td style="padding:0;border:1px solid #D5D5D5;vertical-align:top;">
1270
+ <!-- μΉ΄ν…Œκ³ λ¦¬ λ°” (CopyKiller 핡심) -->
1271
+ <div style="padding:14px 18px;border-bottom:1px solid #E8E8E8;">
1272
+ <div style="display:flex;height:28px;border-radius:4px;overflow:hidden;border:1px solid #D0D0D0;margin-bottom:8px;">
1273
+ <div style="background:#E74C3C;width:{bar_suspect_w}%;"></div>
1274
+ <div style="background:#E8ECF0;width:{bar_normal_w}%;"></div>
1275
+ </div>
1276
+ <div style="display:flex;gap:14px;flex-wrap:wrap;font-size:9px;">
1277
+ <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E74C3C;border-radius:2px;"></span> μ˜μ‹¬ <b>{cat_suspect}</b>건</span>
1278
+ <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#F39C12;border-radius:2px;"></span> μΆœμ²˜ν‘œμ‹œ <b>0</b>건</span>
1279
+ <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#3498DB;border-radius:2px;"></span> 인용 <b>{cat_cited}</b>건</span>
1280
+ <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E8ECF0;border:1px solid #CCC;border-radius:2px;"></span> 일반 <b>{cat_normal}</b>건</span>
1281
+ </div>
1282
+ </div>
1283
+ <!-- λΉ„μœ¨ λ°” 3쀄 -->
1284
+ <div style="padding:12px 18px;">
1285
+ <div style="margin-bottom:10px;">
1286
+ <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#E74C3C;">β–  ν‘œμ ˆλ₯ </span><span style="font-size:18px;font-weight:900;color:#E74C3C;">{plag_pct}%</span></div>
1287
+ <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#E74C3C,#FF7675);height:100%;width:{max(1,plag_pct)}%;transition:width 0.5s;"></div></div>
1288
+ </div>
1289
+ <div style="margin-bottom:10px;">
1290
+ <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#F39C12;">β–  μœ μ‚¬οΏ½οΏ½οΏ½</span><span style="font-size:18px;font-weight:900;color:#F39C12;">{similarity_pct}%</span></div>
1291
+ <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#F39C12,#FFEAA7);height:100%;width:{max(1,similarity_pct)}%;transition:width 0.5s;"></div></div>
1292
+ </div>
1293
+ <div>
1294
+ <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#3498DB;">β–  인용λ₯ </span><span style="font-size:18px;font-weight:900;color:#3498DB;">{citation_pct}%</span></div>
1295
+ <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#3498DB,#85C1E9);height:100%;width:{max(1,citation_pct)}%;transition:width 0.5s;"></div></div>
1296
+ </div>
1297
+ </div>
1298
+ <!-- μš”μ•½ 수치 -->
1299
+ <div style="padding:8px 18px;background:#F8F9FB;border-top:1px solid #E8E8E8;">
1300
+ <table style="width:100%;border-collapse:collapse;">
1301
+ <tr>
1302
+ <td style="padding:4px;font-size:10px;color:#888;">μ˜μ‹¬λ¬Έμž₯</td>
1303
+ <td style="padding:4px;font-size:12px;font-weight:800;color:#E74C3C;">{cat_suspect}건</td>
1304
+ <td style="padding:4px;font-size:10px;color:#888;">일반문μž₯</td>
1305
+ <td style="padding:4px;font-size:12px;font-weight:800;color:#27AE60;">{cat_normal}건</td>
1306
+ <td style="padding:4px;font-size:10px;color:#888;">전체</td>
1307
+ <td style="padding:4px;font-size:12px;font-weight:800;color:#333;">{total_sents}건</td>
1308
+ </tr>
1309
+ </table>
1310
  </div>
1311
  </td>
1312
  </tr>
1313
  </table>
1314
  </div>
1315
+ <!-- ═══════ 전체 ν…μŠ€νŠΈ 뢄석 ═══════ -->
1316
+ <div style="padding:18px 24px 0;">
1317
+ <div style="{SEC}">πŸ“ 전체 ν…μŠ€νŠΈ 뢄석</div>
1318
+ <div style="display:flex;gap:16px;margin-bottom:8px;font-size:9px;padding:6px 10px;background:#F8F9FB;border:1px solid #E8E8E8;border-radius:4px;">
1319
+ <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#E74C3C;border-radius:2px;"></span> ν‘œμ ˆ μ˜μ‹¬</span>
1320
+ <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#F39C12;border-radius:2px;"></span> μΆœμ²˜ν‘œμ‹œ</span>
1321
+ <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#3498DB;border-radius:2px;"></span> 인용</span>
1322
+ <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#27AE60;border-radius:2px;"></span> μžκΈ°ν‘œμ ˆ</span>
1323
+ <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#fff;border:1px solid #CCC;border-radius:2px;"></span> 일반</span>
1324
+ </div>
1325
+ <div style="padding:14px;background:#FAFBFC;border:1px solid #D5D5D5;line-height:2.1;font-size:13px;max-height:300px;overflow-y:auto;">{full_hl}</div>
1326
+ </div>
1327
+ <!-- ═══════ ν‘œμ ˆ μ˜μ‹¬ 좜처 ═══════ -->
1328
+ <div style="padding:18px 24px 0;">
1329
+ <div style="{SEC}">πŸ”— ν‘œμ ˆ μ˜μ‹¬ 좜처 ({len(src_list)}건)</div>
1330
+ <table style="width:100%;border-collapse:collapse;">
1331
+ <tr>
1332
+ <th style="{TH}width:32px;">No</th>
1333
+ <th style="{TH}width:48px;">μœ ν˜•</th>
1334
+ <th style="{TH}">좜처λͺ… / URL</th>
1335
+ <th style="{TH}width:55px;">μœ μ‚¬μœ¨</th>
1336
+ <th style="{TH}width:100px;">뢄포</th>
1337
+ </tr>
1338
+ {src_rows if src_rows else '<tr><td colspan="5" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">발견된 μœ μ‚¬ μΆœμ²˜κ°€ μ—†μŠ΅λ‹ˆλ‹€.</td></tr>'}
1339
  </table>
1340
  </div>
1341
+ <!-- ═══════ μ˜μ‹¬ λ¬Έμž₯ 비ꡐ ═══════ -->
1342
+ <div style="padding:18px 24px 0;">
1343
+ <div style="{SEC}">⚠️ μ˜μ‹¬ λ¬Έμž₯ 비ꡐ ({len(sim_sents)}건)</div>
1344
+ <table style="width:100%;border-collapse:collapse;">
1345
+ <tr>
1346
+ <th style="{TH}width:32px;">No</th>
1347
+ <th style="{TH}width:38%;">검사 λ¬Έμž₯ (원문)</th>
1348
+ <th style="{TH}width:34%;">비ꡐ λ¬Έμž₯ (좜처)</th>
1349
+ <th style="{TH}">좜처</th>
1350
+ </tr>
1351
+ {suspect_rows if suspect_rows else '<tr><td colspan="4" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">μœ μ‚¬ μ˜μ‹¬ λ¬Έμž₯이 λ°œκ²¬λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.</td></tr>'}
1352
+ </table>
1353
+ </div>
1354
+ <!-- ═══════ 검사 μ•ˆλ‚΄ ═══════ -->
1355
+ <div style="margin:18px 24px;padding:12px 14px;background:#F5F8FC;border:1px solid #D0DAEA;border-radius:4px;font-size:9px;color:#555;line-height:1.8;">
1356
+ <b style="color:#333;">πŸ“Œ 검사 μ•ˆλ‚΄</b><br>
1357
+ Β· λ³Έ λ³΄κ³ μ„œλŠ” <b>{method_str}</b> 기반 μžλ™ ν‘œμ ˆ 검사 κ²°κ³Όμž…λ‹ˆλ‹€.<br>
1358
+ Β· 검색 λ²”μœ„: 인터넷 μ›ΉνŽ˜μ΄μ§€, ν•™μˆ λ…Όλ¬Έ(KCI, RISS), ν•΄μ™Έλ…Όλ¬Έ(arXiv)<br>
1359
+ Β· μœ μ‚¬λ„λŠ” λ¬Έμž₯ λ‹¨μœ„ λ§€μΉ­ 기반이며, μ΅œμ’… νŒμ •μ€ ꡐ수자/κ²€ν† μžμ˜ 확인이 ν•„μš”ν•©λ‹ˆλ‹€.<br>
1360
+ Β· 인용 ν‘œκΈ°(λ”°μ˜΄ν‘œ, 각주 λ“±)κ°€ ν¬ν•¨λœ λ¬Έμž₯은 인용으둜 λΆ„λ₯˜λ  수 μžˆμŠ΅λ‹ˆλ‹€.
1361
+ </div>
1362
+ <!-- ═══════ ν‘Έν„° (CopyKiller μŠ€νƒ€μΌ) ═══════ -->
1363
+ <div style="padding:10px 24px;background:#F0F3F8;border-top:2px solid {HDR_BG};display:flex;justify-content:space-between;align-items:center;">
1364
+ <div>
1365
+ <span style="font-size:13px;font-weight:900;color:{HDR_BG};">AI Detector</span>
1366
+ <span style="font-size:9px;color:#999;margin-left:6px;">Plagiarism Checker v3.5</span>
1367
+ </div>
1368
+ <div style="text-align:right;">
1369
+ <div style="font-size:9px;color:#AAA;">Powered by Brave Β· KCI Β· RISS Β· arXiv Β· Gemini</div>
1370
+ <div style="font-size:8px;color:#CCC;">{now} Β· ID: {doc_id} Β· All Rights Reserved.</div>
1371
+ </div>
1372
  </div>
1373
  </div>"""
1374
+ log = '\n'.join(log_lines) + f"\n\nμ’…ν•©: {plag_pct}% {grade} | 좜처 {len(unique_sources)}건 | μœ μ‚¬λ¬Έμž₯ {matched_sents}/{total_sents}"
 
 
 
1375
  return html, log
1376
  def run_detection(text, progress=gr.Progress()):
1377
  if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ μ΅œμ†Œ 50자</div>",""