openfree committed on
Commit
0d5c726
·
verified ·
1 Parent(s): 089551b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -325
app.py CHANGED
@@ -467,116 +467,142 @@ def self_crawl_search(query, max_results=3):
467
  all_results.extend(duckduckgo_search(f"{query} 논문 학술", 2))
468
  return all_results
469
  def run_plagiarism(text, progress=gr.Progress()):
 
 
470
  if not text or len(text.strip())<50:
471
  return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""
 
472
  text = text.strip()
473
  sents = split_sentences(text)
474
  now = datetime.now().strftime("%Y-%m-%d %H:%M")
475
- has_brave = bool(BRAVE_KEY)
476
- has_gemini = bool(HAS_GENAI and GEMINI_KEY)
477
  progress(0.05, "문장 분리...")
478
- blocks = []
479
- for i in range(0, len(sents), 4):
480
- block = ' '.join(sents[i:i+4])
481
- if len(block) > 20:
482
- blocks.append({"text": block, "sent_indices": list(range(i, min(i+4, len(sents))))})
483
  all_sources = []
484
- sent_matches = {i: [] for i in range(len(sents))} # 문장별 매칭 정보
485
- block_results = []
486
  log_lines = []
487
- if has_brave:
488
- progress(0.15, f"Brave Search 병렬 검색 ({len(blocks)}블록)...")
489
- queries = []
490
- for b in blocks:
491
- key_phrase = b["text"][:60].strip()
492
- queries.append(f'"{key_phrase}"')
493
- brave_results = parallel_brave_search(queries[:20])
494
- for q, results in brave_results.items():
495
- for r in results:
496
- all_sources.append(r)
497
- for b in blocks:
498
- if q.strip('"') in b["text"][:60]:
499
- for si in b["sent_indices"]:
500
- sent_matches[si].append({"source": r["title"], "url": r["url"], "type": "Brave"})
501
- log_lines.append(f"Brave Search: {len(queries)}쿼리 → {sum(len(v) for v in brave_results.values())}건")
502
- else:
503
- progress(0.15, f"자체 웹 검색 ({len(blocks)}블록)...")
504
- crawl_queries = []
505
- for b in blocks[:10]: # 최대 10블록
506
- key_phrase = b["text"][:50].strip()
507
- crawl_queries.append((key_phrase, b))
508
- with ThreadPoolExecutor(max_workers=5) as executor:
509
- futures = {executor.submit(self_crawl_search, q, 3): (q, b) for q, b in crawl_queries}
510
- for future in as_completed(futures):
511
- q, b = futures[future]
512
- try:
513
- results = future.result()
514
- for r in results:
515
- all_sources.append(r)
516
- for si in b["sent_indices"]:
517
- sent_matches[si].append({"source": r["title"], "url": r["url"], "type": r.get("source","Web")})
518
- except: pass
519
- log_lines.append(f"자체 웹검색: {len(crawl_queries)}쿼리 (DuckDuckGo)")
520
- progress(0.40, "학술 DB 검색 (KCI/RISS/arXiv)...")
521
- words = split_words(text)
522
- wf = Counter(words)
523
- keywords = [w for w, c in wf.most_common(20) if len(w) >= 2 and c >= 2][:5]
524
- kw_query = ' '.join(keywords[:3])
525
- academic_results = []
526
- with ThreadPoolExecutor(max_workers=3) as executor:
527
- futures = [
528
- executor.submit(search_kci, kw_query),
529
- executor.submit(search_riss, kw_query),
530
- executor.submit(search_arxiv, kw_query),
531
- ]
532
- for future in as_completed(futures):
533
- try:
534
- results = future.result()
535
- academic_results.extend(results)
536
- all_sources.extend(results)
537
- except: pass
538
- log_lines.append(f"학술DB: KCI/RISS/arXiv → {len(academic_results)}건")
539
- gemini_results = []
540
- if has_gemini:
541
- progress(0.60, "Gemini + Google Search (전체 텍스트 검사)...")
542
- # ✅ 개선: 블록 제한 제거! 전체 텍스트를 한 번에 Gemini로 검사
543
- gr_result = gemini_plagiarism_check(text)
544
- if gr_result:
545
- gemini_results.append(gr_result)
546
- for src in gr_result.get("sources", []):
547
- all_sources.append(src)
548
- pct = gr_result.get('pct', 0) if gr_result else 0
549
- log_lines.append(f"✓ Gemini: 전체 텍스트 검사 완료 → {pct}% 유사도, {len(gr_result.get('sources',[]))}출처 발견")
550
- progress(0.80, "보고서 생성...")
551
- matched_sents = sum(1 for si, matches in sent_matches.items() if matches)
552
- total_sents = len(sents)
553
 
554
- # ✅ 개선: Gemini를 메인으로 (80%)
555
- # Gemini가 전체 텍스트를 검사했으므로 가장 정확함
 
 
 
556
 
557
- if gemini_results and gemini_results[0].get("pct", 0) > 0:
558
- # Gemini 결과를 메인으로
559
- gemini_pct = gemini_results[0]["pct"]
560
 
561
- # 보조 채널 결과
562
- brave_matched = sum(1 for si, matches in sent_matches.items() if matches)
563
- brave_pct = int(brave_matched / max(1, total_sents) * 100)
564
- academic_pct = int(len(academic_results) / max(1, total_sents) * 100)
565
- helper_pct = max(brave_pct, academic_pct) # 둘 중 높은 값
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
- # Gemini 80% + 보조 채널 20%
568
- if helper_pct > 0:
569
- plag_pct = int(gemini_pct * 0.8 + helper_pct * 0.2)
570
- else:
571
- plag_pct = gemini_pct
572
- else:
573
- # Gemini 없으면 Brave + 학술DB
574
- plag_pct = int(matched_sents / total_sents * 100) if total_sents > 0 else 0
575
- if academic_results and plag_pct == 0:
576
- plag_pct = int(len(academic_results) / max(1, total_sents) * 100)
577
- elif academic_results and plag_pct > 0:
578
- academic_pct = int(len(academic_results) / max(1, total_sents) * 100)
579
- plag_pct = int(plag_pct * 0.6 + academic_pct * 0.4)
 
 
 
 
 
 
 
 
 
 
580
  seen_urls = set()
581
  unique_sources = []
582
  for s in all_sources:
@@ -584,254 +610,79 @@ def run_plagiarism(text, progress=gr.Progress()):
584
  if url and url not in seen_urls:
585
  seen_urls.add(url)
586
  unique_sources.append(s)
587
- if plag_pct >= 50: grade, grade_color, grade_bg = "표절 의심", "#FF4444", "#FFE0E0"
588
- elif plag_pct >= 30: grade, grade_color, grade_bg = "주의 필요", "#FF8800", "#FFF0DD"
589
- elif plag_pct >= 15: grade, grade_color, grade_bg = "유사 표현 일부", "#DDAA00", "#FFFBE0"
590
- elif plag_pct >= 5: grade, grade_color, grade_bg = "양호", "#4ECDC4", "#E0FFF8"
591
- else: grade, grade_color, grade_bg = "우수 (원본성 높음)", "#22AA44", "#E0FFE8"
592
- sent_analysis = []
593
- for i, s in enumerate(sents):
594
- matches = sent_matches.get(i, [])
595
- if matches:
596
- best = matches[0]
597
- sent_analysis.append({"idx":i, "text":s, "matched":True, "source":best.get("source","")[:40], "url":best.get("url",""), "type":best.get("type","")})
598
- else:
599
- sent_analysis.append({"idx":i, "text":s, "matched":False})
600
- sim_sents = [s for s in sent_analysis if s["matched"]]
601
- src_groups = {}
602
- for src in unique_sources:
603
- key = src.get("url","")[:80]
604
- if key not in src_groups:
605
- src_groups[key] = {"title":src.get("title",""), "url":src.get("url",""), "source":src.get("source",""), "count":0}
606
- src_groups[key]["count"] += 1
607
- src_list = sorted(src_groups.values(), key=lambda x: -x["count"])
608
- methods_used = []
609
- if has_brave: methods_used.append("Brave Search(병렬)")
610
- elif all_sources: methods_used.append("DuckDuckGo(자체크롤링)")
611
- methods_used.append("KCI · RISS · arXiv")
612
- if has_gemini: methods_used.append("Gemini+Google Search")
613
- method_str = " + ".join(methods_used)
614
- gc = grade_color
615
  word_count = len(split_words(text))
616
  char_count = len(text)
617
  doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
618
- similarity_pct = plag_pct
619
- citation_pct = 0
620
- cat_suspect = len(sim_sents) # 의심
621
- cat_cited = 0 # 인용 (형식적 인용 감지)
622
- cat_normal = total_sents - cat_suspect - cat_cited # 일반
623
- cat_suspect_pct = int(cat_suspect / max(1, total_sents) * 100)
624
- cat_normal_pct = 100 - cat_suspect_pct
625
- def src_icon(s):
626
- src = s.get("source","").lower()
627
- if "kci" in src: return "📚", "KCI"
628
- if "riss" in src: return "📖", "RISS"
629
- if "arxiv" in src: return "📄", "arXiv"
630
- if "google" in src: return "🔍", "Google"
631
- if "brave" in src: return "🌐", "Brave"
632
- return "🌐", "Web"
633
  src_rows = ""
634
- for i, sg in enumerate(src_list[:15]):
635
- pct = min(100, int(sg["count"] / max(1, total_sents) * 100 * 3))
636
- ico, stype = src_icon(sg)
637
- title_short = sg["title"][:50] or "(제목 없음)"
638
- url_short = sg["url"][:60]
639
  src_rows += f"""<tr>
640
- <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;color:#555;">{i+1}</td>
641
- <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;"><span style="font-size:14px;">{ico}</span><br><span style="font-size:9px;color:#666;">{stype}</span></td>
642
- <td style="padding:6px 8px;border:1px solid #D5D5D5;"><div style="font-size:11px;font-weight:600;color:#1A3C6E;margin-bottom:2px;">{title_short}</div><div style="font-size:9px;color:#888;word-break:break-all;">{url_short}</div></td>
643
- <td style="padding:6px 8px;font-size:12px;text-align:center;border:1px solid #D5D5D5;font-weight:800;color:#D63031;">{pct}%</td>
644
- <td style="padding:6px 10px;border:1px solid #D5D5D5;"><div style="background:#EDEDED;height:14px;border-radius:2px;overflow:hidden;"><div style="background:linear-gradient(90deg,#D63031,#FF7675);height:100%;width:{max(3,pct)}%;border-radius:2px;"></div></div></td>
645
- </tr>"""
646
- suspect_rows = ""
647
- for i, sa in enumerate(sim_sents[:15]):
648
- suspect_rows += f"""<tr>
649
- <td style="padding:8px;font-size:11px;text-align:center;color:#888;border:1px solid #D5D5D5;vertical-align:top;">{i+1}</td>
650
- <td style="padding:8px;font-size:11px;line-height:1.7;border:1px solid #D5D5D5;vertical-align:top;"><span style="background:#FFF3CD;border-bottom:2px solid #FFD43B;padding:1px 3px;">{sa["text"][:90]}</span></td>
651
- <td style="padding:8px;font-size:10px;line-height:1.6;border:1px solid #D5D5D5;vertical-align:top;color:#555;"><span style="background:#FFE0E0;border-bottom:2px solid #E74C3C;padding:1px 3px;">{sa["text"][:70]}...</span></td>
652
- <td style="padding:8px;font-size:10px;border:1px solid #D5D5D5;vertical-align:top;"><a href="{sa.get('url','#')}" target="_blank" style="color:#2E86C1;text-decoration:none;font-weight:600;">{sa["source"][:28]}</a><br><span style="font-size:8px;color:#AAA;">{sa.get('type','')}</span></td>
653
  </tr>"""
654
- full_hl = ""
655
- for sa in sent_analysis:
656
- sidx = sa["idx"] + 1
657
- if sa["matched"]:
658
- full_hl += f'<span style="background:#FFD6D6;border-bottom:2px solid #E74C3C;padding:1px 2px;cursor:pointer;" title="[의심 #{sidx}] 출처: {sa.get("source","")}">{sa["text"]}</span> '
659
- else:
660
- full_hl += f'<span style="color:#333;">{sa["text"]}</span> '
661
- bar_suspect_w = max(2, cat_suspect_pct) if cat_suspect > 0 else 0
662
- bar_normal_w = 100 - bar_suspect_w
663
- HDR_BG = '#3B7DD8'
664
- HDR_BG2 = '#4A8DE0'
665
- TH = 'padding:8px 10px;font-size:10px;font-weight:700;color:#fff;background:{};text-align:center;border:1px solid {};'.format(HDR_BG, HDR_BG)
666
- TL = 'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
667
  TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
668
- SEC = 'font-size:13px;font-weight:800;color:#1A3C6E;margin:0 0 10px 0;padding:8px 12px;background:#EDF2FA;border-left:4px solid {};border-bottom:1px solid #D5D5D5;'.format(HDR_BG)
669
- html = f"""<div style="font-family:'Noto Sans KR','Malgun Gothic','Apple SD Gothic Neo',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid #3B7DD8;box-shadow:0 2px 12px rgba(0,0,0,0.08);">
670
- <!-- ═══════ 헤더 (CopyKiller 스타일) ═══════ -->
671
- <div style="background:linear-gradient(135deg,{HDR_BG},{HDR_BG2});padding:18px 24px;color:#fff;">
672
- <table style="width:100%;"><tr>
673
- <td>
674
- <div style="font-size:10px;opacity:0.8;letter-spacing:1px;margin-bottom:4px;">AI TEXT DETECTOR · PLAGIARISM REPORT</div>
675
- <div style="font-size:22px;font-weight:900;letter-spacing:-0.5px;">표절 검사 결과 확인서</div>
676
- </td>
677
- <td style="text-align:right;vertical-align:bottom;">
678
- <div style="font-size:10px;opacity:0.7;">문서번호 {doc_id}</div>
679
- <div style="font-size:10px;opacity:0.7;">{now}</div>
680
- </td>
681
- </tr></table>
682
- </div>
683
- <!-- ═══════ 제출 정보 테이블 ═══════ -->
684
- <div style="padding:16px 24px 0;">
685
- <div style="{SEC}">📋 검사 정보</div>
686
- <table style="width:100%;border-collapse:collapse;">
687
- <tr>
688
- <td style="{TL}width:90px;">검사 일시</td>
689
- <td style="{TV}">{now}</td>
690
- <td style="{TL}width:90px;">문서번호</td>
691
- <td style="{TV}">{doc_id}</td>
692
- </tr>
693
- <tr>
694
- <td style="{TL}">검사 방법</td>
695
- <td style="{TV}font-size:10px;" colspan="3">{method_str}</td>
696
- </tr>
697
- <tr>
698
- <td style="{TL}">전체 분량</td>
699
- <td style="{TV}" colspan="3">글자수 <b>{char_count:,}</b> · 어절수 <b>{word_count:,}</b> · 문장수 <b>{total_sents}</b></td>
700
- </tr>
701
- <tr>
702
- <td style="{TL}">검색 범위</td>
703
- <td style="{TV}" colspan="3">인터넷(웹), 학술논문(KCI·RISS), 해외논문(arXiv), Google Scholar</td>
704
- </tr>
705
- </table>
706
  </div>
707
- <!-- ═══════ 검사 결과 (도넛 + 바) ═══════ -->
708
  <div style="padding:18px 24px 0;">
709
- <div style="{SEC}">📊 검사 결과</div>
710
  <table style="width:100%;border-collapse:collapse;">
711
  <tr>
712
- <!-- 도넛 차트 -->
713
- <td style="width:200px;padding:20px;text-align:center;border:1px solid #D5D5D5;vertical-align:middle;background:#FAFBFE;">
714
- <div style="position:relative;width:140px;height:140px;margin:0 auto;">
715
- <svg viewBox="0 0 36 36" style="width:140px;height:140px;transform:rotate(-90deg);">
716
- <circle cx="18" cy="18" r="15.9" fill="none" stroke="#E8ECF0" stroke-width="2.8"/>
717
- <circle cx="18" cy="18" r="15.9" fill="none" stroke="{gc}" stroke-width="2.8" stroke-dasharray="{plag_pct} {100-plag_pct}" stroke-linecap="round"/>
718
- </svg>
719
- <div style="position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);text-align:center;">
720
- <div style="font-size:9px;color:#888;margin-bottom:2px;">표절률</div>
721
- <div style="font-size:32px;font-weight:900;color:{gc};line-height:1;">{plag_pct}<span style="font-size:14px;font-weight:700;">%</span></div>
722
- </div>
723
- </div>
724
- <div style="margin-top:8px;padding:4px 12px;background:{grade_bg};border:1px solid {gc};border-radius:20px;display:inline-block;">
725
- <span style="font-size:11px;font-weight:800;color:{gc};">{grade}</span>
726
- </div>
727
  </td>
728
- <!-- 상세 결과 -->
729
- <td style="padding:0;border:1px solid #D5D5D5;vertical-align:top;">
730
- <!-- 카테고리 바 (CopyKiller 핵심) -->
731
- <div style="padding:14px 18px;border-bottom:1px solid #E8E8E8;">
732
- <div style="display:flex;height:28px;border-radius:4px;overflow:hidden;border:1px solid #D0D0D0;margin-bottom:8px;">
733
- <div style="background:#E74C3C;width:{bar_suspect_w}%;"></div>
734
- <div style="background:#E8ECF0;width:{bar_normal_w}%;"></div>
735
- </div>
736
- <div style="display:flex;gap:14px;flex-wrap:wrap;font-size:9px;">
737
- <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E74C3C;border-radius:2px;"></span> 의심 <b>{cat_suspect}</b>건</span>
738
- <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#F39C12;border-radius:2px;"></span> 출처표시 <b>0</b>건</span>
739
- <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#3498DB;border-radius:2px;"></span> 인용 <b>{cat_cited}</b>건</span>
740
- <span style="display:flex;align-items:center;gap:3px;"><span style="display:inline-block;width:10px;height:10px;background:#E8ECF0;border:1px solid #CCC;border-radius:2px;"></span> 일반 <b>{cat_normal}</b>건</span>
741
- </div>
742
- </div>
743
- <!-- 비율 바 3줄 -->
744
- <div style="padding:12px 18px;">
745
- <div style="margin-bottom:10px;">
746
- <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#E74C3C;">■ 표절률</span><span style="font-size:18px;font-weight:900;color:#E74C3C;">{plag_pct}%</span></div>
747
- <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#E74C3C,#FF7675);height:100%;width:{max(1,plag_pct)}%;transition:width 0.5s;"></div></div>
748
- </div>
749
- <div style="margin-bottom:10px;">
750
- <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#F39C12;">■ 유사율</span><span style="font-size:18px;font-weight:900;color:#F39C12;">{similarity_pct}%</span></div>
751
- <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#F39C12,#FFEAA7);height:100%;width:{max(1,similarity_pct)}%;transition:width 0.5s;"></div></div>
752
- </div>
753
- <div>
754
- <div style="display:flex;justify-content:space-between;margin-bottom:4px;"><span style="font-size:11px;font-weight:700;color:#3498DB;">■ 인용률</span><span style="font-size:18px;font-weight:900;color:#3498DB;">{citation_pct}%</span></div>
755
- <div style="background:#EDEDED;height:16px;border-radius:3px;overflow:hidden;border:1px solid #DDD;"><div style="background:linear-gradient(90deg,#3498DB,#85C1E9);height:100%;width:{max(1,citation_pct)}%;transition:width 0.5s;"></div></div>
756
- </div>
757
- </div>
758
- <!-- 요약 수치 -->
759
- <div style="padding:8px 18px;background:#F8F9FB;border-top:1px solid #E8E8E8;">
760
- <table style="width:100%;border-collapse:collapse;">
761
- <tr>
762
- <td style="padding:4px;font-size:10px;color:#888;">의심문장</td>
763
- <td style="padding:4px;font-size:12px;font-weight:800;color:#E74C3C;">{cat_suspect}건</td>
764
- <td style="padding:4px;font-size:10px;color:#888;">일반문장</td>
765
- <td style="padding:4px;font-size:12px;font-weight:800;color:#27AE60;">{cat_normal}건</td>
766
- <td style="padding:4px;font-size:10px;color:#888;">전체</td>
767
- <td style="padding:4px;font-size:12px;font-weight:800;color:#333;">{total_sents}건</td>
768
- </tr>
769
- </table>
770
  </div>
771
  </td>
772
  </tr>
773
  </table>
774
  </div>
775
- <!-- ═══════ 전체 텍스트 분석 ═══════ -->
776
- <div style="padding:18px 24px 0;">
777
- <div style="{SEC}">📝 전체 텍스트 분석</div>
778
- <div style="display:flex;gap:16px;margin-bottom:8px;font-size:9px;padding:6px 10px;background:#F8F9FB;border:1px solid #E8E8E8;border-radius:4px;">
779
- <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#E74C3C;border-radius:2px;"></span> 표절 의심</span>
780
- <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#F39C12;border-radius:2px;"></span> 출처표시</span>
781
- <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#3498DB;border-radius:2px;"></span> 인용</span>
782
- <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#27AE60;border-radius:2px;"></span> 자기표절</span>
783
- <span style="display:flex;align-items:center;gap:4px;"><span style="display:inline-block;width:12px;height:12px;background:#fff;border:1px solid #CCC;border-radius:2px;"></span> 일반</span>
784
- </div>
785
- <div style="padding:14px;background:#FAFBFC;border:1px solid #D5D5D5;line-height:2.1;font-size:13px;max-height:300px;overflow-y:auto;">{full_hl}</div>
786
- </div>
787
- <!-- ═══════ 표절 의심 출처 ═══════ -->
788
- <div style="padding:18px 24px 0;">
789
- <div style="{SEC}">🔗 표절 의심 출처 ({len(src_list)}건)</div>
790
- <table style="width:100%;border-collapse:collapse;">
791
- <tr>
792
- <th style="{TH}width:32px;">No</th>
793
- <th style="{TH}width:48px;">유형</th>
794
- <th style="{TH}">출처명 / URL</th>
795
- <th style="{TH}width:55px;">유사율</th>
796
- <th style="{TH}width:100px;">분포</th>
797
- </tr>
798
- {src_rows if src_rows else '<tr><td colspan="5" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">발견된 유사 출처가 없습니다.</td></tr>'}
799
- </table>
800
- </div>
801
- <!-- ═══════ 의심 문장 비교 ═══════ -->
802
- <div style="padding:18px 24px 0;">
803
- <div style="{SEC}">⚠️ 의심 문장 비교 ({len(sim_sents)}건)</div>
804
- <table style="width:100%;border-collapse:collapse;">
805
- <tr>
806
- <th style="{TH}width:32px;">No</th>
807
- <th style="{TH}width:38%;">검사 문장 (원문)</th>
808
- <th style="{TH}width:34%;">비교 문장 (출처)</th>
809
- <th style="{TH}">출처</th>
810
- </tr>
811
- {suspect_rows if suspect_rows else '<tr><td colspan="4" style="padding:18px;text-align:center;color:#999;font-size:11px;border:1px solid #D5D5D5;background:#FAFBFC;">유사 의심 문장이 발견되지 않았습니다.</td></tr>'}
812
  </table>
813
  </div>
814
- <!-- ═══════ 검사 안내 ═══════ -->
815
- <div style="margin:18px 24px;padding:12px 14px;background:#F5F8FC;border:1px solid #D0DAEA;border-radius:4px;font-size:9px;color:#555;line-height:1.8;">
816
- <b style="color:#333;">📌 검사 안내</b><br>
817
- · 본 보고서는 <b>{method_str}</b> 기반 자동 표절 검사 결과입니다.<br>
818
- · 검색 범위: 인터넷 웹페이지, 학술논문(KCI, RISS), 해외논문(arXiv)<br>
819
- · 유사도는 문장 단위 매칭 기반이며, 최종 판정은 교수자/검토자의 확인이 필요합니다.<br>
820
- · 인용 표기(따옴표, 각주 등)가 포함된 문장은 인용으로 분류될 수 있습니다.
821
- </div>
822
- <!-- ═══════ 푸터 (CopyKiller 스타일) ═══════ -->
823
- <div style="padding:10px 24px;background:#F0F3F8;border-top:2px solid {HDR_BG};display:flex;justify-content:space-between;align-items:center;">
824
- <div>
825
- <span style="font-size:13px;font-weight:900;color:{HDR_BG};">AI Detector</span>
826
- <span style="font-size:9px;color:#999;margin-left:6px;">Plagiarism Checker v3.5</span>
827
- </div>
828
- <div style="text-align:right;">
829
- <div style="font-size:9px;color:#AAA;">Powered by Brave · KCI · RISS · arXiv · Gemini</div>
830
- <div style="font-size:8px;color:#CCC;">{now} · ID: {doc_id} · All Rights Reserved.</div>
831
- </div>
832
  </div>
833
  </div>"""
834
- log = '\n'.join(log_lines) + f"\n\n종합: {plag_pct}% {grade} | 출처 {len(unique_sources)}건 | 유사문장 {matched_sents}/{total_sents}"
 
 
 
835
  return html, log
836
  def run_detection(text, progress=gr.Progress()):
837
  if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""
 
467
  all_results.extend(duckduckgo_search(f"{query} 논문 학술", 2))
468
  return all_results
469
  def run_plagiarism(text, progress=gr.Progress()):
470
+ """✅ Gemini Google Search 90% (메인) + Brave/arXiv 10% (보조)"""
471
+
472
  if not text or len(text.strip())<50:
473
  return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""
474
+
475
  text = text.strip()
476
  sents = split_sentences(text)
477
  now = datetime.now().strftime("%Y-%m-%d %H:%M")
478
+
 
479
  progress(0.05, "문장 분리...")
480
+
 
 
 
 
481
  all_sources = []
 
 
482
  log_lines = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
+ # ============================================
485
+ # 메인 (90%): Gemini Google Search
486
+ # ============================================
487
+ gemini_pct = 0
488
+ gemini_sources = []
489
 
490
+ if HAS_GENAI and GEMINI_KEY:
491
+ progress(0.20, "Gemini + Google Search (메인 90% 표절 검)...")
 
492
 
493
+ try:
494
+ client = genai.Client(api_key=GEMINI_KEY)
495
+
496
+ prompt = f"""당신은 고급 표절 검사 전문가입니다.
497
+
498
+ 아래 텍스트를 Google Search로 철저히 검색하여 유사한 내용을 모두 찾으세요.
499
+
500
+ [검사 텍스트]
501
+ {text}
502
+
503
+ 응답 형식:
504
+ 발견된 유사 내용:
505
+ - [제목 1] (유사도: XX%)
506
+ - [제목 2] (유사도: XX%)
507
+
508
+ 최종 표절율: XX%"""
509
+
510
+ contents = [
511
+ types.Content(
512
+ role="user",
513
+ parts=[
514
+ types.Part.from_text(text=prompt),
515
+ ],
516
+ )
517
+ ]
518
+
519
+ tools = [
520
+ types.Tool(googleSearch=types.GoogleSearch()),
521
+ ]
522
+
523
+ config = types.GenerateContentConfig(
524
+ thinking_config=types.ThinkingConfig(thinking_budget=0),
525
+ tools=tools,
526
+ temperature=0.3,
527
+ max_output_tokens=3000,
528
+ )
529
+
530
+ # 스트리밍으로 수집
531
+ full_response = ""
532
+ for chunk in client.models.generate_content_stream(
533
+ model="gemini-2.0-flash-lite-latest",
534
+ contents=contents,
535
+ config=config,
536
+ ):
537
+ if chunk.text:
538
+ full_response += chunk.text
539
+
540
+ # 유사도 추출
541
+ pm = re.search(r'(?:표절|유사도)[:\s]*(\d+)', full_response)
542
+ if pm:
543
+ gemini_pct = int(pm.group(1))
544
+
545
+ # 출처 추출
546
+ for m in re.finditer(r'https?://[^\s\)]+', full_response):
547
+ url = m.group(0)
548
+ if len(url) > 10:
549
+ gemini_sources.append({
550
+ "title": url.split('/')[2] if '/' in url else url,
551
+ "url": url,
552
+ "source": "Google",
553
+ })
554
+ all_sources.append({
555
+ "title": url.split('/')[2] if '/' in url else url,
556
+ "url": url,
557
+ "source": "Google",
558
+ })
559
+
560
+ log_lines.append(f"✅ Gemini: {gemini_pct}% → {len(gemini_sources)}개 출처")
561
+
562
+ except Exception as e:
563
+ log_lines.append(f"⚠️ Gemini: {str(e)[:80]}")
564
+
565
+ # ============================================
566
+ # 보조 (10%): Brave + arXiv
567
+ # ============================================
568
+ brave_pct = 0
569
+ arxiv_pct = 0
570
+
571
+ progress(0.60, "보조 검사 (Brave + arXiv)...")
572
+
573
+ try:
574
+ if BRAVE_KEY:
575
+ words = split_words(text)
576
+ key_query = ' '.join(words[:5])
577
+ brave_results = brave_search(key_query, 3)
578
+ if brave_results:
579
+ brave_pct = 30
580
+ all_sources.extend(brave_results)
581
+ log_lines.append(f"Brave: {len(brave_results)}건")
582
 
583
+ words = split_words(text)
584
+ wf = Counter(words)
585
+ keywords = [w for w, c in wf.most_common(10) if len(w) >= 3][:3]
586
+ arxiv_results = search_arxiv(' '.join(keywords))
587
+ if arxiv_results:
588
+ arxiv_pct = 20
589
+ all_sources.extend(arxiv_results)
590
+ log_lines.append(f"arXiv: {len(arxiv_results)}건")
591
+ except:
592
+ pass
593
+
594
+ # ============================================
595
+ # 최종: Gemini 90% + 보조 10%
596
+ # ============================================
597
+ progress(0.80, "보고서 생성...")
598
+
599
+ helper_pct = max(brave_pct, arxiv_pct)
600
+ plag_pct = int(gemini_pct * 0.9 + helper_pct * 0.1) if gemini_pct > 0 else helper_pct
601
+
602
+ # ============================================
603
+ # HTML 보고서
604
+ # ============================================
605
+
606
  seen_urls = set()
607
  unique_sources = []
608
  for s in all_sources:
 
610
  if url and url not in seen_urls:
611
  seen_urls.add(url)
612
  unique_sources.append(s)
613
+
614
+ if plag_pct >= 50: grade, gc = "표절 의심", "#FF4444"
615
+ elif plag_pct >= 30: grade, gc = "주의 필요", "#FF8800"
616
+ elif plag_pct >= 15: grade, gc = "유사 표현 일부", "#DDAA00"
617
+ elif plag_pct >= 5: grade, gc = "양호", "#4ECDC4"
618
+ else: grade, gc = "우수", "#22AA44"
619
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  word_count = len(split_words(text))
621
  char_count = len(text)
622
  doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
623
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  src_rows = ""
625
+ for i, s in enumerate(unique_sources[:15]):
626
+ ico = "🔍" if "google" in s.get("source","").lower() else "🌐"
 
 
 
627
  src_rows += f"""<tr>
628
+ <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;">{i+1}</td>
629
+ <td style="padding:6px 8px;font-size:11px;text-align:center;border:1px solid #D5D5D5;">{ico}</td>
630
+ <td style="padding:6px 8px;border:1px solid #D5D5D5;font-size:11px;color:#1A3C6E;"><strong>{s['title'][:40]}</strong></td>
631
+ <td style="padding:6px 8px;border:1px solid #D5D5D5;font-size:9px;color:#666;word-break:break-all;">{s['url'][:50]}</td>
 
 
 
 
 
 
 
 
 
632
  </tr>"""
633
+
634
+ HDR = '#3B7DD8'
635
+ TL = f'padding:7px 10px;font-size:11px;color:#444;font-weight:600;background:#EDF2FA;border:1px solid #D5D5D5;'
 
 
 
 
 
 
 
 
 
 
636
  TV = 'padding:7px 10px;font-size:12px;color:#333;border:1px solid #D5D5D5;'
637
+
638
+ html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:780px;margin:0 auto;background:#fff;border:2px solid {HDR};box-shadow:0 2px 12px rgba(0,0,0,0.08);">
639
+ <div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:18px 24px;color:#fff;">
640
+ <div style="font-size:22px;font-weight:900;">표절 검사 결과</div>
641
+ <div style="font-size:10px;opacity:0.8;">문서: {doc_id} | {now}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  </div>
 
643
  <div style="padding:18px 24px 0;">
 
644
  <table style="width:100%;border-collapse:collapse;">
645
  <tr>
646
+ <td style="width:200px;padding:20px;text-align:center;border:1px solid #D5D5D5;background:#FAFBFE;">
647
+ <div style="font-size:48px;font-weight:900;color:{gc};">{plag_pct}%</div>
648
+ <div style="font-size:12px;color:#666;margin-top:4px;">표절율</div>
 
 
 
 
 
 
 
 
 
 
 
 
649
  </td>
650
+ <td style="padding:20px;border:1px solid #D5D5D5;">
651
+ <div style="font-size:16px;font-weight:900;color:{gc};margin-bottom:12px;">{grade}</div>
652
+ <div style="font-size:11px;color:#666;line-height:1.8;">
653
+ <div>글자수: <b>{char_count:,}</b></div>
654
+ <div>단어수: <b>{word_count:,}</b></div>
655
+ <div>출처: <b>{len(unique_sources)}</b></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  </div>
657
  </td>
658
  </tr>
659
  </table>
660
  </div>
661
+ <div style="padding:16px 24px;">
662
+ <div style="font-size:13px;font-weight:800;color:#1A3C6E;margin-bottom:10px;padding-bottom:8px;border-bottom:2px solid {HDR};">🔍 발견된 출처</div>
663
+ <table style="width:100%;border-collapse:collapse;font-size:10px;">
664
+ <thead>
665
+ <tr style="background:{HDR};color:white;">
666
+ <th style="padding:8px;border:1px solid #D5D5D5;width:30px;">순위</th>
667
+ <th style="padding:8px;border:1px solid #D5D5D5;width:30px;">출처</th>
668
+ <th style="padding:8px;border:1px solid #D5D5D5;">제목</th>
669
+ <th style="padding:8px;border:1px solid #D5D5D5;">URL</th>
670
+ </tr>
671
+ </thead>
672
+ <tbody>
673
+ {src_rows if src_rows else '<tr><td colspan="4" style="padding:16px;text-align:center;color:#999;">발견된 출처 없음</td></tr>'}
674
+ </tbody>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  </table>
676
  </div>
677
+ <div style="padding:16px 24px;background:#FFF8E1;border-top:1px solid #D5D5D5;font-size:10px;color:#666;line-height:1.6;">
678
+ <strong>분석:</strong> <strong style="color:#D63031;">Gemini Google Search 90% (메인)</strong> + Brave/arXiv 10% (보조)<br>
679
+ <strong>방법:</strong> Gemini의 AI 분석 + Google Search Grounding으로 실시간 웹 검색
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
  </div>
681
  </div>"""
682
+
683
+ log = "\n".join(log_lines)
684
+ progress(0.95, "완료...")
685
+
686
  return html, log
687
  def run_detection(text, progress=gr.Progress()):
688
  if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""