import gradio as gr
import numpy as np
import torch
import os
import json
from core.model import DiscrepancyEstimator
import re
import docx
import spaces
from datasets import load_dataset


def read_file_content(file):
    """Return the text content of an uploaded file.

    Supports plain-text (.txt) and Word (.docx) uploads; any other type —
    or no file at all — yields an empty string.
    """
    if file is None:
        return ""
    if file.name.endswith('.txt'):
        with open(file.name, 'r', encoding='utf-8') as f:
            return f.read()
    if file.name.endswith('.docx'):
        doc = docx.Document(file.name)
        return '\n'.join(para.text for para in doc.paragraphs)
    return ""


def split_sentences(text):
    """Split *text* into sentences on 。 or . while keeping the delimiter."""
    # re.split with a capturing group yields alternating text/delimiter pieces,
    # so pairing parts[i] with parts[i+1] re-attaches each sentence's delimiter.
    parts = re.split(r'([。.])', text)
    sentences = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
    if len(parts) % 2 == 1:
        # Trailing piece with no final delimiter (may be empty; filtered below).
        sentences.append(parts[-1])
    return [s.strip() for s in sentences if s.strip()]


def count_words(sentence, language='Chinese'):
    """Count "words": characters for Chinese, whitespace-separated tokens otherwise."""
    cleaned = sentence.replace('\n', '').replace('\r', '')
    return len(cleaned) if language == 'Chinese' else len(cleaned.split())


def segment_text(sentences, language='Chinese'):
    """Greedily pack *sentences* into segments of roughly 100 "words" each.

    A sentence longer than 100 words is paired with the following sentence
    when the pair stays within 200 words, otherwise emitted on its own.
    A short final segment (< 100 words) is merged back into the previous
    segment when the merge stays within 200 words.
    """
    joiner = '' if language == 'Chinese' else ' '
    result = []
    current_segment = []
    current_length = 0
    i = 0
    n = len(sentences)
    while i < n:
        sentence = sentences[i]
        word_count = count_words(sentence, language)
        if word_count > 100:
            if i + 1 < n and word_count + count_words(sentences[i + 1], language) <= 200:
                # Flush the accumulated segment, then emit the pair.
                if current_segment:
                    result.append(joiner.join(current_segment))
                result.append(sentence + joiner + sentences[i + 1])
                current_segment = []
                current_length = 0
                # BUG FIX: the original did `i += 1` inside a
                # `for i, ... in enumerate(...)` loop, which has no effect on
                # the iteration — the paired sentence was then processed again
                # and duplicated in the output. The while-loop makes the skip real.
                i += 2
                continue
            # Oversized sentence with no viable pair: emit it on its own.
            if current_segment:
                result.append(joiner.join(current_segment))
            result.append(sentence)
            current_segment = []
            current_length = 0
        elif current_length + word_count > 100:
            # Current segment would exceed 100 words: flush and start a new one.
            if current_segment:
                result.append(joiner.join(current_segment))
            current_segment = [sentence]
            current_length = word_count
        else:
            # Keep accumulating.
            current_segment.append(sentence)
            current_length += word_count
        i += 1

    # Handle the final, partially-filled segment.
    if current_segment:
        tail = joiner.join(current_segment)
        if current_length < 100 and result and current_length + count_words(result[-1], language) <= 200:
            # Short trailing segment: fold it into the previous one.
            result.append(result.pop() + joiner + tail)
        else:
            result.append(tail)
    return result


def extract_latex_text(latex_source):
    """Extract detectable plain text from LaTeX source.

    NOTE(review): the original function body was truncated in this copy of
    the file (it ended mid-regex at ``re.sub(r'(?``). The comment-stripping
    step below is a reconstruction consistent with the original comment
    "删除注释(排除转义后的%)" — confirm against the upstream source.
    """
    # Keep only the body of the document environment, if present.
    doc_pattern = re.compile(r'\\begin{document}(.*?)\\end{document}', re.DOTALL)
    match = doc_pattern.search(latex_source)
    content = match.group(1) if match else latex_source
    # Drop % comments, but keep escaped \% characters (negative lookbehind).
    content = re.sub(r'(?<!\\)%.*', '', content)
    return content
crit - offset) & (np.array(real_crits) < crit + offset)) cnt_fake = np.sum((np.array(fake_crits) > crit - offset) & (np.array(fake_crits) < crit + offset)) probs[task] = (cnt_fake / (cnt_real + cnt_fake)) if (cnt_real + cnt_fake) > 0 else 0.5 return probs device = 'cuda' zh_prob_estimator = ProbEstimator(ref_file_dir="JiachenFu/Qwen2-0.5B-detectanyllm-detector-ref-zh") en_prob_estimator = ProbEstimator(ref_file_dir="JiachenFu/Qwen2-0.5B-detectanyllm-detector-ref-en") @spaces.GPU def greet(mode, language, input_text): if mode == "LaTex": input_text = extract_latex_text(input_text) split_texts = split_sentences(input_text) sub_texts = segment_text(split_texts, language=language) detected = [] if language == "Chinese": model = DiscrepancyEstimator(pretrained_ckpt="JiachenFu/Qwen2-0.5B-detectanyllm-detector-zh").to(device) prob_estimator = zh_prob_estimator else: model = DiscrepancyEstimator(pretrained_ckpt="JiachenFu/Qwen2-0.5B-detectanyllm-detector-en").to(device) prob_estimator = en_prob_estimator model.eval() for i, sub_text in enumerate(sub_texts): text_content = sub_text print(f'processing {sub_text}') tokens = model.scoring_tokenizer( text_content, return_tensors='pt', padding=True, truncation=True, return_token_type_ids=False ) print(f'tokenized') input_ids = tokens['input_ids'].to(device) attention_mask = tokens['attention_mask'].to(device) with torch.no_grad(): output = model.get_discrepancy_of_scoring_and_reference_models( input_ids_for_scoring_model=input_ids, attention_mask_for_scoring_model=attention_mask, input_ids_for_reference_model=None, attention_mask_for_reference_model=None, ) discrepancy = output['scoring_discrepancy'] discrepancy = discrepancy.cpu().numpy().item() print(f'discrepancy: {discrepancy}') probs = prob_estimator.crit_to_prob(discrepancy) if discrepancy < 15: for task in probs.keys(): probs[task] = 0.0 detected.append({ 'order': i, 'text': text_content, 'words_count': len(text_content) if language == "Chinese" else 
len(text_content.split()), 'probs': probs }) # 添加绝对定位的总概率显示 # 构建动画效果 html_output = '''
''' current_delay = 0.0 # 当前动画延迟时间 char_duration = 0.001 # 每个字符的间隔时间 # 处理文本内容 for item in detected: ai_generate_prob = item['probs']['generate'] ai_revise_prob = max(item['probs']['polish'], item['probs']['rewrite']) prob = max(ai_generate_prob, ai_revise_prob) if prob >= 0.75: if ai_generate_prob >= ai_revise_prob: color = "red" item["generate"] = 1 item["revise"] = 0 else: color = "orange" item["generate"] = 0 item["revise"] = 1 else: color = "black" item["generate"] = 0 item["revise"] = 0 for char in item['text']: html_output += f'{char}' current_delay += char_duration total_length = sum(item['words_count'] for item in detected) # total_prob = sum(item['prob'] * item['words_count'] for item in detected) / total_length if total_length > 0 else 0 generate_prob = sum(item["generate"] * item["words_count"] for item in detected) / total_length if total_length > 0 else 0 revise_prob = sum(item["revise"] * item["words_count"] for item in detected) / total_length if total_length > 0 else 0 html_output += f'''
🤖 AI Generated Rate: {generate_prob:.2%}
✍️ AI Revised Rate: {revise_prob:.2%}
''' html_output += '
' return html_output # 使用Blocks替代Interface以获得更好的自定义能力 # 修改CSS部分 with gr.Blocks(css=""" @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap'); :root { --accent-color: #6366f1; --text-color: #374151; --border-color: #e5e7eb; --background-light: #f9fafb; --background-card: #ffffff; } body, .gradio-container { background: var(--background-light); font-family: 'Inter', sans-serif; color: var(--text-color); } #header { text-align: center; padding: 2rem; margin: 0 auto; /* Use gap for spacing, remove margin-bottom */ background-color: var(--background-card); background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E"); border: 1px solid var(--border-color); border-radius: 16px; box-shadow: 0 4px 12px rgba(0,0,0,0.05); } #title { font-weight: 800; font-size: 2.5em; letter-spacing: -0.02em; color: var(--text-color); margin-bottom: 0.25em; } .detect-grad { background: -webkit-linear-gradient(left, #ff8c8c, #ffc89e); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 800; } .anyllm-grad { background: -webkit-linear-gradient(left, #a0e6ff, #aaffd4); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 800; } #authors { font-size: 1.1em; color: #6b7280; margin: 0; } #main-container { max-width: 1200px; margin: 0 auto; padding: 0 1rem; gap: 2rem; /* Add gap for consistent spacing */ } #controls-row { justify-content: center; gap: 2rem; } /* Custom styles for Radio Button Groups */ 
#controls-row > div { background-color: var(--background-card); background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E"); border: 1px solid var(--border-color); border-radius: 16px; padding: 1rem; box-shadow: 0 4px 12px rgba(0,0,0,0.05); } #controls-row .gradio-button { border-radius: 10px !important; transition: background-color 0.2s ease, color 0.2s ease; } #controls-row .gradio-button.selected { background: var(--accent-color) !important; color: white !important; border-color: var(--accent-color) !important; } #content-row { gap: 1.5rem; } .card { background-color: var(--background-card); background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E"); border: 1px solid var(--border-color); border-radius: 16px; padding: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.05); height: 100%; display: flex; flex-direction: column; gap: 1rem; } .card-title { font-weight: 600; font-size: 1.2rem; color: var(--text-color); padding-bottom: 0.75rem; border-bottom: 1px solid var(--border-color); } #input-text 
textarea { flex-grow: 1; border: none !important; box-shadow: none !important; padding: 0 !important; font-size: 1.1em; line-height: 1.7; } #result-html { flex-grow: 1; font-size: 1.1em; line-height: 1.7; overflow-y: auto; height: 520px; } #input-footer { display: flex; justify-content: space-between; align-items: center; margin-top: auto; /* Push to bottom */ } #char-counter { font-size: 0.9em; color: #9ca3af; } #char-counter.error { color: #ef4444; } #submit-btn { flex-grow: 1; max-width: 200px; font-size: 1.05em; font-weight: 600; background: var(--accent-color); color: white; border-radius: 10px; } #submit-btn:hover { background: #4f46e5; } .disclaimer { text-align: center; margin: 0 auto; /* Remove vertical margins */ color: #64748b; font-size: 1.1em; max-width: 800px; } /* Reveal 动画更丝滑 */ @keyframes reveal { from { opacity: 0; } to { opacity: 1; } } .reveal-char { opacity: 0; animation: reveal 0.2s forwards; white-space: pre-wrap; } """) as demo: with gr.Column(elem_id="main-container"): gr.Markdown(""" """) with gr.Row(elem_id="controls-row"): language_radio = gr.Radio( choices=["English", "Chinese"], value="English", label="🌐 Language", interactive=True ) mode_radio = gr.Radio( choices=["Text-Only", "LaTex"], value="Text-Only", label="✍️ Input Type", interactive=True ) with gr.Row(equal_height=True, elem_id="content-row"): with gr.Column(scale=1, min_width=500): with gr.Column(elem_classes="card"): gr.HTML('
📝 Input
') upload_btn = gr.File( label="Upload File (txt, docx)", file_types=['.txt', '.docx'], elem_id="upload-btn" ) input_text = gr.Textbox( show_label=False, placeholder="Enter text to detect or upload a file...", lines=15, elem_id="input-text", max_length=100000, ) with gr.Row(elem_id="input-footer"): counter_html = gr.HTML("
0/100000
") submit_btn = gr.Button("✨ Detect", variant="primary", elem_id="submit-btn") with gr.Column(scale=1, min_width=500): with gr.Column(elem_classes="card"): gr.HTML('
🔍 Result
') result = gr.HTML(elem_id="result-html") gr.HTML("""
💡 Red fonts indicate a high probability of AI generation. Orange fonts indicate a high probability of AI revision or polishing. The detection results are for reference only.
""") upload_btn.upload( read_file_content, inputs=upload_btn, outputs=input_text ) input_text.input( None, [input_text], None, js=""" (text) => { setTimeout(() => { const counter = document.getElementById("char-counter"); if (counter) { const length = text.length; counter.innerHTML = `${length}/100000`; counter.classList.toggle("error", length > 100000); } }, 0); return text; } """ ) submit_btn.click( greet, inputs=[mode_radio, language_radio, input_text], outputs=result ) demo.launch(share=True)