import html
import re
from itertools import zip_longest
from typing import Any, Dict, Tuple

import jieba
import numpy as np
import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Characters treated as sentence terminators. Both the full-width (Chinese)
# and the ASCII forms of "!" and "?" are listed so that text written with
# either punctuation style is segmented the same way.
_SENTENCE_END_CHARS = "。!?!?\n"
# Splits and discards the terminators (used for length statistics).
_SENTENCE_SPLIT_RE = re.compile(f"[{_SENTENCE_END_CHARS}]")
# Splits but keeps each terminator as its own list item (capturing group),
# so the sentences can be re-assembled with their punctuation.
_SENTENCE_SPLIT_KEEP_RE = re.compile(f"([{_SENTENCE_END_CHARS}])")


# ==========================================
# Part 1: statistical / stylistic features (20% of the final weight)
# ==========================================
def extract_text_statistics(text: str) -> Tuple[float, float, float]:
    """Compute simple statistical and stylistic features of *text*.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``(score, richness, sentence_len_std)`` tuple:

        * ``score`` -- heuristic score in ``{0.0, 0.5, 1.0}``; 0.5 is added
          when the lexical richness is below 0.55 and another 0.5 when the
          sentences are long on average (> 15 characters) yet uniform in
          length (standard deviation < 10).
        * ``richness`` -- number of distinct tokens divided by the number of
          tokens, as produced by ``jieba.cut``.
        * ``sentence_len_std`` -- standard deviation of the sentence lengths
          (in characters).

        All three values are ``0.0`` when the text contains no
        non-whitespace token.
    """
    words = [w for w in jieba.cut(text) if w.strip()]
    if not words:
        return 0.0, 0.0, 0.0

    richness = len(set(words)) / len(words)

    sentence_lengths = [
        len(s) for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()
    ]
    if sentence_lengths:
        avg_sentence_len = float(np.mean(sentence_lengths))
        sentence_len_std = float(np.std(sentence_lengths))
    else:
        # Text made only of terminators: no sentence to measure.
        avg_sentence_len = 0.0
        sentence_len_std = 0.0

    score = 0.0
    if richness < 0.55:
        score += 0.5
    if sentence_len_std < 10.0 and avg_sentence_len > 15:
        score += 0.5

    return min(score, 1.0), richness, sentence_len_std


# ==========================================
# Part 2: deep semantic features (our own fine-tuned model)
# ==========================================
@st.cache_resource
def get_custom_text_model():
    """Load the fine-tuned sequence-classification model and its tokenizer.

    The result is wrapped in ``st.cache_resource`` so the files under
    ``./finetuned_text_model`` are not re-read on every call.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is switched to eval mode.
    """
    # Points at the directory produced by the fine-tuning run.
    model_path = "./finetuned_text_model"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    return tokenizer, model


def extract_deep_text_features(text: str, tokenizer, model) -> float:
    """Score *text* with the fine-tuned classifier.

    Args:
        text: The text to classify. It is truncated to 128 tokens.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Classification model whose output exposes ``.logits``.

    Returns:
        The softmax probability of class index 0. According to the label
        convention noted by the original author (0: AI, 1: real) this is the
        probability that the text is AI-generated.
    """
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=128
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # Turn the logits into a probability distribution over the classes.
    probs = torch.softmax(outputs.logits, dim=1)
    return probs[0][0].item()


# ==========================================
# Part 3: weighted fusion of both signals
# ==========================================
def analyze_text(text_content: str) -> Dict[str, Any]:
    """Combine the statistical and the model-based score for *text_content*.

    Args:
        text_content: The text to analyse.

    Returns:
        A dict with the keys ``stat_score``, ``deep_score``,
        ``final_probability`` (``0.2 * stat_score + 0.8 * deep_score``) and
        ``details`` (a human-readable summary string).
    """
    # 1. Statistical features.
    stat_score, richness, std_len = extract_text_statistics(text_content)

    # 2. Deep features from the fine-tuned model.
    tokenizer, model = get_custom_text_model()
    deep_score = extract_deep_text_features(text_content, tokenizer, model)

    # 3. Weighted fusion: the fine-tuned model carries 80% of the weight.
    final_prob = (stat_score * 0.2) + (deep_score * 0.8)

    return {
        "stat_score": stat_score,
        "deep_score": deep_score,
        "final_probability": final_prob,
        "details": f"词汇丰富度: {richness:.2f} | 句长波动率: {std_len:.1f}",
    }


def _fragment_to_html(fragment: str) -> str:
    """Escape *fragment* for HTML and turn its newlines into ``<br>``."""
    return html.escape(fragment).replace("\n", "<br>")


def generate_text_highlight_html(text: str, tokenizer, model) -> str:
    """Scan *text* sentence by sentence and return highlighted HTML.

    Each sentence is scored on its own with ``extract_deep_text_features``.
    Sentences whose AI probability exceeds 0.5 are wrapped in a ``<span>``
    with a red background whose opacity grows with the probability.

    NOTE(review): the exact markup of the original implementation was lost
    (its tag literals were empty/garbled); the minimal ``<div>`` / ``<br>`` /
    ``<span>`` structure below is a reconstruction -- adjust the styling to
    match the surrounding UI if needed.

    Args:
        text: The text to render. It is HTML-escaped before being embedded.
        tokenizer: Tokenizer forwarded to ``extract_deep_text_features``.
        model: Model forwarded to ``extract_deep_text_features``.

    Returns:
        An HTML string consisting of a single ``<div>`` element.
    """
    # With a capturing group, re.split yields text and terminators
    # alternately: [text, sep, text, sep, ..., text]. Pair every text chunk
    # with the terminator that follows it (the last chunk has none).
    pieces = _SENTENCE_SPLIT_KEEP_RE.split(text)
    sentences = [
        body + terminator
        for body, terminator in zip_longest(
            pieces[0::2], pieces[1::2], fillvalue=""
        )
    ]

    chunks = ["<div>"]
    for sentence in sentences:
        safe = _fragment_to_html(sentence)

        # Fragments too short to be a sentence are emitted without scoring.
        if len(sentence.strip()) < 2:
            chunks.append(safe)
            continue

        prob = extract_deep_text_features(sentence, tokenizer, model)

        # Only start colouring above 50% AI probability; the higher the
        # probability, the more opaque the background.
        if prob > 0.5:
            alpha = (prob - 0.5) * 2  # map (0.5, 1.0] onto (0, 1] opacity
            color = f"rgba(255, 75, 75, {alpha:.2f})"
            chunks.append(
                f'<span style="background-color: {color};">{safe}</span>'
            )
        else:
            chunks.append(safe)

    chunks.append("</div>")
    return "".join(chunks)