# TraceDetect-AI / text_module.py
# nchdlhbctm's picture
# Upload 13 files
# 646535a verified
# Raw
# History Blame Contribute Delete
# 4.27 kB
import html
import re

import jieba
import numpy as np
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ==========================================
# Part 1: statistical style feature extraction (20% of the final weight)
# ==========================================
def extract_text_statistics(text):
    """Score *text* on two statistical style cues.

    The text is tokenised with ``jieba.cut`` to measure lexical richness
    (unique tokens / total tokens) and split into sentences to measure how
    much sentence length varies.

    Args:
        text: The text to analyse.

    Returns:
        A ``(score, richness, sentence_len_std)`` tuple of floats.
        ``score`` is 0.0, 0.5 or 1.0: 0.5 is added when richness is below
        0.55, and another 0.5 when sentences are long on average (> 15
        characters) yet uniform in length (std < 10.0). All three values are
        0.0 when the text contains no non-whitespace token.
    """
    # Whitespace-only tokens carry no vocabulary information; drop them.
    words = [w for w in jieba.cut(text) if w.strip()]
    if not words:
        return 0.0, 0.0, 0.0

    richness = len(set(words)) / len(words)

    # Accept both full-width (！？) and ASCII (!?) terminators: Chinese text
    # normally uses the full-width forms, and missing them would merge
    # several sentences into one and distort the length statistics.
    sentences = [s for s in re.split(r'[。！？!?\n]', text) if s.strip()]
    sentence_lengths = [len(s) for s in sentences]
    if sentence_lengths:
        avg_sentence_len = float(np.mean(sentence_lengths))
        sentence_len_std = float(np.std(sentence_lengths))
    else:
        # Possible when the text consists solely of terminators.
        avg_sentence_len = sentence_len_std = 0.0

    score = 0.0
    if richness < 0.55:
        score += 0.5
    if sentence_len_std < 10.0 and avg_sentence_len > 15:
        score += 0.5
    return score, richness, sentence_len_std
# ==========================================
# Part 2: deep semantic feature extraction (using our own fine-tuned model)
# ==========================================
@st.cache_resource
def get_custom_text_model():
    """Load the fine-tuned classifier and its tokenizer.

    Both are read from the ``./finetuned_text_model`` directory (relative to
    the current working directory). The model is put into evaluation mode
    before being returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint_dir = "./finetuned_text_model"
    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    # Inference only: switch off training-time behaviour such as dropout.
    classifier.eval()
    return text_tokenizer, classifier
def extract_deep_text_features(text, tokenizer, model):
    """Return the classifier's probability that *text* is class 0.

    Args:
        text: The text to score; it is truncated to 128 tokens.
        tokenizer: Callable producing PyTorch model inputs for ``text``.
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``logits`` attribute.

    Returns:
        The softmax probability of label 0 for the first (only) item in the
        batch, as a Python float. Per the original author's note, label 0
        means "AI-generated" and label 1 means "real".
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = torch.softmax(logits, dim=1)
    return class_probs[0, 0].item()
# ==========================================
# Part 3: weighted fusion of the statistical and deep-model scores
# ==========================================
def analyze_text(text_content):
    """Run both detectors on *text_content* and fuse their scores.

    Returns:
        A dict with the keys ``stat_score`` (statistical score),
        ``deep_score`` (model score), ``final_probability`` (20% statistical
        + 80% model) and ``details`` (a display string reporting lexical
        richness and sentence-length standard deviation).
    """
    # The fine-tuned model is trusted far more than the hand-written heuristics.
    stat_weight, deep_weight = 0.2, 0.8

    # 1. Statistical features.
    style_score, richness, std_len = extract_text_statistics(text_content)

    # 2. Deep features from the fine-tuned model.
    text_tokenizer, classifier = get_custom_text_model()
    model_score = extract_deep_text_features(text_content, text_tokenizer, classifier)

    # 3. Weighted fusion.
    report = {"stat_score": style_score, "deep_score": model_score}
    report["final_probability"] = (style_score * stat_weight) + (model_score * deep_weight)
    report["details"] = f"词汇丰富度: {richness:.2f} | 句长波动率: {std_len:.1f}"
    return report
def generate_text_highlight_html(text, tokenizer, model):
    """Render *text* as HTML with AI-likely sentences highlighted in red.

    The text is cut after every sentence terminator (``。``, ``！``, ``？``,
    ``!``, ``?`` or a newline). Each segment whose stripped length is at
    least 2 is scored with ``extract_deep_text_features``; segments scoring
    above 0.5 are wrapped in a ``<span>`` whose red background grows more
    opaque as the score approaches 1.0.

    Args:
        text: The raw text to display.
        tokenizer: Tokenizer forwarded to ``extract_deep_text_features``.
        model: Model forwarded to ``extract_deep_text_features``.

    Returns:
        An HTML string: one styled ``<div>`` containing the escaped text,
        with newlines rendered as ``<br>``.
    """
    # The capturing group makes re.split keep each terminator, so it can be
    # glued back onto the sentence it ends. The split result always has an
    # odd length, hence the "" padding for the final, unterminated piece.
    pieces = re.split(r'([。！？!?\n])', text)
    sentences = [
        body + terminator
        for body, terminator in zip(pieces[0::2], pieces[1::2] + [""])
    ]

    parts = [
        '<div style="line-height: 1.6; font-size: 16px; padding: 10px; '
        'border: 1px solid #ddd; border-radius: 5px;">'
    ]
    for s in sentences:
        # The output is raw HTML, so the text must be escaped before it is
        # embedded; otherwise markup in the input would be interpreted.
        # Newlines become <br> in every branch so line breaks survive.
        safe = html.escape(s).replace('\n', '<br>')

        # Too short to score meaningfully (empty, lone punctuation, ...).
        if len(s.strip()) < 2:
            parts.append(safe)
            continue

        # Score the raw (unescaped) sentence on its own.
        prob = extract_deep_text_features(s, tokenizer, model)
        if prob > 0.5:
            # Map (0.5, 1.0] onto (0, 1] opacity: higher score, deeper red.
            alpha = (prob - 0.5) * 2
            color = f"rgba(255, 75, 75, {alpha:.2f})"
            parts.append(
                f'<span style="background-color: {color}; border-radius: 3px;">{safe}</span>'
            )
        else:
            parts.append(safe)

    parts.append("</div>")
    return "".join(parts)