Spaces:

nchdlhbctm
/

TraceDetect-AI

Sleeping

App Files Files Community

TraceDetect-AI / text_module.py

nchdlhbctm

Upload 13 files

646535a verified about 1 month ago

Raw

History Blame Contribute Delete

4.27 kB

	import jieba
	import torch
	import re
	import numpy as np
	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForSequenceClassification


	# ==========================================
	# 第一部分：统计风格特征提取 (占权重 20%)
	# ==========================================
	def extract_text_statistics(text):
	    """Compute simple lexical and sentence-length statistics for a text.

	    The text is tokenized with jieba and split into sentences on
	    ``。``, ``！``, ``？`` and newlines.

	    Returns:
	        A 3-tuple ``(score, richness, sentence_len_std)``:

	        * ``score`` -- heuristic in {0.0, 0.5, 1.0}: 0.5 is added when the
	          vocabulary richness is below 0.55, and another 0.5 when sentence
	          lengths are uniform (std < 10) while averaging more than 15
	          characters.
	        * ``richness`` -- unique tokens divided by total tokens.
	        * ``sentence_len_std`` -- standard deviation of sentence lengths
	          measured in characters.

	        ``(0.0, 0, 0)`` is returned when no non-blank token is found.
	    """
	    # Drop tokens that are empty or whitespace-only.
	    tokens = [tok for tok in jieba.cut(text) if tok.strip()]
	    if not tokens:
	        return 0.0, 0, 0

	    richness = len(set(tokens)) / len(tokens)

	    # Character length of every non-blank sentence fragment.
	    lengths = [
	        len(fragment)
	        for fragment in re.split(r'[。！？\n]', text)
	        if fragment.strip()
	    ]
	    if lengths:
	        avg_sentence_len = np.mean(lengths)
	        sentence_len_std = np.std(lengths)
	    else:
	        # Text made only of delimiters: nothing to average.
	        avg_sentence_len = 0
	        sentence_len_std = 0

	    score = 0.5 if richness < 0.55 else 0.0
	    if sentence_len_std < 10.0 and avg_sentence_len > 15:
	        score += 0.5

	    return min(score, 1.0), richness, sentence_len_std


	# ==========================================
	# 第二部分：深度语义特征提取 (换上我们自己微调的专属模型)
	# ==========================================
	@st.cache_resource
	def get_custom_text_model():
	    """Load the fine-tuned tokenizer and sequence classifier.

	    Both are read from the local ``./finetuned_text_model`` directory and
	    the classifier is switched to evaluation mode. The function is wrapped
	    in ``st.cache_resource``.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    checkpoint_dir = "./finetuned_text_model"
	    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
	    classifier = AutoModelForSequenceClassification.from_pretrained(
	        checkpoint_dir
	    )
	    classifier.eval()
	    return text_tokenizer, classifier


	def extract_deep_text_features(text, tokenizer, model):
	    """Score a text with the classifier and return the class-0 probability.

	    Args:
	        text: Text to classify; the tokenizer truncates it to 128 tokens.
	        tokenizer: Callable producing PyTorch tensors as keyword inputs
	            for ``model``.
	        model: Callable whose output exposes a ``logits`` tensor.

	    Returns:
	        The softmax probability of label 0 for the first batch item, as a
	        Python float. The original author's note maps label 0 to
	        "AI-generated" and label 1 to "real"; that mapping depends on how
	        the model was fine-tuned.
	    """
	    encoded = tokenizer(
	        text, return_tensors="pt", truncation=True, max_length=128
	    )
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        logits = model(**encoded).logits

	    class_probs = torch.softmax(logits, dim=1)
	    return class_probs[0][0].item()


	# ==========================================
	# 第三部分：多模态加权融合
	# ==========================================
	def analyze_text(text_content):
	    """Combine the statistical and model-based scores for a text.

	    Args:
	        text_content: The text to analyze.

	    Returns:
	        A dict with:

	        * ``"stat_score"`` -- heuristic score from
	          ``extract_text_statistics``.
	        * ``"deep_score"`` -- class-0 probability from
	          ``extract_deep_text_features``.
	        * ``"final_probability"`` -- ``0.2 * stat_score + 0.8 * deep_score``.
	        * ``"details"`` -- human-readable summary of vocabulary richness
	          and sentence-length spread.
	    """
	    # 1. Statistical style features.
	    stat_score, richness, std_len = extract_text_statistics(text_content)

	    # 2. Model-based score from the fine-tuned classifier.
	    tokenizer, model = get_custom_text_model()
	    deep_score = extract_deep_text_features(text_content, tokenizer, model)

	    # 3. Weighted fusion: the fine-tuned model carries 80% of the weight.
	    final_prob = (stat_score * 0.2) + (deep_score * 0.8)

	    return {
	        "stat_score": stat_score,
	        "deep_score": deep_score,
	        "final_probability": final_prob,
	        # The backslash is written as "\\" because "\|" is an invalid
	        # escape sequence (SyntaxWarning on Python 3.12+); the emitted
	        # text is unchanged.
	        "details": f"词汇丰富度: {richness:.2f} \\| 句长波动率: {std_len:.1f}",
	    }


	def generate_text_highlight_html(text, tokenizer, model):
	    """Render a text as HTML with per-sentence highlighting.

	    The text is split into sentences on ``。``, ``！``, ``？`` and newlines
	    (delimiters stay attached to the preceding sentence). Each sentence
	    with at least two non-blank characters is scored with
	    ``extract_deep_text_features``; scores above 0.5 get a red background
	    whose opacity grows linearly from 0 (score 0.5) to 1 (score 1.0).

	    The sentence text is HTML-escaped before being embedded, so markup in
	    the input is displayed literally instead of being interpreted, and
	    every newline is emitted as ``<br>``.

	    Args:
	        text: Raw text to render.
	        tokenizer: Tokenizer forwarded to ``extract_deep_text_features``.
	        model: Classifier forwarded to ``extract_deep_text_features``.

	    Returns:
	        An HTML string consisting of a single styled ``<div>``.
	    """
	    # Imported here so the block needs no file-level import change.
	    import html

	    # The capturing group keeps delimiters: [s0, d0, s1, d1, ..., sk].
	    pieces = re.split(r'([。！？\n])', text)
	    # Re-attach each delimiter to its sentence; the last one has none.
	    sentences = [
	        "".join(pair) for pair in zip(pieces[0::2], pieces[1::2] + [""])
	    ]

	    parts = [
	        '<div style="line-height: 1.6; font-size: 16px; padding: 10px; '
	        'border: 1px solid #ddd; border-radius: 5px;">'
	    ]

	    for s in sentences:
	        # Escape first, then turn newlines into explicit line breaks.
	        safe = html.escape(s).replace('\n', '<br>')

	        # Too short to score meaningfully: emit as-is.
	        if len(s.strip()) < 2:
	            parts.append(safe)
	            continue

	        prob = extract_deep_text_features(s, tokenizer, model)

	        if prob > 0.5:
	            alpha = (prob - 0.5) * 2  # map (0.5, 1.0] onto (0, 1] opacity
	            color = f"rgba(255, 75, 75, {alpha:.2f})"
	            parts.append(
	                f'<span style="background-color: {color}; '
	                f'border-radius: 3px;">{safe}</span>'
	            )
	        else:
	            parts.append(safe)

	    parts.append("</div>")
	    return "".join(parts)