Spaces:
Sleeping
Sleeping
| import jieba | |
| import torch | |
| import re | |
| import numpy as np | |
| import streamlit as st | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| # ========================================== | |
| # 第一部分:统计风格特征提取 (占权重 20%) | |
| # ========================================== | |
def extract_text_statistics(text):
    """Score how machine-like a text looks using simple style statistics.

    Lexical richness is the ratio of unique jieba tokens to all non-blank
    jieba tokens. Sentence lengths are measured in characters after splitting
    the raw text on the delimiter set in the regex below (full stop,
    exclamation mark, question mark, newline).

    Returns:
        A ``(score, richness, sentence_len_std)`` tuple. ``score`` starts at
        0.0 and gains 0.5 when richness is below 0.55, plus another 0.5 when
        sentence lengths are uniform (std below 10) while being long on
        average (mean above 15). ``(0.0, 0, 0)`` is returned when the text
        contains no non-blank tokens.
    """
    tokens = [tok for tok in jieba.cut(text) if tok.strip()]
    if not tokens:
        return 0.0, 0, 0

    richness = len(set(tokens)) / len(tokens)

    # Lengths are taken from the unstripped fragments; blank fragments are
    # dropped so consecutive delimiters do not count as empty sentences.
    lengths = [len(part) for part in re.split(r'[。!?\n]', text) if part.strip()]
    if lengths:
        avg_sentence_len = np.mean(lengths)
        sentence_len_std = np.std(lengths)
    else:
        # Possible when the text consists only of delimiter characters.
        avg_sentence_len = 0
        sentence_len_std = 0

    # Each heuristic that fires contributes half of the maximum score.
    signals = (
        richness < 0.55,
        sentence_len_std < 10.0 and avg_sentence_len > 15,
    )
    score = sum((0.5 for fired in signals if fired), 0.0)

    return min(score, 1.0), richness, sentence_len_std
| # ========================================== | |
| # 第二部分:深度语义特征提取 (换上我们自己微调的专属模型) | |
| # ========================================== | |
def get_custom_text_model(model_path="./finetuned_text_model"):
    """Load the fine-tuned sequence-classification model and its tokenizer.

    Reading a checkpoint from disk is expensive, and this function is called
    once per analysis request, so each loaded pair is memoised on the function
    object and reused on later calls instead of being re-read every time.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the folder this module has
            always loaded from.

    Returns:
        A ``(tokenizer, model)`` tuple; ``model.eval()`` has been called on
        the model before it is returned.
    """
    # One cache entry per path, stored on the function itself so no extra
    # module-level state or imports are required.
    loaded = get_custom_text_model.__dict__.setdefault("_loaded", {})
    if model_path not in loaded:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.eval()  # the model is only used for inference here
        loaded[model_path] = (tokenizer, model)
    return loaded[model_path]
def extract_deep_text_features(text, tokenizer, model):
    """Return the probability the classifier assigns to class index 0.

    The text is tokenized into PyTorch tensors (truncated to at most 128
    tokens), passed through the model with gradient tracking disabled, and the
    logits are softmax-normalised across the class dimension. According to
    the original author's note, label 0 means AI-generated and label 1 means
    genuine, so the returned float is the "AI" probability.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = torch.softmax(logits, dim=1)
    # First (only) sample in the batch, class index 0.
    return class_probs[0][0].item()
| # ========================================== | |
| # 第三部分:多模态加权融合 | |
| # ========================================== | |
def analyze_text(text_content):
    """Fuse the statistical and model-based scores into one AI probability.

    Returns:
        A dict with ``stat_score``, ``deep_score``, ``final_probability``
        (0.2 * statistical score + 0.8 * model score) and a ``details``
        string reporting lexical richness and sentence-length spread.
    """
    stat_weight, deep_weight = 0.2, 0.8

    # Style statistics over the whole text.
    stat_score, richness, std_len = extract_text_statistics(text_content)

    # Score from the fine-tuned classifier.
    tokenizer, model = get_custom_text_model()
    deep_score = extract_deep_text_features(text_content, tokenizer, model)

    # Weighted blend; the classifier carries most of the weight.
    final_prob = (stat_score * stat_weight) + (deep_score * deep_weight)

    report = {}
    report["stat_score"] = stat_score
    report["deep_score"] = deep_score
    report["final_probability"] = final_prob
    report["details"] = f"词汇丰富度: {richness:.2f} | 句长波动率: {std_len:.1f}"
    return report
def generate_text_highlight_html(text, tokenizer, model):
    """Render ``text`` as an HTML block with likely-AI sentences highlighted.

    The text is split into sentences (keeping each trailing delimiter), every
    sentence with at least two non-blank characters is scored individually via
    ``extract_deep_text_features``, and sentences scoring above 0.5 are
    wrapped in a red-tinted ``<span>`` whose opacity grows with the score.

    The sentence text is HTML-escaped before being embedded, so characters
    such as ``<`` or ``&`` in the input are shown literally rather than being
    interpreted as markup, and newlines become ``<br>`` in every branch (not
    only for short fragments) so line breaks survive rendering.

    Args:
        text: Raw text to render.
        tokenizer: Tokenizer forwarded to ``extract_deep_text_features``.
        model: Classifier forwarded to ``extract_deep_text_features``.

    Returns:
        A string containing a single ``<div>`` element.
    """
    import html  # local import: only this function needs it

    # The capture group keeps the delimiters, so the pieces alternate
    # [sentence, delimiter, sentence, ..., sentence]. Pair every sentence with
    # the delimiter that follows it; the final sentence gets an empty one.
    pieces = re.split(r'([。!?\n])', text)
    sentences = [
        body + delimiter
        for body, delimiter in zip(pieces[0::2], pieces[1::2] + [""])
    ]

    parts = ['<div style="line-height: 1.6; font-size: 16px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">']
    for sentence in sentences:
        # Escape first, then translate newlines, so the inserted <br> tags
        # are not themselves escaped.
        safe = html.escape(sentence).replace('\n', '<br>')

        # Fragments too short to score are passed through unhighlighted.
        if len(sentence.strip()) < 2:
            parts.append(safe)
            continue

        # The model sees the raw sentence, not the escaped markup.
        prob = extract_deep_text_features(sentence, tokenizer, model)
        if prob > 0.5:
            # Map (0.5, 1.0] onto an opacity in (0, 1].
            alpha = (prob - 0.5) * 2
            color = f"rgba(255, 75, 75, {alpha:.2f})"
            parts.append(
                f'<span style="background-color: {color}; border-radius: 3px;">{safe}</span>'
            )
        else:
            parts.append(safe)

    parts.append("</div>")
    return "".join(parts)