import jieba
import torch
import re
import numpy as np
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# ==========================================
# Part 1: statistical style features (20% of the final weight)
# ==========================================
def extract_text_statistics(text):
    """Score how "AI-like" a text looks from simple style statistics.

    Two heuristics each contribute 0.5 to the score:

    * low lexical richness: unique tokens / total tokens is below 0.55;
    * uniform, fairly long sentences: the standard deviation of sentence
      lengths is below 10 characters while the mean length exceeds 15.

    Args:
        text: The text to analyse. ``None``, empty or whitespace-only input
            yields an all-zero result.

    Returns:
        A ``(score, richness, sentence_len_std)`` tuple of plain floats,
        where ``score`` is 0.0, 0.5 or 1.0.
    """
    # Nothing to measure; also avoids handing None/blank input to the tokenizer.
    if not text or not text.strip():
        return 0.0, 0.0, 0.0

    tokens = [w for w in jieba.cut(text) if w.strip()]
    if not tokens:
        return 0.0, 0.0, 0.0

    richness = len(set(tokens)) / len(tokens)

    # Sentences are delimited by Chinese full stops, exclamation marks,
    # question marks and newlines; blank fragments are ignored.
    sentence_lengths = [
        len(s) for s in re.split(r'[。！？\n]', text) if s.strip()
    ]
    if sentence_lengths:
        avg_sentence_len = float(np.mean(sentence_lengths))
        sentence_len_std = float(np.std(sentence_lengths))
    else:
        avg_sentence_len = 0.0
        sentence_len_std = 0.0

    score = 0.0
    if richness < 0.55:
        score += 0.5
    # The standard deviation of a single sentence is always 0, which says
    # nothing about how uniform the writing is, so the uniformity heuristic
    # only applies when there are at least two sentences to compare.
    if (
        len(sentence_lengths) >= 2
        and sentence_len_std < 10.0
        and avg_sentence_len > 15
    ):
        score += 0.5

    return score, richness, sentence_len_std


# ==========================================
# Part 2: deep semantic features (scored by our own fine-tuned model)
# ==========================================
@st.cache_resource
def get_custom_text_model(model_path="./finetuned_text_model"):
    """Load the fine-tuned sequence-classification model and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    reuse the loaded objects instead of reloading them on every call.

    Args:
        model_path: Directory (or model identifier) passed to
            ``from_pretrained``. Defaults to the local fine-tuned model
            folder, so existing zero-argument callers are unaffected.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been switched to
        evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    return tokenizer, model


def extract_deep_text_features(text, tokenizer, model):
    """Return the model's probability that *text* is AI-generated.

    The text is tokenized with truncation to at most 128 tokens, so longer
    inputs are scored on their leading portion only.

    Args:
        text: The text to score.
        tokenizer: Callable that turns the text into model inputs.
        model: Classifier whose output exposes ``logits``.

    Returns:
        The softmax probability of class index 0 as a Python float. Per the
        original author's note, label 0 means "AI" and label 1 means "real"
        (NOTE(review): confirm against the fine-tuning label mapping).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = torch.softmax(logits, dim=1)
    # First (and only) item in the batch, class index 0.
    return class_probs[0][0].item()


# ==========================================
# Part 3: weighted fusion of the statistical and deep-model scores
# ==========================================
def analyze_text(text_content):
    """Combine the statistical and deep-model scores into one AI probability.

    The statistical score is weighted 20% and the deep-model score 80%.

    Args:
        text_content: The text to analyse.

    Returns:
        A dict with the keys ``stat_score``, ``deep_score``,
        ``final_probability`` and ``details`` (a display string with the
        lexical richness and the sentence-length standard deviation).
    """
    # Statistical style features.
    stat_score, richness, std_len = extract_text_statistics(text_content)

    # Deep-model score, using the cached tokenizer/model pair.
    deep_score = extract_deep_text_features(
        text_content, *get_custom_text_model()
    )

    report = {}
    report["stat_score"] = stat_score
    report["deep_score"] = deep_score
    # Weighted fusion: 20% statistics, 80% deep model.
    report["final_probability"] = (stat_score * 0.2) + (deep_score * 0.8)
    report["details"] = f"词汇丰富度: {richness:.2f} | 句长波动率: {std_len:.1f}"
    return report


def generate_text_highlight_html(text, tokenizer, model):
    """逐句扫描文本，生成带有高亮背景色的 HTML 代码"""
    # 按照标点符号将文章拆分为句子
    sentences = re.split(r'([。！？\n])', text)
    # 重新把句子和标点拼装起来
    sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2] + [""])]

    html_content = '<div style="line-height: 1.6; font-size: 16px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">'

    for s in sentences:
        if len(s.strip()) < 2:
            html_content += s.replace('\n', '<br>')
            continue

        # 让专属模型给这一句话单独打分
        prob = extract_deep_text_features(s, tokenizer, model)

        # 只有当 AI 概率大于 50% 时才开始变红，概率越高颜色越深
        if prob > 0.5:
            alpha = (prob - 0.5) * 2  # 映射到 0~1 透明度
            color = f"rgba(255, 75, 75, {alpha:.2f})"
            html_content += f'<span style="background-color: {color}; border-radius: 3px;">{s}</span>'
        else:
            html_content += s

    html_content += "</div>"
    return html_content