import gradio as gr
import joblib
import numpy as np
import re

# Load the trained scikit-learn classifier (.pkl).  A load failure is reported
# at startup instead of crashing the whole app at import time; predict() will
# then surface a per-request error message.
try:
    model = joblib.load("ai_detector_model.pkl")  # make sure the path is correct
except Exception as e:
    print(f"Model load failed: {e}")
    model = None


def simple_sent_tokenize(text):
    """Split *text* into sentences on '.', '!' or '?' followed by whitespace."""
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s for s in sentences if s]


def simple_word_tokenize(text):
    """Lowercase *text* and return its alphanumeric word tokens."""
    return re.findall(r'\b\w+\b', text.lower())


def extract_features(text):
    """Compute stylometric features for *text*.

    Returns:
        tuple: ``(X, features)`` where ``X`` is a ``(1, n_features)`` numpy
        array whose column order is the dict insertion order below (it must
        match the order used at training time), and ``features`` is the
        underlying dict, used by predict() to explain the decision.
    """
    sentences = simple_sent_tokenize(text)
    words_clean = [w for w in simple_word_tokenize(text) if w.isalpha()]

    features = {}
    # Basic size statistics.
    features['text_length'] = len(text)
    features['word_count'] = len(words_clean)
    features['sentence_count'] = len(sentences)
    # max(..., 1) guards every ratio below against division by zero on
    # empty/degenerate input.
    features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
    features['avg_word_length'] = (
        np.mean([len(w) for w in words_clean]) if words_clean else 0
    )

    unique_words = set(words_clean)
    features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)

    sentence_lengths = [len(simple_word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = (
        np.var(sentence_lengths) if sentence_lengths else 0
    )

    # Punctuation densities, normalized per 1000 characters.
    features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
    features['period_density'] = text.count('.') / max(len(text), 1) * 1000
    features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
    features['question_density'] = text.count('?') / max(len(text), 1) * 1000

    complex_words = [w for w in words_clean if len(w) > 6]
    features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)

    # Marker words: discourse connectives typical of AI text vs. subjective
    # hedges typical of human text.  These are substring counts, so e.g.
    # 'really' also matches inside longer words — kept as-is to preserve
    # training-time behavior.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently',
                  'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe',
                     'probably', 'actually', 'really']
    text_lower = text.lower()
    features['ai_marker_count'] = sum(text_lower.count(m) for m in ai_markers)
    features['human_marker_count'] = sum(text_lower.count(m) for m in human_markers)

    return np.array(list(features.values())).reshape(1, -1), features
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable result string with the predicted label, the AI
    probability, and the feature-based reasons behind the decision; any
    failure is returned as an error message rather than raised.
    """
    if not text.strip():
        return "請輸入文字內容。"
    try:
        X, feats = extract_features(text)
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]  # P(class 1) == P(AI)
        else:
            # Fallback: the hard 0/1 label stands in for the probability.
            prob = model.predict(X)[0]
        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Build a short, human-readable justification from the raw features.
        reason = []
        if feats['vocabulary_richness'] < 0.3:
            reason.append("詞彙多樣性較低")
        if feats['sentence_length_variance'] < 10:
            reason.append("句子長度平均,像 AI")
        if feats['ai_marker_count'] > feats['human_marker_count']:
            reason.append("包含常見 AI 連接詞")
        if feats['human_marker_count'] > feats['ai_marker_count']:
            reason.append("包含主觀語氣詞")
        if not reason:
            reason.append("整體語言特徵與模型預測一致")
        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reason)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"


# Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50,
                      placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30,
                       placeholder="結果會顯示在這裡…"),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)

# Launch only when executed as a script, so importing this module (e.g. from
# tests or another app) does not start a server as a side effect.
if __name__ == "__main__":
    demo.launch()