File size: 7,892 Bytes
98cf186
bc4d57c
e60aa0d
bc4d57c
71cdeed
4960be3
 
e113804
4960be3
4e8d445
4960be3
 
 
e60aa0d
4960be3
 
 
 
95a9b8d
e60aa0d
4960be3
b949d6b
4960be3
95a9b8d
e60aa0d
 
 
 
 
 
bc4d57c
e60aa0d
 
b949d6b
4960be3
e60aa0d
b949d6b
e60aa0d
 
 
 
b949d6b
e60aa0d
 
bc4d57c
e60aa0d
 
 
 
 
 
bc4d57c
 
 
02068eb
ff0d606
e60aa0d
ff0d606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc4d57c
ff0d606
 
 
71cdeed
e60aa0d
71cdeed
bc4d57c
b949d6b
35fd3be
 
bc4d57c
71cdeed
 
98cf186
1c3315f
 
 
 
 
98cf186
1c3315f
 
 
 
eeec4a7
 
 
 
 
 
 
16223b1
eeec4a7
 
 
 
 
 
 
 
 
 
 
1c3315f
1f55837
 
 
 
 
 
 
 
 
 
1c3315f
1f55837
 
 
 
 
 
 
1c3315f
 
4353bdb
 
1c3315f
b2a9789
 
4353bdb
eeec4a7
57ec304
1f55837
 
eeec4a7
1f55837
 
57ec304
4353bdb
1c3315f
 
eeec4a7
 
 
 
16223b1
 
 
 
 
eeec4a7
 
 
57ec304
16223b1
 
eeec4a7
16223b1
 
eeec4a7
 
 
16223b1
eeec4a7
16223b1
 
 
 
 
eeec4a7
 
 
 
 
 
 
1c3315f
 
 
 
 
 
 
 
 
 
98cf186
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import gradio as gr
import joblib
import numpy as np
import re

# Load the pre-trained scikit-learn classifier (.pkl) once at import time.
# NOTE(review): joblib.load unpickles the file — only load trusted model files.
model = joblib.load("ai_detector_model.pkl")  # make sure the path is correct

# Lightweight sentence splitter (avoids an NLTK dependency).
def simple_sent_tokenize(text):
    """Split *text* into sentences on whitespace following '.', '!' or '?'.

    The terminal punctuation mark stays attached to its sentence; empty
    fragments are dropped.
    """
    chunks = re.split(r'(?<=[.!?])\s+', text.strip())
    return [chunk for chunk in chunks if chunk]

# Lightweight word tokenizer (avoids an NLTK dependency).
def simple_word_tokenize(text):
    """Return the lowercased alphanumeric word tokens found in *text*."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

def extract_features(text):
    """Compute the 17 stylometric features the classifier was trained on.

    Returns a ``(1, 17)`` numpy feature matrix plus the raw feature dict.
    The dict's insertion order fixes the column order, so features must be
    added in exactly the order the model was trained with.
    """
    sentences = simple_sent_tokenize(text)
    words_clean = [w for w in simple_word_tokenize(text) if w.isalpha()]
    n_words = len(words_clean)
    n_chars = len(text)

    features = {}
    # Basic size statistics.
    features['text_length'] = n_chars
    features['word_count'] = n_words
    features['sentence_count'] = len(sentences)
    features['avg_sentence_length'] = n_words / max(len(sentences), 1)
    features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0

    # Lexical diversity: distinct alphabetic words over total.
    features['vocabulary_richness'] = len(set(words_clean)) / max(n_words, 1)

    # AI-generated text tends toward uniform sentence lengths (low variance).
    per_sentence_words = [len(simple_word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = np.var(per_sentence_words) if per_sentence_words else 0

    # Punctuation densities, normalized per 1000 characters.
    for mark, key in ((',', 'comma_density'), ('.', 'period_density'),
                      ('!', 'exclamation_density'), ('?', 'question_density')):
        features[key] = text.count(mark) / max(n_chars, 1) * 1000

    # Share of "complex" words (more than six letters).
    features['complex_word_ratio'] = sum(1 for w in words_clean if len(w) > 6) / max(n_words, 1)

    # Substring counts of marker phrases typical of AI vs. human writing.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
    text_lower = text.lower()
    features['ai_marker_count'] = sum(text_lower.count(m) for m in ai_markers)
    features['human_marker_count'] = sum(text_lower.count(m) for m in human_markers)

    return np.array(list(features.values())).reshape(1, -1), features

def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable result string (label, AI probability, and the
    reasons behind the call). Errors are reported as text rather than
    raised, so the Gradio UI always has something to display.
    """
    if not text.strip():
        return "請輸入文字內容。"

    try:
        X, feats = extract_features(text)

        # Prefer a calibrated probability when the estimator supports one;
        # otherwise fall back to the raw predicted label.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]

        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Map feature thresholds to human-readable justifications,
        # preserving the original check order.
        checks = (
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        )
        reason = [message for hit, message in checks if hit]
        if not reason:
            reason.append("整體語言特徵與模型預測一致")

        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reason)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"

# Gradio UI: one input textbox, one output textbox, wired to predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
)

# Start the web server (blocks until stopped).
demo.launch()





# NOTE(review): dead code — the string below is an earlier TensorFlow/Keras
# implementation of this app, disabled by wrapping it in a module-level
# triple-quoted string. It is never executed. Prefer deleting it and relying
# on version control history instead of keeping it inline.
'''import gradio as gr
import tensorflow as tf
import pickle

# ---------------- 載入模型 ----------------
try:
    model = tf.keras.models.load_model("AIDetect.h5")
    print("✅ 模型載入成功")
except Exception as e:
    print("❌ 模型載入失敗:", e)
    model = None

# ---------------- 載入詞彙 ----------------
try:
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
        vectorized_layer = tf.keras.layers.TextVectorization(
            max_tokens=len(vocab)+1, output_sequence_length=50
        )
        vectorized_layer.set_vocabulary(vocab)
    print("✅ 詞彙載入成功")
except Exception as e:
    print("❌ 詞彙載入失敗:", e)
    vectorized_layer = None

# ---------------- 載入 scaler ----------------
try:
    with open("scaler.pkl", "rb") as f:
        scaler = pickle.load(f)
    print("✅ Scaler 載入成功")
except Exception as e:
    print("❌ Scaler 載入失敗:", e)
    scaler = None

# ---------------- 特徵計算 ----------------
def compute_features(text):
    if isinstance(text, tf.Tensor):
        text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
    elif isinstance(text, bytes):
        text = text.decode('utf-8')
    else:
        text = str(text)

    words = text.split()
    word_count = len(words)
    unique_words = len(set(words))
    unique_word_ratio = unique_words / (word_count + 1e-6)
    repeat_rate = 1 - unique_word_ratio
    punctuation_count = sum(1 for c in text if c in ".,!?;:")
    punctuation_ratio = punctuation_count / (len(text) + 1e-6)
    avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)

    return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]

# ---------------- 使用 scaler ----------------
def transform_features(feat):
    if scaler is None:
        return feat  # 如果 scaler 沒載入,就直接回傳原始特徵
    return scaler.transform(feat).tolist()  # 轉成 list,避免使用 NumPy

# ---------------- 生成解釋 ----------------
def explain_prediction(text):
    if model is None or vectorized_layer is None:
        return "❌ 模型或詞彙尚未載入,無法預測"

    try:
        # ---------------- 特徵計算 ----------------
        feat_raw = compute_features(text)
        feat = transform_features(feat_raw)

        # ---------------- 文字向量化 ----------------
        seq = vectorized_layer([text])
        seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')

        # 轉成 TensorFlow tensor
        seq = tf.convert_to_tensor(seq)
        feat = tf.convert_to_tensor(feat, dtype=tf.float32)

        # ---------------- 預測 ----------------
        pred_prob = model([seq, feat], training=False).numpy()[0][0]
        label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
        prob = pred_prob * 100

        # ---------------- 判斷依據 ----------------
        reasons = []
        if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
        if feat_raw[0][2] > 0.3: reasons.append("重複率高")
        if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
        if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
        if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
        if not reasons: reasons.append("句子長度與用詞平均")
        explanation = ";".join(reasons)

        return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"

    except Exception as e:
        return f"❌ 預測時發生錯誤: {e}"

# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
    fn=explain_prediction,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI vs Human 文本判斷",
    description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)

iface.launch()'''