File size: 7,892 Bytes
98cf186
bc4d57c
e60aa0d
bc4d57c
71cdeed
4960be3
 
e113804
4960be3
4e8d445
4960be3
 
 
e60aa0d
4960be3
 
 
 
95a9b8d
e60aa0d
4960be3
b949d6b
4960be3
95a9b8d
e60aa0d
 
 
 
 
 
bc4d57c
e60aa0d
 
b949d6b
4960be3
e60aa0d
b949d6b
e60aa0d
 
 
 
b949d6b
e60aa0d
 
bc4d57c
e60aa0d
 
 
 
 
 
bc4d57c
 
 
02068eb
ff0d606
e60aa0d
ff0d606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc4d57c
ff0d606
 
 
71cdeed
e60aa0d
71cdeed
bc4d57c
b949d6b
35fd3be
 
bc4d57c
71cdeed
 
98cf186
1c3315f
 
 
 
 
98cf186
1c3315f
 
 
 
eeec4a7
 
 
 
 
 
 
16223b1
eeec4a7
 
 
 
 
 
 
 
 
 
 
1c3315f
1f55837
 
 
 
 
 
 
 
 
 
1c3315f
1f55837
 
 
 
 
 
 
1c3315f
 
4353bdb
 
1c3315f
b2a9789
 
4353bdb
eeec4a7
57ec304
1f55837
 
eeec4a7
1f55837
 
57ec304
4353bdb
1c3315f
 
eeec4a7
 
 
 
16223b1
 
 
 
 
eeec4a7
 
 
57ec304
16223b1
 
eeec4a7
16223b1
 
eeec4a7
 
 
16223b1
eeec4a7
16223b1
 
 
 
 
eeec4a7
 
 
 
 
 
 
1c3315f
 
 
 
 
 
 
 
 
 
98cf186
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import gradio as gr
import joblib
import numpy as np
import re

# Load the pre-trained scikit-learn classifier (.pkl) once at import time.
# NOTE(review): joblib.load unpickles the file — only load trusted model files.
model = joblib.load("ai_detector_model.pkl")  # make sure the path is correct

# Lightweight sentence splitter (avoids an NLTK dependency).
def simple_sent_tokenize(text):
    """Split *text* into sentences on whitespace following '.', '!' or '?'.

    The terminal punctuation mark stays attached to its sentence; empty
    fragments are dropped.
    """
    chunks = re.split(r'(?<=[.!?])\s+', text.strip())
    return [chunk for chunk in chunks if chunk]

# Lightweight word tokenizer (avoids an NLTK dependency).
def simple_word_tokenize(text):
    """Return the lowercased alphanumeric word tokens found in *text*."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

def extract_features(text):
    """Compute the 17 stylometric features the classifier was trained on.

    Returns a ``(1, 17)`` numpy feature matrix plus the raw feature dict.
    The dict's insertion order fixes the column order, so features must be
    added in exactly the order the model was trained with.
    """
    sentences = simple_sent_tokenize(text)
    words_clean = [w for w in simple_word_tokenize(text) if w.isalpha()]
    n_words = len(words_clean)
    n_chars = len(text)

    features = {}
    # Basic size statistics.
    features['text_length'] = n_chars
    features['word_count'] = n_words
    features['sentence_count'] = len(sentences)
    features['avg_sentence_length'] = n_words / max(len(sentences), 1)
    features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0

    # Lexical diversity: distinct alphabetic words over total.
    features['vocabulary_richness'] = len(set(words_clean)) / max(n_words, 1)

    # AI-generated text tends toward uniform sentence lengths (low variance).
    per_sentence_words = [len(simple_word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = np.var(per_sentence_words) if per_sentence_words else 0

    # Punctuation densities, normalized per 1000 characters.
    for mark, key in ((',', 'comma_density'), ('.', 'period_density'),
                      ('!', 'exclamation_density'), ('?', 'question_density')):
        features[key] = text.count(mark) / max(n_chars, 1) * 1000

    # Share of "complex" words (more than six letters).
    features['complex_word_ratio'] = sum(1 for w in words_clean if len(w) > 6) / max(n_words, 1)

    # Substring counts of marker phrases typical of AI vs. human writing.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
    text_lower = text.lower()
    features['ai_marker_count'] = sum(text_lower.count(m) for m in ai_markers)
    features['human_marker_count'] = sum(text_lower.count(m) for m in human_markers)

    return np.array(list(features.values())).reshape(1, -1), features

def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable result string (label, AI probability, and the
    reasons behind the call). Errors are reported as text rather than
    raised, so the Gradio UI always has something to display.
    """
    if not text.strip():
        return "請輸入文字內容。"

    try:
        X, feats = extract_features(text)

        # Prefer a calibrated probability when the estimator supports one;
        # otherwise fall back to the raw predicted label.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]

        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Map feature thresholds to human-readable justifications,
        # preserving the original check order.
        checks = (
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        )
        reason = [message for hit, message in checks if hit]
        if not reason:
            reason.append("整體語言特徵與模型預測一致")

        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reason)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"

# Gradio UI: one input textbox, one output textbox, wired to predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
)

# Start the web server (blocks until stopped).
demo.launch()





# NOTE(review): dead code — the string below is an earlier TensorFlow/Keras
# implementation of this app, disabled by wrapping it in a module-level
# triple-quoted string. It is never executed. Prefer deleting it and relying
# on version control history instead of keeping it inline.
'''import gradio as gr
import tensorflow as tf
import pickle

# ---------------- 載入模型 ----------------
try:
    model = tf.keras.models.load_model("AIDetect.h5")
    print("✅ 模型載入成功")
except Exception as e:
    print("❌ 模型載入失敗:", e)
    model = None

# ---------------- 載入詞彙 ----------------
try:
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
        vectorized_layer = tf.keras.layers.TextVectorization(
            max_tokens=len(vocab)+1, output_sequence_length=50
        )
        vectorized_layer.set_vocabulary(vocab)
    print("✅ 詞彙載入成功")
except Exception as e:
    print("❌ 詞彙載入失敗:", e)
    vectorized_layer = None

# ---------------- 載入 scaler ----------------
try:
    with open("scaler.pkl", "rb") as f:
        scaler = pickle.load(f)
    print("✅ Scaler 載入成功")
except Exception as e:
    print("❌ Scaler 載入失敗:", e)
    scaler = None

# ---------------- 特徵計算 ----------------
def compute_features(text):
    if isinstance(text, tf.Tensor):
        text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
    elif isinstance(text, bytes):
        text = text.decode('utf-8')
    else:
        text = str(text)

    words = text.split()
    word_count = len(words)
    unique_words = len(set(words))
    unique_word_ratio = unique_words / (word_count + 1e-6)
    repeat_rate = 1 - unique_word_ratio
    punctuation_count = sum(1 for c in text if c in ".,!?;:")
    punctuation_ratio = punctuation_count / (len(text) + 1e-6)
    avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)

    return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]

# ---------------- 使用 scaler ----------------
def transform_features(feat):
    if scaler is None:
        return feat  # 如果 scaler 沒載入,就直接回傳原始特徵
    return scaler.transform(feat).tolist()  # 轉成 list,避免使用 NumPy

# ---------------- 生成解釋 ----------------
def explain_prediction(text):
    if model is None or vectorized_layer is None:
        return "❌ 模型或詞彙尚未載入,無法預測"

    try:
        # ---------------- 特徵計算 ----------------
        feat_raw = compute_features(text)
        feat = transform_features(feat_raw)

        # ---------------- 文字向量化 ----------------
        seq = vectorized_layer([text])
        seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')

        # 轉成 TensorFlow tensor
        seq = tf.convert_to_tensor(seq)
        feat = tf.convert_to_tensor(feat, dtype=tf.float32)

        # ---------------- 預測 ----------------
        pred_prob = model([seq, feat], training=False).numpy()[0][0]
        label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
        prob = pred_prob * 100

        # ---------------- 判斷依據 ----------------
        reasons = []
        if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
        if feat_raw[0][2] > 0.3: reasons.append("重複率高")
        if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
        if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
        if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
        if not reasons: reasons.append("句子長度與用詞平均")
        explanation = ";".join(reasons)

        return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"

    except Exception as e:
        return f"❌ 預測時發生錯誤: {e}"

# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
    fn=explain_prediction,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI vs Human 文本判斷",
    description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)

iface.launch()'''