File size: 7,892 Bytes
98cf186 bc4d57c e60aa0d bc4d57c 71cdeed 4960be3 e113804 4960be3 4e8d445 4960be3 e60aa0d 4960be3 95a9b8d e60aa0d 4960be3 b949d6b 4960be3 95a9b8d e60aa0d bc4d57c e60aa0d b949d6b 4960be3 e60aa0d b949d6b e60aa0d b949d6b e60aa0d bc4d57c e60aa0d bc4d57c 02068eb ff0d606 e60aa0d ff0d606 bc4d57c ff0d606 71cdeed e60aa0d 71cdeed bc4d57c b949d6b 35fd3be bc4d57c 71cdeed 98cf186 1c3315f 98cf186 1c3315f eeec4a7 16223b1 eeec4a7 1c3315f 1f55837 1c3315f 1f55837 1c3315f 4353bdb 1c3315f b2a9789 4353bdb eeec4a7 57ec304 1f55837 eeec4a7 1f55837 57ec304 4353bdb 1c3315f eeec4a7 16223b1 eeec4a7 57ec304 16223b1 eeec4a7 16223b1 eeec4a7 16223b1 eeec4a7 16223b1 eeec4a7 1c3315f 98cf186 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | import gradio as gr
import joblib
import numpy as np
import re
# Load the trained classifier serialized with joblib (.pkl).
model = joblib.load("ai_detector_model.pkl") # NOTE(review): relative path — confirm it is correct for the deploy working directory
# Minimal sentence splitter (avoids pulling in an NLP dependency).
def simple_sent_tokenize(text):
    """Split *text* into sentences at ., ! or ? followed by whitespace.

    The terminal punctuation stays attached to each sentence; empty
    fragments (e.g. from an all-whitespace input) are dropped.
    """
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    return list(filter(None, fragments))
# Minimal word tokenizer (avoids pulling in an NLP dependency).
def simple_word_tokenize(text):
    """Return every lowercased alphanumeric word token found in *text*."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)
def extract_features(text):
    """Build the stylometric feature vector the classifier was trained on.

    Returns a tuple ``(X, features)``:
      * ``X`` — numpy array of shape (1, 14); the insertion order of the
        ``features`` dict defines the column order, so it must not change.
      * ``features`` — the same values keyed by name, used for explanations.
    """
    sentences = simple_sent_tokenize(text)
    words_clean = [w for w in simple_word_tokenize(text) if w.isalpha()]
    n_words, n_sents, n_chars = len(words_clean), len(sentences), len(text)

    features = {
        'text_length': n_chars,
        'word_count': n_words,
        'sentence_count': n_sents,
        # max(..., 1) guards every ratio against division by zero on empty input.
        'avg_sentence_length': n_words / max(n_sents, 1),
        'avg_word_length': np.mean([len(w) for w in words_clean]) if words_clean else 0,
        'vocabulary_richness': len(set(words_clean)) / max(n_words, 1),
    }

    per_sentence = [len(simple_word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = np.var(per_sentence) if per_sentence else 0

    # Punctuation densities, scaled to occurrences per 1000 characters.
    for key, mark in (('comma_density', ','), ('period_density', '.'),
                      ('exclamation_density', '!'), ('question_density', '?')):
        features[key] = text.count(mark) / max(n_chars, 1) * 1000

    # Words longer than 6 letters count as "complex".
    features['complex_word_ratio'] = sum(1 for w in words_clean if len(w) > 6) / max(n_words, 1)

    # Crude lexical cues: connectives typical of AI prose vs. subjective
    # phrases typical of human prose (raw substring counts, not token matches).
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
    lowered = text.lower()
    features['ai_marker_count'] = sum(lowered.count(m) for m in ai_markers)
    features['human_marker_count'] = sum(lowered.count(m) for m in human_markers)

    return np.array(list(features.values())).reshape(1, -1), features
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable (Chinese) result string containing the
    predicted label, the AI probability and a short list of signals that
    support the verdict. Blank input yields a prompt to enter text.
    """
    if not text.strip():
        return "請輸入文字內容。"
    try:
        X, feats = extract_features(text)
        # Prefer a calibrated probability when the estimator offers one;
        # otherwise fall back to the raw predicted label.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]
        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Heuristic explanations — evaluated independently, in fixed order.
        signals = (
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        )
        reasons = [msg for hit, msg in signals if hit]
        if not reasons:
            reasons.append("整體語言特徵與模型預測一致")
        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reasons)}"
    except Exception as e:
        # Surface failures in the UI rather than crashing the app.
        return f"預測時出現錯誤: {str(e)}"
# Gradio web UI: one input textbox, one output textbox, wired to predict().
demo = gr.Interface(
fn=predict,
inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
title="AI / Human 判斷器",
description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
)
# Starts the local Gradio server (blocking call).
demo.launch()
# NOTE(review): dead code below — a previous TensorFlow-based implementation
# kept alive as a module-level string literal. It is never executed; consider
# deleting it and relying on version control history instead.
'''import gradio as gr
import tensorflow as tf
import pickle
# ---------------- 載入模型 ----------------
try:
model = tf.keras.models.load_model("AIDetect.h5")
print("✅ 模型載入成功")
except Exception as e:
print("❌ 模型載入失敗:", e)
model = None
# ---------------- 載入詞彙 ----------------
try:
with open("vocab.pkl", "rb") as f:
vocab = pickle.load(f)
vectorized_layer = tf.keras.layers.TextVectorization(
max_tokens=len(vocab)+1, output_sequence_length=50
)
vectorized_layer.set_vocabulary(vocab)
print("✅ 詞彙載入成功")
except Exception as e:
print("❌ 詞彙載入失敗:", e)
vectorized_layer = None
# ---------------- 載入 scaler ----------------
try:
with open("scaler.pkl", "rb") as f:
scaler = pickle.load(f)
print("✅ Scaler 載入成功")
except Exception as e:
print("❌ Scaler 載入失敗:", e)
scaler = None
# ---------------- 特徵計算 ----------------
def compute_features(text):
if isinstance(text, tf.Tensor):
text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
elif isinstance(text, bytes):
text = text.decode('utf-8')
else:
text = str(text)
words = text.split()
word_count = len(words)
unique_words = len(set(words))
unique_word_ratio = unique_words / (word_count + 1e-6)
repeat_rate = 1 - unique_word_ratio
punctuation_count = sum(1 for c in text if c in ".,!?;:")
punctuation_ratio = punctuation_count / (len(text) + 1e-6)
avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)
return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]
# ---------------- 使用 scaler ----------------
def transform_features(feat):
if scaler is None:
return feat # 如果 scaler 沒載入,就直接回傳原始特徵
return scaler.transform(feat).tolist() # 轉成 list,避免使用 NumPy
# ---------------- 生成解釋 ----------------
def explain_prediction(text):
if model is None or vectorized_layer is None:
return "❌ 模型或詞彙尚未載入,無法預測"
try:
# ---------------- 特徵計算 ----------------
feat_raw = compute_features(text)
feat = transform_features(feat_raw)
# ---------------- 文字向量化 ----------------
seq = vectorized_layer([text])
seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')
# 轉成 TensorFlow tensor
seq = tf.convert_to_tensor(seq)
feat = tf.convert_to_tensor(feat, dtype=tf.float32)
# ---------------- 預測 ----------------
pred_prob = model([seq, feat], training=False).numpy()[0][0]
label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
prob = pred_prob * 100
# ---------------- 判斷依據 ----------------
reasons = []
if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
if feat_raw[0][2] > 0.3: reasons.append("重複率高")
if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
if not reasons: reasons.append("句子長度與用詞平均")
explanation = ";".join(reasons)
return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"
except Exception as e:
return f"❌ 預測時發生錯誤: {e}"
# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
fn=explain_prediction,
inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
title="AI vs Human 文本判斷",
description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)
iface.launch()'''
|