Update app.py
Browse files
app.py
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
import torch
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
import nltk
|
| 6 |
|
| 7 |
nltk.download('punkt')
|
| 8 |
|
| 9 |
-
# 載入模型
|
| 10 |
-
|
| 11 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 12 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 13 |
|
| 14 |
-
# 特徵
|
| 15 |
def extract_features(text):
|
| 16 |
-
words = nltk.word_tokenize(text)
|
| 17 |
-
words_clean = [w for w in words if w.isalpha()]
|
| 18 |
sentences = nltk.sent_tokenize(text)
|
|
|
|
|
|
|
| 19 |
|
| 20 |
features = {}
|
| 21 |
features['text_length'] = len(text)
|
|
@@ -23,75 +21,55 @@ def extract_features(text):
|
|
| 23 |
features['sentence_count'] = len(sentences)
|
| 24 |
features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
|
| 25 |
features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0
|
| 26 |
-
|
| 27 |
unique_words = set(words_clean)
|
| 28 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 29 |
-
|
| 30 |
sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
|
| 31 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 32 |
-
|
| 33 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 34 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 35 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 36 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
| 37 |
-
|
| 38 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 39 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 40 |
-
|
| 41 |
ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
|
| 42 |
human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
|
| 43 |
-
|
| 44 |
text_lower = text.lower()
|
| 45 |
features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
|
| 46 |
features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)
|
| 47 |
-
|
| 48 |
-
return features
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
logits = output.logits
|
| 58 |
-
probs = torch.softmax(logits, dim=-1).squeeze().tolist()
|
| 59 |
-
pred = torch.argmax(logits, dim=-1).item()
|
| 60 |
-
label = "AI" if pred == 1 else "Human"
|
| 61 |
-
|
| 62 |
-
# 特徵抽取
|
| 63 |
-
features = extract_features(text)
|
| 64 |
-
|
| 65 |
-
# 自動生成依據文字
|
| 66 |
-
reasons = []
|
| 67 |
-
if features['sentence_length_variance'] < 5:
|
| 68 |
-
reasons.append("句子長度變化較小,可能是 AI 生成")
|
| 69 |
-
if features['complex_word_ratio'] > 0.2:
|
| 70 |
-
reasons.append("使用較多複雜字詞")
|
| 71 |
-
if features['ai_marker_count'] > 0:
|
| 72 |
-
reasons.append(f"出現 AI 標記詞: {features['ai_marker_count']} 次")
|
| 73 |
-
if features['human_marker_count'] > 0:
|
| 74 |
-
reasons.append(f"出現人類口語標記詞: {features['human_marker_count']} 次")
|
| 75 |
-
if not reasons:
|
| 76 |
-
reasons.append("文字特徵未明顯偏向 AI 或 Human")
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Gradio 介面
|
| 85 |
demo = gr.Interface(
|
| 86 |
-
fn=
|
| 87 |
-
inputs=gr.Textbox(label="請輸入文
|
| 88 |
-
outputs=
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
gr.Textbox(label="判斷依據")
|
| 92 |
-
],
|
| 93 |
-
title="AI/Human 判斷器(特徵依據版)",
|
| 94 |
-
description="輸入文字即可判斷是 AI 生成還是 Human 撰寫,並顯示判斷��據"
|
| 95 |
)
|
| 96 |
|
| 97 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import joblib
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
+
import re
|
| 5 |
import nltk
|
| 6 |
|
| 7 |
nltk.download('punkt')
|
| 8 |
|
# Load the trained classifier from its pickle file.
# predict() below calls model.predict_proba, so this is assumed to be a
# scikit-learn-style binary classifier — confirm against the training script.
# (Original note, translated: make sure the filename is correct — the file
# lives in the /app folder.)
model = joblib.load("AI_Detect.pkl")
| 11 |
|
| 12 |
+
# 特徵擷取函數(依照你之前提供的邏輯)
|
| 13 |
def extract_features(text):
|
|
|
|
|
|
|
| 14 |
sentences = nltk.sent_tokenize(text)
|
| 15 |
+
words = re.findall(r'\b\w+\b', text.lower())
|
| 16 |
+
words_clean = [w for w in words if w.isalpha()]
|
| 17 |
|
| 18 |
features = {}
|
| 19 |
features['text_length'] = len(text)
|
|
|
|
| 21 |
features['sentence_count'] = len(sentences)
|
| 22 |
features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
|
| 23 |
features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0
|
| 24 |
+
|
| 25 |
unique_words = set(words_clean)
|
| 26 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
|
|
|
| 27 |
sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
|
| 28 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
|
|
|
| 29 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 30 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 31 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 32 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
|
|
|
| 33 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 34 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 35 |
+
|
| 36 |
ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
|
| 37 |
human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
|
|
|
|
| 38 |
text_lower = text.lower()
|
| 39 |
features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
|
| 40 |
features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
return np.array(list(features.values())).reshape(1, -1), features
|
# Prediction function: classify a piece of text and explain the verdict.
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a single formatted string containing the predicted label, the
    AI probability, and a short list of human-readable reasons.
    """
    feature_vector, feature_dict = extract_features(text)
    # NOTE(review): assumes a binary classifier with class order
    # [0: human, 1: AI] — confirm against the training script.
    ai_probability = model.predict_proba(feature_vector)[0][1]
    label = "AI 生成" if ai_probability > 0.5 else "人類撰寫"

    # Assemble a simple rule-based justification from the raw features.
    reasons = []
    if feature_dict['vocabulary_richness'] < 0.3:
        reasons.append("詞彙多樣性較低")
    if feature_dict['sentence_length_variance'] < 10:
        reasons.append("句子長度較平均,像 AI")
    if feature_dict['ai_marker_count'] > feature_dict['human_marker_count']:
        reasons.append("包含常見 AI 連接詞")
    if feature_dict['human_marker_count'] > feature_dict['ai_marker_count']:
        reasons.append("包含主觀語氣詞")
    if not reasons:
        reasons.append("整體語言特徵符合模型預測")

    result = f"預測結果:{label}\nAI 機率:{ai_probability:.2%}\n判斷依據:{', '.join(reasons)}"
    return result
# Gradio interface: one textbox in, a plain-text verdict out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容"),
    outputs="text",
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)

# Start the web app (blocks until the server is stopped).
demo.launch()