Hellowish committed on
Commit
bc4d57c
·
verified ·
1 Parent(s): d4f5fda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -59
app.py CHANGED
@@ -1,21 +1,19 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
- import torch
4
  import numpy as np
 
5
  import nltk
6
 
7
  nltk.download('punkt')
8
 
9
- # 載入模型
10
- model_name = "Hellowish/AI_Detect"
11
- tokenizer = AutoTokenizer.from_pretrained(model_name)
12
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
13
 
14
- # 特徵取函數
15
  def extract_features(text):
16
- words = nltk.word_tokenize(text)
17
- words_clean = [w for w in words if w.isalpha()]
18
  sentences = nltk.sent_tokenize(text)
 
 
19
 
20
  features = {}
21
  features['text_length'] = len(text)
@@ -23,75 +21,55 @@ def extract_features(text):
23
  features['sentence_count'] = len(sentences)
24
  features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
25
  features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0
26
-
27
  unique_words = set(words_clean)
28
  features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
29
-
30
  sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
31
  features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
32
-
33
  features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
34
  features['period_density'] = text.count('.') / max(len(text), 1) * 1000
35
  features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
36
  features['question_density'] = text.count('?') / max(len(text), 1) * 1000
37
-
38
  complex_words = [w for w in words_clean if len(w) > 6]
39
  features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
40
-
41
  ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
42
  human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
43
-
44
  text_lower = text.lower()
45
  features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
46
  features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)
47
-
48
- return features
49
 
50
- # AI/Human 判斷 + 特徵依據
51
- def predict_with_reason(text):
52
- # 模型判斷
53
- inputs = tokenizer(text, return_tensors="pt", truncation=True)
54
- with torch.no_grad():
55
- output = model(**inputs)
56
-
57
- logits = output.logits
58
- probs = torch.softmax(logits, dim=-1).squeeze().tolist()
59
- pred = torch.argmax(logits, dim=-1).item()
60
- label = "AI" if pred == 1 else "Human"
61
-
62
- # 特徵抽取
63
- features = extract_features(text)
64
-
65
- # 自動生成依據文字
66
- reasons = []
67
- if features['sentence_length_variance'] < 5:
68
- reasons.append("句子長度變化較小,可能是 AI 生成")
69
- if features['complex_word_ratio'] > 0.2:
70
- reasons.append("使用較多複雜字詞")
71
- if features['ai_marker_count'] > 0:
72
- reasons.append(f"出現 AI 標記詞: {features['ai_marker_count']} 次")
73
- if features['human_marker_count'] > 0:
74
- reasons.append(f"出現人類口語標記詞: {features['human_marker_count']} 次")
75
- if not reasons:
76
- reasons.append("文字特徵未明顯偏向 AI 或 Human")
77
 
78
- return {
79
- "判斷結果": label,
80
- "機率分數": f"Human: {probs[0]:.2f}, AI: {probs[1]:.2f}",
81
- "判斷依據": "; ".join(reasons)
82
- }
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Gradio 介面
85
  demo = gr.Interface(
86
- fn=predict_with_reason,
87
- inputs=gr.Textbox(label="請輸入文"),
88
- outputs=[
89
- gr.Textbox(label="判斷結果"),
90
- gr.Textbox(label="機率"),
91
- gr.Textbox(label="判斷依據")
92
- ],
93
- title="AI/Human 判斷器(特徵依據版)",
94
- description="輸入文字即可判斷是 AI 生成還是 Human 撰寫,並顯示判斷依據"
95
  )
96
 
97
- demo.launch()
 
1
  import gradio as gr
2
+ import joblib
 
3
  import numpy as np
4
+ import re
5
  import nltk
6
 
7
  nltk.download('punkt')
8
 
9
# Load the pre-trained classifier that was serialized with joblib.
# NOTE(review): the path is relative to the process working directory —
# confirm "AI_Detect.pkl" is deployed next to app.py (e.g. in /app).
model = joblib.load("AI_Detect.pkl")  # confirm the filename is correct (inside the /app folder)
 
 
11
 
12
# Feature-extraction function (following the logic provided earlier).
def extract_features(text):
    """Compute hand-crafted stylometric features for *text*.

    Returns a tuple ``(X, features)`` where ``X`` is a ``(1, n_features)``
    numpy array suitable for ``model.predict_proba`` and ``features`` is the
    underlying name -> value dict used to build it.
    """
    sentences = nltk.sent_tokenize(text)
    # \w+ also matches digits/underscores; the isalpha() filter below keeps
    # only purely alphabetic tokens.
    words = re.findall(r'\b\w+\b', text.lower())
    words_clean = [w for w in words if w.isalpha()]

    features = {}
    features['text_length'] = len(text)
    # NOTE(review): line 20 of the new file is not visible in this diff view
    # (numbering jumps 19 -> 21); the context suggests one more feature
    # (likely a word count) is assigned here. Confirm against the full file —
    # the model's input column order depends on it.
    features['sentence_count'] = len(sentences)
    # max(..., 1) guards the empty-text case against division by zero.
    features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
    features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0

    unique_words = set(words_clean)
    # Type-token ratio: distinct words over total words.
    features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)

    sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0

    # Punctuation densities are scaled to occurrences per 1000 characters.
    features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
    features['period_density'] = text.count('.') / max(len(text), 1) * 1000
    features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
    features['question_density'] = text.count('?') / max(len(text), 1) * 1000

    # "Complex" here means strictly more than 6 letters.
    complex_words = [w for w in words_clean if len(w) > 6]
    features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)

    # Discourse connectives typical of AI prose vs. hedging/first-person
    # phrases typical of human prose.
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']

    text_lower = text.lower()
    features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
    features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)

    # dict preserves insertion order (Python 3.7+), so the feature vector's
    # column order is exactly the assignment order above — it must match the
    # order the .pkl model was trained with.
    return np.array(list(features.values())).reshape(1, -1), features
43
+
44
# Prediction entry point used by the Gradio interface.
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Runs the stylometric feature extractor, feeds the vector to the loaded
    model, and returns a human-readable report string containing the
    verdict, the AI probability, and the supporting evidence.
    """
    feature_vector, stats = extract_features(text)
    # Assumes a binary classifier with classes [0: human, 1: AI].
    ai_probability = model.predict_proba(feature_vector)[0][1]
    if ai_probability > 0.5:
        verdict = "AI 生成"
    else:
        verdict = "人類撰寫"

    # Map each heuristic check to its explanation; collect the ones that hit.
    checks = [
        (stats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
        (stats['sentence_length_variance'] < 10, "句子長度較平均,像 AI"),
        (stats['ai_marker_count'] > stats['human_marker_count'], "包含常見 AI 連接詞"),
        (stats['human_marker_count'] > stats['ai_marker_count'], "包含主觀語氣詞"),
    ]
    evidence = [message for triggered, message in checks if triggered]
    if not evidence:
        evidence = ["整體語言特徵符合模型預測"]

    return f"預測結果:{verdict}\nAI 機率:{ai_probability:.2%}\n判斷依據:{', '.join(evidence)}"
65
 
66
# Gradio UI: one textbox in, one formatted text report out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容"),
    outputs="text",
    title="AI / Human 判斷",
    # Fixed typo in the user-facing description: 「特徵析」 -> 「特徵分析」.
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
)

demo.launch()