Update app.py
Browse files
app.py
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
import torch
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
import nltk
|
| 6 |
|
| 7 |
nltk.download('punkt')
|
| 8 |
|
| 9 |
-
# 載入模型
|
| 10 |
-
|
| 11 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 12 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 13 |
|
| 14 |
-
# 特徵
|
| 15 |
def extract_features(text):
|
| 16 |
-
words = nltk.word_tokenize(text)
|
| 17 |
-
words_clean = [w for w in words if w.isalpha()]
|
| 18 |
sentences = nltk.sent_tokenize(text)
|
|
|
|
|
|
|
| 19 |
|
| 20 |
features = {}
|
| 21 |
features['text_length'] = len(text)
|
|
@@ -23,75 +21,55 @@ def extract_features(text):
|
|
| 23 |
features['sentence_count'] = len(sentences)
|
| 24 |
features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
|
| 25 |
features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0
|
| 26 |
-
|
| 27 |
unique_words = set(words_clean)
|
| 28 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 29 |
-
|
| 30 |
sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
|
| 31 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 32 |
-
|
| 33 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 34 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 35 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 36 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
| 37 |
-
|
| 38 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 39 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 40 |
-
|
| 41 |
ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
|
| 42 |
human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
|
| 43 |
-
|
| 44 |
text_lower = text.lower()
|
| 45 |
features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
|
| 46 |
features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)
|
| 47 |
-
|
| 48 |
-
return features
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
logits = output.logits
|
| 58 |
-
probs = torch.softmax(logits, dim=-1).squeeze().tolist()
|
| 59 |
-
pred = torch.argmax(logits, dim=-1).item()
|
| 60 |
-
label = "AI" if pred == 1 else "Human"
|
| 61 |
-
|
| 62 |
-
# 特徵抽取
|
| 63 |
-
features = extract_features(text)
|
| 64 |
-
|
| 65 |
-
# 自動生成依據文字
|
| 66 |
-
reasons = []
|
| 67 |
-
if features['sentence_length_variance'] < 5:
|
| 68 |
-
reasons.append("句子長度變化較小,可能是 AI 生成")
|
| 69 |
-
if features['complex_word_ratio'] > 0.2:
|
| 70 |
-
reasons.append("使用較多複雜字詞")
|
| 71 |
-
if features['ai_marker_count'] > 0:
|
| 72 |
-
reasons.append(f"出現 AI 標記詞: {features['ai_marker_count']} 次")
|
| 73 |
-
if features['human_marker_count'] > 0:
|
| 74 |
-
reasons.append(f"出現人類口語標記詞: {features['human_marker_count']} 次")
|
| 75 |
-
if not reasons:
|
| 76 |
-
reasons.append("文字特徵未明顯偏向 AI 或 Human")
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Gradio 介面
|
| 85 |
demo = gr.Interface(
|
| 86 |
-
fn=
|
| 87 |
-
inputs=gr.Textbox(label="請輸入文
|
| 88 |
-
outputs=
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
gr.Textbox(label="判斷依據")
|
| 92 |
-
],
|
| 93 |
-
title="AI/Human 判斷器(特徵依據版)",
|
| 94 |
-
description="輸入文字即可判斷是 AI 生成還是 Human 撰寫,並顯示判斷��據"
|
| 95 |
)
|
| 96 |
|
| 97 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import joblib
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
+
import re
|
| 5 |
import nltk
|
| 6 |
|
| 7 |
nltk.download('punkt')
|
| 8 |
|
# Load the trained classifier from its pickle file.
# predict() below calls model.predict_proba, so this is assumed to be a
# scikit-learn-style binary classifier — confirm against the training script.
# (Original note, translated: make sure the filename is correct — the file
# lives in the /app folder.)
model = joblib.load("AI_Detect.pkl")
| 11 |
|
| 12 |
+
# 特徵擷取函數(依照你之前提供的邏輯)
|
| 13 |
def extract_features(text):
|
|
|
|
|
|
|
| 14 |
sentences = nltk.sent_tokenize(text)
|
| 15 |
+
words = re.findall(r'\b\w+\b', text.lower())
|
| 16 |
+
words_clean = [w for w in words if w.isalpha()]
|
| 17 |
|
| 18 |
features = {}
|
| 19 |
features['text_length'] = len(text)
|
|
|
|
| 21 |
features['sentence_count'] = len(sentences)
|
| 22 |
features['avg_sentence_length'] = len(words_clean) / max(len(sentences), 1)
|
| 23 |
features['avg_word_length'] = np.mean([len(w) for w in words_clean]) if words_clean else 0
|
| 24 |
+
|
| 25 |
unique_words = set(words_clean)
|
| 26 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
|
|
|
| 27 |
sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
|
| 28 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
|
|
|
| 29 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 30 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 31 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 32 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
|
|
|
| 33 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 34 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 35 |
+
|
| 36 |
ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
|
| 37 |
human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
|
|
|
|
| 38 |
text_lower = text.lower()
|
| 39 |
features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
|
| 40 |
features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
return np.array(list(features.values())).reshape(1, -1), features
|
# Prediction function: classify a piece of text and explain the verdict.
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a single formatted string containing the predicted label, the
    AI probability, and a short list of human-readable reasons.
    """
    feature_vector, feature_dict = extract_features(text)
    # NOTE(review): assumes a binary classifier with class order
    # [0: human, 1: AI] — confirm against the training script.
    ai_probability = model.predict_proba(feature_vector)[0][1]
    label = "AI 生成" if ai_probability > 0.5 else "人類撰寫"

    # Assemble a simple rule-based justification from the raw features.
    reasons = []
    if feature_dict['vocabulary_richness'] < 0.3:
        reasons.append("詞彙多樣性較低")
    if feature_dict['sentence_length_variance'] < 10:
        reasons.append("句子長度較平均,像 AI")
    if feature_dict['ai_marker_count'] > feature_dict['human_marker_count']:
        reasons.append("包含常見 AI 連接詞")
    if feature_dict['human_marker_count'] > feature_dict['ai_marker_count']:
        reasons.append("包含主觀語氣詞")
    if not reasons:
        reasons.append("整體語言特徵符合模型預測")

    result = f"預測結果:{label}\nAI 機率:{ai_probability:.2%}\n判斷依據:{', '.join(reasons)}"
    return result
# Gradio interface: one textbox in, a plain-text verdict out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容"),
    outputs="text",
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)

# Start the web app (blocks until the server is stopped).
demo.launch()