Update app.py
Browse files
app.py
CHANGED
|
@@ -21,14 +21,15 @@ except LookupError:
|
|
| 21 |
# 載入你訓練好的 .pkl 模型
|
| 22 |
model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
|
| 23 |
|
| 24 |
-
# 特徵擷取函數
|
| 25 |
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
|
| 30 |
def extract_features(text):
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 33 |
words_clean = [w for w in words if w.isalpha()]
|
| 34 |
|
|
@@ -41,12 +42,16 @@ def extract_features(text):
|
|
| 41 |
|
| 42 |
unique_words = set(words_clean)
|
| 43 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 44 |
-
|
|
|
|
|
|
|
| 45 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
|
|
|
| 46 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 47 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 48 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 49 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
|
|
|
| 50 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 51 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 52 |
|
|
@@ -58,7 +63,6 @@ def extract_features(text):
|
|
| 58 |
|
| 59 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 60 |
|
| 61 |
-
|
| 62 |
# 預測函數
|
| 63 |
def predict(text):
|
| 64 |
if not text.strip():
|
|
@@ -92,9 +96,8 @@ def predict(text):
|
|
| 92 |
# Gradio 介面
|
| 93 |
demo = gr.Interface(
|
| 94 |
fn=predict,
|
| 95 |
-
inputs=gr.Textbox(label="請輸入文章內容"),
|
| 96 |
-
outputs="
|
| 97 |
-
title="AI / Human 判斷器",
|
| 98 |
description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
|
| 99 |
)
|
| 100 |
|
|
|
|
# Load the pre-trained model from disk.
# NOTE: confirm the filename is correct (the file lives in the /app folder).
model = joblib.load("ai_detector_model.pkl")

from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# Build our own Punkt sentence tokenizer instance; extract_features() below
# uses it to split input text into sentences.
sentence_tokenizer = PunktSentenceTokenizer()
| 28 |
|
| 29 |
def extract_features(text):
|
| 30 |
+
# 使用自訂 tokenizer 分句
|
| 31 |
+
sentences = sentence_tokenizer.tokenize(text)
|
| 32 |
+
|
| 33 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 34 |
words_clean = [w for w in words if w.isalpha()]
|
| 35 |
|
|
|
|
| 42 |
|
| 43 |
unique_words = set(words_clean)
|
| 44 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 45 |
+
|
| 46 |
+
# 使用 word_tokenize 也明確指定
|
| 47 |
+
sentence_lengths = [len(word_tokenize(s)) for s in sentences]
|
| 48 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 49 |
+
|
| 50 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 51 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 52 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 53 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
| 54 |
+
|
| 55 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 56 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 57 |
|
|
|
|
| 63 |
|
| 64 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 65 |
|
|
|
|
| 66 |
# 預測函數
|
| 67 |
def predict(text):
|
| 68 |
if not text.strip():
|
|
|
|
# Gradio interface: a plain text-in / text-out UI wrapping predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        label="請輸入文章內容",
        lines=15,
        max_lines=50,
        placeholder="在此輸入文章…",
    ),
    outputs=gr.Textbox(
        label="預測結果",
        lines=15,
        max_lines=30,
        placeholder="結果會顯示在這裡…",
    ),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)
|
| 103 |
|