Update app.py
Browse files
app.py
CHANGED
|
@@ -21,14 +21,15 @@ except LookupError:
|
|
| 21 |
# 載入你訓練好的 .pkl 模型
|
| 22 |
model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
|
| 23 |
|
| 24 |
-
# 特徵擷取函數
|
| 25 |
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
|
| 30 |
def extract_features(text):
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 33 |
words_clean = [w for w in words if w.isalpha()]
|
| 34 |
|
|
@@ -41,12 +42,16 @@ def extract_features(text):
|
|
| 41 |
|
| 42 |
unique_words = set(words_clean)
|
| 43 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 44 |
-
|
|
|
|
|
|
|
| 45 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
|
|
|
| 46 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 47 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 48 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 49 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
|
|
|
| 50 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 51 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 52 |
|
|
@@ -58,7 +63,6 @@ def extract_features(text):
|
|
| 58 |
|
| 59 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 60 |
|
| 61 |
-
|
| 62 |
# 預測函數
|
| 63 |
def predict(text):
|
| 64 |
if not text.strip():
|
|
@@ -92,9 +96,8 @@ def predict(text):
|
|
| 92 |
# Gradio 介面
|
| 93 |
demo = gr.Interface(
|
| 94 |
fn=predict,
|
| 95 |
-
inputs=gr.Textbox(label="請輸入文章內容"),
|
| 96 |
-
outputs="
|
| 97 |
-
title="AI / Human 判斷器",
|
| 98 |
description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
|
| 99 |
)
|
| 100 |
|
|
|
|
# Load the pre-trained model from disk.
# NOTE: confirm the filename is correct (the file lives in the /app folder).
model = joblib.load("ai_detector_model.pkl")

from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# Build our own Punkt sentence tokenizer instance; extract_features() below
# uses it to split input text into sentences.
sentence_tokenizer = PunktSentenceTokenizer()
| 28 |
|
| 29 |
def extract_features(text):
|
| 30 |
+
# 使用自訂 tokenizer 分句
|
| 31 |
+
sentences = sentence_tokenizer.tokenize(text)
|
| 32 |
+
|
| 33 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 34 |
words_clean = [w for w in words if w.isalpha()]
|
| 35 |
|
|
|
|
| 42 |
|
| 43 |
unique_words = set(words_clean)
|
| 44 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 45 |
+
|
| 46 |
+
# 使用 word_tokenize 也明確指定
|
| 47 |
+
sentence_lengths = [len(word_tokenize(s)) for s in sentences]
|
| 48 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 49 |
+
|
| 50 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 51 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
| 52 |
features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
|
| 53 |
features['question_density'] = text.count('?') / max(len(text), 1) * 1000
|
| 54 |
+
|
| 55 |
complex_words = [w for w in words_clean if len(w) > 6]
|
| 56 |
features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
|
| 57 |
|
|
|
|
| 63 |
|
| 64 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 65 |
|
|
|
|
| 66 |
# 預測函數
|
| 67 |
def predict(text):
|
| 68 |
if not text.strip():
|
|
|
|
# Gradio interface: a plain text-in / text-out UI wrapping predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        label="請輸入文章內容",
        lines=15,
        max_lines=50,
        placeholder="在此輸入文章…",
    ),
    outputs=gr.Textbox(
        label="預測結果",
        lines=15,
        max_lines=30,
        placeholder="結果會顯示在這裡…",
    ),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源",
)
|
| 103 |
|