Update app.py: use an untrained PunktSentenceTokenizer for sentence splitting in extract_features (avoids depending on the downloadable 'punkt' data)
Browse files
app.py
CHANGED
|
@@ -21,12 +21,17 @@ except LookupError:
|
|
| 21 |
# 載入你訓練好的 .pkl 模型
|
| 22 |
model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
|
| 23 |
|
| 24 |
-
# 特徵擷取函數
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def extract_features(text):
|
| 26 |
-
sentences =
|
| 27 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 28 |
words_clean = [w for w in words if w.isalpha()]
|
| 29 |
-
|
| 30 |
features = {}
|
| 31 |
features['text_length'] = len(text)
|
| 32 |
features['word_count'] = len(words_clean)
|
|
@@ -36,7 +41,7 @@ def extract_features(text):
|
|
| 36 |
|
| 37 |
unique_words = set(words_clean)
|
| 38 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 39 |
-
sentence_lengths = [len(
|
| 40 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 41 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 42 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
|
@@ -53,6 +58,7 @@ def extract_features(text):
|
|
| 53 |
|
| 54 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 55 |
|
|
|
|
| 56 |
# 預測函數
|
| 57 |
def predict(text):
|
| 58 |
if not text.strip():
|
|
|
|
# --- Tokenizers for extract_features() -------------------------------------
# TODO: move this import up to the top-of-file import block with the others.
# Importing before the model load so a missing nltk install fails fast,
# before the (potentially slow) joblib deserialization below.
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# --- Model loading ---------------------------------------------------------
# Load the trained model serialized with joblib.
# NOTE(review): confirm the filename matches the artifact shipped in the
# /app folder (original comment said the same in Chinese).
model = joblib.load("ai_detector_model.pkl")

# Sentence splitter used by extract_features().
# An *untrained* Punkt tokenizer runs with Punkt's built-in defaults, so it
# works without downloading the 'punkt' corpus — presumably the point of
# this change, given the `except LookupError` handler earlier in this file.
# NOTE(review): word_tokenize may still load the pretrained 'punkt' data
# internally on first use — verify the LookupError fallback covers that.
sent_tokenizer = PunktSentenceTokenizer()
| 30 |
def extract_features(text):
|
| 31 |
+
sentences = sent_tokenizer.tokenize(text)
|
| 32 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 33 |
words_clean = [w for w in words if w.isalpha()]
|
| 34 |
+
|
| 35 |
features = {}
|
| 36 |
features['text_length'] = len(text)
|
| 37 |
features['word_count'] = len(words_clean)
|
|
|
|
| 41 |
|
| 42 |
unique_words = set(words_clean)
|
| 43 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 44 |
+
sentence_lengths = [len(word_tokenize(s)) for s in sentences] # 改這裡
|
| 45 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 46 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 47 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
|
|
|
| 58 |
|
| 59 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 60 |
|
| 61 |
+
|
| 62 |
# 預測函數
|
| 63 |
def predict(text):
|
| 64 |
if not text.strip():
|