Hellowish committed on
Commit
95a9b8d
·
verified ·
1 Parent(s): 2b36831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -4
app.py CHANGED
@@ -21,12 +21,17 @@ except LookupError:
21
  # 載入你訓練好的 .pkl 模型
22
  model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
23
 
24
- # 特徵擷取函數(依照你之前提供的邏輯)
 
 
 
 
 
25
  def extract_features(text):
26
- sentences = nltk.sent_tokenize(text)
27
  words = re.findall(r'\b\w+\b', text.lower())
28
  words_clean = [w for w in words if w.isalpha()]
29
-
30
  features = {}
31
  features['text_length'] = len(text)
32
  features['word_count'] = len(words_clean)
@@ -36,7 +41,7 @@ def extract_features(text):
36
 
37
  unique_words = set(words_clean)
38
  features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
39
- sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
40
  features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
41
  features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
42
  features['period_density'] = text.count('.') / max(len(text), 1) * 1000
@@ -53,6 +58,7 @@ def extract_features(text):
53
 
54
  return np.array(list(features.values())).reshape(1, -1), features
55
 
 
56
  # 預測函數
57
  def predict(text):
58
  if not text.strip():
 
21
  # 載入你訓練好的 .pkl 模型
22
  model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
23
 
24
+ # 特徵擷取函數
25
+ from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
26
+
27
+ # 初始化標準 punkt tokenizer
28
+ sent_tokenizer = PunktSentenceTokenizer()
29
+
30
  def extract_features(text):
31
+ sentences = sent_tokenizer.tokenize(text)
32
  words = re.findall(r'\b\w+\b', text.lower())
33
  words_clean = [w for w in words if w.isalpha()]
34
+
35
  features = {}
36
  features['text_length'] = len(text)
37
  features['word_count'] = len(words_clean)
 
41
 
42
  unique_words = set(words_clean)
43
  features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
44
+ sentence_lengths = [len(word_tokenize(s)) for s in sentences] # 改這裡
45
  features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
46
  features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
47
  features['period_density'] = text.count('.') / max(len(text), 1) * 1000
 
58
 
59
  return np.array(list(features.values())).reshape(1, -1), features
60
 
61
+
62
  # 預測函數
63
  def predict(text):
64
  if not text.strip():