Hellowish committed on
Commit
b949d6b
·
verified ·
1 Parent(s): 95a9b8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -21,14 +21,15 @@ except LookupError:
21
  # 載入你訓練好的 .pkl 模型
22
  model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
23
 
24
- # 特徵擷取函數
25
  from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
26
 
27
- # 初始化標準 punkt tokenizer
28
- sent_tokenizer = PunktSentenceTokenizer()
29
 
30
  def extract_features(text):
31
- sentences = sent_tokenizer.tokenize(text)
 
 
32
  words = re.findall(r'\b\w+\b', text.lower())
33
  words_clean = [w for w in words if w.isalpha()]
34
 
@@ -41,12 +42,16 @@ def extract_features(text):
41
 
42
  unique_words = set(words_clean)
43
  features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
44
- sentence_lengths = [len(word_tokenize(s)) for s in sentences] # 改這裡
 
 
45
  features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
 
46
  features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
47
  features['period_density'] = text.count('.') / max(len(text), 1) * 1000
48
  features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
49
  features['question_density'] = text.count('?') / max(len(text), 1) * 1000
 
50
  complex_words = [w for w in words_clean if len(w) > 6]
51
  features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
52
 
@@ -58,7 +63,6 @@ def extract_features(text):
58
 
59
  return np.array(list(features.values())).reshape(1, -1), features
60
 
61
-
62
  # 預測函數
63
  def predict(text):
64
  if not text.strip():
@@ -92,9 +96,8 @@ def predict(text):
92
  # Gradio 介面
93
  demo = gr.Interface(
94
  fn=predict,
95
- inputs=gr.Textbox(label="請輸入文章內容"),
96
- outputs="text",
97
- title="AI / Human 判斷器",
98
  description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
99
  )
100
 
 
21
  # 載入你訓練好的 .pkl 模型
22
  model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
23
 
 
24
  from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
25
 
26
+ # 建立自訂 Punkt tokenizer
27
+ sentence_tokenizer = PunktSentenceTokenizer()
28
 
29
  def extract_features(text):
30
+ # 使用自訂 tokenizer 分句
31
+ sentences = sentence_tokenizer.tokenize(text)
32
+
33
  words = re.findall(r'\b\w+\b', text.lower())
34
  words_clean = [w for w in words if w.isalpha()]
35
 
 
42
 
43
  unique_words = set(words_clean)
44
  features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
45
+
46
+ # 使用 word_tokenize 也明確指定
47
+ sentence_lengths = [len(word_tokenize(s)) for s in sentences]
48
  features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
49
+
50
  features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
51
  features['period_density'] = text.count('.') / max(len(text), 1) * 1000
52
  features['exclamation_density'] = text.count('!') / max(len(text), 1) * 1000
53
  features['question_density'] = text.count('?') / max(len(text), 1) * 1000
54
+
55
  complex_words = [w for w in words_clean if len(w) > 6]
56
  features['complex_word_ratio'] = len(complex_words) / max(len(words_clean), 1)
57
 
 
63
 
64
  return np.array(list(features.values())).reshape(1, -1), features
65
 
 
66
  # 預測函數
67
  def predict(text):
68
  if not text.strip():
 
96
  # Gradio 介面
97
  demo = gr.Interface(
98
  fn=predict,
99
+ inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
100
+ outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"), title="AI / Human 判斷器",
 
101
  description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
102
  )
103