Update app.py: use an untrained PunktSentenceTokenizer for sentence splitting in extract_features (avoids depending on the downloadable 'punkt' data)
Browse files
app.py
CHANGED
|
@@ -21,12 +21,17 @@ except LookupError:
|
|
| 21 |
# 載入你訓練好的 .pkl 模型
|
| 22 |
model = joblib.load("ai_detector_model.pkl") # 請確認檔名正確(在 /app 資料夾裡)
|
| 23 |
|
| 24 |
-
# 特徵擷取函數
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def extract_features(text):
|
| 26 |
-
sentences =
|
| 27 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 28 |
words_clean = [w for w in words if w.isalpha()]
|
| 29 |
-
|
| 30 |
features = {}
|
| 31 |
features['text_length'] = len(text)
|
| 32 |
features['word_count'] = len(words_clean)
|
|
@@ -36,7 +41,7 @@ def extract_features(text):
|
|
| 36 |
|
| 37 |
unique_words = set(words_clean)
|
| 38 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 39 |
-
sentence_lengths = [len(
|
| 40 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 41 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 42 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
|
@@ -53,6 +58,7 @@ def extract_features(text):
|
|
| 53 |
|
| 54 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 55 |
|
|
|
|
| 56 |
# 預測函數
|
| 57 |
def predict(text):
|
| 58 |
if not text.strip():
|
|
|
|
# --- Tokenizers for extract_features() -------------------------------------
# TODO: move this import up to the top-of-file import block with the others.
# Importing before the model load so a missing nltk install fails fast,
# before the (potentially slow) joblib deserialization below.
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# --- Model loading ---------------------------------------------------------
# Load the trained model serialized with joblib.
# NOTE(review): confirm the filename matches the artifact shipped in the
# /app folder (original comment said the same in Chinese).
model = joblib.load("ai_detector_model.pkl")

# Sentence splitter used by extract_features().
# An *untrained* Punkt tokenizer runs with Punkt's built-in defaults, so it
# works without downloading the 'punkt' corpus — presumably the point of
# this change, given the `except LookupError` handler earlier in this file.
# NOTE(review): word_tokenize may still load the pretrained 'punkt' data
# internally on first use — verify the LookupError fallback covers that.
sent_tokenizer = PunktSentenceTokenizer()
| 30 |
def extract_features(text):
|
| 31 |
+
sentences = sent_tokenizer.tokenize(text)
|
| 32 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 33 |
words_clean = [w for w in words if w.isalpha()]
|
| 34 |
+
|
| 35 |
features = {}
|
| 36 |
features['text_length'] = len(text)
|
| 37 |
features['word_count'] = len(words_clean)
|
|
|
|
| 41 |
|
| 42 |
unique_words = set(words_clean)
|
| 43 |
features['vocabulary_richness'] = len(unique_words) / max(len(words_clean), 1)
|
| 44 |
+
sentence_lengths = [len(word_tokenize(s)) for s in sentences] # 改這裡
|
| 45 |
features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0
|
| 46 |
features['comma_density'] = text.count(',') / max(len(text), 1) * 1000
|
| 47 |
features['period_density'] = text.count('.') / max(len(text), 1) * 1000
|
|
|
|
| 58 |
|
| 59 |
return np.array(list(features.values())).reshape(1, -1), features
|
| 60 |
|
| 61 |
+
|
| 62 |
# 預測函數
|
| 63 |
def predict(text):
|
| 64 |
if not text.strip():
|