| import gradio as gr |
| import joblib |
| import numpy as np |
| import re |
|
|
| |
# Load the pre-trained scikit-learn-style classifier (pickled with joblib).
# NOTE(review): assumes ai_detector_model.pkl sits in the current working
# directory — confirm against the deployment layout. A missing file raises
# here at import time, before the UI starts.
model = joblib.load("ai_detector_model.pkl")
|
|
| |
def simple_sent_tokenize(text):
    """Split *text* into sentences.

    A sentence boundary is any whitespace run that immediately follows
    '.', '!' or '?'. Empty pieces are dropped, so blank input yields [].
    """
    stripped = text.strip()
    pieces = re.split(r'(?<=[.!?])\s+', stripped)
    return list(filter(None, pieces))
|
|
| |
def simple_word_tokenize(text):
    r"""Return every lowercased ``\w+`` token (letters, digits, underscore) in *text*."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)
|
|
def extract_features(text):
    """Compute stylometric features for *text*.

    Returns a tuple ``(X, features)`` where ``X`` is a ``(1, n)`` numpy
    array of the feature values in fixed insertion order (the order the
    model was trained on — do not reorder) and ``features`` is the
    underlying name -> value dict used for the explanation text.
    """
    sentences = simple_sent_tokenize(text)
    # Alphabetic tokens only: drops numbers and underscore-bearing tokens.
    words_clean = [w for w in simple_word_tokenize(text) if w.isalpha()]

    n_words = len(words_clean)
    n_chars = max(len(text), 1)

    # max(..., 1) guards every ratio against empty input.
    features = {
        'text_length': len(text),
        'word_count': n_words,
        'sentence_count': len(sentences),
        'avg_sentence_length': n_words / max(len(sentences), 1),
        'avg_word_length': np.mean([len(w) for w in words_clean]) if words_clean else 0,
        'vocabulary_richness': len(set(words_clean)) / max(n_words, 1),
    }

    # Variance of per-sentence token counts: uniform sentence lengths are
    # treated downstream as an AI-like trait.
    sentence_lengths = [len(simple_word_tokenize(s)) for s in sentences]
    features['sentence_length_variance'] = np.var(sentence_lengths) if sentence_lengths else 0

    # Punctuation densities, scaled to occurrences per 1000 characters.
    for mark, key in ((',', 'comma_density'), ('.', 'period_density'),
                      ('!', 'exclamation_density'), ('?', 'question_density')):
        features[key] = text.count(mark) / n_chars * 1000

    # Share of "complex" words (> 6 letters).
    features['complex_word_ratio'] = sum(1 for w in words_clean if len(w) > 6) / max(n_words, 1)

    # Substring counts of discourse markers (note: substring, not whole-word,
    # matching — mirrors the trained behavior).
    ai_markers = ['furthermore', 'moreover', 'additionally', 'consequently', 'therefore', 'thus', 'hence']
    human_markers = ['i think', 'i believe', 'personally', 'maybe', 'probably', 'actually', 'really']
    text_lower = text.lower()
    features['ai_marker_count'] = sum(text_lower.count(marker) for marker in ai_markers)
    features['human_marker_count'] = sum(text_lower.count(marker) for marker in human_markers)

    return np.array(list(features.values())).reshape(1, -1), features
|
|
def predict(text):
    """Classify *text* as AI-generated or human-written.

    Returns a human-readable (Chinese) report string carrying the label,
    the AI probability and a short list of supporting cues. Empty input
    and model errors are reported through the same string channel, never
    raised to the UI.
    """
    if not text.strip():
        return "請輸入文字內容。"

    try:
        X, feats = extract_features(text)

        # Prefer a calibrated probability when the estimator exposes one;
        # otherwise fall back to the hard 0/1 prediction.
        if hasattr(model, "predict_proba"):
            prob = model.predict_proba(X)[0][1]
        else:
            prob = model.predict(X)[0]

        label = "AI 生成" if prob > 0.5 else "人類撰寫"

        # Heuristic cues shown alongside the model's score; evaluation
        # order matches the report order.
        cues = [
            (feats['vocabulary_richness'] < 0.3, "詞彙多樣性較低"),
            (feats['sentence_length_variance'] < 10, "句子長度平均,像 AI"),
            (feats['ai_marker_count'] > feats['human_marker_count'], "包含常見 AI 連接詞"),
            (feats['human_marker_count'] > feats['ai_marker_count'], "包含主觀語氣詞"),
        ]
        reason = [msg for hit, msg in cues if hit]
        if not reason:
            reason = ["整體語言特徵與模型預測一致"]

        return f"預測結果:{label}\nAI 機率:{prob:.2%}\n判斷依據:{', '.join(reason)}"
    except Exception as e:
        return f"預測時出現錯誤: {str(e)}"
|
|
| |
# Wire the predictor into a single-textbox Gradio UI.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI / Human 判斷器",
    description="上傳的模型為 .pkl 格式,根據語言特徵分析並判斷文本來源"
)

# Starts the web server at import time.
# NOTE(review): consider guarding with `if __name__ == "__main__":` so the
# module can be imported without launching a server — confirm deployment needs.
demo.launch()
|
|
|
|
|
|
|
|
|
|
# NOTE(review): dead code — an earlier TensorFlow/Keras implementation kept
# alive only as a module-level string literal. It is never executed and has
# no runtime effect; prefer recovering it from version control and deleting
# this literal from the file.
'''import gradio as gr
import tensorflow as tf
import pickle

# ---------------- 載入模型 ----------------
try:
    model = tf.keras.models.load_model("AIDetect.h5")
    print("✅ 模型載入成功")
except Exception as e:
    print("❌ 模型載入失敗:", e)
    model = None

# ---------------- 載入詞彙 ----------------
try:
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    vectorized_layer = tf.keras.layers.TextVectorization(
        max_tokens=len(vocab)+1, output_sequence_length=50
    )
    vectorized_layer.set_vocabulary(vocab)
    print("✅ 詞彙載入成功")
except Exception as e:
    print("❌ 詞彙載入失敗:", e)
    vectorized_layer = None

# ---------------- 載入 scaler ----------------
try:
    with open("scaler.pkl", "rb") as f:
        scaler = pickle.load(f)
    print("✅ Scaler 載入成功")
except Exception as e:
    print("❌ Scaler 載入失敗:", e)
    scaler = None

# ---------------- 特徵計算 ----------------
def compute_features(text):
    if isinstance(text, tf.Tensor):
        text = text.numpy().decode('utf-8') if text.dtype == tf.string else str(text.numpy())
    elif isinstance(text, bytes):
        text = text.decode('utf-8')
    else:
        text = str(text)

    words = text.split()
    word_count = len(words)
    unique_words = len(set(words))
    unique_word_ratio = unique_words / (word_count + 1e-6)
    repeat_rate = 1 - unique_word_ratio
    punctuation_count = sum(1 for c in text if c in ".,!?;:")
    punctuation_ratio = punctuation_count / (len(text) + 1e-6)
    avg_word_length = sum(len(w) for w in words) / (word_count if word_count else 1)

    return [[word_count, unique_word_ratio, repeat_rate, punctuation_ratio, avg_word_length]]

# ---------------- 使用 scaler ----------------
def transform_features(feat):
    if scaler is None:
        return feat  # 如果 scaler 沒載入,就直接回傳原始特徵
    return scaler.transform(feat).tolist()  # 轉成 list,避免使用 NumPy

# ---------------- 生成解釋 ----------------
def explain_prediction(text):
    if model is None or vectorized_layer is None:
        return "❌ 模型或詞彙尚未載入,無法預測"

    try:
        # ---------------- 特徵計算 ----------------
        feat_raw = compute_features(text)
        feat = transform_features(feat_raw)

        # ---------------- 文字向量化 ----------------
        seq = vectorized_layer([text])
        seq = tf.keras.utils.pad_sequences(seq, maxlen=50, padding='pre')

        # 轉成 TensorFlow tensor
        seq = tf.convert_to_tensor(seq)
        feat = tf.convert_to_tensor(feat, dtype=tf.float32)

        # ---------------- 預測 ----------------
        pred_prob = model([seq, feat], training=False).numpy()[0][0]
        label = "AI 生成" if pred_prob >= 0.5 else "人類撰寫"
        prob = pred_prob * 100

        # ---------------- 判斷依據 ----------------
        reasons = []
        if feat_raw[0][0] > 100: reasons.append("句子長度偏長")
        if feat_raw[0][2] > 0.3: reasons.append("重複率高")
        if feat_raw[0][1] < 0.2: reasons.append("詞彙多樣性低")
        if feat_raw[0][3] < 0.01: reasons.append("標點符號少")
        if feat_raw[0][4] > 6: reasons.append("平均詞長偏長")
        if not reasons: reasons.append("句子長度與用詞平均")
        explanation = ";".join(reasons)

        return f"預測結果:{label}\nAI 機率:{prob:.2f}%\n判斷依據:{explanation}"

    except Exception as e:
        return f"❌ 預測時發生錯誤: {e}"

# ---------------- Gradio 介面 ----------------
iface = gr.Interface(
    fn=explain_prediction,
    inputs=gr.Textbox(label="請輸入文章內容", lines=15, max_lines=50, placeholder="在此輸入文章…"),
    outputs=gr.Textbox(label="預測結果", lines=15, max_lines=30, placeholder="結果會顯示在這裡…"),
    title="AI vs Human 文本判斷",
    description="輸入文章,模型會判斷是 AI 或人類撰寫,並給出機率與判斷依據"
)

iface.launch()'''
|
|