Spaces:

oyasai
/

email-classifier-demo

Sleeping

App Files Files Community

oyasai committed on Mar 4, 2025

Commit

a01ff80

verified ·

1 Parent(s): 27d9468

Create app.py

Browse files

Files changed (1) hide show

app.py +270 -0

app.py ADDED Viewed

	@@ -0,0 +1,270 @@

+import gradio as gr
+import joblib
+import numpy as np
+import pandas as pd
+import os
+import re
+import json
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.pipeline import Pipeline
+import matplotlib.pyplot as plt
# Location of the serialized classifier pipeline, resolved relative to this file.
MODEL_PATH = os.path.join(
    os.path.dirname(__file__),
    "models",
    "email_classifier.pkl",
)
# Model loader
def load_model():
    """Return the classifier, loading it from MODEL_PATH when the file exists.

    When no file exists at MODEL_PATH, or when anything in the load path
    raises, a small fallback model is built via create_simple_model().

    NOTE(review): joblib.load unpickles the file, so MODEL_PATH must only
    ever point at a trusted artifact.
    """
    try:
        # No saved artifact: build the fallback model instead.
        if not os.path.exists(MODEL_PATH):
            print("モデルが見つからないため、簡易版を作成します")
            return create_simple_model()
        loaded = joblib.load(MODEL_PATH)
        print("事前学習済みモデルを読み込みました")
        return loaded
    except Exception as err:
        # Any failure above (including a corrupt file) ends up here.
        print(f"モデル読み込みエラー: {err}")
        return create_simple_model()
# Fallback model builder
def create_simple_model():
    """Train and return a tiny TF-IDF + random-forest demo classifier.

    The pipeline is fitted on eight hard-coded example emails (Japanese and
    English), labelled 0 for a legitimate inquiry and 1 for sales/spam.
    After training, the pipeline is written to MODEL_PATH on a best-effort
    basis: if the directory or file cannot be written, the error is logged
    and the in-memory model is still returned, so the app can start on a
    read-only filesystem.

    Returns:
        The fitted sklearn Pipeline with steps 'vectorizer' and 'classifier'.
    """
    # Sample training data
    emails = [
        "研究プロジェクトについての問い合わせです。詳細資料をご提供いただけますか？",
        "共同研究の可能性について相談したいです。来週お時間ありますか？",
        "I am interested in your research project. Could you provide more details?",
        "I would like to discuss potential collaboration. Are you available next week?",
        "特別価格でのご提供！今だけの限定キャンペーン！お早めに！",
        "緊急！あなただけの特別オファーです！今すぐクリック！",
        "SPECIAL OFFER! Limited time discount! Click now!",
        "URGENT! Exclusive deal just for you! Don't miss out!"
    ]
    # 0: legitimate inquiry, 1: sales / spam
    labels = [0, 0, 0, 0, 1, 1, 1, 1]
    # NOTE(review): the vectorizer uses its default word tokenizer; Japanese
    # text has no spaces between words, so confirm this tokenization is
    # adequate before relying on the Japanese samples.
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('classifier', RandomForestClassifier(n_estimators=50, random_state=42))
    ])
    pipeline.fit(emails, labels)
    # Persisting is only a cache for the next start-up; a write failure must
    # not discard the model that was just trained.
    try:
        os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        joblib.dump(pipeline, MODEL_PATH)
    except OSError as e:
        print(f"モデル保存エラー: {e}")
    return pipeline
# Load (or build) the classifier once at import time; predict_email_type reads this global.
model = load_model()
# Text preprocessing
def preprocess_text(text):
    """Lower-case *text* and collapse every whitespace run to one space.

    Leading and trailing whitespace is removed. Any non-string input
    yields an empty string.
    """
    if isinstance(text, str):
        # split() with no argument drops leading/trailing whitespace and
        # splits on whitespace runs, so re-joining normalizes the spacing.
        return " ".join(text.lower().split())
    return ""
# Prediction entry point
def predict_email_type(email_text):
    """Classify an email body with the module-level ``model``.

    Returns a dict with the keys:
        prediction      -- display label for the predicted class, or an
                           "input is empty" message for blank input
        legitimate_prob -- predict_proba column 0
        spam_prob       -- predict_proba column 1
        confidence      -- predict_proba entry indexed by the predicted label
        features        -- up to ten (term, weight) pairs taken from the
                           pipeline's 'vectorizer' step for this input,
                           sorted by descending weight; empty on failure
    """
    cleaned = preprocess_text(email_text)
    # Blank input: skip the model entirely and report zeros.
    if not cleaned:
        return {
            "prediction": "入力が空です",
            "legitimate_prob": 0,
            "spam_prob": 0,
            "confidence": 0,
            "features": []
        }

    batch = [cleaned]
    label = model.predict(batch)[0]
    proba = model.predict_proba(batch)[0]

    if label == 0:
        verdict = "正当な問い合わせ"
    else:
        verdict = "営業・スパム"
    result = {
        "prediction": verdict,
        "legitimate_prob": float(proba[0]),
        "spam_prob": float(proba[1]),
        "confidence": float(proba[label]),
    }

    # Feature breakdown is best-effort: any failure yields an empty list.
    try:
        tfidf = model.named_steps['vectorizer']
        vocab = tfidf.get_feature_names_out()
        row = tfidf.transform([cleaned])
        # Pair every non-zero column of the document vector with its term.
        weighted = [
            (vocab[col], row[0, col])
            for col in row.nonzero()[1]
            if col < len(vocab)
        ]
        top = sorted(weighted, key=lambda pair: pair[1], reverse=True)[:10]
        result["features"] = [(str(term), float(weight)) for term, weight in top]
    except Exception as err:
        result["features"] = []
        print(f"特徴量分析エラー: {err}")
    return result
# Visualization of the prediction result
def create_probability_chart(legitimate_prob, spam_prob):
    """Build a two-bar chart of the class probabilities.

    Args:
        legitimate_prob: probability of the "legitimate inquiry" class.
        spam_prob: probability of the "sales / spam" class.

    Returns:
        The matplotlib Figure holding the chart. The figure is drawn through
        its own Axes and unregistered from pyplot before being returned, so
        repeated calls in a long-running server do not accumulate open
        pyplot figures, and the result never depends on pyplot's global
        "current figure".
    """
    categories = ['正当な問い合わせ', '営業・スパム']
    values = [legitimate_prob, spam_prob]
    # NOTE(review): the labels are Japanese; confirm the deployment's
    # matplotlib font provides CJK glyphs, otherwise they render as boxes.
    fig, ax = plt.subplots(figsize=(8, 4))
    bars = ax.bar(categories, values, color=['green', 'red'])
    # Print the percentage just above each bar.
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val, f'{val:.2%}',
                ha='center', va='bottom')
    ax.set_ylim(0, 1.0)
    ax.set_ylabel('確率')
    ax.set_title('メール種類の予測確率')
    fig.tight_layout()
    # Drop pyplot's reference; the Figure object itself stays usable.
    plt.close(fig)
    return fig
# Built-in sample emails
def load_sample_emails():
    """Return the demo sample emails grouped by category.

    The result is a new dict on every call with the keys "legitimate" and
    "spam", each mapping to a list of one Japanese and one English example.
    """
    legitimate_examples = [
        "研究プロジェクトについて問い合わせします。貴研究所の量子コンピューティングに関する最新の成果に興味があり、詳細資料をいただけないでしょうか。よろしくお願いいたします。",
        "I am writing to inquire about your research on quantum computing. I am particularly interested in your recent findings and would appreciate any additional materials you could provide. Thank you for your consideration.",
    ]
    spam_examples = [
        "【緊急】特別キャンペーン実施中！今だけ50%OFF！このチャンスをお見逃しなく！今すぐクリックして特典をゲット！期間限定なのでお早めに！",
        "URGENT! SPECIAL OFFER! 50% OFF TODAY ONLY! Don't miss this amazing opportunity! Click now to claim your exclusive bonus! Limited time offer!",
    ]
    return {"legitimate": legitimate_examples, "spam": spam_examples}
# Gradio interface definition
def create_interface():
    """Build and return the Gradio Blocks UI for the email classifier.

    Layout: a text box with two sample-loading buttons and an analyze
    button on the left; a label, a probability plot and a Markdown list of
    feature terms on the right; usage notes underneath.
    """
    sample_pool = load_sample_emails()

    def pick_legitimate():
        # Fill the text box with a random legitimate-inquiry sample.
        return np.random.choice(sample_pool["legitimate"])

    def pick_spam():
        # Fill the text box with a random sales/spam sample.
        return np.random.choice(sample_pool["spam"])

    def process_email(text):
        # Run the classifier and shape its result for the three outputs.
        outcome = predict_email_type(text)
        legit_p = outcome["legitimate_prob"]
        spam_p = outcome["spam_prob"]
        # Data for the label component
        label_data = {
            "正当な問い合わせ": legit_p,
            "営業・スパム": spam_p
        }
        # Markdown list of the feature terms
        heading = "### 検出された重要な特徴語\n"
        top_features = outcome["features"]
        if top_features:
            body = "".join(
                f"- {term}: {weight:.4f}\n" for term, weight in top_features
            )
        else:
            body = "特徴語の分析に失敗しました。"
        # Probability chart
        chart = create_probability_chart(legit_p, spam_p)
        return label_data, chart, heading + body

    with gr.Blocks(title="メール判別システム") as demo:
        gr.Markdown("# 研究関連メールとスパム/営業メールの判別システム")
        gr.Markdown("テキストボックスにメール内容を入力して「分析」ボタンをクリックしてください。システムがそのメールが正当な研究問い合わせか、営業・スパムメールかを判定します。")
        with gr.Row():
            with gr.Column(scale=2):
                # Input area
                email_input = gr.Textbox(
                    label="メール内容",
                    placeholder="ここにメール本文を入力してください...",
                    lines=10
                )
                # Sample buttons
                with gr.Row():
                    legitimate_btn = gr.Button("研究問い合わせサンプル")
                    spam_btn = gr.Button("営業・スパムサンプル")
                analyze_btn = gr.Button("分析", variant="primary")
            with gr.Column(scale=1):
                # Result area
                result_label = gr.Label(label="判定結果")
                prob_chart = gr.Plot(label="確率分布")
                # Feature-term list
                features_md = gr.Markdown(label="重要な特徴語")
        # Wire up the buttons
        legitimate_btn.click(pick_legitimate, outputs=email_input)
        spam_btn.click(pick_spam, outputs=email_input)
        analyze_btn.click(
            process_email,
            inputs=email_input,
            outputs=[result_label, prob_chart, features_md]
        )
        gr.Markdown("""
        ## 使い方
        1. テキストボックスにメール内容を入力する（または「サンプル」ボタンをクリック）
        2. 「分析」ボタンをクリックして結果を確認
        ## システムについて
        このシステムは機械学習を使って、研究に関する正当な問い合わせと営業・スパムメールを自動的に判別します。
        判別結果は確率で表示され、どの単語や表現がその判断に寄与したかも表示されます。
        注意: このデモは学習データが限られているため、精度は実用レベルではありません。実際の運用では、より多くの学習データでモデルを調整する必要があります。
        """)
    return demo
# Build the interface at import time so the module exposes a `demo` object.
demo = create_interface()
# Running on Hugging Face Spaces: start the server only when run as a script.
if __name__ == "__main__":
    demo.launch()