Spaces:

summerstars
/

Embedding

Sleeping

App Files Files Community

summerstars committed on Sep 25, 2025

Commit

629d674

verified ·

1 Parent(s): cb79eff

Update main.py

Browse files

Files changed (1) hide show

main.py +67 -68

main.py CHANGED Viewed

@@ -4,95 +4,100 @@ import torch
 import csv
 import os
-# --- 1. モデルのロード ---
-print("モデルをロードしています... (初回は時間がかかることがあります)")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 try:
-    model = SentenceTransformer("summerstars/MARK-Embedding", device=device)
     print(f"モデルのロードが完了しました。デバイス: {device}")
 except Exception as e:
     print(f"モデルのロード中にエラーが発生しました: {e}")
-    print("インターネット接続を確認するか、モデル名が正しいか確認してください。")
     exit()
-# --- 2. subset.csvから不適切表現サンプルを抽出 ---
-TOXIC_SAMPLES = []
-CSV_FILE_PATH = "subset.csv"
 # ファイルが存在するか確認
 if not os.path.exists(CSV_FILE_PATH):
     print(f"エラー: {CSV_FILE_PATH} が見つかりません。")
-    print("app.py と同じディレクトリに subset.csv を配置してください。")
-    exit() # ファイルがなければプログラムを終了
 try:
     with open(CSV_FILE_PATH, mode='r', encoding='utf-8') as f:
-        # ヘッダー付きCSVとして読み込む
         reader = csv.DictReader(f)
         for row in reader:
-            # 'Toxic' または 'Very Toxic' のスコアが1以上の場合、そのテキストをサンプルとして追加
-            # .get(key, 0) はキーが存在しない場合に0を返す安全な方法
-            if int(row.get('Toxic', 0)) > 0 or int(row.get('Very Toxic', 0)) > 0:
-                TOXIC_SAMPLES.append(row['text'])
 except Exception as e:
     print(f"CSVファイルの読み込み中にエラーが発生しました: {e}")
     exit()
-if not TOXIC_SAMPLES:
-    print("警告: CSVファイルから不適切表現のサンプルが見つかりませんでした。")
-    print("CSVファイルの'Toxic'または'Very Toxic'列に1以上の値を持つ行があるか確認してください。")
-    toxic_sample_embeddings = None
 else:
-    print(f"データセットから {len(TOXIC_SAMPLES)} 件の不適切表現サンプルを抽出しました。")
-    # 抽出したサンプルのベクトルを事前に計算しておく
-    print("不適切表現サンプルのベクトルを計算しています...")
-    toxic_sample_embeddings = model.encode(TOXIC_SAMPLES, convert_to_tensor=True, device=device)
     print("ベクトル計算が完了しました。")
-# --- 3. APIのコア機能となる関数を定義 ---
-def check_toxicity(text, threshold):
-    """入力テキストの不適切度を判定する関数"""
     if not text.strip():
-        return "⚪️ テキスト未入力", "テキストを入力してから判定実行ボタンを押してください。"
-    if toxic_sample_embeddings is None:
         return "判定不可", "判定用のサンプルデータがロードされていません。"
     # 入力テキストの埋め込みを計算
     text_embedding = model.encode(text, convert_to_tensor=True, device=device)
     # 入力テキストと全てのサンプルベクトルとのコサイン類似度を計算
-    cos_scores = util.cos_sim(text_embedding, toxic_sample_embeddings)[0]
     # 最も高い類似度スコアとそのインデックスを取得
     top_score, top_idx = torch.max(cos_scores, dim=0)
-    top_score = top_score.item()
-    top_idx = top_idx.item()
-    # 最も類似した不適切表現サンプル
-    most_similar_sample = TOXIC_SAMPLES[top_idx]
-    # しきい値と比較して判定
-    if top_score >= threshold:
-        judgment = "🔴 不適切の可能性あり"
         details = (
-            f"最も類似したサンプル文: '{most_similar_sample}'\n"
-            f"類似度スコア: {top_score:.4f}\n"
-            f"(判定しきい値: {threshold})"
         )
     else:
-        judgment = "🟢 問題なし"
         details = (
-            f"最も類似したサンプル文: '{most_similar_sample}'\n"
-            f"類似度スコア: {top_score:.4f}\n"
-            f"(判定しきい値: {threshold})"
         )
     return judgment, details
-# (以前の類似度計算とベクトル生成の関数はそのまま)
 def calculate_similarity(text1, text2):
     if not text1 or not text2: return 0.0
     embeddings = model.encode([text1, text2], convert_to_tensor=True)
@@ -107,39 +112,35 @@ def get_embeddings(texts):
     return {s: e.tolist() for s, e in zip(sentences, embeddings)}
-# --- 4. Gradioインターフェースの構築 ---
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
-        """
-        # テキスト解析API (`summerstars/MARK-Embedding`)
-        日本語のテキスト埋め込みモデルを使用して、文章の類似度計算、ベクトル化、不適切表現の判定を行います。
         """
     )
-    with gr.Tab("不適切表現判定"):
-        gr.Markdown("## テキストが不適切表現に類似しているかを判定します")
-        gr.Markdown(
-            "**仕組み:** `subset.csv`内の'Toxic'/'Very Toxic'ラベルが付いた文章と、入力テキストの意味的な近さ（コサイン類似度）を計算します。"
-            "文脈を完全に理解するわけではないため、誤判定の可能性があります。あくまで参考としてご利用ください。"
-        )
         with gr.Row():
-            toxicity_text_input = gr.Textbox(label="判定したいテキスト", lines=5, placeholder="ここに文章を入力してください...")
-        toxicity_threshold_slider = gr.Slider(
-            minimum=0.3, maximum=1.0, value=0.6, step=0.05, label="不適切と判定する類似度のしきい値"
         )
-        toxicity_check_button = gr.Button("判定実行", variant="primary")
         with gr.Row():
-            toxicity_judgment_output = gr.Textbox(label="判定結果", interactive=False, scale=1)
-            toxicity_details_output = gr.Textbox(label="判定詳細", interactive=False, scale=2)
-        toxicity_check_button.click(
-            fn=check_toxicity,
-            inputs=[toxicity_text_input, toxicity_threshold_slider],
-            outputs=[toxicity_judgment_output, toxicity_details_output]
         )
     with gr.Tab("文章の類似度計算"):
@@ -151,9 +152,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         calculate_button = gr.Button("類似度を計算", variant="primary")
         similarity_output = gr.Number(label="コサイン類似度スコア (値が高いほど類似)")
         calculate_button.click(
-            fn=calculate_similarity,
-            inputs=[text_input1, text_input2],
-            outputs=similarity_output
         )
     with gr.Tab("埋め込みベクトル生成"):
@@ -165,6 +164,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         embeddings_output = gr.JSON(label="生成された埋め込みベクトル")
         generate_button.click(fn=get_embeddings, inputs=texts_input, outputs=embeddings_output)
-# --- 5. Gradioアプリの起動 ---
 if __name__ == "__main__":
     demo.launch()

 import csv
 import os
+# --- 1. 設定項目 ---
+# 読み込むCSVファイル名
+CSV_FILE_PATH = "text_data.csv"
+# 埋め込みモデル
+MODEL_NAME = "summerstars/MARK-Embedding"
+# --- 2. モデルのロード ---
+print(f"モデル '{MODEL_NAME}' をロードしています...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 try:
+    model = SentenceTransformer(MODEL_NAME, device=device)
     print(f"モデルのロードが完了しました。デバイス: {device}")
 except Exception as e:
     print(f"モデルのロード中にエラーが発生しました: {e}")
     exit()
+# --- 3. CSVデータからサンプルを読み込み、ベクトル化 ---
+SAMPLE_TEXTS = []
+SAMPLE_CATEGORIES = []
 # ファイルが存在するか確認
 if not os.path.exists(CSV_FILE_PATH):
     print(f"エラー: {CSV_FILE_PATH} が見つかりません。")
+    print("app.py と同じディレクトリにCSVファイルを配置してください。")
+    exit()
 try:
     with open(CSV_FILE_PATH, mode='r', encoding='utf-8') as f:
         reader = csv.DictReader(f)
         for row in reader:
+            # '発言'と'カテゴリ'列が存在することを確認
+            if '発言' in row and 'カテゴリ' in row and row['発言'].strip():
+                SAMPLE_TEXTS.append(row['発言'])
+                SAMPLE_CATEGORIES.append(row['カテゴリ'])
 except Exception as e:
     print(f"CSVファイルの読み込み中にエラーが発生しました: {e}")
     exit()
+if not SAMPLE_TEXTS:
+    print(f"警告: {CSV_FILE_PATH} から判定用のサンプルが読み込めませんでした。")
+    print("CSVファイルに '発言' と 'カテゴリ' の列があり、データが含まれているか確認してください。")
+    sample_embeddings = None
 else:
+    print(f"データセットから {len(SAMPLE_TEXTS)} 件のサンプルを読み込みました。")
+    print("サンプルのベクトルを計算しています...")
+    sample_embeddings = model.encode(SAMPLE_TEXTS, convert_to_tensor=True, device=device)
     print("ベクトル計算が完了しました。")
+# --- 4. APIのコア機能となる関数を定義 ---
+def check_category_similarity(text, threshold):
+    """入力テキストがどのカテゴリの発言に最も類似しているかを判定する関数"""
     if not text.strip():
+        return "⚪️ テキスト未入力", "テキストを入力してください。"
+    if sample_embeddings is None:
         return "判定不可", "判定用のサンプルデータがロードされていません。"
     # 入力テキストの埋め込みを計算
     text_embedding = model.encode(text, convert_to_tensor=True, device=device)
     # 入力テキストと全てのサンプルベクトルとのコサイン類似度を計算
+    cos_scores = util.cos_sim(text_embedding, sample_embeddings)[0]
     # 最も高い類似度スコアとそのインデックスを取得
     top_score, top_idx = torch.max(cos_scores, dim=0)
+    top_score_item = top_score.item()
+    top_idx_item = top_idx.item()
+    # 最も類似したサンプルとそのカテゴリ
+    most_similar_text = SAMPLE_TEXTS[top_idx_item]
+    most_similar_category = SAMPLE_CATEGORIES[top_idx_item]
+    # しきい値に基づいて判定メッセージを作成
+    if top_score_item >= threshold:
+        judgment = f"🔴 {most_similar_category} の可能性"
         details = (
+            f"入力文は「{most_similar_category}」カテゴリの発言に類似しています。\n\n"
+            f"最も類似したサンプル文: '{most_similar_text}'\n"
+            f"類似度スコア: {top_score_item:.4f} (しきい値: {threshold})"
         )
     else:
+        judgment = "🟢 特定のカテゴリとの強い類似性なし"
         details = (
+            f"最も近かったのは「{most_similar_category}」カテゴリの発言です。\n\n"
+            f"最も類似したサンプル文: '{most_similar_text}'\n"
+            f"類似度スコア: {top_score_item:.4f} (しきい値: {threshold})"
         )
     return judgment, details
+# (既存機能の関数は変更なし)
 def calculate_similarity(text1, text2):
     if not text1 or not text2: return 0.0
     embeddings = model.encode([text1, text2], convert_to_tensor=True)
     return {s: e.tolist() for s, e in zip(sentences, embeddings)}
+# --- 5. Gradioインターフェースの構築 ---
+with gr.Blocks(theme=gr.themes.Default()) as demo:
     gr.Markdown(
+        f"""
+        # テキストカテゴリ類似性判定API
+        `{MODEL_NAME}` を使用し、入力テキストが `{CSV_FILE_PATH}` 内のどのカテゴリの発言に類似しているかを判定します。
         """
     )
+    with gr.Tab("カテゴリ類似性判定"):
+        gr.Markdown("## テキストがどのカテゴリの発言に類似しているかを判定します")
         with gr.Row():
+            text_input = gr.Textbox(label="判定したいテキスト", lines=5, placeholder="ここに文章を入力してください...")
+        threshold_slider = gr.Slider(
+            minimum=0.3, maximum=1.0, value=0.6, step=0.05, label="判定のしきい値 (この値以上の類似度で警告)"
         )
+        check_button = gr.Button("判定実行", variant="primary")
         with gr.Row():
+            judgment_output = gr.Textbox(label="判定結果", interactive=False, scale=1)
+            details_output = gr.Textbox(label="判定詳細", interactive=False, scale=2, lines=4)
+        check_button.click(
+            fn=check_category_similarity,
+            inputs=[text_input, threshold_slider],
+            outputs=[judgment_output, details_output]
         )
     with gr.Tab("文章の類似度計算"):
         calculate_button = gr.Button("類似度を計算", variant="primary")
         similarity_output = gr.Number(label="コサイン類似度スコア (値が高いほど類似)")
         calculate_button.click(
+            fn=calculate_similarity, inputs=[text_input1, text_input2], outputs=similarity_output
         )
     with gr.Tab("埋め込みベクトル生成"):
         embeddings_output = gr.JSON(label="生成された埋め込みベクトル")
         generate_button.click(fn=get_embeddings, inputs=texts_input, outputs=embeddings_output)
+# --- 6. Gradioアプリの起動 ---
 if __name__ == "__main__":
     demo.launch()