Spaces:

summerstars
/

Embedding

Sleeping

App Files Files Community

summerstars committed on Sep 25, 2025

Commit

f930370

verified ·

1 Parent(s): 232711a

Update main.py

Browse files

Files changed (1) hide show

main.py +125 -65

main.py CHANGED Viewed

@@ -1,84 +1,155 @@
 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
 import torch
-import numpy as np
 # --- 1. モデルのロード ---
-# スクリプト起動時に一度だけモデルをロードする
 print("モデルをロードしています... (初回は時間がかかることがあります)")
-# GPUが利用可能ならGPUを、そうでなければCPUを使用
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model = SentenceTransformer("summerstars/MARK-Embedding", device=device)
-print(f"モデルのロードが完了しました。デバイス: {device}")
-# --- 2. APIのコア機能となる関数を定義 ---
 def calculate_similarity(text1, text2):
-    """
-    2つのテキストのコサイン類似度を計算する関数
-    """
-    if not text1 or not text2:
-        return 0.0
-    # テキストをリストにまとめる
-    sentences = [text1, text2]
-    # モデルを使って埋め込みベクトルを計算
-    # convert_to_tensor=TrueでPyTorchテンソルとして結果を得る
-    embeddings = model.encode(sentences, convert_to_tensor=True)
-    # コサイン類似度を計算
     cosine_scores = util.cos_sim(embeddings[0], embeddings[1])
-    # テンソルからfloat値を取り出して返す
     return round(cosine_scores.item(), 4)
 def get_embeddings(texts):
-    """
-    改行区切りのテキストから埋め込みベクトルを生成する関数
-    """
-    if not texts.strip():
-        return {}
-    # 改行でテキストを分割し、空行は無視する
     sentences = [s.strip() for s in texts.strip().split('\n') if s.strip()]
-    if not sentences:
-        return {}
-    # 埋め込みベクトルを計算
-    # convert_to_numpy=TrueでNumpy配列として結果を得る
     embeddings = model.encode(sentences, convert_to_numpy=True)
-    # テキストとベクトルのペアを辞書形式で作成
-    # JSONで扱いやすいようにNumpy配列をリストに変換
-    response_data = {
-        sentence: embedding.tolist()
-        for sentence, embedding in zip(sentences, embeddings)
-    }
-    return response_data
-# --- 3. Gradioインターフェースの構築 ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # 埋め込みモデル `summerstars/MARK-Embedding` API
-        日本語のテキスト埋め込みモデルを使用して、文章の類似度計算やベクトル化を行うことができます。
         """
     )
     with gr.Tab("文章の類似度計算"):
         gr.Markdown("## 2つの文章を入力して類似度を計算します")
         with gr.Row():
             text_input1 = gr.Textbox(label="文章1", lines=3, placeholder="例: 今日の天気は晴れです。")
             text_input2 = gr.Textbox(label="文章2", lines=3, placeholder="例: 今日は良い天気ですね。")
         calculate_button = gr.Button("類似度を計算", variant="primary")
         similarity_output = gr.Number(label="コサイン類似度スコア (値が高いほど類似)")
         calculate_button.click(
             fn=calculate_similarity,
             inputs=[text_input1, text_input2],
@@ -86,25 +157,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         )
     with gr.Tab("埋め込みベクトル生成"):
         gr.Markdown("## テキスト（改行区切り）を入力して埋め込みベクトルを生成します")
         with gr.Row():
-            texts_input = gr.Textbox(
-                label="テキスト入力 (1行に1つの文章)",
-                lines=5,
-                placeholder="""犬が公園を走っている。
-猫が窓際で日向ぼっこをしている。
-明日の会議の資料を準備する。"""
-            )
         generate_button = gr.Button("ベクトルを生成", variant="primary")
         embeddings_output = gr.JSON(label="生成された埋め込みベクトル")
-        generate_button.click(
-            fn=get_embeddings,
-            inputs=texts_input,
-            outputs=embeddings_output
-        )
-# --- 4. Gradioアプリの起動 ---
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
 import torch
+import csv
+import os
 # --- 1. Load the model ---
 print("モデルをロードしています... (初回は時間がかかることがあります)")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
+try:
+    # Keep the try body to the single call that can fail.
+    model = SentenceTransformer("summerstars/MARK-Embedding", device=device)
+except Exception as e:
+    # Top-level boundary: report the failure and stop, because nothing below
+    # can work without the model.
+    print(f"モデルのロード中にエラーが発生しました: {e}")
+    print("インターネット接続を確認するか、モデル名が正しいか確認してください。")
+    # exit() is the interactive helper installed by the site module and ends
+    # the process with status 0; raise SystemExit with a non-zero status so
+    # the failure is visible to whatever launched the script.
+    raise SystemExit(1)
+else:
+    print(f"モデルのロードが完了しました。デバイス: {device}")
+# --- 2. Extract toxic sample sentences from subset.csv ---
+CSV_FILE_PATH = "subset.csv"
+
+
+def _is_flagged(value):
+    """Return True when a CSV cell holds a numeric score of 1 or more.
+
+    Blank, missing (None) or non-numeric cells count as "not flagged", so a
+    single malformed row cannot abort startup.
+    """
+    try:
+        return float(value) >= 1
+    except (TypeError, ValueError):
+        return False
+
+
+def _load_toxic_samples(csv_path):
+    """Return the 'text' of each row whose 'Toxic' or 'Very Toxic' score is >= 1.
+
+    The file is read as a CSV with a header row. Rows with no text are skipped.
+    Raises OSError (including FileNotFoundError), UnicodeDecodeError or
+    csv.Error when the file cannot be read.
+    """
+    # newline='' is the csv module's recommended way to open files so that
+    # line breaks inside quoted fields are preserved.
+    with open(csv_path, mode='r', encoding='utf-8', newline='') as f:
+        samples = []
+        for row in csv.DictReader(f):
+            sample_text = row.get('text')
+            if not sample_text:
+                continue
+            if _is_flagged(row.get('Toxic')) or _is_flagged(row.get('Very Toxic')):
+                samples.append(sample_text)
+        return samples
+
+
+try:
+    TOXIC_SAMPLES = _load_toxic_samples(CSV_FILE_PATH)
+except FileNotFoundError:
+    print(f"エラー: {CSV_FILE_PATH} が見つかりません。")
+    print("main.py と同じディレクトリに subset.csv を配置してください。")
+    # Non-zero status: the app cannot run without the sample file.
+    raise SystemExit(1)
+except (OSError, UnicodeDecodeError, csv.Error) as e:
+    print(f"CSVファイルの読み込み中にエラーが発生しました: {e}")
+    raise SystemExit(1)
+if not TOXIC_SAMPLES:
+    print("警告: CSVファイルから不適切表現のサンプルが見つかりませんでした。")
+    print("CSVファイルの'Toxic'または'Very Toxic'列に1以上の値を持つ行があるか確認してください。")
+    # check_toxicity tests for None to report that no samples are loaded.
+    toxic_sample_embeddings = None
+else:
+    print(f"データセットから {len(TOXIC_SAMPLES)} 件の不適切表現サンプルを抽出しました。")
+    # Encode the samples once at startup so each request only encodes its input.
+    print("不適切表現サンプルのベクトルを計算しています...")
+    toxic_sample_embeddings = model.encode(TOXIC_SAMPLES, convert_to_tensor=True, device=device)
+    print("ベクトル計算が完了しました。")
+# --- 3. Define the core API functions ---
+def check_toxicity(text, threshold):
+    """Judge whether *text* is close to any pre-encoded toxic sample.
+
+    The input is encoded with the module-level model and compared by cosine
+    similarity against ``toxic_sample_embeddings``. The best-matching sample
+    decides the verdict.
+
+    Args:
+        text: Text to check.
+        threshold: Similarity at or above which the text is flagged.
+
+    Returns:
+        A ``(judgment, details)`` tuple of display strings.
+    """
+    # NOTE(review): also guards against None (e.g. a call that omits the
+    # field), which would otherwise raise AttributeError on .strip().
+    if not text or not text.strip():
+        return "⚪️ テキスト未入力", "テキストを入力してから判定実行ボタンを押してください。"
+    if toxic_sample_embeddings is None:
+        return "判定不可", "判定用のサンプルデータがロードされていません。"
+    text_embedding = model.encode(text, convert_to_tensor=True, device=device)
+    # cos_sim returns a matrix; row 0 holds the scores for the single input.
+    cos_scores = util.cos_sim(text_embedding, toxic_sample_embeddings)[0]
+    top_score, top_idx = torch.max(cos_scores, dim=0)
+    top_score = top_score.item()
+    most_similar_sample = TOXIC_SAMPLES[top_idx.item()]
+    if top_score >= threshold:
+        judgment = "🔴 不適切の可能性あり"
+    else:
+        judgment = "🟢 問題なし"
+    # The details text is the same for both verdicts, so build it once.
+    details = (
+        f"最も類似したサンプル文: '{most_similar_sample}'\n"
+        f"類似度スコア: {top_score:.4f}\n"
+        f"(判定しきい値: {threshold})"
+    )
+    return judgment, details
+# (The similarity and embedding functions below are carried over as before.)
 def calculate_similarity(text1, text2):
+    """Return the cosine similarity of two texts, rounded to 4 decimals.
+
+    Returns 0.0 without touching the model when either text is empty.
+    """
+    if not (text1 and text2):
+        return 0.0
+    first_vec, second_vec = model.encode([text1, text2], convert_to_tensor=True)
+    score = util.cos_sim(first_vec, second_vec)
+    return round(score.item(), 4)
 def get_embeddings(texts):
+    """Map each non-blank line of *texts* to its embedding as a plain list.
+
+    Lines are stripped and blank lines are ignored. An input with no usable
+    lines yields an empty dict. Because the result is keyed by sentence,
+    repeated lines end up as a single entry.
+    """
+    stripped_lines = (line.strip() for line in texts.strip().split('\n'))
+    sentences = [line for line in stripped_lines if line]
+    if not sentences:
+        return {}
+    vectors = model.encode(sentences, convert_to_numpy=True)
+    return dict(zip(sentences, (vec.tolist() for vec in vectors)))
+# --- 4. Gradioインターフェースの構築 ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # テキスト解析API (`summerstars/MARK-Embedding`)
+        日本語のテキスト埋め込みモデルを使用して、文章の類似度計算、ベクトル化、不適切表現の判定を行います。
         """
     )
+    with gr.Tab("不適切表現判定"):
+        gr.Markdown("## テキストが不適切表現に類似しているかを判定します")
+        gr.Markdown(
+            "**仕組み:** `subset.csv`内の'Toxic'/'Very Toxic'ラベルが付いた文章と、入力テキストの意味的な近さ（コサイン類似度）を計算します。"
+            "文脈を完全に理解するわけではないため、誤判定の可能性があります。あくまで参考としてご利用ください。"
+        )
+        with gr.Row():
+            toxicity_text_input = gr.Textbox(label="判定したいテキスト", lines=5, placeholder="ここに文章を入力してください...")
+        toxicity_threshold_slider = gr.Slider(
+            minimum=0.3, maximum=1.0, value=0.6, step=0.05, label="不適切と判定する類似度のしきい値"
+        )
+        toxicity_check_button = gr.Button("判定実行", variant="primary")
+        with gr.Row():
+            toxicity_judgment_output = gr.Textbox(label="判定結果", interactive=False, scale=1)
+            toxicity_details_output = gr.Textbox(label="判定詳細", interactive=False, scale=2)
+        toxicity_check_button.click(
+            fn=check_toxicity,
+            inputs=[toxicity_text_input, toxicity_threshold_slider],
+            outputs=[toxicity_judgment_output, toxicity_details_output]
+        )
     with gr.Tab("文章の類似度計算"):
+        # (変更なし)
         gr.Markdown("## 2つの文章を入力して類似度を計算します")
         with gr.Row():
             text_input1 = gr.Textbox(label="文章1", lines=3, placeholder="例: 今日の天気は晴れです。")
             text_input2 = gr.Textbox(label="文章2", lines=3, placeholder="例: 今日は良い天気ですね。")
         calculate_button = gr.Button("類似度を計算", variant="primary")
         similarity_output = gr.Number(label="コサイン類似度スコア (値が高いほど類似)")
         calculate_button.click(
             fn=calculate_similarity,
             inputs=[text_input1, text_input2],
         )
     with gr.Tab("埋め込みベクトル生成"):
+        # (変更なし)
         gr.Markdown("## テキスト（改行区切り）を入力して埋め込みベクトルを生成します")
         with gr.Row():
+            texts_input = gr.Textbox(label="テキスト入力 (1行に1つの文章)", lines=5, placeholder="犬が公園を走っている。\n猫が窓際で日向ぼっこをしている。")
         generate_button = gr.Button("ベクトルを生成", variant="primary")
         embeddings_output = gr.JSON(label="生成された埋め込みベクトル")
+        generate_button.click(fn=get_embeddings, inputs=texts_input, outputs=embeddings_output)
+# --- 5. Launch the Gradio app ---
+# Start the app only when this file is run as a script, not when imported.
 if __name__ == "__main__":
     demo.launch()