from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import umap
from .openai_client import embed_texts, chat_summarize
from ..config import settings
@dataclass
class Result:
    """Output bundle of a TopicEngine.analyze() run."""
    summary: str                  # LLM-generated natural-language summary
    points: List[Dict[str, Any]]  # [{x,y,cluster,text}] — 2D coords per input text
    topics: List[Dict[str, Any]]  # [{id,size,top_terms}] — one entry per cluster
class TopicEngine:
    """Embed short texts, project to 2D, cluster, and summarize.

    Pipeline: OpenAI embeddings -> UMAP (2D) -> KMeans -> per-cluster
    TF-IDF topic terms -> LLM summary of cluster excerpts.
    """

    def __init__(self):
        # Unified config key name (embedding_model) — a previously misnamed
        # setting here raised AttributeError.
        self.embed_model = settings.embedding_model

    def _auto_k(self, n: int) -> int:
        """Heuristic cluster count: sqrt(N) clipped to 2..10.

        Fewer than 3 samples cannot be meaningfully clustered -> 1.
        """
        if n < 3:
            return 1
        return max(2, min(10, int(math.sqrt(n))))

    def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]:
        """Return [{id, size, top_terms}] for every cluster label.

        Top terms are the highest mean-TF-IDF vocabulary entries within
        each cluster; zero-weight entries are dropped.
        """
        vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
        X = vec.fit_transform(texts)
        vocab = np.array(vec.get_feature_names_out())
        topics: List[Dict[str, Any]] = []
        for cid in sorted(set(labels.tolist())):
            idx = np.where(labels == cid)[0]
            if len(idx) == 0:
                topics.append({"id": int(cid), "size": 0, "top_terms": []})
                continue
            mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
            if mean_tfidf.sum() > 0:
                # Normalize so term scores are comparable across clusters.
                mean_tfidf = mean_tfidf / (mean_tfidf.sum() + 1e-12)
            top_idx = mean_tfidf.argsort()[-topn:][::-1]
            # FIX: previously a cluster with an all-zero TF-IDF mean reported
            # arbitrary vocabulary (argsort of zeros); keep only terms that
            # actually carry weight.
            top_idx = [int(i) for i in top_idx if mean_tfidf[i] > 0]
            topics.append({
                "id": int(cid),
                "size": int(len(idx)),
                "top_terms": vocab[top_idx].tolist(),
            })
        return topics

    def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result:
        """Run the full pipeline on raw input lines.

        Args:
            texts: candidate lines; blanks are dropped after stripping.
            n_clusters: optional explicit cluster count; falls back to
                the sqrt(N) heuristic when absent or non-positive.

        Returns:
            Result with summary text, 2D points, and topic metadata.

        Raises:
            RuntimeError: when the embedding call returns a malformed matrix.
        """
        texts = [t.strip() for t in texts if t and t.strip()]
        if not texts:
            return Result("テキストが空です。1行1件で入力してください。", [], [])

        # ---- Embeddings ----
        embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32)
        if embs.ndim != 2 or embs.shape[0] != len(texts):
            raise RuntimeError("Embedding 取得に失敗しました。")

        # ---- 2D projection ----
        # FIX: UMAP errors/warns when n_neighbors >= n_samples; clamp it to
        # the sample count (floor of 2 keeps the manifold estimate sane).
        n_neighbors = min(15, max(2, len(texts) - 1))
        reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, metric="cosine", random_state=42)
        pts2d = reducer.fit_transform(normalize(embs))

        # ---- Clustering ----
        k = n_clusters if n_clusters and n_clusters > 0 else self._auto_k(len(texts))
        # FIX: a caller-supplied k above the sample count made KMeans raise
        # ValueError; clamp to len(texts).
        k = min(k, len(texts))
        if k <= 1 or len(texts) < 3:
            labels = np.zeros(len(texts), dtype=int)
        else:
            km = KMeans(n_clusters=k, n_init="auto", random_state=42)
            labels = km.fit_predict(embs)

        # ---- Topic term extraction ----
        topics = self._topic_terms(texts, labels, topn=5)

        # ---- Summary (LLM) ----
        by_c: Dict[int, List[str]] = {}
        for i, c in enumerate(labels.tolist()):
            by_c.setdefault(int(c), []).append(texts[i])
        sample_lines = []
        for cid, arr in by_c.items():
            # At most 5 sample lines per cluster (slicing is clip-safe).
            sample = "\n".join(f"- {s}" for s in arr[:5])
            t_terms = next((t["top_terms"] for t in topics if t["id"] == cid), [])
            sample_lines.append(f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}")
        prompt = (
            "以下はソーシャル投稿のクラスタごとの抜粋です。"
            "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、"
            "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines)
        )
        summary = chat_summarize(prompt, settings.chat_model)

        points = [
            {"x": float(pts2d[i, 0]), "y": float(pts2d[i, 1]), "cluster": int(labels[i]), "text": texts[i]}
            for i in range(len(texts))
        ]
        return Result(summary=summary, points=points, topics=topics)