# app/lib/topic.py — updated by Corin1998 (commit 8c31ab5)
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import umap
from .openai_client import embed_texts, chat_summarize
from ..config import settings
@dataclass
class Result:
    """Container for one topic-analysis run (returned by TopicEngine.analyze)."""

    # Human-readable summary text; holds an error message when the input was empty.
    summary: str
    # One entry per input text: {x, y, cluster, text} with 2-D projection coords.
    points: List[Dict[str, Any]]  # [{x,y,cluster,text}]
    # One entry per cluster: {id, size, top_terms}.
    topics: List[Dict[str, Any]]  # [{id,size,top_terms}]
class TopicEngine:
    """Pipeline over short texts: embed -> 2-D projection -> cluster -> topic terms -> LLM summary."""

    def __init__(self):
        # Unified config key name (embedding_model) — earlier crash site per the original note.
        self.embed_model = settings.embedding_model

    def _auto_k(self, n: int) -> int:
        """Heuristic cluster count for *n* samples: sqrt(N) clipped to [2, 10]; 1 when n < 3."""
        if n < 3:
            return 1
        return max(2, min(10, int(math.sqrt(n))))

    def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]:
        """Extract the top-*topn* TF-IDF terms for each cluster label.

        Returns one dict per cluster id: {"id", "size", "top_terms"}.
        """
        vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
        X = vec.fit_transform(texts)
        vocab = np.array(vec.get_feature_names_out())
        topics: List[Dict[str, Any]] = []
        for cid in sorted(set(labels.tolist())):
            idx = np.where(labels == cid)[0]
            if len(idx) == 0:
                topics.append({"id": int(cid), "size": 0, "top_terms": []})
                continue
            mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
            # Normalize for interpretability; the argsort ranking below is unaffected.
            if mean_tfidf.sum() > 0:
                mean_tfidf = mean_tfidf / (mean_tfidf.sum() + 1e-12)
            top_idx = mean_tfidf.argsort()[-topn:][::-1]
            topics.append({
                "id": int(cid),
                "size": int(len(idx)),
                "top_terms": vocab[top_idx].tolist(),
            })
        return topics

    def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result:
        """Run the full analysis pipeline on raw input lines.

        Args:
            texts: candidate lines; blank/whitespace-only entries are dropped.
            n_clusters: fixed cluster count; auto-chosen via _auto_k when falsy.

        Returns:
            Result with the LLM summary, per-text 2-D points, and per-cluster terms.

        Raises:
            RuntimeError: when the embedding call returns a malformed array.
        """
        texts = [t.strip() for t in texts if t and t.strip()]
        if not texts:
            return Result("テキストが空です。1行1件で入力してください。", [], [])
        n = len(texts)

        # ---- Embeddings ----
        embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32)
        if embs.ndim != 2 or embs.shape[0] != n:
            raise RuntimeError("Embedding 取得に失敗しました。")

        # ---- 2-D projection ----
        if n >= 3:
            # FIX: n_neighbors must stay below the sample count, or UMAP
            # warns/raises on small inputs; clamp it to n - 1.
            reducer = umap.UMAP(
                n_neighbors=min(15, n - 1),
                min_dist=0.1,
                metric="cosine",
                random_state=42,
            )
            pts2d = reducer.fit_transform(normalize(embs))
        else:
            # FIX: too few samples for a meaningful UMAP fit (it can fail for
            # n < 3) — lay the points out on a line instead of crashing.
            pts2d = np.column_stack(
                [np.arange(n, dtype=np.float32), np.zeros(n, dtype=np.float32)]
            )

        # ---- Clustering ----
        k = n_clusters if n_clusters and n_clusters > 0 else self._auto_k(n)
        k = min(k, n)  # FIX: KMeans raises when n_clusters exceeds the sample count
        if k <= 1 or n < 3:
            labels = np.zeros(n, dtype=int)
        else:
            km = KMeans(n_clusters=k, n_init='auto', random_state=42)
            labels = km.fit_predict(embs)

        # ---- Topic term extraction ----
        topics = self._topic_terms(texts, labels, topn=5)

        # ---- LLM summary: show up to 5 sample lines per cluster ----
        by_c: Dict[int, List[str]] = {}
        for i, c in enumerate(labels.tolist()):
            by_c.setdefault(int(c), []).append(texts[i])
        sample_lines = []
        for cid, arr in by_c.items():
            sample = "\n".join(f"- {s}" for s in arr[:5])
            t_terms = next((t["top_terms"] for t in topics if t["id"] == cid), [])
            sample_lines.append(
                f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}"
            )
        prompt = (
            "以下はソーシャル投稿のクラスタごとの抜粋です。"
            "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、"
            "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines)
        )
        summary = chat_summarize(prompt, settings.chat_model)

        points = [
            {"x": float(pts2d[i, 0]), "y": float(pts2d[i, 1]), "cluster": int(labels[i]), "text": texts[i]}
            for i in range(n)
        ]
        return Result(summary=summary, points=points, topics=topics)