from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List, Optional import math import numpy as np from sklearn.cluster import KMeans from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.preprocessing import normalize import umap from .openai_client import embed_texts, chat_summarize from ..config import settings @dataclass class Result: summary: str points: List[Dict[str, Any]] # [{x,y,cluster,text}] topics: List[Dict[str, Any]] # [{id,size,top_terms}] class TopicEngine: def __init__(self): # ← ここが今回のエラー発生箇所。設定名を統一(embedding_model)に修正済み self.embed_model = settings.embedding_model def _auto_k(self, n: int) -> int: if n < 3: return 1 # √N をベースに 2..10 にクリップ return max(2, min(10, int(math.sqrt(n)))) def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]: vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2)) X = vec.fit_transform(texts) vocab = np.array(vec.get_feature_names_out()) topics: List[Dict[str, Any]] = [] for cid in sorted(set(labels.tolist())): idx = np.where(labels == cid)[0] if len(idx) == 0: topics.append({"id": int(cid), "size": 0, "top_terms": []}) continue mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel() if mean_tfidf.sum() > 0: mean_tfidf = mean_tfidf / (mean_tfidf.sum() + 1e-12) top_idx = mean_tfidf.argsort()[-topn:][::-1] topics.append({ "id": int(cid), "size": int(len(idx)), "top_terms": vocab[top_idx].tolist(), }) return topics def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result: texts = [t.strip() for t in texts if t and t.strip()] if not texts: return Result("テキストが空です。1行1件で入力してください。", [], []) # ---- 埋め込み ---- embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32) if embs.ndim != 2 or embs.shape[0] != len(texts): raise RuntimeError("Embedding 取得に失敗しました。") # ---- 次元圧縮(2D)---- reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42) pts2d = reducer.fit_transform(normalize(embs)) # ---- クラスタリング ---- k = (n_clusters if n_clusters and n_clusters > 0 else self._auto_k(len(texts))) if k <= 1 or len(texts) < 3: labels = np.zeros(len(texts), dtype=int) else: km = KMeans(n_clusters=k, n_init='auto', random_state=42) labels = km.fit_predict(embs) # ---- トピック語抽出 ---- topics = self._topic_terms(texts, labels, topn=5) # ---- 要約(LLM)---- by_c: Dict[int, List[str]] = {} for i, c in enumerate(labels.tolist()): by_c.setdefault(int(c), []).append(texts[i]) sample_lines = [] for cid, arr in by_c.items(): sample = "\n".join(f"- {s}" for s in arr[: min(5, len(arr))]) t_terms = next((t["top_terms"] for t in topics if t["id"] == cid), []) sample_lines.append(f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}") prompt = ( "以下はソーシャル投稿のクラスタごとの抜粋です。" "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、" "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines) ) summary = chat_summarize(prompt, settings.chat_model) points = [ {"x": float(pts2d[i, 0]), "y": float(pts2d[i, 1]), "cluster": int(labels[i]), "text": texts[i]} for i in range(len(texts)) ] return Result(summary=summary, points=points, topics=topics)