# NOTE(review): the original paste began with "Spaces:" / "Sleeping" / "Sleeping" —
# Hugging Face Spaces page residue from the scrape, not part of the module.
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np
import umap
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

from .openai_client import chat_summarize, embed_texts
from ..config import settings
@dataclass
class Result:
    """Analysis output bundle returned by TopicEngine.analyze.

    Fix: the @dataclass decorator was missing, so both construction sites
    (`Result("…", [], [])` and `Result(summary=…, points=…, topics=…)`)
    raised TypeError — a plain class with only annotations has no such
    __init__. `dataclass` is already imported at the top of the file.
    """

    summary: str                   # LLM-generated Japanese summary text
    points: List[Dict[str, Any]]   # one per input text: {x, y, cluster, text}
    topics: List[Dict[str, Any]]   # one per cluster: {id, size, top_terms}
class TopicEngine:
    """Clusters short texts via OpenAI embeddings, projects them to 2-D with
    UMAP, labels clusters with TF-IDF terms, and summarizes them via an LLM."""

    def __init__(self):
        # Unified config name (embedding_model) — the previous attribute name
        # raised here; keep this in sync with ``settings``.
        self.embed_model = settings.embedding_model

    def _auto_k(self, n: int) -> int:
        """Heuristic cluster count for *n* texts: sqrt(N) clipped to 2..10.

        Fewer than 3 texts always form a single cluster.
        """
        if n < 3:
            return 1
        return max(2, min(10, int(math.sqrt(n))))

    def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]:
        """Return ``[{id, size, top_terms}]`` for each cluster id in *labels*.

        Terms are ranked by the cluster's mean TF-IDF weight over a shared
        (unigram + bigram) vocabulary capped at 2000 features.
        """
        vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
        X = vec.fit_transform(texts)
        vocab = np.array(vec.get_feature_names_out())
        topics: List[Dict[str, Any]] = []
        for cid in sorted(set(labels.tolist())):
            idx = np.where(labels == cid)[0]
            if len(idx) == 0:
                topics.append({"id": int(cid), "size": 0, "top_terms": []})
                continue
            mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
            if mean_tfidf.sum() > 0:
                mean_tfidf = mean_tfidf / (mean_tfidf.sum() + 1e-12)
            # Fix: rank descending and DROP zero-weight entries — the old
            # unconditional argsort()[-topn:] could surface vocabulary terms
            # with zero TF-IDF when a cluster had fewer than topn real terms.
            order = mean_tfidf.argsort()[::-1][:topn]
            top_idx = [int(i) for i in order if mean_tfidf[i] > 0]
            topics.append({
                "id": int(cid),
                "size": int(len(idx)),
                "top_terms": vocab[top_idx].tolist(),
            })
        return topics

    def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result:
        """Embed, cluster, and summarize *texts* (one post per list element).

        ``n_clusters`` overrides the automatic sqrt-N heuristic when > 0; it
        is clamped to the number of texts. Raises RuntimeError when the
        embedding call returns an unexpected shape.
        """
        texts = [t.strip() for t in texts if t and t.strip()]
        if not texts:
            return Result("テキストが空です。1行1件で入力してください。", [], [])
        # ---- Embeddings ----
        embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32)
        if embs.ndim != 2 or embs.shape[0] != len(texts):
            raise RuntimeError("Embedding 取得に失敗しました。")
        # ---- 2-D projection (for the scatter plot) ----
        reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
        pts2d = reducer.fit_transform(normalize(embs))
        # ---- Clustering ----
        k = (n_clusters if n_clusters and n_clusters > 0 else self._auto_k(len(texts)))
        # Fix: clamp a user-supplied k to the sample count — KMeans raises
        # ValueError when n_clusters exceeds the number of samples.
        k = min(k, len(texts))
        if k <= 1 or len(texts) < 3:
            labels = np.zeros(len(texts), dtype=int)
        else:
            km = KMeans(n_clusters=k, n_init='auto', random_state=42)
            labels = km.fit_predict(embs)
        # ---- Topic-term extraction ----
        topics = self._topic_terms(texts, labels, topn=5)
        # ---- LLM summary: up to 5 sample lines per cluster ----
        by_c: Dict[int, List[str]] = {}
        for i, c in enumerate(labels.tolist()):
            by_c.setdefault(int(c), []).append(texts[i])
        sample_lines = []
        for cid, arr in by_c.items():
            sample = "\n".join(f"- {s}" for s in arr[: min(5, len(arr))])
            t_terms = next((t["top_terms"] for t in topics if t["id"] == cid), [])
            sample_lines.append(f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}")
        prompt = (
            "以下はソーシャル投稿のクラスタごとの抜粋です。"
            "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、"
            "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines)
        )
        summary = chat_summarize(prompt, settings.chat_model)
        points = [
            {"x": float(pts2d[i, 0]), "y": float(pts2d[i, 1]), "cluster": int(labels[i]), "text": texts[i]}
            for i in range(len(texts))
        ]
        return Result(summary=summary, points=points, topics=topics)