from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import umap
from .openai_client import embed_texts, chat_summarize
from ..config import settings
@dataclass
class Result:
    """Output bundle of a TopicEngine.analyze() run."""
    summary: str                  # LLM-generated natural-language summary
    points: List[Dict[str, Any]]  # [{x,y,cluster,text}] — 2D coords per input text
    topics: List[Dict[str, Any]]  # [{id,size,top_terms}] — one entry per cluster
class TopicEngine:
    """Embed short texts, project to 2D, cluster, and summarize.

    Pipeline: OpenAI embeddings -> UMAP (2D) -> KMeans -> per-cluster
    TF-IDF topic terms -> LLM summary of cluster excerpts.
    """

    def __init__(self):
        # Unified config key name (embedding_model) — a previously misnamed
        # setting here raised AttributeError.
        self.embed_model = settings.embedding_model

    def _auto_k(self, n: int) -> int:
        """Heuristic cluster count: sqrt(N) clipped to 2..10.

        Fewer than 3 samples cannot be meaningfully clustered -> 1.
        """
        if n < 3:
            return 1
        return max(2, min(10, int(math.sqrt(n))))

    def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]:
        """Return [{id, size, top_terms}] for every cluster label.

        Top terms are the highest mean-TF-IDF vocabulary entries within
        each cluster; zero-weight entries are dropped.
        """
        vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
        X = vec.fit_transform(texts)
        vocab = np.array(vec.get_feature_names_out())
        topics: List[Dict[str, Any]] = []
        for cid in sorted(set(labels.tolist())):
            idx = np.where(labels == cid)[0]
            if len(idx) == 0:
                topics.append({"id": int(cid), "size": 0, "top_terms": []})
                continue
            mean_tfidf = np.asarray(X[idx].mean(axis=0)).ravel()
            if mean_tfidf.sum() > 0:
                # Normalize so term scores are comparable across clusters.
                mean_tfidf = mean_tfidf / (mean_tfidf.sum() + 1e-12)
            top_idx = mean_tfidf.argsort()[-topn:][::-1]
            # FIX: previously a cluster with an all-zero TF-IDF mean reported
            # arbitrary vocabulary (argsort of zeros); keep only terms that
            # actually carry weight.
            top_idx = [int(i) for i in top_idx if mean_tfidf[i] > 0]
            topics.append({
                "id": int(cid),
                "size": int(len(idx)),
                "top_terms": vocab[top_idx].tolist(),
            })
        return topics

    def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result:
        """Run the full pipeline on raw input lines.

        Args:
            texts: candidate lines; blanks are dropped after stripping.
            n_clusters: optional explicit cluster count; falls back to
                the sqrt(N) heuristic when absent or non-positive.

        Returns:
            Result with summary text, 2D points, and topic metadata.

        Raises:
            RuntimeError: when the embedding call returns a malformed matrix.
        """
        texts = [t.strip() for t in texts if t and t.strip()]
        if not texts:
            return Result("テキストが空です。1行1件で入力してください。", [], [])

        # ---- Embeddings ----
        embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32)
        if embs.ndim != 2 or embs.shape[0] != len(texts):
            raise RuntimeError("Embedding 取得に失敗しました。")

        # ---- 2D projection ----
        # FIX: UMAP errors/warns when n_neighbors >= n_samples; clamp it to
        # the sample count (floor of 2 keeps the manifold estimate sane).
        n_neighbors = min(15, max(2, len(texts) - 1))
        reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, metric="cosine", random_state=42)
        pts2d = reducer.fit_transform(normalize(embs))

        # ---- Clustering ----
        k = n_clusters if n_clusters and n_clusters > 0 else self._auto_k(len(texts))
        # FIX: a caller-supplied k above the sample count made KMeans raise
        # ValueError; clamp to len(texts).
        k = min(k, len(texts))
        if k <= 1 or len(texts) < 3:
            labels = np.zeros(len(texts), dtype=int)
        else:
            km = KMeans(n_clusters=k, n_init="auto", random_state=42)
            labels = km.fit_predict(embs)

        # ---- Topic term extraction ----
        topics = self._topic_terms(texts, labels, topn=5)

        # ---- Summary (LLM) ----
        by_c: Dict[int, List[str]] = {}
        for i, c in enumerate(labels.tolist()):
            by_c.setdefault(int(c), []).append(texts[i])
        sample_lines = []
        for cid, arr in by_c.items():
            # At most 5 sample lines per cluster (slicing is clip-safe).
            sample = "\n".join(f"- {s}" for s in arr[:5])
            t_terms = next((t["top_terms"] for t in topics if t["id"] == cid), [])
            sample_lines.append(f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}")
        prompt = (
            "以下はソーシャル投稿のクラスタごとの抜粋です。"
            "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、"
            "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines)
        )
        summary = chat_summarize(prompt, settings.chat_model)

        points = [
            {"x": float(pts2d[i, 0]), "y": float(pts2d[i, 1]), "cluster": int(labels[i]), "text": texts[i]}
            for i in range(len(texts))
        ]
        return Result(summary=summary, points=points, topics=topics)