from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import umap
from .openai_client import embed_texts, chat_summarize
from ..config import settings


@dataclass
class Result:
    """Outcome of one topic-analysis run.

    Field order is part of the interface: instances are also built
    positionally as ``Result(summary, points, topics)``.
    """

    # Free-text summary (or a user-facing message when there was no input).
    summary: str
    # One dict per input text with the keys x, y, cluster and text.
    points: List[Dict[str, Any]]
    # One dict per cluster with the keys id, size and top_terms.
    topics: List[Dict[str, Any]]


class TopicEngine:
    """Embed short texts, cluster them, and summarise each cluster.

    The pipeline in :meth:`analyze` is: embeddings -> 2-D layout ->
    KMeans clustering -> TF-IDF top terms per cluster -> LLM summary.
    """

    # Inputs with fewer samples than this are laid out with a plain PCA
    # instead of UMAP.
    # NOTE(review): UMAP's default spectral initialisation solves a sparse
    # eigenproblem that is reported to fail when only a handful of samples
    # are available; the exact cut-off depends on the installed
    # umap-learn / scipy versions. 7 is a conservative guess - confirm and
    # tune against the deployed versions.
    _MIN_UMAP_SAMPLES = 7

    def __init__(self):
        # This line was the source of an earlier error; the setting name has
        # since been unified to ``embedding_model``.
        self.embed_model = settings.embedding_model

    def _auto_k(self, n: int) -> int:
        """Pick a cluster count for ``n`` samples.

        Returns 1 for fewer than three samples, otherwise ``int(sqrt(n))``
        clipped to the range 2..10.
        """
        if n < 3:
            return 1
        return max(2, min(10, int(math.sqrt(n))))

    def _resolve_k(self, n: int, n_clusters: Optional[int]) -> int:
        """Return the cluster count to use for ``n`` samples.

        A positive ``n_clusters`` is honoured; anything else falls back to
        :meth:`_auto_k`. The result is clamped to ``1..n`` because KMeans
        cannot produce more clusters than there are samples.
        """
        requested = n_clusters if n_clusters and n_clusters > 0 else self._auto_k(n)
        return max(1, min(requested, n))

    @staticmethod
    def _pca_2d(vectors: np.ndarray) -> np.ndarray:
        """Project row vectors onto their first two principal components.

        Rows are L2-normalised first (all-zero rows are left as they are).
        The result has shape ``(n, 2)``; columns that cannot be computed
        (single sample, rank-deficient data) are filled with zeros.
        """
        data = np.asarray(vectors, dtype=np.float64)
        n = data.shape[0]
        coords = np.zeros((n, 2), dtype=np.float32)
        if n < 2:
            return coords
        norms = np.linalg.norm(data, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # avoid dividing all-zero rows by zero
        unit = data / norms
        centred = unit - unit.mean(axis=0, keepdims=True)
        # For centred data, U * S are the principal-component scores.
        u, s, _ = np.linalg.svd(centred, full_matrices=False)
        dims = min(2, s.shape[0])
        coords[:, :dims] = u[:, :dims] * s[:dims]
        return coords

    def _project_2d(self, vectors: np.ndarray) -> np.ndarray:
        """Reduce embedding rows to 2-D coordinates for plotting.

        Uses UMAP (cosine metric, fixed seed) when there are at least
        ``_MIN_UMAP_SAMPLES`` rows, and the PCA fallback otherwise.
        """
        n = vectors.shape[0]
        if n < self._MIN_UMAP_SAMPLES:
            return self._pca_2d(vectors)
        reducer = umap.UMAP(
            # Never ask for more neighbours than there are other points.
            n_neighbors=min(15, n - 1),
            min_dist=0.1,
            metric="cosine",
            random_state=42,
        )
        return reducer.fit_transform(normalize(vectors))

    def _topic_terms(self, texts: List[str], labels: np.ndarray, topn: int = 5) -> List[Dict[str, Any]]:
        """Describe every cluster by its highest mean-TF-IDF terms.

        Args:
            texts: The documents, aligned with ``labels``.
            labels: Cluster id of each document.
            topn: Maximum number of terms to report per cluster.

        Returns:
            One ``{"id", "size", "top_terms"}`` dict per distinct label, in
            ascending id order. ``top_terms`` only contains terms that
            actually occur in the cluster, so it may hold fewer than
            ``topn`` entries (or none).
        """
        vec = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
        try:
            X = vec.fit_transform(texts)
        except ValueError:
            # Raised when no document yields a usable token (empty
            # vocabulary); report the clusters without terms instead of
            # failing the whole analysis.
            X = None
        vocab = np.array(vec.get_feature_names_out()) if X is not None else None

        topics: List[Dict[str, Any]] = []
        for cid in sorted(set(labels.tolist())):
            members = np.flatnonzero(labels == cid)
            terms: List[str] = []
            if X is not None and topn > 0:
                mean_tfidf = np.asarray(X[members].mean(axis=0)).ravel()
                ranked = mean_tfidf.argsort()[::-1][:topn]
                # Drop zero-weight entries: those terms never appear in
                # this cluster and would only be padding.
                ranked = ranked[mean_tfidf[ranked] > 0]
                terms = vocab[ranked].tolist()
            topics.append({
                "id": int(cid),
                "size": int(len(members)),
                "top_terms": terms,
            })
        return topics

    def analyze(self, texts: List[str], n_clusters: Optional[int] = None) -> Result:
        """Cluster ``texts`` and produce a summary, plot points and topics.

        Args:
            texts: Input documents; blank or empty entries are dropped and
                the rest are stripped of surrounding whitespace.
            n_clusters: Desired number of clusters. ``None`` or a
                non-positive value selects it automatically; values larger
                than the number of texts are clamped.

        Returns:
            A ``Result``. When no non-blank text remains, the result
            carries only a user-facing message and empty lists.

        Raises:
            RuntimeError: If the embeddings do not form a 2-D array with
                one row per text.
        """
        texts = [s for s in (t.strip() for t in texts if t) if s]
        if not texts:
            return Result("テキストが空です。1行1件で入力してください。", [], [])
        n = len(texts)

        # ---- Embeddings ----
        embs = np.array(embed_texts(texts, self.embed_model), dtype=np.float32)
        if embs.ndim != 2 or embs.shape[0] != n:
            raise RuntimeError("Embedding 取得に失敗しました。")

        # ---- Dimensionality reduction (2-D) ----
        pts2d = self._project_2d(embs)

        # ---- Clustering ----
        k = self._resolve_k(n, n_clusters)
        if k <= 1 or n < 3:
            labels = np.zeros(n, dtype=int)
        else:
            km = KMeans(n_clusters=k, n_init='auto', random_state=42)
            labels = km.fit_predict(embs)

        # ---- Topic term extraction ----
        topics = self._topic_terms(texts, labels, topn=5)
        terms_by_id = {t["id"]: t["top_terms"] for t in topics}

        # ---- Summary (LLM) ----
        # Group texts by cluster, keeping the order in which clusters first
        # appear in the input.
        by_c: Dict[int, List[str]] = {}
        for text, c in zip(texts, labels.tolist()):
            by_c.setdefault(int(c), []).append(text)

        sample_lines = []
        for cid, arr in by_c.items():
            # At most five example posts per cluster go into the prompt.
            sample = "\n".join(f"- {s}" for s in arr[:5])
            t_terms = terms_by_id.get(cid, [])
            sample_lines.append(f"Cluster {cid} (size={len(arr)}, terms={', '.join(t_terms)}):\n{sample}")

        prompt = (
            "以下はソーシャル投稿のクラスタごとの抜粋です。"
            "それぞれの傾向・関心・感情を日本語で3〜5行に要約し、"
            "最後に全体傾向を1行で総括してください。\n\n" + "\n\n".join(sample_lines)
        )
        # presumably returns the model's reply text - verify against
        # chat_summarize in openai_client.
        summary = chat_summarize(prompt, settings.chat_model)

        points = [
            {"x": float(x), "y": float(y), "cluster": int(c), "text": t}
            for x, y, c, t in zip(pts2d[:, 0], pts2d[:, 1], labels, texts)
        ]
        return Result(summary=summary, points=points, topics=topics)