""" topic_modeler.py — BERTopic wrapper with small-dataset fallback. Problem with the original: BERTopic uses HDBSCAN for clustering. HDBSCAN has a min_cluster_size parameter that defaults to 10 in BERTopic — meaning it needs at least 10 documents just to form a single cluster. Below that, EVERYTHING gets assigned topic -1 (outlier) and the result is completely empty. Solution: For small datasets (MIN_BERTOPIC_DOCS = 10+): use BERTopic as normal. For tiny datasets (MIN_TINY_DOCS = 3+): use KMeans clustering inside BERTopic. KMeans always assigns every point to a cluster (no outliers), and it works with as few as 2-3 documents. The number of clusters (topics) is automatically chosen as: n_clusters = max(2, min(n_docs // 2, MAX_TINY_TOPICS)) So 3 docs → 2 topics, 6 docs → 3 topics, 8 docs → 4 topics. Below MIN_TINY_DOCS (3): return empty — can't cluster 1-2 texts meaningfully. """ from typing import List, Dict, Tuple from .models import TopicResult # --------------------------------------------------------------------------- # Mongolian suffix stripping for c-TF-IDF keyword extraction # --------------------------------------------------------------------------- # BERTopic uses CountVectorizer + c-TF-IDF to label each topic cluster. # Without this, agglutinated forms fragment a single concept into many # low-frequency tokens: монголын / монголд / монголаас → 3 keywords # With this tokenizer they all reduce to монгол → 1 keyword, higher weight. # # Rules are ordered longest-first so a longer suffix is tried before a # shorter one that is a suffix of it (e.g. "аас" before "ас"). # Root must be ≥ 3 characters after stripping to avoid destroying short words. 
_MN_SUFFIXES = [
    # Ablative (longest first to avoid partial matches)
    "аас", "ээс", "оос", "өөс",
    # Genitive
    "ийн", "ын", "ний",
    # Comitative
    "тай", "тэй", "той",
    # Directive
    "руу", "рүү",
    # Plural
    "ууд", "үүд",
    # Accusative
    "ийг", "ыг",
    # Dative (single char — checked last so longer suffixes win)
    "д", "т",
]
_MIN_ROOT = 3  # don't strip if the remaining root would be shorter than this

# ---------------------------------------------------------------------------
# Mongolian stopwords for topic modeling c-TF-IDF
# ---------------------------------------------------------------------------
# These words appear in nearly every document and add no topic-discriminating
# value. Filtering them lets BERTopic surface meaningful content keywords.
_MN_STOPWORDS = {
    # Copulas / auxiliary verbs
    "байна", "байгаа", "байсан", "байх", "байдаг", "болно", "болох",
    "болсон", "болж", "бол", "бна", "бсан", "бгаа", "бхаа", "бн", "бдаг",
    "бхоо", "бх",
    # Common verbs (too generic for topics)
    "хийх", "хийж", "хийсэн", "авах", "авч", "авсан", "өгөх", "өгч",
    "өгсөн", "ирэх", "ирж", "ирсэн", "очих", "очсон", "гарах", "гарч",
    "гарсан", "орох", "орж", "орсон", "үзүүлж", "явагдаж", "ажиллаж",
    "эхэлж", "эхэллээ",
    # Conjunctions / particles
    "ба", "бас", "болон", "мөн", "эсвэл", "гэхдээ", "харин", "бөгөөд",
    "гэж", "гэх", "гэсэн", "гэжээ", "гэв", "гэвч", "гээд", "гэнэ", "гээ",
    # Pronouns / demonstratives
    "энэ", "тэр", "эдгээр", "тэдгээр", "үүн", "түүн", "бид", "тэд", "би",
    "чи", "та", "миний", "чиний", "таны", "өөр", "өөрийн",
    # Postpositions / spatial
    "дээр", "доор", "дотор", "гадна", "хойно", "өмнө", "дунд",
    # Intensifiers / degree
    "их", "бага", "маш", "тун", "нэлээд", "шиг", "хамгийн",
    # Single-char particles and suffixes
    "л", "ч", "нь", "аа", "ээ", "оо", "өө", "юм", "биш", "уу", "үү",
    "юу", "вэ", "бэ",
    # Question words
    "яаж", "яагаад", "хаана", "хэзээ", "хэн", "ямар",
    # Informal / social media
    "шд", "шдэ", "шдээ", "шт", "штэ", "штээ", "дээ", "даа", "бз", "биз",
    "хаха", "кк",
    # Generic high-frequency nouns (appear in every news article)
    "монгол", "улс", "улсын", "хот", "хотын", "аймаг", "аймагт", "шинэ",
    "онд", "жил", "жилд", "хувь", "хувиар", "тэрбум", "байна.", "нэг",
    "гаруй", "дахин", "хэд", "хэдэн", "өнгөрсөн",
    # Numbers written as words ("нэг" already listed above — duplicate removed)
    "хоёр", "гурав", "дөрөв", "тав", "зургаа", "долоо", "найм",
    # Common news/media filler words
    "ноцтой", "ноц", "томоохон", "чухал", "асуудал", "асуудлыг", "нөлөө",
    "нөлөөл", "байгааг", "байгаад", "салбар", "салбарт", "ажиллагаа",
    "ашиглалта", "ашиглалтад", "нэмэгдсэн", "нэмэгд", "бууруул", "буурсан",
    "сайжруул", "хангах", "хангаж", "хүрч", "хүрсэн", "хүрэлцэх",
    "шийдвэрлэх", "шаардлагатай", "шаардаж", "түвшин", "түвш",
    "хэрэгжүүлж", "хэмжээ", "нийтлэл", "алхам", "ахиц", "үр", "дүн",
    "олон", "бүх", "иргэд", "иргэн", "засгийн", "газар", "засаг", "өмнөх",
    # Other function words
    "тийм", "ийм", "чинь", "минь", "билээ", "шүү", "надад", "танд",
    "бусад", "зарим", "ийнхүү", "тухай", "дамжуулан", "хүртэл", "ороос",
    "хооронд",
}


def _mn_stem(word: str) -> str:
    """Strip the first matching Mongolian case/plural suffix from *word*.

    Suffixes are tried in _MN_SUFFIXES order (longest first); stripping is
    skipped when the remaining root would fall below _MIN_ROOT characters.
    """
    for sfx in _MN_SUFFIXES:
        if word.endswith(sfx) and len(word) - len(sfx) >= _MIN_ROOT:
            return word[: -len(sfx)]
    return word


def _mongolian_tokenizer(text: str) -> List[str]:
    """Tokenize, stem, and filter Mongolian text for BERTopic's c-TF-IDF.

    Whitespace-splits, drops 1-char tokens and pure digit tokens, stems
    each remaining token, and drops stems that are stopwords or shorter
    than 2 characters.
    """
    tokens = []
    for w in text.split():
        if not w or len(w) < 2:
            continue
        # Skip pure numbers (years, percentages, amounts)
        if w.isdigit():
            continue
        stem = _mn_stem(w)
        if stem.lower() not in _MN_STOPWORDS and len(stem) >= 2:
            tokens.append(stem)
    return tokens


# Thresholds
MIN_TINY_DOCS = 3       # minimum non-empty docs to attempt topic modeling at all
MIN_BERTOPIC_DOCS = 50  # switch point between the two KMeans cluster-count formulas
MAX_TINY_TOPICS = 10    # NOTE(review): currently unused — the small-dataset
                        # branch caps at self.max_topics instead; kept for
                        # backward compatibility with importers


class TopicModeler:
    """Topic modeling service using BERTopic with small-dataset fallback.

    Both dataset regimes use a KMeans cluster model (not HDBSCAN), so every
    document always receives a real topic instead of the -1 outlier label;
    only the cluster-count formula differs with dataset size.
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        language: str = "multilingual",
        min_topics: int = 5,
        max_topics: int = 15,
    ):
        self.embedding_model_name = embedding_model
        self.language = language
        self.min_topics = min_topics
        self.max_topics = max_topics
        self._embedding_model = None  # lazily-loaded SentenceTransformer
        self._model = None            # last fitted BERTopic model

    def _load_embedding_model(self):
        """Load (once) and return the SentenceTransformer embedding model."""
        if self._embedding_model is None:
            from sentence_transformers import SentenceTransformer
            self._embedding_model = SentenceTransformer(self.embedding_model_name)
        return self._embedding_model

    def _make_bertopic(self, n_docs: int):
        """
        Build a BERTopic instance appropriate for the dataset size.

        Both branches use KMeans so every document gets a real topic
        assignment (never -1); they differ only in how n_clusters is
        derived:

            n_docs >= MIN_BERTOPIC_DOCS: n_docs // 10, clamped to
                [min_topics, max_topics] and never more than n_docs.
            n_docs <  MIN_BERTOPIC_DOCS: a smaller data-driven count,
                raised toward min_topics but capped at n_docs.
        """
        from bertopic import BERTopic
        from bertopic.representation import MaximalMarginalRelevance
        from sklearn.cluster import KMeans
        from sklearn.feature_extraction.text import CountVectorizer

        vectorizer = CountVectorizer(
            tokenizer=_mongolian_tokenizer,
            min_df=1,
            max_df=0.80,  # ignore terms appearing in >80% of docs
        )
        # MMR picks diverse keywords instead of redundant near-synonyms
        mmr = MaximalMarginalRelevance(diversity=0.5)

        if n_docs >= MIN_BERTOPIC_DOCS:
            # Large dataset: KMeans guarantees a controllable number of
            # topics. HDBSCAN tends to produce too few topics (2-3) on
            # medium datasets (100-1000 docs) because of aggressive merging.
            n_clusters = max(
                self.min_topics,
                min(n_docs // 10, self.max_topics),
            )
            # Ensure we don't request more clusters than documents
            n_clusters = min(n_clusters, n_docs)
            cluster_model = KMeans(
                n_clusters=n_clusters, random_state=42, n_init="auto"
            )
            return BERTopic(
                language=self.language,
                embedding_model=self._load_embedding_model(),
                hdbscan_model=cluster_model,
                vectorizer_model=vectorizer,
                representation_model=mmr,
                min_topic_size=2,
            )

        # Small/medium dataset (<50 docs): KMeans guarantees every document
        # gets a topic (no outlier -1 assignments).
        n_clusters = max(min(2, n_docs), min(n_docs // 3, self.max_topics))
        # Raise toward min_topics when the dataset is large enough, but
        # never request more clusters than there are documents.
        n_clusters = min(max(n_clusters, self.min_topics), n_docs)
        cluster_model = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
        return BERTopic(
            language=self.language,
            embedding_model=self._load_embedding_model(),
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer,
            representation_model=mmr,
            min_topic_size=1,
            # NOTE(review): nr_topics="auto" lets BERTopic merge clusters
            # after KMeans, so fewer topics than n_clusters may come back —
            # confirm this is intended.
            nr_topics="auto",
        )

    def fit_transform(self, texts: List[str]) -> Tuple[List[TopicResult], List[Dict]]:
        """
        Fit topic model on texts and return per-document topic assignments.

        Thresholds (counting non-empty documents):
            < MIN_TINY_DOCS (3): returns ([], [info dict]) — not enough data
            3 to MIN_BERTOPIC_DOCS-1: KMeans-backed BERTopic (small formula)
            MIN_BERTOPIC_DOCS (50)+: KMeans-backed BERTopic (large formula)

        Returns:
            (topic_results, topic_summary)
            topic_results — one TopicResult per input document; documents
                that were empty/whitespace get topic_id=-1 ("Empty")
            topic_summary — list of {topic_id, name, count} dicts, with
                the outlier topic -1 excluded
        """
        # Filter empty strings — they confuse the embedding model
        non_empty = [(i, t) for i, t in enumerate(texts) if t.strip()]
        if len(non_empty) < MIN_TINY_DOCS:
            return [], [{
                "info": (
                    f"Topic modeling needs at least {MIN_TINY_DOCS} non-empty documents. "
                    f"Got {len(non_empty)}."
                )
            }]

        indices, valid_texts = zip(*non_empty)
        emb_model = self._load_embedding_model()
        embeddings = emb_model.encode(list(valid_texts), show_progress_bar=False)

        model = self._make_bertopic(len(valid_texts))
        topics, probs = model.fit_transform(list(valid_texts), embeddings)
        self._model = model
        topic_info = model.get_topic_info()

        # Build per-document results, mapping positions in valid_texts back
        # to the caller's original indices.
        result_map: Dict[int, TopicResult] = {}
        for pos, (orig_idx, topic_id) in enumerate(zip(indices, topics)):
            try:
                prob = float(probs[pos]) if probs is not None else 0.0
            except (TypeError, IndexError, ValueError):
                # probs can be None or a per-topic array depending on the
                # cluster model — degrade gracefully to 0.0
                prob = 0.0
            try:
                topic_words = model.get_topic(topic_id)
                keywords = [w for w, _ in (topic_words or [])[:5]]
            except Exception:
                keywords = []
            topic_row = topic_info[topic_info["Topic"] == topic_id]
            if not topic_row.empty and "Name" in topic_row.columns:
                label = str(topic_row.iloc[0]["Name"])
            else:
                label = f"Topic {topic_id}" if topic_id != -1 else "Outlier"
            result_map[orig_idx] = TopicResult(
                topic_id=int(topic_id),
                topic_label=label,
                probability=float(prob),
                keywords=keywords,
            )

        # Fill results list aligned to the original texts list.
        # Documents that were empty strings get topic_id=-1.
        results = []
        for i in range(len(texts)):
            if i in result_map:
                results.append(result_map[i])
            else:
                results.append(TopicResult(
                    topic_id=-1, topic_label="Empty", probability=0.0, keywords=[]
                ))

        # Build summary, excluding the outlier topic -1. (BUGFIX: the
        # original comment promised this exclusion but the guard was
        # missing from the loop.)
        summary = []
        for _, row in topic_info.iterrows():
            tid = int(row["Topic"])
            if tid == -1:
                continue
            summary.append({
                "topic_id": tid,
                "name": str(row.get("Name", f"Topic {tid}")),
                "count": int(row["Count"]),
            })

        return results, summary

    def get_topic_info(self) -> List[Dict]:
        """Return topic summary from the last fitted model ([] if none fitted)."""
        if self._model is None:
            return []
        return [
            {
                "topic_id": int(row["Topic"]),
                "name": str(row.get("Name", f"Topic {row['Topic']}")),
                "count": int(row["Count"]),
            }
            for _, row in self._model.get_topic_info().iterrows()
        ]