"""Identify weak blog posts to merge into stronger, semantically similar posts."""

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st

MODEL_NAME = "all-MiniLM-L6-v2"  # ~90 MB, English-optimised, fast
SIMILARITY_THRESHOLD = 0.72


@st.cache_resource(show_spinner=False)
def load_embedder():
    """Load the sentence-transformer model once and cache it for the session."""
    return SentenceTransformer(MODEL_NAME)


def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Generate sentence embeddings for a list of titles.

    Returns a 2-D array of shape (len(titles), embedding_dim).
    """
    return np.asarray(embedder.encode(titles, show_progress_bar=False, batch_size=64))


def find_merge_candidates(
    df,
    threshold: float = SIMILARITY_THRESHOLD,
    max_weak_clicks: int = 200,
) -> list[dict]:
    """
    Compare all blog titles using cosine similarity.

    Returns pairs where:
      - Weak blog has fewer clicks than strong blog
      - Similarity score >= threshold
      - Weak blog has <= max_weak_clicks total clicks
    One result per weak blog (best matching strong).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "title", "total_clicks" and "url" columns.
    threshold : float
        Minimum cosine similarity for a pair to be reported.
    max_weak_clicks : int
        Only blogs with at most this many clicks are merge candidates.

    Returns
    -------
    list[dict]
        One record per weak blog, sorted by similarity descending.
        "topic_cluster" and "merge_reason" are left empty (filled by LLM
        later); "approved" defaults to True until the user toggles it.
    """
    all_titles = df["title"].tolist()
    all_clicks = df["total_clicks"].tolist()
    all_urls = df["url"].tolist()

    # Guard: an empty DataFrame has nothing to embed or compare.
    if not all_titles:
        return []

    # BUG FIX: the original took df[mask].index.tolist(), which yields index
    # *labels*, then used them as *positions* into the lists above and into
    # the embedding array.  With any non-default index (e.g. a filtered or
    # sorted DataFrame without reset_index) that raises or silently pairs the
    # wrong rows.  Compute positional indices directly instead.
    weak_idx = [i for i, clicks in enumerate(all_clicks) if clicks <= max_weak_clicks]
    if not weak_idx:
        return []

    embedder = load_embedder()
    all_emb = get_embeddings(all_titles, embedder)
    # Fancy indexing replaces the original per-row list rebuild; one row of
    # sim_matrix per weak blog, one column per blog overall.
    sim_matrix = cosine_similarity(all_emb[weak_idx], all_emb)

    pairs = []
    # NOTE: the original kept a `seen_weak` set, but weak_idx entries are
    # unique by construction, so the guard was dead code and is removed.
    for row_i, wi in enumerate(weak_idx):
        sims = sim_matrix[row_i]
        # Eligible "strong" partners: not itself, similar enough, and with
        # strictly more clicks than the weak blog.
        candidates = [
            (j, float(sims[j]))
            for j in range(len(all_titles))
            if j != wi and sims[j] >= threshold and all_clicks[j] > all_clicks[wi]
        ]
        if not candidates:
            continue
        # max() is O(n); the original sorted the whole list just to take [0].
        best_j, best_score = max(candidates, key=lambda c: c[1])
        pairs.append({
            "weak_url": all_urls[wi],
            "weak_title": all_titles[wi],
            "weak_clicks": all_clicks[wi],
            "strong_url": all_urls[best_j],
            "strong_title": all_titles[best_j],
            "strong_clicks": all_clicks[best_j],
            "similarity": round(best_score, 4),
            # Placeholders — filled in by LLM later
            "topic_cluster": "",
            "merge_reason": "",
            "approved": True,  # default approved until user toggles
        })

    # Most similar pairs first.
    pairs.sort(key=lambda x: x["similarity"], reverse=True)
    return pairs