"""Identify weak blog posts to merge into stronger, semantically similar posts."""

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st

MODEL_NAME = "all-MiniLM-L6-v2"  # ~90 MB, English-optimised, fast
SIMILARITY_THRESHOLD = 0.72


@st.cache_resource(show_spinner=False)
def load_embedder():
    """Load the sentence-transformer model once and cache it for the session."""
    return SentenceTransformer(MODEL_NAME)


def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Generate sentence embeddings for a list of titles.

    Returns a 2-D array of shape (len(titles), embedding_dim).
    """
    return np.asarray(embedder.encode(titles, show_progress_bar=False, batch_size=64))


def find_merge_candidates(
    df,
    threshold: float = SIMILARITY_THRESHOLD,
    max_weak_clicks: int = 200,
) -> list[dict]:
    """
    Compare all blog titles using cosine similarity.

    Returns pairs where:
      - Weak blog has fewer clicks than strong blog
      - Similarity score >= threshold
      - Weak blog has <= max_weak_clicks total clicks
    One result per weak blog (best matching strong).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "title", "total_clicks" and "url" columns.
    threshold : float
        Minimum cosine similarity for a pair to be reported.
    max_weak_clicks : int
        Only blogs with at most this many clicks are merge candidates.

    Returns
    -------
    list[dict]
        One record per weak blog, sorted by similarity descending.
        "topic_cluster" and "merge_reason" are left empty (filled by LLM
        later); "approved" defaults to True until the user toggles it.
    """
    all_titles = df["title"].tolist()
    all_clicks = df["total_clicks"].tolist()
    all_urls = df["url"].tolist()

    # Guard: an empty DataFrame has nothing to embed or compare.
    if not all_titles:
        return []

    # BUG FIX: the original took df[mask].index.tolist(), which yields index
    # *labels*, then used them as *positions* into the lists above and into
    # the embedding array.  With any non-default index (e.g. a filtered or
    # sorted DataFrame without reset_index) that raises or silently pairs the
    # wrong rows.  Compute positional indices directly instead.
    weak_idx = [i for i, clicks in enumerate(all_clicks) if clicks <= max_weak_clicks]
    if not weak_idx:
        return []

    embedder = load_embedder()
    all_emb = get_embeddings(all_titles, embedder)
    # Fancy indexing replaces the original per-row list rebuild; one row of
    # sim_matrix per weak blog, one column per blog overall.
    sim_matrix = cosine_similarity(all_emb[weak_idx], all_emb)

    pairs = []
    # NOTE: the original kept a `seen_weak` set, but weak_idx entries are
    # unique by construction, so the guard was dead code and is removed.
    for row_i, wi in enumerate(weak_idx):
        sims = sim_matrix[row_i]
        # Eligible "strong" partners: not itself, similar enough, and with
        # strictly more clicks than the weak blog.
        candidates = [
            (j, float(sims[j]))
            for j in range(len(all_titles))
            if j != wi and sims[j] >= threshold and all_clicks[j] > all_clicks[wi]
        ]
        if not candidates:
            continue
        # max() is O(n); the original sorted the whole list just to take [0].
        best_j, best_score = max(candidates, key=lambda c: c[1])
        pairs.append({
            "weak_url": all_urls[wi],
            "weak_title": all_titles[wi],
            "weak_clicks": all_clicks[wi],
            "strong_url": all_urls[best_j],
            "strong_title": all_titles[best_j],
            "strong_clicks": all_clicks[best_j],
            "similarity": round(best_score, 4),
            # Placeholders — filled in by LLM later
            "topic_cluster": "",
            "merge_reason": "",
            "approved": True,  # default approved until user toggles
        })

    # Most similar pairs first.
    pairs.sort(key=lambda x: x["similarity"], reverse=True)
    return pairs