# NOTE(review): the three lines that were here ("Spaces: / Sleeping / Sleeping")
# are Hugging Face Spaces page residue from scraping, not program text —
# preserved as this comment so the file parses as Python.
# Third-party dependencies.
import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model: ~90 MB, English-optimised, fast to encode.
MODEL_NAME = "all-MiniLM-L6-v2"
# Minimum cosine similarity for two titles to count as merge candidates.
SIMILARITY_THRESHOLD = 0.72
@st.cache_resource
def load_embedder() -> SentenceTransformer:
    """Load the sentence-transformer model once and cache it for the session.

    Without ``st.cache_resource`` the docstring's promise was false: Streamlit
    re-executes the script on every interaction, so the ~90 MB model would be
    re-loaded on each rerun. The decorator memoizes the loaded model for the
    lifetime of the server process.
    """
    return SentenceTransformer(MODEL_NAME)
def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Encode a list of blog titles into dense sentence vectors.

    Args:
        titles: Blog titles to embed.
        embedder: A loaded model exposing ``encode`` (e.g. SentenceTransformer).

    Returns:
        One embedding row per title.
    """
    vectors = embedder.encode(titles, batch_size=64, show_progress_bar=False)
    return vectors
def find_merge_candidates(
    df,
    threshold: float = SIMILARITY_THRESHOLD,
    max_weak_clicks: int = 200,
) -> list[dict]:
    """
    Compare all blog titles using cosine similarity.

    Args:
        df: DataFrame with "title", "total_clicks" and "url" columns.
        threshold: Minimum cosine similarity for a pair to qualify.
        max_weak_clicks: Only blogs with at most this many clicks are
            treated as "weak" merge candidates.

    Returns pairs (sorted by similarity, descending) where:
      - Weak blog has fewer clicks than strong blog
      - Similarity score >= threshold
      - Weak blog has <= max_weak_clicks total clicks
    One result per weak blog (best matching strong).
    """
    embedder = load_embedder()
    all_titles = df["title"].tolist()
    all_clicks = df["total_clicks"].tolist()
    all_urls = df["url"].tolist()

    # Candidate pool: low-click blogs only. Use *positional* indices
    # (flatnonzero over the boolean mask) rather than DataFrame index
    # labels: the lists above are positional, so label values from
    # df[mask].index would point at the wrong rows (or raise IndexError)
    # whenever df has a non-default index, e.g. after filtering.
    weak_positions = np.flatnonzero(
        (df["total_clicks"] <= max_weak_clicks).to_numpy()
    )
    if weak_positions.size == 0:
        return []

    all_emb = np.asarray(get_embeddings(all_titles, embedder))
    weak_emb = all_emb[weak_positions]  # vectorized row-select, no Python copy
    sim_matrix = cosine_similarity(weak_emb, all_emb)

    pairs = []
    for row_i, wi in enumerate(weak_positions):
        sims = sim_matrix[row_i]
        # Strong partners: similar enough, strictly more clicks, not itself.
        candidates = [
            (j, float(sims[j]))
            for j in range(len(all_titles))
            if j != wi
            and sims[j] >= threshold
            and all_clicks[j] > all_clicks[wi]  # strong must have more clicks
        ]
        if not candidates:
            continue
        # max() picks the first-seen highest score — same tie-break as a
        # stable descending sort, without sorting the whole list.
        best_j, best_score = max(candidates, key=lambda c: c[1])
        pairs.append({
            "weak_url": all_urls[wi],
            "weak_title": all_titles[wi],
            "weak_clicks": all_clicks[wi],
            "strong_url": all_urls[best_j],
            "strong_title": all_titles[best_j],
            "strong_clicks": all_clicks[best_j],
            "similarity": round(best_score, 4),
            # Placeholders — filled in by LLM later
            "topic_cluster": "",
            "merge_reason": "",
            "approved": True,  # default approved until user toggles
        })

    # Sort by similarity descending
    pairs.sort(key=lambda x: x["similarity"], reverse=True)
    return pairs