# blog-audit/utils/embeddings.py
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
# Sentence-transformer checkpoint used to embed blog titles.
MODEL_NAME = "all-MiniLM-L6-v2" # ~90 MB, English-optimised, fast
# Default minimum cosine similarity for two titles to be merge candidates.
SIMILARITY_THRESHOLD = 0.72
@st.cache_resource(show_spinner=False)
def load_embedder():
    """Return the shared SentenceTransformer model, loaded once per session.

    Streamlit's resource cache keeps a single instance alive across reruns.
    """
    model = SentenceTransformer(MODEL_NAME)
    return model
def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Encode *titles* into a matrix of sentence embeddings (one row per title)."""
    encoded = embedder.encode(titles, show_progress_bar=False, batch_size=64)
    return encoded
def find_merge_candidates(
df,
threshold: float = SIMILARITY_THRESHOLD,
max_weak_clicks: int = 200,
) -> list[dict]:
"""
Compare all blog titles using cosine similarity.
Returns pairs where:
- Weak blog has fewer clicks than strong blog
- Similarity score >= threshold
- Weak blog has <= max_weak_clicks total clicks
One result per weak blog (best matching strong).
"""
embedder = load_embedder()
all_titles = df["title"].tolist()
all_clicks = df["total_clicks"].tolist()
all_urls = df["url"].tolist()
# Candidate pool: blogs with low clicks only
weak_mask = df["total_clicks"] <= max_weak_clicks
weak_idx = df[weak_mask].index.tolist()
if not weak_idx:
return []
weak_titles = [all_titles[i] for i in weak_idx]
all_emb = get_embeddings(all_titles, embedder)
weak_emb = np.array([all_emb[i] for i in weak_idx])
sim_matrix = cosine_similarity(weak_emb, all_emb)
pairs = []
seen_weak = set()
for row_i, wi in enumerate(weak_idx):
if wi in seen_weak:
continue
sims = sim_matrix[row_i]
# Build ranked candidates for this weak blog
ranked = sorted(
[
(j, float(sims[j]))
for j in range(len(all_titles))
if j != wi
and sims[j] >= threshold
and all_clicks[j] > all_clicks[wi] # strong must have more clicks
],
key=lambda x: x[1],
reverse=True,
)
if ranked:
best_j, best_score = ranked[0]
pairs.append({
"weak_url": all_urls[wi],
"weak_title": all_titles[wi],
"weak_clicks": all_clicks[wi],
"strong_url": all_urls[best_j],
"strong_title": all_titles[best_j],
"strong_clicks": all_clicks[best_j],
"similarity": round(best_score, 4),
# Placeholders — filled in by LLM later
"topic_cluster": "",
"merge_reason": "",
"approved": True, # default approved until user toggles
})
seen_weak.add(wi)
# Sort by similarity descending
pairs.sort(key=lambda x: x["similarity"], reverse=True)
return pairs