Spaces:

edstellar
/

blog-audit

Sleeping

App Files Files Community

blog-audit / app.py

vijaykumaredstellar

Upload 3 files

5284eaf verified 10 days ago

raw

history blame contribute delete

24.2 kB

	import streamlit as st
	import pandas as pd
	import plotly.graph_objects as go
	import plotly.express as px
	import time

	from utils.data_processor import detect_columns, clean_and_tier, get_tier_summary, TIER_META
	from utils.embeddings import find_merge_candidates, load_embedder
	from utils.llm_client import MODEL_OPTIONS, test_connection, enrich_merge_pairs
	from utils.excel_builder import build_excel

	# ── Page config ───────────────────────────────────────────────────────────────
	st.set_page_config(
	page_title="Blog Audit Tool",
	page_icon="📊",
	layout="wide",
	initial_sidebar_state="expanded",
	)

	# ── Global CSS ────────────────────────────────────────────────────────────────
	st.markdown("""
	<style>
	.block-container { padding-top: 1.5rem; }
	.metric-card {
	background: #F8F9FA;
	border: 1px solid #E0E0E0;
	border-radius: 10px;
	padding: 16px 20px;
	text-align: center;
	}
	.metric-card .value { font-size: 2rem; font-weight: 700; }
	.metric-card .label { font-size: 0.82rem; color: #666; margin-top: 2px; }
	.section-header {
	font-size: 1.15rem; font-weight: 700;
	color: #1A1A2E; margin: 1.2rem 0 0.4rem 0;
	padding-bottom: 4px; border-bottom: 2px solid #4F46E5;
	display: inline-block;
	}
	.status-ok { color: #2ECC71; font-weight: 600; }
	.status-err { color: #E74C3C; font-weight: 600; }
	div[data-testid="stDataFrame"] { border-radius: 8px; }
	.stButton > button {
	border-radius: 8px; font-weight: 600;
	}
	.approve-row { background: #E8F5E9 !important; }
	.reject-row { background: #FFEBEE !important; }
	</style>
	""", unsafe_allow_html=True)


	# ── Session state init ────────────────────────────────────────────────────────
	for key in ["df_clean", "col_map", "merge_pairs", "analysis_done",
	"api_key", "model_id", "model_label", "site_name"]:
	if key not in st.session_state:
	st.session_state[key] = None

	if "analysis_done" not in st.session_state or st.session_state["analysis_done"] is None:
	st.session_state["analysis_done"] = False


	# ── Sidebar ───────────────────────────────────────────────────────────────────
	with st.sidebar:
	st.image("https://via.placeholder.com/200x50/4F46E5/FFFFFF?text=Blog+Audit", width=200)
	st.markdown("---")
	st.markdown("### 📋 Steps")
	st.markdown("""
	1. 📁 Upload your CSV
	2. 🔑 Enter API key & pick model
	3. 🚀 Run analysis
	4. 📊 Review results
	5. 💾 Download report
	""")
	st.markdown("---")
	st.markdown("### ℹ️ CSV Format")
	st.markdown("""
	Your CSV must have:
	- A URL column (`Blogs` or similar)
	- A title column (`H1` or `Title`)
	- 4 monthly click columns (any names)
	- A Total Clicks column
	""")
	st.markdown("---")
	st.caption("Blog Audit Tool — works for any website")


	# ══════════════════════════════════════════════════════════════════════════════
	# HEADER
	# ══════════════════════════════════════════════════════════════════════════════
	st.title("📊 Blog Performance Audit Tool")
	st.markdown("Upload any website's blog click data CSV to get a full performance audit with AI-powered merge recommendations.")
	st.markdown("---")


	# ══════════════════════════════════════════════════════════════════════════════
	# STEP 1 — UPLOAD CSV
	# ══════════════════════════════════════════════════════════════════════════════
	st.markdown('<p class="section-header">Step 1 — Upload Your CSV</p>', unsafe_allow_html=True)

	col_up1, col_up2 = st.columns([2, 1])
	with col_up1:
	uploaded_file = st.file_uploader(
	"Upload your blog clicks CSV",
	type=["csv"],
	help="CSV with URL column, H1/title column, 4 monthly click columns, and Total Clicks column.",
	label_visibility="collapsed",
	)

	with col_up2:
	site_name_input = st.text_input(
	"Website / Brand name (for report)",
	value="My Website",
	placeholder="e.g. Edstellar, TechBlog...",
	)

	if uploaded_file:
	try:
	df_raw = pd.read_csv(uploaded_file)
	col_map = detect_columns(df_raw)

	# Validate detection
	if not col_map["url_col"]:
	st.error("❌ Could not detect a URL column. Make sure your CSV has a column with URLs (e.g. 'Blogs').")
	st.stop()
	if not col_map["month_cols"]:
	st.error("❌ Could not detect monthly click columns. Ensure your numeric click columns are present.")
	st.stop()

	df_clean = clean_and_tier(df_raw, col_map)
	st.session_state["df_clean"] = df_clean
	st.session_state["col_map"] = col_map
	st.session_state["site_name"] = site_name_input

	# Preview
	with st.expander("📋 CSV Preview & Column Detection", expanded=True):
	c1, c2, c3, c4 = st.columns(4)
	c1.metric("Total Blogs", len(df_clean))
	c2.metric("URL Column", col_map["url_col"])
	c3.metric("Title Column", col_map["title_col"] or "—")
	c4.metric("Month Columns", len(col_map["month_cols"]))

	st.markdown(f"Detected month columns: `{'` · `'.join(col_map['month_cols'])}`")
	st.dataframe(df_clean[["url", "title"] + col_map["month_cols"] + ["total_clicks", "tier", "trend"]].head(8),
	use_container_width=True, height=220)

	st.success(f"✅ File loaded: {len(df_clean)} blogs detected.")

	except Exception as e:
	st.error(f"❌ Error reading CSV: {e}")
	st.stop()

	st.markdown("---")


	# ══════════════════════════════════════════════════════════════════════════════
	# STEP 2 — AI CONFIGURATION
	# ══════════════════════════════════════════════════════════════════════════════
	st.markdown('<p class="section-header">Step 2 — AI Configuration (OpenRouter)</p>', unsafe_allow_html=True)
	st.caption("Get a free API key at [openrouter.ai/keys](https://openrouter.ai/keys)")

	ai_col1, ai_col2, ai_col3 = st.columns([2, 2, 1])

	with ai_col1:
	api_key_input = st.text_input(
	"🔑 OpenRouter API Key",
	type="password",
	placeholder="sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxx",
	value=st.secrets.get("OPENROUTER_API_KEY", "") if hasattr(st, "secrets") else "",
	)

	with ai_col2:
	model_label = st.selectbox(
	"🧠 Select LLM Model",
	options=list(MODEL_OPTIONS.keys()),
	index=0,
	)

	with ai_col3:
	st.markdown("<br>", unsafe_allow_html=True)
	test_btn = st.button("🔌 Test Connection", use_container_width=True)

	if test_btn:
	if not api_key_input:
	st.warning("⚠️ Please enter your API key first.")
	else:
	with st.spinner("Testing connection..."):
	ok, msg = test_connection(api_key_input, MODEL_OPTIONS[model_label])
	if ok:
	st.session_state["api_key"] = api_key_input
	st.session_state["model_id"] = MODEL_OPTIONS[model_label]
	st.session_state["model_label"] = model_label
	st.success(f"✅ {msg} Model: {model_label}")
	else:
	st.error(f"❌ {msg}")

	# Auto-save key if already entered without test
	if api_key_input and not test_btn:
	st.session_state["api_key"] = api_key_input
	st.session_state["model_id"] = MODEL_OPTIONS[model_label]
	st.session_state["model_label"] = model_label

	st.markdown("---")


	# ══════════════════════════════════════════════════════════════════════════════
	# STEP 3 — RUN ANALYSIS
	# ══════════════════════════════════════════════════════════════════════════════
	st.markdown('<p class="section-header">Step 3 — Run Analysis</p>', unsafe_allow_html=True)

	run_ready = (
	st.session_state.get("df_clean") is not None
	and st.session_state.get("api_key")
	)

	if not run_ready:
	st.info("⬆️ Complete Steps 1 and 2 above before running the analysis.")

	run_btn = st.button(
	"🚀 Run Full Analysis",
	disabled=not run_ready,
	use_container_width=False,
	type="primary",
	)

	if run_btn:
	df_clean = st.session_state["df_clean"]
	col_map = st.session_state["col_map"]
	api_key = st.session_state["api_key"]
	model_id = st.session_state["model_id"]
	model_label = st.session_state["model_label"]

	progress_bar = st.progress(0, text="Starting analysis...")
	status_box = st.empty()

	# Step A — Embeddings
	status_box.info("🧠 Step 1/3 — Loading embedding model and generating title vectors...")
	progress_bar.progress(10, text="Loading embedding model...")
	with st.spinner("Generating embeddings (this may take 30–60s on first run)..."):
	load_embedder() # warm up cache
	merge_candidates = find_merge_candidates(df_clean, threshold=0.72, max_weak_clicks=200)
	progress_bar.progress(45, text=f"Found {len(merge_candidates)} merge candidate pairs.")
	status_box.success(f"✅ Embeddings done — {len(merge_candidates)} potential merge pairs found.")
	time.sleep(0.4)

	# Step B — LLM enrichment
	if merge_candidates:
	status_box.info(f"🤖 Step 2/3 — Sending {len(merge_candidates)} pairs to {model_label} for merge reasoning...")
	progress_bar.progress(50, text="LLM analysis in progress...")

	llm_progress = st.empty()
	llm_prog_bar = st.progress(0)

	def llm_cb(done, total):
	pct = int(done / total * 100)
	llm_prog_bar.progress(pct, text=f"LLM: {done}/{total} pairs processed")

	enriched_pairs = enrich_merge_pairs(
	merge_candidates, api_key, model_id,
	batch_size=10, progress_callback=llm_cb,
	)
	llm_prog_bar.empty()
	llm_progress.empty()
	else:
	enriched_pairs = []

	st.session_state["merge_pairs"] = enriched_pairs
	progress_bar.progress(90, text="Building report...")

	# Step C — Done
	status_box.success(f"✅ Analysis complete! {len(enriched_pairs)} merge recommendations generated.")
	progress_bar.progress(100, text="✅ Analysis complete!")
	st.session_state["analysis_done"] = True
	time.sleep(0.5)
	st.rerun()

	st.markdown("---")


	# ══════════════════════════════════════════════════════════════════════════════
	# STEP 4 — RESULTS DASHBOARD
	# ══════════════════════════════════════════════════════════════════════════════
	if st.session_state["analysis_done"] and st.session_state["df_clean"] is not None:
	df_clean = st.session_state["df_clean"]
	col_map = st.session_state["col_map"]
	month_cols = col_map["month_cols"]
	merge_pairs = st.session_state.get("merge_pairs", [])
	site_name = st.session_state.get("site_name", "Website")

	st.markdown('<p class="section-header">Step 4 — Results Dashboard</p>', unsafe_allow_html=True)

	counts = get_tier_summary(df_clean)

	# ── KPI metric row ──────────────────────────────────────────────────────
	m_cols = st.columns(7)
	kpis = [
	("Total Blogs", len(df_clean), "#4F46E5"),
	("🔴 Take Down", counts["TAKEDOWN_ZERO"] + counts["TAKEDOWN_LOW"], "#E74C3C"),
	("🔵 Merge", len(merge_pairs), "#2980B9"),
	("🟡 Monitor", counts["MONITOR"], "#F39C12"),
	("✅ Performing", counts["PERFORMING"], "#27AE60"),
	("💪 Strong", counts["STRONG"], "#1ABC9C"),
	("🏆 Top", counts["TOP"], "#9B59B6"),
	]
	for col_w, (label, val, color) in zip(m_cols, kpis):
	col_w.markdown(
	f'<div class="metric-card">'
	f'<div class="value" style="color:{color}">{val}</div>'
	f'<div class="label">{label}</div>'
	f'</div>',
	unsafe_allow_html=True,
	)

	st.markdown("<br>", unsafe_allow_html=True)

	# ── Charts ──────────────────────────────────────────────────────────────
	chart_col1, chart_col2 = st.columns(2)

	with chart_col1:
	st.markdown("Click Distribution by Tier")
	tier_labels = [TIER_META[t]["label"] for t in counts]
	tier_values = list(counts.values())
	tier_colors = [TIER_META[t]["color"] for t in counts]
	fig_pie = go.Figure(go.Pie(
	labels=tier_labels, values=tier_values,
	marker_colors=tier_colors,
	hole=0.45, textinfo="label+percent",
	))
	fig_pie.update_layout(
	showlegend=False, margin=dict(t=10, b=10, l=10, r=10), height=280
	)
	st.plotly_chart(fig_pie, use_container_width=True)

	with chart_col2:
	st.markdown("Monthly Click Trend (Top 5 Blogs)")
	top5 = df_clean.nlargest(5, "total_clicks")
	fig_line = go.Figure()
	for _, row in top5.iterrows():
	short_title = row["title"][:40] + "…" if len(row["title"]) > 40 else row["title"]
	fig_line.add_trace(go.Scatter(
	x=month_cols,
	y=[row[m] for m in month_cols],
	mode="lines+markers",
	name=short_title,
	))
	fig_line.update_layout(
	margin=dict(t=10, b=10, l=10, r=10), height=280,
	legend=dict(font=dict(size=9)),
	xaxis_title="Month", yaxis_title="Clicks",
	)
	st.plotly_chart(fig_line, use_container_width=True)

	st.markdown("---")

	# ── Result Tabs ─────────────────────────────────────────────────────────
	tab_td, tab_merge, tab_mon, tab_perf, tab_str, tab_top = st.tabs([
	f"🔴 Take Down ({counts['TAKEDOWN_ZERO'] + counts['TAKEDOWN_LOW']})",
	f"🔵 Merge ({len(merge_pairs)})",
	f"🟡 Monitor ({counts['MONITOR']})",
	f"✅ Performing ({counts['PERFORMING']})",
	f"💪 Strong ({counts['STRONG']})",
	f"🏆 Top ({counts['TOP']})",
	])

	display_cols_base = ["url", "title"] + month_cols + ["total_clicks", "trend", "action"]

	# ── Take Down tab ────────────────────────────────────────────────────────
	with tab_td:
	st.markdown("Blogs with zero or negligible traffic. Recommended for removal or merge.")
	td_df = df_clean[df_clean["tier"].isin(["TAKEDOWN_ZERO", "TAKEDOWN_LOW"])].sort_values("total_clicks")
	if len(td_df):
	# Search filter
	search_td = st.text_input("🔍 Filter by URL or title", key="search_td", placeholder="Type to filter...")
	if search_td:
	mask = td_df["url"].str.contains(search_td, case=False, na=False) \| \
	td_df["title"].str.contains(search_td, case=False, na=False)
	td_df = td_df[mask]
	st.dataframe(td_df[display_cols_base], use_container_width=True, height=400)
	st.caption(f"{len(td_df)} blogs")
	else:
	st.success("🎉 No take-down candidates found!")

	# ── Merge tab ────────────────────────────────────────────────────────────
	with tab_merge:
	st.markdown("AI-detected topically overlapping blogs. Review and approve/reject each suggestion.")

	if merge_pairs:
	# Review interface
	st.markdown("✅ = Keep in report    ❌ = Reject suggestion")

	# Init approval state
	if "merge_approvals" not in st.session_state:
	st.session_state["merge_approvals"] = {i: True for i in range(len(merge_pairs))}

	col_filter, col_actions = st.columns([3, 1])
	with col_filter:
	search_merge = st.text_input("🔍 Filter merges", key="search_merge", placeholder="Filter by title or cluster...")
	with col_actions:
	st.markdown("<br>", unsafe_allow_html=True)
	if st.button("✅ Approve All", key="approve_all"):
	for i in range(len(merge_pairs)):
	st.session_state["merge_approvals"][i] = True
	st.rerun()

	# Display table
	merge_display = []
	for i, p in enumerate(merge_pairs):
	if search_merge and search_merge.lower() not in p.get("weak_title","").lower() \
	and search_merge.lower() not in p.get("strong_title","").lower() \
	and search_merge.lower() not in p.get("topic_cluster","").lower():
	continue
	approved = st.session_state["merge_approvals"].get(i, True)
	merge_display.append({
	"#": i + 1,
	"Weak Blog (Merge FROM)": p.get("weak_title", "")[:60],
	"Weak Clicks": p.get("weak_clicks", 0),
	"Strong Blog (Merge INTO)":p.get("strong_title", "")[:60],
	"Strong Clicks": p.get("strong_clicks", 0),
	"Similarity": p.get("similarity", ""),
	"Topic Cluster": p.get("topic_cluster", ""),
	"AI Merge Reason": p.get("merge_reason", "")[:100],
	"Approved": "✅ Yes" if approved else "❌ Rejected",
	})

	merge_df_display = pd.DataFrame(merge_display)
	st.dataframe(merge_df_display, use_container_width=True, height=380)

	# Approve/Reject individual rows
	st.markdown("Toggle individual suggestions:")
	toggle_cols = st.columns(5)
	for i, p in enumerate(merge_pairs[:20]): # show first 20 toggles
	col_idx = i % 5
	with toggle_cols[col_idx]:
	current = st.session_state["merge_approvals"].get(i, True)
	label = f"#{i+1}: {p.get('weak_title','')[:25]}…"
	new_val = st.checkbox(label, value=current, key=f"merge_toggle_{i}")
	st.session_state["merge_approvals"][i] = new_val

	approved_count = sum(st.session_state["merge_approvals"].values())
	st.caption(f"✅ {approved_count} approved · ❌ {len(merge_pairs) - approved_count} rejected")
	else:
	st.info("No merge candidates detected above the similarity threshold.")

	# ── Generic tier tab helper ──────────────────────────────────────────────
	def render_tier_tab(tier_key, tab_obj, search_key):
	with tab_obj:
	subset = df_clean[df_clean["tier"] == tier_key].sort_values("total_clicks", ascending=False)
	search = st.text_input("🔍 Filter", key=search_key, placeholder="Filter by URL or title...")
	if search:
	mask = subset["url"].str.contains(search, case=False, na=False) \| \
	subset["title"].str.contains(search, case=False, na=False)
	subset = subset[mask]
	trend_filter = st.selectbox(
	"Trend filter", ["All", "📈 Growing", "📉 Declining", "➡️ Stable"],
	key=f"{search_key}_trend"
	)
	if trend_filter != "All":
	subset = subset[subset["trend"] == trend_filter]
	st.dataframe(subset[display_cols_base], use_container_width=True, height=400)
	st.caption(f"{len(subset)} blogs")

	render_tier_tab("MONITOR", tab_mon, "search_mon")
	render_tier_tab("PERFORMING", tab_perf, "search_perf")
	render_tier_tab("STRONG", tab_str, "search_str")
	render_tier_tab("TOP", tab_top, "search_top")

	st.markdown("---")

	# ══════════════════════════════════════════════════════════════════════════
	# STEP 5 — DOWNLOAD REPORT
	# ══════════════════════════════════════════════════════════════════════════
	st.markdown('<p class="section-header">Step 5 — Download Excel Report</p>', unsafe_allow_html=True)

	# Apply approvals to merge pairs before export
	approvals = st.session_state.get("merge_approvals", {})
	approved_pairs = [p for i, p in enumerate(merge_pairs) if approvals.get(i, True)]

	dl_col1, dl_col2 = st.columns([2, 3])
	with dl_col1:
	with st.spinner("Building Excel report..."):
	excel_bytes = build_excel(
	df_clean, approved_pairs, month_cols,
	site_name=st.session_state.get("site_name", "Website"),
	)
	filename = f"{site_name.replace(' ', '_').lower()}_blog_audit_report.xlsx"
	st.download_button(
	label="⬇️ Download 7-Tab Excel Report",
	data=excel_bytes,
	file_name=filename,
	mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	type="primary",
	use_container_width=True,
	)

	with dl_col2:
	st.markdown(f"""
	Report includes 7 tabs:
	- 📊 Summary Dashboard
	- 🔴 Take Down ({counts['TAKEDOWN_ZERO'] + counts['TAKEDOWN_LOW']} blogs)
	- 🔵 Merge Recommendations ({len(approved_pairs)} approved pairs)
	- 🟡 Monitor · ✅ Performing · 💪 Strong · 🏆 Top Performers
	""")
	st.caption(f"Model used: {st.session_state.get('model_label','—')}")

	elif not st.session_state["analysis_done"]:
	st.markdown('<p class="section-header">Step 4 — Results Dashboard</p>', unsafe_allow_html=True)
	st.info("⬆️ Complete Steps 1–3 above to see results here.")