Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- utils/data_processor.py +169 -0
- utils/embeddings.py +94 -0
- utils/excel_builder.py +256 -0
utils/data_processor.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
# Fixed non-click columns: headers that must never be treated as monthly
# click data. Matched case-insensitively against incoming CSV headers
# (see detect_columns).
NON_CLICK_COLS = {"date", "month", "year", "blogs", "h1", "meta title",
                  "meta description", "total clicks", "total_clicks"}
|
| 7 |
+
|
| 8 |
+
# Click thresholds (all inclusive) that drive tier assignment in _assign_tier.
TIER_CONFIG = {
    "takedown_zero_max": 0,     # exactly zero total clicks -> TAKEDOWN_ZERO
    "takedown_low_max": 5,      # 1-5 total clicks, and ...
    "takedown_low_monthly": 2,  # ... no single month above this -> TAKEDOWN_LOW
    "monitor_min": 6,
    "monitor_max": 20,          # 6-20 total clicks -> MONITOR
    "performing_min": 21,
    "performing_max": 100,      # 21-100 total clicks -> PERFORMING
    "strong_min": 101,
    "strong_max": 500,          # 101-500 total clicks -> STRONG
    "top_min": 501,             # 501+ total clicks -> TOP
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def detect_columns(df: pd.DataFrame) -> dict:
    """Auto-detect URL, title, month, and total-clicks columns of a CSV.

    Returns a dict with keys: url_col, title_col, month_cols, total_col.
    url_col / title_col / total_col may be None when no match is found;
    month_cols is a (possibly empty) list of column names.
    """
    by_lower = {name.lower(): name for name in df.columns}

    # URL column: prefer well-known header names, otherwise fall back to the
    # first column whose leading values look like http(s) URLs.
    url_col = by_lower.get("blogs") or by_lower.get("url") or by_lower.get("urls")
    if not url_col:
        for name in df.columns:
            head = df[name].dropna().astype(str).head(5)
            if head.str.startswith("http").any():
                url_col = name
                break

    # Title column: 'H1', 'Title', or 'Meta Title', in that order.
    title_col = by_lower.get("h1") or by_lower.get("title") or by_lower.get("meta title")

    # Total-clicks column (either spelling).
    total_col = by_lower.get("total clicks") or by_lower.get("total_clicks")

    # Month columns: numeric (or mostly numeric) columns outside the known
    # non-click header set.
    month_cols = [
        name
        for name in df.columns
        if name.lower() not in NON_CLICK_COLS
        and (pd.api.types.is_numeric_dtype(df[name]) or _is_mostly_numeric(df[name]))
    ]

    # The totals column is numeric too, so it can sneak into the scan; drop it.
    if total_col and total_col in month_cols:
        month_cols.remove(total_col)

    return {
        "url_col": url_col,
        "title_col": title_col,
        "month_cols": month_cols,
        "total_col": total_col,
    }
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _is_mostly_numeric(series: pd.Series, threshold=0.7) -> bool:
    """True when at least `threshold` of the values coerce to numbers.

    Non-convertible entries become NaN via errors="coerce" and count
    against the ratio. An empty series is treated as non-numeric
    (denominator clamped to 1 to avoid division by zero).
    """
    numeric_count = pd.to_numeric(series, errors="coerce").notna().sum()
    return numeric_count / max(len(series), 1) >= threshold
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def clean_and_tier(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
    """Build the clean analysis frame: url/title/slug, integer monthly
    clicks, total_clicks, plus derived trend, tier, and action columns.

    `col_map` is the mapping produced by detect_columns.
    """
    url_col, title_col = col_map["url_col"], col_map["title_col"]
    month_cols, total_col = col_map["month_cols"], col_map["total_col"]

    result = pd.DataFrame()
    result["url"] = df[url_col].astype(str).str.strip()
    # Fall back to the URL itself when no title column was detected.
    if title_col:
        result["title"] = df[title_col].astype(str).str.strip()
    else:
        result["title"] = result["url"]

    def _slug(url: str) -> str:
        # Prefer the path after /blog/, else the last path segment.
        if "/blog/" in url:
            return url.split("/blog/")[-1]
        return url.rstrip("/").split("/")[-1]

    result["slug"] = result["url"].apply(_slug)

    # Coerce every month column to integer clicks; blanks/junk become 0.
    for month in month_cols:
        result[month] = pd.to_numeric(df[month], errors="coerce").fillna(0).astype(int)

    # Use the provided totals column when present, else sum the months.
    if total_col:
        result["total_clicks"] = pd.to_numeric(df[total_col], errors="coerce").fillna(0).astype(int)
    else:
        result["total_clicks"] = result[month_cols].sum(axis=1)

    # Derived columns: traffic direction, performance tier, next step.
    result["trend"] = result.apply(lambda row: _get_trend(row, month_cols), axis=1)
    result["tier"] = result.apply(lambda row: _assign_tier(row, month_cols), axis=1)
    result["action"] = result.apply(
        lambda row: _get_action(row["tier"], row["total_clicks"]), axis=1
    )
    return result
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _get_trend(row, month_cols):
    """Classify traffic direction by comparing the first and last month."""
    if len(month_cols) < 2:
        # Not enough history to compare.
        return "➡️ Stable"
    first, last = row[month_cols[0]], row[month_cols[-1]]
    if last > first:
        return "📈 Growing"
    if last < first:
        return "📉 Declining"
    return "➡️ Stable"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _assign_tier(row, month_cols):
    """Assign a performance tier from total clicks and the monthly peak.

    Thresholds come from TIER_CONFIG. A blog with 1-5 total clicks is
    TAKEDOWN_LOW only when no single month exceeded the low-monthly cap;
    a blog in that range with a monthly spike is classified MONITOR.

    Bug fix: previously such spiked 1-5 click blogs fell through every
    branch to the trailing `return "PERFORMING"`, mislabelling a 3-click
    blog as "Performing (21-100 clicks)".
    """
    total = row["total_clicks"]
    peak_month = max((row[m] for m in month_cols), default=0)

    if total == 0:
        return "TAKEDOWN_ZERO"
    if total <= TIER_CONFIG["takedown_low_max"]:
        if peak_month <= TIER_CONFIG["takedown_low_monthly"]:
            return "TAKEDOWN_LOW"
        # 1-5 total clicks but a notable single-month spike: keep watching.
        return "MONITOR"
    # Tiers are contiguous, so upper-bound checks suffice from here on.
    if total <= TIER_CONFIG["monitor_max"]:
        return "MONITOR"
    if total <= TIER_CONFIG["performing_max"]:
        return "PERFORMING"
    if total <= TIER_CONFIG["strong_max"]:
        return "STRONG"
    return "TOP"
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _get_action(tier, clicks):
    """Return the recommended next step for a blog in the given tier."""
    # High end of MONITOR gets a more encouraging, specific prompt.
    if tier == "MONITOR" and clicks >= 15:
        return "Good potential. Optimize meta description and add 2–3 internal links to improve CTR."
    playbook = {
        "TAKEDOWN_ZERO": "Remove immediately. Zero organic traction across all months. Set up 410 or 301 redirect.",
        "TAKEDOWN_LOW": "Merge into a stronger related article or remove. Implement 301 redirect.",
        "MONITOR": "Optimize meta title, description & keywords. Review in 90 days. Merge if no improvement.",
        "PERFORMING": "Refresh content, strengthen internal links, add FAQ schema. Push for top 50 clicks.",
        "STRONG": "Update statistics & examples. Add lead gen CTA. Build backlinks to reach 500+ clicks.",
        "TOP": "Priority asset. Add lead magnets, improve CTAs, build backlinks. Protect rankings.",
    }
    return playbook.get(tier, "Review manually.")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def get_tier_summary(df: pd.DataFrame) -> dict:
    """Count blogs per tier, always reporting every known tier (0 if absent)."""
    tally = df["tier"].value_counts()
    tiers = ["TAKEDOWN_ZERO", "TAKEDOWN_LOW", "MONITOR", "PERFORMING", "STRONG", "TOP"]
    return {tier: int(tally.get(tier, 0)) for tier in tiers}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Display metadata per tier: UI label and accent color (hex) used by the app.
TIER_META = {
    "TAKEDOWN_ZERO": {"label": "🔴 Take Down (0 clicks)", "color": "#FF4B4B"},
    "TAKEDOWN_LOW": {"label": "🟠 Take Down (1–5 clicks)", "color": "#FF8C00"},
    "MONITOR": {"label": "🟡 Monitor (6–20 clicks)", "color": "#FFC300"},
    "PERFORMING": {"label": "✅ Performing (21–100 clicks)", "color": "#2ECC71"},
    "STRONG": {"label": "💪 Strong (101–500 clicks)", "color": "#1ABC9C"},
    "TOP": {"label": "🏆 Top Performers (500+ clicks)", "color": "#9B59B6"},
}
|
utils/embeddings.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
+
import streamlit as st
|
| 5 |
+
|
| 6 |
+
# Sentence-transformer checkpoint used to embed blog titles.
MODEL_NAME = "all-MiniLM-L6-v2"  # ~90 MB, English-optimised, fast
# Minimum cosine similarity for two titles to count as a merge candidate.
SIMILARITY_THRESHOLD = 0.72
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@st.cache_resource(show_spinner=False)
def load_embedder():
    """Load the sentence-transformer model once and cache it for the session.

    Returns:
        SentenceTransformer: the shared MODEL_NAME encoder instance
        (cached process-wide by Streamlit's resource cache).
    """
    return SentenceTransformer(MODEL_NAME)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Generate sentence embeddings for a list of titles.

    Args:
        titles: blog titles to encode.
        embedder: a loaded SentenceTransformer (see load_embedder).

    Returns:
        One embedding row per title, as a NumPy array.
    """
    # batch_size=64 bounds memory on large title lists; no progress bar in UI.
    return embedder.encode(titles, show_progress_bar=False, batch_size=64)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def find_merge_candidates(
    df,
    threshold: float = SIMILARITY_THRESHOLD,
    max_weak_clicks: int = 200,
) -> list[dict]:
    """
    Compare all blog titles using cosine similarity.

    Returns one pair per weak blog, where:
      - the weak blog has <= max_weak_clicks total clicks,
      - the strong blog has strictly more clicks than the weak one,
      - title cosine similarity is >= threshold.
    The single best-matching strong blog is chosen per weak blog, and the
    result list is sorted by similarity, descending. Each dict also carries
    empty "topic_cluster"/"merge_reason" placeholders (filled in by the LLM
    step later) and "approved": True (default until the user toggles).

    Fixes vs. the original: removed the unused `weak_titles` list and the
    dead `seen_weak` bookkeeping (weak indices are unique, so the guard
    could never fire); selection uses max() instead of sorting the full
    candidate list; the weak-row mask is resolved positionally so a
    non-RangeIndex dataframe no longer mis-indexes the title/click lists.
    """
    embedder = load_embedder()

    all_titles = df["title"].tolist()
    all_clicks = df["total_clicks"].tolist()
    all_urls = df["url"].tolist()

    # Candidate pool: blogs with low clicks only. Positional indices, so
    # they line up with the .tolist() views above regardless of df.index.
    weak_mask = (df["total_clicks"] <= max_weak_clicks).to_numpy()
    weak_idx = [int(i) for i in np.flatnonzero(weak_mask)]
    if not weak_idx:
        return []

    # Embed every title once; slice out the weak rows for the comparison.
    all_emb = get_embeddings(all_titles, embedder)
    weak_emb = np.asarray([all_emb[i] for i in weak_idx])
    sim_matrix = cosine_similarity(weak_emb, all_emb)

    pairs = []
    for row_i, wi in enumerate(weak_idx):
        sims = sim_matrix[row_i]

        # Eligible "strong" targets: similar enough, strictly more clicks.
        candidates = [
            (j, float(sims[j]))
            for j in range(len(all_titles))
            if j != wi and sims[j] >= threshold and all_clicks[j] > all_clicks[wi]
        ]
        if not candidates:
            continue

        # Report only the single best match per weak blog.
        best_j, best_score = max(candidates, key=lambda pair: pair[1])
        pairs.append({
            "weak_url": all_urls[wi],
            "weak_title": all_titles[wi],
            "weak_clicks": all_clicks[wi],
            "strong_url": all_urls[best_j],
            "strong_title": all_titles[best_j],
            "strong_clicks": all_clicks[best_j],
            "similarity": round(best_score, 4),
            # Placeholders — filled in by the LLM step later.
            "topic_cluster": "",
            "merge_reason": "",
            "approved": True,  # default approved until the user toggles
        })

    # Most similar pairs first.
    pairs.sort(key=lambda p: p["similarity"], reverse=True)
    return pairs
|
utils/excel_builder.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
from openpyxl import Workbook
|
| 3 |
+
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
| 4 |
+
from openpyxl.utils import get_column_letter
|
| 5 |
+
|
| 6 |
+
# ── Shared styles ──────────────────────────────────────────────────────────────
# Light grey hairline used on all four sides of every cell.
_thin = Side(style="thin", color="CCCCCC")
BORDER = Border(left=_thin, right=_thin, top=_thin, bottom=_thin)
NORMAL = Font(name="Arial", size=10)  # body-cell text
HDR_FNT = Font(bold=True, color="FFFFFF", name="Arial", size=11)  # white header text
LEFT = Alignment(horizontal="left", vertical="center", wrap_text=True)
CENTER = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _fill(hex_color: str) -> PatternFill:
    """Return a solid fill in the given RRGGBB hex color."""
    return PatternFill(fill_type="solid", start_color=hex_color, end_color=hex_color)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Named fills used by the tab builders: light shades for data rows,
# dark "hdr_*" shades for header rows (paired with white HDR_FNT text).
FILLS = {
    "red": _fill("FFDCE0"),
    "orange": _fill("FFE5CC"),
    "yellow": _fill("FFFACD"),
    "yellow2": _fill("FFF2A0"),
    "blue": _fill("DCE6F1"),
    "blue2": _fill("BDD7EE"),
    "purple": _fill("E8D5F5"),
    "green": _fill("E2EFDA"),
    "green2": _fill("D9EAD3"),
    "dkgreen": _fill("C6EFCE"),
    "gold": _fill("FFF2CC"),
    "gold2": _fill("FFE599"),
    # Header (dark) variants.
    "hdr_red": _fill("C00000"),
    "hdr_navy": _fill("1F4E79"),
    "hdr_olive": _fill("7B6000"),
    "hdr_grn": _fill("375623"),
    "hdr_blue": _fill("0070C0"),
    "hdr_gold": _fill("7F6000"),
    "hdr_purple": _fill("5B2C8D"),
}
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _add_header(ws, headers: list, fill_key: str, row_h: int = 30):
    """Write `headers` as the styled first row: dark fill, white bold text."""
    ws.append(headers)
    header_fill = FILLS[fill_key]
    for cell in ws[1]:
        cell.font = HDR_FNT
        cell.fill = header_fill
        cell.alignment = CENTER
        cell.border = BORDER
    ws.row_dimensions[1].height = row_h
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _style_row(ws, row_idx: int, fill_a: str, fill_b: str | None = None, i: int = 0):
    """Style one data row.

    When fill_b is given, rows are banded: odd values of `i` take fill_b,
    even values take fill_a.
    """
    banded = bool(fill_b) and i % 2 == 1
    row_fill = FILLS[fill_b] if banded else FILLS[fill_a]
    for cell in ws[row_idx]:
        cell.font = NORMAL
        cell.fill = row_fill
        cell.alignment = LEFT
        cell.border = BORDER
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _set_widths(ws, widths: dict):
    """Apply a {column letter: width} mapping to the sheet."""
    for letter, width in widths.items():
        ws.column_dimensions[letter].width = width
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _month_col_widths(month_cols: list) -> dict:
    """Map each month column's letter to width 9, starting at D (column 4)."""
    widths = {}
    for offset in range(len(month_cols)):
        widths[get_column_letter(4 + offset)] = 9
    return widths
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ── Public builder ─────────────────────────────────────────────────────────────
def build_excel(df, merge_pairs: list[dict], month_cols: list, site_name: str = "Website") -> bytes:
    """
    Build the full 7-tab Excel workbook and return it as bytes for download.

    Tabs: Summary Dashboard, Take Down, Merge Recommendations, Monitor,
    Performing, Strong, Top Performers.
    """
    wb = Workbook()

    # Tab 1: summary dashboard (reuses the workbook's default first sheet).
    summary_ws = wb.active
    summary_ws.title = "Summary Dashboard"
    _build_summary(summary_ws, df, merge_pairs, month_cols, site_name)

    # Tab 2: take-down candidates, with an extra Severity column.
    _build_tier_tab(
        wb.create_sheet("Take Down"), df, month_cols,
        tiers=["TAKEDOWN_ZERO", "TAKEDOWN_LOW"],
        fill_map={"TAKEDOWN_ZERO": "red", "TAKEDOWN_LOW": "orange"},
        hdr_key="hdr_red",
        severity_map={"TAKEDOWN_ZERO": "CRITICAL – 0 Clicks", "TAKEDOWN_LOW": "HIGH – 1–5 Clicks"},
    )

    # Tab 3: AI-detected merge pairs.
    _build_merge_tab(wb.create_sheet("Merge Recommendations"), merge_pairs)

    # Tabs 4-7: one colour-coded listing per remaining tier.
    simple_tabs = [
        ("Monitor (6–20 Clicks)", "MONITOR", "yellow", "yellow2", "hdr_olive"),
        ("Performing (21–100 Clicks)", "PERFORMING", "green", "green2", "hdr_grn"),
        ("Strong (101–500 Clicks)", "STRONG", "blue", "blue2", "hdr_blue"),
        ("Top Performers (500+ Clicks)", "TOP", "gold", "gold2", "hdr_gold"),
    ]
    for sheet_title, tier, fill_a, fill_b, hdr in simple_tabs:
        _build_simple_tier(wb.create_sheet(sheet_title), df, month_cols, tier, fill_a, fill_b, hdr)

    # Serialize to an in-memory buffer and hand back the raw bytes.
    buf = io.BytesIO()
    wb.save(buf)
    buf.seek(0)
    return buf.read()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ββ Tab builders ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 121 |
+
|
| 122 |
+
def _build_summary(ws, df, merge_pairs, month_cols, site_name):
    """Populate the Summary Dashboard tab.

    Layout (hard-coded cell coordinates):
      row 1      banner title (merged A1:E1)
      row 2      dataset stats (merged A2:E2)
      row 3      tier-table header
      rows 4-10  one row per tier category
      row 11     blank spacer
      row 12     "Top 10" section title (merged A12:E12)
      row 13     Top-10 table header
      rows 14+   top-10 blogs by total clicks
    """
    # Banner title across A1:E1.
    ws["A1"] = f"Blog Audit Report – {site_name}"
    ws["A1"].font = Font(bold=True, name="Arial", size=16, color="1F4E79")
    ws["A1"].alignment = CENTER
    ws.merge_cells("A1:E1")
    ws.row_dimensions[1].height = 36

    # Sub-header with dataset stats.
    ws["A2"] = f"Total Blogs: {len(df)} | Months Analyzed: {len(month_cols)}"
    ws["A2"].font = Font(italic=True, name="Arial", size=10, color="808080")
    ws["A2"].alignment = CENTER
    ws.merge_cells("A2:E2")

    # Header row for the tier-count table (row 3).
    for j, h in enumerate(["Category", "Count", "Action Required", "Description"], 1):
        c = ws.cell(row=3, column=j, value=h)
        c.font = HDR_FNT; c.fill = FILLS["hdr_navy"]; c.alignment = CENTER; c.border = BORDER
    ws.row_dimensions[3].height = 28

    # Imported here (not at module top) to avoid a circular import with
    # utils.data_processor — NOTE(review): presumed reason; confirm.
    from utils.data_processor import get_tier_summary
    counts = get_tier_summary(df)

    # (category label, count, action, description, fill key) per summary row.
    rows = [
        ("🔴 TAKE DOWN – Zero Clicks", counts["TAKEDOWN_ZERO"], "Remove", "No traffic at all. Immediate removal recommended.", "red"),
        ("🟠 TAKE DOWN – 1–5 Clicks", counts["TAKEDOWN_LOW"], "Remove / Merge", "Negligible traffic with no recovery signal.", "orange"),
        ("🔵 MERGE – AI Detected Pairs", len(merge_pairs), "Merge + 301 Redirect", "Consolidate into stronger related articles.", "blue"),
        ("🟡 MONITOR – 6–20 Clicks", counts["MONITOR"], "Optimize & Monitor", "Underperforming. Optimize and review in 90 days.", "yellow"),
        ("✅ PERFORMING – 21–100 Clicks", counts["PERFORMING"], "Maintain & Optimize", "Acceptable performance. Strengthen meta and links.", "green"),
        ("💪 STRONG – 101–500 Clicks", counts["STRONG"], "Strengthen", "Good performance. Freshen content and build backlinks.", "dkgreen"),
        ("🏆 TOP PERFORMERS – 500+ Clicks", counts["TOP"], "Priority Investment", "Star content. CTAs, lead magnets, backlink outreach.", "gold2"),
    ]

    # Tier-count rows occupy rows 4-10.
    for i, (cat, cnt, act, desc, fk) in enumerate(rows):
        r = i + 4
        for j, val in enumerate([cat, cnt, act, desc], 1):
            c = ws.cell(row=r, column=j, value=val)
            c.font = NORMAL; c.fill = FILLS[fk]; c.alignment = LEFT; c.border = BORDER
        ws.row_dimensions[r].height = 22

    # Top-10 section header (row 12).
    ws["A12"] = "Top 10 Performing Blogs"
    ws["A12"].font = Font(bold=True, name="Arial", size=13, color="1F4E79")
    ws.merge_cells("A12:E12")
    ws.row_dimensions[12].height = 26

    # Top-10 table header (row 13).
    for j, h in enumerate(["#", "Blog URL", "Title", "Total Clicks", "Trend"], 1):
        c = ws.cell(row=13, column=j, value=h)
        c.font = HDR_FNT; c.fill = FILLS["hdr_grn"]; c.alignment = CENTER; c.border = BORDER
    ws.row_dimensions[13].height = 28

    # Top-10 data rows start at row 14.
    top10 = df.nlargest(10, "total_clicks")
    for i, (_, row) in enumerate(top10.iterrows()):
        r = 14 + i
        for j, val in enumerate([i + 1, row["url"], row["title"], row["total_clicks"], row["trend"]], 1):
            c = ws.cell(row=r, column=j, value=val)
            c.font = NORMAL; c.fill = FILLS["dkgreen"]; c.alignment = LEFT; c.border = BORDER

    _set_widths(ws, {"A": 42, "B": 12, "C": 22, "D": 65, "E": 16})
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _build_tier_tab(ws, df, month_cols, tiers, fill_map, hdr_key, severity_map=None):
    """List every blog whose tier is in `tiers`, ascending by total clicks.

    When severity_map is given, a Severity column (derived from the row's
    tier) is inserted before the Recommended Action column.
    """
    severity_hdr = ["Severity"] if severity_map else []
    _add_header(
        ws,
        ["#", "Blog URL", "Title"] + month_cols + ["Total Clicks", "Trend"] + severity_hdr + ["Recommended Action"],
        hdr_key,
    )

    listing = df[df["tier"].isin(tiers)].sort_values("total_clicks")
    for i, (_, row) in enumerate(listing.iterrows()):
        values = [i + 1, row["url"], row["title"]]
        values += [row[m] for m in month_cols]
        values += [row["total_clicks"], row["trend"]]
        if severity_map:
            values.append(severity_map[row["tier"]])
        values.append(row["action"])
        ws.append(values)
        _style_row(ws, i + 2, fill_map.get(row["tier"], "orange"))

    # Column widths: fixed #/URL/Title, 9 per month column, then the tail.
    n = len(month_cols)
    widths = {"A": 5, "B": 55, "C": 50}
    widths.update(_month_col_widths(month_cols))
    widths[get_column_letter(4 + n)] = 12   # Total Clicks
    widths[get_column_letter(5 + n)] = 14   # Trend
    if severity_map:
        widths[get_column_letter(6 + n)] = 22   # Severity
        widths[get_column_letter(7 + n)] = 60   # Recommended Action
    else:
        widths[get_column_letter(6 + n)] = 60   # Recommended Action
    _set_widths(ws, widths)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _build_merge_tab(ws, merge_pairs):
    """One row per AI-suggested merge pair, with blue/purple banding."""
    _add_header(ws, ["#", "Weak Blog URL (Merge FROM)", "Weak Title", "Weak Clicks",
                     "Strong Blog URL (Merge INTO)", "Strong Title", "Strong Clicks",
                     "Similarity", "Topic Cluster", "AI Merge Reason"], "hdr_navy")

    # (dict key, default) per output column, in sheet order after "#".
    fields = [("weak_url", ""), ("weak_title", ""), ("weak_clicks", 0),
              ("strong_url", ""), ("strong_title", ""), ("strong_clicks", 0),
              ("similarity", ""), ("topic_cluster", ""), ("merge_reason", "")]
    for i, pair in enumerate(merge_pairs):
        ws.append([i + 1] + [pair.get(key, default) for key, default in fields])
        _style_row(ws, i + 2, "blue" if i % 2 == 0 else "purple")

    _set_widths(ws, {"A": 5, "B": 52, "C": 42, "D": 10,
                     "E": 52, "F": 42, "G": 10, "H": 11,
                     "I": 22, "J": 65})
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def _build_simple_tier(ws, df, month_cols, tier_key, fill_a, fill_b, hdr_key):
    """One row per blog in `tier_key`, highest clicks first, banded rows."""
    _add_header(
        ws,
        ["#", "Blog URL", "Title"] + month_cols + ["Total Clicks", "Trend", "Recommended Action"],
        hdr_key,
    )

    ranked = df[df["tier"] == tier_key].sort_values("total_clicks", ascending=False)
    for i, (_, row) in enumerate(ranked.iterrows()):
        line = [i + 1, row["url"], row["title"]]
        line += [row[m] for m in month_cols]
        line += [row["total_clicks"], row["trend"], row["action"]]
        ws.append(line)
        _style_row(ws, i + 2, fill_a, fill_b, i)

    # Column widths: fixed #/URL/Title, 9 per month column, then the tail.
    n = len(month_cols)
    widths = {"A": 5, "B": 55, "C": 50}
    widths.update(_month_col_widths(month_cols))
    widths[get_column_letter(4 + n)] = 12   # Total Clicks
    widths[get_column_letter(5 + n)] = 14   # Trend
    widths[get_column_letter(6 + n)] = 60   # Recommended Action
    _set_widths(ws, widths)
|