Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- utils/data_processor.py +169 -0
- utils/embeddings.py +94 -0
- utils/excel_builder.py +256 -0
utils/data_processor.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
# Fixed non-click columns: headers that must never be treated as monthly
# click data. Matched case-insensitively against incoming CSV headers
# (see detect_columns).
NON_CLICK_COLS = {"date", "month", "year", "blogs", "h1", "meta title",
                  "meta description", "total clicks", "total_clicks"}
|
| 7 |
+
|
| 8 |
+
# Click thresholds (all inclusive) that drive tier assignment in _assign_tier.
TIER_CONFIG = {
    "takedown_zero_max": 0,     # exactly zero total clicks -> TAKEDOWN_ZERO
    "takedown_low_max": 5,      # 1-5 total clicks, and ...
    "takedown_low_monthly": 2,  # ... no single month above this -> TAKEDOWN_LOW
    "monitor_min": 6,
    "monitor_max": 20,          # 6-20 total clicks -> MONITOR
    "performing_min": 21,
    "performing_max": 100,      # 21-100 total clicks -> PERFORMING
    "strong_min": 101,
    "strong_max": 500,          # 101-500 total clicks -> STRONG
    "top_min": 501,             # 501+ total clicks -> TOP
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def detect_columns(df: pd.DataFrame) -> dict:
    """Auto-detect URL, title, month, and total-clicks columns of a CSV.

    Returns a dict with keys: url_col, title_col, month_cols, total_col.
    url_col / title_col / total_col may be None when no match is found;
    month_cols is a (possibly empty) list of column names.
    """
    by_lower = {name.lower(): name for name in df.columns}

    # URL column: prefer well-known header names, otherwise fall back to the
    # first column whose leading values look like http(s) URLs.
    url_col = by_lower.get("blogs") or by_lower.get("url") or by_lower.get("urls")
    if not url_col:
        for name in df.columns:
            head = df[name].dropna().astype(str).head(5)
            if head.str.startswith("http").any():
                url_col = name
                break

    # Title column: 'H1', 'Title', or 'Meta Title', in that order.
    title_col = by_lower.get("h1") or by_lower.get("title") or by_lower.get("meta title")

    # Total-clicks column (either spelling).
    total_col = by_lower.get("total clicks") or by_lower.get("total_clicks")

    # Month columns: numeric (or mostly numeric) columns outside the known
    # non-click header set.
    month_cols = [
        name
        for name in df.columns
        if name.lower() not in NON_CLICK_COLS
        and (pd.api.types.is_numeric_dtype(df[name]) or _is_mostly_numeric(df[name]))
    ]

    # The totals column is numeric too, so it can sneak into the scan; drop it.
    if total_col and total_col in month_cols:
        month_cols.remove(total_col)

    return {
        "url_col": url_col,
        "title_col": title_col,
        "month_cols": month_cols,
        "total_col": total_col,
    }
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _is_mostly_numeric(series: pd.Series, threshold=0.7) -> bool:
    """True when at least `threshold` of the values coerce to numbers.

    Non-convertible entries become NaN via errors="coerce" and count
    against the ratio. An empty series is treated as non-numeric
    (denominator clamped to 1 to avoid division by zero).
    """
    numeric_count = pd.to_numeric(series, errors="coerce").notna().sum()
    return numeric_count / max(len(series), 1) >= threshold
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def clean_and_tier(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
    """Build the clean analysis frame: url/title/slug, integer monthly
    clicks, total_clicks, plus derived trend, tier, and action columns.

    `col_map` is the mapping produced by detect_columns.
    """
    url_col, title_col = col_map["url_col"], col_map["title_col"]
    month_cols, total_col = col_map["month_cols"], col_map["total_col"]

    result = pd.DataFrame()
    result["url"] = df[url_col].astype(str).str.strip()
    # Fall back to the URL itself when no title column was detected.
    if title_col:
        result["title"] = df[title_col].astype(str).str.strip()
    else:
        result["title"] = result["url"]

    def _slug(url: str) -> str:
        # Prefer the path after /blog/, else the last path segment.
        if "/blog/" in url:
            return url.split("/blog/")[-1]
        return url.rstrip("/").split("/")[-1]

    result["slug"] = result["url"].apply(_slug)

    # Coerce every month column to integer clicks; blanks/junk become 0.
    for month in month_cols:
        result[month] = pd.to_numeric(df[month], errors="coerce").fillna(0).astype(int)

    # Use the provided totals column when present, else sum the months.
    if total_col:
        result["total_clicks"] = pd.to_numeric(df[total_col], errors="coerce").fillna(0).astype(int)
    else:
        result["total_clicks"] = result[month_cols].sum(axis=1)

    # Derived columns: traffic direction, performance tier, next step.
    result["trend"] = result.apply(lambda row: _get_trend(row, month_cols), axis=1)
    result["tier"] = result.apply(lambda row: _assign_tier(row, month_cols), axis=1)
    result["action"] = result.apply(
        lambda row: _get_action(row["tier"], row["total_clicks"]), axis=1
    )
    return result
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _get_trend(row, month_cols):
    """Classify traffic direction by comparing the first and last month."""
    if len(month_cols) < 2:
        # Not enough history to compare.
        return "➡️ Stable"
    first, last = row[month_cols[0]], row[month_cols[-1]]
    if last > first:
        return "📈 Growing"
    if last < first:
        return "📉 Declining"
    return "➡️ Stable"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _assign_tier(row, month_cols):
    """Assign a performance tier from total clicks and the monthly peak.

    Thresholds come from TIER_CONFIG. A blog with 1-5 total clicks is
    TAKEDOWN_LOW only when no single month exceeded the low-monthly cap;
    a blog in that range with a monthly spike is classified MONITOR.

    Bug fix: previously such spiked 1-5 click blogs fell through every
    branch to the trailing `return "PERFORMING"`, mislabelling a 3-click
    blog as "Performing (21-100 clicks)".
    """
    total = row["total_clicks"]
    peak_month = max((row[m] for m in month_cols), default=0)

    if total == 0:
        return "TAKEDOWN_ZERO"
    if total <= TIER_CONFIG["takedown_low_max"]:
        if peak_month <= TIER_CONFIG["takedown_low_monthly"]:
            return "TAKEDOWN_LOW"
        # 1-5 total clicks but a notable single-month spike: keep watching.
        return "MONITOR"
    # Tiers are contiguous, so upper-bound checks suffice from here on.
    if total <= TIER_CONFIG["monitor_max"]:
        return "MONITOR"
    if total <= TIER_CONFIG["performing_max"]:
        return "PERFORMING"
    if total <= TIER_CONFIG["strong_max"]:
        return "STRONG"
    return "TOP"
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _get_action(tier, clicks):
    """Return the recommended next step for a blog in the given tier."""
    # High end of MONITOR gets a more encouraging, specific prompt.
    if tier == "MONITOR" and clicks >= 15:
        return "Good potential. Optimize meta description and add 2–3 internal links to improve CTR."
    playbook = {
        "TAKEDOWN_ZERO": "Remove immediately. Zero organic traction across all months. Set up 410 or 301 redirect.",
        "TAKEDOWN_LOW": "Merge into a stronger related article or remove. Implement 301 redirect.",
        "MONITOR": "Optimize meta title, description & keywords. Review in 90 days. Merge if no improvement.",
        "PERFORMING": "Refresh content, strengthen internal links, add FAQ schema. Push for top 50 clicks.",
        "STRONG": "Update statistics & examples. Add lead gen CTA. Build backlinks to reach 500+ clicks.",
        "TOP": "Priority asset. Add lead magnets, improve CTAs, build backlinks. Protect rankings.",
    }
    return playbook.get(tier, "Review manually.")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def get_tier_summary(df: pd.DataFrame) -> dict:
    """Count blogs per tier, always reporting every known tier (0 if absent)."""
    tally = df["tier"].value_counts()
    tiers = ["TAKEDOWN_ZERO", "TAKEDOWN_LOW", "MONITOR", "PERFORMING", "STRONG", "TOP"]
    return {tier: int(tally.get(tier, 0)) for tier in tiers}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Display metadata per tier: UI label and accent color (hex) used by the app.
TIER_META = {
    "TAKEDOWN_ZERO": {"label": "🔴 Take Down (0 clicks)", "color": "#FF4B4B"},
    "TAKEDOWN_LOW": {"label": "🟠 Take Down (1–5 clicks)", "color": "#FF8C00"},
    "MONITOR": {"label": "🟡 Monitor (6–20 clicks)", "color": "#FFC300"},
    "PERFORMING": {"label": "✅ Performing (21–100 clicks)", "color": "#2ECC71"},
    "STRONG": {"label": "💪 Strong (101–500 clicks)", "color": "#1ABC9C"},
    "TOP": {"label": "🏆 Top Performers (500+ clicks)", "color": "#9B59B6"},
}
|
utils/embeddings.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
+
import streamlit as st
|
| 5 |
+
|
| 6 |
+
# Sentence-transformer checkpoint used to embed blog titles.
MODEL_NAME = "all-MiniLM-L6-v2"  # ~90 MB, English-optimised, fast
# Minimum cosine similarity for two titles to count as a merge candidate.
SIMILARITY_THRESHOLD = 0.72
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@st.cache_resource(show_spinner=False)
def load_embedder():
    """Load the sentence-transformer model once and cache it for the session.

    Returns:
        SentenceTransformer: the shared MODEL_NAME encoder instance
        (cached process-wide by Streamlit's resource cache).
    """
    return SentenceTransformer(MODEL_NAME)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_embeddings(titles: list[str], embedder) -> np.ndarray:
    """Generate sentence embeddings for a list of titles.

    Args:
        titles: blog titles to encode.
        embedder: a loaded SentenceTransformer (see load_embedder).

    Returns:
        One embedding row per title, as a NumPy array.
    """
    # batch_size=64 bounds memory on large title lists; no progress bar in UI.
    return embedder.encode(titles, show_progress_bar=False, batch_size=64)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def find_merge_candidates(
    df,
    threshold: float = SIMILARITY_THRESHOLD,
    max_weak_clicks: int = 200,
) -> list[dict]:
    """
    Compare all blog titles using cosine similarity.

    Returns one pair per weak blog, where:
      - the weak blog has <= max_weak_clicks total clicks,
      - the strong blog has strictly more clicks than the weak one,
      - title cosine similarity is >= threshold.
    The single best-matching strong blog is chosen per weak blog, and the
    result list is sorted by similarity, descending. Each dict also carries
    empty "topic_cluster"/"merge_reason" placeholders (filled in by the LLM
    step later) and "approved": True (default until the user toggles).

    Fixes vs. the original: removed the unused `weak_titles` list and the
    dead `seen_weak` bookkeeping (weak indices are unique, so the guard
    could never fire); selection uses max() instead of sorting the full
    candidate list; the weak-row mask is resolved positionally so a
    non-RangeIndex dataframe no longer mis-indexes the title/click lists.
    """
    embedder = load_embedder()

    all_titles = df["title"].tolist()
    all_clicks = df["total_clicks"].tolist()
    all_urls = df["url"].tolist()

    # Candidate pool: blogs with low clicks only. Positional indices, so
    # they line up with the .tolist() views above regardless of df.index.
    weak_mask = (df["total_clicks"] <= max_weak_clicks).to_numpy()
    weak_idx = [int(i) for i in np.flatnonzero(weak_mask)]
    if not weak_idx:
        return []

    # Embed every title once; slice out the weak rows for the comparison.
    all_emb = get_embeddings(all_titles, embedder)
    weak_emb = np.asarray([all_emb[i] for i in weak_idx])
    sim_matrix = cosine_similarity(weak_emb, all_emb)

    pairs = []
    for row_i, wi in enumerate(weak_idx):
        sims = sim_matrix[row_i]

        # Eligible "strong" targets: similar enough, strictly more clicks.
        candidates = [
            (j, float(sims[j]))
            for j in range(len(all_titles))
            if j != wi and sims[j] >= threshold and all_clicks[j] > all_clicks[wi]
        ]
        if not candidates:
            continue

        # Report only the single best match per weak blog.
        best_j, best_score = max(candidates, key=lambda pair: pair[1])
        pairs.append({
            "weak_url": all_urls[wi],
            "weak_title": all_titles[wi],
            "weak_clicks": all_clicks[wi],
            "strong_url": all_urls[best_j],
            "strong_title": all_titles[best_j],
            "strong_clicks": all_clicks[best_j],
            "similarity": round(best_score, 4),
            # Placeholders — filled in by the LLM step later.
            "topic_cluster": "",
            "merge_reason": "",
            "approved": True,  # default approved until the user toggles
        })

    # Most similar pairs first.
    pairs.sort(key=lambda p: p["similarity"], reverse=True)
    return pairs
|
utils/excel_builder.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
from openpyxl import Workbook
|
| 3 |
+
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
| 4 |
+
from openpyxl.utils import get_column_letter
|
| 5 |
+
|
| 6 |
+
# ── Shared styles ──────────────────────────────────────────────────────────────
# Light grey hairline used on all four sides of every cell.
_thin = Side(style="thin", color="CCCCCC")
BORDER = Border(left=_thin, right=_thin, top=_thin, bottom=_thin)
NORMAL = Font(name="Arial", size=10)  # body-cell text
HDR_FNT = Font(bold=True, color="FFFFFF", name="Arial", size=11)  # white header text
LEFT = Alignment(horizontal="left", vertical="center", wrap_text=True)
CENTER = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _fill(hex_color: str) -> PatternFill:
    """Return a solid fill in the given RRGGBB hex color."""
    return PatternFill(fill_type="solid", start_color=hex_color, end_color=hex_color)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Named fills used by the tab builders: light shades for data rows,
# dark "hdr_*" shades for header rows (paired with white HDR_FNT text).
FILLS = {
    "red": _fill("FFDCE0"),
    "orange": _fill("FFE5CC"),
    "yellow": _fill("FFFACD"),
    "yellow2": _fill("FFF2A0"),
    "blue": _fill("DCE6F1"),
    "blue2": _fill("BDD7EE"),
    "purple": _fill("E8D5F5"),
    "green": _fill("E2EFDA"),
    "green2": _fill("D9EAD3"),
    "dkgreen": _fill("C6EFCE"),
    "gold": _fill("FFF2CC"),
    "gold2": _fill("FFE599"),
    # Header (dark) variants.
    "hdr_red": _fill("C00000"),
    "hdr_navy": _fill("1F4E79"),
    "hdr_olive": _fill("7B6000"),
    "hdr_grn": _fill("375623"),
    "hdr_blue": _fill("0070C0"),
    "hdr_gold": _fill("7F6000"),
    "hdr_purple": _fill("5B2C8D"),
}
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _add_header(ws, headers: list, fill_key: str, row_h: int = 30):
    """Write `headers` as the styled first row: dark fill, white bold text."""
    ws.append(headers)
    header_fill = FILLS[fill_key]
    for cell in ws[1]:
        cell.font = HDR_FNT
        cell.fill = header_fill
        cell.alignment = CENTER
        cell.border = BORDER
    ws.row_dimensions[1].height = row_h
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _style_row(ws, row_idx: int, fill_a: str, fill_b: str | None = None, i: int = 0):
    """Style one data row.

    When fill_b is given, rows are banded: odd values of `i` take fill_b,
    even values take fill_a.
    """
    banded = bool(fill_b) and i % 2 == 1
    row_fill = FILLS[fill_b] if banded else FILLS[fill_a]
    for cell in ws[row_idx]:
        cell.font = NORMAL
        cell.fill = row_fill
        cell.alignment = LEFT
        cell.border = BORDER
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _set_widths(ws, widths: dict):
    """Apply a {column letter: width} mapping to the sheet."""
    for letter, width in widths.items():
        ws.column_dimensions[letter].width = width
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _month_col_widths(month_cols: list) -> dict:
    """Map each month column's letter to width 9, starting at D (column 4)."""
    widths = {}
    for offset in range(len(month_cols)):
        widths[get_column_letter(4 + offset)] = 9
    return widths
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ── Public builder ─────────────────────────────────────────────────────────────
def build_excel(df, merge_pairs: list[dict], month_cols: list, site_name: str = "Website") -> bytes:
    """
    Build the full 7-tab Excel workbook and return it as bytes for download.

    Tabs: Summary Dashboard, Take Down, Merge Recommendations, Monitor,
    Performing, Strong, Top Performers.
    """
    wb = Workbook()

    # Tab 1: summary dashboard (reuses the workbook's default first sheet).
    summary_ws = wb.active
    summary_ws.title = "Summary Dashboard"
    _build_summary(summary_ws, df, merge_pairs, month_cols, site_name)

    # Tab 2: take-down candidates, with an extra Severity column.
    _build_tier_tab(
        wb.create_sheet("Take Down"), df, month_cols,
        tiers=["TAKEDOWN_ZERO", "TAKEDOWN_LOW"],
        fill_map={"TAKEDOWN_ZERO": "red", "TAKEDOWN_LOW": "orange"},
        hdr_key="hdr_red",
        severity_map={"TAKEDOWN_ZERO": "CRITICAL – 0 Clicks", "TAKEDOWN_LOW": "HIGH – 1–5 Clicks"},
    )

    # Tab 3: AI-detected merge pairs.
    _build_merge_tab(wb.create_sheet("Merge Recommendations"), merge_pairs)

    # Tabs 4-7: one colour-coded listing per remaining tier.
    simple_tabs = [
        ("Monitor (6–20 Clicks)", "MONITOR", "yellow", "yellow2", "hdr_olive"),
        ("Performing (21–100 Clicks)", "PERFORMING", "green", "green2", "hdr_grn"),
        ("Strong (101–500 Clicks)", "STRONG", "blue", "blue2", "hdr_blue"),
        ("Top Performers (500+ Clicks)", "TOP", "gold", "gold2", "hdr_gold"),
    ]
    for sheet_title, tier, fill_a, fill_b, hdr in simple_tabs:
        _build_simple_tier(wb.create_sheet(sheet_title), df, month_cols, tier, fill_a, fill_b, hdr)

    # Serialize to an in-memory buffer and hand back the raw bytes.
    buf = io.BytesIO()
    wb.save(buf)
    buf.seek(0)
    return buf.read()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ββ Tab builders ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 121 |
+
|
| 122 |
+
def _build_summary(ws, df, merge_pairs, month_cols, site_name):
    """Populate the Summary Dashboard tab.

    Layout (hard-coded cell coordinates):
      row 1      banner title (merged A1:E1)
      row 2      dataset stats (merged A2:E2)
      row 3      tier-table header
      rows 4-10  one row per tier category
      row 11     blank spacer
      row 12     "Top 10" section title (merged A12:E12)
      row 13     Top-10 table header
      rows 14+   top-10 blogs by total clicks
    """
    # Banner title across A1:E1.
    ws["A1"] = f"Blog Audit Report – {site_name}"
    ws["A1"].font = Font(bold=True, name="Arial", size=16, color="1F4E79")
    ws["A1"].alignment = CENTER
    ws.merge_cells("A1:E1")
    ws.row_dimensions[1].height = 36

    # Sub-header with dataset stats.
    ws["A2"] = f"Total Blogs: {len(df)} | Months Analyzed: {len(month_cols)}"
    ws["A2"].font = Font(italic=True, name="Arial", size=10, color="808080")
    ws["A2"].alignment = CENTER
    ws.merge_cells("A2:E2")

    # Header row for the tier-count table (row 3).
    for j, h in enumerate(["Category", "Count", "Action Required", "Description"], 1):
        c = ws.cell(row=3, column=j, value=h)
        c.font = HDR_FNT; c.fill = FILLS["hdr_navy"]; c.alignment = CENTER; c.border = BORDER
    ws.row_dimensions[3].height = 28

    # Imported here (not at module top) to avoid a circular import with
    # utils.data_processor — NOTE(review): presumed reason; confirm.
    from utils.data_processor import get_tier_summary
    counts = get_tier_summary(df)

    # (category label, count, action, description, fill key) per summary row.
    rows = [
        ("🔴 TAKE DOWN – Zero Clicks", counts["TAKEDOWN_ZERO"], "Remove", "No traffic at all. Immediate removal recommended.", "red"),
        ("🟠 TAKE DOWN – 1–5 Clicks", counts["TAKEDOWN_LOW"], "Remove / Merge", "Negligible traffic with no recovery signal.", "orange"),
        ("🔵 MERGE – AI Detected Pairs", len(merge_pairs), "Merge + 301 Redirect", "Consolidate into stronger related articles.", "blue"),
        ("🟡 MONITOR – 6–20 Clicks", counts["MONITOR"], "Optimize & Monitor", "Underperforming. Optimize and review in 90 days.", "yellow"),
        ("✅ PERFORMING – 21–100 Clicks", counts["PERFORMING"], "Maintain & Optimize", "Acceptable performance. Strengthen meta and links.", "green"),
        ("💪 STRONG – 101–500 Clicks", counts["STRONG"], "Strengthen", "Good performance. Freshen content and build backlinks.", "dkgreen"),
        ("🏆 TOP PERFORMERS – 500+ Clicks", counts["TOP"], "Priority Investment", "Star content. CTAs, lead magnets, backlink outreach.", "gold2"),
    ]

    # Tier-count rows occupy rows 4-10.
    for i, (cat, cnt, act, desc, fk) in enumerate(rows):
        r = i + 4
        for j, val in enumerate([cat, cnt, act, desc], 1):
            c = ws.cell(row=r, column=j, value=val)
            c.font = NORMAL; c.fill = FILLS[fk]; c.alignment = LEFT; c.border = BORDER
        ws.row_dimensions[r].height = 22

    # Top-10 section header (row 12).
    ws["A12"] = "Top 10 Performing Blogs"
    ws["A12"].font = Font(bold=True, name="Arial", size=13, color="1F4E79")
    ws.merge_cells("A12:E12")
    ws.row_dimensions[12].height = 26

    # Top-10 table header (row 13).
    for j, h in enumerate(["#", "Blog URL", "Title", "Total Clicks", "Trend"], 1):
        c = ws.cell(row=13, column=j, value=h)
        c.font = HDR_FNT; c.fill = FILLS["hdr_grn"]; c.alignment = CENTER; c.border = BORDER
    ws.row_dimensions[13].height = 28

    # Top-10 data rows start at row 14.
    top10 = df.nlargest(10, "total_clicks")
    for i, (_, row) in enumerate(top10.iterrows()):
        r = 14 + i
        for j, val in enumerate([i + 1, row["url"], row["title"], row["total_clicks"], row["trend"]], 1):
            c = ws.cell(row=r, column=j, value=val)
            c.font = NORMAL; c.fill = FILLS["dkgreen"]; c.alignment = LEFT; c.border = BORDER

    _set_widths(ws, {"A": 42, "B": 12, "C": 22, "D": 65, "E": 16})
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _build_tier_tab(ws, df, month_cols, tiers, fill_map, hdr_key, severity_map=None):
    """List every blog whose tier is in `tiers`, ascending by total clicks.

    When severity_map is given, a Severity column (derived from the row's
    tier) is inserted before the Recommended Action column.
    """
    severity_hdr = ["Severity"] if severity_map else []
    _add_header(
        ws,
        ["#", "Blog URL", "Title"] + month_cols + ["Total Clicks", "Trend"] + severity_hdr + ["Recommended Action"],
        hdr_key,
    )

    listing = df[df["tier"].isin(tiers)].sort_values("total_clicks")
    for i, (_, row) in enumerate(listing.iterrows()):
        values = [i + 1, row["url"], row["title"]]
        values += [row[m] for m in month_cols]
        values += [row["total_clicks"], row["trend"]]
        if severity_map:
            values.append(severity_map[row["tier"]])
        values.append(row["action"])
        ws.append(values)
        _style_row(ws, i + 2, fill_map.get(row["tier"], "orange"))

    # Column widths: fixed #/URL/Title, 9 per month column, then the tail.
    n = len(month_cols)
    widths = {"A": 5, "B": 55, "C": 50}
    widths.update(_month_col_widths(month_cols))
    widths[get_column_letter(4 + n)] = 12   # Total Clicks
    widths[get_column_letter(5 + n)] = 14   # Trend
    if severity_map:
        widths[get_column_letter(6 + n)] = 22   # Severity
        widths[get_column_letter(7 + n)] = 60   # Recommended Action
    else:
        widths[get_column_letter(6 + n)] = 60   # Recommended Action
    _set_widths(ws, widths)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _build_merge_tab(ws, merge_pairs):
    """One row per AI-suggested merge pair, with blue/purple banding."""
    _add_header(ws, ["#", "Weak Blog URL (Merge FROM)", "Weak Title", "Weak Clicks",
                     "Strong Blog URL (Merge INTO)", "Strong Title", "Strong Clicks",
                     "Similarity", "Topic Cluster", "AI Merge Reason"], "hdr_navy")

    # (dict key, default) per output column, in sheet order after "#".
    fields = [("weak_url", ""), ("weak_title", ""), ("weak_clicks", 0),
              ("strong_url", ""), ("strong_title", ""), ("strong_clicks", 0),
              ("similarity", ""), ("topic_cluster", ""), ("merge_reason", "")]
    for i, pair in enumerate(merge_pairs):
        ws.append([i + 1] + [pair.get(key, default) for key, default in fields])
        _style_row(ws, i + 2, "blue" if i % 2 == 0 else "purple")

    _set_widths(ws, {"A": 5, "B": 52, "C": 42, "D": 10,
                     "E": 52, "F": 42, "G": 10, "H": 11,
                     "I": 22, "J": 65})
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def _build_simple_tier(ws, df, month_cols, tier_key, fill_a, fill_b, hdr_key):
    """One row per blog in `tier_key`, highest clicks first, banded rows."""
    _add_header(
        ws,
        ["#", "Blog URL", "Title"] + month_cols + ["Total Clicks", "Trend", "Recommended Action"],
        hdr_key,
    )

    ranked = df[df["tier"] == tier_key].sort_values("total_clicks", ascending=False)
    for i, (_, row) in enumerate(ranked.iterrows()):
        line = [i + 1, row["url"], row["title"]]
        line += [row[m] for m in month_cols]
        line += [row["total_clicks"], row["trend"], row["action"]]
        ws.append(line)
        _style_row(ws, i + 2, fill_a, fill_b, i)

    # Column widths: fixed #/URL/Title, 9 per month column, then the tail.
    n = len(month_cols)
    widths = {"A": 5, "B": 55, "C": 50}
    widths.update(_month_col_widths(month_cols))
    widths[get_column_letter(4 + n)] = 12   # Total Clicks
    widths[get_column_letter(5 + n)] = 14   # Trend
    widths[get_column_letter(6 + n)] = 60   # Recommended Action
    _set_widths(ws, widths)
|