Spaces:

edstellar
/

blog-audit

Sleeping

File size: 6,332 Bytes

6d28094

import pandas as pd
import numpy as np

# Fixed non-click columns
NON_CLICK_COLS = {"date", "month", "year", "blogs", "h1", "meta title",
                  "meta description", "total clicks", "total_clicks"}

TIER_CONFIG = {
    "takedown_zero_max":    0,
    "takedown_low_max":     5,
    "takedown_low_monthly": 2,
    "monitor_min":          6,
    "monitor_max":          20,
    "performing_min":       21,
    "performing_max":       100,
    "strong_min":           101,
    "strong_max":           500,
    "top_min":              501,
}


def detect_columns(df: pd.DataFrame) -> dict:
    """
    Auto-detect URL, title, month, and total_clicks columns
    from any CSV with the expected structure.
    Returns a dict with keys: url_col, title_col, month_cols, total_col
    """
    cols_lower = {c.lower(): c for c in df.columns}

    # URL column — 'Blogs' or any col containing http values
    url_col = cols_lower.get("blogs") or cols_lower.get("url") or cols_lower.get("urls")
    if not url_col:
        for col in df.columns:
            sample = df[col].dropna().astype(str).head(5)
            if sample.str.startswith("http").any():
                url_col = col
                break

    # Title column — 'H1' or 'Title'
    title_col = cols_lower.get("h1") or cols_lower.get("title") or cols_lower.get("meta title")

    # Total clicks column
    total_col = cols_lower.get("total clicks") or cols_lower.get("total_clicks")

    # Month columns — numeric columns not in the known set
    month_cols = []
    for col in df.columns:
        if col.lower() in NON_CLICK_COLS:
            continue
        if pd.api.types.is_numeric_dtype(df[col]) or _is_mostly_numeric(df[col]):
            month_cols.append(col)

    # Remove total_col from month_cols if accidentally included
    if total_col and total_col in month_cols:
        month_cols.remove(total_col)

    return {
        "url_col":    url_col,
        "title_col":  title_col,
        "month_cols": month_cols,
        "total_col":  total_col,
    }


def _is_mostly_numeric(series: pd.Series, threshold=0.7) -> bool:
    converted = pd.to_numeric(series, errors="coerce")
    valid = converted.notna().sum()
    return valid / max(len(series), 1) >= threshold


def clean_and_tier(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
    """
    Build a clean analysis dataframe with tier, trend, and slug columns.
    """
    url_col   = col_map["url_col"]
    title_col = col_map["title_col"]
    month_cols = col_map["month_cols"]
    total_col  = col_map["total_col"]

    out = pd.DataFrame()
    out["url"]   = df[url_col].astype(str).str.strip()
    out["title"] = df[title_col].astype(str).str.strip() if title_col else out["url"]
    out["slug"]  = out["url"].apply(
        lambda x: x.split("/blog/")[-1] if "/blog/" in x else x.rstrip("/").split("/")[-1]
    )

    for col in month_cols:
        out[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    if total_col:
        out["total_clicks"] = pd.to_numeric(df[total_col], errors="coerce").fillna(0).astype(int)
    else:
        out["total_clicks"] = out[month_cols].sum(axis=1)

    # Trend
    out["trend"] = out.apply(lambda r: _get_trend(r, month_cols), axis=1)

    # Tier
    out["tier"] = out.apply(lambda r: _assign_tier(r, month_cols), axis=1)

    # Recommended action
    out["action"] = out.apply(lambda r: _get_action(r["tier"], r["total_clicks"]), axis=1)

    return out


def _get_trend(row, month_cols):
    if len(month_cols) < 2:
        return "➡️ Stable"
    vals = [row[c] for c in month_cols]
    if vals[-1] > vals[0]:
        return "📈 Growing"
    elif vals[-1] < vals[0]:
        return "📉 Declining"
    return "➡️ Stable"


def _assign_tier(row, month_cols):
    c = row["total_clicks"]
    max_m = max([row[m] for m in month_cols]) if month_cols else 0
    if c == 0:
        return "TAKEDOWN_ZERO"
    elif c <= TIER_CONFIG["takedown_low_max"] and max_m <= TIER_CONFIG["takedown_low_monthly"]:
        return "TAKEDOWN_LOW"
    elif TIER_CONFIG["monitor_min"] <= c <= TIER_CONFIG["monitor_max"]:
        return "MONITOR"
    elif TIER_CONFIG["performing_min"] <= c <= TIER_CONFIG["performing_max"]:
        return "PERFORMING"
    elif TIER_CONFIG["strong_min"] <= c <= TIER_CONFIG["strong_max"]:
        return "STRONG"
    elif c >= TIER_CONFIG["top_min"]:
        return "TOP"
    return "PERFORMING"


def _get_action(tier, clicks):
    actions = {
        "TAKEDOWN_ZERO": "Remove immediately. Zero organic traction across all months. Set up 410 or 301 redirect.",
        "TAKEDOWN_LOW":  "Merge into a stronger related article or remove. Implement 301 redirect.",
        "MONITOR":       "Optimize meta title, description & keywords. Review in 90 days. Merge if no improvement.",
        "PERFORMING":    "Refresh content, strengthen internal links, add FAQ schema. Push for top 50 clicks.",
        "STRONG":        "Update statistics & examples. Add lead gen CTA. Build backlinks to reach 500+ clicks.",
        "TOP":           "Priority asset. Add lead magnets, improve CTAs, build backlinks. Protect rankings.",
    }
    if tier == "MONITOR" and clicks >= 15:
        return "Good potential. Optimize meta description and add 2–3 internal links to improve CTR."
    return actions.get(tier, "Review manually.")


def get_tier_summary(df: pd.DataFrame) -> dict:
    counts = df["tier"].value_counts().to_dict()
    return {
        "TAKEDOWN_ZERO": counts.get("TAKEDOWN_ZERO", 0),
        "TAKEDOWN_LOW":  counts.get("TAKEDOWN_LOW", 0),
        "MONITOR":       counts.get("MONITOR", 0),
        "PERFORMING":    counts.get("PERFORMING", 0),
        "STRONG":        counts.get("STRONG", 0),
        "TOP":           counts.get("TOP", 0),
    }


TIER_META = {
    "TAKEDOWN_ZERO": {"label": "🔴 Take Down (0 clicks)",       "color": "#FF4B4B"},
    "TAKEDOWN_LOW":  {"label": "🟠 Take Down (1–5 clicks)",      "color": "#FF8C00"},
    "MONITOR":       {"label": "🟡 Monitor (6–20 clicks)",       "color": "#FFC300"},
    "PERFORMING":    {"label": "✅ Performing (21–100 clicks)",  "color": "#2ECC71"},
    "STRONG":        {"label": "💪 Strong (101–500 clicks)",     "color": "#1ABC9C"},
    "TOP":           {"label": "🏆 Top Performers (500+ clicks)","color": "#9B59B6"},
}