Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- README.md +29 -0
- app.py +633 -0
- audit_engine.py +312 -0
- db.py +161 -0
- report_generator.py +268 -0
- requirements.txt +7 -0
README.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Link Audit Tool
|
| 3 |
+
emoji: 🔗
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.12.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 🔗 Bulk Link Audit Tool
|
| 13 |
+
|
| 14 |
+
SEO link audit tool that crawls pages, extracts body-content links, checks status, detects broken links, follow flags, duplicates, and orphan pages.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
- **Batch processing** with auto-save to Supabase after each batch
|
| 18 |
+
- **Proper pause/resume** — pause mid-audit, come back later, and resume from where you left off
|
| 19 |
+
- **Interactive HTML report** with accordion cells, filters, and issue highlighting
|
| 20 |
+
- **Orphan page detection** after full crawl
|
| 21 |
+
- **Follow flag detection** — internal nofollow ⚠, external dofollow ⚠
|
| 22 |
+
- **Duplicate link detection** with body locations
|
| 23 |
+
|
| 24 |
+
## Setup
|
| 25 |
+
Add these secrets in your Space settings:
|
| 26 |
+
- `SUPABASE_URL` — your Supabase project URL
|
| 27 |
+
- `SUPABASE_KEY` — your Supabase anon key
|
| 28 |
+
|
| 29 |
+
Then run the SQL schema from `db.py` in your Supabase SQL editor.
|
app.py
ADDED
|
@@ -0,0 +1,633 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Link Audit Tool — Gradio UI for Hugging Face Spaces
|
| 3 |
+
Proper pause/resume via threading + Supabase persistence.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import time
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import threading
|
| 12 |
+
import tempfile
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from audit_engine import audit_page, DEFAULT_BODY_SELECTORS, DEFAULT_SUGGESTION_MAP
|
| 15 |
+
from report_generator import generate_report
|
| 16 |
+
from db import (
|
| 17 |
+
get_client, create_run, get_all_runs, get_all_page_results,
|
| 18 |
+
get_completed_urls, get_pending_urls, get_completed_count,
|
| 19 |
+
save_batch_results, update_run_status, delete_run,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# βββ Supabase Connection βββ
|
| 23 |
+
# Credentials come from the environment (Space secrets); empty strings mean
# "not configured".
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")

# `sb` is the shared Supabase client, or None when the app runs disconnected.
# Every handler below checks `sb is None` before touching the database.
sb = None
if SUPABASE_URL and SUPABASE_KEY:
    try:
        sb = get_client(SUPABASE_URL, SUPABASE_KEY)
        # Probe query: fails fast here if the credentials or the
        # `audit_runs` table are not usable.
        sb.table("audit_runs").select("id").limit(1).execute()
    except Exception as e:
        # Top-level boundary: report the problem and fall back to the
        # disconnected mode instead of crashing at import time.
        print(f"Supabase connection failed: {e}")
        sb = None
|
| 33 |
+
|
| 34 |
+
# βββ Audit State (thread-safe) βββ
|
| 35 |
+
class AuditState:
    """Lock-guarded flags shared between the audit generator and the UI callbacks.

    The audit loop polls ``is_paused()`` between pages; the pause button
    calls ``request_pause()``. All reads and writes of the flags go through
    ``self.lock``.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.paused = False   # True once a pause has been requested
        self.running = False  # True while an audit generator is active
        self.run_id = None    # id of the most recently started run

    def request_pause(self):
        """Ask the running audit to stop at its next pause check."""
        with self.lock:
            self.paused = True

    def resume(self):
        """Clear the pause flag (called when an audit starts or resumes)."""
        with self.lock:
            self.paused = False

    def is_paused(self):
        """Return True if a pause has been requested and not yet cleared."""
        with self.lock:
            return self.paused

    def set_running(self, val, run_id=None):
        """Set the running flag; remember ``run_id`` when a truthy one is given.

        ``set_running(False)`` leaves the previously stored ``run_id`` in place.
        """
        with self.lock:
            self.running = val
            if run_id:
                self.run_id = run_id

    def is_running(self):
        """Return True while an audit is marked as running."""
        with self.lock:
            return self.running


# Single module-wide instance used by every handler in this file.
audit_state = AuditState()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
# CORE AUDIT FUNCTION (runs as generator for streaming)
|
| 69 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
|
| 71 |
+
# NOTE(review): emoji and dashes in the user-facing strings below were
# reconstructed from mis-decoded text in the captured source; confirm the
# exact glyphs against the original file.

_LOG_TAIL = 40  # number of most recent log lines shown in the log textbox


def _ui(log_lines, status, pause_enabled, start_enabled):
    """Build the 4-tuple the audit generators yield.

    Order matches the `outputs=[log, status, pause_button, start_button]`
    wiring used for both the Run and the Resume buttons.
    """
    return (
        "\n".join(log_lines[-_LOG_TAIL:]),
        status,
        gr.update(interactive=pause_enabled),
        gr.update(interactive=start_enabled),
    )


def _format_eta(seconds):
    """Format a duration as ``"Xm Ys"`` above one minute, otherwise ``"Ns"``."""
    if seconds > 60:
        return f"{int(seconds // 60)}m {int(seconds % 60)}s"
    return f"{seconds:.0f}s"


def _normalize_url(url):
    """Strip fragment, query string and trailing slash so URLs compare equal.

    The query is removed *before* the trailing slash; doing it the other way
    round leaves ``/page/?x=1`` as ``/page/`` and breaks orphan matching.
    """
    return url.split('#', 1)[0].split('?', 1)[0].rstrip('/')


def _read_urls_from_file(path, domain):
    """Read a CSV/Excel sheet and return the http(s) URLs of its URL column.

    The URL column is the first one whose first value starts with ``http``
    or contains ``domain``; if none matches, the first column is used.
    """
    path = str(path)
    df = pd.read_csv(path) if path.lower().endswith('.csv') else pd.read_excel(path)
    if df.empty:
        return []

    domain_key = (domain or "").strip().lower()
    url_col = df.columns[0]
    for col in df.columns:
        sample = str(df[col].iloc[0]).strip().lower()
        # An empty domain would match every sample, so only test it when set.
        if sample.startswith('http') or (domain_key and domain_key in sample):
            url_col = col
            break

    values = df[url_col].dropna().astype(str).str.strip().tolist()
    return [u for u in values if u.startswith('http')]


def _collect_urls(file, pasted_urls, domain):
    """Return de-duplicated URLs (order preserved) from the file or pasted text.

    An uploaded file takes precedence; pasted text is only used without one.
    """
    if file is not None:
        # Accept both a plain path string and an object exposing `.name`.
        urls = _read_urls_from_file(getattr(file, "name", file), domain)
    elif pasted_urls and pasted_urls.strip():
        urls = [
            line.strip()
            for line in pasted_urls.splitlines()
            if line.strip().startswith('http')
        ]
    else:
        urls = []
    return list(dict.fromkeys(urls))


def _describe_result(result, url, index, total):
    """Return the one-line log entry for a single audited page."""
    short = url.replace('https://www.', '').replace('https://', '')[:70]
    if result['error']:
        return f"❌ [{index}/{total}] {short} — {str(result['error'])[:50]}"

    broken = result['broken_int_count'] + result['broken_ext_count']
    flagged = result['follow_flag_count']
    dups = result['duplicate_count']
    flags = []
    if broken:
        flags.append(f"🔴 {broken} broken")
    if flagged:
        flags.append(f"🟡 {flagged} flags")
    if dups:
        flags.append(f"🟣 {dups} dups")
    flag_str = " · ".join(flags) if flags else "✅"
    return (
        f"[{index}/{total}] {short} — "
        f"Int:{result['int_count']} Ext:{result['ext_count']} · {flag_str}"
    )


def _summarize_results(all_results):
    """Aggregate per-page results into the run summary, including orphans.

    A page is an orphan when no *other* audited page links to it; a page's
    link to itself does not count as an inbound link.
    """
    page_urls = set()
    link_targets = set()
    for r in all_results:
        page = _normalize_url(r['url'])
        page_urls.add(page)
        for link in r.get('internal_links') or []:
            target = _normalize_url(link['url'])
            if target != page:
                link_targets.add(target)
    orphans = sorted(page_urls - link_targets)

    def total(*keys):
        return sum(r.get(key, 0) for r in all_results for key in keys)

    return {
        'total_pages': len(all_results),
        'total_int': total('int_count'),
        'total_ext': total('ext_count'),
        'total_broken': total('broken_int_count', 'broken_ext_count'),
        'total_redirects': total('redirect_int_count', 'redirect_ext_count'),
        'total_flags': total('follow_flag_count'),
        'total_dups': total('duplicate_count'),
        'total_sug': sum(len(r.get('suggestions') or []) for r in all_results),
        'orphan_count': len(orphans),
        'orphan_urls': orphans[:100],  # only the first 100 are stored
    }


def _run_pipeline(run_id, urls, domain, batch_size, timeout, delay, workers,
                  total, already_done, log_lines, opening_status):
    """Crawl ``urls`` in batches, then finalize the run; yields UI updates.

    Shared by `run_audit` and `resume_audit`.

    Args:
        run_id: id of the run the results are saved under.
        urls: URLs still to audit (may be empty, in which case only the
            final orphan analysis runs).
        total: total number of URLs in the run, used for progress display.
        already_done: pages completed before this call (0 for a new run).
        log_lines: list of log lines; appended to in place.
        opening_status: status text shown with the first update.
    """
    batch_size = max(1, int(batch_size))  # sliders may deliver floats
    workers = max(1, int(workers))

    audit_state.set_running(True, run_id)
    audit_state.resume()  # clear any stale pause request
    started_at = time.time()

    try:
        yield _ui(log_lines, opening_status, True, False)

        batch_starts = range(0, len(urls), batch_size)
        for batch_num, batch_start in enumerate(batch_starts, start=1):
            batch_urls = urls[batch_start:batch_start + batch_size]
            batch_results = []

            for j, url in enumerate(batch_urls):
                # Pause is honoured between pages; anything audited in the
                # current batch is saved before stopping.
                if audit_state.is_paused():
                    if batch_results:
                        save_batch_results(sb, run_id, batch_results)
                    completed = get_completed_count(sb, run_id)
                    update_run_status(sb, run_id, "paused", completed)
                    log_lines.append(
                        f"⏸️ PAUSED at {completed}/{total} — resume from Past Runs"
                    )
                    yield _ui(log_lines, f"⏸️ Paused at {completed}/{total}", False, True)
                    return

                result = audit_page(
                    url, domain, DEFAULT_BODY_SELECTORS,
                    suggestion_map=DEFAULT_SUGGESTION_MAP,
                    timeout=timeout, concurrent_workers=workers,
                )
                batch_results.append(result)

                processed = batch_start + j + 1
                global_idx = already_done + processed
                log_lines.append(_describe_result(result, url, global_idx, total))

                # ETA is based on pages finished in this session only.
                avg = (time.time() - started_at) / processed
                eta_str = _format_eta(avg * (len(urls) - processed))
                status = (
                    f"📊 Progress: {global_idx}/{total} ({global_idx * 100 // total}%)"
                    f" · Batch {batch_num} · ETA: {eta_str}"
                )
                yield _ui(log_lines, status, True, False)

                if j < len(batch_urls) - 1:
                    time.sleep(delay)

            reached = already_done + batch_start + len(batch_urls)
            try:
                save_batch_results(sb, run_id, batch_results)
                completed = get_completed_count(sb, run_id)
                update_run_status(sb, run_id, "running", completed)
                log_lines.append(f"💾 Batch {batch_num} saved — {completed}/{total} done")
                status = f"📊 Progress: {reached}/{total} · Saved batch {batch_num}"
            except Exception as exc:
                # Best effort: a failed save is logged and the crawl goes on.
                log_lines.append(f"⚠️ Batch save error: {str(exc)[:60]}")
                status = f"📊 Progress: {reached}/{total} · Batch {batch_num} NOT saved"
            yield _ui(log_lines, status, True, False)

        # All pages crawled: orphan analysis over everything stored for the run.
        log_lines.append("🔍 Running orphan page analysis...")
        yield _ui(log_lines, "🔍 Orphan analysis...", False, False)

        all_results = [p['result'] for p in get_all_page_results(sb, run_id)]
        summary = _summarize_results(all_results)
        update_run_status(sb, run_id, "completed", len(all_results), summary)

        elapsed = time.time() - started_at
        log_lines.append(
            f"✅ COMPLETE! {len(all_results)} pages in {elapsed:.0f}s"
            f" · {summary['orphan_count']} orphans"
        )
        log_lines.append(
            f"📊 Broken: {summary['total_broken']} · Redirects: {summary['total_redirects']}"
            f" · Flags: {summary['total_flags']} · Dups: {summary['total_dups']}"
        )
        log_lines.append("→ Go to Past Runs tab to generate report")
        yield _ui(log_lines, f"✅ Complete — {len(all_results)} pages", False, True)

    except Exception as exc:
        log_lines.append(f"❌ Error: {exc}")
        # Marking the run paused can itself fail (e.g. the database is the
        # thing that broke); never let that swallow the final UI update.
        try:
            completed = get_completed_count(sb, run_id)
            update_run_status(sb, run_id, "paused", completed)
            status = "❌ Error — saved progress to Supabase"
        except Exception as status_exc:
            log_lines.append(f"⚠️ Could not update run status: {str(status_exc)[:60]}")
            status = "❌ Error — run status could not be updated"
        yield _ui(log_lines, status, False, True)
    finally:
        # Also runs when the generator is closed early, so the running
        # flag can never get stuck on True.
        audit_state.set_running(False)


def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
    """Start a new audit; generator yielding (log, status, pause_btn, run_btn).

    URLs come from the uploaded CSV/Excel file if given, otherwise from the
    pasted text. A run record is created, then pages are audited in batches
    with results saved after every batch.
    """
    if sb is None:
        yield "❌ Supabase not connected. Set SUPABASE_URL and SUPABASE_KEY in Space secrets.", "", gr.update(), gr.update()
        return

    try:
        urls = _collect_urls(file, pasted_urls, domain)
    except Exception as e:
        yield f"❌ File error: {e}", "", gr.update(), gr.update()
        return

    if not urls:
        yield "❌ No valid URLs found. Upload a file or paste URLs.", "", gr.update(), gr.update()
        return

    total = len(urls)
    run_name = f"{domain} Audit — {datetime.now().strftime('%b %d %H:%M')} — {total} pages"
    try:
        run_id = create_run(sb, run_name, domain, total, urls)
    except Exception as e:
        yield f"❌ Could not create run: {e}", "", gr.update(), gr.update()
        return

    log_lines = [f"🚀 Started: {run_name}", f"📦 {total} URLs · Batch size: {batch_size}"]
    yield from _run_pipeline(
        run_id, urls, domain, batch_size, timeout, delay, workers,
        total=total, already_done=0, log_lines=log_lines, opening_status="",
    )


def pause_audit():
    """Signal the audit loop to pause; returns the status message to display."""
    if audit_state.is_running():
        audit_state.request_pause()
        return "⏸️ Pause requested — will stop after current page completes..."
    return "No audit running."


# ───────────────────────────────────────────────────
# RESUME FUNCTION
# ───────────────────────────────────────────────────

def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
    """Resume a paused/interrupted run; yields the same 4-tuple as `run_audit`.

    Pages already stored for the run are skipped. When nothing is left to
    crawl, the run is still finalized so its summary is (re)computed.
    """
    if sb is None:
        yield "❌ Supabase not connected.", "", gr.update(), gr.update()
        return

    if not run_id:
        yield "❌ No run selected.", "", gr.update(), gr.update()
        return

    # NOTE(review): despite its name, get_pending_urls is used here as the
    # full URL list of the run — confirm against db.py.
    all_urls_for_run = get_pending_urls(sb, run_id)
    done_urls = get_completed_urls(sb, run_id)
    done_lookup = set(done_urls)  # O(1) membership; assumes hashable URL strings
    remaining = [u for u in all_urls_for_run if u not in done_lookup]

    update_run_status(sb, run_id, "running")

    total = len(all_urls_for_run)
    if remaining:
        log_lines = [
            f"▶️ Resuming — {len(remaining)} pages remaining ({len(done_urls)} already done)"
        ]
    else:
        log_lines = ["✅ All pages already audited — finalizing run"]

    yield from _run_pipeline(
        run_id, remaining, domain, batch_size, timeout, delay, workers,
        total=total, already_done=len(done_urls), log_lines=log_lines,
        opening_status=f"📊 Resuming: {len(done_urls)}/{total}",
    )
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 391 |
+
# PAST RUNS HELPERS
|
| 392 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 393 |
+
|
| 394 |
+
def load_past_runs():
    """Fetch saved runs and return ``(html_table, dropdown_update)``.

    The HTML is an overview table of all runs; the dropdown update carries
    ``(label, run_id)`` choices with the first run pre-selected.

    NOTE(review): the "❌" and "—" glyphs were reconstructed from mis-decoded
    text in the captured source; confirm against the original file.
    """
    # Imported locally so this function does not depend on a file-level import.
    from html import escape

    if sb is None:
        return "<p>❌ Supabase not connected</p>", gr.update(choices=[], value=None)

    runs = get_all_runs(sb)
    if not runs:
        return "<p>No saved runs yet.</p>", gr.update(choices=[], value=None)

    status_colors = {'completed': '#059669', 'paused': '#d97706', 'running': '#2563eb'}
    # (summary key, text colour) for the four metric columns, in display order.
    metric_columns = [
        ('total_broken', '#dc2626'),
        ('total_flags', '#dc2626'),
        ('total_dups', '#db2777'),
        ('orphan_count', '#dc2626'),
    ]

    header_cells = ['<th style="padding:10px;text-align:left;">Run Name</th>']
    header_cells += [
        f'<th style="padding:10px;text-align:center;">{title}</th>'
        for title in ('Status', 'Pages', 'Broken', 'Flags', 'Dups', 'Orphans')
    ]

    choices = []
    parts = [
        '<div style="max-height:400px;overflow-y:auto;">',
        '<table style="width:100%;border-collapse:collapse;font-size:13px;">',
        '<tr style="background:#f1f5f9;">' + ''.join(header_cells) + '</tr>',
    ]

    for r in runs:
        # Stored fields may be missing or null; fall back to safe defaults.
        name = str(r.get('name') or 'Untitled')
        status = str(r.get('status') or 'unknown')
        completed = r.get('completed_urls', 0)
        total = r.get('total_urls', 0)
        summary = r.get('summary') or {}
        created = str(r.get('created_at') or '')[:16].replace('T', ' ')
        s_color = status_colors.get(status, '#888')

        choices.append((f"{name} [{status.upper()}] ({completed}/{total})", r['id']))

        # Every stored value is escaped: run names embed the user-typed domain.
        cells = [
            f'<td style="padding:10px;"><b>{escape(name)}</b><br>'
            f'<span style="font-size:11px;color:#94a3b8;">{escape(created)}</span></td>',
            f'<td style="padding:10px;text-align:center;">'
            f'<span style="background:{s_color}15;color:{s_color};padding:3px 10px;'
            f'border-radius:12px;font-size:11px;font-weight:700;">'
            f'{escape(status.upper())}</span></td>',
            f'<td style="padding:10px;text-align:center;font-weight:700;">'
            f'{escape(str(completed))}/{escape(str(total))}</td>',
        ]
        cells += [
            f'<td style="padding:10px;text-align:center;color:{color};font-weight:700;">'
            f'{escape(str(summary.get(key, "—")))}</td>'
            for key, color in metric_columns
        ]
        parts.append(
            '<tr style="border-bottom:1px solid #e2e8f0;">' + ''.join(cells) + '</tr>'
        )

    parts.append('</table></div>')
    return ''.join(parts), gr.update(choices=choices, value=choices[0][1])
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
def generate_report_for_run(run_id, domain):
    """Build the HTML report for a run and return ``(file_path, message)``.

    ``file_path`` is a temporary ``.html`` file, or None on failure.
    ``domain`` is only used when the stored run has no domain of its own.

    NOTE(review): the "❌", "✅" and "—" glyphs were reconstructed from
    mis-decoded text in the captured source; confirm against the original.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected or Supabase not connected."

    try:
        # Check for page data first; without it there is nothing to report.
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "❌ No page data found for this run."

        run = next((r for r in get_all_runs(sb) if r['id'] == run_id), None)
        summary = (run.get('summary') or {}) if run else {}
        orphan_urls = summary.get('orphan_urls', [])
        # A null/empty stored domain falls back to the UI value.
        report_domain = (run.get('domain') if run else None) or domain

        results = [p['result'] for p in pages]
        report_html = generate_report(results, orphan_urls, report_domain)

        # delete=False: the file must outlive this call so it can be downloaded.
        # The context manager closes the handle even if the write fails.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix='.html', prefix='Link_Audit_'
        ) as tmp:
            tmp.write(report_html.encode('utf-8'))

        return tmp.name, f"✅ Report generated — {len(results)} pages"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def generate_csv_for_run(run_id):
    """Export one row per audited page to a CSV; return ``(file_path, message)``.

    ``file_path`` is a temporary ``.csv`` file, or None on failure.

    NOTE(review): the "❌", "✅" and "—" glyphs were reconstructed from
    mis-decoded text in the captured source; confirm against the original.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected."

    try:
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "❌ No data."

        rows = []
        for p in pages:
            r = p['result']
            rows.append({
                'URL': r.get('url', ''),
                'Internal': r.get('int_count', 0),
                'External': r.get('ext_count', 0),
                'Broken': r.get('broken_int_count', 0) + r.get('broken_ext_count', 0),
                'Redirects': r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0),
                'Flags': r.get('follow_flag_count', 0),
                'Duplicates': r.get('duplicate_count', 0),
                'Error': r.get('error', ''),
            })

        # Reserve a unique path and close the handle *before* pandas writes
        # to it, so no second open handle is held on the same file.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix='.csv', prefix='Audit_CSV_'
        ) as tmp:
            csv_path = tmp.name
        pd.DataFrame(rows).to_csv(csv_path, index=False)

        return csv_path, f"✅ CSV exported — {len(rows)} rows"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def delete_selected_run(run_id):
    """Delete the selected run and return a status message for the UI.

    NOTE(review): the emoji were reconstructed from mis-decoded text in the
    captured source; confirm against the original file.
    """
    if sb is None or not run_id:
        return "❌ No run selected."
    try:
        delete_run(sb, run_id)
        return "🗑️ Run deleted."
    except Exception as e:
        # UI boundary: show the failure instead of raising into Gradio.
        return f"❌ {str(e)}"
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 511 |
+
# GRADIO UI
|
| 512 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 513 |
+
|
| 514 |
+
css = """
|
| 515 |
+
.main-header { background: linear-gradient(135deg, #1e3a5f, #2563eb); padding: 24px 28px; border-radius: 12px; color: white; margin-bottom: 16px; }
|
| 516 |
+
.main-header h1 { margin: 0 0 4px 0; font-size: 24px; }
|
| 517 |
+
.main-header p { margin: 0; opacity: 0.8; font-size: 13px; }
|
| 518 |
+
.status-bar { background: #f1f5f9; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px 16px; font-family: monospace; font-size: 13px; font-weight: 600; }
|
| 519 |
+
.log-area textarea { font-family: 'JetBrains Mono', monospace !important; font-size: 12px !important; line-height: 1.6 !important; }
|
| 520 |
+
"""
|
| 521 |
+
|
| 522 |
+
with gr.Blocks(css=css, title="π Link Audit Tool", theme=gr.themes.Soft()) as app:
|
| 523 |
+
|
| 524 |
+
# Header
|
| 525 |
+
gr.HTML("""
|
| 526 |
+
<div class="main-header">
|
| 527 |
+
<p style="font-size:10px;font-weight:700;letter-spacing:1.5px;text-transform:uppercase;color:#93c5fd;margin-bottom:8px;">SEO Link Audit Tool</p>
|
| 528 |
+
<h1>π Bulk Link Audit</h1>
|
| 529 |
+
<p>Upload URLs β batch crawl with auto-save β pause/resume anytime β generate interactive report</p>
|
| 530 |
+
</div>
|
| 531 |
+
""")
|
| 532 |
+
|
| 533 |
+
# Connection status
|
| 534 |
+
conn_status = "β
Supabase Connected" if sb else "β Supabase Not Connected β add SUPABASE_URL and SUPABASE_KEY to Space secrets"
|
| 535 |
+
gr.HTML(f'<div class="status-bar">ποΈ {conn_status}</div>')
|
| 536 |
+
|
| 537 |
+
with gr.Tabs():
|
| 538 |
+
|
| 539 |
+
# βββ TAB 1: NEW AUDIT βββ
|
| 540 |
+
with gr.Tab("π New Audit"):
|
| 541 |
+
with gr.Row():
|
| 542 |
+
with gr.Column(scale=2):
|
| 543 |
+
file_input = gr.File(label="Upload Excel / CSV", file_types=[".xlsx", ".csv", ".xls"])
|
| 544 |
+
pasted_urls = gr.Textbox(label="Or paste URLs (one per line)", lines=5, placeholder="https://www.example.com/blog/page1\nhttps://www.example.com/blog/page2")
|
| 545 |
+
|
| 546 |
+
with gr.Column(scale=1):
|
| 547 |
+
domain = gr.Textbox(label="Your Domain", value="edstellar.com")
|
| 548 |
+
batch_size = gr.Slider(5, 50, value=25, step=5, label="Batch Size")
|
| 549 |
+
timeout = gr.Slider(5, 60, value=15, step=5, label="Timeout (s)")
|
| 550 |
+
delay = gr.Slider(0, 5, value=1.0, step=0.5, label="Delay between pages (s)")
|
| 551 |
+
workers = gr.Slider(1, 10, value=5, step=1, label="Parallel link checks")
|
| 552 |
+
|
| 553 |
+
with gr.Row():
|
| 554 |
+
run_btn = gr.Button("π Run Audit", variant="primary", scale=2)
|
| 555 |
+
pause_btn = gr.Button("βΈοΈ Pause", variant="stop", scale=1, interactive=False)
|
| 556 |
+
|
| 557 |
+
progress_text = gr.Textbox(label="Status", interactive=False, elem_classes=["status-bar"])
|
| 558 |
+
log_output = gr.Textbox(label="Audit Log", lines=20, interactive=False, elem_classes=["log-area"])
|
| 559 |
+
|
| 560 |
+
# Wire up run button (generator for streaming)
|
| 561 |
+
run_btn.click(
|
| 562 |
+
fn=run_audit,
|
| 563 |
+
inputs=[file_input, pasted_urls, domain, batch_size, timeout, delay, workers],
|
| 564 |
+
outputs=[log_output, progress_text, pause_btn, run_btn],
|
| 565 |
+
)
|
| 566 |
+
|
| 567 |
+
# Wire up pause button
|
| 568 |
+
pause_btn.click(fn=pause_audit, outputs=[progress_text])
|
| 569 |
+
|
| 570 |
+
# βββ TAB 2: PAST RUNS βββ
|
| 571 |
+
with gr.Tab("π Past Runs"):
|
| 572 |
+
refresh_btn = gr.Button("π Refresh Runs", variant="secondary")
|
| 573 |
+
runs_html = gr.HTML(value="<p>Click Refresh to load runs.</p>")
|
| 574 |
+
run_dropdown = gr.Dropdown(label="Select a Run", choices=[], interactive=True)
|
| 575 |
+
|
| 576 |
+
with gr.Row():
|
| 577 |
+
report_btn = gr.Button("π Generate HTML Report", variant="primary")
|
| 578 |
+
csv_btn = gr.Button("π Export CSV", variant="secondary")
|
| 579 |
+
resume_btn = gr.Button("βΆοΈ Resume Audit", variant="primary")
|
| 580 |
+
delete_btn = gr.Button("ποΈ Delete Run", variant="stop")
|
| 581 |
+
|
| 582 |
+
action_status = gr.Textbox(label="Action Status", interactive=False)
|
| 583 |
+
report_file = gr.File(label="Download Report", interactive=False)
|
| 584 |
+
csv_file = gr.File(label="Download CSV", interactive=False)
|
| 585 |
+
|
| 586 |
+
# Resume log & progress (shared with new audit display format)
|
| 587 |
+
resume_progress = gr.Textbox(label="Resume Status", interactive=False, elem_classes=["status-bar"])
|
| 588 |
+
resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False, elem_classes=["log-area"])
|
| 589 |
+
resume_pause_btn = gr.Button("βΈοΈ Pause Resume", variant="stop", interactive=False)
|
| 590 |
+
|
| 591 |
+
# Refresh
|
| 592 |
+
refresh_btn.click(fn=load_past_runs, outputs=[runs_html, run_dropdown])
|
| 593 |
+
|
| 594 |
+
# Generate report
|
| 595 |
+
def gen_report_wrapper(run_id, domain_val):
    """Build the HTML report for the selected run.

    Returns the ``(file path, status message)`` pair produced by
    ``generate_report_for_run`` so it can feed the report file and
    action-status outputs.
    """
    outcome = generate_report_for_run(run_id, domain_val)
    report_path, status_message = outcome
    return report_path, status_message
|
| 598 |
+
|
| 599 |
+
report_btn.click(
|
| 600 |
+
fn=gen_report_wrapper,
|
| 601 |
+
inputs=[run_dropdown, domain],
|
| 602 |
+
outputs=[report_file, action_status],
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
# CSV
|
| 606 |
+
def csv_wrapper(run_id):
    """Export the selected run as CSV.

    Returns the ``(file path, status message)`` pair produced by
    ``generate_csv_for_run``.
    """
    outcome = generate_csv_for_run(run_id)
    csv_path, status_message = outcome
    return csv_path, status_message
|
| 609 |
+
|
| 610 |
+
csv_btn.click(fn=csv_wrapper, inputs=[run_dropdown], outputs=[csv_file, action_status])
|
| 611 |
+
|
| 612 |
+
# Delete
|
| 613 |
+
def delete_wrapper(run_id):
    """Delete the selected run, then reload the run list.

    Returns ``(status message, runs HTML, dropdown update)``: the delete
    status first, followed by the two values ``load_past_runs`` yields.
    """
    status_message = delete_selected_run(run_id)
    refreshed_html, refreshed_dropdown = load_past_runs()
    return status_message, refreshed_html, refreshed_dropdown
|
| 617 |
+
|
| 618 |
+
delete_btn.click(fn=delete_wrapper, inputs=[run_dropdown], outputs=[action_status, runs_html, run_dropdown])
|
| 619 |
+
|
| 620 |
+
# Resume
|
| 621 |
+
resume_btn.click(
|
| 622 |
+
fn=resume_audit,
|
| 623 |
+
inputs=[run_dropdown, domain, batch_size, timeout, delay, workers],
|
| 624 |
+
outputs=[resume_log, resume_progress, resume_pause_btn, resume_btn],
|
| 625 |
+
)
|
| 626 |
+
resume_pause_btn.click(fn=pause_audit, outputs=[resume_progress])
|
| 627 |
+
|
| 628 |
+
# Auto-load runs on startup
|
| 629 |
+
app.load(fn=load_past_runs, outputs=[runs_html, run_dropdown])
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
if __name__ == "__main__":
|
| 633 |
+
app.queue().launch(server_name="0.0.0.0", server_port=7860)
|
audit_engine.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core Link Audit Engine
|
| 3 |
+
Crawls pages, extracts body-content links, checks status, detects issues.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup, Comment
|
| 8 |
+
from urllib.parse import urljoin, urlparse
|
| 9 |
+
from collections import defaultdict
|
| 10 |
+
import concurrent.futures
|
| 11 |
+
|
| 12 |
+
HEADERS = {
|
| 13 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 14 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
DEFAULT_BODY_SELECTORS = [
|
| 18 |
+
"div.blog-rich-text",
|
| 19 |
+
"div.w-richtext",
|
| 20 |
+
"article .rich-text",
|
| 21 |
+
"article",
|
| 22 |
+
"div.blog-content",
|
| 23 |
+
"div.post-content",
|
| 24 |
+
"main",
|
| 25 |
+
]
|
| 26 |
+
|
| 27 |
+
DEFAULT_SUGGESTION_MAP = {
|
| 28 |
+
"artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
|
| 29 |
+
"machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
|
| 30 |
+
"leadership": ("/type/leadership-training", "leadership training programs"),
|
| 31 |
+
"soft skills": ("/type/behavioral-training", "behavioral training programs"),
|
| 32 |
+
"remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
|
| 33 |
+
"training management": ("/training-management-software", "training management software"),
|
| 34 |
+
"instructor-led": ("/instructor-led-training-services", "instructor-led training"),
|
| 35 |
+
"corporate training": ("/corporate-training-courses", "corporate training programs"),
|
| 36 |
+
"skill matrix": ("/skill-matrix", "skills matrix"),
|
| 37 |
+
"stellar ai": ("/stellar-ai", "AI-powered training"),
|
| 38 |
+
"book a demo": ("/book-a-demo", "book a demo"),
|
| 39 |
+
"compliance": ("/type/compliance-training", "compliance training"),
|
| 40 |
+
"cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
|
| 41 |
+
"data analytics": ("/category/data-analytics-training", "data analytics training"),
|
| 42 |
+
"project management": ("/category/project-management-training", "project management training"),
|
| 43 |
+
"coaching": ("/coaching-solutions", "coaching solutions"),
|
| 44 |
+
"hr training": ("/category/human-resource-training", "HR training programs"),
|
| 45 |
+
"employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
|
| 46 |
+
"onboarding": ("/category/human-resource-training", "onboarding training"),
|
| 47 |
+
"digital transformation": ("/type/it-technical-training", "IT & technical training"),
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def is_internal(href, domain):
    """Return True when *href* belongs to the audited site.

    A link is internal when it has no host part (relative reference) or when
    its host equals *domain* or is a sub-domain of it. Hosts are compared
    label-wise rather than by substring, so ``notexample.com`` and
    ``example.com.evil.io`` are NOT internal to ``example.com``.

    Args:
        href: Link target, relative or absolute.
        domain: Site domain. A leading ``www.``, a scheme, a port or a
            trailing path are tolerated and ignored.

    Returns:
        bool: True for internal links, False otherwise (including empty href).
    """
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        # No host component: the link resolves against the current page.
        return True

    # Reduce *domain* to a bare lowercase hostname.
    site = (domain or "").strip().lower()
    if "://" not in site:
        site = "//" + site  # lets urlparse treat the value as a network location
    site = urlparse(site).hostname or ""
    if site.startswith("www."):
        site = site[4:]
    if not site:
        # Original behaviour: an empty domain matched every host.
        return True

    host = (parsed.hostname or "").lower().rstrip(".")
    return host == site or host.endswith("." + site)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def normalize_url(href, base_url):
    """Resolve *href* against *base_url*, or return None if it is not auditable.

    Non-auditable links are empty values, in-page anchors, and any target
    whose scheme is not HTTP(S) (``mailto:``, ``tel:``, ``javascript:``,
    ``data:``, ``ftp:`` ...). Scheme matching is case-insensitive.

    Args:
        href: Raw ``href`` attribute value.
        base_url: URL of the page the link was found on.

    Returns:
        str | None: Absolute URL, or None when the link should be skipped.
    """
    if not href:
        return None
    href = href.strip()
    if not href:
        return None
    # Cheap prefix check first; schemes in HTML are case-insensitive.
    if href.lower().startswith(('#', 'mailto:', 'tel:', 'javascript:')):
        return None

    absolute = urljoin(base_url, href)
    scheme = urlparse(absolute).scheme
    # An empty scheme can only happen when base_url itself is relative;
    # keep that result as before rather than dropping the link.
    if scheme and scheme not in ('http', 'https'):
        return None
    return absolute
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_follow_status(tag):
    """Classify a link as ``'Nofollow'`` or ``'Dofollow'`` from its ``rel`` value.

    ``rel`` may be a list of tokens or a whitespace-separated string; the
    comparison is case-insensitive.
    """
    rel_tokens = tag.get('rel', [])
    if isinstance(rel_tokens, str):
        rel_tokens = rel_tokens.split()
    lowered = {token.lower() for token in rel_tokens}
    if 'nofollow' in lowered:
        return 'Nofollow'
    return 'Dofollow'
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def find_body_content(soup, selectors):
    """Return the first element matched by *selectors*, else the ``<body>`` tag.

    Selectors are tried in order and evaluation stops at the first truthy
    match; the result of ``soup.find('body')`` is the fallback.
    """
    candidates = (soup.select_one(selector) for selector in selectors)
    match = next((node for node in candidates if node), None)
    if match is None:
        return soup.find('body')
    return match
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_link_location(link_tag, body_el):
    """Describe where *link_tag* sits inside *body_el*.

    The position is the share of body text that precedes the link:
    below 10% is "Intro", above 85% is "Conclusion", anything else is
    "Mid-article (~N%)". When a preceding h1-h4 sibling of the link or of
    one of its ancestors exists, its text (max 60 chars) is appended.

    Args:
        link_tag: The anchor element; expected to be a descendant of body_el.
        body_el: The element that holds the article body.

    Returns:
        str: Human-readable location, or "Unknown" when the body has no text.
    """
    total_len = len(body_el.get_text())
    if total_len == 0:
        return "Unknown"

    # Count the text that appears before this exact node. Identity (`is`) is
    # required: BeautifulSoup's `==` compares markup, so a second identical
    # anchor would otherwise be located at the first occurrence.
    preceding_len = 0
    for node in body_el.descendants:
        if node is link_tag:
            break
        if isinstance(node, str) and not isinstance(node, Comment):
            preceding_len += len(node)
    ratio = preceding_len / total_len

    # Nearest heading: walk outwards and take the closest preceding h1-h4
    # sibling of the first ancestor that has one.
    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if getattr(sib, 'name', None) in ('h1', 'h2', 'h3', 'h4'):
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break

    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"

    if heading:
        # Separator is a middle dot; it showed up mis-decoded in the scraped source.
        return f'{section} · near "{heading}"'
    return section
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
_REDIRECT_STATUS_CODES = (301, 302, 303, 307, 308)


def _redirect_target(response, url):
    """Return the absolute redirect target of *response*, or "" if it is not a redirect."""
    if response.status_code not in _REDIRECT_STATUS_CODES:
        return ""
    location = response.headers.get('Location', '')
    # urljoin leaves absolute targets alone and resolves relative ones.
    return urljoin(url, location) if location else ""


def check_url_status(url, timeout=15):
    """Probe *url* without following redirects and classify the result.

    A HEAD request is tried first; when the server answers 405 (method not
    allowed) the probe is repeated with a streamed GET whose body is never
    read. Relative ``Location`` headers are resolved against *url* on both
    paths.

    Args:
        url: Absolute URL to check.
        timeout: Per-request timeout in seconds.

    Returns:
        tuple: ``(url, status, link_status, redirect_url)`` where ``status``
        is the HTTP status code or one of "Timeout", "ConnError", "Error";
        ``link_status`` is "Active" (2xx), "Redirect" (301/302/303/307/308)
        or "Broken" (anything else); ``redirect_url`` is "" unless redirected.
    """
    try:
        response = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        if response.status_code == 405:
            response.close()
            response = requests.get(url, headers=HEADERS, timeout=timeout,
                                    allow_redirects=False, stream=True)
        try:
            status = response.status_code
            redirect_url = _redirect_target(response, url)
        finally:
            # Releases the connection, incl. the unread streamed GET body.
            response.close()

        if status in _REDIRECT_STATUS_CODES:
            link_status = "Redirect"
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"

        return url, status, link_status, redirect_url

    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        # Deliberate best-effort: any other failure (bad scheme, invalid URL,
        # TLS error ...) is reported as a broken link instead of aborting.
        return url, "Error", "Broken", ""
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    """Propose internal links for keywords found in *body_text*.

    Each entry of *suggestion_map* maps a keyword to ``(target path, anchor)``.
    A target is skipped when the page already links to that path or when it
    is the page itself. Matching is a case-insensitive substring count;
    three or more hits give priority "High", otherwise "Med".

    Returns:
        list[dict]: At most 10 suggestions, High priority first, then by
        descending hit count. Keys: section, target, anchor, priority,
        keyword, count.
    """
    mapping = DEFAULT_SUGGESTION_MAP if suggestion_map is None else suggestion_map

    haystack = body_text.lower()
    haystack_len = len(haystack)
    linked_paths = {urlparse(link).path.rstrip('/') for link in existing_internal_urls}

    found = []
    for keyword, (path, anchor) in mapping.items():
        target_path = path.rstrip('/')
        if target_path in linked_paths:
            continue
        if target_path == urlparse(page_url).path.rstrip('/'):
            continue

        needle = keyword.lower()
        hits = haystack.count(needle)
        if hits <= 0:
            continue

        # Position of the first occurrence, as a fraction of the text length.
        first_at = haystack.find(needle)
        fraction = first_at / haystack_len if haystack_len > 0 else 0
        if fraction < 0.15:
            where = "Intro"
        elif fraction > 0.85:
            where = "Conclusion"
        else:
            where = f"Mid-article (~{int(fraction*100)}%)"

        found.append({
            'section': where,
            'target': path,
            'anchor': anchor,
            'priority': "High" if hits >= 3 else "Med",
            'keyword': keyword,
            'count': hits,
        })

    found.sort(key=lambda item: (item['priority'] != 'High', -item['count']))
    return found[:10]
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def _link_key(url):
    """Return *url* without fragment, query string or trailing slash (duplicate key)."""
    # Order matters: the slash must be stripped last, otherwise
    # ".../a/?x=1" and ".../a/#top" keep their slash and never match ".../a".
    return url.split('#')[0].split('?')[0].rstrip('/')


def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    """Fetch one page and audit every link found in its body content.

    Steps: download the page, locate the body element, collect its
    ``<a href>`` links, check each distinct URL in a thread pool, then flag
    follow-attribute issues and duplicates and build link suggestions.

    Args:
        page_url: Absolute URL of the page to audit.
        domain: Site domain used to split links into internal/external.
        body_selectors: CSS selectors tried in order to find the body
            element; defaults to DEFAULT_BODY_SELECTORS.
        suggestion_map: Keyword map forwarded to generate_suggestions.
        timeout: Timeout in seconds for the page fetch and each link check.
        concurrent_workers: Thread-pool size for link checks (minimum 1).

    Returns:
        dict: Always contains every key initialised below. On a fetch or
        parse failure only ``url`` and ``error`` are meaningful. Note that
        ``follow_flags`` holds every link carrying at least one flag, which
        includes duplicate flags as well as follow-attribute flags.
    """
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS

    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }

    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        # Any fetch failure is reported per page instead of aborting the batch.
        result['error'] = str(e)
        return result

    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result

    body_text = body_el.get_text(' ', strip=True)

    # --- Collect links -----------------------------------------------------
    url_locations = defaultdict(list)  # duplicate key -> locations in body
    raw_links = []
    for tag in body_el.find_all('a', href=True):
        href = normalize_url(tag['href'], page_url)
        if not href:
            continue
        anchor = tag.get_text(strip=True) or "[no text]"
        location = get_link_location(tag, body_el)
        raw_links.append({
            'url': href, 'anchor': anchor[:100], 'follow': get_follow_status(tag),
            'location': location,
            'type': 'internal' if is_internal(href, domain) else 'external',
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        })
        url_locations[_link_key(href)].append(location)

    # --- Check each distinct URL once, in parallel ---------------------------
    unique_urls = list({link['url'] for link in raw_links})
    status_map = {}
    if unique_urls:
        # ThreadPoolExecutor rejects max_workers <= 0; None keeps its default.
        workers = concurrent_workers
        if workers is not None:
            workers = max(1, int(workers))
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = [executor.submit(check_url_status, u, timeout) for u in unique_urls]
            for future in concurrent.futures.as_completed(futures):
                url, status, link_status, redirect_url = future.result()
                status_map[url] = (status, link_status, redirect_url)

    # --- Apply statuses and flags ------------------------------------------
    for link in raw_links:
        checked = status_map.get(link['url'])
        if checked is not None:
            link['status_code'], link['link_status'], link['redirect_url'] = checked

        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow — should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow — should be Nofollow')

        # Duplicate flag comes after the follow flags, one pass over the links.
        occurrences = len(url_locations[_link_key(link['url'])])
        if occurrences > 1:
            link['flags'].append(f'Duplicate: appears {occurrences}x in body')

    duplicates = [
        {'url': key, 'count': len(locations), 'locations': locations}
        for key, locations in url_locations.items()
        if len(locations) > 1
    ]

    # --- Bucket links by type / status -------------------------------------
    for link in raw_links:
        scope = 'internal' if link['type'] == 'internal' else 'external'
        short = 'int' if scope == 'internal' else 'ext'
        result[f'{scope}_links'].append(link)
        result[f'{short}_df' if link['follow'] == 'Dofollow' else f'{short}_nf'] += 1
        if link['link_status'] == 'Broken':
            result[f'broken_{scope}'].append(link)
        if link['link_status'] == 'Redirect':
            result[f'redirect_{scope}'].append(link)
        if link['flags']:
            result['follow_flags'].append(link)

    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)

    existing_int_urls = [link['url'] for link in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)

    return result
|
db.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Supabase Database Helper
|
| 3 |
+
Saves/loads audit results in batches.
|
| 4 |
+
|
| 5 |
+
═══════════════════════════════════════════════════════════════
|
| 6 |
+
RUN THIS SQL IN SUPABASE SQL EDITOR (one-time setup):
|
| 7 |
+
═══════════════════════════════════════════════════════════════
|
| 8 |
+
|
| 9 |
+
CREATE TABLE audit_runs (
|
| 10 |
+
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
| 11 |
+
name TEXT NOT NULL,
|
| 12 |
+
domain TEXT NOT NULL,
|
| 13 |
+
total_urls INTEGER DEFAULT 0,
|
| 14 |
+
completed_urls INTEGER DEFAULT 0,
|
| 15 |
+
status TEXT DEFAULT 'running' CHECK (status IN ('running', 'paused', 'completed', 'error')),
|
| 16 |
+
created_at TIMESTAMPTZ DEFAULT now(),
|
| 17 |
+
updated_at TIMESTAMPTZ DEFAULT now(),
|
| 18 |
+
summary JSONB DEFAULT '{}'::jsonb,
|
| 19 |
+
pending_urls JSONB DEFAULT '[]'::jsonb
|
| 20 |
+
);
|
| 21 |
+
|
| 22 |
+
CREATE TABLE audit_pages (
|
| 23 |
+
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
| 24 |
+
run_id UUID REFERENCES audit_runs(id) ON DELETE CASCADE,
|
| 25 |
+
url TEXT NOT NULL,
|
| 26 |
+
result JSONB NOT NULL,
|
| 27 |
+
created_at TIMESTAMPTZ DEFAULT now()
|
| 28 |
+
);
|
| 29 |
+
|
| 30 |
+
CREATE INDEX idx_audit_pages_run_id ON audit_pages(run_id);
|
| 31 |
+
CREATE INDEX idx_audit_pages_url ON audit_pages(url);
|
| 32 |
+
CREATE INDEX idx_audit_runs_status ON audit_runs(status);
|
| 33 |
+
|
| 34 |
+
ALTER TABLE audit_runs ENABLE ROW LEVEL SECURITY;
|
| 35 |
+
ALTER TABLE audit_pages ENABLE ROW LEVEL SECURITY;
|
| 36 |
+
CREATE POLICY "Allow all on audit_runs" ON audit_runs FOR ALL USING (true);
|
| 37 |
+
CREATE POLICY "Allow all on audit_pages" ON audit_pages FOR ALL USING (true);
|
| 38 |
+
|
| 39 |
+
═══════════════════════════════════════════════════════════════
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
import json
|
| 43 |
+
from datetime import datetime
|
| 44 |
+
from supabase import create_client, Client
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_client(url: str, key: str) -> Client:
    """Create a Supabase client for the given project URL and API key."""
    supabase_client = create_client(url, key)
    return supabase_client
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ─── Run Management ───
|
| 52 |
+
|
| 53 |
+
def create_run(client: Client, name: str, domain: str, total_urls: int, all_urls: list) -> str:
    """Insert a new row into ``audit_runs`` and return its id.

    The run starts in status "running" with zero completed URLs; *all_urls*
    is stored JSON-encoded in ``pending_urls``.
    """
    run_row = dict(
        name=name,
        domain=domain,
        total_urls=total_urls,
        completed_urls=0,
        status="running",
        pending_urls=json.dumps(all_urls),
    )
    inserted = client.table("audit_runs").insert(run_row).execute()
    first_row = inserted.data[0]
    return first_row["id"]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_run(client: Client, run_id: str):
    """Fetch the single ``audit_runs`` row whose id is *run_id* and return its data."""
    query = client.table("audit_runs").select("*").eq("id", run_id).single()
    return query.execute().data
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def get_all_runs(client: Client):
    """Return every ``audit_runs`` row, newest first (ordered by ``created_at``)."""
    query = client.table("audit_runs").select("*")
    newest_first = query.order("created_at", desc=True)
    return newest_first.execute().data
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def update_run_status(client: Client, run_id: str, status: str, completed: int = None, summary: dict = None):
    """Update the status (and optionally progress and summary) of a run.

    Args:
        client: Supabase client.
        run_id: Id of the ``audit_runs`` row to update.
        status: New status value.
        completed: When given, stored as ``completed_urls``.
        summary: When given, stored as ``summary`` after a JSON round-trip
            that stringifies values JSON cannot represent.

    ``updated_at`` is always refreshed with a timezone-aware UTC timestamp.
    """
    # Local import: the module only imports `datetime` itself.
    from datetime import timezone

    # utcnow() is deprecated and yields a naive value without an offset;
    # an explicit "+00:00" is unambiguous for a TIMESTAMPTZ column.
    data = {"status": status, "updated_at": datetime.now(timezone.utc).isoformat()}
    if completed is not None:
        data["completed_urls"] = completed
    if summary is not None:
        data["summary"] = json.loads(json.dumps(summary, default=str))
    client.table("audit_runs").update(data).eq("id", run_id).execute()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def delete_run(client: Client, run_id: str):
    """Delete a run: first its ``audit_pages`` rows, then the ``audit_runs`` row."""
    deletions = (("audit_pages", "run_id"), ("audit_runs", "id"))
    for table_name, key_column in deletions:
        client.table(table_name).delete().eq(key_column, run_id).execute()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ─── Page Results ───
|
| 91 |
+
|
| 92 |
+
def save_batch_results(client: Client, run_id: str, batch_results: list):
    """Insert one ``audit_pages`` row per result, 50 rows per request.

    Each result is passed through a JSON round-trip (``default=str``) so
    the stored payload only contains JSON-representable values.
    """
    rows = []
    for item in batch_results:
        serializable = json.loads(json.dumps(item, default=str))
        rows.append(dict(run_id=run_id, url=item['url'], result=serializable))

    start = 0
    while start < len(rows):
        client.table("audit_pages").insert(rows[start:start + 50]).execute()
        start += 50
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def get_completed_urls(client: Client, run_id: str) -> set:
    """Return the set of page URLs already stored for *run_id*.

    Rows are read in pages of 1000. The query is ordered by the primary key
    because offset pagination over an unordered result may skip or repeat
    rows between requests.
    """
    urls = set()
    offset = 0
    page_size = 1000
    while True:
        response = (
            client.table("audit_pages")
            .select("url")
            .eq("run_id", run_id)
            .order("id")  # stable order => consistent page boundaries
            .range(offset, offset + page_size - 1)
            .execute()
        )
        rows = response.data or []
        urls.update(row["url"] for row in rows)
        if len(rows) < page_size:
            # A short (or empty) page means the last page was reached.
            break
        offset += page_size
    return urls
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_completed_count(client: Client, run_id: str) -> int:
    """Return how many ``audit_pages`` rows exist for *run_id* (0 when no count is reported)."""
    query = client.table("audit_pages").select("id", count="exact").eq("run_id", run_id)
    total = query.execute().count
    return total if total else 0
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_all_page_results(client: Client, run_id: str) -> list:
    """Return every stored ``{url, result}`` row of *run_id*, oldest first.

    Rows are read in pages of 500, ordered by ``created_at`` and then by
    ``id``. The second key is needed because rows written by one batch
    insert can share the same ``created_at`` value; without a unique
    tie-breaker, page boundaries falling inside such a group are not
    deterministic and rows could be repeated or missed.
    """
    all_pages = []
    offset = 0
    page_size = 500
    while True:
        response = (
            client.table("audit_pages")
            .select("url, result")
            .eq("run_id", run_id)
            .order("created_at", desc=False)
            .order("id", desc=False)  # unique tie-breaker for stable paging
            .range(offset, offset + page_size - 1)
            .execute()
        )
        rows = response.data or []
        all_pages.extend(rows)
        if len(rows) < page_size:
            # A short (or empty) page means the last page was reached.
            break
        offset += page_size
    return all_pages
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def get_pending_urls(client: Client, run_id: str) -> list:
    """Return the ``pending_urls`` value of a run as a list.

    The stored value may be a JSON-encoded string (decoded here) or an
    already-decoded list; a missing or empty value yields ``[]``.
    """
    pending = get_run(client, run_id).get("pending_urls")
    if not isinstance(pending, str):
        return pending or []
    return json.loads(pending)
|
report_generator.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HTML Report Generator
|
| 3 |
+
Produces the interactive accordion-based light-theme audit report.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import html as html_module
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def esc(text):
    """HTML-escape *text* for safe inclusion in the report.

    ``None`` and the empty string give ``""``. Every other value, including
    falsy ones such as ``0``, is converted with ``str()`` and escaped, so a
    numeric zero is rendered as "0" rather than dropped.
    """
    if text is None or text == "":
        return ""
    return html_module.escape(str(text))
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def short_url(url, domain="edstellar.com"):
    """Return a compact display form of *url*.

    URLs on *domain* (or one of its sub-domains) are shown as path plus
    query, with ``/`` for the site root. Other URLs are shown as host plus
    path, where a leading ``www.`` is dropped and paths longer than 50
    characters are shortened to their first 25 and last 20 characters.

    Args:
        url: Any value; it is converted with ``str()`` before parsing.
        domain: Site domain; a leading ``www.`` is ignored.
    """
    parsed = urlparse(str(url))
    host = (parsed.hostname or "").lower()

    site = (domain or "").strip().lower()
    if site.startswith("www."):
        site = site[4:]

    # Label-wise host match: "notexample.com" must not count as "example.com".
    # An empty domain keeps its previous meaning of "treat everything as internal".
    if not site or host == site or host.endswith("." + site):
        query = f"?{parsed.query}" if parsed.query else ""
        return (parsed.path or "/") + query

    display_host = parsed.netloc
    if display_host.lower().startswith("www."):
        # Only a leading "www." is removed, never one inside the host name.
        display_host = display_host[4:]

    path = parsed.path
    if len(path) > 50:
        path = path[:25] + '...' + path[-20:]
    return f"{display_host}{path}"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def badge(text, cls):
    """Return a ``<span class="badge ...">`` element with *text* HTML-escaped.

    *cls* is inserted into the class attribute as given.
    """
    label = esc(text)
    return '<span class="badge ' + f'{cls}' + '">' + label + '</span>'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def render_link_entry(link, domain="edstellar.com", show_flags=True):
    """Render one audited link as a ``<div class="le ...">`` HTML fragment.

    Args:
        link: Link record with the keys ``url``, ``anchor``, ``follow``,
            ``location``, ``status_code`` and ``link_status``; ``flags`` and
            ``redirect_url`` are optional.
        domain: Site domain, used to shorten displayed URLs.
        show_flags: When True, one issue line is emitted per flag.

    Returns:
        str: HTML for the entry. It gets class ``s-issue`` when the link is
        broken, redirected or flagged, otherwise ``s-ok``.
    """
    # NOTE(review): the icon glyphs below were mis-encoded in the reviewed
    # source and have been restored to the most likely originals; confirm
    # against the intended design.
    warn_icon = '⚠'
    ok_icon = '✓'
    arrow_icon = '→'
    pin_icon = '📍'

    flags = link.get('flags') or []  # tolerate a missing or None flag list
    link_status = link['link_status']
    is_broken = link_status == 'Broken'
    is_redirect = link_status == 'Redirect'
    cls = 's-issue' if (is_broken or is_redirect or flags) else 's-ok'

    code = esc(link["status_code"])
    if is_broken:
        status_tag = f'<span class="tag-sm issue">{code} BROKEN</span>'
    elif is_redirect:
        status_tag = f'<span class="tag-sm issue">{code} Redirect</span>'
    else:
        status_tag = f'<span class="tag-sm ok">{code}</span>'

    # The follow tag is marked as an issue only when a follow-attribute flag
    # (not merely a duplicate flag) is present.
    follow = esc(link["follow"])
    has_follow_flag = any('Dofollow' in f or 'Nofollow' in f for f in flags)
    if has_follow_flag:
        follow_tag = f'<span class="tag-sm issue">{follow} {warn_icon}</span>'
    else:
        follow_tag = f'<span class="tag-sm ok">{follow} {ok_icon}</span>'

    parts = [
        f'<div class="le {cls}">',
        f'<div class="le-url">{esc(short_url(link["url"], domain))}</div>',
        f'<div class="le-tags">{status_tag}{follow_tag}</div>',
        f'<div class="le-anchor">Anchor: <b>{esc(link["anchor"])}</b></div>',
    ]

    if link.get('redirect_url'):
        parts.append(
            f'<div class="le-redir">{arrow_icon} {esc(short_url(link["redirect_url"], domain))}</div>'
        )

    parts.append(f'<div class="le-location">{pin_icon} {esc(link["location"])}</div>')

    if show_flags:
        parts.extend(f'<div class="le-issue">{warn_icon} {esc(flag)}</div>' for flag in flags)

    parts.append('</div>')
    return "".join(parts)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def render_accordion(collapsed_html, details_html):
    """Wrap a summary and its details in an expandable table cell.

    Both arguments are inserted as raw HTML, so callers must pass markup
    that is already escaped. The cell calls the page's ``toggleAcc``
    handler when clicked.
    """
    # The chevron is U+25B6 (black right-pointing triangle); it appeared
    # mis-encoded in the reviewed source.
    return f'''<td class="acc-cell" onclick="toggleAcc(this)">
<div class="acc-collapsed"><div class="acc-chevron">▶</div><div class="acc-summary">{collapsed_html}</div></div>
<div class="acc-details">{details_html}</div>
</td>'''
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# Pages with fewer internal links than this get a red count cell and a note.
# The toolbar's "Low Links" filter in the page script uses its own, wider
# cutoff (< 5) and is intentionally left as it was.
_LOW_INTERNAL_LINKS = 3

# NOTE(review): several runtime strings below still contain a bare "β" or "π".
# They look like UTF-8 symbols (check marks, dashes, arrows, a pin) that were
# decoded with the wrong charset; the original code points cannot be recovered
# from this copy, so they are left untouched. Restore them from the upstream
# file. Sequences that could be decoded with confidence (warning sign, chevron,
# middle dot) have been restored.


def _empty_note(message, color):
    """Return the small placeholder ``<div>`` used when an accordion is empty.

    ``color`` is the suffix of a CSS custom property from the report
    stylesheet (``red``, ``green`` or ``text-dim``).
    """
    return f'<div style="font-size:11px;color:var(--{color});">{message}</div>'


def _count_badge(count, label):
    """Return an issue badge ``"<count> <label>"``, or the ok badge when zero."""
    if count > 0:
        return badge(f'{count} {label}', 'issue')
    return badge('β None', 'ok')


def _link_entries(links, domain, **kwargs):
    """Concatenate ``render_link_entry`` output for every link in ``links``."""
    return "".join(render_link_entry(link, domain, **kwargs) for link in links)


def _error_row(idx, r, domain):
    """Render the two-cell table row used for a page whose audit failed."""
    url_html = esc(short_url(r["url"], domain))
    # 1 URL cell + colspan 15 spans the 16 header columns.
    return f'''<tr><td class="cell-url"><div class="row-num">#{idx}</div>{url_html}</td>
<td colspan="15" style="color:var(--red);vertical-align:middle;">β Error: {esc(r["error"])}</td></tr>'''


def _result_row(idx, r, is_orphan, domain):
    """Render the full 16-column table row for one successfully audited page."""
    int_count = r['int_count']
    ext_count = r['ext_count']
    broken_int = r['broken_int_count']
    broken_ext = r['broken_ext_count']
    redirect_int = r['redirect_int_count']
    redirect_ext = r['redirect_ext_count']
    dup_count = r['duplicate_count']
    broken_total = broken_int + broken_ext
    redirect_total = redirect_int + redirect_ext

    # Links that violate the follow policy this report checks for:
    # internal links marked Nofollow and external links marked Dofollow.
    int_nofollow = [link for link in r['internal_links'] if link['follow'] == 'Nofollow']
    ext_dofollow = [link for link in r['external_links'] if link['follow'] == 'Dofollow']

    # --- Internal links -------------------------------------------------
    if int_count == 0:
        int_badges = badge('0 Links ⚠', 'issue')
    else:
        parts = []
        ok_int = int_count - broken_int - redirect_int
        if ok_int > 0:
            parts.append(badge(f'{ok_int} Active', 'ok'))
        if broken_int > 0:
            parts.append(badge(f'{broken_int} Broken', 'issue'))
        if redirect_int > 0:
            parts.append(badge(f'{redirect_int} Redirect', 'issue'))
        int_badges = "".join(parts)
    int_details = _link_entries(r['internal_links'], domain) or _empty_note(
        'No internal links found in body content.', 'red')

    # --- External links -------------------------------------------------
    if ext_count == 0:
        ext_badges = badge('0 Links', 'neutral')
    else:
        parts = []
        if broken_ext > 0:
            parts.append(badge(f'{broken_ext} Broken', 'issue'))
        if redirect_ext > 0:
            parts.append(badge(f'{redirect_ext} Redirect', 'issue'))
        if ext_dofollow:
            parts.append(badge(f'{len(ext_dofollow)} Dofollow ⚠', 'issue'))
        ok_ext = ext_count - broken_ext - redirect_ext
        # The "Active" badge is only shown when the cell has no issue badge.
        if ok_ext > 0 and not broken_ext and not redirect_ext and not ext_dofollow:
            parts.append(badge(f'{ok_ext} Active', 'ok'))
        ext_badges = "".join(parts)
    ext_details = _link_entries(r['external_links'], domain) or _empty_note(
        'No external links in body content.', 'text-dim')

    # --- Follow flags ---------------------------------------------------
    flag_badges = ""
    if int_nofollow:
        flag_badges += badge(f'{len(int_nofollow)} Int. Nofollow ⚠', 'issue')
    if ext_dofollow:
        flag_badges += badge(f'{len(ext_dofollow)} Ext. Dofollow ⚠', 'issue')
    if not flag_badges:
        flag_badges = badge('β No Flags', 'ok')
    flag_details = _link_entries(int_nofollow + ext_dofollow, domain, show_flags=True) or _empty_note(
        'All internal=Dofollow β and external=Nofollow β', 'green')

    # --- Broken / redirect ----------------------------------------------
    bi_badges = _count_badge(broken_int, 'Broken')
    bi_details = _link_entries(r['broken_internal'], domain) or _empty_note(
        'No broken internal links.', 'green')
    be_badges = _count_badge(broken_ext, 'Broken')
    be_details = _link_entries(r['broken_external'], domain) or _empty_note(
        'No broken external links.', 'green')
    ri_badges = _count_badge(redirect_int, 'Redirects')
    ri_details = _link_entries(r['redirect_internal'], domain) or _empty_note(
        'No internal redirects.', 'green')
    re_badges = _count_badge(redirect_ext, 'Redirects')
    re_details = _link_entries(r['redirect_external'], domain) or _empty_note(
        'No external redirects.', 'green')

    # --- Duplicates -----------------------------------------------------
    dup_badges = _count_badge(dup_count, 'Duplicates')
    dup_parts = []
    for dup in r['duplicates']:
        locs = ", ".join(esc(loc) for loc in dup['locations'])
        dup_parts.append(
            f'<div class="le s-issue"><div class="le-url">{esc(short_url(dup["url"], domain))}</div>'
            f'<div class="le-issue">β Appears {dup["count"]}x in body content</div>'
            f'<div class="le-location">π Locations: {locs}</div></div>'
        )
    dup_details = "".join(dup_parts) or _empty_note(
        'No duplicate links in body content.', 'green')

    # --- Suggestions ----------------------------------------------------
    suggestions = r['suggestions']
    high_count = sum(1 for s in suggestions if s['priority'] == 'High')
    if suggestions:
        sug_badges = badge(f'{len(suggestions)} Suggestions', 'sug')
        if high_count:
            sug_badges += badge(f'{high_count} High', 'issue')
    else:
        sug_badges = badge('0', 'neutral')
    sug_parts = []
    for s in suggestions:
        pri_cls = 'high' if s['priority'] == 'High' else 'med'
        # Priority is escaped like every other suggestion field.
        sug_parts.append(f'''<div class="se"><div class="se-head"><span class="se-section">{esc(s["section"])}</span>
<span class="se-pri {pri_cls}">{esc(s["priority"])}</span></div>
<div class="se-url">{esc(s["target"])}</div>
<div class="se-anchor">β "{esc(s["anchor"])}"</div></div>''')
    sug_details = "".join(sug_parts) or _empty_note(
        'No keyword matches for suggestions.', 'text-dim')

    # --- Notes ----------------------------------------------------------
    issues = []
    if int_count < _LOW_INTERNAL_LINKS:
        issues.append(f"Only {int_count} internal links β very low for article length")
    if broken_total > 0:
        issues.append(f"{broken_total} broken link(s) need fixing")
    if ext_dofollow:
        issues.append(f"{len(ext_dofollow)} external links are Dofollow β add nofollow")
    if int_nofollow:
        issues.append(f"{len(int_nofollow)} internal links are Nofollow β should be Dofollow")
    if redirect_total > 0:
        issues.append(f"{redirect_total} redirect(s) β update href")
    if dup_count > 0:
        issues.append(f"{dup_count} duplicate link(s) in body")
    if is_orphan:
        issues.append("ORPHAN PAGE β no incoming internal links from other pages")

    note_badges = badge(f'{len(issues)} Issues', 'issue') if issues else badge('β Clean', 'ok')
    note_details = "".join(
        f'<div class="ni critical">β {esc(issue)}</div>' for issue in issues
    ) or _empty_note('No issues detected.', 'green')

    if is_orphan:
        orphan_cell = '<span class="badge issue">Yes ⚠</span>'
    else:
        orphan_cell = '<span class="badge ok">No</span>'

    int_cls = 'issue' if int_count < _LOW_INTERNAL_LINKS else 'neutral'
    int_nf_color = 'var(--red)' if r['int_nf'] > 0 else 'var(--text-dim)'
    ext_df_color = 'var(--red)' if r['ext_df'] > 0 else 'var(--text-dim)'

    # The data-* attributes are what the toolbar filters in the page script read.
    return f'''<tr data-broken="{broken_total}" data-redirect="{redirect_total}" data-internal="{int_count}" data-follow-flag="{r['follow_flag_count']}" data-duplicates="{dup_count}" data-orphan="{1 if is_orphan else 0}">
<td class="cell-url"><div class="row-num">#{idx}</div>{esc(short_url(r['url'], domain))}</td>
<td class="cell-count {int_cls}">{int_count}</td>
<td class="cell-count neutral">{ext_count}</td>
{render_accordion(int_badges, int_details)}
{render_accordion(ext_badges, ext_details)}
<td class="cell-count"><span style="color:var(--green)">{r['int_df']}</span><span style="color:var(--text-dim)"> / </span><span style="color:{int_nf_color}">{r['int_nf']}</span></td>
<td class="cell-count"><span style="color:{ext_df_color}">{r['ext_df']}</span><span style="color:var(--text-dim)"> / </span><span style="color:var(--green)">{r['ext_nf']}</span></td>
{render_accordion(flag_badges, flag_details)}
{render_accordion(bi_badges, bi_details)}
{render_accordion(be_badges, be_details)}
{render_accordion(ri_badges, ri_details)}
{render_accordion(re_badges, re_details)}
{render_accordion(dup_badges, dup_details)}
<td class="cell-orphan">{orphan_cell}</td>
{render_accordion(sug_badges, sug_details)}
{render_accordion(note_badges, note_details)}
</tr>'''


def generate_report(results, orphan_pages, domain="edstellar.com"):
    """Build the standalone HTML audit report and return it as a string.

    Args:
        results: Sequence of per-page result dicts. Every entry is read for
            ``url``, ``error`` and the count keys used in the summary totals
            (``int_count``, ``ext_count``, ``broken_*_count``,
            ``redirect_*_count``, ``follow_flag_count``, ``duplicate_count``,
            ``suggestions``) -- including entries whose ``error`` is truthy,
            because totals are summed before the error check. Entries without
            an error are additionally read for the link lists
            (``internal_links``, ``external_links``, ``broken_internal``,
            ``broken_external``, ``redirect_internal``, ``redirect_external``,
            ``duplicates``) and the ``int_df``/``int_nf``/``ext_df``/``ext_nf``
            counters.
        orphan_pages: Collection of orphan page URLs; only ``len()`` and
            membership tests are used. A page is flagged when its URL, with
            trailing ``/`` stripped and anything from the first ``?`` dropped,
            is in this collection.
            NOTE(review): entries are presumably normalised the same way by
            whoever builds the collection -- confirm against the caller.
        domain: Passed to ``short_url`` / ``render_link_entry`` and shown in
            the report header.

    Returns:
        A complete ``<!DOCTYPE html>`` document with inline CSS and script.
    """
    now = datetime.now().strftime("%b %d, %Y %H:%M")
    total_pages = len(results)
    total_int = sum(r['int_count'] for r in results)
    total_ext = sum(r['ext_count'] for r in results)
    total_broken = sum(r['broken_int_count'] + r['broken_ext_count'] for r in results)
    total_redirects = sum(r['redirect_int_count'] + r['redirect_ext_count'] for r in results)
    total_flags = sum(r['follow_flag_count'] for r in results)
    total_dups = sum(r['duplicate_count'] for r in results)
    total_sug = sum(len(r['suggestions']) for r in results)
    total_orphan = len(orphan_pages)

    # Collect rows and join once instead of growing one string per iteration.
    rows = []
    for idx, r in enumerate(results, 1):
        if r['error']:
            rows.append(_error_row(idx, r, domain))
            continue
        is_orphan = r['url'].rstrip('/').split('?')[0] in orphan_pages
        rows.append(_result_row(idx, r, is_orphan, domain))
    rows_html = "".join(rows)

    # Doubled braces below are literal braces in the CSS/JS; single braces
    # are f-string substitutions.
    report = f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>Bulk Link Audit Report</title>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
<style>
:root {{--bg:#f4f6f9;--surface:#fff;--surface-2:#f8f9fb;--surface-3:#eef1f5;--border:#e2e6ed;--border-light:#d0d5de;--text:#1a1f2e;--text-muted:#5c6478;--text-dim:#8892a6;--accent:#2563eb;--accent-dim:rgba(37,99,235,0.06);--accent-light:rgba(37,99,235,0.10);--green:#059669;--green-dim:rgba(5,150,105,0.06);--green-light:rgba(5,150,105,0.10);--red:#dc2626;--red-dim:rgba(220,38,38,0.05);--red-light:rgba(220,38,38,0.08);--orange:#d97706;--orange-dim:rgba(217,119,6,0.06);--purple:#7c3aed;--purple-dim:rgba(124,58,237,0.06);--pink:#db2777;--pink-dim:rgba(219,39,119,0.06);--mono:'JetBrains Mono',monospace;--font:'DM Sans',sans-serif;--radius:8px;--radius-sm:5px;}}
*{{margin:0;padding:0;box-sizing:border-box;}}body{{background:var(--bg);color:var(--text);font-family:var(--font);-webkit-font-smoothing:antialiased;line-height:1.55;}}
.header{{padding:32px 36px 24px;border-bottom:1px solid var(--border);background:var(--surface);}}.header .tag-line{{font-size:10px;font-weight:700;letter-spacing:1.8px;text-transform:uppercase;color:var(--accent);margin-bottom:8px;}}.header h1{{font-size:21px;font-weight:700;margin-bottom:4px;}}.header .meta{{font-size:11.5px;color:var(--text-dim);}}.header .meta span{{color:var(--text-muted);}}
.toolbar{{display:flex;align-items:center;gap:10px;padding:12px 36px;background:var(--surface);border-bottom:1px solid var(--border);flex-wrap:wrap;}}.toolbar-btn{{font-family:var(--font);font-size:11px;font-weight:600;color:var(--text-muted);background:var(--surface-2);border:1px solid var(--border);border-radius:var(--radius-sm);padding:6px 14px;cursor:pointer;transition:all .15s;}}.toolbar-btn:hover{{color:var(--text);background:var(--surface-3);}}.toolbar-btn.active{{color:var(--accent);border-color:var(--accent);background:var(--accent-dim);}}.toolbar-sep{{width:1px;height:20px;background:var(--border);}}.toolbar-label{{font-size:10px;font-weight:600;letter-spacing:.5px;text-transform:uppercase;color:var(--text-dim);margin-right:4px;}}
.legend{{display:flex;flex-wrap:wrap;gap:18px;padding:10px 36px;background:var(--surface-2);border-bottom:1px solid var(--border);}}.legend-item{{display:flex;align-items:center;gap:6px;font-size:10.5px;color:var(--text-muted);}}.legend-dot{{width:8px;height:8px;border-radius:50%;}}.legend-dot.green{{background:var(--green);}}.legend-dot.red{{background:var(--red);}}.legend-dot.blue{{background:var(--accent);}}
.summary-bar{{display:flex;border-bottom:1px solid var(--border);background:var(--surface);flex-wrap:wrap;}}.summary-stat{{flex:1;padding:14px 16px;border-right:1px solid var(--border);text-align:center;min-width:90px;}}.summary-stat:last-child{{border-right:none;}}.summary-stat .s-val{{font-size:22px;font-weight:700;line-height:1;margin-bottom:2px;}}.summary-stat .s-label{{font-size:9px;font-weight:600;letter-spacing:.5px;text-transform:uppercase;color:var(--text-dim);}}.s-val.blue{{color:var(--accent);}}.s-val.green{{color:var(--green);}}.s-val.red{{color:var(--red);}}.s-val.pink{{color:var(--pink);}}
.table-wrap{{overflow-x:auto;background:var(--surface);}}table{{width:100%;border-collapse:collapse;min-width:2400px;}}
thead th{{background:var(--surface-3);color:var(--text-muted);font-size:9px;font-weight:700;letter-spacing:.7px;text-transform:uppercase;padding:12px 12px;text-align:left;border-bottom:2px solid var(--border-light);position:sticky;top:0;z-index:10;white-space:nowrap;}}thead th.center{{text-align:center;}}thead th.c-blue{{border-bottom-color:var(--accent);color:var(--accent);}}thead th.c-red{{border-bottom-color:var(--red);color:var(--red);}}thead th.c-green{{border-bottom-color:var(--green);color:var(--green);}}thead th.c-pink{{border-bottom-color:var(--pink);color:var(--pink);}}thead th:first-child{{position:sticky;left:0;z-index:15;border-right:2px solid var(--border-light);background:var(--surface-3);}}
tbody tr{{border-bottom:1px solid var(--border);transition:background .12s;}}tbody tr:hover{{background:var(--accent-dim);}}
tbody td{{padding:10px 12px;vertical-align:top;font-size:12px;border-right:1px solid var(--border);}}tbody td:last-child{{border-right:none;}}
td.cell-url{{font-family:var(--mono);font-size:11px;color:var(--accent);word-break:break-all;line-height:1.5;background:var(--surface);position:sticky;left:0;z-index:5;border-right:2px solid var(--border-light);min-width:260px;max-width:260px;}}td.cell-url .row-num{{font-size:9px;color:var(--text-dim);margin-bottom:4px;font-weight:600;}}
td.cell-count{{text-align:center;font-family:var(--mono);font-size:15px;font-weight:700;vertical-align:middle;}}td.cell-count.ok{{color:var(--green);}}td.cell-count.issue{{color:var(--red);}}td.cell-count.neutral{{color:var(--text);}}
.acc-cell{{cursor:pointer;user-select:none;}}.acc-collapsed{{display:flex;align-items:center;gap:8px;min-height:26px;}}.acc-chevron{{width:20px;height:20px;border-radius:4px;background:var(--surface-3);display:flex;align-items:center;justify-content:center;flex-shrink:0;transition:transform .2s,background .15s;font-size:10px;color:var(--text-dim);border:1px solid var(--border);}}.acc-cell.open .acc-chevron{{transform:rotate(90deg);background:var(--accent-dim);color:var(--accent);border-color:var(--accent);}}.acc-summary{{font-size:11px;color:var(--text-muted);display:flex;flex-wrap:wrap;gap:4px;align-items:center;}}
.badge{{font-size:9px;font-weight:700;padding:2.5px 8px;border-radius:10px;letter-spacing:.2px;white-space:nowrap;border:1px solid transparent;}}.badge.ok{{background:var(--green-dim);color:var(--green);border-color:var(--green-light);}}.badge.issue{{background:var(--red-dim);color:var(--red);border-color:var(--red-light);}}.badge.sug{{background:var(--green-dim);color:var(--green);border-color:var(--green-light);}}.badge.neutral{{background:var(--surface-3);color:var(--text-dim);border-color:var(--border);}}
.acc-details{{display:none;margin-top:10px;padding-top:10px;border-top:1px dashed var(--border);}}.acc-cell.open .acc-details{{display:block;}}
.le{{padding:9px 11px;background:var(--surface-2);border-radius:var(--radius-sm);margin-bottom:6px;border:1px solid var(--border);border-left-width:3px;}}.le:last-child{{margin-bottom:0;}}.le.s-ok{{border-left-color:var(--green);}}.le.s-issue{{border-left-color:var(--red);background:var(--red-dim);}}
.le .le-url{{font-family:var(--mono);font-size:10.5px;color:var(--accent);word-break:break-all;margin-bottom:4px;}}.le.s-issue .le-url{{color:var(--red);}}
.le .le-tags{{display:flex;flex-wrap:wrap;gap:4px;margin-bottom:4px;}}.tag-sm{{font-size:8.5px;font-weight:700;padding:2px 6px;border-radius:8px;}}.tag-sm.ok{{background:var(--green-light);color:var(--green);}}.tag-sm.issue{{background:var(--red-light);color:var(--red);}}
.le .le-anchor{{font-size:10.5px;color:var(--text-muted);}}.le .le-anchor b{{color:var(--text);font-weight:600;}}.le .le-redir{{font-family:var(--mono);font-size:10px;color:var(--orange);margin-top:3px;}}.le .le-location{{font-size:10px;color:var(--text-dim);margin-top:4px;}}.le .le-issue{{font-size:10px;color:var(--red);margin-top:5px;padding-top:5px;border-top:1px dashed var(--border);font-weight:500;}}.le .le-fix{{font-size:10px;color:var(--green);margin-top:3px;font-weight:500;}}
.se{{padding:8px 11px;background:var(--surface-2);border-radius:var(--radius-sm);margin-bottom:5px;border:1px solid var(--border);border-left:3px solid var(--green);}}.se:last-child{{margin-bottom:0;}}.se .se-head{{display:flex;justify-content:space-between;align-items:center;margin-bottom:2px;}}.se .se-section{{font-size:9.5px;font-weight:600;letter-spacing:.3px;text-transform:uppercase;color:var(--text-dim);}}.se-pri{{font-size:8.5px;font-weight:700;letter-spacing:.5px;text-transform:uppercase;padding:2px 7px;border-radius:8px;}}.se-pri.high{{background:var(--red-light);color:var(--red);}}.se-pri.med{{background:rgba(217,119,6,.1);color:var(--orange);}}.se .se-url{{font-family:var(--mono);font-size:10px;color:var(--accent);word-break:break-all;margin-bottom:1px;}}.se .se-anchor{{font-size:10px;color:var(--green);font-weight:500;}}
.ni{{font-size:11px;color:var(--text-muted);line-height:1.5;padding:5px 0;border-bottom:1px dashed var(--border);}}.ni:last-child{{border-bottom:none;}}.ni.critical{{color:var(--red);font-weight:500;}}
td.cell-orphan{{text-align:center;vertical-align:middle;}}
.footer{{padding:18px 36px;border-top:1px solid var(--border);font-size:11px;color:var(--text-dim);text-align:center;background:var(--surface);}}
</style>
</head>
<body>
<div class="header"><div class="tag-line">Bulk Link Audit Report</div><h1>Blog β Body Content Link Analysis</h1><div class="meta">Scope: Body content only · <span>{now}</span> · Pages: <span>{total_pages}</span> · Domain: <span>{esc(domain)}</span></div></div>
<div class="toolbar"><span class="toolbar-label">Actions:</span><button class="toolbar-btn" onclick="expandAll()">β Expand All</button><button class="toolbar-btn" onclick="collapseAll()">β Collapse All</button><div class="toolbar-sep"></div><span class="toolbar-label">Filter:</span><button class="toolbar-btn filter-btn active" onclick="filterRows('all',this)">All ({total_pages})</button><button class="toolbar-btn filter-btn" onclick="filterRows('broken',this)">Broken ({total_broken})</button><button class="toolbar-btn filter-btn" onclick="filterRows('redirect',this)">Redirects ({total_redirects})</button><button class="toolbar-btn filter-btn" onclick="filterRows('low-links',this)">Low Links</button><button class="toolbar-btn filter-btn" onclick="filterRows('follow-flag',this)">Follow Flags ({total_flags})</button><button class="toolbar-btn filter-btn" onclick="filterRows('duplicates',this)">Duplicates ({total_dups})</button><button class="toolbar-btn filter-btn" onclick="filterRows('orphan',this)">Orphans ({total_orphan})</button></div>
<div class="legend"><div class="legend-item"><div class="legend-dot green"></div> No Issues</div><div class="legend-item"><div class="legend-dot red"></div> Issue (Broken / Flag / Redirect / Duplicate)</div><div class="legend-item"><div class="legend-dot blue"></div> Info</div></div>
<div class="summary-bar"><div class="summary-stat"><div class="s-val blue">{total_pages}</div><div class="s-label">Pages</div></div><div class="summary-stat"><div class="s-val blue">{total_int}</div><div class="s-label">Internal</div></div><div class="summary-stat"><div class="s-val blue">{total_ext}</div><div class="s-label">External</div></div><div class="summary-stat"><div class="s-val red">{total_broken}</div><div class="s-label">Broken</div></div><div class="summary-stat"><div class="s-val red">{total_redirects}</div><div class="s-label">Redirects</div></div><div class="summary-stat"><div class="s-val red">{total_flags}</div><div class="s-label">Follow Flags</div></div><div class="summary-stat"><div class="s-val pink">{total_dups}</div><div class="s-label">Duplicates</div></div><div class="summary-stat"><div class="s-val green">{total_sug}</div><div class="s-label">Suggestions</div></div><div class="summary-stat"><div class="s-val red">{total_orphan}</div><div class="s-label">Orphans</div></div></div>
<div class="table-wrap"><table><thead><tr><th>URL</th><th class="center c-blue">Int.</th><th class="center c-blue">Ext.</th><th class="c-blue">Internal Links</th><th class="c-blue">External Links</th><th class="center">Int DF/NF</th><th class="center">Ext DF/NF</th><th class="c-red">Follow Flags</th><th class="c-red">Broken Int.</th><th class="c-red">Broken Ext.</th><th class="c-red">Redirect Int.</th><th class="c-red">Redirect Ext.</th><th class="c-pink">Duplicates</th><th class="center">Orphan</th><th class="c-green">Suggestions</th><th>Notes</th></tr></thead>
<tbody>{rows_html}</tbody></table></div>
<div class="footer">Bulk Link Audit · Body Content Scope · {now} · Click ▶ to expand</div>
<script>
function toggleAcc(c){{c.classList.toggle('open');}}
function expandAll(){{document.querySelectorAll('.acc-cell').forEach(c=>c.classList.add('open'));}}
function collapseAll(){{document.querySelectorAll('.acc-cell').forEach(c=>c.classList.remove('open'));}}
function filterRows(t,b){{document.querySelectorAll('.filter-btn').forEach(x=>x.classList.remove('active'));b.classList.add('active');document.querySelectorAll('tbody tr').forEach(r=>{{let d=r.dataset,s=true;if(t==='broken')s=parseInt(d.broken||0)>0;else if(t==='redirect')s=parseInt(d.redirect||0)>0;else if(t==='low-links')s=parseInt(d.internal||0)<5;else if(t==='follow-flag')s=parseInt(d.followFlag||0)>0;else if(t==='duplicates')s=parseInt(d.duplicates||0)>0;else if(t==='orphan')s=parseInt(d.orphan||0)>0;r.style.display=s?'':'none';}});}}
</script>
</body></html>'''

    return report
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.12.0
|
| 2 |
+
requests==2.32.3
|
| 3 |
+
beautifulsoup4==4.12.3
|
| 4 |
+
lxml==5.3.0
|
| 5 |
+
openpyxl==3.1.5
|
| 6 |
+
pandas==2.2.3
|
| 7 |
+
supabase==2.11.0
|