vijaykumaredstellar commited on
Commit
7f10996
Β·
verified Β·
1 Parent(s): 59df77a

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +29 -0
  2. app.py +633 -0
  3. audit_engine.py +312 -0
  4. db.py +161 -0
  5. report_generator.py +268 -0
  6. requirements.txt +7 -0
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
---
title: Link Audit Tool
emoji: 🔗
colorFrom: blue
colorTo: indigo
sdk: gradio
sdk_version: 5.12.0
app_file: app.py
pinned: false
---

# 🔗 Bulk Link Audit Tool

SEO link audit tool that crawls pages, extracts body-content links, checks status, and detects broken links, follow flags, duplicates, and orphan pages.

## Features
- **Batch processing** with auto-save to Supabase after each batch
- **Proper pause/resume** — pause mid-audit, come back later, and resume from where you left off
- **Interactive HTML report** with accordion cells, filters, and issue highlighting
- **Orphan page detection** after the full crawl
- **Follow flag detection** — internal nofollow ⚠, external dofollow ⚠
- **Duplicate link detection** with body locations

## Setup
Add these secrets in your Space settings:
- `SUPABASE_URL` — your Supabase project URL
- `SUPABASE_KEY` — your Supabase anon key

Then run the SQL schema from `db.py` in your Supabase SQL editor.
app.py ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Link Audit Tool β€” Gradio UI for Hugging Face Spaces
3
+ Proper pause/resume via threading + Supabase persistence.
4
+ """
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import time
9
+ import json
10
+ import os
11
+ import threading
12
+ import tempfile
13
+ from datetime import datetime
14
+ from audit_engine import audit_page, DEFAULT_BODY_SELECTORS, DEFAULT_SUGGESTION_MAP
15
+ from report_generator import generate_report
16
+ from db import (
17
+ get_client, create_run, get_all_runs, get_all_page_results,
18
+ get_completed_urls, get_pending_urls, get_completed_count,
19
+ save_batch_results, update_run_status, delete_run,
20
+ )
21
+
22
# ─── Supabase Connection ───
# Credentials come from HF Space secrets; both must be present for persistence.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
sb = None  # Supabase client, or None when unconfigured/unreachable (UI degrades gracefully).
if SUPABASE_URL and SUPABASE_KEY:
    try:
        sb = get_client(SUPABASE_URL, SUPABASE_KEY)
        # Cheap probe query: verifies both connectivity and that the
        # audit_runs table exists before the app starts serving.
        sb.table("audit_runs").select("id").limit(1).execute()
    except Exception as e:
        print(f"Supabase connection failed: {e}")
        sb = None
33
+
34
# ─── Audit State (thread-safe) ───
class AuditState:
    """Mutable pause/run flags shared between the Gradio UI thread and the audit loop.

    A single lock guards all three fields: ``paused`` (pause requested),
    ``running`` (an audit loop is active), and ``run_id`` (the active run).
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.paused = False
        self.running = False
        self.run_id = None

    def request_pause(self):
        """Ask the audit loop to stop at its next checkpoint."""
        self._set_paused(True)

    def resume(self):
        """Clear any previously requested pause."""
        self._set_paused(False)

    def is_paused(self):
        """True when a pause has been requested and not yet cleared."""
        with self.lock:
            return self.paused

    def set_running(self, val, run_id=None):
        """Mark the audit as running/stopped; remember run_id when given.

        A falsy run_id leaves the previously stored id untouched, so the
        last active run is still known after the loop stops.
        """
        with self.lock:
            self.running = val
            if run_id:
                self.run_id = run_id

    def is_running(self):
        """True while an audit loop is active."""
        with self.lock:
            return self.running

    def _set_paused(self, value):
        # Internal helper: locked write of the pause flag.
        with self.lock:
            self.paused = value


audit_state = AuditState()
65
+
66
+
67
# ═══════════════════════════════════════════════════
# CORE AUDIT FUNCTION (runs as generator for streaming)
# ═══════════════════════════════════════════════════

def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
    """Main audit generator — yields progress updates.

    Each yield is a 4-tuple matching the outputs wired to the Run button:
    (log text, progress/status text, pause-button update, run-button update).

    Args:
        file: uploaded CSV/XLSX gr.File (or None) containing a URL column.
        pasted_urls: newline-separated URLs typed into the textbox (fallback input).
        domain: the audited site's domain, used to classify internal vs external links.
        batch_size: pages per batch; each finished batch is persisted to Supabase.
        timeout: per-request timeout (seconds) passed to audit_page.
        delay: politeness sleep between consecutive pages (seconds).
        workers: parallel workers for per-page link checks.
    """
    if sb is None:
        yield "❌ Supabase not connected. Set SUPABASE_URL and SUPABASE_KEY in Space secrets.", "", gr.update(), gr.update()
        return

    # Parse URLs — uploaded file takes priority over pasted text.
    urls = []
    if file is not None:
        try:
            if file.name.endswith('.csv'):
                df = pd.read_csv(file.name)
            else:
                df = pd.read_excel(file.name)
            # Heuristic: pick the first column whose first cell looks like a URL
            # (starts with http or contains the domain); else fall back to column 0.
            url_col = None
            for col in df.columns:
                sample = str(df[col].iloc[0]).strip().lower()
                if sample.startswith('http') or domain in sample:
                    url_col = col
                    break
            if url_col is None:
                url_col = df.columns[0]
            urls = [u for u in df[url_col].dropna().astype(str).str.strip().tolist() if u.startswith('http')]
        except Exception as e:
            yield f"❌ File error: {e}", "", gr.update(), gr.update()
            return
    elif pasted_urls and pasted_urls.strip():
        urls = [u.strip() for u in pasted_urls.strip().split('\n') if u.strip().startswith('http')]

    if not urls:
        yield "⚠ No valid URLs found. Upload a file or paste URLs.", "", gr.update(), gr.update()
        return

    # Deduplicate preserving order
    seen = set()
    unique_urls = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            unique_urls.append(u)
    urls = unique_urls

    # Register the run in Supabase up front so progress can be resumed later.
    run_name = f"{domain} Audit — {datetime.now().strftime('%b %d %H:%M')} — {len(urls)} pages"
    run_id = create_run(sb, run_name, domain, len(urls), urls)

    audit_state.set_running(True, run_id)
    audit_state.resume()  # Reset pause flag

    total = len(urls)
    start_time = time.time()
    batch_num = 0
    log_lines = []  # rolling log; only the last 40 lines are shown in the UI

    yield f"🚀 Started: {run_name}\n📦 {total} URLs · Batch size: {batch_size}", "", gr.update(interactive=True), gr.update(interactive=False)

    try:
        for batch_start in range(0, total, batch_size):
            # Check for pause (checkpoint at batch boundary).
            if audit_state.is_paused():
                completed = get_completed_count(sb, run_id)
                update_run_status(sb, run_id, "paused", completed)
                log_lines.append(f"⏸️ PAUSED at {completed}/{total} — resume from Past Runs")
                audit_state.set_running(False)
                yield "\n".join(log_lines[-40:]), "", gr.update(interactive=False), gr.update(interactive=False)
                return

            batch_end = min(batch_start + batch_size, total)
            batch_urls = urls[batch_start:batch_end]
            batch_num += 1
            batch_results = []

            for j, url in enumerate(batch_urls):
                # Check pause between each URL
                if audit_state.is_paused():
                    # Save partial batch so no completed work is lost.
                    if batch_results:
                        save_batch_results(sb, run_id, batch_results)
                    completed = get_completed_count(sb, run_id)
                    update_run_status(sb, run_id, "paused", completed)
                    log_lines.append(f"⏸️ PAUSED at {completed}/{total}")
                    audit_state.set_running(False)
                    yield "\n".join(log_lines[-40:]), "", gr.update(interactive=False), gr.update(interactive=False)
                    return

                # ETA from the running average of per-page time so far.
                global_idx = batch_start + j + 1
                elapsed = time.time() - start_time
                avg = elapsed / global_idx
                eta = avg * (total - global_idx)
                eta_str = f"{int(eta//60)}m {int(eta%60)}s" if eta > 60 else f"{eta:.0f}s"

                result = audit_page(
                    url, domain, DEFAULT_BODY_SELECTORS,
                    suggestion_map=DEFAULT_SUGGESTION_MAP,
                    timeout=timeout, concurrent_workers=workers,
                )
                batch_results.append(result)

                # One compact log line per page: counts plus issue flags.
                short = url.replace('https://www.', '').replace('https://', '')[:70]
                if result['error']:
                    log_lines.append(f"❌ [{global_idx}/{total}] {short} — {result['error'][:50]}")
                else:
                    b = result['broken_int_count'] + result['broken_ext_count']
                    fc = result['follow_flag_count']
                    d = result['duplicate_count']
                    flags = []
                    if b: flags.append(f"🔴 {b} broken")
                    if fc: flags.append(f"🟡 {fc} flags")
                    if d: flags.append(f"🟣 {d} dups")
                    flag_str = " · ".join(flags) if flags else "✅"
                    log_lines.append(f"[{global_idx}/{total}] {short} — Int:{result['int_count']} Ext:{result['ext_count']} · {flag_str}")

                progress_text = f"📊 Progress: {global_idx}/{total} ({global_idx*100//total}%) · Batch {batch_num} · ETA: {eta_str}"
                yield "\n".join(log_lines[-40:]), progress_text, gr.update(interactive=True), gr.update(interactive=False)

                # Politeness delay between pages (skipped after the last one).
                if j < len(batch_urls) - 1:
                    time.sleep(delay)

            # Save batch to Supabase (best-effort: a save failure is logged, not fatal).
            if batch_results:
                try:
                    save_batch_results(sb, run_id, batch_results)
                    completed = get_completed_count(sb, run_id)
                    update_run_status(sb, run_id, "running", completed)
                    log_lines.append(f"💾 Batch {batch_num} saved — {completed}/{total} done")
                except Exception as e:
                    log_lines.append(f"⚠ Batch save error: {str(e)[:60]}")

            yield "\n".join(log_lines[-40:]), f"📊 Progress: {min(batch_end, total)}/{total} · Saved batch {batch_num}", gr.update(interactive=True), gr.update(interactive=False)
            del batch_results  # release batch memory before the next loop

        # ── ALL DONE — Orphan analysis ──
        log_lines.append("🔍 Running orphan page analysis...")
        yield "\n".join(log_lines[-40:]), f"📊 Orphan analysis...", gr.update(interactive=False), gr.update(interactive=False)

        all_pages = get_all_page_results(sb, run_id)
        all_results = [p['result'] for p in all_pages]

        # An orphan is an audited page that no other audited page links to.
        # URLs are normalized by stripping trailing slash and query string.
        all_internal_targets = set()
        all_page_urls = set()
        for r in all_results:
            all_page_urls.add(r['url'].rstrip('/').split('?')[0])
            for link in r.get('internal_links', []):
                all_internal_targets.add(link['url'].rstrip('/').split('?')[0])
        orphan_pages = sorted([p for p in all_page_urls if p not in all_internal_targets])

        # Aggregate run totals stored with the run row (drives the Past Runs table).
        summary = {
            'total_pages': len(all_results),
            'total_int': sum(r.get('int_count', 0) for r in all_results),
            'total_ext': sum(r.get('ext_count', 0) for r in all_results),
            'total_broken': sum(r.get('broken_int_count', 0) + r.get('broken_ext_count', 0) for r in all_results),
            'total_redirects': sum(r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0) for r in all_results),
            'total_flags': sum(r.get('follow_flag_count', 0) for r in all_results),
            'total_dups': sum(r.get('duplicate_count', 0) for r in all_results),
            'total_sug': sum(len(r.get('suggestions', [])) for r in all_results),
            'orphan_count': len(orphan_pages),
            'orphan_urls': orphan_pages[:100],  # cap stored list to keep the row small
        }
        update_run_status(sb, run_id, "completed", len(all_results), summary)

        total_time = time.time() - start_time
        log_lines.append(f"✅ COMPLETE! {len(all_results)} pages in {total_time:.0f}s · {len(orphan_pages)} orphans")
        log_lines.append(f"📊 Broken: {summary['total_broken']} · Redirects: {summary['total_redirects']} · Flags: {summary['total_flags']} · Dups: {summary['total_dups']}")
        log_lines.append("→ Go to Past Runs tab to generate report")

        audit_state.set_running(False)
        yield "\n".join(log_lines[-40:]), f"✅ Complete — {len(all_results)} pages", gr.update(interactive=False), gr.update(interactive=False)

    except Exception as e:
        # On any unexpected failure, mark the run paused so it can be resumed.
        log_lines.append(f"❌ Error: {str(e)}")
        audit_state.set_running(False)
        if run_id:
            completed = get_completed_count(sb, run_id)
            update_run_status(sb, run_id, "paused", completed)
        yield "\n".join(log_lines[-40:]), f"❌ Error — saved progress to Supabase", gr.update(interactive=False), gr.update(interactive=False)
245
+
246
+
247
def pause_audit():
    """Request a pause of the active audit; returns a UI status message."""
    if not audit_state.is_running():
        return "No audit running."
    audit_state.request_pause()
    return "⏸️ Pause requested — will stop after current page completes..."
253
+
254
+
255
# ═══════════════════════════════════════════════════
# RESUME FUNCTION
# ═══════════════════════════════════════════════════

def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
    """Resume a paused/interrupted run.

    Generator mirroring run_audit's yield shape:
    (log text, progress text, pause-button update, resume-button update).
    Only URLs not yet present in the run's saved results are re-crawled.
    """
    if sb is None:
        yield "❌ Supabase not connected.", "", gr.update(), gr.update()
        return

    if not run_id:
        yield "⚠ No run selected.", "", gr.update(), gr.update()
        return

    # NOTE(review): get_pending_urls appears to return the run's full URL
    # list (remaining is derived below by subtracting done_urls) — confirm in db.py.
    all_urls_for_run = get_pending_urls(sb, run_id)
    done_urls = get_completed_urls(sb, run_id)
    remaining = [u for u in all_urls_for_run if u not in done_urls]

    if not remaining:
        update_run_status(sb, run_id, "completed", len(done_urls))
        yield "✅ All pages already audited!", "", gr.update(), gr.update()
        return

    audit_state.set_running(True, run_id)
    audit_state.resume()
    update_run_status(sb, run_id, "running")

    total = len(all_urls_for_run)
    start_time = time.time()
    batch_num = 0
    log_lines = [f"▶️ Resuming — {len(remaining)} pages remaining ({len(done_urls)} already done)"]

    yield "\n".join(log_lines), f"📊 Resuming: {len(done_urls)}/{total}", gr.update(interactive=True), gr.update(interactive=False)

    try:
        for batch_start in range(0, len(remaining), batch_size):
            # Pause checkpoint at batch boundary.
            if audit_state.is_paused():
                completed = get_completed_count(sb, run_id)
                update_run_status(sb, run_id, "paused", completed)
                log_lines.append(f"⏸️ PAUSED at {completed}/{total}")
                audit_state.set_running(False)
                yield "\n".join(log_lines[-40:]), "", gr.update(interactive=False), gr.update(interactive=False)
                return

            batch_end = min(batch_start + batch_size, len(remaining))
            batch_urls = remaining[batch_start:batch_end]
            batch_num += 1
            batch_results = []

            for j, url in enumerate(batch_urls):
                # Pause checkpoint between pages; flush the partial batch first.
                if audit_state.is_paused():
                    if batch_results:
                        save_batch_results(sb, run_id, batch_results)
                    completed = get_completed_count(sb, run_id)
                    update_run_status(sb, run_id, "paused", completed)
                    log_lines.append(f"⏸️ PAUSED at {completed}/{total}")
                    audit_state.set_running(False)
                    yield "\n".join(log_lines[-40:]), "", gr.update(interactive=False), gr.update(interactive=False)
                    return

                # global_idx counts across the whole run; ETA only over this session.
                global_idx = len(done_urls) + batch_start + j + 1
                elapsed = time.time() - start_time
                processed = batch_start + j + 1
                avg = elapsed / processed
                eta = avg * (len(remaining) - processed)
                eta_str = f"{int(eta//60)}m {int(eta%60)}s" if eta > 60 else f"{eta:.0f}s"

                result = audit_page(
                    url, domain, DEFAULT_BODY_SELECTORS,
                    suggestion_map=DEFAULT_SUGGESTION_MAP,
                    timeout=timeout, concurrent_workers=workers,
                )
                batch_results.append(result)

                short = url.replace('https://www.', '').replace('https://', '')[:70]
                if result['error']:
                    log_lines.append(f"❌ [{global_idx}/{total}] {short}")
                else:
                    b = result['broken_int_count'] + result['broken_ext_count']
                    flag_str = f"🔴 {b} broken" if b else "✅"
                    log_lines.append(f"[{global_idx}/{total}] {short} · {flag_str}")

                yield "\n".join(log_lines[-40:]), f"📊 Progress: {global_idx}/{total} ({global_idx*100//total}%) · ETA: {eta_str}", gr.update(interactive=True), gr.update(interactive=False)
                if j < len(batch_urls) - 1:
                    time.sleep(delay)  # politeness delay between pages

            if batch_results:
                save_batch_results(sb, run_id, batch_results)
                completed = get_completed_count(sb, run_id)
                update_run_status(sb, run_id, "running", completed)
                log_lines.append(f"💾 Batch {batch_num} saved — {completed}/{total}")
            del batch_results

        # Orphan analysis (same logic as run_audit's completion path).
        log_lines.append("🔍 Orphan analysis...")
        yield "\n".join(log_lines[-40:]), "📊 Orphan analysis...", gr.update(interactive=False), gr.update(interactive=False)

        all_pages = get_all_page_results(sb, run_id)
        all_results = [p['result'] for p in all_pages]

        # Orphan = audited page never linked to from another audited page.
        all_targets = set()
        all_pg = set()
        for r in all_results:
            all_pg.add(r['url'].rstrip('/').split('?')[0])
            for link in r.get('internal_links', []):
                all_targets.add(link['url'].rstrip('/').split('?')[0])
        orphans = sorted([p for p in all_pg if p not in all_targets])

        final_summary = {
            'total_pages': len(all_results),
            'total_int': sum(r.get('int_count', 0) for r in all_results),
            'total_ext': sum(r.get('ext_count', 0) for r in all_results),
            'total_broken': sum(r.get('broken_int_count', 0) + r.get('broken_ext_count', 0) for r in all_results),
            'total_redirects': sum(r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0) for r in all_results),
            'total_flags': sum(r.get('follow_flag_count', 0) for r in all_results),
            'total_dups': sum(r.get('duplicate_count', 0) for r in all_results),
            'total_sug': sum(len(r.get('suggestions', [])) for r in all_results),
            'orphan_count': len(orphans),
            'orphan_urls': orphans[:100],  # cap stored list to keep the row small
        }
        update_run_status(sb, run_id, "completed", len(all_results), final_summary)

        total_time = time.time() - start_time
        log_lines.append(f"✅ COMPLETE! {len(all_results)} pages in {total_time:.0f}s · {len(orphans)} orphans")
        audit_state.set_running(False)
        yield "\n".join(log_lines[-40:]), f"✅ Complete — {len(all_results)} pages", gr.update(interactive=False), gr.update(interactive=False)

    except Exception as e:
        # Mark paused so the run stays resumable after an unexpected failure.
        log_lines.append(f"❌ Error: {str(e)}")
        audit_state.set_running(False)
        completed = get_completed_count(sb, run_id)
        update_run_status(sb, run_id, "paused", completed)
        yield "\n".join(log_lines[-40:]), "❌ Error", gr.update(interactive=False), gr.update(interactive=False)
388
+
389
+
390
# ═══════════════════════════════════════════════════
# PAST RUNS HELPERS
# ═══════════════════════════════════════════════════

def load_past_runs():
    """Fetch saved runs and return (HTML summary table, dropdown update)."""
    if sb is None:
        return "<p>❌ Supabase not connected</p>", gr.update(choices=[], value=None)

    runs = get_all_runs(sb)
    if not runs:
        return "<p>No saved runs yet.</p>", gr.update(choices=[], value=None)

    # Build choices for dropdown — label carries name, status and progress;
    # the dropdown value is the run id.
    choices = []
    for r in runs:
        status = r.get('status', 'unknown')
        completed = r.get('completed_urls', 0)
        total = r.get('total_urls', 0)
        label = f"{r.get('name', 'Untitled')} [{status.upper()}] ({completed}/{total})"
        choices.append((label, r['id']))

    # Build HTML table
    html = '<div style="max-height:400px;overflow-y:auto;">'
    html += '<table style="width:100%;border-collapse:collapse;font-size:13px;">'
    html += '<tr style="background:#f1f5f9;"><th style="padding:10px;text-align:left;">Run Name</th><th style="padding:10px;text-align:center;">Status</th><th style="padding:10px;text-align:center;">Pages</th><th style="padding:10px;text-align:center;">Broken</th><th style="padding:10px;text-align:center;">Flags</th><th style="padding:10px;text-align:center;">Dups</th><th style="padding:10px;text-align:center;">Orphans</th></tr>'

    for r in runs:
        summary = r.get('summary', {}) or {}
        status = r.get('status', 'unknown')
        s_color = {'completed': '#059669', 'paused': '#d97706', 'running': '#2563eb'}.get(status, '#888')
        created = r.get('created_at', '')[:16].replace('T', ' ')  # ISO timestamp → "YYYY-MM-DD HH:MM"

        # "{s_color}15" appends a hex alpha to make a translucent badge background.
        html += f'''<tr style="border-bottom:1px solid #e2e8f0;">
<td style="padding:10px;"><b>{r.get('name','Untitled')}</b><br><span style="font-size:11px;color:#94a3b8;">{created}</span></td>
<td style="padding:10px;text-align:center;"><span style="background:{s_color}15;color:{s_color};padding:3px 10px;border-radius:12px;font-size:11px;font-weight:700;">{status.upper()}</span></td>
<td style="padding:10px;text-align:center;font-weight:700;">{r.get('completed_urls',0)}/{r.get('total_urls',0)}</td>
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('total_broken','—')}</td>
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('total_flags','—')}</td>
<td style="padding:10px;text-align:center;color:#db2777;font-weight:700;">{summary.get('total_dups','—')}</td>
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('orphan_count','—')}</td>
</tr>'''

    html += '</table></div>'
    return html, gr.update(choices=choices, value=choices[0][1] if choices else None)
434
+
435
+
436
def generate_report_for_run(run_id, domain):
    """Build the interactive HTML report for a stored run.

    Returns (temp file path, status message); the path is None on failure.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected or Supabase not connected."

    try:
        # Locate the run row so its stored summary/domain can be reused.
        run = next((r for r in get_all_runs(sb) if r['id'] == run_id), None)

        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "⚠ No page data found for this run."

        results = [p['result'] for p in pages]
        if run:
            summary = run.get('summary', {}) or {}
            report_domain = run.get('domain', domain)
        else:
            # Run row missing: fall back to the domain from the UI textbox.
            summary = {}
            report_domain = domain
        orphan_urls = summary.get('orphan_urls', [])

        report_html = generate_report(results, orphan_urls, report_domain)

        # Write to a temp file so Gradio can serve it as a download.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.html', prefix='Link_Audit_')
        tmp.write(report_html.encode('utf-8'))
        tmp.close()

        return tmp.name, f"✅ Report generated — {len(results)} pages"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
467
+
468
+
469
def generate_csv_for_run(run_id):
    """Export a run's per-page headline counts as a downloadable CSV.

    Returns (temp file path, status message); the path is None on failure.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected."

    try:
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "⚠ No data."

        def _row(r):
            # Flatten one page result into a single CSV row.
            return {
                'URL': r.get('url', ''),
                'Internal': r.get('int_count', 0),
                'External': r.get('ext_count', 0),
                'Broken': r.get('broken_int_count', 0) + r.get('broken_ext_count', 0),
                'Redirects': r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0),
                'Flags': r.get('follow_flag_count', 0),
                'Duplicates': r.get('duplicate_count', 0),
                'Error': r.get('error', ''),
            }

        rows = [_row(p['result']) for p in pages]

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', prefix='Audit_CSV_')
        pd.DataFrame(rows).to_csv(tmp.name, index=False)
        tmp.close()
        return tmp.name, f"✅ CSV exported — {len(rows)} rows"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
498
+
499
+
500
def delete_selected_run(run_id):
    """Delete a saved run from Supabase; returns a status message for the UI."""
    if sb is None or not run_id:
        return "❌ No run selected."
    try:
        delete_run(sb, run_id)
    except Exception as e:
        return f"❌ {str(e)}"
    return "🗑️ Run deleted."
508
+
509
+
510
# ═══════════════════════════════════════════════════
# GRADIO UI
# ═══════════════════════════════════════════════════

css = """
.main-header { background: linear-gradient(135deg, #1e3a5f, #2563eb); padding: 24px 28px; border-radius: 12px; color: white; margin-bottom: 16px; }
.main-header h1 { margin: 0 0 4px 0; font-size: 24px; }
.main-header p { margin: 0; opacity: 0.8; font-size: 13px; }
.status-bar { background: #f1f5f9; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px 16px; font-family: monospace; font-size: 13px; font-weight: 600; }
.log-area textarea { font-family: 'JetBrains Mono', monospace !important; font-size: 12px !important; line-height: 1.6 !important; }
"""

with gr.Blocks(css=css, title="🔗 Link Audit Tool", theme=gr.themes.Soft()) as app:

    # Header
    gr.HTML("""
    <div class="main-header">
        <p style="font-size:10px;font-weight:700;letter-spacing:1.5px;text-transform:uppercase;color:#93c5fd;margin-bottom:8px;">SEO Link Audit Tool</p>
        <h1>🔗 Bulk Link Audit</h1>
        <p>Upload URLs → batch crawl with auto-save → pause/resume anytime → generate interactive report</p>
    </div>
    """)

    # Connection status (sb was probed at import time, above).
    conn_status = "✅ Supabase Connected" if sb else "❌ Supabase Not Connected — add SUPABASE_URL and SUPABASE_KEY to Space secrets"
    gr.HTML(f'<div class="status-bar">🗄️ {conn_status}</div>')

    with gr.Tabs():

        # ═══ TAB 1: NEW AUDIT ═══
        with gr.Tab("🔍 New Audit"):
            with gr.Row():
                with gr.Column(scale=2):
                    file_input = gr.File(label="Upload Excel / CSV", file_types=[".xlsx", ".csv", ".xls"])
                    pasted_urls = gr.Textbox(label="Or paste URLs (one per line)", lines=5, placeholder="https://www.example.com/blog/page1\nhttps://www.example.com/blog/page2")

                with gr.Column(scale=1):
                    domain = gr.Textbox(label="Your Domain", value="edstellar.com")
                    batch_size = gr.Slider(5, 50, value=25, step=5, label="Batch Size")
                    timeout = gr.Slider(5, 60, value=15, step=5, label="Timeout (s)")
                    delay = gr.Slider(0, 5, value=1.0, step=0.5, label="Delay between pages (s)")
                    workers = gr.Slider(1, 10, value=5, step=1, label="Parallel link checks")

            with gr.Row():
                run_btn = gr.Button("🚀 Run Audit", variant="primary", scale=2)
                pause_btn = gr.Button("⏸️ Pause", variant="stop", scale=1, interactive=False)

            progress_text = gr.Textbox(label="Status", interactive=False, elem_classes=["status-bar"])
            log_output = gr.Textbox(label="Audit Log", lines=20, interactive=False, elem_classes=["log-area"])

            # Wire up run button (generator for streaming — each yield updates
            # all four outputs, including the buttons' interactive state).
            run_btn.click(
                fn=run_audit,
                inputs=[file_input, pasted_urls, domain, batch_size, timeout, delay, workers],
                outputs=[log_output, progress_text, pause_btn, run_btn],
            )

            # Wire up pause button
            pause_btn.click(fn=pause_audit, outputs=[progress_text])

        # ═══ TAB 2: PAST RUNS ═══
        with gr.Tab("📁 Past Runs"):
            refresh_btn = gr.Button("🔄 Refresh Runs", variant="secondary")
            runs_html = gr.HTML(value="<p>Click Refresh to load runs.</p>")
            run_dropdown = gr.Dropdown(label="Select a Run", choices=[], interactive=True)

            with gr.Row():
                report_btn = gr.Button("📊 Generate HTML Report", variant="primary")
                csv_btn = gr.Button("📋 Export CSV", variant="secondary")
                resume_btn = gr.Button("▶️ Resume Audit", variant="primary")
                delete_btn = gr.Button("🗑️ Delete Run", variant="stop")

            action_status = gr.Textbox(label="Action Status", interactive=False)
            report_file = gr.File(label="Download Report", interactive=False)
            csv_file = gr.File(label="Download CSV", interactive=False)

            # Resume log & progress (shared with new audit display format)
            resume_progress = gr.Textbox(label="Resume Status", interactive=False, elem_classes=["status-bar"])
            resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False, elem_classes=["log-area"])
            resume_pause_btn = gr.Button("⏸️ Pause Resume", variant="stop", interactive=False)

            # Refresh
            refresh_btn.click(fn=load_past_runs, outputs=[runs_html, run_dropdown])

            # Generate report
            def gen_report_wrapper(run_id, domain_val):
                filepath, msg = generate_report_for_run(run_id, domain_val)
                return filepath, msg

            report_btn.click(
                fn=gen_report_wrapper,
                inputs=[run_dropdown, domain],
                outputs=[report_file, action_status],
            )

            # CSV
            def csv_wrapper(run_id):
                filepath, msg = generate_csv_for_run(run_id)
                return filepath, msg

            csv_btn.click(fn=csv_wrapper, inputs=[run_dropdown], outputs=[csv_file, action_status])

            # Delete — also refreshes the table/dropdown so the row disappears.
            def delete_wrapper(run_id):
                msg = delete_selected_run(run_id)
                html, dropdown = load_past_runs()
                return msg, html, dropdown

            delete_btn.click(fn=delete_wrapper, inputs=[run_dropdown], outputs=[action_status, runs_html, run_dropdown])

            # Resume
            resume_btn.click(
                fn=resume_audit,
                inputs=[run_dropdown, domain, batch_size, timeout, delay, workers],
                outputs=[resume_log, resume_progress, resume_pause_btn, resume_btn],
            )
            resume_pause_btn.click(fn=pause_audit, outputs=[resume_progress])

    # Auto-load runs on startup
    app.load(fn=load_past_runs, outputs=[runs_html, run_dropdown])


if __name__ == "__main__":
    # queue() is required for generator (streaming) event handlers.
    app.queue().launch(server_name="0.0.0.0", server_port=7860)
audit_engine.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core Link Audit Engine
3
+ Crawls pages, extracts body-content links, checks status, detects issues.
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup, Comment
8
+ from urllib.parse import urljoin, urlparse
9
+ from collections import defaultdict
10
+ import concurrent.futures
11
+
12
# Browser-like request headers so crawled sites are less likely to block us.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

# CSS selectors tried in order to locate a page's main body content
# (first match wins, falling back to <body> — see find_body_content).
DEFAULT_BODY_SELECTORS = [
    "div.blog-rich-text",
    "div.w-richtext",
    "article .rich-text",
    "article",
    "div.blog-content",
    "div.post-content",
    "main",
]

# keyword -> (suggested internal path, suggested anchor text), used to propose
# internal links where these phrases appear in body copy.
# NOTE(review): paths are specific to edstellar.com — adjust for other sites.
DEFAULT_SUGGESTION_MAP = {
    "artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
    "machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
    "leadership": ("/type/leadership-training", "leadership training programs"),
    "soft skills": ("/type/behavioral-training", "behavioral training programs"),
    "remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
    "training management": ("/training-management-software", "training management software"),
    "instructor-led": ("/instructor-led-training-services", "instructor-led training"),
    "corporate training": ("/corporate-training-courses", "corporate training programs"),
    "skill matrix": ("/skill-matrix", "skills matrix"),
    "stellar ai": ("/stellar-ai", "AI-powered training"),
    "book a demo": ("/book-a-demo", "book a demo"),
    "compliance": ("/type/compliance-training", "compliance training"),
    "cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
    "data analytics": ("/category/data-analytics-training", "data analytics training"),
    "project management": ("/category/project-management-training", "project management training"),
    "coaching": ("/coaching-solutions", "coaching solutions"),
    "hr training": ("/category/human-resource-training", "HR training programs"),
    "employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
    "onboarding": ("/category/human-resource-training", "onboarding training"),
    "digital transformation": ("/type/it-technical-training", "IT & technical training"),
}
49
+
50
+
51
def is_internal(href, domain):
    """Return True if *href* points at *domain* or one of its subdomains.

    Relative hrefs (no netloc) resolve against the page's own host and are
    treated as internal. Matching is exact-host or dot-boundary subdomain;
    the previous substring test (`domain in netloc`) wrongly classified
    hosts such as "edstellar.com.evil.net" as internal.
    """
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        # Path-only / fragment-only URL — always same-site.
        return True
    # Strip userinfo and port before comparing hosts.
    host = parsed.netloc.rsplit('@', 1)[-1].split(':')[0].lower()
    dom = domain.lower().lstrip('.')
    return host == dom or host.endswith('.' + dom)
58
+
59
+
60
def normalize_url(href, base_url):
    """Resolve *href* against *base_url* into an absolute URL.

    Returns None for empty hrefs and for non-navigable schemes
    (fragment-only, mailto:, tel:, javascript:).
    """
    if not href:
        return None
    cleaned = href.strip()
    non_navigable = ('#', 'mailto:', 'tel:', 'javascript:')
    if cleaned.startswith(non_navigable):
        return None
    return urljoin(base_url, cleaned)
67
+
68
+
69
def get_follow_status(tag):
    """Classify an anchor tag as 'Nofollow' or 'Dofollow' from its rel attribute.

    Handles rel supplied either as a list of tokens or a single
    space-separated string; matching is case-insensitive.
    """
    rel_values = tag.get('rel', [])
    if isinstance(rel_values, str):
        rel_values = rel_values.split()
    tokens = {value.lower() for value in rel_values}
    return 'Nofollow' if 'nofollow' in tokens else 'Dofollow'
74
+
75
+
76
def find_body_content(soup, selectors):
    """Return the first element matched by *selectors*, tried in order.

    Falls back to the document's <body> element when no selector matches.
    """
    first_match = next(
        (el for el in (soup.select_one(sel) for sel in selectors) if el),
        None,
    )
    if first_match is not None:
        return first_match
    return soup.find('body')
82
+
83
+
84
def get_link_location(link_tag, body_el):
    """Describe where *link_tag* sits inside *body_el*'s text flow.

    Returns 'Intro', 'Conclusion', or 'Mid-article (~NN%)', optionally
    suffixed with the nearest preceding h1-h4 heading text, e.g.
    'Mid-article (~40%) Β· near "Some Heading"'. Returns 'Unknown' for an
    empty body.
    """
    body_text = body_el.get_text()
    total_len = len(body_text)
    if total_len == 0:
        return "Unknown"

    # Accumulate all text that appears before the link to estimate its
    # position as a fraction of the whole body text.
    preceding_text = ""
    for el in body_el.descendants:
        # Identity, not equality: BeautifulSoup's == compares tags by
        # content, so an identical duplicate link earlier in the body would
        # stop the scan too early and mislabel this link's position.
        if el is link_tag:
            break
        if isinstance(el, str) and not isinstance(el, Comment):
            preceding_text += el

    ratio = len(preceding_text) / total_len

    # Walk up the tree and look left for the closest heading above the link,
    # to give the location a human-friendly anchor.
    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']:
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break

    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"

    if heading:
        return f'{section} Β· near "{heading}"'
    return section
119
+
120
+
121
def check_url_status(url, timeout=15):
    """Probe *url* and classify its health.

    Returns a 4-tuple: (url, status_code_or_error_label,
    'Active'|'Redirect'|'Broken', redirect_target_or_empty_string).

    Uses HEAD first for speed. Some servers reject or misreport HEAD
    (405 Method Not Allowed, and commonly 403/501 as well), so those are
    retried once with a streamed GET that never downloads the body.
    """
    redirect_codes = (301, 302, 303, 307, 308)

    def _extract_location(response):
        # Absolutize a relative Location header against the requested URL.
        target = response.headers.get('Location', '')
        if target and not target.startswith('http'):
            target = urljoin(url, target)
        return target

    try:
        r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        status = r.status_code
        redirect_url = ""

        if status in redirect_codes:
            redirect_url = _extract_location(r)

        if status in (403, 405, 501):
            # Retry with GET; stream=True defers the body download and
            # close() releases the connection immediately.
            r = requests.get(url, headers=HEADERS, timeout=timeout,
                             allow_redirects=False, stream=True)
            try:
                status = r.status_code
                if status in redirect_codes:
                    redirect_url = _extract_location(r)
            finally:
                r.close()

        if status in redirect_codes:
            link_status = "Redirect"
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"

        return url, status, link_status, redirect_url

    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        # Any other failure (SSL error, malformed URL, ...) counts as broken.
        return url, "Error", "Broken", ""
154
+
155
+
156
def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    """Suggest up to 10 internal links to add to a page.

    Scans *body_text* for each keyword in *suggestion_map* (defaults to
    DEFAULT_SUGGESTION_MAP), skipping targets already linked on the page
    and the page's own path. Suggestions are ranked High (>= 3 mentions)
    before Med, then by mention count descending.
    """
    active_map = DEFAULT_SUGGESTION_MAP if suggestion_map is None else suggestion_map

    text_lower = body_text.lower()
    linked_paths = {urlparse(u).path.rstrip('/') for u in existing_internal_urls}
    own_path = urlparse(page_url).path.rstrip('/')

    suggestions = []
    for keyword, (path, anchor) in active_map.items():
        target_path = path.rstrip('/')
        # Don't suggest links that already exist or that point to this page.
        if target_path in linked_paths or target_path == own_path:
            continue

        needle = keyword.lower()
        mentions = text_lower.count(needle)
        if mentions == 0:
            continue

        first_at = text_lower.find(needle)
        fraction = first_at / len(text_lower) if len(text_lower) > 0 else 0
        if fraction < 0.15:
            location = "Intro"
        elif fraction > 0.85:
            location = "Conclusion"
        else:
            location = f"Mid-article (~{int(fraction*100)}%)"

        suggestions.append({
            'section': location,
            'target': path,
            'anchor': anchor,
            'priority': "High" if mentions >= 3 else "Med",
            'keyword': keyword,
            'count': mentions,
        })

    suggestions.sort(key=lambda s: (0 if s['priority'] == 'High' else 1, -s['count']))
    return suggestions[:10]
193
+
194
+
195
def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    """Audit a single page's body-content links.

    Fetches *page_url*, locates the main content element via
    *body_selectors* (defaults to DEFAULT_BODY_SELECTORS), extracts every
    <a href> inside it, checks each unique URL's status in parallel, and
    flags follow-attribute issues and duplicates.

    Returns a dict with categorized link lists (internal/external,
    broken/redirect), counters, duplicate info, and internal-link
    suggestions. On fetch failure or missing body element, 'error' is set
    and the otherwise-empty result is returned.
    """
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS

    # Result skeleton — always returned with this full shape so callers
    # (report generation, DB persistence) never need key checks.
    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }

    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        # Page itself unreachable — report the error instead of raising so
        # a batch run continues with the remaining URLs.
        result['error'] = str(e)
        return result

    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result

    body_text = body_el.get_text(' ', strip=True)
    all_links = body_el.find_all('a', href=True)
    # clean URL (no trailing slash / query / fragment) -> list of locations,
    # used for duplicate detection below.
    url_locations = defaultdict(list)

    # Pass 1: collect link metadata (anchor, follow, position) per tag.
    raw_links = []
    for tag in all_links:
        href = normalize_url(tag['href'], page_url)
        if not href:
            continue
        anchor = tag.get_text(strip=True) or "[no text]"
        follow = get_follow_status(tag)
        location = get_link_location(tag, body_el)
        internal = is_internal(href, domain)
        link_type = 'internal' if internal else 'external'

        link_data = {
            'url': href, 'anchor': anchor[:100], 'follow': follow,
            'location': location, 'type': link_type,
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        }
        raw_links.append(link_data)
        clean_url = href.rstrip('/').split('?')[0].split('#')[0]
        url_locations[clean_url].append(location)

    # Pass 2: check each unique URL's status once, in parallel.
    unique_urls = list(set(l['url'] for l in raw_links))
    status_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor:
        futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls}
        for future in concurrent.futures.as_completed(futures):
            url, status, link_status, redirect_url = future.result()
            status_map[url] = (status, link_status, redirect_url)

    # Pass 3: attach statuses and flag follow-attribute issues
    # (SEO convention here: internal links Dofollow, external Nofollow).
    for link in raw_links:
        if link['url'] in status_map:
            status, link_status, redirect_url = status_map[link['url']]
            link['status_code'] = status
            link['link_status'] = link_status
            link['redirect_url'] = redirect_url

        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow β€” should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow β€” should be Nofollow')

    # Pass 4: detect duplicates (same clean URL appearing more than once)
    # and flag every occurrence.
    duplicates = []
    for clean_url, locations in url_locations.items():
        if len(locations) > 1:
            duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations})
            for link in raw_links:
                link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0]
                if link_clean == clean_url:
                    link['flags'].append(f'Duplicate: appears {len(locations)}x in body')

    # Pass 5: bucket links into the result's categorized lists and counters.
    for link in raw_links:
        if link['type'] == 'internal':
            result['internal_links'].append(link)
            if link['follow'] == 'Dofollow': result['int_df'] += 1
            else: result['int_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_internal'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_internal'].append(link)
        else:
            result['external_links'].append(link)
            if link['follow'] == 'Dofollow': result['ext_df'] += 1
            else: result['ext_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_external'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_external'].append(link)

        # Note: despite the key name, follow_flags collects links with ANY
        # flag, including duplicate flags added above.
        if link['flags']:
            result['follow_flags'].append(link)

    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)

    existing_int_urls = [l['url'] for l in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)

    return result
db.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supabase Database Helper
3
+ Saves/loads audit results in batches.
4
+
5
+ ═══════════════════════════════════════════════════════════════
6
+ RUN THIS SQL IN SUPABASE SQL EDITOR (one-time setup):
7
+ ═══════════════════════════════════════════════════════════════
8
+
9
+ CREATE TABLE audit_runs (
10
+ id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
11
+ name TEXT NOT NULL,
12
+ domain TEXT NOT NULL,
13
+ total_urls INTEGER DEFAULT 0,
14
+ completed_urls INTEGER DEFAULT 0,
15
+ status TEXT DEFAULT 'running' CHECK (status IN ('running', 'paused', 'completed', 'error')),
16
+ created_at TIMESTAMPTZ DEFAULT now(),
17
+ updated_at TIMESTAMPTZ DEFAULT now(),
18
+ summary JSONB DEFAULT '{}'::jsonb,
19
+ pending_urls JSONB DEFAULT '[]'::jsonb
20
+ );
21
+
22
+ CREATE TABLE audit_pages (
23
+ id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
24
+ run_id UUID REFERENCES audit_runs(id) ON DELETE CASCADE,
25
+ url TEXT NOT NULL,
26
+ result JSONB NOT NULL,
27
+ created_at TIMESTAMPTZ DEFAULT now()
28
+ );
29
+
30
+ CREATE INDEX idx_audit_pages_run_id ON audit_pages(run_id);
31
+ CREATE INDEX idx_audit_pages_url ON audit_pages(url);
32
+ CREATE INDEX idx_audit_runs_status ON audit_runs(status);
33
+
34
+ ALTER TABLE audit_runs ENABLE ROW LEVEL SECURITY;
35
+ ALTER TABLE audit_pages ENABLE ROW LEVEL SECURITY;
36
+ CREATE POLICY "Allow all on audit_runs" ON audit_runs FOR ALL USING (true);
37
+ CREATE POLICY "Allow all on audit_pages" ON audit_pages FOR ALL USING (true);
38
+
39
+ ═══════════════════════════════════════════════════════════════
40
+ """
41
+
42
+ import json
43
+ from datetime import datetime
44
+ from supabase import create_client, Client
45
+
46
+
47
def get_client(url: str, key: str) -> Client:
    """Create a Supabase client from a project URL and API (anon) key."""
    return create_client(url, key)
49
+
50
+
51
+ # ─── Run Management ───
52
+
53
def create_run(client: Client, name: str, domain: str, total_urls: int, all_urls: list) -> str:
    """Insert a new audit run row and return its generated UUID.

    The full URL list is serialized into pending_urls so a paused run can
    be resumed later (see get_pending_urls).
    """
    row = {
        "name": name,
        "domain": domain,
        "total_urls": total_urls,
        "completed_urls": 0,
        "status": "running",
        "pending_urls": json.dumps(all_urls),
    }
    inserted = client.table("audit_runs").insert(row).execute()
    return inserted.data[0]["id"]
64
+
65
+
66
def get_run(client: Client, run_id: str):
    """Fetch a single audit run row by its id."""
    query = client.table("audit_runs").select("*").eq("id", run_id).single()
    return query.execute().data
69
+
70
+
71
def get_all_runs(client: Client):
    """Return every audit run, newest first."""
    query = client.table("audit_runs").select("*").order("created_at", desc=True)
    return query.execute().data
74
+
75
+
76
def update_run_status(client: Client, run_id: str, status: str, completed: int = None, summary: dict = None):
    """Update a run's status, and optionally its progress counter and summary.

    *completed* and *summary* are only written when provided, so partial
    updates don't clobber existing values.
    """
    from datetime import timezone  # local import: module-level only imports datetime

    # Timezone-aware UTC timestamp for the TIMESTAMPTZ column;
    # datetime.utcnow() is naive and deprecated since Python 3.12.
    data = {"status": status, "updated_at": datetime.now(timezone.utc).isoformat()}
    if completed is not None:
        data["completed_urls"] = completed
    if summary is not None:
        # Round-trip through JSON to coerce non-serializable values to str.
        data["summary"] = json.loads(json.dumps(summary, default=str))
    client.table("audit_runs").update(data).eq("id", run_id).execute()
83
+
84
+
85
def delete_run(client: Client, run_id: str):
    """Delete a run and all of its saved page results.

    Pages are deleted explicitly first (redundant with the schema's
    ON DELETE CASCADE, but safe if the FK was created without it).
    """
    client.table("audit_pages").delete().eq("run_id", run_id).execute()
    client.table("audit_runs").delete().eq("id", run_id).execute()
88
+
89
+
90
+ # ─── Page Results ───
91
+
92
def save_batch_results(client: Client, run_id: str, batch_results: list, chunk_size: int = 50):
    """Persist one batch of per-page audit results for *run_id*.

    Each result is JSON-sanitized (non-serializable values coerced to str
    via a json round-trip) and rows are inserted in chunks of *chunk_size*
    to keep individual requests small. chunk_size defaults to the previous
    hard-coded value of 50.
    """
    rows = []
    for result in batch_results:
        sanitized = json.loads(json.dumps(result, default=str))
        rows.append({"run_id": run_id, "url": result['url'], "result": sanitized})

    for start in range(0, len(rows), chunk_size):
        client.table("audit_pages").insert(rows[start:start + chunk_size]).execute()
101
+
102
+
103
def get_completed_urls(client: Client, run_id: str) -> set:
    """Return the set of URLs already audited for *run_id*.

    Reads in pages of 1000 rows to stay under PostgREST's row limits.
    """
    page_size = 1000
    collected = set()
    offset = 0
    while True:
        batch = (
            client.table("audit_pages")
            .select("url")
            .eq("run_id", run_id)
            .range(offset, offset + page_size - 1)
            .execute()
        ).data
        if not batch:
            break
        collected.update(row["url"] for row in batch)
        if len(batch) < page_size:
            # Short page means we've reached the end.
            break
        offset += page_size
    return collected
122
+
123
+
124
def get_completed_count(client: Client, run_id: str) -> int:
    """Count pages already saved for *run_id* (server-side exact count)."""
    resp = (
        client.table("audit_pages")
        .select("id", count="exact")
        .eq("run_id", run_id)
        .execute()
    )
    return resp.count if resp.count else 0
132
+
133
+
134
def get_all_page_results(client: Client, run_id: str) -> list:
    """Return every saved page result for *run_id*, oldest first.

    Paginates in chunks of 500 rows; each item has 'url' and 'result' keys.
    """
    page_size = 500
    collected = []
    offset = 0
    while True:
        batch = (
            client.table("audit_pages")
            .select("url, result")
            .eq("run_id", run_id)
            .order("created_at", desc=False)
            .range(offset, offset + page_size - 1)
            .execute()
        ).data
        if not batch:
            break
        collected.extend(batch)
        if len(batch) < page_size:
            # Short page means we've reached the end.
            break
        offset += page_size
    return collected
154
+
155
+
156
def get_pending_urls(client: Client, run_id: str) -> list:
    """Return the run's pending URL list, decoding it if stored as JSON text."""
    run_row = get_run(client, run_id)
    raw = run_row.get("pending_urls")
    if isinstance(raw, str):
        return json.loads(raw)
    return raw if raw else []
report_generator.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML Report Generator
3
+ Produces the interactive accordion-based light-theme audit report.
4
+ """
5
+
6
+ import html as html_module
7
+ from urllib.parse import urlparse
8
+ from datetime import datetime
9
+
10
+
11
def esc(text):
    """HTML-escape *text* for safe embedding in the report.

    None renders as an empty string; every other value (including 0 and
    False) is stringified and escaped. The previous truthiness check
    silently dropped legitimate falsy values such as a 0 status code.
    """
    if text is None:
        return ""
    return html_module.escape(str(text))
13
+
14
+
15
def short_url(url, domain="edstellar.com"):
    """Shorten *url* for display in the report.

    URLs on the audited *domain* render as path (+query); external URLs
    render as host (sans 'www.') plus a path truncated past 50 chars.
    """
    parsed = urlparse(str(url))
    if domain in parsed.netloc:
        query_part = f'?{parsed.query}' if parsed.query else ''
        return parsed.path + query_part
    host = parsed.netloc.replace('www.', '')
    path = parsed.path
    if len(path) > 50:
        path = f"{path[:25]}...{path[-20:]}"
    return f"{host}{path}"
24
+
25
+
26
def badge(text, cls):
    """Render a small pill badge span with CSS class *cls*."""
    return '<span class="badge {}">{}</span>'.format(cls, esc(text))
28
+
29
+
30
def render_link_entry(link, domain="edstellar.com", show_flags=True):
    """Render one audited link as an HTML card for the accordion details.

    *link* is a dict produced by the audit engine (keys: url, anchor,
    follow, location, status_code, link_status, redirect_url, flags).
    Cards get the 's-issue' CSS class when the link is broken, redirected,
    or carries any flag; otherwise 's-ok'.
    """
    has_issues = link['link_status'] in ('Broken',) or link.get('flags')
    is_redirect = link['link_status'] == 'Redirect'
    cls = 's-issue' if (has_issues or is_redirect) else 's-ok'

    # Status pill: broken and redirected links are styled as issues.
    if link['link_status'] == 'Broken':
        status_tag = f'<span class="tag-sm issue">{esc(link["status_code"])} BROKEN</span>'
    elif link['link_status'] == 'Redirect':
        status_tag = f'<span class="tag-sm issue">{esc(link["status_code"])} Redirect</span>'
    else:
        status_tag = f'<span class="tag-sm ok">{esc(link["status_code"])}</span>'

    # Follow pill: warn when any flag text mentions a follow-attribute issue.
    has_follow_flag = any('Dofollow' in f or 'Nofollow' in f for f in link.get('flags', []))
    if has_follow_flag:
        follow_tag = f'<span class="tag-sm issue">{link["follow"]} ⚠</span>'
    else:
        follow_tag = f'<span class="tag-sm ok">{link["follow"]} βœ“</span>'

    out = f'<div class="le {cls}">'
    out += f'<div class="le-url">{esc(short_url(link["url"], domain))}</div>'
    out += f'<div class="le-tags">{status_tag}{follow_tag}</div>'
    out += f'<div class="le-anchor">Anchor: <b>{esc(link["anchor"])}</b></div>'

    if link.get('redirect_url'):
        out += f'<div class="le-redir">β†’ {esc(short_url(link["redirect_url"], domain))}</div>'

    out += f'<div class="le-location">πŸ“ {esc(link["location"])}</div>'

    if show_flags:
        for flag in link.get('flags', []):
            out += f'<div class="le-issue">⚠ {esc(flag)}</div>'

    out += '</div>'
    return out
64
+
65
+
66
def render_accordion(collapsed_html, details_html):
    """Build the click-to-expand <td> cell used throughout the report table.

    NOTE(review): toggleAcc() is presumably defined in the report's inline
    script (not visible in this module) — it toggles the cell's 'open'
    class, revealing .acc-details.
    """
    cell_parts = (
        '<td class="acc-cell" onclick="toggleAcc(this)">',
        '<div class="acc-collapsed"><div class="acc-chevron">β–Ά</div>'
        f'<div class="acc-summary">{collapsed_html}</div></div>',
        f'<div class="acc-details">{details_html}</div>',
        '</td>',
    )
    return "\n".join(cell_parts)
71
+
72
+
73
+ def generate_report(results, orphan_pages, domain="edstellar.com"):
74
+ now = datetime.now().strftime("%b %d, %Y %H:%M")
75
+ total_pages = len(results)
76
+ total_int = sum(r['int_count'] for r in results)
77
+ total_ext = sum(r['ext_count'] for r in results)
78
+ total_broken = sum(r['broken_int_count'] + r['broken_ext_count'] for r in results)
79
+ total_redirects = sum(r['redirect_int_count'] + r['redirect_ext_count'] for r in results)
80
+ total_flags = sum(r['follow_flag_count'] for r in results)
81
+ total_dups = sum(r['duplicate_count'] for r in results)
82
+ total_sug = sum(len(r['suggestions']) for r in results)
83
+ total_orphan = len(orphan_pages)
84
+
85
+ rows_html = ""
86
+ for idx, r in enumerate(results, 1):
87
+ if r['error']:
88
+ rows_html += f'''<tr><td class="cell-url"><div class="row-num">#{idx}</div>{esc(short_url(r["url"], domain))}</td>
89
+ <td colspan="15" style="color:var(--red);vertical-align:middle;">❌ Error: {esc(r["error"])}</td></tr>'''
90
+ continue
91
+
92
+ is_orphan = r['url'].rstrip('/').split('?')[0] in orphan_pages
93
+
94
+ # Internal Links
95
+ int_badges = ""
96
+ if r['int_count'] == 0:
97
+ int_badges = badge('0 Links ⚠', 'issue')
98
+ else:
99
+ ok_count = r['int_count'] - r['broken_int_count'] - r['redirect_int_count']
100
+ if ok_count > 0: int_badges += badge(f'{ok_count} Active', 'ok')
101
+ if r['broken_int_count'] > 0: int_badges += badge(f'{r["broken_int_count"]} Broken', 'issue')
102
+ if r['redirect_int_count'] > 0: int_badges += badge(f'{r["redirect_int_count"]} Redirect', 'issue')
103
+ int_details = "".join(render_link_entry(l, domain) for l in r['internal_links'])
104
+ if not int_details:
105
+ int_details = '<div style="font-size:11px;color:var(--red);">No internal links found in body content.</div>'
106
+
107
+ # External Links
108
+ ext_badges = ""
109
+ if r['ext_count'] == 0:
110
+ ext_badges = badge('0 Links', 'neutral')
111
+ else:
112
+ if r['broken_ext_count'] > 0: ext_badges += badge(f'{r["broken_ext_count"]} Broken', 'issue')
113
+ if r['redirect_ext_count'] > 0: ext_badges += badge(f'{r["redirect_ext_count"]} Redirect', 'issue')
114
+ ext_df_count = sum(1 for l in r['external_links'] if l['follow'] == 'Dofollow')
115
+ if ext_df_count > 0: ext_badges += badge(f'{ext_df_count} Dofollow ⚠', 'issue')
116
+ ok_ext = r['ext_count'] - r['broken_ext_count'] - r['redirect_ext_count']
117
+ if ok_ext > 0 and not r['broken_ext_count'] and not r['redirect_ext_count'] and not ext_df_count:
118
+ ext_badges += badge(f'{ok_ext} Active', 'ok')
119
+ ext_details = "".join(render_link_entry(l, domain) for l in r['external_links'])
120
+ if not ext_details:
121
+ ext_details = '<div style="font-size:11px;color:var(--text-dim);">No external links in body content.</div>'
122
+
123
+ # Follow Flags
124
+ int_nf_flags = [l for l in r['internal_links'] if l['follow'] == 'Nofollow']
125
+ ext_df_flags_list = [l for l in r['external_links'] if l['follow'] == 'Dofollow']
126
+ flag_badges = ""
127
+ if int_nf_flags: flag_badges += badge(f'{len(int_nf_flags)} Int. Nofollow ⚠', 'issue')
128
+ if ext_df_flags_list: flag_badges += badge(f'{len(ext_df_flags_list)} Ext. Dofollow ⚠', 'issue')
129
+ if not flag_badges: flag_badges = badge('βœ“ No Flags', 'ok')
130
+ flag_details = "".join(render_link_entry(l, domain, show_flags=True) for l in int_nf_flags + ext_df_flags_list)
131
+ if not flag_details:
132
+ flag_details = '<div style="font-size:11px;color:var(--green);">All internal=Dofollow βœ“ and external=Nofollow βœ“</div>'
133
+
134
+ # Broken / Redirect
135
+ bi_badges = badge(f'{r["broken_int_count"]} Broken', 'issue') if r['broken_int_count'] > 0 else badge('βœ“ None', 'ok')
136
+ bi_details = "".join(render_link_entry(l, domain) for l in r['broken_internal']) or '<div style="font-size:11px;color:var(--green);">No broken internal links.</div>'
137
+
138
+ be_badges = badge(f'{r["broken_ext_count"]} Broken', 'issue') if r['broken_ext_count'] > 0 else badge('βœ“ None', 'ok')
139
+ be_details = "".join(render_link_entry(l, domain) for l in r['broken_external']) or '<div style="font-size:11px;color:var(--green);">No broken external links.</div>'
140
+
141
+ ri_badges = badge(f'{r["redirect_int_count"]} Redirects', 'issue') if r['redirect_int_count'] > 0 else badge('βœ“ None', 'ok')
142
+ ri_details = "".join(render_link_entry(l, domain) for l in r['redirect_internal']) or '<div style="font-size:11px;color:var(--green);">No internal redirects.</div>'
143
+
144
+ re_badges = badge(f'{r["redirect_ext_count"]} Redirects', 'issue') if r['redirect_ext_count'] > 0 else badge('βœ“ None', 'ok')
145
+ re_details = "".join(render_link_entry(l, domain) for l in r['redirect_external']) or '<div style="font-size:11px;color:var(--green);">No external redirects.</div>'
146
+
147
+ # Duplicates
148
+ dup_badges = badge(f'{r["duplicate_count"]} Duplicates', 'issue') if r['duplicate_count'] > 0 else badge('βœ“ None', 'ok')
149
+ dup_details = ""
150
+ for d in r['duplicates']:
151
+ locs = ", ".join(esc(l) for l in d['locations'])
152
+ dup_details += f'<div class="le s-issue"><div class="le-url">{esc(short_url(d["url"], domain))}</div>'
153
+ dup_details += f'<div class="le-issue">⚠ Appears {d["count"]}x in body content</div>'
154
+ dup_details += f'<div class="le-location">πŸ“ Locations: {locs}</div></div>'
155
+ if not dup_details:
156
+ dup_details = '<div style="font-size:11px;color:var(--green);">No duplicate links in body content.</div>'
157
+
158
+ # Suggestions
159
+ sug_list = r['suggestions']
160
+ high_count = sum(1 for s in sug_list if s['priority'] == 'High')
161
+ sug_badges = ""
162
+ if sug_list:
163
+ sug_badges = badge(f'{len(sug_list)} Suggestions', 'sug')
164
+ if high_count: sug_badges += badge(f'{high_count} High', 'issue')
165
+ else:
166
+ sug_badges = badge('0', 'neutral')
167
+ sug_details = ""
168
+ for s in sug_list:
169
+ pri_cls = 'high' if s['priority'] == 'High' else 'med'
170
+ sug_details += f'''<div class="se"><div class="se-head"><span class="se-section">{esc(s["section"])}</span>
171
+ <span class="se-pri {pri_cls}">{s["priority"]}</span></div>
172
+ <div class="se-url">{esc(s["target"])}</div>
173
+ <div class="se-anchor">β†’ "{esc(s["anchor"])}"</div></div>'''
174
+ if not sug_details:
175
+ sug_details = '<div style="font-size:11px;color:var(--text-dim);">No keyword matches for suggestions.</div>'
176
+
177
+ # Notes
178
+ issues = []
179
+ if r['int_count'] < 3:
180
+ issues.append(f"Only {r['int_count']} internal links β€” very low for article length")
181
+ if r['broken_int_count'] + r['broken_ext_count'] > 0:
182
+ issues.append(f"{r['broken_int_count']+r['broken_ext_count']} broken link(s) need fixing")
183
+ if ext_df_flags_list:
184
+ issues.append(f"{len(ext_df_flags_list)} external links are Dofollow β€” add nofollow")
185
+ if int_nf_flags:
186
+ issues.append(f"{len(int_nf_flags)} internal links are Nofollow β€” should be Dofollow")
187
+ if r['redirect_int_count'] + r['redirect_ext_count'] > 0:
188
+ issues.append(f"{r['redirect_int_count']+r['redirect_ext_count']} redirect(s) β€” update href")
189
+ if r['duplicate_count'] > 0:
190
+ issues.append(f"{r['duplicate_count']} duplicate link(s) in body")
191
+ if is_orphan:
192
+ issues.append("ORPHAN PAGE β€” no incoming internal links from other pages")
193
+
194
+ note_badges = badge(f'{len(issues)} Issues', 'issue') if issues else badge('βœ“ Clean', 'ok')
195
+ note_details = "".join(f'<div class="ni critical">⚠ {esc(issue)}</div>' for issue in issues)
196
+ if not note_details:
197
+ note_details = '<div style="font-size:11px;color:var(--green);">No issues detected.</div>'
198
+
199
+ orphan_cell = '<span class="badge issue">Yes ⚠</span>' if is_orphan else '<span class="badge ok">No</span>'
200
+
201
+ rows_html += f'''<tr data-broken="{r['broken_int_count']+r['broken_ext_count']}" data-redirect="{r['redirect_int_count']+r['redirect_ext_count']}" data-internal="{r['int_count']}" data-follow-flag="{r['follow_flag_count']}" data-duplicates="{r['duplicate_count']}" data-orphan="{1 if is_orphan else 0}">
202
+ <td class="cell-url"><div class="row-num">#{idx}</div>{esc(short_url(r['url'], domain))}</td>
203
+ <td class="cell-count {'issue' if r['int_count']<3 else 'neutral'}">{r['int_count']}</td>
204
+ <td class="cell-count neutral">{r['ext_count']}</td>
205
+ {render_accordion(int_badges, int_details)}
206
+ {render_accordion(ext_badges, ext_details)}
207
+ <td class="cell-count"><span style="color:var(--green)">{r['int_df']}</span><span style="color:var(--text-dim)"> / </span><span style="color:{'var(--red)' if r['int_nf']>0 else 'var(--text-dim)'}">{r['int_nf']}</span></td>
208
+ <td class="cell-count"><span style="color:{'var(--red)' if r['ext_df']>0 else 'var(--text-dim)'}">{r['ext_df']}</span><span style="color:var(--text-dim)"> / </span><span style="color:var(--green)">{r['ext_nf']}</span></td>
209
+ {render_accordion(flag_badges, flag_details)}
210
+ {render_accordion(bi_badges, bi_details)}
211
+ {render_accordion(be_badges, be_details)}
212
+ {render_accordion(ri_badges, ri_details)}
213
+ {render_accordion(re_badges, re_details)}
214
+ {render_accordion(dup_badges, dup_details)}
215
+ <td class="cell-orphan">{orphan_cell}</td>
216
+ {render_accordion(sug_badges, sug_details)}
217
+ {render_accordion(note_badges, note_details)}
218
+ </tr>'''
219
+
220
+ report = f'''<!DOCTYPE html>
221
+ <html lang="en">
222
+ <head>
223
+ <meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1.0">
224
+ <title>Bulk Link Audit Report</title>
225
+ <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
226
+ <style>
227
+ :root {{--bg:#f4f6f9;--surface:#fff;--surface-2:#f8f9fb;--surface-3:#eef1f5;--border:#e2e6ed;--border-light:#d0d5de;--text:#1a1f2e;--text-muted:#5c6478;--text-dim:#8892a6;--accent:#2563eb;--accent-dim:rgba(37,99,235,0.06);--accent-light:rgba(37,99,235,0.10);--green:#059669;--green-dim:rgba(5,150,105,0.06);--green-light:rgba(5,150,105,0.10);--red:#dc2626;--red-dim:rgba(220,38,38,0.05);--red-light:rgba(220,38,38,0.08);--orange:#d97706;--orange-dim:rgba(217,119,6,0.06);--purple:#7c3aed;--purple-dim:rgba(124,58,237,0.06);--pink:#db2777;--pink-dim:rgba(219,39,119,0.06);--mono:'JetBrains Mono',monospace;--font:'DM Sans',sans-serif;--radius:8px;--radius-sm:5px;}}
228
+ *{{margin:0;padding:0;box-sizing:border-box;}}body{{background:var(--bg);color:var(--text);font-family:var(--font);-webkit-font-smoothing:antialiased;line-height:1.55;}}
229
+ .header{{padding:32px 36px 24px;border-bottom:1px solid var(--border);background:var(--surface);}}.header .tag-line{{font-size:10px;font-weight:700;letter-spacing:1.8px;text-transform:uppercase;color:var(--accent);margin-bottom:8px;}}.header h1{{font-size:21px;font-weight:700;margin-bottom:4px;}}.header .meta{{font-size:11.5px;color:var(--text-dim);}}.header .meta span{{color:var(--text-muted);}}
230
+ .toolbar{{display:flex;align-items:center;gap:10px;padding:12px 36px;background:var(--surface);border-bottom:1px solid var(--border);flex-wrap:wrap;}}.toolbar-btn{{font-family:var(--font);font-size:11px;font-weight:600;color:var(--text-muted);background:var(--surface-2);border:1px solid var(--border);border-radius:var(--radius-sm);padding:6px 14px;cursor:pointer;transition:all .15s;}}.toolbar-btn:hover{{color:var(--text);background:var(--surface-3);}}.toolbar-btn.active{{color:var(--accent);border-color:var(--accent);background:var(--accent-dim);}}.toolbar-sep{{width:1px;height:20px;background:var(--border);}}.toolbar-label{{font-size:10px;font-weight:600;letter-spacing:.5px;text-transform:uppercase;color:var(--text-dim);margin-right:4px;}}
231
+ .legend{{display:flex;flex-wrap:wrap;gap:18px;padding:10px 36px;background:var(--surface-2);border-bottom:1px solid var(--border);}}.legend-item{{display:flex;align-items:center;gap:6px;font-size:10.5px;color:var(--text-muted);}}.legend-dot{{width:8px;height:8px;border-radius:50%;}}.legend-dot.green{{background:var(--green);}}.legend-dot.red{{background:var(--red);}}.legend-dot.blue{{background:var(--accent);}}
232
+ .summary-bar{{display:flex;border-bottom:1px solid var(--border);background:var(--surface);flex-wrap:wrap;}}.summary-stat{{flex:1;padding:14px 16px;border-right:1px solid var(--border);text-align:center;min-width:90px;}}.summary-stat:last-child{{border-right:none;}}.summary-stat .s-val{{font-size:22px;font-weight:700;line-height:1;margin-bottom:2px;}}.summary-stat .s-label{{font-size:9px;font-weight:600;letter-spacing:.5px;text-transform:uppercase;color:var(--text-dim);}}.s-val.blue{{color:var(--accent);}}.s-val.green{{color:var(--green);}}.s-val.red{{color:var(--red);}}.s-val.pink{{color:var(--pink);}}
233
+ .table-wrap{{overflow-x:auto;background:var(--surface);}}table{{width:100%;border-collapse:collapse;min-width:2400px;}}
234
+ thead th{{background:var(--surface-3);color:var(--text-muted);font-size:9px;font-weight:700;letter-spacing:.7px;text-transform:uppercase;padding:12px 12px;text-align:left;border-bottom:2px solid var(--border-light);position:sticky;top:0;z-index:10;white-space:nowrap;}}thead th.center{{text-align:center;}}thead th.c-blue{{border-bottom-color:var(--accent);color:var(--accent);}}thead th.c-red{{border-bottom-color:var(--red);color:var(--red);}}thead th.c-green{{border-bottom-color:var(--green);color:var(--green);}}thead th.c-pink{{border-bottom-color:var(--pink);color:var(--pink);}}thead th:first-child{{position:sticky;left:0;z-index:15;border-right:2px solid var(--border-light);background:var(--surface-3);}}
235
+ tbody tr{{border-bottom:1px solid var(--border);transition:background .12s;}}tbody tr:hover{{background:var(--accent-dim);}}
236
+ tbody td{{padding:10px 12px;vertical-align:top;font-size:12px;border-right:1px solid var(--border);}}tbody td:last-child{{border-right:none;}}
237
+ td.cell-url{{font-family:var(--mono);font-size:11px;color:var(--accent);word-break:break-all;line-height:1.5;background:var(--surface);position:sticky;left:0;z-index:5;border-right:2px solid var(--border-light);min-width:260px;max-width:260px;}}td.cell-url .row-num{{font-size:9px;color:var(--text-dim);margin-bottom:4px;font-weight:600;}}
238
+ td.cell-count{{text-align:center;font-family:var(--mono);font-size:15px;font-weight:700;vertical-align:middle;}}td.cell-count.ok{{color:var(--green);}}td.cell-count.issue{{color:var(--red);}}td.cell-count.neutral{{color:var(--text);}}
239
+ .acc-cell{{cursor:pointer;user-select:none;}}.acc-collapsed{{display:flex;align-items:center;gap:8px;min-height:26px;}}.acc-chevron{{width:20px;height:20px;border-radius:4px;background:var(--surface-3);display:flex;align-items:center;justify-content:center;flex-shrink:0;transition:transform .2s,background .15s;font-size:10px;color:var(--text-dim);border:1px solid var(--border);}}.acc-cell.open .acc-chevron{{transform:rotate(90deg);background:var(--accent-dim);color:var(--accent);border-color:var(--accent);}}.acc-summary{{font-size:11px;color:var(--text-muted);display:flex;flex-wrap:wrap;gap:4px;align-items:center;}}
240
+ .badge{{font-size:9px;font-weight:700;padding:2.5px 8px;border-radius:10px;letter-spacing:.2px;white-space:nowrap;border:1px solid transparent;}}.badge.ok{{background:var(--green-dim);color:var(--green);border-color:var(--green-light);}}.badge.issue{{background:var(--red-dim);color:var(--red);border-color:var(--red-light);}}.badge.sug{{background:var(--green-dim);color:var(--green);border-color:var(--green-light);}}.badge.neutral{{background:var(--surface-3);color:var(--text-dim);border-color:var(--border);}}
241
+ .acc-details{{display:none;margin-top:10px;padding-top:10px;border-top:1px dashed var(--border);}}.acc-cell.open .acc-details{{display:block;}}
242
+ .le{{padding:9px 11px;background:var(--surface-2);border-radius:var(--radius-sm);margin-bottom:6px;border:1px solid var(--border);border-left-width:3px;}}.le:last-child{{margin-bottom:0;}}.le.s-ok{{border-left-color:var(--green);}}.le.s-issue{{border-left-color:var(--red);background:var(--red-dim);}}
243
+ .le .le-url{{font-family:var(--mono);font-size:10.5px;color:var(--accent);word-break:break-all;margin-bottom:4px;}}.le.s-issue .le-url{{color:var(--red);}}
244
+ .le .le-tags{{display:flex;flex-wrap:wrap;gap:4px;margin-bottom:4px;}}.tag-sm{{font-size:8.5px;font-weight:700;padding:2px 6px;border-radius:8px;}}.tag-sm.ok{{background:var(--green-light);color:var(--green);}}.tag-sm.issue{{background:var(--red-light);color:var(--red);}}
245
+ .le .le-anchor{{font-size:10.5px;color:var(--text-muted);}}.le .le-anchor b{{color:var(--text);font-weight:600;}}.le .le-redir{{font-family:var(--mono);font-size:10px;color:var(--orange);margin-top:3px;}}.le .le-location{{font-size:10px;color:var(--text-dim);margin-top:4px;}}.le .le-issue{{font-size:10px;color:var(--red);margin-top:5px;padding-top:5px;border-top:1px dashed var(--border);font-weight:500;}}.le .le-fix{{font-size:10px;color:var(--green);margin-top:3px;font-weight:500;}}
246
+ .se{{padding:8px 11px;background:var(--surface-2);border-radius:var(--radius-sm);margin-bottom:5px;border:1px solid var(--border);border-left:3px solid var(--green);}}.se:last-child{{margin-bottom:0;}}.se .se-head{{display:flex;justify-content:space-between;align-items:center;margin-bottom:2px;}}.se .se-section{{font-size:9.5px;font-weight:600;letter-spacing:.3px;text-transform:uppercase;color:var(--text-dim);}}.se-pri{{font-size:8.5px;font-weight:700;letter-spacing:.5px;text-transform:uppercase;padding:2px 7px;border-radius:8px;}}.se-pri.high{{background:var(--red-light);color:var(--red);}}.se-pri.med{{background:rgba(217,119,6,.1);color:var(--orange);}}.se .se-url{{font-family:var(--mono);font-size:10px;color:var(--accent);word-break:break-all;margin-bottom:1px;}}.se .se-anchor{{font-size:10px;color:var(--green);font-weight:500;}}
247
+ .ni{{font-size:11px;color:var(--text-muted);line-height:1.5;padding:5px 0;border-bottom:1px dashed var(--border);}}.ni:last-child{{border-bottom:none;}}.ni.critical{{color:var(--red);font-weight:500;}}
248
+ td.cell-orphan{{text-align:center;vertical-align:middle;}}
249
+ .footer{{padding:18px 36px;border-top:1px solid var(--border);font-size:11px;color:var(--text-dim);text-align:center;background:var(--surface);}}
250
+ </style>
251
+ </head>
252
+ <body>
253
+ <div class="header"><div class="tag-line">Bulk Link Audit Report</div><h1>Blog — Body Content Link Analysis</h1><div class="meta">Scope: Body content only · <span>{now}</span> · Pages: <span>{total_pages}</span> · Domain: <span>{esc(domain)}</span></div></div>
254
+ <div class="toolbar"><span class="toolbar-label">Actions:</span><button class="toolbar-btn" onclick="expandAll()">⊞ Expand All</button><button class="toolbar-btn" onclick="collapseAll()">⊟ Collapse All</button><div class="toolbar-sep"></div><span class="toolbar-label">Filter:</span><button class="toolbar-btn filter-btn active" onclick="filterRows('all',this)">All ({total_pages})</button><button class="toolbar-btn filter-btn" onclick="filterRows('broken',this)">Broken ({total_broken})</button><button class="toolbar-btn filter-btn" onclick="filterRows('redirect',this)">Redirects ({total_redirects})</button><button class="toolbar-btn filter-btn" onclick="filterRows('low-links',this)">Low Links</button><button class="toolbar-btn filter-btn" onclick="filterRows('follow-flag',this)">Follow Flags ({total_flags})</button><button class="toolbar-btn filter-btn" onclick="filterRows('duplicates',this)">Duplicates ({total_dups})</button><button class="toolbar-btn filter-btn" onclick="filterRows('orphan',this)">Orphans ({total_orphan})</button></div>
255
+ <div class="legend"><div class="legend-item"><div class="legend-dot green"></div> No Issues</div><div class="legend-item"><div class="legend-dot red"></div> Issue (Broken / Flag / Redirect / Duplicate)</div><div class="legend-item"><div class="legend-dot blue"></div> Info</div></div>
256
+ <div class="summary-bar"><div class="summary-stat"><div class="s-val blue">{total_pages}</div><div class="s-label">Pages</div></div><div class="summary-stat"><div class="s-val blue">{total_int}</div><div class="s-label">Internal</div></div><div class="summary-stat"><div class="s-val blue">{total_ext}</div><div class="s-label">External</div></div><div class="summary-stat"><div class="s-val red">{total_broken}</div><div class="s-label">Broken</div></div><div class="summary-stat"><div class="s-val red">{total_redirects}</div><div class="s-label">Redirects</div></div><div class="summary-stat"><div class="s-val red">{total_flags}</div><div class="s-label">Follow Flags</div></div><div class="summary-stat"><div class="s-val pink">{total_dups}</div><div class="s-label">Duplicates</div></div><div class="summary-stat"><div class="s-val green">{total_sug}</div><div class="s-label">Suggestions</div></div><div class="summary-stat"><div class="s-val red">{total_orphan}</div><div class="s-label">Orphans</div></div></div>
257
+ <div class="table-wrap"><table><thead><tr><th>URL</th><th class="center c-blue">Int.</th><th class="center c-blue">Ext.</th><th class="c-blue">Internal Links</th><th class="c-blue">External Links</th><th class="center">Int DF/NF</th><th class="center">Ext DF/NF</th><th class="c-red">Follow Flags</th><th class="c-red">Broken Int.</th><th class="c-red">Broken Ext.</th><th class="c-red">Redirect Int.</th><th class="c-red">Redirect Ext.</th><th class="c-pink">Duplicates</th><th class="center">Orphan</th><th class="c-green">Suggestions</th><th>Notes</th></tr></thead>
258
+ <tbody>{rows_html}</tbody></table></div>
259
+ <div class="footer">Bulk Link Audit · Body Content Scope · {now} · Click ▶ to expand</div>
260
+ <script>
261
+ function toggleAcc(c){{c.classList.toggle('open');}}
262
+ function expandAll(){{document.querySelectorAll('.acc-cell').forEach(c=>c.classList.add('open'));}}
263
+ function collapseAll(){{document.querySelectorAll('.acc-cell').forEach(c=>c.classList.remove('open'));}}
264
+ function filterRows(t,b){{document.querySelectorAll('.filter-btn').forEach(x=>x.classList.remove('active'));b.classList.add('active');document.querySelectorAll('tbody tr').forEach(r=>{{let d=r.dataset,s=true;if(t==='broken')s=parseInt(d.broken||0)>0;else if(t==='redirect')s=parseInt(d.redirect||0)>0;else if(t==='low-links')s=parseInt(d.internal||0)<5;else if(t==='follow-flag')s=parseInt(d.followFlag||0)>0;else if(t==='duplicates')s=parseInt(d.duplicates||0)>0;else if(t==='orphan')s=parseInt(d.orphan||0)>0;r.style.display=s?'':'none';}});}}
265
+ </script>
266
+ </body></html>'''
267
+
268
+ return report
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==5.12.0
2
+ requests==2.32.3
3
+ beautifulsoup4==4.12.3
4
+ lxml==5.3.0
5
+ openpyxl==3.1.5
6
+ pandas==2.2.3
7
+ supabase==2.11.0