Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
Link Audit Tool β Gradio UI for Hugging Face Spaces
|
| 3 |
-
Proper pause/resume via threading + Supabase persistence.
|
| 4 |
"""
|
| 5 |
|
| 6 |
import gradio as gr
|
|
@@ -19,7 +18,6 @@ from db import (
|
|
| 19 |
save_batch_results, update_run_status, delete_run,
|
| 20 |
)
|
| 21 |
|
| 22 |
-
# βββ Supabase Connection βββ
|
| 23 |
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
|
| 24 |
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
|
| 25 |
sb = None
|
|
@@ -29,10 +27,10 @@ if SUPABASE_URL and SUPABASE_KEY:
|
|
| 29 |
sb.table("audit_runs").select("id").limit(1).execute()
|
| 30 |
print("β
Supabase connected")
|
| 31 |
except Exception as e:
|
| 32 |
-
print(f"β Supabase
|
| 33 |
sb = None
|
| 34 |
|
| 35 |
-
|
| 36 |
class AuditState:
|
| 37 |
def __init__(self):
|
| 38 |
self.lock = threading.Lock()
|
|
@@ -41,30 +39,48 @@ class AuditState:
|
|
| 41 |
self.run_id = None
|
| 42 |
|
| 43 |
def request_pause(self):
|
| 44 |
-
with self.lock:
|
| 45 |
-
self.paused = True
|
| 46 |
|
| 47 |
def resume(self):
|
| 48 |
-
with self.lock:
|
| 49 |
-
self.paused = False
|
| 50 |
|
| 51 |
def is_paused(self):
|
| 52 |
-
with self.lock:
|
| 53 |
-
return self.paused
|
| 54 |
|
| 55 |
def set_running(self, val, run_id=None):
|
| 56 |
with self.lock:
|
| 57 |
self.running = val
|
| 58 |
-
if run_id:
|
| 59 |
-
self.run_id = run_id
|
| 60 |
|
| 61 |
def is_running(self):
|
| 62 |
-
with self.lock:
|
| 63 |
-
return self.running
|
| 64 |
|
| 65 |
audit_state = AuditState()
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
|
|
|
| 68 |
def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
| 69 |
if sb is None:
|
| 70 |
yield "β Supabase not connected.", ""
|
|
@@ -79,27 +95,21 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 79 |
for col in df.columns:
|
| 80 |
sample = str(df[col].iloc[0]).strip().lower()
|
| 81 |
if sample.startswith('http') or domain in sample:
|
| 82 |
-
url_col = col
|
| 83 |
-
|
| 84 |
-
if url_col is None:
|
| 85 |
-
url_col = df.columns[0]
|
| 86 |
urls = [u for u in df[url_col].dropna().astype(str).str.strip().tolist() if u.startswith('http')]
|
| 87 |
except Exception as e:
|
| 88 |
-
yield f"β File error: {e}", ""
|
| 89 |
-
return
|
| 90 |
elif pasted_urls and pasted_urls.strip():
|
| 91 |
urls = [u.strip() for u in pasted_urls.strip().split('\n') if u.strip().startswith('http')]
|
| 92 |
|
| 93 |
if not urls:
|
| 94 |
-
yield "β No valid URLs.", ""
|
| 95 |
-
return
|
| 96 |
|
| 97 |
seen = set()
|
| 98 |
unique = []
|
| 99 |
for u in urls:
|
| 100 |
-
if u not in seen:
|
| 101 |
-
seen.add(u)
|
| 102 |
-
unique.append(u)
|
| 103 |
urls = unique
|
| 104 |
|
| 105 |
run_name = f"{domain} Audit β {datetime.now().strftime('%b %d %H:%M')} β {len(urls)} pages"
|
|
@@ -122,8 +132,7 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 122 |
update_run_status(sb, run_id, "paused", c)
|
| 123 |
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 124 |
audit_state.set_running(False)
|
| 125 |
-
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"
|
| 126 |
-
return
|
| 127 |
|
| 128 |
be = min(bs + batch_size, total)
|
| 129 |
batch_urls = urls[bs:be]
|
|
@@ -132,14 +141,12 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 132 |
|
| 133 |
for j, url in enumerate(batch_urls):
|
| 134 |
if audit_state.is_paused():
|
| 135 |
-
if batch_results:
|
| 136 |
-
save_batch_results(sb, run_id, batch_results)
|
| 137 |
c = get_completed_count(sb, run_id)
|
| 138 |
update_run_status(sb, run_id, "paused", c)
|
| 139 |
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 140 |
audit_state.set_running(False)
|
| 141 |
-
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"
|
| 142 |
-
return
|
| 143 |
|
| 144 |
gi = bs + j + 1
|
| 145 |
elapsed = time.time() - start_time
|
|
@@ -165,8 +172,7 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 165 |
log.append(f"[{gi}/{total}] {short} β Int:{result['int_count']} Ext:{result['ext_count']} {fs}")
|
| 166 |
|
| 167 |
yield "\n".join(log[-40:]), f"π {gi}/{total} ({gi*100//total}%) Batch{batch_num} ETA:{eta_s}"
|
| 168 |
-
if j < len(batch_urls) - 1:
|
| 169 |
-
time.sleep(delay)
|
| 170 |
|
| 171 |
if batch_results:
|
| 172 |
try:
|
|
@@ -187,21 +193,18 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 187 |
targets, pg_urls = set(), set()
|
| 188 |
for r in all_results:
|
| 189 |
pg_urls.add(r['url'].rstrip('/').split('?')[0])
|
| 190 |
-
for lk in r.get('internal_links', []):
|
| 191 |
-
targets.add(lk['url'].rstrip('/').split('?')[0])
|
| 192 |
orphans = sorted([p for p in pg_urls if p not in targets])
|
| 193 |
|
| 194 |
summary = {
|
| 195 |
-
'total_pages': len(all_results),
|
| 196 |
-
'
|
| 197 |
-
'
|
| 198 |
-
'
|
| 199 |
-
'
|
| 200 |
-
'
|
| 201 |
-
'
|
| 202 |
-
'
|
| 203 |
-
'orphan_count': len(orphans),
|
| 204 |
-
'orphan_urls': orphans[:100],
|
| 205 |
}
|
| 206 |
update_run_status(sb, run_id, "completed", len(all_results), summary)
|
| 207 |
tt = time.time() - start_time
|
|
@@ -214,9 +217,7 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 214 |
except Exception as e:
|
| 215 |
log.append(f"β {str(e)}")
|
| 216 |
audit_state.set_running(False)
|
| 217 |
-
try:
|
| 218 |
-
c = get_completed_count(sb, run_id)
|
| 219 |
-
update_run_status(sb, run_id, "paused", c)
|
| 220 |
except: pass
|
| 221 |
yield "\n".join(log[-40:]), "β Error β progress saved"
|
| 222 |
|
|
@@ -228,13 +229,16 @@ def pause_audit():
|
|
| 228 |
return "No audit running."
|
| 229 |
|
| 230 |
|
| 231 |
-
|
|
|
|
| 232 |
if sb is None:
|
| 233 |
-
yield "β Supabase not connected.", ""
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
| 235 |
if not run_id:
|
| 236 |
-
yield "
|
| 237 |
-
return
|
| 238 |
|
| 239 |
all_urls = get_pending_urls(sb, run_id)
|
| 240 |
done = get_completed_urls(sb, run_id)
|
|
@@ -242,12 +246,10 @@ def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
|
| 242 |
|
| 243 |
if not remaining:
|
| 244 |
update_run_status(sb, run_id, "completed", len(done))
|
| 245 |
-
yield "β
Already complete!", ""
|
| 246 |
-
return
|
| 247 |
|
| 248 |
try:
|
| 249 |
-
|
| 250 |
-
rd = next((r for r in runs if r['id'] == run_id), None)
|
| 251 |
if rd: domain = rd.get('domain', domain)
|
| 252 |
except: pass
|
| 253 |
|
|
@@ -269,13 +271,11 @@ def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
|
| 269 |
update_run_status(sb, run_id, "paused", c)
|
| 270 |
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
| 271 |
audit_state.set_running(False)
|
| 272 |
-
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"
|
| 273 |
-
return
|
| 274 |
|
| 275 |
be = min(bs + batch_size, len(remaining))
|
| 276 |
bu = remaining[bs:be]
|
| 277 |
-
bn += 1
|
| 278 |
-
br = []
|
| 279 |
|
| 280 |
for j, url in enumerate(bu):
|
| 281 |
if audit_state.is_paused():
|
|
@@ -284,8 +284,7 @@ def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
|
| 284 |
update_run_status(sb, run_id, "paused", c)
|
| 285 |
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
| 286 |
audit_state.set_running(False)
|
| 287 |
-
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"
|
| 288 |
-
return
|
| 289 |
|
| 290 |
gi = len(done) + bs + j + 1
|
| 291 |
elapsed = time.time() - start_time
|
|
@@ -344,84 +343,70 @@ def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
|
| 344 |
except Exception as e:
|
| 345 |
log.append(f"β {str(e)}")
|
| 346 |
audit_state.set_running(False)
|
| 347 |
-
try:
|
| 348 |
-
c = get_completed_count(sb, run_id)
|
| 349 |
-
update_run_status(sb, run_id, "paused", c)
|
| 350 |
except: pass
|
| 351 |
yield "\n".join(log[-40:]), "β Error"
|
| 352 |
|
| 353 |
|
| 354 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 355 |
-
# PAST RUNS
|
| 356 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 357 |
|
| 358 |
def load_runs_html():
|
| 359 |
-
|
| 360 |
-
if
|
| 361 |
-
return "<p>
|
| 362 |
-
runs = get_all_runs(sb)
|
| 363 |
-
if not runs:
|
| 364 |
-
return "<p>No saved runs yet.</p>"
|
| 365 |
-
|
| 366 |
html = '<table style="width:100%;border-collapse:collapse;font-size:13px;">'
|
| 367 |
html += '<tr style="background:#f1f5f9;"><th style="padding:8px;text-align:left;">Run</th><th style="padding:8px;text-align:center;">Status</th><th style="padding:8px;text-align:center;">Pages</th><th style="padding:8px;text-align:center;">Broken</th><th style="padding:8px;text-align:center;">Flags</th><th style="padding:8px;text-align:center;">Dups</th><th style="padding:8px;text-align:center;">Orphans</th></tr>'
|
| 368 |
-
|
| 369 |
-
for r in runs:
|
| 370 |
s = r.get('summary', {}) or {}
|
| 371 |
st = r.get('status', '?')
|
| 372 |
sc = {'completed':'#059669','paused':'#d97706','running':'#2563eb'}.get(st,'#888')
|
| 373 |
-
|
| 374 |
cr = r.get('created_at','')[:16].replace('T',' ')
|
| 375 |
-
html += f'<tr style="border-bottom:1px solid #e2e8f0;"><td style="padding:8px;"><b>{r.get("name","?")}</b><br><span style="font-size:10px;color:#94a3b8;">{cr}</span></td><td style="padding:8px;text-align:center;"><span style="background:{
|
| 376 |
html += '</table>'
|
| 377 |
return html
|
| 378 |
|
| 379 |
|
| 380 |
def load_runs_choices():
|
| 381 |
-
"""
|
| 382 |
-
if sb is None:
|
| 383 |
-
return []
|
| 384 |
-
runs = get_all_runs(sb)
|
| 385 |
-
if not runs:
|
| 386 |
-
return []
|
| 387 |
choices = []
|
| 388 |
-
for r in
|
| 389 |
st = r.get('status', '?')
|
| 390 |
label = f"{r.get('name','?')} [{st.upper()}] ({r.get('completed_urls',0)}/{r.get('total_urls',0)})"
|
| 391 |
-
choices.append(
|
| 392 |
return choices
|
| 393 |
|
| 394 |
|
| 395 |
-
def generate_report_for_run(
|
| 396 |
-
if sb is None or not
|
| 397 |
return None, "β No run selected."
|
|
|
|
|
|
|
|
|
|
| 398 |
try:
|
| 399 |
-
run = None
|
| 400 |
-
for r in get_all_runs(sb):
|
| 401 |
-
if r['id'] == run_id:
|
| 402 |
-
run = r
|
| 403 |
-
break
|
| 404 |
pages = get_all_page_results(sb, run_id)
|
| 405 |
-
if not pages:
|
| 406 |
-
return None, "β No data."
|
| 407 |
results = [p['result'] for p in pages]
|
| 408 |
s = (run.get('summary', {}) or {}) if run else {}
|
| 409 |
rh = generate_report(results, s.get('orphan_urls', []), run.get('domain', domain) if run else domain)
|
| 410 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.html', prefix='Audit_')
|
| 411 |
-
tmp.write(rh.encode('utf-8'))
|
| 412 |
-
tmp.close()
|
| 413 |
return tmp.name, f"β
Report β {len(results)} pages"
|
| 414 |
except Exception as e:
|
| 415 |
return None, f"β {str(e)}"
|
| 416 |
|
| 417 |
|
| 418 |
-
def generate_csv_for_run(
|
| 419 |
-
if sb is None or not
|
| 420 |
return None, "β No run selected."
|
|
|
|
|
|
|
| 421 |
try:
|
| 422 |
pages = get_all_page_results(sb, run_id)
|
| 423 |
-
if not pages:
|
| 424 |
-
return None, "β No data."
|
| 425 |
rows = [{'URL': p['result'].get('url',''), 'Internal': p['result'].get('int_count',0),
|
| 426 |
'External': p['result'].get('ext_count',0),
|
| 427 |
'Broken': p['result'].get('broken_int_count',0)+p['result'].get('broken_ext_count',0),
|
|
@@ -429,16 +414,17 @@ def generate_csv_for_run(run_id):
|
|
| 429 |
'Flags': p['result'].get('follow_flag_count',0),
|
| 430 |
'Dups': p['result'].get('duplicate_count',0)} for p in pages]
|
| 431 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', prefix='Audit_')
|
| 432 |
-
pd.DataFrame(rows).to_csv(tmp.name, index=False)
|
| 433 |
-
tmp.close()
|
| 434 |
return tmp.name, f"β
CSV β {len(rows)} rows"
|
| 435 |
except Exception as e:
|
| 436 |
return None, f"β {str(e)}"
|
| 437 |
|
| 438 |
|
| 439 |
-
def delete_selected_run(
|
| 440 |
-
if sb is None or not
|
| 441 |
return "β No run selected."
|
|
|
|
|
|
|
| 442 |
try:
|
| 443 |
delete_run(sb, run_id)
|
| 444 |
return "ποΈ Deleted. Click Refresh."
|
|
@@ -487,10 +473,12 @@ with gr.Blocks(title="Link Audit Tool", theme=gr.themes.Soft()) as app:
|
|
| 487 |
pause_btn.click(api_name=False, fn=pause_audit, outputs=[progress_text])
|
| 488 |
|
| 489 |
with gr.Tab("π Past Runs"):
|
| 490 |
-
|
| 491 |
-
refresh_btn = gr.Button("π Refresh", variant="secondary")
|
| 492 |
runs_html = gr.HTML(value="<p>Click Refresh to load.</p>")
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
with gr.Row():
|
| 496 |
report_btn = gr.Button("π HTML Report", variant="primary")
|
|
@@ -508,9 +496,9 @@ with gr.Blocks(title="Link Audit Tool", theme=gr.themes.Soft()) as app:
|
|
| 508 |
resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False)
|
| 509 |
resume_pause_btn = gr.Button("βΈοΈ Pause Resume", variant="stop")
|
| 510 |
|
| 511 |
-
# Refresh:
|
| 512 |
-
refresh_btn.click(api_name=False, fn=load_runs_html, outputs=[runs_html])
|
| 513 |
-
|
| 514 |
|
| 515 |
report_btn.click(api_name=False, fn=generate_report_for_run, inputs=[run_dropdown, domain_input], outputs=[report_file, action_status])
|
| 516 |
csv_btn.click(api_name=False, fn=generate_csv_for_run, inputs=[run_dropdown], outputs=[csv_file, action_status])
|
|
|
|
| 1 |
"""
|
| 2 |
Link Audit Tool β Gradio UI for Hugging Face Spaces
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 18 |
save_batch_results, update_run_status, delete_run,
|
| 19 |
)
|
| 20 |
|
|
|
|
| 21 |
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
|
| 22 |
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
|
| 23 |
sb = None
|
|
|
|
| 27 |
sb.table("audit_runs").select("id").limit(1).execute()
|
| 28 |
print("β
Supabase connected")
|
| 29 |
except Exception as e:
|
| 30 |
+
print(f"β Supabase failed: {e}")
|
| 31 |
sb = None
|
| 32 |
|
| 33 |
+
|
| 34 |
class AuditState:
|
| 35 |
def __init__(self):
|
| 36 |
self.lock = threading.Lock()
|
|
|
|
| 39 |
self.run_id = None
|
| 40 |
|
| 41 |
def request_pause(self):
|
| 42 |
+
with self.lock: self.paused = True
|
|
|
|
| 43 |
|
| 44 |
def resume(self):
|
| 45 |
+
with self.lock: self.paused = False
|
|
|
|
| 46 |
|
| 47 |
def is_paused(self):
|
| 48 |
+
with self.lock: return self.paused
|
|
|
|
| 49 |
|
| 50 |
def set_running(self, val, run_id=None):
|
| 51 |
with self.lock:
|
| 52 |
self.running = val
|
| 53 |
+
if run_id: self.run_id = run_id
|
|
|
|
| 54 |
|
| 55 |
def is_running(self):
|
| 56 |
+
with self.lock: return self.running
|
|
|
|
| 57 |
|
| 58 |
audit_state = AuditState()
|
| 59 |
|
| 60 |
+
# βββ Global runs cache for dropdown βββ
|
| 61 |
+
_runs_cache = []
|
| 62 |
+
|
| 63 |
+
def _refresh_cache():
    """Reload the module-level run cache from Supabase.

    The cache is left empty when no Supabase client is configured, or when
    the query returns nothing.
    """
    global _runs_cache
    # Only query when a client exists; a missing client counts as "no runs"
    fetched = get_all_runs(sb) if sb is not None else None
    _runs_cache = fetched or []
|
| 69 |
+
|
| 70 |
+
def _get_run_id_by_label(label):
    """Resolve a dropdown selection to a run ID.

    Labels have the form ``"<name> [<STATUS>] (<done>/<total>)"``. The label
    is rebuilt for every cached run and compared with the selection. A value
    that is itself a UUID is passed through unchanged.

    Args:
        label: Text selected or typed in the run dropdown.

    Returns:
        The ``id`` of the matching cached run, the label itself when it
        parses as a UUID, or ``None`` when it is neither.
    """
    import uuid  # local import keeps this helper self-contained

    if not label:
        return None

    for run in _runs_cache:
        # A null status column must not crash .upper()
        status = run.get('status') or '?'
        expected = f"{run.get('name','?')} [{status.upper()}] ({run.get('completed_urls',0)}/{run.get('total_urls',0)})"
        if label == expected:
            return run['id']

    # Not a known label. Accept it only if it really is a UUID: a length check
    # alone would let a stale or hand-typed label through as a run ID.
    try:
        uuid.UUID(label)
    except (ValueError, AttributeError, TypeError):
        return None
    return label
|
| 81 |
+
|
| 82 |
|
| 83 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
| 85 |
if sb is None:
|
| 86 |
yield "β Supabase not connected.", ""
|
|
|
|
| 95 |
for col in df.columns:
|
| 96 |
sample = str(df[col].iloc[0]).strip().lower()
|
| 97 |
if sample.startswith('http') or domain in sample:
|
| 98 |
+
url_col = col; break
|
| 99 |
+
if not url_col: url_col = df.columns[0]
|
|
|
|
|
|
|
| 100 |
urls = [u for u in df[url_col].dropna().astype(str).str.strip().tolist() if u.startswith('http')]
|
| 101 |
except Exception as e:
|
| 102 |
+
yield f"β File error: {e}", ""; return
|
|
|
|
| 103 |
elif pasted_urls and pasted_urls.strip():
|
| 104 |
urls = [u.strip() for u in pasted_urls.strip().split('\n') if u.strip().startswith('http')]
|
| 105 |
|
| 106 |
if not urls:
|
| 107 |
+
yield "β No valid URLs.", ""; return
|
|
|
|
| 108 |
|
| 109 |
seen = set()
|
| 110 |
unique = []
|
| 111 |
for u in urls:
|
| 112 |
+
if u not in seen: seen.add(u); unique.append(u)
|
|
|
|
|
|
|
| 113 |
urls = unique
|
| 114 |
|
| 115 |
run_name = f"{domain} Audit β {datetime.now().strftime('%b %d %H:%M')} β {len(urls)} pages"
|
|
|
|
| 132 |
update_run_status(sb, run_id, "paused", c)
|
| 133 |
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 134 |
audit_state.set_running(False)
|
| 135 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"; return
|
|
|
|
| 136 |
|
| 137 |
be = min(bs + batch_size, total)
|
| 138 |
batch_urls = urls[bs:be]
|
|
|
|
| 141 |
|
| 142 |
for j, url in enumerate(batch_urls):
|
| 143 |
if audit_state.is_paused():
|
| 144 |
+
if batch_results: save_batch_results(sb, run_id, batch_results)
|
|
|
|
| 145 |
c = get_completed_count(sb, run_id)
|
| 146 |
update_run_status(sb, run_id, "paused", c)
|
| 147 |
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 148 |
audit_state.set_running(False)
|
| 149 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"; return
|
|
|
|
| 150 |
|
| 151 |
gi = bs + j + 1
|
| 152 |
elapsed = time.time() - start_time
|
|
|
|
| 172 |
log.append(f"[{gi}/{total}] {short} β Int:{result['int_count']} Ext:{result['ext_count']} {fs}")
|
| 173 |
|
| 174 |
yield "\n".join(log[-40:]), f"π {gi}/{total} ({gi*100//total}%) Batch{batch_num} ETA:{eta_s}"
|
| 175 |
+
if j < len(batch_urls) - 1: time.sleep(delay)
|
|
|
|
| 176 |
|
| 177 |
if batch_results:
|
| 178 |
try:
|
|
|
|
| 193 |
targets, pg_urls = set(), set()
|
| 194 |
for r in all_results:
|
| 195 |
pg_urls.add(r['url'].rstrip('/').split('?')[0])
|
| 196 |
+
for lk in r.get('internal_links', []): targets.add(lk['url'].rstrip('/').split('?')[0])
|
|
|
|
| 197 |
orphans = sorted([p for p in pg_urls if p not in targets])
|
| 198 |
|
| 199 |
summary = {
|
| 200 |
+
'total_pages': len(all_results), 'total_int': sum(r.get('int_count',0) for r in all_results),
|
| 201 |
+
'total_ext': sum(r.get('ext_count',0) for r in all_results),
|
| 202 |
+
'total_broken': sum(r.get('broken_int_count',0)+r.get('broken_ext_count',0) for r in all_results),
|
| 203 |
+
'total_redirects': sum(r.get('redirect_int_count',0)+r.get('redirect_ext_count',0) for r in all_results),
|
| 204 |
+
'total_flags': sum(r.get('follow_flag_count',0) for r in all_results),
|
| 205 |
+
'total_dups': sum(r.get('duplicate_count',0) for r in all_results),
|
| 206 |
+
'total_sug': sum(len(r.get('suggestions',[])) for r in all_results),
|
| 207 |
+
'orphan_count': len(orphans), 'orphan_urls': orphans[:100],
|
|
|
|
|
|
|
| 208 |
}
|
| 209 |
update_run_status(sb, run_id, "completed", len(all_results), summary)
|
| 210 |
tt = time.time() - start_time
|
|
|
|
| 217 |
except Exception as e:
|
| 218 |
log.append(f"β {str(e)}")
|
| 219 |
audit_state.set_running(False)
|
| 220 |
+
try: c = get_completed_count(sb, run_id); update_run_status(sb, run_id, "paused", c)
|
|
|
|
|
|
|
| 221 |
except: pass
|
| 222 |
yield "\n".join(log[-40:]), "β Error β progress saved"
|
| 223 |
|
|
|
|
| 229 |
return "No audit running."
|
| 230 |
|
| 231 |
|
| 232 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 233 |
+
def resume_audit(run_label, domain, batch_size, timeout, delay, workers):
|
| 234 |
if sb is None:
|
| 235 |
+
yield "β Supabase not connected.", ""; return
|
| 236 |
+
if not run_label:
|
| 237 |
+
yield "β Select a run first (click Refresh, then pick from dropdown).", ""; return
|
| 238 |
+
|
| 239 |
+
run_id = _get_run_id_by_label(run_label)
|
| 240 |
if not run_id:
|
| 241 |
+
yield f"β Could not find run for: {run_label}", ""; return
|
|
|
|
| 242 |
|
| 243 |
all_urls = get_pending_urls(sb, run_id)
|
| 244 |
done = get_completed_urls(sb, run_id)
|
|
|
|
| 246 |
|
| 247 |
if not remaining:
|
| 248 |
update_run_status(sb, run_id, "completed", len(done))
|
| 249 |
+
yield "β
Already complete!", ""; return
|
|
|
|
| 250 |
|
| 251 |
try:
|
| 252 |
+
rd = next((r for r in _runs_cache if r['id'] == run_id), None)
|
|
|
|
| 253 |
if rd: domain = rd.get('domain', domain)
|
| 254 |
except: pass
|
| 255 |
|
|
|
|
| 271 |
update_run_status(sb, run_id, "paused", c)
|
| 272 |
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
| 273 |
audit_state.set_running(False)
|
| 274 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"; return
|
|
|
|
| 275 |
|
| 276 |
be = min(bs + batch_size, len(remaining))
|
| 277 |
bu = remaining[bs:be]
|
| 278 |
+
bn += 1; br = []
|
|
|
|
| 279 |
|
| 280 |
for j, url in enumerate(bu):
|
| 281 |
if audit_state.is_paused():
|
|
|
|
| 284 |
update_run_status(sb, run_id, "paused", c)
|
| 285 |
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
| 286 |
audit_state.set_running(False)
|
| 287 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"; return
|
|
|
|
| 288 |
|
| 289 |
gi = len(done) + bs + j + 1
|
| 290 |
elapsed = time.time() - start_time
|
|
|
|
| 343 |
except Exception as e:
|
| 344 |
log.append(f"β {str(e)}")
|
| 345 |
audit_state.set_running(False)
|
| 346 |
+
try: c = get_completed_count(sb, run_id); update_run_status(sb, run_id, "paused", c)
|
|
|
|
|
|
|
| 347 |
except: pass
|
| 348 |
yield "\n".join(log[-40:]), "β Error"
|
| 349 |
|
| 350 |
|
| 351 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 352 |
+
# PAST RUNS
|
| 353 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 354 |
|
| 355 |
def load_runs_html():
    """Render the saved audit runs as an HTML summary table.

    Refreshes the module-level run cache first, then renders one table row
    per cached run.

    Returns:
        An HTML ``<table>`` string, or a short ``<p>`` placeholder when the
        cache is empty after refreshing.
    """
    from html import escape  # local import keeps this function self-contained

    _refresh_cache()
    if not _runs_cache:
        return "<p>No saved runs.</p>"

    fg_by_status = {'completed': '#059669', 'paused': '#d97706', 'running': '#2563eb'}
    bg_by_status = {
        'completed': 'rgba(5,150,105,0.1)',
        'paused': 'rgba(217,119,6,0.1)',
        'running': 'rgba(37,99,235,0.1)',
    }

    def text(value):
        # Database values may contain markup characters; never emit them raw
        return escape(str(value))

    # Collect fragments and join once instead of growing a string with +=
    parts = [
        '<table style="width:100%;border-collapse:collapse;font-size:13px;">',
        '<tr style="background:#f1f5f9;"><th style="padding:8px;text-align:left;">Run</th><th style="padding:8px;text-align:center;">Status</th><th style="padding:8px;text-align:center;">Pages</th><th style="padding:8px;text-align:center;">Broken</th><th style="padding:8px;text-align:center;">Flags</th><th style="padding:8px;text-align:center;">Dups</th><th style="padding:8px;text-align:center;">Orphans</th></tr>',
    ]
    for run in _runs_cache:
        # Columns can be null, so fall back before calling string methods
        summary = run.get('summary') or {}
        status = run.get('status') or '?'
        fg = fg_by_status.get(status, '#888')
        bg = bg_by_status.get(status, 'rgba(136,136,136,0.1)')
        # Keep the first 16 characters ("YYYY-MM-DDTHH:MM" for an ISO timestamp)
        created = str(run.get('created_at') or '')[:16].replace('T', ' ')
        parts.append(
            f'<tr style="border-bottom:1px solid #e2e8f0;">'
            f'<td style="padding:8px;"><b>{text(run.get("name","?"))}</b><br>'
            f'<span style="font-size:10px;color:#94a3b8;">{text(created)}</span></td>'
            f'<td style="padding:8px;text-align:center;">'
            f'<span style="background:{bg};color:{fg};padding:2px 8px;border-radius:10px;font-size:10px;font-weight:700;">{text(status.upper())}</span></td>'
            f'<td style="padding:8px;text-align:center;font-weight:700;">{text(run.get("completed_urls",0))}/{text(run.get("total_urls",0))}</td>'
            f'<td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{text(summary.get("total_broken","—"))}</td>'
            f'<td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{text(summary.get("total_flags","—"))}</td>'
            f'<td style="padding:8px;text-align:center;color:#db2777;font-weight:700;">{text(summary.get("total_dups","—"))}</td>'
            f'<td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{text(summary.get("orphan_count","—"))}</td>'
            f'</tr>'
        )
    parts.append('</table>')
    return ''.join(parts)
|
| 370 |
|
| 371 |
|
| 372 |
def load_runs_choices():
    """Return the dropdown labels for the cached runs.

    Reads the module-level run cache without refreshing it. Each label has
    the form ``"<name> [<STATUS>] (<done>/<total>)"``.

    Returns:
        A list of label strings, one per cached run, in cache order.
    """
    def label_for(run):
        # A null status column must not crash .upper()
        status = run.get('status') or '?'
        return f"{run.get('name','?')} [{status.upper()}] ({run.get('completed_urls',0)}/{run.get('total_urls',0)})"

    return [label_for(run) for run in _runs_cache]
|
| 380 |
|
| 381 |
|
| 382 |
+
def generate_report_for_run(run_label, domain):
    """Build the HTML report for the selected run and write it to a temp file.

    Args:
        run_label: Dropdown label (or raw run ID) identifying the run.
        domain: Fallback domain handed to the report generator when the
            cached run row has no ``domain`` value.

    Returns:
        ``(path, status)``: the path of the generated ``.html`` file and a
        success message, or ``(None, message)`` when nothing was generated.
    """
    if sb is None or not run_label:
        return None, "❌ No run selected."
    run_id = _get_run_id_by_label(run_label)
    if not run_id:
        return None, "❌ Run not found."
    try:
        # The cached row only supplies metadata; on a cache miss use defaults
        run = next((r for r in _runs_cache if r['id'] == run_id), None) or {}
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "❌ No data."
        results = [p['result'] for p in pages]
        summary = run.get('summary') or {}
        report_html = generate_report(
            results,
            summary.get('orphan_urls', []),
            # A null or empty stored domain falls back to the caller's value
            run.get('domain') or domain,
        )
        # delete=False: the path is returned, so the file must outlive this call.
        # The with-block closes the handle even if the write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.html', prefix='Audit_') as tmp:
            tmp.write(report_html.encode('utf-8'))
        return tmp.name, f"✅ Report — {len(results)} pages"
    except Exception as e:
        # UI boundary: report the failure as a status message instead of raising
        return None, f"❌ {str(e)}"
|
| 400 |
|
| 401 |
|
| 402 |
+
def generate_csv_for_run(run_label):
|
| 403 |
+
if sb is None or not run_label:
|
| 404 |
return None, "β No run selected."
|
| 405 |
+
run_id = _get_run_id_by_label(run_label)
|
| 406 |
+
if not run_id: return None, "β Run not found."
|
| 407 |
try:
|
| 408 |
pages = get_all_page_results(sb, run_id)
|
| 409 |
+
if not pages: return None, "β No data."
|
|
|
|
| 410 |
rows = [{'URL': p['result'].get('url',''), 'Internal': p['result'].get('int_count',0),
|
| 411 |
'External': p['result'].get('ext_count',0),
|
| 412 |
'Broken': p['result'].get('broken_int_count',0)+p['result'].get('broken_ext_count',0),
|
|
|
|
| 414 |
'Flags': p['result'].get('follow_flag_count',0),
|
| 415 |
'Dups': p['result'].get('duplicate_count',0)} for p in pages]
|
| 416 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', prefix='Audit_')
|
| 417 |
+
pd.DataFrame(rows).to_csv(tmp.name, index=False); tmp.close()
|
|
|
|
| 418 |
return tmp.name, f"β
CSV β {len(rows)} rows"
|
| 419 |
except Exception as e:
|
| 420 |
return None, f"β {str(e)}"
|
| 421 |
|
| 422 |
|
| 423 |
+
def delete_selected_run(run_label):
|
| 424 |
+
if sb is None or not run_label:
|
| 425 |
return "β No run selected."
|
| 426 |
+
run_id = _get_run_id_by_label(run_label)
|
| 427 |
+
if not run_id: return "β Run not found."
|
| 428 |
try:
|
| 429 |
delete_run(sb, run_id)
|
| 430 |
return "ποΈ Deleted. Click Refresh."
|
|
|
|
| 473 |
pause_btn.click(api_name=False, fn=pause_audit, outputs=[progress_text])
|
| 474 |
|
| 475 |
with gr.Tab("π Past Runs"):
|
| 476 |
+
refresh_btn = gr.Button("π Refresh", variant="secondary")
|
|
|
|
| 477 |
runs_html = gr.HTML(value="<p>Click Refresh to load.</p>")
|
| 478 |
+
|
| 479 |
+
# Dropdown uses plain string labels (no tuples, no UUIDs as values)
|
| 480 |
+
# We look up the UUID from the label when needed
|
| 481 |
+
run_dropdown = gr.Dropdown(label="Select Run", choices=[], interactive=True, allow_custom_value=True)
|
| 482 |
|
| 483 |
with gr.Row():
|
| 484 |
report_btn = gr.Button("π HTML Report", variant="primary")
|
|
|
|
| 496 |
resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False)
|
| 497 |
resume_pause_btn = gr.Button("βΈοΈ Pause Resume", variant="stop")
|
| 498 |
|
| 499 |
+
# Refresh: load HTML first (which refreshes cache), then update dropdown choices
|
| 500 |
+
refresh_btn.click(api_name=False, fn=load_runs_html, outputs=[runs_html]).then(
|
| 501 |
+
api_name=False, fn=load_runs_choices, outputs=[run_dropdown])
|
| 502 |
|
| 503 |
report_btn.click(api_name=False, fn=generate_report_for_run, inputs=[run_dropdown, domain_input], outputs=[report_file, action_status])
|
| 504 |
csv_btn.click(api_name=False, fn=generate_csv_for_run, inputs=[run_dropdown], outputs=[csv_file, action_status])
|