Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -65,24 +65,16 @@ class AuditState:
|
|
| 65 |
audit_state = AuditState()
|
| 66 |
|
| 67 |
|
| 68 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
-
# CORE AUDIT FUNCTION
|
| 70 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 71 |
-
|
| 72 |
def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
| 73 |
if sb is None:
|
| 74 |
-
yield "β Supabase not connected.
|
| 75 |
return
|
| 76 |
|
| 77 |
-
# Parse URLs
|
| 78 |
urls = []
|
| 79 |
if file is not None:
|
| 80 |
try:
|
| 81 |
fpath = file.name if hasattr(file, 'name') else file
|
| 82 |
-
if str(fpath).endswith('.csv')
|
| 83 |
-
df = pd.read_csv(fpath)
|
| 84 |
-
else:
|
| 85 |
-
df = pd.read_excel(fpath)
|
| 86 |
url_col = None
|
| 87 |
for col in df.columns:
|
| 88 |
sample = str(df[col].iloc[0]).strip().lower()
|
|
@@ -99,46 +91,42 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 99 |
urls = [u.strip() for u in pasted_urls.strip().split('\n') if u.strip().startswith('http')]
|
| 100 |
|
| 101 |
if not urls:
|
| 102 |
-
yield "β No valid URLs
|
| 103 |
return
|
| 104 |
|
| 105 |
-
# Deduplicate
|
| 106 |
seen = set()
|
| 107 |
-
|
| 108 |
for u in urls:
|
| 109 |
if u not in seen:
|
| 110 |
seen.add(u)
|
| 111 |
-
|
| 112 |
-
urls =
|
| 113 |
|
| 114 |
run_name = f"{domain} Audit β {datetime.now().strftime('%b %d %H:%M')} β {len(urls)} pages"
|
| 115 |
run_id = create_run(sb, run_name, domain, len(urls), urls)
|
| 116 |
-
|
| 117 |
audit_state.set_running(True, run_id)
|
| 118 |
audit_state.resume()
|
| 119 |
|
| 120 |
total = len(urls)
|
| 121 |
-
batch_size = int(batch_size)
|
| 122 |
-
timeout = int(timeout)
|
| 123 |
-
workers = int(workers)
|
| 124 |
start_time = time.time()
|
| 125 |
batch_num = 0
|
| 126 |
-
|
| 127 |
|
| 128 |
-
yield f"π
|
| 129 |
|
| 130 |
try:
|
| 131 |
-
for
|
| 132 |
if audit_state.is_paused():
|
| 133 |
-
|
| 134 |
-
update_run_status(sb, run_id, "paused",
|
| 135 |
-
|
| 136 |
audit_state.set_running(False)
|
| 137 |
-
yield "\n".join(
|
| 138 |
return
|
| 139 |
|
| 140 |
-
|
| 141 |
-
batch_urls = urls[
|
| 142 |
batch_num += 1
|
| 143 |
batch_results = []
|
| 144 |
|
|
@@ -146,72 +134,62 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 146 |
if audit_state.is_paused():
|
| 147 |
if batch_results:
|
| 148 |
save_batch_results(sb, run_id, batch_results)
|
| 149 |
-
|
| 150 |
-
update_run_status(sb, run_id, "paused",
|
| 151 |
-
|
| 152 |
audit_state.set_running(False)
|
| 153 |
-
yield "\n".join(
|
| 154 |
return
|
| 155 |
|
| 156 |
-
|
| 157 |
elapsed = time.time() - start_time
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
url, domain, DEFAULT_BODY_SELECTORS,
|
| 164 |
-
suggestion_map=DEFAULT_SUGGESTION_MAP,
|
| 165 |
-
timeout=timeout, concurrent_workers=workers,
|
| 166 |
-
)
|
| 167 |
batch_results.append(result)
|
| 168 |
|
| 169 |
short = url.replace('https://www.', '').replace('https://', '')[:70]
|
| 170 |
if result['error']:
|
| 171 |
-
|
| 172 |
else:
|
| 173 |
b = result['broken_int_count'] + result['broken_ext_count']
|
| 174 |
fc = result['follow_flag_count']
|
| 175 |
d = result['duplicate_count']
|
| 176 |
-
|
| 177 |
-
if b:
|
| 178 |
-
if fc:
|
| 179 |
-
if d:
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
yield "\n".join(
|
| 184 |
-
|
| 185 |
if j < len(batch_urls) - 1:
|
| 186 |
time.sleep(delay)
|
| 187 |
|
| 188 |
-
# Save batch
|
| 189 |
if batch_results:
|
| 190 |
try:
|
| 191 |
save_batch_results(sb, run_id, batch_results)
|
| 192 |
-
|
| 193 |
-
update_run_status(sb, run_id, "running",
|
| 194 |
-
|
| 195 |
except Exception as e:
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
yield "\n".join(log_lines[-40:]), f"πΎ Saved batch {batch_num} β {min(batch_end, total)}/{total}"
|
| 199 |
del batch_results
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
yield "\n".join(log_lines[-40:]), "π Orphan analysis..."
|
| 204 |
|
| 205 |
all_pages = get_all_page_results(sb, run_id)
|
| 206 |
all_results = [p['result'] for p in all_pages]
|
| 207 |
-
|
| 208 |
-
all_internal_targets = set()
|
| 209 |
-
all_page_urls = set()
|
| 210 |
for r in all_results:
|
| 211 |
-
|
| 212 |
-
for
|
| 213 |
-
|
| 214 |
-
|
| 215 |
|
| 216 |
summary = {
|
| 217 |
'total_pages': len(all_results),
|
|
@@ -222,340 +200,279 @@ def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
|
| 222 |
'total_flags': sum(r.get('follow_flag_count', 0) for r in all_results),
|
| 223 |
'total_dups': sum(r.get('duplicate_count', 0) for r in all_results),
|
| 224 |
'total_sug': sum(len(r.get('suggestions', [])) for r in all_results),
|
| 225 |
-
'orphan_count': len(
|
| 226 |
-
'orphan_urls':
|
| 227 |
}
|
| 228 |
update_run_status(sb, run_id, "completed", len(all_results), summary)
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
log_lines.append("β Go to Past Runs tab β Refresh β Generate Report")
|
| 234 |
-
|
| 235 |
audit_state.set_running(False)
|
| 236 |
-
yield "\n".join(
|
| 237 |
|
| 238 |
except Exception as e:
|
| 239 |
-
|
| 240 |
audit_state.set_running(False)
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
pass
|
| 247 |
-
yield "\n".join(log_lines[-40:]), f"β Error β progress saved to Supabase"
|
| 248 |
|
| 249 |
|
| 250 |
def pause_audit():
|
| 251 |
if audit_state.is_running():
|
| 252 |
audit_state.request_pause()
|
| 253 |
-
return "βΈοΈ
|
| 254 |
return "No audit running."
|
| 255 |
|
| 256 |
|
| 257 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 258 |
-
# RESUME FUNCTION
|
| 259 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 260 |
-
|
| 261 |
def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
| 262 |
if sb is None:
|
| 263 |
yield "β Supabase not connected.", ""
|
| 264 |
return
|
| 265 |
-
|
| 266 |
if not run_id:
|
| 267 |
-
yield "β
|
| 268 |
return
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
remaining = [u for u in
|
| 273 |
|
| 274 |
if not remaining:
|
| 275 |
-
update_run_status(sb, run_id, "completed", len(
|
| 276 |
-
yield "β
|
| 277 |
return
|
| 278 |
|
| 279 |
-
# Get domain from run
|
| 280 |
try:
|
| 281 |
runs = get_all_runs(sb)
|
| 282 |
-
|
| 283 |
-
if
|
| 284 |
-
|
| 285 |
-
except:
|
| 286 |
-
pass
|
| 287 |
|
| 288 |
audit_state.set_running(True, run_id)
|
| 289 |
audit_state.resume()
|
| 290 |
update_run_status(sb, run_id, "running")
|
| 291 |
|
| 292 |
-
total = len(
|
| 293 |
-
batch_size = int(batch_size)
|
| 294 |
-
timeout = int(timeout)
|
| 295 |
-
workers = int(workers)
|
| 296 |
start_time = time.time()
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
yield "\n".join(log_lines), f"π Resuming: {len(done_urls)}/{total}"
|
| 301 |
|
| 302 |
try:
|
| 303 |
-
for
|
| 304 |
if audit_state.is_paused():
|
| 305 |
-
|
| 306 |
-
update_run_status(sb, run_id, "paused",
|
| 307 |
-
|
| 308 |
audit_state.set_running(False)
|
| 309 |
-
yield "\n".join(
|
| 310 |
return
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
|
| 317 |
-
for j, url in enumerate(
|
| 318 |
if audit_state.is_paused():
|
| 319 |
-
if
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
log_lines.append(f"βΈοΈ PAUSED at {completed}/{total}")
|
| 324 |
audit_state.set_running(False)
|
| 325 |
-
yield "\n".join(
|
| 326 |
return
|
| 327 |
|
| 328 |
-
|
| 329 |
elapsed = time.time() - start_time
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
suggestion_map=DEFAULT_SUGGESTION_MAP,
|
| 338 |
-
timeout=timeout, concurrent_workers=workers,
|
| 339 |
-
)
|
| 340 |
-
batch_results.append(result)
|
| 341 |
|
| 342 |
short = url.replace('https://www.', '').replace('https://', '')[:70]
|
| 343 |
if result['error']:
|
| 344 |
-
|
| 345 |
else:
|
| 346 |
b = result['broken_int_count'] + result['broken_ext_count']
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
'
|
| 378 |
-
'
|
| 379 |
-
'total_ext': sum(r.get('ext_count', 0) for r in all_results),
|
| 380 |
-
'total_broken': sum(r.get('broken_int_count', 0) + r.get('broken_ext_count', 0) for r in all_results),
|
| 381 |
-
'total_redirects': sum(r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0) for r in all_results),
|
| 382 |
-
'total_flags': sum(r.get('follow_flag_count', 0) for r in all_results),
|
| 383 |
-
'total_dups': sum(r.get('duplicate_count', 0) for r in all_results),
|
| 384 |
-
'total_sug': sum(len(r.get('suggestions', [])) for r in all_results),
|
| 385 |
-
'orphan_count': len(orphans),
|
| 386 |
-
'orphan_urls': orphans[:100],
|
| 387 |
}
|
| 388 |
-
update_run_status(sb, run_id, "completed", len(
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
log_lines.append(f"β
COMPLETE! {len(all_results)} pages in {total_time:.0f}s Β· {len(orphans)} orphans")
|
| 392 |
audit_state.set_running(False)
|
| 393 |
-
yield "\n".join(
|
| 394 |
|
| 395 |
except Exception as e:
|
| 396 |
-
|
| 397 |
audit_state.set_running(False)
|
| 398 |
try:
|
| 399 |
-
|
| 400 |
-
update_run_status(sb, run_id, "paused",
|
| 401 |
-
except:
|
| 402 |
-
|
| 403 |
-
yield "\n".join(log_lines[-40:]), "β Error β progress saved"
|
| 404 |
|
| 405 |
|
| 406 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 407 |
-
# PAST RUNS
|
| 408 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 409 |
|
| 410 |
-
def
|
|
|
|
| 411 |
if sb is None:
|
| 412 |
-
return "<p>β Supabase not connected</p>"
|
| 413 |
-
|
| 414 |
runs = get_all_runs(sb)
|
| 415 |
if not runs:
|
| 416 |
-
return "<p>No saved runs yet.</p>"
|
|
|
|
|
|
|
|
|
|
| 417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
choices = []
|
| 419 |
for r in runs:
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
total = r.get('total_urls', 0)
|
| 423 |
-
label = f"{r.get('name', 'Untitled')} [{status.upper()}] ({completed}/{total})"
|
| 424 |
choices.append((label, r['id']))
|
| 425 |
-
|
| 426 |
-
html = '<div style="max-height:400px;overflow-y:auto;">'
|
| 427 |
-
html += '<table style="width:100%;border-collapse:collapse;font-size:13px;">'
|
| 428 |
-
html += '<tr style="background:#f1f5f9;"><th style="padding:10px;text-align:left;">Run Name</th><th style="padding:10px;text-align:center;">Status</th><th style="padding:10px;text-align:center;">Pages</th><th style="padding:10px;text-align:center;">Broken</th><th style="padding:10px;text-align:center;">Flags</th><th style="padding:10px;text-align:center;">Dups</th><th style="padding:10px;text-align:center;">Orphans</th></tr>'
|
| 429 |
-
|
| 430 |
-
for r in runs:
|
| 431 |
-
summary = r.get('summary', {}) or {}
|
| 432 |
-
status = r.get('status', 'unknown')
|
| 433 |
-
s_colors = {'completed': '#059669', 'paused': '#d97706', 'running': '#2563eb'}
|
| 434 |
-
s_bg = {'completed': 'rgba(5,150,105,0.1)', 'paused': 'rgba(217,119,6,0.1)', 'running': 'rgba(37,99,235,0.1)'}
|
| 435 |
-
color = s_colors.get(status, '#888')
|
| 436 |
-
bg = s_bg.get(status, 'rgba(136,136,136,0.1)')
|
| 437 |
-
created = r.get('created_at', '')[:16].replace('T', ' ')
|
| 438 |
-
|
| 439 |
-
html += f'''<tr style="border-bottom:1px solid #e2e8f0;">
|
| 440 |
-
<td style="padding:10px;"><b>{r.get('name','Untitled')}</b><br><span style="font-size:11px;color:#94a3b8;">{created}</span></td>
|
| 441 |
-
<td style="padding:10px;text-align:center;"><span style="background:{bg};color:{color};padding:3px 10px;border-radius:12px;font-size:11px;font-weight:700;">{status.upper()}</span></td>
|
| 442 |
-
<td style="padding:10px;text-align:center;font-weight:700;">{r.get('completed_urls',0)}/{r.get('total_urls',0)}</td>
|
| 443 |
-
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('total_broken','β')}</td>
|
| 444 |
-
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('total_flags','β')}</td>
|
| 445 |
-
<td style="padding:10px;text-align:center;color:#db2777;font-weight:700;">{summary.get('total_dups','β')}</td>
|
| 446 |
-
<td style="padding:10px;text-align:center;color:#dc2626;font-weight:700;">{summary.get('orphan_count','β')}</td>
|
| 447 |
-
</tr>'''
|
| 448 |
-
|
| 449 |
-
html += '</table></div>'
|
| 450 |
-
return html, gr.Dropdown(choices=choices, value=choices[0][1] if choices else None)
|
| 451 |
|
| 452 |
|
| 453 |
def generate_report_for_run(run_id, domain):
|
| 454 |
if sb is None or not run_id:
|
| 455 |
-
return None, "β No run selected
|
| 456 |
-
|
| 457 |
try:
|
| 458 |
run = None
|
| 459 |
-
|
| 460 |
-
for r in runs:
|
| 461 |
if r['id'] == run_id:
|
| 462 |
run = r
|
| 463 |
break
|
| 464 |
-
|
| 465 |
pages = get_all_page_results(sb, run_id)
|
| 466 |
if not pages:
|
| 467 |
-
return None, "β No
|
| 468 |
-
|
| 469 |
results = [p['result'] for p in pages]
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
report_html = generate_report(results, orphan_urls, report_domain)
|
| 475 |
-
|
| 476 |
-
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.html', prefix='Link_Audit_')
|
| 477 |
-
tmp.write(report_html.encode('utf-8'))
|
| 478 |
tmp.close()
|
| 479 |
-
|
| 480 |
-
return tmp.name, f"β
Report generated β {len(results)} pages"
|
| 481 |
except Exception as e:
|
| 482 |
-
return None, f"β
|
| 483 |
|
| 484 |
|
| 485 |
def generate_csv_for_run(run_id):
|
| 486 |
if sb is None or not run_id:
|
| 487 |
return None, "β No run selected."
|
| 488 |
-
|
| 489 |
try:
|
| 490 |
pages = get_all_page_results(sb, run_id)
|
| 491 |
if not pages:
|
| 492 |
return None, "β No data."
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
'External': r.get('ext_count', 0),
|
| 501 |
-
'Broken': r.get('broken_int_count', 0) + r.get('broken_ext_count', 0),
|
| 502 |
-
'Redirects': r.get('redirect_int_count', 0) + r.get('redirect_ext_count', 0),
|
| 503 |
-
'Flags': r.get('follow_flag_count', 0),
|
| 504 |
-
'Duplicates': r.get('duplicate_count', 0),
|
| 505 |
-
'Error': r.get('error', ''),
|
| 506 |
-
})
|
| 507 |
-
|
| 508 |
-
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', prefix='Audit_CSV_')
|
| 509 |
pd.DataFrame(rows).to_csv(tmp.name, index=False)
|
| 510 |
tmp.close()
|
| 511 |
-
return tmp.name, f"β
CSV
|
| 512 |
except Exception as e:
|
| 513 |
-
return None, f"β
|
| 514 |
|
| 515 |
|
| 516 |
def delete_selected_run(run_id):
|
| 517 |
if sb is None or not run_id:
|
| 518 |
-
return "β No run selected."
|
| 519 |
try:
|
| 520 |
delete_run(sb, run_id)
|
| 521 |
-
|
| 522 |
-
return "ποΈ Run deleted.", html, dropdown
|
| 523 |
except Exception as e:
|
| 524 |
-
return f"β {str(e)}"
|
| 525 |
|
| 526 |
|
| 527 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 528 |
-
#
|
| 529 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 530 |
|
| 531 |
with gr.Blocks(title="Link Audit Tool", theme=gr.themes.Soft()) as app:
|
| 532 |
|
| 533 |
-
gr.HTML("""
|
| 534 |
-
<div style="background:linear-gradient(135deg,#1e3a5f,#2563eb);padding:24px 28px;border-radius:12px;color:white;margin-bottom:16px;">
|
| 535 |
<p style="font-size:10px;font-weight:700;letter-spacing:1.5px;text-transform:uppercase;color:#93c5fd;margin-bottom:8px;">SEO LINK AUDIT TOOL</p>
|
| 536 |
<h1 style="margin:0 0 4px 0;font-size:24px;">π Bulk Link Audit</h1>
|
| 537 |
-
<p style="margin:0;opacity:0.8;font-size:13px;">Upload URLs β batch crawl
|
| 538 |
-
</div>
|
| 539 |
-
""")
|
| 540 |
|
| 541 |
-
|
| 542 |
-
gr.Markdown(f"**{
|
| 543 |
|
| 544 |
with gr.Tabs():
|
| 545 |
|
| 546 |
-
# βββ TAB 1: NEW AUDIT βββ
|
| 547 |
with gr.Tab("π New Audit"):
|
| 548 |
with gr.Row():
|
| 549 |
with gr.Column(scale=2):
|
| 550 |
file_input = gr.File(label="Upload Excel / CSV", file_types=[".xlsx", ".csv", ".xls"])
|
| 551 |
-
pasted_urls = gr.Textbox(label="Or paste URLs (one per line)", lines=5
|
| 552 |
-
|
| 553 |
with gr.Column(scale=1):
|
| 554 |
domain_input = gr.Textbox(label="Your Domain", value="edstellar.com")
|
| 555 |
batch_size_input = gr.Slider(5, 50, value=25, step=5, label="Batch Size")
|
| 556 |
timeout_input = gr.Slider(5, 60, value=15, step=5, label="Timeout (s)")
|
| 557 |
-
delay_input = gr.Slider(0, 5, value=1.0, step=0.5, label="Delay
|
| 558 |
-
workers_input = gr.Slider(1, 10, value=5, step=1, label="Parallel
|
| 559 |
|
| 560 |
with gr.Row():
|
| 561 |
run_btn = gr.Button("π Run Audit", variant="primary", scale=2)
|
|
@@ -564,55 +481,44 @@ with gr.Blocks(title="Link Audit Tool", theme=gr.themes.Soft()) as app:
|
|
| 564 |
progress_text = gr.Textbox(label="Status", interactive=False)
|
| 565 |
log_output = gr.Textbox(label="Audit Log", lines=20, interactive=False)
|
| 566 |
|
| 567 |
-
run_btn.click(
|
| 568 |
-
fn=run_audit,
|
| 569 |
inputs=[file_input, pasted_urls, domain_input, batch_size_input, timeout_input, delay_input, workers_input],
|
| 570 |
-
outputs=[log_output, progress_text]
|
| 571 |
-
)
|
| 572 |
pause_btn.click(fn=pause_audit, outputs=[progress_text])
|
| 573 |
|
| 574 |
-
# βββ TAB 2: PAST RUNS βββ
|
| 575 |
with gr.Tab("π Past Runs"):
|
| 576 |
-
refresh_btn = gr.Button("π Refresh Runs", variant="secondary")
|
| 577 |
-
runs_html = gr.HTML(value="<p>Click Refresh to load past runs.</p>")
|
| 578 |
-
run_dropdown = gr.Dropdown(label="Select a Run", choices=[], interactive=True)
|
| 579 |
-
|
| 580 |
with gr.Row():
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
delete_btn = gr.Button("ποΈ Delete Run", variant="stop")
|
| 585 |
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
|
|
|
| 588 |
with gr.Row():
|
| 589 |
-
report_file = gr.File(label="
|
| 590 |
-
csv_file = gr.File(label="
|
| 591 |
|
| 592 |
-
gr.Markdown("---")
|
| 593 |
-
gr.Markdown("### Resume / Pause Controls")
|
| 594 |
resume_progress = gr.Textbox(label="Resume Status", interactive=False)
|
| 595 |
resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False)
|
| 596 |
resume_pause_btn = gr.Button("βΈοΈ Pause Resume", variant="stop")
|
| 597 |
|
| 598 |
-
#
|
| 599 |
-
refresh_btn.click(fn=
|
| 600 |
-
|
| 601 |
-
report_btn.click(
|
| 602 |
-
fn=generate_report_for_run,
|
| 603 |
-
inputs=[run_dropdown, domain_input],
|
| 604 |
-
outputs=[report_file, action_status],
|
| 605 |
-
)
|
| 606 |
|
|
|
|
| 607 |
csv_btn.click(fn=generate_csv_for_run, inputs=[run_dropdown], outputs=[csv_file, action_status])
|
|
|
|
| 608 |
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
resume_btn.click(
|
| 612 |
-
fn=resume_audit,
|
| 613 |
inputs=[run_dropdown, domain_input, batch_size_input, timeout_input, delay_input, workers_input],
|
| 614 |
-
outputs=[resume_log, resume_progress]
|
| 615 |
-
)
|
| 616 |
resume_pause_btn.click(fn=pause_audit, outputs=[resume_progress])
|
| 617 |
|
| 618 |
|
|
|
|
| 65 |
audit_state = AuditState()
|
| 66 |
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def run_audit(file, pasted_urls, domain, batch_size, timeout, delay, workers):
|
| 69 |
if sb is None:
|
| 70 |
+
yield "β Supabase not connected.", ""
|
| 71 |
return
|
| 72 |
|
|
|
|
| 73 |
urls = []
|
| 74 |
if file is not None:
|
| 75 |
try:
|
| 76 |
fpath = file.name if hasattr(file, 'name') else file
|
| 77 |
+
df = pd.read_csv(fpath) if str(fpath).endswith('.csv') else pd.read_excel(fpath)
|
|
|
|
|
|
|
|
|
|
| 78 |
url_col = None
|
| 79 |
for col in df.columns:
|
| 80 |
sample = str(df[col].iloc[0]).strip().lower()
|
|
|
|
| 91 |
urls = [u.strip() for u in pasted_urls.strip().split('\n') if u.strip().startswith('http')]
|
| 92 |
|
| 93 |
if not urls:
|
| 94 |
+
yield "β No valid URLs.", ""
|
| 95 |
return
|
| 96 |
|
|
|
|
| 97 |
seen = set()
|
| 98 |
+
unique = []
|
| 99 |
for u in urls:
|
| 100 |
if u not in seen:
|
| 101 |
seen.add(u)
|
| 102 |
+
unique.append(u)
|
| 103 |
+
urls = unique
|
| 104 |
|
| 105 |
run_name = f"{domain} Audit β {datetime.now().strftime('%b %d %H:%M')} β {len(urls)} pages"
|
| 106 |
run_id = create_run(sb, run_name, domain, len(urls), urls)
|
|
|
|
| 107 |
audit_state.set_running(True, run_id)
|
| 108 |
audit_state.resume()
|
| 109 |
|
| 110 |
total = len(urls)
|
| 111 |
+
batch_size, timeout, workers = int(batch_size), int(timeout), int(workers)
|
|
|
|
|
|
|
| 112 |
start_time = time.time()
|
| 113 |
batch_num = 0
|
| 114 |
+
log = []
|
| 115 |
|
| 116 |
+
yield f"π {run_name}\nπ¦ {total} URLs Β· Batch: {batch_size}", "βΆοΈ Running..."
|
| 117 |
|
| 118 |
try:
|
| 119 |
+
for bs in range(0, total, batch_size):
|
| 120 |
if audit_state.is_paused():
|
| 121 |
+
c = get_completed_count(sb, run_id)
|
| 122 |
+
update_run_status(sb, run_id, "paused", c)
|
| 123 |
+
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 124 |
audit_state.set_running(False)
|
| 125 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"
|
| 126 |
return
|
| 127 |
|
| 128 |
+
be = min(bs + batch_size, total)
|
| 129 |
+
batch_urls = urls[bs:be]
|
| 130 |
batch_num += 1
|
| 131 |
batch_results = []
|
| 132 |
|
|
|
|
| 134 |
if audit_state.is_paused():
|
| 135 |
if batch_results:
|
| 136 |
save_batch_results(sb, run_id, batch_results)
|
| 137 |
+
c = get_completed_count(sb, run_id)
|
| 138 |
+
update_run_status(sb, run_id, "paused", c)
|
| 139 |
+
log.append(f"βΈοΈ PAUSED at {c}/{total}")
|
| 140 |
audit_state.set_running(False)
|
| 141 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused β {c}/{total}"
|
| 142 |
return
|
| 143 |
|
| 144 |
+
gi = bs + j + 1
|
| 145 |
elapsed = time.time() - start_time
|
| 146 |
+
eta = (elapsed / gi) * (total - gi)
|
| 147 |
+
eta_s = f"{int(eta//60)}m{int(eta%60)}s" if eta > 60 else f"{eta:.0f}s"
|
| 148 |
+
|
| 149 |
+
result = audit_page(url, domain, DEFAULT_BODY_SELECTORS,
|
| 150 |
+
suggestion_map=DEFAULT_SUGGESTION_MAP, timeout=timeout, concurrent_workers=workers)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
batch_results.append(result)
|
| 152 |
|
| 153 |
short = url.replace('https://www.', '').replace('https://', '')[:70]
|
| 154 |
if result['error']:
|
| 155 |
+
log.append(f"β [{gi}/{total}] {short} β {result['error'][:50]}")
|
| 156 |
else:
|
| 157 |
b = result['broken_int_count'] + result['broken_ext_count']
|
| 158 |
fc = result['follow_flag_count']
|
| 159 |
d = result['duplicate_count']
|
| 160 |
+
fl = []
|
| 161 |
+
if b: fl.append(f"π΄{b}broken")
|
| 162 |
+
if fc: fl.append(f"π‘{fc}flags")
|
| 163 |
+
if d: fl.append(f"π£{d}dups")
|
| 164 |
+
fs = " ".join(fl) if fl else "β
"
|
| 165 |
+
log.append(f"[{gi}/{total}] {short} β Int:{result['int_count']} Ext:{result['ext_count']} {fs}")
|
| 166 |
+
|
| 167 |
+
yield "\n".join(log[-40:]), f"π {gi}/{total} ({gi*100//total}%) Batch{batch_num} ETA:{eta_s}"
|
|
|
|
| 168 |
if j < len(batch_urls) - 1:
|
| 169 |
time.sleep(delay)
|
| 170 |
|
|
|
|
| 171 |
if batch_results:
|
| 172 |
try:
|
| 173 |
save_batch_results(sb, run_id, batch_results)
|
| 174 |
+
c = get_completed_count(sb, run_id)
|
| 175 |
+
update_run_status(sb, run_id, "running", c)
|
| 176 |
+
log.append(f"πΎ Batch {batch_num} saved β {c}/{total}")
|
| 177 |
except Exception as e:
|
| 178 |
+
log.append(f"β Save error: {str(e)[:60]}")
|
| 179 |
+
yield "\n".join(log[-40:]), f"πΎ Batch {batch_num} saved"
|
|
|
|
| 180 |
del batch_results
|
| 181 |
|
| 182 |
+
log.append("π Orphan analysis...")
|
| 183 |
+
yield "\n".join(log[-40:]), "π Orphan analysis..."
|
|
|
|
| 184 |
|
| 185 |
all_pages = get_all_page_results(sb, run_id)
|
| 186 |
all_results = [p['result'] for p in all_pages]
|
| 187 |
+
targets, pg_urls = set(), set()
|
|
|
|
|
|
|
| 188 |
for r in all_results:
|
| 189 |
+
pg_urls.add(r['url'].rstrip('/').split('?')[0])
|
| 190 |
+
for lk in r.get('internal_links', []):
|
| 191 |
+
targets.add(lk['url'].rstrip('/').split('?')[0])
|
| 192 |
+
orphans = sorted([p for p in pg_urls if p not in targets])
|
| 193 |
|
| 194 |
summary = {
|
| 195 |
'total_pages': len(all_results),
|
|
|
|
| 200 |
'total_flags': sum(r.get('follow_flag_count', 0) for r in all_results),
|
| 201 |
'total_dups': sum(r.get('duplicate_count', 0) for r in all_results),
|
| 202 |
'total_sug': sum(len(r.get('suggestions', [])) for r in all_results),
|
| 203 |
+
'orphan_count': len(orphans),
|
| 204 |
+
'orphan_urls': orphans[:100],
|
| 205 |
}
|
| 206 |
update_run_status(sb, run_id, "completed", len(all_results), summary)
|
| 207 |
+
tt = time.time() - start_time
|
| 208 |
+
log.append(f"β
DONE! {len(all_results)} pages in {tt:.0f}s Β· {len(orphans)} orphans")
|
| 209 |
+
log.append(f"Broken:{summary['total_broken']} Redirects:{summary['total_redirects']} Flags:{summary['total_flags']} Dups:{summary['total_dups']}")
|
| 210 |
+
log.append("β Past Runs β Refresh β Generate Report")
|
|
|
|
|
|
|
| 211 |
audit_state.set_running(False)
|
| 212 |
+
yield "\n".join(log[-40:]), f"β
Done β {len(all_results)} pages in {tt:.0f}s"
|
| 213 |
|
| 214 |
except Exception as e:
|
| 215 |
+
log.append(f"β {str(e)}")
|
| 216 |
audit_state.set_running(False)
|
| 217 |
+
try:
|
| 218 |
+
c = get_completed_count(sb, run_id)
|
| 219 |
+
update_run_status(sb, run_id, "paused", c)
|
| 220 |
+
except: pass
|
| 221 |
+
yield "\n".join(log[-40:]), "β Error β progress saved"
|
|
|
|
|
|
|
| 222 |
|
| 223 |
|
| 224 |
def pause_audit():
|
| 225 |
if audit_state.is_running():
|
| 226 |
audit_state.request_pause()
|
| 227 |
+
return "βΈοΈ Stopping after current page..."
|
| 228 |
return "No audit running."
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def resume_audit(run_id, domain, batch_size, timeout, delay, workers):
|
| 232 |
if sb is None:
|
| 233 |
yield "β Supabase not connected.", ""
|
| 234 |
return
|
|
|
|
| 235 |
if not run_id:
|
| 236 |
+
yield "β Select a run first.", ""
|
| 237 |
return
|
| 238 |
|
| 239 |
+
all_urls = get_pending_urls(sb, run_id)
|
| 240 |
+
done = get_completed_urls(sb, run_id)
|
| 241 |
+
remaining = [u for u in all_urls if u not in done]
|
| 242 |
|
| 243 |
if not remaining:
|
| 244 |
+
update_run_status(sb, run_id, "completed", len(done))
|
| 245 |
+
yield "β
Already complete!", ""
|
| 246 |
return
|
| 247 |
|
|
|
|
| 248 |
try:
|
| 249 |
runs = get_all_runs(sb)
|
| 250 |
+
rd = next((r for r in runs if r['id'] == run_id), None)
|
| 251 |
+
if rd: domain = rd.get('domain', domain)
|
| 252 |
+
except: pass
|
|
|
|
|
|
|
| 253 |
|
| 254 |
audit_state.set_running(True, run_id)
|
| 255 |
audit_state.resume()
|
| 256 |
update_run_status(sb, run_id, "running")
|
| 257 |
|
| 258 |
+
total = len(all_urls)
|
| 259 |
+
batch_size, timeout, workers = int(batch_size), int(timeout), int(workers)
|
|
|
|
|
|
|
| 260 |
start_time = time.time()
|
| 261 |
+
bn = 0
|
| 262 |
+
log = [f"βΆοΈ Resuming β {len(remaining)} left ({len(done)} done)"]
|
| 263 |
+
yield "\n".join(log), f"Resuming: {len(done)}/{total}"
|
|
|
|
| 264 |
|
| 265 |
try:
|
| 266 |
+
for bs in range(0, len(remaining), batch_size):
|
| 267 |
if audit_state.is_paused():
|
| 268 |
+
c = get_completed_count(sb, run_id)
|
| 269 |
+
update_run_status(sb, run_id, "paused", c)
|
| 270 |
+
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
| 271 |
audit_state.set_running(False)
|
| 272 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"
|
| 273 |
return
|
| 274 |
|
| 275 |
+
be = min(bs + batch_size, len(remaining))
|
| 276 |
+
bu = remaining[bs:be]
|
| 277 |
+
bn += 1
|
| 278 |
+
br = []
|
| 279 |
|
| 280 |
+
for j, url in enumerate(bu):
|
| 281 |
if audit_state.is_paused():
|
| 282 |
+
if br: save_batch_results(sb, run_id, br)
|
| 283 |
+
c = get_completed_count(sb, run_id)
|
| 284 |
+
update_run_status(sb, run_id, "paused", c)
|
| 285 |
+
log.append(f"βΈοΈ PAUSED {c}/{total}")
|
|
|
|
| 286 |
audit_state.set_running(False)
|
| 287 |
+
yield "\n".join(log[-40:]), f"βΈοΈ Paused {c}/{total}"
|
| 288 |
return
|
| 289 |
|
| 290 |
+
gi = len(done) + bs + j + 1
|
| 291 |
elapsed = time.time() - start_time
|
| 292 |
+
proc = bs + j + 1
|
| 293 |
+
eta = (elapsed / proc) * (len(remaining) - proc)
|
| 294 |
+
eta_s = f"{int(eta//60)}m{int(eta%60)}s" if eta > 60 else f"{eta:.0f}s"
|
| 295 |
+
|
| 296 |
+
result = audit_page(url, domain, DEFAULT_BODY_SELECTORS,
|
| 297 |
+
suggestion_map=DEFAULT_SUGGESTION_MAP, timeout=timeout, concurrent_workers=workers)
|
| 298 |
+
br.append(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
short = url.replace('https://www.', '').replace('https://', '')[:70]
|
| 301 |
if result['error']:
|
| 302 |
+
log.append(f"β [{gi}/{total}] {short}")
|
| 303 |
else:
|
| 304 |
b = result['broken_int_count'] + result['broken_ext_count']
|
| 305 |
+
log.append(f"[{gi}/{total}] {short} {'π΄'+str(b) if b else 'β
'}")
|
| 306 |
+
|
| 307 |
+
yield "\n".join(log[-40:]), f"π {gi}/{total} ({gi*100//total}%) ETA:{eta_s}"
|
| 308 |
+
if j < len(bu) - 1: time.sleep(delay)
|
| 309 |
+
|
| 310 |
+
if br:
|
| 311 |
+
save_batch_results(sb, run_id, br)
|
| 312 |
+
c = get_completed_count(sb, run_id)
|
| 313 |
+
update_run_status(sb, run_id, "running", c)
|
| 314 |
+
log.append(f"πΎ Batch {bn} β {c}/{total}")
|
| 315 |
+
del br
|
| 316 |
+
|
| 317 |
+
log.append("π Orphan analysis...")
|
| 318 |
+
yield "\n".join(log[-40:]), "π Orphans..."
|
| 319 |
+
|
| 320 |
+
ap = get_all_page_results(sb, run_id)
|
| 321 |
+
ar = [p['result'] for p in ap]
|
| 322 |
+
tgt, pg = set(), set()
|
| 323 |
+
for r in ar:
|
| 324 |
+
pg.add(r['url'].rstrip('/').split('?')[0])
|
| 325 |
+
for lk in r.get('internal_links', []): tgt.add(lk['url'].rstrip('/').split('?')[0])
|
| 326 |
+
orph = sorted([p for p in pg if p not in tgt])
|
| 327 |
+
|
| 328 |
+
fs = {
|
| 329 |
+
'total_pages': len(ar), 'total_int': sum(r.get('int_count',0) for r in ar),
|
| 330 |
+
'total_ext': sum(r.get('ext_count',0) for r in ar),
|
| 331 |
+
'total_broken': sum(r.get('broken_int_count',0)+r.get('broken_ext_count',0) for r in ar),
|
| 332 |
+
'total_redirects': sum(r.get('redirect_int_count',0)+r.get('redirect_ext_count',0) for r in ar),
|
| 333 |
+
'total_flags': sum(r.get('follow_flag_count',0) for r in ar),
|
| 334 |
+
'total_dups': sum(r.get('duplicate_count',0) for r in ar),
|
| 335 |
+
'total_sug': sum(len(r.get('suggestions',[])) for r in ar),
|
| 336 |
+
'orphan_count': len(orph), 'orphan_urls': orph[:100],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
}
|
| 338 |
+
update_run_status(sb, run_id, "completed", len(ar), fs)
|
| 339 |
+
tt = time.time() - start_time
|
| 340 |
+
log.append(f"β
DONE! {len(ar)} pages in {tt:.0f}s Β· {len(orph)} orphans")
|
|
|
|
| 341 |
audit_state.set_running(False)
|
| 342 |
+
yield "\n".join(log[-40:]), f"β
Done β {len(ar)} pages"
|
| 343 |
|
| 344 |
except Exception as e:
|
| 345 |
+
log.append(f"β {str(e)}")
|
| 346 |
audit_state.set_running(False)
|
| 347 |
try:
|
| 348 |
+
c = get_completed_count(sb, run_id)
|
| 349 |
+
update_run_status(sb, run_id, "paused", c)
|
| 350 |
+
except: pass
|
| 351 |
+
yield "\n".join(log[-40:]), "β Error"
|
|
|
|
| 352 |
|
| 353 |
|
| 354 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 355 |
+
# PAST RUNS β returns ONLY strings, no component objects
|
| 356 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 357 |
|
| 358 |
+
def load_runs_html():
|
| 359 |
+
"""Returns HTML table only."""
|
| 360 |
if sb is None:
|
| 361 |
+
return "<p>β Supabase not connected</p>"
|
|
|
|
| 362 |
runs = get_all_runs(sb)
|
| 363 |
if not runs:
|
| 364 |
+
return "<p>No saved runs yet.</p>"
|
| 365 |
+
|
| 366 |
+
html = '<table style="width:100%;border-collapse:collapse;font-size:13px;">'
|
| 367 |
+
html += '<tr style="background:#f1f5f9;"><th style="padding:8px;text-align:left;">Run</th><th style="padding:8px;text-align:center;">Status</th><th style="padding:8px;text-align:center;">Pages</th><th style="padding:8px;text-align:center;">Broken</th><th style="padding:8px;text-align:center;">Flags</th><th style="padding:8px;text-align:center;">Dups</th><th style="padding:8px;text-align:center;">Orphans</th></tr>'
|
| 368 |
|
| 369 |
+
for r in runs:
|
| 370 |
+
s = r.get('summary', {}) or {}
|
| 371 |
+
st = r.get('status', '?')
|
| 372 |
+
sc = {'completed':'#059669','paused':'#d97706','running':'#2563eb'}.get(st,'#888')
|
| 373 |
+
sb2 = {'completed':'rgba(5,150,105,0.1)','paused':'rgba(217,119,6,0.1)','running':'rgba(37,99,235,0.1)'}.get(st,'rgba(136,136,136,0.1)')
|
| 374 |
+
cr = r.get('created_at','')[:16].replace('T',' ')
|
| 375 |
+
html += f'<tr style="border-bottom:1px solid #e2e8f0;"><td style="padding:8px;"><b>{r.get("name","?")}</b><br><span style="font-size:10px;color:#94a3b8;">{cr}</span></td><td style="padding:8px;text-align:center;"><span style="background:{sb2};color:{sc};padding:2px 8px;border-radius:10px;font-size:10px;font-weight:700;">{st.upper()}</span></td><td style="padding:8px;text-align:center;font-weight:700;">{r.get("completed_urls",0)}/{r.get("total_urls",0)}</td><td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{s.get("total_broken","β")}</td><td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{s.get("total_flags","β")}</td><td style="padding:8px;text-align:center;color:#db2777;font-weight:700;">{s.get("total_dups","β")}</td><td style="padding:8px;text-align:center;color:#dc2626;font-weight:700;">{s.get("orphan_count","β")}</td></tr>'
|
| 376 |
+
html += '</table>'
|
| 377 |
+
return html
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def load_runs_choices():
    """Build (label, run_id) pairs for the run-selection dropdown."""
    if sb is None:
        return []
    runs = get_all_runs(sb)
    if not runs:
        return []
    # Label shows name, status badge, and completion ratio for each run.
    return [
        (
            f"{run.get('name', '?')} [{run.get('status', '?').upper()}] "
            f"({run.get('completed_urls', 0)}/{run.get('total_urls', 0)})",
            run['id'],
        )
        for run in runs
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
|
| 395 |
def generate_report_for_run(run_id, domain):
    """Generate and save the HTML audit report for a stored run.

    Args:
        run_id: Supabase id of the run to report on; falsy means nothing selected.
        domain: Fallback domain used when the run record carries none.

    Returns:
        Tuple of (file_path, status_message); file_path is None on failure.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected."
    try:
        # Locate the run record for its summary/domain metadata; may be absent.
        run = next((r for r in get_all_runs(sb) if r['id'] == run_id), None)
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "❌ No data."
        results = [p['result'] for p in pages]
        summary = (run.get('summary', {}) or {}) if run else {}
        report_html = generate_report(
            results,
            summary.get('orphan_urls', []),
            run.get('domain', domain) if run else domain,
        )
        # delete=False so the file survives for download after the handle closes;
        # the `with` guarantees the handle is closed even if write() raises.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.html',
                                         prefix='Audit_') as tmp:
            tmp.write(report_html.encode('utf-8'))
        return tmp.name, f"✅ Report — {len(results)} pages"
    except Exception as e:
        return None, f"❌ {str(e)}"
|
| 416 |
|
| 417 |
|
| 418 |
def generate_csv_for_run(run_id):
    """Export a stored run's per-page link counts to a CSV file.

    Args:
        run_id: Supabase id of the run to export; falsy means nothing selected.

    Returns:
        Tuple of (file_path, status_message); file_path is None on failure.
    """
    if sb is None or not run_id:
        return None, "❌ No run selected."
    try:
        pages = get_all_page_results(sb, run_id)
        if not pages:
            return None, "❌ No data."
        rows = []
        for p in pages:
            res = p['result']
            rows.append({
                'URL': res.get('url', ''),
                'Internal': res.get('int_count', 0),
                'External': res.get('ext_count', 0),
                'Broken': res.get('broken_int_count', 0) + res.get('broken_ext_count', 0),
                'Redirects': res.get('redirect_int_count', 0) + res.get('redirect_ext_count', 0),
                'Flags': res.get('follow_flag_count', 0),
                'Dups': res.get('duplicate_count', 0),
            })
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', prefix='Audit_')
        # Close the handle before pandas re-opens the path by name — writing
        # to a still-open NamedTemporaryFile path fails on Windows.
        tmp.close()
        pd.DataFrame(rows).to_csv(tmp.name, index=False)
        return tmp.name, f"✅ CSV — {len(rows)} rows"
    except Exception as e:
        return None, f"❌ {str(e)}"
|
| 437 |
|
| 438 |
|
| 439 |
def delete_selected_run(run_id):
    """Delete the selected run from Supabase and return a status message."""
    if sb is None or not run_id:
        return "❌ No run selected."
    try:
        delete_run(sb, run_id)
    except Exception as e:
        return f"❌ {str(e)}"
    return "🗑️ Deleted. Click Refresh."
|
| 447 |
|
| 448 |
|
| 449 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 450 |
+
# UI
|
| 451 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 452 |
|
| 453 |
with gr.Blocks(title="Link Audit Tool", theme=gr.themes.Soft()) as app:
|
| 454 |
|
| 455 |
+
gr.HTML("""<div style="background:linear-gradient(135deg,#1e3a5f,#2563eb);padding:24px 28px;border-radius:12px;color:white;margin-bottom:16px;">
|
|
|
|
| 456 |
<p style="font-size:10px;font-weight:700;letter-spacing:1.5px;text-transform:uppercase;color:#93c5fd;margin-bottom:8px;">SEO LINK AUDIT TOOL</p>
|
| 457 |
<h1 style="margin:0 0 4px 0;font-size:24px;">π Bulk Link Audit</h1>
|
| 458 |
+
<p style="margin:0;opacity:0.8;font-size:13px;">Upload URLs β batch crawl β pause/resume β generate report</p></div>""")
|
|
|
|
|
|
|
| 459 |
|
| 460 |
+
conn = "ποΈ β
Supabase Connected" if sb else "ποΈ β Not Connected"
|
| 461 |
+
gr.Markdown(f"**{conn}**")
|
| 462 |
|
| 463 |
with gr.Tabs():
|
| 464 |
|
|
|
|
| 465 |
with gr.Tab("π New Audit"):
|
| 466 |
with gr.Row():
|
| 467 |
with gr.Column(scale=2):
|
| 468 |
file_input = gr.File(label="Upload Excel / CSV", file_types=[".xlsx", ".csv", ".xls"])
|
| 469 |
+
pasted_urls = gr.Textbox(label="Or paste URLs (one per line)", lines=5)
|
|
|
|
| 470 |
with gr.Column(scale=1):
|
| 471 |
domain_input = gr.Textbox(label="Your Domain", value="edstellar.com")
|
| 472 |
batch_size_input = gr.Slider(5, 50, value=25, step=5, label="Batch Size")
|
| 473 |
timeout_input = gr.Slider(5, 60, value=15, step=5, label="Timeout (s)")
|
| 474 |
+
delay_input = gr.Slider(0, 5, value=1.0, step=0.5, label="Delay (s)")
|
| 475 |
+
workers_input = gr.Slider(1, 10, value=5, step=1, label="Parallel checks")
|
| 476 |
|
| 477 |
with gr.Row():
|
| 478 |
run_btn = gr.Button("π Run Audit", variant="primary", scale=2)
|
|
|
|
| 481 |
progress_text = gr.Textbox(label="Status", interactive=False)
|
| 482 |
log_output = gr.Textbox(label="Audit Log", lines=20, interactive=False)
|
| 483 |
|
| 484 |
+
run_btn.click(fn=run_audit,
|
|
|
|
| 485 |
inputs=[file_input, pasted_urls, domain_input, batch_size_input, timeout_input, delay_input, workers_input],
|
| 486 |
+
outputs=[log_output, progress_text])
|
|
|
|
| 487 |
pause_btn.click(fn=pause_audit, outputs=[progress_text])
|
| 488 |
|
|
|
|
| 489 |
with gr.Tab("π Past Runs"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
with gr.Row():
|
| 491 |
+
refresh_btn = gr.Button("π Refresh", variant="secondary")
|
| 492 |
+
runs_html = gr.HTML(value="<p>Click Refresh to load.</p>")
|
| 493 |
+
run_dropdown = gr.Dropdown(label="Select Run", choices=[], interactive=True, type="value")
|
|
|
|
| 494 |
|
| 495 |
+
with gr.Row():
|
| 496 |
+
report_btn = gr.Button("π HTML Report", variant="primary")
|
| 497 |
+
csv_btn = gr.Button("π CSV", variant="secondary")
|
| 498 |
+
resume_btn = gr.Button("βΆοΈ Resume", variant="primary")
|
| 499 |
+
delete_btn = gr.Button("ποΈ Delete", variant="stop")
|
| 500 |
|
| 501 |
+
action_status = gr.Textbox(label="Status", interactive=False)
|
| 502 |
with gr.Row():
|
| 503 |
+
report_file = gr.File(label="Report Download", interactive=False)
|
| 504 |
+
csv_file = gr.File(label="CSV Download", interactive=False)
|
| 505 |
|
| 506 |
+
gr.Markdown("---\n### Resume Controls")
|
|
|
|
| 507 |
resume_progress = gr.Textbox(label="Resume Status", interactive=False)
|
| 508 |
resume_log = gr.Textbox(label="Resume Log", lines=15, interactive=False)
|
| 509 |
resume_pause_btn = gr.Button("βΈοΈ Pause Resume", variant="stop")
|
| 510 |
|
| 511 |
+
# Refresh: update HTML and dropdown separately
|
| 512 |
+
refresh_btn.click(fn=load_runs_html, outputs=[runs_html])
|
| 513 |
+
refresh_btn.click(fn=load_runs_choices, outputs=[run_dropdown])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
+
report_btn.click(fn=generate_report_for_run, inputs=[run_dropdown, domain_input], outputs=[report_file, action_status])
|
| 516 |
csv_btn.click(fn=generate_csv_for_run, inputs=[run_dropdown], outputs=[csv_file, action_status])
|
| 517 |
+
delete_btn.click(fn=delete_selected_run, inputs=[run_dropdown], outputs=[action_status])
|
| 518 |
|
| 519 |
+
resume_btn.click(fn=resume_audit,
|
|
|
|
|
|
|
|
|
|
| 520 |
inputs=[run_dropdown, domain_input, batch_size_input, timeout_input, delay_input, workers_input],
|
| 521 |
+
outputs=[resume_log, resume_progress])
|
|
|
|
| 522 |
resume_pause_btn.click(fn=pause_audit, outputs=[resume_progress])
|
| 523 |
|
| 524 |
|