"""Browser-based URL verifier — fallback for hosts that block scripted HTTP. Many insurer / aggregator sites (Akamai, Cloudflare) return 403/503 to httpx but load fine in a real browser. This tool uses headless Chromium (via Playwright) to verify those URLs the way a human would. Two modes: 1. Standalone: `python tools/browser_verify.py [urls...]` - Reads URLs from argv or MUST_FIX.md - Renders each in headless Chromium - Writes results to tools/browser_verified.json 2. Library: `from tools.browser_verify import verify_one` - check_link_rot.py imports this to retry after httpx failure Verdict rules: ALIVE — page loaded with status 200-399 AND title doesn't say "404/not found/error" DEAD — main-resource HTTP 4xx/5xx OR title says 404 TIMEOUT — page didn't finish loading within 25s The allowlist (tools/browser_verified.json) is checked-in so the daily cron trusts past verifications for 30 days before re-running browser checks. """ from __future__ import annotations import json import re import sys import time from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent ALLOWLIST = PROJECT_ROOT / "tools" / "browser_verified.json" MUST_FIX = PROJECT_ROOT / "MUST_FIX.md" DEAD_TITLE_PATTERNS = re.compile( r"(?i)404|not[\s-]found|page[\s-]not[\s-]available|error[\s-]occurred|access[\s-]denied" ) def verify_one(url: str, page, context) -> dict: """Render `url` and return a verdict dict. PDFs use the browser's request stack (request.head) — chromium would try to download them which fails the page.goto contract. HTML uses full navigation so the JS challenge from Akamai/DataDome resolves. """ is_pdf = url.lower().endswith(".pdf") or "/pdf/" in url.lower() last_err = "" if is_pdf: # Many insurer CDNs (Star Health on Akamai, ICICI) require a challenge # cookie from the parent host before serving PDFs. Warm up if 403. warmed = False for attempt in range(3): try: resp = context.request.get(url, timeout=60000, max_redirects=10) status = resp.status content_type = resp.headers.get("content-type", "") body_head = b"" try: body_head = resp.body()[:5] if status == 200 else b"" except Exception: # noqa: BLE001 body_head = b"" is_real_pdf = body_head == b"%PDF-" or "pdf" in content_type.lower() if 200 <= status < 400 and is_real_pdf: return {"verdict": "ALIVE", "status": status, "title": "[pdf]", "reason": f"ct={content_type}", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} # 403 from Akamai-style bot defence — try cookie warmup on parent host if status == 403 and not warmed: try: host_root = re.sub(r"(https?://[^/]+).*", r"\1", url) page.goto(host_root, wait_until="domcontentloaded", timeout=30000) page.wait_for_timeout(2500) # let JS challenge resolve warmed = True continue except Exception: # noqa: BLE001 pass if status >= 400: return {"verdict": "DEAD", "status": status, "title": "[pdf]", "reason": f"main_status={status}", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} return {"verdict": "DEAD", "status": status, "title": "[pdf]", "reason": f"not_pdf ct={content_type} head={body_head!r}", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} except Exception as e: # noqa: BLE001 last_err = str(e).splitlines()[0][:160] if attempt < 2: time.sleep(2) continue return {"verdict": "ERROR", "status": 0, "title": "[pdf]", "reason": last_err, "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} # HTML path for attempt in range(2): try: response = page.goto(url, wait_until="domcontentloaded", timeout=45000) status = response.status if response else 0 title = page.title() or "" if status >= 400: return {"verdict": "DEAD", "status": status, "title": title[:120], "reason": f"main_status={status}", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} if DEAD_TITLE_PATTERNS.search(title): return {"verdict": "DEAD", "status": status, "title": title[:120], "reason": f"title={title!r}", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} return {"verdict": "ALIVE", "status": status, "title": title[:120], "reason": "", "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} except Exception as e: # noqa: BLE001 last_err = str(e).splitlines()[0][:160] if attempt == 0: time.sleep(2) continue verdict = "TIMEOUT" if "Timeout" in last_err or "timeout" in last_err else "ERROR" return {"verdict": verdict, "status": 0, "title": "", "reason": last_err, "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} return {"verdict": "ERROR", "status": 0, "title": "", "reason": last_err, "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z")} def parse_must_fix() -> list[str]: """Extract URLs from MUST_FIX.md table.""" if not MUST_FIX.exists(): return [] urls: list[str] = [] for line in MUST_FIX.read_text().splitlines(): if not line.startswith("|") or "url" in line.lower() and "status" in line.lower(): continue parts = [p.strip() for p in line.strip("|").split("|")] if len(parts) >= 2: m = re.search(r"https?://\S+", parts[1]) if m: urls.append(m.group(0)) return urls def load_allowlist() -> dict: if ALLOWLIST.exists(): try: return json.loads(ALLOWLIST.read_text()) except json.JSONDecodeError: return {} return {} def save_allowlist(d: dict) -> None: ALLOWLIST.write_text(json.dumps(d, indent=2, sort_keys=True)) def main(argv: list[str]) -> int: from playwright.sync_api import sync_playwright urls = argv[1:] if len(argv) > 1 else parse_must_fix() if not urls: print("[browser-verify] no URLs to check — pass on argv or populate MUST_FIX.md") return 0 allowlist = load_allowlist() counts = {"ALIVE": 0, "DEAD": 0, "TIMEOUT": 0, "ERROR": 0} from playwright_stealth import Stealth with sync_playwright() as p: # Launch with anti-bot-detection flags. Akamai/Cloudflare detect # chrome-headless-shell easily; we use full chromium + stealth tricks. browser = p.chromium.launch( headless=True, args=[ "--disable-blink-features=AutomationControlled", "--disable-features=IsolateOrigins,site-per-process", "--no-sandbox", ], ) context = browser.new_context( user_agent=( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/126.0 Safari/537.36" ), viewport={"width": 1366, "height": 800}, locale="en-IN", extra_http_headers={ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-IN,en;q=0.9", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Ch-Ua": '"Chromium";v="126", "Google Chrome";v="126"', "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": '"macOS"', }, ) Stealth().apply_stealth_sync(context) page = context.new_page() for i, url in enumerate(urls, 1): print(f" [{i}/{len(urls)}] {url[:90]}", flush=True) v = verify_one(url, page, context) counts[v["verdict"]] = counts.get(v["verdict"], 0) + 1 if v["verdict"] == "ALIVE": allowlist[url] = v print(f" -> {v['verdict']} | {v.get('status', '')} | {v.get('title', '')[:80]}") browser.close() save_allowlist(allowlist) print( f"\n[browser-verify] {sum(counts.values())} URLs checked | " f"ALIVE={counts['ALIVE']} DEAD={counts['DEAD']} " f"TIMEOUT={counts['TIMEOUT']} ERROR={counts['ERROR']}" ) print(f"[browser-verify] allowlist saved -> {ALLOWLIST.relative_to(PROJECT_ROOT)}") return 0 if __name__ == "__main__": sys.exit(main(sys.argv))