Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tools /browser_verify.py

rohitsar567

Deploy v1 — single-Docker FastAPI + Next.js + RAG + voice + faithfulness

9c879c3 verified about 2 months ago

Raw

History Blame Contribute Delete

9 kB

	"""Browser-based URL verifier — fallback for hosts that block scripted HTTP.

	Many insurer / aggregator sites (Akamai, Cloudflare) return 403/503 to httpx
	but load fine in a real browser. This tool uses headless Chromium (via
	Playwright) to verify those URLs the way a human would.

	Two modes:
	1. Standalone: `python tools/browser_verify.py [urls...]`
	- Reads URLs from argv or MUST_FIX.md
	- Renders each in headless Chromium
	- Writes ALIVE results to tools/browser_verified.json (other verdicts are only printed)

	2. Library: `from tools.browser_verify import verify_one`
	- check_link_rot.py imports this to retry after httpx failure

	Verdict rules:
	ALIVE — page loaded with status 200-399 AND title doesn't say "404/not found/error"
	DEAD — main-resource HTTP 4xx/5xx OR title says 404
	TIMEOUT — page navigation timed out (45s per attempt, one retry)
	ERROR — the check failed for another reason; details are in `reason`

	The allowlist (tools/browser_verified.json) is checked-in so the daily cron
	trusts past verifications for 30 days before re-running browser checks.
	"""

	from __future__ import annotations

	import json
	import re
	import sys
	import time
	from pathlib import Path

	# Two directory levels above this file; the paths below are resolved against it.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	# JSON file holding the URLs that a browser check reported as ALIVE.
	ALLOWLIST = PROJECT_ROOT / "tools" / "browser_verified.json"
	# Markdown table of URLs that is read when no URLs are given on argv.
	MUST_FIX = PROJECT_ROOT / "MUST_FIX.md"

	# Case-insensitive phrases that mark a soft-404 / blocked page when they
	# appear anywhere in the page title. The alternatives must be separated by
	# a bare `|`: an escaped `\|` would only match a literal pipe character,
	# turning the whole pattern into one long literal that never matches.
	DEAD_TITLE_PATTERNS = re.compile(
	    r"(?i)404|not[\s-]found|page[\s-]not[\s-]available|error[\s-]occurred|access[\s-]denied"
	)


	def _make_result(verdict: str, status: int, title: str, reason: str) -> dict:
	    """Build a verdict dict stamped with the current local time."""
	    return {
	        "verdict": verdict,
	        "status": status,
	        "title": title,
	        "reason": reason,
	        "ts": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
	    }


	def _first_error_line(exc: BaseException) -> str:
	    """Return the first line of the exception message, capped at 160 chars.

	    Falls back to the exception class name when the message is empty, so an
	    exception raised without any text cannot cause an IndexError here.
	    """
	    lines = str(exc).splitlines()
	    return (lines[0] if lines else type(exc).__name__)[:160]


	def _failure_verdict(err: str) -> str:
	    """Classify an error message as TIMEOUT (mentions a timeout) or ERROR."""
	    return "TIMEOUT" if "timeout" in err.lower() else "ERROR"


	def _is_pdf_url(url: str) -> bool:
	    """Tell whether `url` should be checked through the PDF path.

	    True when the URL (or just its path, ignoring any query string or
	    fragment) ends in ".pdf", or when the URL contains a "/pdf/" segment.
	    """
	    from urllib.parse import urlsplit

	    lowered = url.lower()
	    try:
	        path = urlsplit(lowered).path
	    except ValueError:
	        # Malformed URL: fall back to the plain suffix test below.
	        path = lowered
	    return lowered.endswith(".pdf") or path.endswith(".pdf") or "/pdf/" in lowered


	def _verify_pdf(url: str, page, context) -> dict:
	    """Fetch a PDF with `context.request` and return a verdict dict."""
	    # Many insurer CDNs (Star Health on Akamai, ICICI) require a challenge
	    # cookie from the parent host before serving PDFs. Warm up if 403.
	    warmed = False
	    last_err = ""
	    for attempt in range(3):
	        try:
	            resp = context.request.get(url, timeout=60000, max_redirects=10)
	            status = resp.status
	            content_type = resp.headers.get("content-type", "")
	            try:
	                # Only the magic bytes are needed, and only for a plain 200.
	                body_head = resp.body()[:5] if status == 200 else b""
	            except Exception:  # noqa: BLE001
	                body_head = b""
	            is_real_pdf = body_head == b"%PDF-" or "pdf" in content_type.lower()
	            if 200 <= status < 400 and is_real_pdf:
	                return _make_result("ALIVE", status, "[pdf]", f"ct={content_type}")
	            # 403 from Akamai-style bot defence — try cookie warmup on parent host
	            if status == 403 and not warmed:
	                try:
	                    host_root = re.sub(r"(https?://[^/]+).*", r"\1", url)
	                    page.goto(host_root, wait_until="domcontentloaded", timeout=30000)
	                    page.wait_for_timeout(2500)  # let JS challenge resolve
	                    warmed = True
	                    continue
	                except Exception:  # noqa: BLE001
	                    # Warmup is best-effort: if it fails, report the 403 below.
	                    pass
	            if status >= 400:
	                return _make_result("DEAD", status, "[pdf]", f"main_status={status}")
	            return _make_result(
	                "DEAD", status, "[pdf]",
	                f"not_pdf ct={content_type} head={body_head!r}",
	            )
	        except Exception as e:  # noqa: BLE001
	            last_err = _first_error_line(e)
	            if attempt < 2:
	                time.sleep(2)
	    # Every attempt raised, or the warmup retry used up the last attempt.
	    return _make_result(_failure_verdict(last_err), 0, "[pdf]", last_err)


	def _verify_html(url: str, page) -> dict:
	    """Navigate `page` to `url` and return a verdict dict."""
	    last_err = ""
	    for attempt in range(2):
	        try:
	            response = page.goto(url, wait_until="domcontentloaded", timeout=45000)
	            # status is 0 when goto() hands back no response object.
	            status = response.status if response else 0
	            title = page.title() or ""
	            if status >= 400:
	                return _make_result("DEAD", status, title[:120], f"main_status={status}")
	            if DEAD_TITLE_PATTERNS.search(title):
	                return _make_result("DEAD", status, title[:120], f"title={title!r}")
	            return _make_result("ALIVE", status, title[:120], "")
	        except Exception as e:  # noqa: BLE001
	            last_err = _first_error_line(e)
	            if attempt == 0:
	                time.sleep(2)  # brief pause before the single retry
	    return _make_result(_failure_verdict(last_err), 0, "", last_err)


	def verify_one(url: str, page, context) -> dict:
	    """Render `url` and return a verdict dict.

	    PDFs use the browser's request stack (`context.request.get`) — chromium
	    would try to download them which fails the page.goto contract. HTML uses
	    full navigation so the JS challenge from Akamai/DataDome resolves.

	    Args:
	        url: URL to check. It takes the PDF path when the URL or its path
	            component ends in ".pdf" (a query string or fragment is
	            ignored), or when the URL contains "/pdf/".
	        page: browser page used for HTML navigation and for the cookie
	            warmup on the PDF path.
	        context: browser context whose `request` attribute fetches PDFs.

	    Returns:
	        A dict with keys "verdict" (ALIVE, DEAD, TIMEOUT or ERROR), "status"
	        (HTTP status, 0 when none was obtained), "title" (page title cut to
	        120 chars, or "[pdf]"), "reason" and "ts" (local timestamp).
	    """
	    if _is_pdf_url(url):
	        return _verify_pdf(url, page, context)
	    return _verify_html(url, page)


	def parse_must_fix() -> list[str]:
	"""Extract URLs from MUST_FIX.md table."""
	if not MUST_FIX.exists():
	return []
	urls: list[str] = []
	for line in MUST_FIX.read_text().splitlines():
	if not line.startswith("\|") or "url" in line.lower() and "status" in line.lower():
	continue
	parts = [p.strip() for p in line.strip("\|").split("\|")]
	if len(parts) >= 2:
	m = re.search(r"https?://\S+", parts[1])
	if m:
	urls.append(m.group(0))
	return urls


	def load_allowlist() -> dict:
	if ALLOWLIST.exists():
	try:
	return json.loads(ALLOWLIST.read_text())
	except json.JSONDecodeError:
	return {}
	return {}


	def save_allowlist(d: dict) -> None:
	ALLOWLIST.write_text(json.dumps(d, indent=2, sort_keys=True))


	def main(argv: list[str]) -> int:
	    """Verify URLs in headless Chromium and update the allowlist.

	    Args:
	        argv: full argument vector; entries after the first are the URLs to
	            check. With no URLs given, they are read from MUST_FIX.md.

	    Returns:
	        Always 0; individual verdicts are printed and summarised.
	    """
	    from playwright.sync_api import sync_playwright

	    urls = argv[1:] if len(argv) > 1 else parse_must_fix()
	    if not urls:
	        print("[browser-verify] no URLs to check — pass on argv or populate MUST_FIX.md")
	        return 0

	    allowlist = load_allowlist()
	    counts = {"ALIVE": 0, "DEAD": 0, "TIMEOUT": 0, "ERROR": 0}

	    from playwright_stealth import Stealth

	    with sync_playwright() as p:
	        # Launch with anti-bot-detection flags. Akamai/Cloudflare detect
	        # chrome-headless-shell easily; we use full chromium + stealth tricks.
	        browser = p.chromium.launch(
	            headless=True,
	            args=[
	                "--disable-blink-features=AutomationControlled",
	                "--disable-features=IsolateOrigins,site-per-process",
	                "--no-sandbox",
	            ],
	        )
	        try:
	            context = browser.new_context(
	                user_agent=(
	                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
	                    "(KHTML, like Gecko) Chrome/126.0 Safari/537.36"
	                ),
	                viewport={"width": 1366, "height": 800},
	                locale="en-IN",
	                extra_http_headers={
	                    # The catch-all media range is "*/*"; a bare "/" is not valid.
	                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	                    "Accept-Language": "en-IN,en;q=0.9",
	                    "Sec-Fetch-Dest": "document",
	                    "Sec-Fetch-Mode": "navigate",
	                    "Sec-Fetch-Site": "none",
	                    "Sec-Ch-Ua": '"Chromium";v="126", "Google Chrome";v="126"',
	                    "Sec-Ch-Ua-Mobile": "?0",
	                    "Sec-Ch-Ua-Platform": '"macOS"',
	                },
	            )
	            Stealth().apply_stealth_sync(context)
	            page = context.new_page()

	            for i, url in enumerate(urls, 1):
	                print(f" [{i}/{len(urls)}] {url[:90]}", flush=True)
	                v = verify_one(url, page, context)
	                counts[v["verdict"]] = counts.get(v["verdict"], 0) + 1
	                # Only positive verdicts are stored in the allowlist.
	                if v["verdict"] == "ALIVE":
	                    allowlist[url] = v
	                print(f" -> {v['verdict']} | {v.get('status', '')} | {v.get('title', '')[:80]}")
	        finally:
	            # Close the browser even if the loop above is interrupted.
	            browser.close()

	    save_allowlist(allowlist)
	    print(
	        f"\n[browser-verify] {sum(counts.values())} URLs checked | "
	        f"ALIVE={counts['ALIVE']} DEAD={counts['DEAD']} "
	        f"TIMEOUT={counts['TIMEOUT']} ERROR={counts['ERROR']}"
	    )
	    print(f"[browser-verify] allowlist saved -> {ALLOWLIST.relative_to(PROJECT_ROOT)}")
	    return 0


	# Script entry point: run main() with the process arguments and use its
	# return value as the exit status.
	if __name__ == "__main__":
	    raise SystemExit(main(sys.argv))