import asyncio
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

from storage.db import SessionLocal
from storage.models import CrawlPage

# Per-page navigation timeout handed to Playwright, in milliseconds.
_PAGE_TIMEOUT_MS = 20000


def _strip_query_and_fragment(url: str) -> str:
    """Return *url* reduced to ``scheme://netloc/path``.

    An empty path is rendered as ``/`` so that ``https://host`` and
    ``https://host/`` map to the same queue key.
    """
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path or '/'}"


async def crawl(start_url: str, max_pages: int = 50) -> dict:
    """Crawl the internal pages of a website breadth-first and summarise them.

    Args:
        start_url: Page to start from. ``https://`` is prepended when the
            value carries no ``http://`` / ``https://`` scheme.
        max_pages: Upper bound on the number of pages that are loaded.

    Returns:
        A dict with the keys ``check``, ``overall`` (``"ok"``, ``"warning"``
        or ``"error"``), ``pages_crawled``, ``broken_pages``, ``all_pages``
        and ``summary``. Each page entry holds ``url``, ``status``
        (``"ok"``, ``"broken"`` or ``"error"``), ``status_code`` and
        ``error``.

    The per-page results are also written to the database through
    ``_save_crawl`` before the summary is returned.
    """
    # Test for the full scheme prefix: a bare startswith("http") would treat
    # hosts such as "httpbin.org" as if they already carried a scheme.
    if not start_url.lower().startswith(("http://", "https://")):
        start_url = "https://" + start_url

    # Browsers report hrefs with a lower-cased host, so compare in lower case.
    base_domain = urlparse(start_url).netloc.lower()

    to_visit = deque([start_url])
    # Every URL that was ever queued. Seeding it with the normalised start URL
    # keeps "https://host" from being crawled a second time as "https://host/".
    queued = {start_url, _strip_query_and_fragment(start_url)}
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        )
        try:
            context = await browser.new_context()

            while to_visit and len(results) < max_pages:
                url = to_visit.popleft()
                page = await context.new_page()
                result = {"url": url, "status": "ok", "status_code": None, "error": None}

                try:
                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=_PAGE_TIMEOUT_MS
                    )
                    result["status_code"] = response.status if response else None

                    if response and response.status >= 400:
                        result["status"] = "broken"

                    # Collect internal links
                    links = await page.eval_on_selector_all(
                        "a[href]", "els => els.map(e => e.href)"
                    )
                    for link in links:
                        # SVG <a> elements expose href as an object rather
                        # than a string; skip those instead of failing the page.
                        if not isinstance(link, str):
                            continue
                        parsed = urlparse(link)
                        if parsed.scheme not in ("http", "https"):
                            continue
                        if parsed.netloc.lower() != base_domain:
                            continue
                        clean = _strip_query_and_fragment(link)
                        if clean not in queued:
                            queued.add(clean)
                            to_visit.append(clean)

                except Exception as e:
                    # A failing page is recorded, not fatal: keep crawling.
                    result["status"] = "error"
                    result["error"] = str(e)[:200]
                finally:
                    await page.close()

                results.append(result)
        finally:
            await browser.close()

    # Save to DB
    _save_crawl(start_url, results)

    broken = [r for r in results if r["status"] != "ok"]
    if any(r["status"] == "error" for r in broken):
        overall = "error"
    elif broken:
        overall = "warning"
    else:
        overall = "ok"

    return {
        "check": "crawler",
        "overall": overall,
        "pages_crawled": len(results),
        "broken_pages": broken,
        "all_pages": results,
        "summary": f"✅ Crawled {len(results)} pages | ⚠️ {len(broken)} broken" if not broken
                   else f"❌ {len(broken)} broken pages found",
    }


def _save_crawl(site_url: str, results: list):
    """Persist one ``CrawlPage`` row per crawled page in a single commit.

    Args:
        site_url: Start URL of the crawl, stored on every row.
        results: Page dicts as built by ``crawl``.
    """
    db = SessionLocal()
    try:
        for r in results:
            db.add(CrawlPage(
                site_url=site_url,
                page_url=r["url"],
                status=r["status"],
                status_code=r.get("status_code"),
                error=r.get("error"),
                # NOTE(review): datetime.utcnow() is deprecated since Python
                # 3.12; whether an aware datetime can be stored depends on the
                # column type - confirm before changing.
                created_at=datetime.utcnow(),
            ))
        db.commit()
    finally:
        db.close()