| import asyncio |
| from urllib.parse import urljoin, urlparse |
| from playwright.async_api import async_playwright |
| from storage.db import SessionLocal |
| from storage.models import CrawlPage |
| from datetime import datetime |
|
|
|
|
async def crawl(start_url: str, max_pages: int = 50) -> dict:
    """Crawl the internal pages of a website breadth-first and summarise them.

    Starting from ``start_url``, each page is opened in headless Chromium,
    its HTTP status is recorded, and every ``<a href>`` pointing at the same
    network location is queued (query string and fragment stripped) until
    ``max_pages`` pages have been processed. All per-page results are passed
    to ``_save_crawl`` before returning.

    Args:
        start_url: Entry point of the crawl. ``https://`` is prepended when
            the value carries no ``http://`` / ``https://`` scheme.
        max_pages: Upper bound on the number of pages to open.

    Returns:
        A dict with the keys ``check``, ``overall`` (``"ok"``, ``"warning"``
        or ``"error"``), ``pages_crawled``, ``broken_pages``, ``all_pages``
        and ``summary``. Each page entry holds ``url``, ``status``
        (``"ok"``, ``"broken"`` or ``"error"``), ``status_code`` and
        ``error``.
    """
    # Imported locally so this function does not depend on the module's
    # import block being edited.
    from collections import deque

    # Test for a real scheme prefix: a plain ``startswith("http")`` would
    # treat hosts such as "httpbin.org" as already having a scheme.
    if not start_url.lower().startswith(("http://", "https://")):
        start_url = "https://" + start_url

    start = urlparse(start_url)
    base_domain = start.netloc

    to_visit = deque([start_url])
    # ``seen`` holds every URL ever queued, giving O(1) duplicate checks.
    # The normalised start URL is seeded too: browsers report a path-less
    # "https://host" link as "https://host/", which would otherwise make the
    # entry page be crawled a second time.
    seen = {start_url, f"{start.scheme}://{start.netloc}{start.path or '/'}"}
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        )
        try:
            context = await browser.new_context()

            while to_visit and len(results) < max_pages:
                url = to_visit.popleft()
                page = await context.new_page()
                result = {"url": url, "status": "ok", "status_code": None, "error": None}

                try:
                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=20000
                    )
                    if response:
                        result["status_code"] = response.status
                        if response.status >= 400:
                            result["status"] = "broken"

                    # ``e.href`` is the browser-resolved absolute URL.
                    links = await page.eval_on_selector_all(
                        "a[href]",
                        "els => els.map(e => e.href)"
                    )
                    for link in links:
                        parsed = urlparse(link)
                        if parsed.netloc != base_domain:
                            continue
                        # Page identity ignores query string and fragment.
                        clean = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                        if clean not in seen:
                            seen.add(clean)
                            to_visit.append(clean)
                except Exception as e:
                    # Best effort: a page that fails to load is reported as
                    # an error entry and the crawl carries on.
                    result["status"] = "error"
                    result["error"] = str(e)[:200]
                finally:
                    # Close the page even on cancellation so it is not leaked.
                    await page.close()

                results.append(result)
        finally:
            await browser.close()

    _save_crawl(start_url, results)

    broken = [r for r in results if r["status"] != "ok"]
    return {
        "check": "crawler",
        "overall": "error" if any(r["status"] == "error" for r in broken) else ("warning" if broken else "ok"),
        "pages_crawled": len(results),
        "broken_pages": broken,
        "all_pages": results,
        "summary": f"✅ Crawled {len(results)} pages | ⚠️ {len(broken)} broken" if not broken else f"❌ {len(broken)} broken pages found",
    }
|
|
|
|
def _save_crawl(site_url: str, results: list):
    """Store one ``CrawlPage`` row per crawl result, committed together.

    Args:
        site_url: URL the crawl started from; written to every row.
        results: Per-page dicts holding ``url`` and ``status`` and,
            optionally, ``status_code`` and ``error``.
    """
    session = SessionLocal()
    try:
        rows = [
            CrawlPage(
                site_url=site_url,
                page_url=entry["url"],
                status=entry["status"],
                status_code=entry.get("status_code"),
                error=entry.get("error"),
                # Timestamp is taken per row, as each object is built.
                created_at=datetime.utcnow(),
            )
            for entry in results
        ]
        session.add_all(rows)
        session.commit()
    finally:
        # Always release the session, whether or not the commit succeeded.
        session.close()
|
|