# uptime/browser/crawler.py
# HuB
# Deploy webguard to Hugging Face Space
# b987c84
import asyncio
from collections import deque
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

from storage.db import SessionLocal
from storage.models import CrawlPage
async def crawl(start_url: str, max_pages: int = 50) -> dict:
    """Crawl the internal pages of a website breadth-first and summarise them.

    Starting at ``start_url``, every page is opened in Chromium through
    Playwright. Links whose host equals the start URL's host are queued
    (with query string and fragment stripped) until ``max_pages`` pages have
    been processed or no unvisited internal links remain. The per-page
    results are handed to ``_save_crawl`` before the summary is returned.

    Args:
        start_url: URL or bare host to start from. ``https://`` is prepended
            when it does not already begin with ``http://`` or ``https://``.
        max_pages: Upper bound on the number of pages that are opened.

    Returns:
        A dict with the keys ``check`` (always ``"crawler"``), ``overall``
        (``"error"`` if any page raised, else ``"warning"`` if any page
        returned HTTP >= 400, else ``"ok"``), ``pages_crawled``,
        ``broken_pages`` (every page whose status is not ``"ok"``),
        ``all_pages`` and a human-readable ``summary``. Each page entry is a
        dict with ``url``, ``status`` (``"ok"``, ``"broken"`` or ``"error"``),
        ``status_code`` and ``error``.
    """
    # Match complete scheme prefixes: a bare host such as "httpbin.org" also
    # starts with "http" and must still get a scheme, otherwise urlparse()
    # yields an empty netloc and no link is ever recognised as internal.
    if not start_url.startswith(("http://", "https://")):
        start_url = "https://" + start_url
    base_domain = urlparse(start_url).netloc

    # `pending` is the FIFO frontier; `seen` holds every URL ever queued so
    # duplicate checks are O(1) instead of scanning the frontier list.
    pending = deque([start_url])
    seen = {start_url}
    results = []

    async with async_playwright() as p:
        # NOTE(review): sandbox/dev-shm flags are kept exactly as deployed;
        # presumably required by the container host - confirm before changing.
        browser = await p.chromium.launch(
            args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        )
        try:
            context = await browser.new_context()
            # One result is appended per processed page, so len(results) is
            # the number of pages visited so far.
            while pending and len(results) < max_pages:
                url = pending.popleft()
                page = await context.new_page()
                result = {"url": url, "status": "ok", "status_code": None, "error": None}
                try:
                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=20000
                    )
                    result["status_code"] = response.status if response else None
                    if response and response.status >= 400:
                        result["status"] = "broken"

                    # Collect internal links; `e.href` is the browser-resolved
                    # absolute URL of each anchor.
                    links = await page.eval_on_selector_all(
                        "a[href]",
                        "els => els.map(e => e.href)"
                    )
                    for link in links:
                        try:
                            parsed = urlparse(link)
                        except ValueError:
                            # A single malformed href must not flag the whole
                            # page as an error.
                            continue
                        if parsed.scheme not in ("http", "https"):
                            continue
                        if parsed.netloc != base_domain:
                            continue
                        # Drop query and fragment so URL variants of one page
                        # are crawled only once.
                        clean = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                        if link in seen or clean in seen:
                            continue
                        seen.add(clean)
                        pending.append(clean)
                except Exception as e:
                    # Best effort: record the failure and keep crawling.
                    result["status"] = "error"
                    result["error"] = str(e)[:200]
                finally:
                    # Close the tab on every exit path, including cancellation.
                    await page.close()
                results.append(result)
        finally:
            # Always shut the browser down, even if the crawl loop raised.
            await browser.close()

    # Save to DB.
    # NOTE: _save_crawl is synchronous, so the event loop is blocked while
    # the rows are written.
    _save_crawl(start_url, results)

    broken = [r for r in results if r["status"] != "ok"]
    if any(r["status"] == "error" for r in broken):
        overall = "error"
    elif broken:
        overall = "warning"
    else:
        overall = "ok"

    return {
        "check": "crawler",
        "overall": overall,
        "pages_crawled": len(results),
        "broken_pages": broken,
        "all_pages": results,
        "summary": f"✅ Crawled {len(results)} pages | ⚠️ {len(broken)} broken" if not broken else f"❌ {len(broken)} broken pages found",
    }
def _save_crawl(site_url: str, results: list):
db = SessionLocal()
try:
for r in results:
db.add(CrawlPage(
site_url=site_url,
page_url=r["url"],
status=r["status"],
status_code=r.get("status_code"),
error=r.get("error"),
created_at=datetime.utcnow(),
))
db.commit()
finally:
db.close()