"""Website crawl tool vendored from folder/tools/crawl_website.py.

This module provides website crawling functionality that starts from a given URL
and crawls linked pages in a breadth-first manner, prioritizing navigation links.
"""

from collections import deque
from collections.abc import Iterable, Iterator
from urllib.parse import urldefrag, urljoin, urlparse

import aiohttp
import structlog
from bs4 import BeautifulSoup

from src.tools.vendored.web_search_core import (
    ScrapeResult,
    WebpageSnippet,
    scrape_urls,
    ssl_context,
)

logger = structlog.get_logger()


def _normalize_url(url: str) -> str:
    """Return ``url`` without its fragment and without trailing slashes.

    Used as the de-duplication key so that ``/about``, ``/about/`` and
    ``/about#team`` count as one page.
    """
    return urldefrag(url).url.rstrip("/")


def _extract_links(html: str, current_url: str, base_domain: str) -> tuple[list[str], list[str]]:
    """Extract prioritized same-domain links from HTML content.

    Args:
        html: Markup of the page that was fetched.
        current_url: URL the page was requested from; relative hrefs are
            resolved against it (not against any post-redirect URL).
        base_domain: Network location (``netloc``) a link must match to be kept.

    Returns:
        ``(nav_links, body_links)``: normalized, de-duplicated URLs in document
        order. ``nav_links`` come from ``<nav>``/``<header>`` elements;
        ``body_links`` are all other same-domain links on the page.
    """
    soup = BeautifulSoup(html, "html.parser")

    def same_site_links(anchors: Iterable) -> Iterator[str]:
        for anchor in anchors:
            try:
                link = _normalize_url(urljoin(current_url, anchor["href"]))
                parsed = urlparse(link)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip this link
                # rather than abort the whole crawl.
                continue
            if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
                yield link

    # dict.fromkeys de-duplicates while keeping document order, which makes the
    # crawl order deterministic.
    nav_links = dict.fromkeys(
        link
        for nav_element in soup.find_all(["nav", "header"])
        for link in same_site_links(nav_element.find_all("a", href=True))
    )
    body_links = dict.fromkeys(
        link
        for link in same_site_links(soup.find_all("a", href=True))
        if link not in nav_links
    )
    return list(nav_links), list(body_links)


async def _fetch_page(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch HTML content from a URL.

    Returns the response body as text for a 200 response, and an empty string
    for any other status or on any error (which is logged as a warning).
    """
    try:
        timeout = aiohttp.ClientTimeout(total=30)
        async with session.get(url, timeout=timeout) as response:
            if response.status == 200:
                return await response.text()
            return ""
    except Exception as e:
        # Best-effort crawl: one unreachable page must not stop the others.
        logger.warning("Error fetching URL", url=url, error=str(e))
        return ""


async def crawl_website(starting_url: str, *, max_pages: int = 10) -> list[ScrapeResult] | str:
    """Crawl the pages of a website starting with the starting_url and then
    descending into the pages linked from there.

    Prioritizes links found in headers/navigation, then body links, then
    subsequent pages. Only links on the same network location as the starting
    URL are followed.

    Args:
        starting_url: Starting URL to scrape. ``http://`` is prepended when no
            ``http://``/``https://`` prefix is present.
        max_pages: Maximum number of pages to collect, including the starting
            page. The starting URL is always scraped, so values below 1 behave
            like 1.

    Returns:
        The string ``"Empty URL provided"`` when ``starting_url`` is empty.
        Otherwise the result of ``scrape_urls`` for the discovered pages
        (starting page first, then in discovery order): a list of ScrapeResult
        objects which have the following fields:
            - url: The URL of the web page
            - title: The title of the web page
            - description: The description of the web page
            - text: The text content of the web page
    """
    if not starting_url:
        return "Empty URL provided"

    # Ensure URL has a protocol
    if not starting_url.startswith(("http://", "https://")):
        starting_url = "http://" + starting_url

    base_domain = urlparse(starting_url).netloc

    # `seen` holds normalized URLs for de-duplication; `pages_to_scrape` keeps
    # discovery order. The starting URL is kept exactly as given in the output
    # but registered in normalized form so a link back to it is not re-queued.
    seen: set[str] = {_normalize_url(starting_url)}
    pages_to_scrape: list[str] = [starting_url]

    queue: deque[str] = deque([starting_url])
    next_level_queue: deque[str] = deque()

    # One session (and connection pool) is shared by every request of the crawl.
    connector = aiohttp.TCPConnector(ssl=ssl_context)
    async with aiohttp.ClientSession(connector=connector) as session:
        # Breadth-first crawl
        while queue and len(pages_to_scrape) < max_pages:
            current_url = queue.popleft()

            html_content = await _fetch_page(session, current_url)
            if html_content:
                nav_links, body_links = _extract_links(html_content, current_url, base_domain)

                # Nav links join the current level (higher priority); body
                # links wait in the next level (lower priority).
                for links, target in ((nav_links, queue), (body_links, next_level_queue)):
                    for link in links:
                        if len(pages_to_scrape) >= max_pages:
                            break
                        if link in seen:
                            continue
                        seen.add(link)
                        pages_to_scrape.append(link)
                        target.append(link)

            # If current queue is empty, move on to the next level links
            if not queue:
                queue, next_level_queue = next_level_queue, deque()

    pages_to_scrape_snippets: list[WebpageSnippet] = [
        WebpageSnippet(url=page, title="", description="") for page in pages_to_scrape
    ]

    # Use scrape_urls to get the content for all discovered pages
    result = await scrape_urls(pages_to_scrape_snippets)
    return result