Spaces:
Runtime error
Runtime error
"""
Controlled website crawler for user-provided URLs.
"""
from typing import List, Dict, Set
from urllib.parse import urlparse, urljoin
from fetch import fetch_page, get_internal_links
import time
def crawl_website(start_url: str, query: str, max_pages: int = 12, max_depth: int = 1) -> List[Dict[str, str]]:
    """Crawl a website breadth-first from ``start_url``, following internal links.

    Args:
        start_url: URL to begin crawling from.
        query: Search query; not used inside the crawl itself, kept for
            interface compatibility with existing callers.
        max_pages: Maximum number of successfully fetched pages to collect.
        max_depth: Maximum link depth to follow (0 = only the start page).

    Returns:
        A list of page dicts as produced by ``fetch_page``, at most
        ``max_pages`` long.
    """
    pages: List[Dict[str, str]] = []
    visited: Set[str] = set()
    # BFS frontier of (url, depth) pairs.
    to_visit: List[tuple[str, int]] = [(start_url, 0)]

    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.pop(0)

        # Skip anything already handled or beyond the depth limit.
        if current_url in visited or depth > max_depth:
            continue
        # Mark as visited *before* fetching, and regardless of success:
        # otherwise a dead link that keeps appearing on later pages would be
        # re-queued and re-fetched on every occurrence.
        visited.add(current_url)

        print(f"Fetching (depth {depth}): {current_url}")
        page = fetch_page(current_url)
        if not page:
            continue
        pages.append(page)

        # Expand the frontier only while we can still go deeper and still
        # need more pages.
        if depth < max_depth and len(pages) < max_pages:
            try:
                # Imported lazily so the module loads even if httpx is absent.
                # NOTE(review): this re-downloads the page fetch_page just
                # retrieved; if fetch_page exposed the raw HTML, this second
                # request could be dropped — confirm against fetch.py.
                import httpx
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
                with httpx.Client(timeout=10.0, follow_redirects=True) as client:
                    response = client.get(current_url, headers=headers)
                is_html = 'text/html' in response.headers.get('content-type', '').lower()
                if response.status_code == 200 and is_html:
                    links = get_internal_links(response.text, current_url, same_domain_only=True)
                    queued = {url for url, _ in to_visit}
                    for link in links:
                        # Don't queue URLs already visited or already waiting.
                        if link not in visited and link not in queued:
                            to_visit.append((link, depth + 1))
                            queued.add(link)
            except Exception as e:
                # Best-effort link discovery: a failure here should not abort
                # the crawl, the page itself was already collected.
                print(f"Error getting links from {current_url}: {e}")

        # Politeness delay between successive page fetches.
        time.sleep(0.5)
    return pages