# My_campus_agent / crawl.py
# Initial commit: UA Student Navigator Chatbot with OpenRouter integration
"""
Controlled website crawler for user-provided URLs.
"""
from typing import List, Dict, Set
from urllib.parse import urlparse, urljoin
from fetch import fetch_page, get_internal_links
import time
def crawl_website(start_url: str, query: str, max_pages: int = 12, max_depth: int = 1) -> List[Dict[str, str]]:
    """Crawl a website breadth-first from ``start_url``, following internal links.

    Args:
        start_url: Page to begin crawling from.
        query: User query. Currently unused here; kept for interface
            compatibility with existing callers.
        max_pages: Stop once this many pages have been fetched successfully.
        max_depth: Maximum link depth to follow (0 = only the start page).

    Returns:
        A list of page dicts as produced by ``fetch_page`` (at most
        ``max_pages`` entries).
    """
    # Local imports: httpx stays lazy (matches the original's in-function
    # import), deque gives O(1) pops from the front of the BFS queue.
    import httpx
    from collections import deque

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    pages: List[Dict[str, str]] = []
    visited: Set[str] = set()
    queued: Set[str] = {start_url}  # prevents enqueuing the same URL twice
    to_visit: "deque[tuple[str, int]]" = deque([(start_url, 0)])

    # One client (and connection pool) for every link-discovery request,
    # instead of creating a new client per URL.
    with httpx.Client(timeout=10.0, follow_redirects=True, headers=headers) as client:
        while to_visit and len(pages) < max_pages:
            current_url, depth = to_visit.popleft()  # O(1) vs list.pop(0)
            if current_url in visited or depth > max_depth:
                continue
            visited.add(current_url)

            print(f"Fetching (depth {depth}): {current_url}")
            page = fetch_page(current_url)
            if page:
                pages.append(page)

            # Discover internal links only while we can still go deeper
            # and still need more pages.
            if depth < max_depth and len(pages) < max_pages:
                try:
                    response = client.get(current_url)
                    content_type = response.headers.get('content-type', '').lower()
                    if response.status_code == 200 and 'text/html' in content_type:
                        links = get_internal_links(response.text, current_url, same_domain_only=True)
                        for link in links:
                            if link not in visited and link not in queued:
                                queued.add(link)
                                to_visit.append((link, depth + 1))
                except Exception as e:
                    # Best-effort: a failed link fetch must not abort the crawl.
                    print(f"Error getting links from {current_url}: {e}")

            time.sleep(0.5)  # politeness delay between requests
    return pages