Spaces:
Runtime error
Runtime error
"""
Controlled website crawler for user-provided URLs.
"""
from typing import List, Dict, Set
from urllib.parse import urlparse, urljoin
from fetch import fetch_page, get_internal_links
import time
def crawl_website(start_url: str, query: str, max_pages: int = 12, max_depth: int = 1) -> List[Dict[str, str]]:
    """Crawl a website breadth-first from ``start_url``, following internal links.

    Args:
        start_url: URL to begin crawling from.
        query: Search query; not used inside the crawl itself, kept for
            interface compatibility with existing callers.
        max_pages: Maximum number of successfully fetched pages to collect.
        max_depth: Maximum link depth to follow (0 = only the start page).

    Returns:
        A list of page dicts as produced by ``fetch_page``, at most
        ``max_pages`` long.
    """
    pages: List[Dict[str, str]] = []
    visited: Set[str] = set()
    # BFS frontier of (url, depth) pairs.
    to_visit: List[tuple[str, int]] = [(start_url, 0)]

    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.pop(0)

        # Skip anything already handled or beyond the depth limit.
        if current_url in visited or depth > max_depth:
            continue
        # Mark as visited *before* fetching, and regardless of success:
        # otherwise a dead link that keeps appearing on later pages would be
        # re-queued and re-fetched on every occurrence.
        visited.add(current_url)

        print(f"Fetching (depth {depth}): {current_url}")
        page = fetch_page(current_url)
        if not page:
            continue
        pages.append(page)

        # Expand the frontier only while we can still go deeper and still
        # need more pages.
        if depth < max_depth and len(pages) < max_pages:
            try:
                # Imported lazily so the module loads even if httpx is absent.
                # NOTE(review): this re-downloads the page fetch_page just
                # retrieved; if fetch_page exposed the raw HTML, this second
                # request could be dropped — confirm against fetch.py.
                import httpx
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
                with httpx.Client(timeout=10.0, follow_redirects=True) as client:
                    response = client.get(current_url, headers=headers)
                is_html = 'text/html' in response.headers.get('content-type', '').lower()
                if response.status_code == 200 and is_html:
                    links = get_internal_links(response.text, current_url, same_domain_only=True)
                    queued = {url for url, _ in to_visit}
                    for link in links:
                        # Don't queue URLs already visited or already waiting.
                        if link not in visited and link not in queued:
                            to_visit.append((link, depth + 1))
                            queued.add(link)
            except Exception as e:
                # Best-effort link discovery: a failure here should not abort
                # the crawl, the page itself was already collected.
                print(f"Error getting links from {current_url}: {e}")

        # Politeness delay between successive page fetches.
        time.sleep(0.5)
    return pages