Spaces:
Runtime error
Runtime error
| """ | |
| Fetch and extract content from web pages. | |
| Includes Wayback Machine fallback for 403 errors. | |
| """ | |
| import httpx | |
| from bs4 import BeautifulSoup | |
| from typing import Optional, Dict | |
| from urllib.parse import urljoin, urlparse | |
| import re | |
| import json | |
| import time | |
def clean_text(text: str) -> str:
    """Collapse all runs of whitespace into single spaces and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()
def extract_main_content(html: str, url: str, max_chars: int = 120000) -> str:
    """Strip boilerplate from *html* and return the page's main body text.

    Removes scripts/styles, structural chrome (nav/header/footer/aside) and
    elements whose class or id looks like navigation/ads/cookie banners, then
    extracts text from the first matching "main content" container (falling
    back to <body>). The result is whitespace-normalized and capped at
    *max_chars* characters (with a "... [truncated]" marker when cut).
    """
    soup = BeautifulSoup(html, 'lxml')

    # Drop non-content tags outright.
    for tag in soup(['script', 'style', 'noscript']):
        tag.decompose()
    for tag in soup.find_all(['nav', 'header', 'footer', 'aside']):
        tag.decompose()

    # Remove elements whose class or id suggests boilerplate.
    junk_patterns = ['nav', 'navigation', 'menu', 'sidebar', 'footer', 'header', 'cookie', 'banner', 'advertisement', 'ad-', 'social-']
    junk_re = re.compile('|'.join(junk_patterns), re.I)
    for element in soup.find_all(class_=junk_re):
        element.decompose()
    for element in soup.find_all(id=junk_re):
        element.decompose()

    # Prefer an explicit main-content container; otherwise fall back to <body>.
    main_selectors = ['main', 'article', '[role="main"]', '.content', '.main-content', '.post-content', '#content', '#main-content', '#main']
    container = next(
        (hit for hit in (soup.select_one(sel) for sel in main_selectors) if hit),
        None,
    ) or soup.find('body')
    if not container:
        return ""

    text = clean_text(container.get_text(separator=' ', strip=True))
    if len(text) > max_chars:
        text = text[:max_chars] + "... [truncated]"
    return text
def get_wayback_snapshot(url: str, timeout: float = 20.0) -> Optional[str]:
    """Return the newest Wayback Machine snapshot URL for *url*, or None.

    Queries the CDX search API (with a single retry on any failure) and builds
    a ``https://web.archive.org/web/<timestamp>/<url>`` link from the first
    result row. Returns None when no snapshot exists or the API call fails on
    both attempts.
    """
    api_url = f"https://web.archive.org/cdx/search/cdx?url={url}&output=json&limit=1&collapse=urlkey"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
    for attempt in range(2):
        try:
            with httpx.Client(timeout=timeout, headers=headers) as client:
                response = client.get(api_url)
                response.raise_for_status()
                data = response.json()
            # Row 0 is the CDX column-header row; row 1 (if present) is the
            # newest snapshot, whose second field is the capture timestamp.
            if len(data) > 1:
                timestamp = data[1][1]
                return f"https://web.archive.org/web/{timestamp}/{url}"
            return None  # API answered but no snapshot exists
        except httpx.TimeoutException as e:
            if attempt == 0:
                # Fixed: messages were f-strings with no placeholders (F541).
                print("Wayback API timeout, retrying...")
                time.sleep(1)
                continue
            print(f"Error getting Wayback snapshot for {url}: {e}")
            return None
        except Exception as e:
            if attempt == 0:
                print("Wayback API error, retrying...")
                time.sleep(1)
                continue
            print(f"Error getting Wayback snapshot for {url}: {e}")
            return None
    return None
def extract_wayback_content(html: str) -> Optional[str]:
    """Pull the archived page's own markup out of a Wayback wrapper page.

    Tries, in order: a div with id="webpage", a div with class="webpage",
    then any div whose id mentions content/main. Failing those, returns
    <body> with Wayback chrome (wm-*/wayback-classed nodes) stripped.
    Returns the input unchanged when nothing can be isolated or parsing
    fails.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        # First non-empty candidate list wins.
        candidates = (
            soup.find_all('div', id='webpage')
            or soup.find_all('div', class_='webpage')
            or soup.find_all('div', {'id': re.compile('content|main', re.I)})
        )
        if candidates:
            return str(candidates[0])
        body = soup.find('body')
        if body:
            for widget in body.find_all(['div', 'script', 'style'], class_=re.compile('wm-|wayback', re.I)):
                widget.decompose()
            return str(body)
        return html
    except Exception as e:
        print(f"Error extracting Wayback content: {e}")
        return html
def _mcp_fallback(url: str, message: str) -> Optional[Dict[str, str]]:
    """Best-effort fetch via the optional MCP browser module.

    Prints *message* only after the import succeeds (matching the original
    control flow); returns None when MCP is unavailable, errors out, or
    yields an empty result.
    """
    try:
        from mcp_fallback import mcp_fetch_url_fallback
        print(message)
        result = mcp_fetch_url_fallback(url)
        return result if result else None
    except ImportError:
        return None  # MCP not available
    except Exception as mcp_error:
        print(f"MCP fallback error: {mcp_error}")
        return None


def fetch_page(url: str, timeout: float = 30.0, use_wayback_fallback: bool = True) -> Optional[Dict[str, str]]:
    """Fetch *url* and return {'url', 'title', 'content'}, or None on failure.

    Fallback chain: on HTTP 403 (when *use_wayback_fallback*) try the Wayback
    Machine, then the optional MCP browser; on other HTTP errors or unexpected
    exceptions, try the MCP browser directly. Non-HTML responses and timeouts
    yield None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }
    try:
        with httpx.Client(timeout=timeout, follow_redirects=True, headers=headers) as client:
            # Fixed: headers were redundantly passed both to the Client and
            # to client.get(); the Client-level headers are sufficient.
            response = client.get(url)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                print(f"Skipping non-HTML content: {content_type}")
                return None
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            title_tag = soup.find('title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title'
            content = extract_main_content(html, url)
            if not content or len(content) < 100:
                print(f"Warning: Very little content extracted from {url}")
            return {'url': url, 'title': title, 'content': content}
    except httpx.TimeoutException:
        print(f"Timeout fetching {url}")
        return None
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 403 and use_wayback_fallback:
            print(f"HTTP 403 error fetching {url}, trying Wayback Machine...")
            wayback_result = fetch_from_wayback(url, timeout)
            if wayback_result:
                return wayback_result
            # Wayback also failed; last resort is the MCP browser.
            return _mcp_fallback(url, f"Wayback Machine failed, trying MCP browser fallback for {url}...")
        print(f"HTTP error {e.response.status_code} fetching {url}")
        return _mcp_fallback(url, f"Trying MCP browser fallback for {url}...")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return _mcp_fallback(url, f"Trying MCP browser fallback for {url}...")
def fetch_from_wayback(url: str, timeout: float = 30.0) -> Optional[Dict[str, str]]:
    """Fetch *url* via its latest Internet Archive snapshot.

    Returns {'url', 'title', 'content', 'source': 'wayback_machine'} on
    success, or None when no snapshot exists or the fetch fails. The title
    is scrubbed of Wayback/Internet Archive branding, falling back to <h1>,
    then og:title, then a title derived from the URL path (or host).
    """
    try:
        snapshot = get_wayback_snapshot(url, timeout=10.0)
        if not snapshot:
            print(f"No Wayback Machine snapshot found for {url}")
            return None
        print(f"Fetching from Wayback Machine: {snapshot}")
        ua = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        with httpx.Client(timeout=timeout, follow_redirects=True, headers=ua) as client:
            resp = client.get(snapshot)
            resp.raise_for_status()
            html = resp.text
        # Unwrap the archive chrome to get at the original page markup.
        unwrapped = extract_wayback_content(html)
        if unwrapped:
            html = unwrapped
        soup = BeautifulSoup(html, 'lxml')
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else ''
        # Strip archive branding from either end of the title.
        for branding in (r'^.*?Wayback Machine\s*[:\-]\s*',
                         r'\s*[:\-]\s*Wayback Machine.*?$',
                         r'^.*?Internet Archive\s*[:\-]\s*'):
            title = re.sub(branding, '', title, flags=re.I)
        if not title or title.lower() in ['wayback machine', 'internet archive']:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text(strip=True)
            else:
                og = soup.find('meta', property='og:title')
                if og:
                    title = og.get('content', '')
        if not title or title.lower() in ['wayback machine', 'internet archive', 'no title']:
            parsed = urlparse(url)
            # NOTE(review): replacing '-' after inserting ' - ' separators
            # collapses those separators into spaces — preserved as-is.
            title = parsed.path.strip('/').replace('/', ' - ').replace('-', ' ').title()
            if not title:
                title = parsed.netloc.replace('.', ' ').title()
            if not title:
                title = 'No title'
        content = extract_main_content(html, url)
        if not content or len(content) < 100:
            print(f"Warning: Very little content extracted from Wayback snapshot for {url}")
        return {'url': url, 'title': title, 'content': content, 'source': 'wayback_machine'}
    except httpx.TimeoutException:
        print(f"Timeout fetching from Wayback Machine: {url}")
        return None
    except httpx.HTTPStatusError as e:
        print(f"HTTP error {e.response.status_code} fetching from Wayback Machine: {url}")
        return None
    except Exception as e:
        print(f"Error fetching from Wayback Machine {url}: {e}")
        return None
def get_internal_links(html: str, base_url: str, same_domain_only: bool = True) -> list:
    """Collect unique absolute links from anchors in *html*.

    Each href is resolved against *base_url* and normalized to
    scheme://host/path (plus the query string when present, dropping any
    fragment). The base URL itself and duplicates are skipped; when
    *same_domain_only* is True, links to other hosts are filtered out.
    Returns links in document order.
    """
    soup = BeautifulSoup(html, 'lxml')
    base_domain = urlparse(base_url).netloc.lower()
    collected = []
    visited = set()
    for anchor in soup.find_all('a', href=True):
        target = urlparse(urljoin(base_url, anchor['href']))
        normalized = f"{target.scheme}://{target.netloc}{target.path}"
        if target.query:
            normalized = f"{normalized}?{target.query}"
        if same_domain_only and target.netloc.lower() != base_domain:
            continue
        if normalized in visited or normalized == base_url:
            continue
        visited.add(normalized)
        collected.append(normalized)
    return collected