Spaces:

AlBaraa63
/

MissionControlMCP

Sleeping

App Files Files Community

MissionControlMCP / tools /web_fetcher.py

AlBaraa63

Upload 33 files

f1b19d3 verified about 2 months ago

raw

history blame contribute delete

5.6 kB

	"""
	Web Fetcher Tool - Fetch and extract content from web pages
	"""
	import logging
	from typing import Dict, Any
	import sys
	import os

	# Add parent directory to path for imports
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from utils.helpers import validate_url, clean_text, format_timestamp

	logger = logging.getLogger(__name__)


	def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]:
	"""
	Fetch content from a web URL.

	Args:
	url: URL to fetch
	extract_text_only: If True, extract only text content; if False, return HTML
	timeout: Request timeout in seconds

	Returns:
	Dictionary containing fetched content, status code, and metadata
	"""
	try:
	import requests
	from bs4 import BeautifulSoup

	# Validate URL
	if not validate_url(url):
	raise ValueError(f"Invalid URL format: {url}")

	# Set headers to mimic a browser
	headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	}

	# Fetch content
	response = requests.get(url, headers=headers, timeout=timeout)
	response.raise_for_status()

	content = ""
	content_type = response.headers.get('Content-Type', '')

	if extract_text_only and 'text/html' in content_type:
	# Parse HTML and extract text
	soup = BeautifulSoup(response.text, 'html.parser')

	# Extract title
	title = soup.title.string if soup.title else "No title"

	# Extract links
	links = []
	for link in soup.find_all('a', href=True):
	href = link.get('href', '')
	if href and not href.startswith('#'):
	links.append(href)

	# Remove script and style elements
	for script in soup(["script", "style", "nav", "footer", "header"]):
	script.decompose()

	# Get text
	text = soup.get_text()

	# Clean up text
	lines = (line.strip() for line in text.splitlines())
	chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
	content = '\n'.join(chunk for chunk in chunks if chunk)

	# Further clean
	content = clean_text(content)

	else:
	# Return raw content
	content = response.text
	title = "N/A (non-HTML content)"
	links = []

	# Build metadata
	metadata = {
	"url": url,
	"status_code": response.status_code,
	"content_type": content_type,
	"content_length": len(content),
	"encoding": response.encoding,
	"timestamp": format_timestamp(),
	"headers": dict(response.headers)
	}

	return {
	"content": content,
	"status_code": response.status_code,
	"title": title,
	"links": links,
	"metadata": metadata
	}

	except requests.exceptions.RequestException as e:
	logger.error(f"Request error fetching {url}: {e}")
	raise
	except Exception as e:
	logger.error(f"Error fetching web content: {e}")
	raise


	def fetch_multiple_urls(urls: list, extract_text_only: bool = True) -> list:
	"""
	Fetch content from multiple URLs.

	Args:
	urls: List of URLs to fetch
	extract_text_only: Whether to extract text only

	Returns:
	List of results for each URL
	"""
	results = []
	for idx, url in enumerate(urls):
	try:
	result = fetch_web_content(url, extract_text_only)
	result["index"] = idx
	result["success"] = True
	results.append(result)
	except Exception as e:
	logger.error(f"Error fetching URL at index {idx} ({url}): {e}")
	results.append({
	"index": idx,
	"url": url,
	"success": False,
	"error": str(e),
	"content": "",
	"status_code": 0
	})

	return results


	def extract_links(url: str) -> Dict[str, Any]:
	"""
	Extract all links from a web page.

	Args:
	url: URL to extract links from

	Returns:
	Dictionary with extracted links
	"""
	try:
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin

	response = requests.get(url, timeout=30)
	response.raise_for_status()

	soup = BeautifulSoup(response.text, 'html.parser')

	links = []
	for link in soup.find_all('a', href=True):
	absolute_url = urljoin(url, link['href'])
	links.append({
	"text": link.get_text(strip=True),
	"href": absolute_url
	})

	return {
	"url": url,
	"total_links": len(links),
	"links": links
	}

	except Exception as e:
	logger.error(f"Error extracting links: {e}")
	raise