# MissionControlMCP/tools/web_fetcher.py
"""
Web Fetcher Tool - Fetch and extract content from web pages
"""
import logging
from typing import Dict, Any
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.helpers import validate_url, clean_text, format_timestamp
logger = logging.getLogger(__name__)
def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]:
    """
    Fetch content from a web URL.

    Args:
        url: URL to fetch
        extract_text_only: If True, extract only text content; if False, return raw HTML
        timeout: Request timeout in seconds

    Returns:
        Dictionary containing fetched content, status code, title, links, and metadata
    """
    # Imported lazily so the module loads even when these optional dependencies
    # are missing; kept outside the try block so the requests-specific except
    # clause below never references an unbound name.
    import requests
    from bs4 import BeautifulSoup

    try:
        # Validate URL
        if not validate_url(url):
            raise ValueError(f"Invalid URL format: {url}")

        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Fetch content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        content = ""
        content_type = response.headers.get('Content-Type', '')

        if extract_text_only and 'text/html' in content_type:
            # Parse HTML and extract text
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (soup.title.string is None for an empty <title>)
            title = soup.title.string if soup.title and soup.title.string else "No title"

            # Extract links first, before boilerplate elements are removed
            links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if href and not href.startswith('#'):
                    links.append(href)

            # Remove elements that rarely carry page content
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()

            # Get text
            text = soup.get_text()

            # Clean up text: strip each line, then split on double spaces
            # (splitting on single spaces would put every word on its own line)
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            content = '\n'.join(chunk for chunk in chunks if chunk)

            # Further clean
            content = clean_text(content)
        else:
            # Return raw content
            content = response.text
            title = "N/A (non-HTML content)"
            links = []

        # Build metadata
        metadata = {
            "url": url,
            "status_code": response.status_code,
            "content_type": content_type,
            "content_length": len(content),
            "encoding": response.encoding,
            "timestamp": format_timestamp(),
            "headers": dict(response.headers)
        }

        return {
            "content": content,
            "status_code": response.status_code,
            "title": title,
            "links": links,
            "metadata": metadata
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error fetching web content: {e}")
        raise
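
# Example usage (a minimal sketch; https://example.com is a placeholder URL):
#
#   result = fetch_web_content("https://example.com", extract_text_only=True)
#   print(result["title"])                    # page <title>, or "No title"
#   print(result["metadata"]["status_code"])  # e.g. 200
#   print(result["content"][:200])            # first 200 chars of cleaned text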
def fetch_multiple_urls(urls: list, extract_text_only: bool = True) -> list:
    """
    Fetch content from multiple URLs.

    Args:
        urls: List of URLs to fetch
        extract_text_only: Whether to extract text only

    Returns:
        List of per-URL result dictionaries; failed fetches are recorded
        with success=False instead of aborting the batch
    """
    results = []
    for idx, url in enumerate(urls):
        try:
            result = fetch_web_content(url, extract_text_only)
            result["index"] = idx
            result["success"] = True
            results.append(result)
        except Exception as e:
            logger.error(f"Error fetching URL at index {idx} ({url}): {e}")
            results.append({
                "index": idx,
                "url": url,
                "success": False,
                "error": str(e),
                "content": "",
                "status_code": 0
            })
    return results
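
# Example usage (sketch): each result carries a "success" flag, so one bad URL
# does not abort the whole batch. The URLs below are placeholders.
#
#   batch = fetch_multiple_urls(["https://example.com", "not-a-url"])
#   for r in batch:
#       print(r["index"], r["success"], r.get("error", ""))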
def extract_links(url: str) -> Dict[str, Any]:
    """
    Extract all links from a web page.

    Args:
        url: URL to extract links from

    Returns:
        Dictionary with extracted links
    """
    try:
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the page URL
            absolute_url = urljoin(url, link['href'])
            links.append({
                "text": link.get_text(strip=True),
                "href": absolute_url
            })

        return {
            "url": url,
            "total_links": len(links),
            "links": links
        }

    except Exception as e:
        logger.error(f"Error extracting links: {e}")
        raise
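

# Minimal manual test harness, assuming network access; https://example.com is
# a placeholder target. Run this file directly to exercise extract_links.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo = extract_links("https://example.com")
    print(f"Found {demo['total_links']} links on {demo['url']}")
    for item in demo["links"][:10]:  # show at most the first 10 links
        print(f"  {item['text'][:40]!r} -> {item['href']}")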