"""
Web Fetcher Tool - Fetch and extract content from web pages
"""
import logging
from typing import Dict, Any
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.helpers import validate_url, clean_text, format_timestamp
logger = logging.getLogger(__name__)
def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]:
    """
    Fetch content from a web URL.

    Args:
        url: URL to fetch
        extract_text_only: If True, extract only text content; if False, return raw HTML
        timeout: Request timeout in seconds

    Returns:
        Dictionary containing fetched content, status code, and metadata
    """
    # Imported lazily so the module loads without these optional dependencies,
    # but kept outside the try block: if the import itself fails, the except
    # clause below would otherwise raise NameError referencing requests.exceptions.
    import requests
    from bs4 import BeautifulSoup

    try:
        # Validate URL
        if not validate_url(url):
            raise ValueError(f"Invalid URL format: {url}")

        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Fetch content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        content = ""
        content_type = response.headers.get('Content-Type', '')

        if extract_text_only and 'text/html' in content_type:
            # Parse HTML and extract text
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (soup.title.string can be None for an empty <title>)
            title = soup.title.string if soup.title and soup.title.string else "No title"

            # Extract links before stripping structural elements
            links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if href and not href.startswith('#'):
                    links.append(href)

            # Remove script, style, and page-chrome elements
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()

            # Get text
            text = soup.get_text()

            # Collapse whitespace: split on runs of two spaces, not single
            # spaces, which would put every word on its own line
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            content = '\n'.join(chunk for chunk in chunks if chunk)

            # Further clean
            content = clean_text(content)
        else:
            # Return raw content
            content = response.text
            title = "N/A (non-HTML content)"
            links = []

        # Build metadata
        metadata = {
            "url": url,
            "status_code": response.status_code,
            "content_type": content_type,
            "content_length": len(content),
            "encoding": response.encoding,
            "timestamp": format_timestamp(),
            "headers": dict(response.headers),
        }

        return {
            "content": content,
            "status_code": response.status_code,
            "title": title,
            "links": links,
            "metadata": metadata,
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error fetching web content: {e}")
        raise
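
# Usage sketch (illustrative; "https://example.com" stands in for a real
# target URL, and network access is assumed):
#
#     page = fetch_web_content("https://example.com", extract_text_only=True)
#     print(page["title"])
#     print(page["metadata"]["content_type"], page["metadata"]["content_length"])
#
# Failures propagate to the caller: requests.exceptions.RequestException for
# transport errors, ValueError for a malformed URL.
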
def fetch_multiple_urls(urls: list, extract_text_only: bool = True) -> list:
    """
    Fetch content from multiple URLs.

    Args:
        urls: List of URLs to fetch
        extract_text_only: Whether to extract text only

    Returns:
        List of results, one per URL, each tagged with its original index
    """
    results = []
    for idx, url in enumerate(urls):
        try:
            result = fetch_web_content(url, extract_text_only)
            result["index"] = idx
            result["success"] = True
            results.append(result)
        except Exception as e:
            # Record the failure but keep going, so one bad URL
            # doesn't abort the whole batch
            logger.error(f"Error fetching URL at index {idx} ({url}): {e}")
            results.append({
                "index": idx,
                "url": url,
                "success": False,
                "error": str(e),
                "content": "",
                "status_code": 0,
            })
    return results
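
# Usage sketch (hypothetical URLs, network access assumed):
#
#     results = fetch_multiple_urls(["https://example.com", "not-a-url"])
#     succeeded = [r for r in results if r["success"]]
#     failed = [r for r in results if not r["success"]]
#
# Each entry keeps its original index, so results can be matched back to the
# input list even after filtering.
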
def extract_links(url: str) -> Dict[str, Any]:
    """
    Extract all links from a web page.

    Args:
        url: URL to extract links from

    Returns:
        Dictionary with extracted links
    """
    # Lazy imports, kept outside the try block so an ImportError isn't
    # logged as a link-extraction error
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the page URL
            absolute_url = urljoin(url, link['href'])
            links.append({
                "text": link.get_text(strip=True),
                "href": absolute_url,
            })

        return {
            "url": url,
            "total_links": len(links),
            "links": links,
        }

    except Exception as e:
        logger.error(f"Error extracting links: {e}")
        raise
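
if __name__ == "__main__":
    # Minimal smoke test: a sketch, not part of the tool's public surface.
    # Assumes network access; https://example.com is an illustrative target.
    logging.basicConfig(level=logging.INFO)

    page = fetch_web_content("https://example.com")
    print(f"Title: {page['title']}")
    print(f"Status: {page['status_code']}, extracted {page['metadata']['content_length']} characters")

    link_report = extract_links("https://example.com")
    print(f"Found {link_report['total_links']} links")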