Spaces:
Sleeping
Sleeping
| """ | |
| Web Fetcher Tool - Fetch and extract content from web pages | |
| """ | |
| import logging | |
| from typing import Dict, Any | |
| import sys | |
| import os | |
| # Add parent directory to path for imports | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from utils.helpers import validate_url, clean_text, format_timestamp | |
| logger = logging.getLogger(__name__) | |
| def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]: | |
| """ | |
| Fetch content from a web URL. | |
| Args: | |
| url: URL to fetch | |
| extract_text_only: If True, extract only text content; if False, return HTML | |
| timeout: Request timeout in seconds | |
| Returns: | |
| Dictionary containing fetched content, status code, and metadata | |
| """ | |
| try: | |
| import requests | |
| from bs4 import BeautifulSoup | |
| # Validate URL | |
| if not validate_url(url): | |
| raise ValueError(f"Invalid URL format: {url}") | |
| # Set headers to mimic a browser | |
| headers = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | |
| } | |
| # Fetch content | |
| response = requests.get(url, headers=headers, timeout=timeout) | |
| response.raise_for_status() | |
| content = "" | |
| content_type = response.headers.get('Content-Type', '') | |
| if extract_text_only and 'text/html' in content_type: | |
| # Parse HTML and extract text | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Extract title | |
| title = soup.title.string if soup.title else "No title" | |
| # Extract links | |
| links = [] | |
| for link in soup.find_all('a', href=True): | |
| href = link.get('href', '') | |
| if href and not href.startswith('#'): | |
| links.append(href) | |
| # Remove script and style elements | |
| for script in soup(["script", "style", "nav", "footer", "header"]): | |
| script.decompose() | |
| # Get text | |
| text = soup.get_text() | |
| # Clean up text | |
| lines = (line.strip() for line in text.splitlines()) | |
| chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) | |
| content = '\n'.join(chunk for chunk in chunks if chunk) | |
| # Further clean | |
| content = clean_text(content) | |
| else: | |
| # Return raw content | |
| content = response.text | |
| title = "N/A (non-HTML content)" | |
| links = [] | |
| # Build metadata | |
| metadata = { | |
| "url": url, | |
| "status_code": response.status_code, | |
| "content_type": content_type, | |
| "content_length": len(content), | |
| "encoding": response.encoding, | |
| "timestamp": format_timestamp(), | |
| "headers": dict(response.headers) | |
| } | |
| return { | |
| "content": content, | |
| "status_code": response.status_code, | |
| "title": title, | |
| "links": links, | |
| "metadata": metadata | |
| } | |
| except requests.exceptions.RequestException as e: | |
| logger.error(f"Request error fetching {url}: {e}") | |
| raise | |
| except Exception as e: | |
| logger.error(f"Error fetching web content: {e}") | |
| raise | |
| def fetch_multiple_urls(urls: list, extract_text_only: bool = True) -> list: | |
| """ | |
| Fetch content from multiple URLs. | |
| Args: | |
| urls: List of URLs to fetch | |
| extract_text_only: Whether to extract text only | |
| Returns: | |
| List of results for each URL | |
| """ | |
| results = [] | |
| for idx, url in enumerate(urls): | |
| try: | |
| result = fetch_web_content(url, extract_text_only) | |
| result["index"] = idx | |
| result["success"] = True | |
| results.append(result) | |
| except Exception as e: | |
| logger.error(f"Error fetching URL at index {idx} ({url}): {e}") | |
| results.append({ | |
| "index": idx, | |
| "url": url, | |
| "success": False, | |
| "error": str(e), | |
| "content": "", | |
| "status_code": 0 | |
| }) | |
| return results | |
| def extract_links(url: str) -> Dict[str, Any]: | |
| """ | |
| Extract all links from a web page. | |
| Args: | |
| url: URL to extract links from | |
| Returns: | |
| Dictionary with extracted links | |
| """ | |
| try: | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin | |
| response = requests.get(url, timeout=30) | |
| response.raise_for_status() | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| links = [] | |
| for link in soup.find_all('a', href=True): | |
| absolute_url = urljoin(url, link['href']) | |
| links.append({ | |
| "text": link.get_text(strip=True), | |
| "href": absolute_url | |
| }) | |
| return { | |
| "url": url, | |
| "total_links": len(links), | |
| "links": links | |
| } | |
| except Exception as e: | |
| logger.error(f"Error extracting links: {e}") | |
| raise | |