Spaces:
Running
Running
| """Web scraper tool implementation.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| from urllib.parse import urlparse | |
| import httpx | |
| from src.tools.base import Tool, ToolParameter, ToolResult | |
| from src.utils.exceptions import ScrapingError | |
| from src.utils.logging import get_logger | |
| logger = get_logger(__name__) | |
class WebScraperTool(Tool):
    """Tool for extracting content from web pages.

    The page is fetched over HTTP(S) with ``httpx`` and its text is extracted
    in one of three modes: ``full`` (all visible text via BeautifulSoup),
    ``article`` (main content via trafilatura) or ``summary`` (the article
    text truncated to a word limit).
    """

    name: str = "web_scrape"
    description: str = "Extract and parse content from a web page URL. Useful for getting detailed information from a specific page."
    # NOTE(review): ``field(default_factory=...)`` only takes effect when this
    # class is processed as a dataclass. No ``@dataclass`` decorator is visible
    # here, so this presumably relies on machinery in ``Tool`` -- TODO confirm.
    parameters: list[ToolParameter] = field(default_factory=lambda: [
        ToolParameter(
            name="url",
            type="string",
            description="The full URL to scrape",
            required=True,
        ),
        ToolParameter(
            name="extract_type",
            type="string",
            description="Type of content extraction: 'full' for all content, 'article' for main article, 'summary' for brief summary",
            required=False,
            default="article",
            enum=["full", "article", "summary"],
        ),
    ])

    async def execute(self, **kwargs: Any) -> ToolResult:
        """Scrape content from a URL.

        Args:
            url: URL to scrape. Must be an absolute ``http``/``https`` URL.
            extract_type: Type of extraction (full/article/summary). A missing
                or ``None`` value means ``"article"``; any other unrecognised
                value is also extracted as an article.

        Returns:
            ToolResult with the scraped content (``url``, ``title``, ``text``,
            ``extract_type``, ``word_count``) on success, or a failed
            ToolResult describing why nothing could be scraped.
        """
        url = kwargs.get("url", "")
        # ``or`` (rather than a .get default) so an explicit None also
        # resolves to the documented default instead of leaking into the result.
        extract_type = kwargs.get("extract_type") or "article"

        if not url:
            return ToolResult.fail("URL cannot be empty")

        # Validate URL
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # urlparse raises on malformed input (e.g. a broken IPv6 literal)
            # and on values that are not str/bytes.
            return ToolResult.fail(f"Invalid URL format: {url}")

        # Only http(s) can be fetched by the HTTP client; rejecting other
        # schemes here gives a clear validation error instead of a transport
        # failure (and an error log) further down.
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return ToolResult.fail(f"Invalid URL: {url}")

        try:
            content = await self._scrape_url(url, extract_type)
            text = content.get("text", "")
            if not text:
                return ToolResult.fail(f"No content extracted from: {url}")
            return ToolResult.ok({
                "url": url,
                "title": content.get("title", ""),
                "text": text,
                "extract_type": extract_type,
                "word_count": len(text.split()),
            })
        except ScrapingError as e:
            # Expected, already-descriptive failure: report it without logging.
            return ToolResult.fail(str(e))
        except Exception as e:
            # Boundary handler: network/HTTP/parsing errors must not escape
            # the tool; they are logged and reported as a failed result.
            logger.error(f"Scraping failed for {url}: {e}")
            return ToolResult.fail(f"Scraping failed: {e}")

    async def _scrape_url(self, url: str, extract_type: str) -> dict[str, str]:
        """Fetch a page and extract its content.

        Args:
            url: URL to scrape
            extract_type: Type of extraction; ``"full"`` and ``"summary"`` are
                dispatched explicitly, everything else is handled as an article.

        Returns:
            Dictionary with ``title`` and ``text`` keys.

        Raises:
            httpx.HTTPStatusError: If the response has a 4xx/5xx status
                (raised by ``raise_for_status``).
            ScrapingError: If a required extraction dependency is missing.
        """
        # Fetch the page; the client is closed as soon as the body is read.
        async with httpx.AsyncClient() as client:
            response = await client.get(
                url,
                follow_redirects=True,
                timeout=30.0,
                headers={
                    "User-Agent": "Mozilla/5.0 (compatible; AskTheWebAgent/1.0)",
                },
            )
            response.raise_for_status()
            html = response.text

        # Extract content based on type
        if extract_type == "full":
            return self._extract_full(html)
        if extract_type == "summary":
            return self._extract_summary(html)
        # "article" and any unrecognised value
        return self._extract_article(html)

    def _extract_full(self, html: str) -> dict[str, str]:
        """Extract all text content from HTML.

        Args:
            html: HTML content

        Returns:
            Dictionary with title and full text (one non-empty line per
            text fragment).

        Raises:
            ScrapingError: If beautifulsoup4 is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as exc:
            raise ScrapingError(
                "beautifulsoup4 package required. Install with: pip install beautifulsoup4"
            ) from exc

        soup = BeautifulSoup(html, "html.parser")

        # Remove non-content elements: scripts/styles plus page chrome
        # (navigation, footer, header).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Get title; ``.string`` is None when <title> is empty or has children.
        title = ""
        if soup.title:
            title = soup.title.string or ""

        # Get text
        text = soup.get_text(separator="\n", strip=True)

        # Clean up whitespace: individual fragments may still contain embedded
        # newlines or blank lines, so normalise line by line.
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        text = "\n".join(lines)

        return {"title": title, "text": text}

    def _extract_article(self, html: str) -> dict[str, str]:
        """Extract main article content using trafilatura.

        Falls back to ``_extract_full`` (with a warning) when trafilatura is
        not installed.

        Args:
            html: HTML content

        Returns:
            Dictionary with title and article text; either value is an empty
            string when trafilatura returns nothing for it.
        """
        try:
            import trafilatura
        except ImportError:
            # Fall back to full extraction
            logger.warning("trafilatura not installed, falling back to full extraction")
            return self._extract_full(html)

        # Extract with trafilatura
        result = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        )

        # Get metadata for title
        metadata = trafilatura.extract_metadata(html)
        title = metadata.title if metadata else ""

        return {
            "title": title or "",
            "text": result or "",
        }

    def _extract_summary(self, html: str, max_words: int = 500) -> dict[str, str]:
        """Extract a brief summary from the page.

        The summary is the article text cut to the first ``max_words``
        whitespace-separated words, followed by ``"..."`` when truncated.
        Text at or under the limit is returned unchanged.

        Args:
            html: HTML content
            max_words: Maximum number of words to keep (default 500).

        Returns:
            Dictionary with title and summary text
        """
        # Get full article first
        content = self._extract_article(html)

        # Truncate to the first ``max_words`` words for the summary
        words = content["text"].split()
        if len(words) > max_words:
            content["text"] = " ".join(words[:max_words]) + "..."

        return content