Spaces:

debashis2007
/

ask-the-web-agent

Running

App Files Files Community

ask-the-web-agent / src /tools /web_scraper.py

debashis2007

Upload folder using huggingface_hub

75bea1c verified about 2 months ago

raw

history blame contribute delete

6.22 kB

	"""Web scraper tool implementation."""

	from __future__ import annotations

	from dataclasses import dataclass, field
	from typing import Any
	from urllib.parse import urlparse

	import httpx

	from src.tools.base import Tool, ToolParameter, ToolResult
	from src.utils.exceptions import ScrapingError
	from src.utils.logging import get_logger

	# Module-level logger obtained from the project's logging helper, keyed by
	# this module's ``__name__``.
	logger = get_logger(__name__)


	@dataclass
	class WebScraperTool(Tool):
	    """Tool for extracting readable content from a web page.

	    The page is fetched with ``httpx`` and reduced to a title plus text using
	    one of three strategies selected by ``extract_type``:

	    * ``"full"``    - all remaining text of the page, via BeautifulSoup.
	    * ``"article"`` - main article body, via trafilatura (falls back to the
	      ``"full"`` strategy when trafilatura is not installed).
	    * ``"summary"`` - the article text truncated to about 500 words.
	    """

	    name: str = "web_scrape"
	    description: str = "Extract and parse content from a web page URL. Useful for getting detailed information from a specific page."
	    parameters: list[ToolParameter] = field(default_factory=lambda: [
	        ToolParameter(
	            name="url",
	            type="string",
	            description="The full URL to scrape",
	            required=True,
	        ),
	        ToolParameter(
	            name="extract_type",
	            type="string",
	            description="Type of content extraction: 'full' for all content, 'article' for main article, 'summary' for brief summary",
	            required=False,
	            default="article",
	            enum=["full", "article", "summary"],
	        ),
	    ])

	    # Deliberately unannotated: plain class attributes are not picked up by
	    # ``@dataclass`` as constructor fields.
	    _EXTRACT_TYPES = ("full", "article", "summary")
	    _DEFAULT_EXTRACT_TYPE = "article"
	    _ALLOWED_SCHEMES = ("http", "https")
	    _SUMMARY_WORD_LIMIT = 500

	    async def execute(self, **kwargs: Any) -> ToolResult:
	        """Scrape content from a URL.

	        Args:
	            url: Absolute ``http``/``https`` URL to scrape. Surrounding
	                whitespace is ignored.
	            extract_type: Type of extraction (full/article/summary). A
	                missing, empty or unrecognised value falls back to
	                ``"article"``.

	        Returns:
	            ``ToolResult.ok`` with ``url``, ``title``, ``text``,
	            ``extract_type`` (the mode actually applied) and ``word_count``,
	            or ``ToolResult.fail`` with an error message.
	        """
	        url = kwargs.get("url", "")
	        if not url:
	            return ToolResult.fail("URL cannot be empty")
	        if not isinstance(url, str):
	            return ToolResult.fail(f"Invalid URL format: {url}")

	        # Tool arguments frequently arrive with stray whitespace/newlines.
	        url = url.strip()
	        if not url:
	            return ToolResult.fail("URL cannot be empty")

	        # Normalise the extraction mode so the result reports the mode that
	        # was really used rather than echoing an unknown value back.
	        extract_type = kwargs.get("extract_type") or self._DEFAULT_EXTRACT_TYPE
	        if extract_type not in self._EXTRACT_TYPES:
	            logger.warning(
	                f"Unknown extract_type {extract_type!r}, "
	                f"falling back to {self._DEFAULT_EXTRACT_TYPE!r}"
	            )
	            extract_type = self._DEFAULT_EXTRACT_TYPE

	        # Validate URL
	        try:
	            parsed = urlparse(url)
	        except ValueError:
	            # urlparse rejects e.g. malformed IPv6 literals.
	            return ToolResult.fail(f"Invalid URL format: {url}")

	        if not parsed.scheme or not parsed.netloc:
	            return ToolResult.fail(f"Invalid URL: {url}")
	        if parsed.scheme not in self._ALLOWED_SCHEMES:
	            # Reject up front instead of failing later inside the HTTP client.
	            return ToolResult.fail(
	                f"Unsupported URL scheme '{parsed.scheme}': only http and https are supported"
	            )

	        try:
	            content = await self._scrape_url(url, extract_type)
	        except ScrapingError as e:
	            return ToolResult.fail(str(e))
	        except Exception as e:
	            # Boundary handler: a tool call reports failure instead of raising.
	            logger.error(f"Scraping failed for {url}: {e}")
	            return ToolResult.fail(f"Scraping failed: {e}")

	        text = content.get("text", "")
	        if not text:
	            return ToolResult.fail(f"No content extracted from: {url}")

	        return ToolResult.ok({
	            "url": url,
	            "title": content.get("title", ""),
	            "text": text,
	            "extract_type": extract_type,
	            "word_count": len(text.split()),
	        })

	    async def _scrape_url(self, url: str, extract_type: str) -> dict[str, str]:
	        """Fetch a page and extract its content.

	        Args:
	            url: URL to scrape
	            extract_type: Type of extraction; any value other than ``"full"``
	                or ``"summary"`` is handled as ``"article"``.

	        Returns:
	            Dictionary with title and text

	        Raises:
	            httpx.HTTPStatusError: If ``raise_for_status`` rejects the
	                response. Other httpx errors from the request propagate too.
	            ScrapingError: If a required parsing package is missing.
	        """
	        # Fetch the page
	        async with httpx.AsyncClient() as client:
	            response = await client.get(
	                url,
	                follow_redirects=True,
	                timeout=30.0,
	                headers={
	                    "User-Agent": "Mozilla/5.0 (compatible; AskTheWebAgent/1.0)",
	                },
	            )
	            response.raise_for_status()
	            html = response.text

	        # Extract content based on type
	        if extract_type == "full":
	            return self._extract_full(html)
	        if extract_type == "summary":
	            return self._extract_summary(html)
	        return self._extract_article(html)

	    def _extract_full(self, html: str) -> dict[str, str]:
	        """Extract all text content from HTML.

	        Args:
	            html: HTML content

	        Returns:
	            Dictionary with title and full text

	        Raises:
	            ScrapingError: If beautifulsoup4 is not installed.
	        """
	        try:
	            from bs4 import BeautifulSoup
	        except ImportError as exc:
	            raise ScrapingError(
	                "beautifulsoup4 package required. Install with: pip install beautifulsoup4"
	            ) from exc

	        soup = BeautifulSoup(html, "html.parser")

	        # Remove script/style plus nav, footer and header elements
	        for element in soup(["script", "style", "nav", "footer", "header"]):
	            element.decompose()

	        # Get title; ``.string`` may be None, and ``strip()`` yields a plain
	        # ``str`` instead of a NavigableString tied to the parse tree.
	        title = ""
	        if soup.title:
	            title = (soup.title.string or "").strip()

	        # Get text
	        raw_text = soup.get_text(separator="\n", strip=True)

	        # Clean up whitespace: drop blank lines and trim each remaining line
	        lines = [line.strip() for line in raw_text.splitlines() if line.strip()]

	        return {"title": title, "text": "\n".join(lines)}

	    def _extract_article(self, html: str) -> dict[str, str]:
	        """Extract main article content using trafilatura.

	        Falls back to ``_extract_full`` when trafilatura is not installed.

	        Args:
	            html: HTML content

	        Returns:
	            Dictionary with title and article text; either value is an empty
	            string when trafilatura returns nothing for it.
	        """
	        try:
	            import trafilatura
	        except ImportError:
	            # Fall back to full extraction
	            logger.warning("trafilatura not installed, falling back to full extraction")
	            return self._extract_full(html)

	        # Extract with trafilatura
	        result = trafilatura.extract(
	            html,
	            include_comments=False,
	            include_tables=True,
	            no_fallback=False,
	        )

	        # Get metadata for title
	        metadata = trafilatura.extract_metadata(html)
	        title = metadata.title if metadata else ""

	        return {
	            "title": title or "",
	            "text": result or "",
	        }

	    def _extract_summary(self, html: str) -> dict[str, str]:
	        """Extract a brief summary from the page.

	        Args:
	            html: HTML content

	        Returns:
	            Dictionary with title and summary text. Text longer than the word
	            limit is cut to that many whitespace-separated words, joined by
	            single spaces and suffixed with ``"..."``; shorter text is
	            returned unchanged.
	        """
	        # Get full article first
	        content = self._extract_article(html)

	        # Truncate to the first ~500 words for the summary
	        limit = self._SUMMARY_WORD_LIMIT
	        words = content["text"].split()
	        if len(words) > limit:
	            content["text"] = " ".join(words[:limit]) + "..."

	        return content