from typing import Optional

from smolagents import Tool


class SimpleTool(Tool):
    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {
        "url": {
            "type": "string",
            "description": "The webpage URL to scrape.",
        },
        "content_type": {
            "type": "string",
            "nullable": True,
            "description": "Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'.",
        },
    }
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape.
            content_type: Type of content to extract ('all', 'text', 'links', 'headers').
                Defaults to 'all'.

        Returns:
            str: Extracted and processed content from the webpage.
        """
        import re
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        try:
            # Validate that the URL has both a scheme and a host.
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch the webpage with a browser-like User-Agent and a timeout.
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            # Parse the HTML and drop script/style tags so they don't pollute the text.
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()

            # Handle the requested content type.
            if content_type == "text":
                # Collapse whitespace and return a preview of the page text.
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                return f"Text Content:\n{text[:2000]}..."

            elif content_type == "links":
                # Collect up to 10 absolute links with non-empty anchor text.
                # ('http' also matches 'https', so one prefix check suffices.)
                links = []
                for link in soup.find_all('a', href=True):
                    if link.text.strip() and link['href'].startswith('http'):
                        text = re.sub(r'\s+', ' ', link.text).strip()
                        links.append(f"- {text}: {link['href']}")
                return "Found Links:\n" + "\n".join(links[:10])

            elif content_type == "headers":
                # Collect non-empty h1-h3 headings in document order.
                headings = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = re.sub(r'\s+', ' ', h.text).strip()
                    if text:
                        headings.append(f"- {text}")
                return "Page Headers:\n" + "\n".join(headings)

            else:
                # Default ('all'): page title plus a short content preview.
                # soup.title.string can be None even when a <title> tag exists.
                title = soup.title.string if soup.title and soup.title.string else "No title found"
                title = re.sub(r'\s+', ' ', title).strip()
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + "...",
                ]
                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            return f"Error processing webpage: {str(e)}"