from smolagents import Tool
from typing import Any, Optional


class SimpleTool(Tool):
    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {"url":{"type":"string","description":"The webpage URL to scrape."},"content_type":{"type":"string","nullable":True,"description":"Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'."}}
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape.
            content_type: Type of content to extract ('all', 'text', 'links',
                'headers'). Case-insensitive; None or unknown values are
                treated as 'all'. Defaults to 'all'.

        Returns:
            str: Extracted and processed content from the webpage, or a
            human-readable error message string on failure.
        """
        from urllib.parse import urlparse

        # Validate the URL *before* importing the heavy third-party
        # dependencies so malformed input fails fast and cheaply.
        try:
            parsed_url = urlparse(url)
        except Exception as e:
            return f"Error processing webpage: {str(e)}"
        if not all([parsed_url.scheme, parsed_url.netloc]):
            return "Error: Invalid URL format. Please provide a valid URL."

        import re

        import requests
        from bs4 import BeautifulSoup

        try:
            # Fetch the webpage with a browser-like User-Agent; many sites
            # reject requests that carry no User-Agent at all.
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop script/style tags so get_text() returns only visible text.
            for tag in soup(['script', 'style']):
                tag.decompose()

            # Normalize the mode: the input is declared nullable, and a
            # caller passing 'Text' clearly means 'text'. Unknown values
            # fall through to the 'all' branch, as before.
            mode = (content_type or "all").lower()

            if mode == "text":
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                # Only append an ellipsis when the text was truncated.
                suffix = "..." if len(text) > 2000 else ""
                return f"Text Content:\n{text[:2000]}{suffix}"

            elif mode == "links":
                links = []
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Require a real absolute http(s) URL — the bare
                    # prefix 'http' would also match e.g. 'httpx://'.
                    if link.text.strip() and href.startswith(('http://', 'https://')):
                        text = re.sub(r'\s+', ' ', link.text).strip()
                        links.append(f"- {text}: {href}")
                if not links:
                    return "Found Links:\nNo links found."
                return "Found Links:\n" + "\n".join(links[:10])

            elif mode == "headers":
                # Named found_headers (not 'headers') to avoid shadowing
                # the HTTP request headers dict above.
                found_headers = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = re.sub(r'\s+', ' ', h.text).strip()
                    if text:
                        found_headers.append(f"- {text}")
                if not found_headers:
                    return "Page Headers:\nNo headers found."
                return "Page Headers:\n" + "\n".join(found_headers)

            else:
                # Default ('all'): page title plus a short text preview.
                title = soup.title.string if soup.title else "No title found"
                # soup.title.string is None for an empty <title> element.
                title = re.sub(r'\s+', ' ', title).strip() if title else "No title found"

                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                # Only append an ellipsis when the preview was truncated.
                suffix = "..." if len(text) > 1000 else ""

                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + suffix,
                ]
                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            return f"Error processing webpage: {str(e)}"