from typing import Optional

from smolagents import Tool


class SimpleTool(Tool):
    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {
        "url": {
            "type": "string",
            "description": "The webpage URL to scrape.",
        },
        "content_type": {
            "type": "string",
            "nullable": True,
            "description": "Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'.",
        },
    }
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape.
            content_type: Type of content to extract ('all', 'text', 'links', 'headers').
                Defaults to 'all'.

        Returns:
            str: Extracted and processed content from the webpage.
        """
        import re
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        try:
            # Validate that the URL has both a scheme and a host.
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch the webpage with a browser-like User-Agent and a timeout.
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            # Parse the HTML and drop script/style tags so they don't pollute the text.
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()

            # Handle the requested content type.
            if content_type == "text":
                # Collapse whitespace and return a preview of the page text.
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                return f"Text Content:\n{text[:2000]}..."

            elif content_type == "links":
                # Collect up to 10 absolute links with non-empty anchor text.
                # ('http' also matches 'https', so one prefix check suffices.)
                links = []
                for link in soup.find_all('a', href=True):
                    if link.text.strip() and link['href'].startswith('http'):
                        text = re.sub(r'\s+', ' ', link.text).strip()
                        links.append(f"- {text}: {link['href']}")
                return "Found Links:\n" + "\n".join(links[:10])

            elif content_type == "headers":
                # Collect non-empty h1-h3 headings in document order.
                headings = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = re.sub(r'\s+', ' ', h.text).strip()
                    if text:
                        headings.append(f"- {text}")
                return "Page Headers:\n" + "\n".join(headings)

            else:
                # Default ('all'): page title plus a short content preview.
                # soup.title.string can be None even when a <title> tag exists.
                title = soup.title.string if soup.title and soup.title.string else "No title found"
                title = re.sub(r'\s+', ' ', title).strip()
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + "...",
                ]
                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            return f"Error processing webpage: {str(e)}"