Upload tool
- app.py +6 -0
- requirements.txt +3 -0
- tool.py +88 -0
app.py
ADDED
@@ -0,0 +1,6 @@
from smolagents import launch_gradio_demo
from tool import SimpleTool

tool = SimpleTool()

launch_gradio_demo(tool)
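launch_gradio_demo builds the demo UI from the tool's declared inputs and output_type, so app.py needs no UI code of its own. For quick local checks the tool can also be exercised directly; a minimal sketch, assuming network access and using example.com as a placeholder URL:

from tool import SimpleTool

tool = SimpleTool()
# Tool instances are callable; keyword arguments are forwarded to forward().
print(tool(url="https://example.com", content_type="headers"))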
requirements.txt
ADDED
@@ -0,0 +1,3 @@
bs4
smolagents
requests
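Note: bs4 on PyPI is a thin shim that pulls in beautifulsoup4; depending on beautifulsoup4 directly is the more conventional spelling, though the shim works.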
tool.py
ADDED
@@ -0,0 +1,88 @@
from smolagents import Tool
from typing import Any, Optional

class SimpleTool(Tool):
    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {"url": {"type": "string", "description": "The webpage URL to scrape."}, "content_type": {"type": "string", "nullable": True, "description": "Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'."}}
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape.
            content_type: Type of content to extract ('all', 'text', 'links', 'headers').
                Defaults to 'all'.

        Returns:
            str: Extracted and processed content from the webpage.
        """
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import re

        try:
            # Validate URL
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch webpage
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Parse content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove scripts and styles
            for tag in soup(['script', 'style']):
                tag.decompose()

            # Handle different content types
            if content_type == "text":
                text = soup.get_text()
                text = re.sub(r'\s+', ' ', text).strip()
                return f"Text Content:\n{text[:2000]}..."

            elif content_type == "links":
                links = []
                for link in soup.find_all('a', href=True):
                    if link.text.strip() and link['href'].startswith(('http', 'https')):
                        text = re.sub(r'\s+', ' ', link.text).strip()
                        links.append(f"- {text}: {link['href']}")
                return "Found Links:\n" + "\n".join(links[:10])

            elif content_type == "headers":
                headers = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = re.sub(r'\s+', ' ', h.text).strip()
                    if text:
                        headers.append(f"- {text}")
                return "Page Headers:\n" + "\n".join(headers)

            else:
                # Get basic info
                title = soup.title.string if soup.title else "No title found"
                title = re.sub(r'\s+', ' ', title).strip() if title else "No title found"

                # Get text content
                text = soup.get_text()
                text = re.sub(r'\s+', ' ', text).strip()

                # Format output
                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + "..."
                ]

                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            return f"Error processing webpage: {str(e)}"
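If the tool is meant to be driven by an agent rather than the hosted demo, wiring it into a smolagents agent looks roughly like the sketch below. The model class is an assumption (InferenceClientModel is one option in recent smolagents releases); this file only defines the tool and does not pick a model:

from smolagents import CodeAgent, InferenceClientModel
from tool import SimpleTool

# Hypothetical wiring: hand the tool to a CodeAgent so the agent can call
# extract_web_content while answering a query.
agent = CodeAgent(tools=[SimpleTool()], model=InferenceClientModel())
agent.run("List the top-level headers of https://example.com")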