MHamdan commited on
Commit
3058953
·
verified ·
1 Parent(s): 12e370c

Upload tool

Browse files
Files changed (3) hide show
  1. app.py +6 -0
  2. requirements.txt +3 -0
  3. tool.py +88 -0
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from smolagents import launch_gradio_demo
from tool import SimpleTool

# Entry point: wrap the web-content extraction tool in a Gradio demo UI.
launch_gradio_demo(SimpleTool())
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ beautifulsoup4
2
+ smolagents
3
+ requests
tool.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from smolagents import Tool
from typing import Any, Optional


class SimpleTool(Tool):
    """smolagents tool that fetches a webpage and extracts text, links,
    headers, or a combined summary from it."""

    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {"url":{"type":"string","description":"The webpage URL to scrape."},"content_type":{"type":"string","nullable":True,"description":"Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'."}}
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape.
            content_type: Type of content to extract ('all', 'text', 'links', 'headers').
                Defaults to 'all'.  Any unrecognized value (including None)
                falls through to the 'all' summary.

        Returns:
            str: Extracted and processed content from the webpage, or an
            "Error ..." string on invalid input / network / parse failure
            (the tool never raises — errors are reported as strings).
        """
        # Imports are local so the tool module stays importable even when
        # the optional scraping dependencies are absent.
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import re

        # Compile once; used to collapse runs of whitespace in every branch.
        whitespace = re.compile(r'\s+')

        try:
            # Validate URL: require both a scheme and a host.
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch webpage; a browser-like User-Agent avoids trivial bot blocks.
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            # Parse content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove scripts and styles so get_text() returns visible text only.
            for tag in soup(['script', 'style']):
                tag.decompose()

            # Handle different content types
            if content_type == "text":
                text = whitespace.sub(' ', soup.get_text()).strip()
                # Truncate to keep the tool output a manageable size.
                return f"Text Content:\n{text[:2000]}..."

            elif content_type == "links":
                links = []
                for link in soup.find_all('a', href=True):
                    # Only absolute http(s) links; 'https' was redundant next
                    # to 'http' in a startswith check, and a bare 'http'
                    # prefix would also match e.g. 'httpfoo:' schemes.
                    if link.text.strip() and link['href'].startswith(('http://', 'https://')):
                        text = whitespace.sub(' ', link.text).strip()
                        links.append(f"- {text}: {link['href']}")
                return "Found Links:\n" + "\n".join(links[:10])

            elif content_type == "headers":
                # Named to avoid shadowing the HTTP request headers above.
                heading_lines = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = whitespace.sub(' ', h.text).strip()
                    if text:
                        heading_lines.append(f"- {text}")
                return "Page Headers:\n" + "\n".join(heading_lines)

            else:
                # 'all' (and any unrecognized value): title + text preview.
                # soup.title.string can itself be None, hence the second check.
                title = soup.title.string if soup.title else "No title found"
                title = whitespace.sub(' ', title).strip() if title else "No title found"

                # Get text content
                text = whitespace.sub(' ', soup.get_text()).strip()

                # Format output
                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + "..."
                ]

                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            # Network-level failure (DNS, timeout, non-2xx via raise_for_status).
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            # Last-resort guard: the tool contract is to return error strings.
            return f"Error processing webpage: {str(e)}"