File size: 2,147 Bytes
06ba83e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from typing import ClassVar, Optional

import requests
from agency_swarm.tools import BaseTool
from bs4 import BeautifulSoup
from pydantic import Field

class WebScrapingTool(BaseTool):
    """
    A tool for performing web scraping tasks using BeautifulSoup.

    The tool fetches a web page over HTTP, parses the HTML content, and
    returns the text of every element matching the requested tag, optionally
    narrowed to elements whose attribute equals a given value.
    """

    # Upper bound, in seconds, on how long the HTTP request may wait on the
    # server. requests applies no timeout unless one is passed explicitly, so
    # without this an unresponsive host would block the tool indefinitely.
    # Declared as a ClassVar so it is not treated as a tool input field.
    REQUEST_TIMEOUT_SECONDS: ClassVar[float] = 10.0

    url: str = Field(
        ..., description="The URL of the web page to scrape."
    )
    tag: str = Field(
        ..., description="The HTML tag to search for in the web page."
    )
    # Both filter fields default to None, so they are typed Optional: an
    # explicit None/null argument must validate just like an omitted one.
    attribute: Optional[str] = Field(
        None, description="The attribute of the HTML tag to filter by, if any."
    )
    attribute_value: Optional[str] = Field(
        None, description="The value of the attribute to filter by, if any."
    )

    def run(self):
        """
        Fetch the page at ``url`` and extract the text of the matching elements.

        The attribute filter is applied only when both ``attribute`` and
        ``attribute_value`` are set to non-empty values; otherwise every
        ``tag`` element on the page is matched.

        Returns:
            A list with the whitespace-stripped text of each matching element
            (empty when nothing matches). On failure, a string describing the
            error is returned instead of raising.
        """
        try:
            # Fetch the page; the timeout keeps a stalled server from hanging
            # the call forever. A timeout surfaces as a RequestException.
            response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()  # Turn 4xx/5xx responses into errors

            # Parse the raw bytes so BeautifulSoup can detect the encoding
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all elements matching the specified tag and attribute
            if self.attribute and self.attribute_value:
                elements = soup.find_all(self.tag, {self.attribute: self.attribute_value})
            else:
                elements = soup.find_all(self.tag)

            # Extract and return the text content of the found elements
            return [element.get_text(strip=True) for element in elements]

        except requests.RequestException as e:
            return f"An error occurred while fetching the web page: {e}"
        except Exception as e:
            # Tool boundary: report unexpected failures to the caller as text
            # rather than propagating the exception.
            return f"An error occurred during parsing or extraction: {e}"

if __name__ == "__main__":
    # Manual smoke test: scrape the paragraphs with class "content" from
    # example.com and print whatever the tool returns.
    demo_arguments = {
        "url": "https://example.com",
        "tag": "p",
        "attribute": "class",
        "attribute_value": "content",
    }
    scraper = WebScrapingTool(**demo_arguments)
    scrape_result = scraper.run()
    print(scrape_result)