Spaces:

varun324242
/

agent1

Runtime error

App Files Files Community

agent1 / CompetitorTrackingAgent /tools /WebScrapingTool.py

varun324242

Upload folder using huggingface_hub

06ba83e verified over 1 year ago

raw

history blame contribute delete

2.15 kB

	from typing import Optional

	from agency_swarm.tools import BaseTool
	from pydantic import Field
	import requests
	from bs4 import BeautifulSoup

	class WebScrapingTool(BaseTool):
	    """
	    Fetch a web page and extract the text of the elements matching a tag.

	    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
	    built-in ``html.parser``. Matching can optionally be narrowed to elements
	    that carry a given attribute, or a given attribute/value pair.
	    """

	    url: str = Field(
	        ..., description="The URL of the web page to scrape."
	    )
	    tag: str = Field(
	        ..., description="The HTML tag to search for in the web page."
	    )
	    # Optional[...] keeps the annotation consistent with the None default, so
	    # an explicit None (not just an omitted argument) is a valid value.
	    attribute: Optional[str] = Field(
	        None, description="The attribute of the HTML tag to filter by, if any."
	    )
	    attribute_value: Optional[str] = Field(
	        None, description="The value of the attribute to filter by, if any."
	    )

	    def run(self):
	        """
	        Download ``self.url`` and return the text of every matching element.

	        Filtering rules:
	          * ``attribute`` and ``attribute_value`` both set: only ``tag``
	            elements whose attribute matches the value.
	          * only ``attribute`` set: ``tag`` elements that have the attribute,
	            whatever its value.
	          * otherwise: every ``tag`` element.

	        Returns:
	            A list of stripped text strings, one per matching element (empty
	            when nothing matches). On failure a human-readable error string
	            is returned instead of raising, so the calling agent always gets
	            a result it can display.
	        """
	        # requests has no default timeout; without one an unresponsive server
	        # would block the tool indefinitely.
	        timeout_seconds = 10

	        try:
	            response = requests.get(self.url, timeout=timeout_seconds)
	            response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

	            # Parse the raw bytes so BeautifulSoup can detect the encoding itself
	            soup = BeautifulSoup(response.content, 'html.parser')

	            if self.attribute and self.attribute_value:
	                elements = soup.find_all(
	                    self.tag, attrs={self.attribute: self.attribute_value}
	                )
	            elif self.attribute:
	                # A value of True matches any element that has the attribute,
	                # so an attribute-only filter is honoured rather than ignored.
	                elements = soup.find_all(self.tag, attrs={self.attribute: True})
	            else:
	                elements = soup.find_all(self.tag)

	            # Extract the whitespace-stripped text content of each element
	            return [element.get_text(strip=True) for element in elements]

	        except requests.RequestException as e:
	            return f"An error occurred while fetching the web page: {e}"
	        except Exception as e:
	            # Deliberately broad: this is the tool boundary, and any failure
	            # is reported back to the caller as text instead of propagating.
	            return f"An error occurred during parsing or extraction: {e}"

	if __name__ == "__main__":
	    # Manual smoke test: scrape the <p class="content"> elements of
	    # example.com and print whatever the tool returns.
	    demo_arguments = {
	        "url": "https://example.com",
	        "tag": "p",
	        "attribute": "class",
	        "attribute_value": "content",
	    }
	    scraper = WebScrapingTool(**demo_arguments)
	    scraped_output = scraper.run()
	    print(scraped_output)