# QuickAgent / ValidationAgent / tools / SearchAndScrape.py
# (uploaded by varun324242, commit eceb45a)
from agency_swarm.tools import BaseTool
from pydantic import Field
import logging
import os
import time
try:
from firecrawl import FirecrawlApp
except ImportError:
raise ImportError(
"Required packages not found. Please install them using:\n"
"pip install firecrawl"
)
# Initialize Firecrawl.
# Prefer the FIRECRAWL_API_KEY environment variable; the literal key is kept
# only as a backward-compatible fallback. NOTE(review): a credential committed
# to source control is exposed -- it should be rotated and the fallback removed.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY", "fc-5fadfeae30314d4ea8a3d9afaa75c493")
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
class SearchAndScrape(BaseTool):
    """
    Scrape web content with Firecrawl based on a provided query.

    Walks a list of candidate URLs, skips domains known to block or garble
    scraping, and returns the first page whose markdown content is
    substantial enough to be useful.
    """
    query: str = Field(
        ...,
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"]
    )

    def scrape_with_retry(self, url, max_retries=3):
        """Scrape a single URL with retry logic.

        Args:
            url: The URL to scrape.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The page's markdown content, or None when the URL is on the
            skip list or no attempt yields enough usable content.
        """
        # Domains that block scrapers or return unusable content; '.pdf'
        # also filters direct PDF links.
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]
        if any(domain in url.lower() for domain in problematic_domains):
            logging.info(f"Skipping problematic URL: {url}")
            return None
        for attempt in range(max_retries):
            try:
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )
                content = response.get('markdown') if response else None
                # Require a minimum of substance: boilerplate-only pages
                # (cookie walls, error stubs) fall under this threshold.
                if content and len(content.strip()) > 200:
                    return content
                # Empty/short responses are often transient (rate limits,
                # partial renders), so fall through and retry instead of
                # giving up on the first thin result.
                logging.warning(
                    f"Attempt {attempt + 1} returned insufficient content for {url}"
                )
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # brief backoff before the next attempt
        return None

    def run(self):
        """Scrape the first usable result for self.query and return its content.

        Returns:
            A string with the scraped markdown prefixed by its source URL,
            or a human-readable failure message when nothing could be scraped.
        """
        logging.info(f"Scraping content for query: {self.query}")
        # Here you would typically have a list of URLs to scrape based on the query.
        # For this example, we will assume a predefined list of URLs.
        search_results = ["http://example.com/article1", "http://example.com/article2"]  # Placeholder URLs
        if not search_results:
            return "No search results found."
        for url in search_results:
            logging.info(f"Attempting to scrape URL: {url}")
            content = self.scrape_with_retry(url)
            if content:
                logging.info(f"Successfully scraped content from {url}")
                return f"Content from {url}:\n\n{content}"
        return "Failed to scrape content from any of the search results"