# QuickAgent / ValidationAgent / tools / SearchAndScrape.py
# (uploaded by varun324242, commit eceb45a)
from agency_swarm.tools import BaseTool
from pydantic import Field
import logging
import os
import time
try:
from firecrawl import FirecrawlApp
except ImportError:
raise ImportError(
"Required packages not found. Please install them using:\n"
"pip install firecrawl"
)
# Initialize Firecrawl.
# Prefer the FIRECRAWL_API_KEY environment variable; the literal key is kept
# only as a backward-compatible fallback. NOTE(review): a credential committed
# to source control is exposed -- it should be rotated and the fallback removed.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY", "fc-5fadfeae30314d4ea8a3d9afaa75c493")
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
class SearchAndScrape(BaseTool):
    """
    Scrape web content with Firecrawl based on a provided query.

    Walks a list of candidate URLs, skips domains known to block or garble
    scraping, and returns the first page whose markdown content is
    substantial enough to be useful.
    """
    query: str = Field(
        ...,
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"]
    )

    def scrape_with_retry(self, url, max_retries=3):
        """Scrape a single URL with retry logic.

        Args:
            url: The URL to scrape.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The page's markdown content, or None when the URL is on the
            skip list or no attempt yields enough usable content.
        """
        # Domains that block scrapers or return unusable content; '.pdf'
        # also filters direct PDF links.
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]
        if any(domain in url.lower() for domain in problematic_domains):
            logging.info(f"Skipping problematic URL: {url}")
            return None
        for attempt in range(max_retries):
            try:
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )
                content = response.get('markdown') if response else None
                # Require a minimum of substance: boilerplate-only pages
                # (cookie walls, error stubs) fall under this threshold.
                if content and len(content.strip()) > 200:
                    return content
                # Empty/short responses are often transient (rate limits,
                # partial renders), so fall through and retry instead of
                # giving up on the first thin result.
                logging.warning(
                    f"Attempt {attempt + 1} returned insufficient content for {url}"
                )
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # brief backoff before the next attempt
        return None

    def run(self):
        """Scrape the first usable result for self.query and return its content.

        Returns:
            A string with the scraped markdown prefixed by its source URL,
            or a human-readable failure message when nothing could be scraped.
        """
        logging.info(f"Scraping content for query: {self.query}")
        # Here you would typically have a list of URLs to scrape based on the query.
        # For this example, we will assume a predefined list of URLs.
        search_results = ["http://example.com/article1", "http://example.com/article2"]  # Placeholder URLs
        if not search_results:
            return "No search results found."
        for url in search_results:
            logging.info(f"Attempting to scrape URL: {url}")
            content = self.scrape_with_retry(url)
            if content:
                logging.info(f"Successfully scraped content from {url}")
                return f"Content from {url}:\n\n{content}"
        return "Failed to scrape content from any of the search results"