from agency_swarm.tools import BaseTool
from pydantic import Field
import logging
import os
import time

try:
    from firecrawl import FirecrawlApp
except ImportError:
    raise ImportError(
        "Required packages not found. Please install them using:\n"
        "pip install firecrawl"
    )

# Initialize Firecrawl.
# Read the API key from the environment instead of hard-coding it.
# NOTE(review): the previous revision embedded a live key in source control;
# that key should be rotated. The hard-coded fallback below only preserves
# backward compatibility until FIRECRAWL_API_KEY is provisioned in the
# environment — remove it once the env var is in place.
FIRECRAWL_API_KEY = os.getenv(
    "FIRECRAWL_API_KEY", "fc-5fadfeae30314d4ea8a3d9afaa75c493"
)
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)


class SearchAndScrape(BaseTool):
    """
    This tool scrapes content using Firecrawl based on a provided query.

    It walks a list of candidate URLs, skips domains known to block
    scrapers, and returns the first page whose markdown body is long
    enough to be useful.
    """

    query: str = Field(
        ...,
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"],
    )

    def scrape_with_retry(self, url, max_retries=3):
        """Helper function to scrape URL with retry logic.

        Args:
            url: The URL to scrape.
            max_retries: Maximum number of scrape attempts before giving up.

        Returns:
            The page's markdown content when it is longer than 200
            characters after stripping; otherwise ``None`` (skipped
            domain, thin/empty page, or all attempts failed).
        """
        # Domains (and the .pdf extension) that routinely block or break the
        # scraper; matched as substrings against the lower-cased URL.
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]
        if any(domain in url.lower() for domain in problematic_domains):
            logging.info(f"Skipping problematic URL: {url}")
            return None

        for attempt in range(max_retries):
            try:
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )
                if response and response.get('markdown'):
                    content = response.get('markdown')
                    # Pages shorter than ~200 chars are usually error or
                    # consent pages rather than real content.
                    if len(content.strip()) > 200:
                        return content
                return None
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # brief backoff before the next attempt
                    continue
        return None

    def run(self):
        """Return the first successfully scraped page for ``self.query``."""
        logging.info(f"Scraping content for query: {self.query}")
        # Here you would typically have a list of URLs to scrape based on
        # the query. For this example, we will assume a predefined list of
        # URLs.
        search_results = [
            "http://example.com/article1",
            "http://example.com/article2",
        ]  # Placeholder URLs
        if not search_results:
            return "No search results found."
        for url in search_results:
            logging.info(f"Attempting to scrape URL: {url}")
            content = self.scrape_with_retry(url)
            if content:
                logging.info(f"Successfully scraped content from {url}")
                return f"Content from {url}:\n\n{content}"
        return "Failed to scrape content from any of the search results"