Spaces:
Configuration error
Configuration error
| from agency_swarm.tools import BaseTool | |
| from pydantic import Field | |
| import logging | |
| import os | |
| import time | |
try:
    from firecrawl import FirecrawlApp
except ImportError as err:
    # Re-raise with install instructions while preserving the original
    # exception as the cause (PEP 3134 chaining) for easier debugging.
    raise ImportError(
        "Required packages not found. Please install them using:\n"
        "pip install firecrawl"
    ) from err
# Initialize Firecrawl.
# SECURITY NOTE(review): the API key was previously hard-coded in source.
# Prefer supplying it via the FIRECRAWL_API_KEY environment variable; the
# literal below remains only as a backward-compatible fallback and should
# be rotated and removed from version control.
FIRECRAWL_API_KEY = os.getenv(
    "FIRECRAWL_API_KEY", "fc-5fadfeae30314d4ea8a3d9afaa75c493"
)
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
class SearchAndScrape(BaseTool):
    """
    Scrapes content using Firecrawl based on a provided query.

    Known-problematic domains (and PDF links) are skipped, transient scrape
    failures are retried, and the first page whose markdown content is long
    enough to be useful is returned.
    """

    query: str = Field(
        ...,
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"]
    )

    def scrape_with_retry(self, url, max_retries=3, min_length=200):
        """Scrape ``url`` with Firecrawl, retrying transient failures.

        Args:
            url: The URL to scrape.
            max_retries: Maximum number of scrape attempts before giving up.
            min_length: Minimum length of the stripped markdown for the
                content to be considered useful (generalized from the
                previously hard-coded 200).

        Returns:
            The markdown content as a string, or ``None`` if the URL is
            skipped, the scraped content is too short, or every attempt fails.
        """
        # These sites routinely block or break automated scraping; the
        # '.pdf' entry also filters direct PDF links. Bail out early.
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]
        if any(domain in url.lower() for domain in problematic_domains):
            logging.info(f"Skipping problematic URL: {url}")
            return None

        for attempt in range(max_retries):
            try:
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )
                # Single lookup instead of the previous double .get() call.
                content = response.get('markdown') if response else None
                # Very short pages are usually error/consent placeholders;
                # a successful call with thin content is NOT retried
                # (matching the original behavior).
                if content and len(content.strip()) > min_length:
                    return content
                return None
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # brief pause before the next attempt
        return None

    def run(self):
        """Scrape the first usable result for ``self.query``.

        Returns:
            A string with the content of the first URL that scrapes
            successfully, or an explanatory failure message.
        """
        logging.info(f"Scraping content for query: {self.query}")
        # Here you would typically have a list of URLs to scrape based on
        # the query. For this example, we assume a predefined list of URLs.
        search_results = ["http://example.com/article1", "http://example.com/article2"]  # Placeholder URLs
        if not search_results:
            return "No search results found."
        for url in search_results:
            logging.info(f"Attempting to scrape URL: {url}")
            content = self.scrape_with_retry(url)
            if content:
                logging.info(f"Successfully scraped content from {url}")
                return f"Content from {url}:\n\n{content}"
        return "Failed to scrape content from any of the search results"