File size: 2,696 Bytes
eceb45a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from agency_swarm.tools import BaseTool
from pydantic import Field
import logging
import os
import time

try:
    from firecrawl import FirecrawlApp
except ImportError:
    raise ImportError(
        "Required packages not found. Please install them using:\n"
        "pip install firecrawl"
    )

# Initialize Firecrawl.
# SECURITY: a live API key was previously hard-coded here. Prefer the
# FIRECRAWL_API_KEY environment variable; the literal fallback is kept only
# for backward compatibility and should be removed (and the key rotated)
# once deployments set the environment variable.
FIRECRAWL_API_KEY = os.environ.get(
    "FIRECRAWL_API_KEY",
    "fc-5fadfeae30314d4ea8a3d9afaa75c493",
)
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

class SearchAndScrape(BaseTool):
    """
    Scrape web content via Firecrawl for a provided search query.

    Walks a list of candidate URLs, skips domains known to be problematic
    for scraping, and returns the first non-trivial markdown body found.
    """

    # The search query driving the scrape.
    query: str = Field(
        ..., 
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"]
    )

    def scrape_with_retry(self, url, max_retries=3):
        """Scrape *url* with Firecrawl, retrying up to *max_retries* times.

        Returns the markdown content (str) when it is non-trivial
        (more than 200 characters after stripping), otherwise ``None``.
        URLs on known problematic domains (social sites, PDFs, bot-blocking
        hosts) are skipped outright without a network call.
        """
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]

        if any(domain in url.lower() for domain in problematic_domains):
            # Lazy %-style args so the message is only formatted if emitted.
            logging.info("Skipping problematic URL: %s", url)
            return None

        for attempt in range(max_retries):
            try:
                # Keep the try body minimal: only the network call can
                # reasonably raise here.
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )
            except Exception as e:
                logging.error("Attempt %d failed for %s: %s", attempt + 1, url, e)
                if attempt < max_retries - 1:
                    time.sleep(2)  # brief back-off before the next attempt
                continue

            # Fetch the markdown once instead of calling .get() twice.
            content = response.get('markdown') if response else None
            if content and len(content.strip()) > 200:
                return content
            # Short/empty content counts as a miss; the same URL is not
            # retried (matches original behavior: only exceptions retry).
            return None
        return None

    def run(self):
        """Entry point: return the first usable scrape for ``self.query``."""
        logging.info("Scraping content for query: %s", self.query)
        # Here you would typically have a list of URLs to scrape based on the query.
        # For this example, we will assume a predefined list of URLs.
        search_results = ["http://example.com/article1", "http://example.com/article2"]  # Placeholder URLs

        if not search_results:
            return "No search results found."

        for url in search_results:
            logging.info("Attempting to scrape URL: %s", url)
            content = self.scrape_with_retry(url)
            if content:
                logging.info("Successfully scraped content from %s", url)
                return f"Content from {url}:\n\n{content}"

        return "Failed to scrape content from any of the search results"