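"""Web scraping tools backed by the Firecrawl API (v0).

Exposes two async helpers: scrape_url for a single page and crawl_website
for multi-page crawls. Both return a ToolResult and require a Firecrawl
API key to be available via the shared config.
"""
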
import asyncio
from typing import List, Optional
import aiohttp
from .utils import ToolResult
from ..config import config

FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v0"

async def scrape_url(
    url: str,
    extract_main_content: bool = True,
    include_html: bool = False
) -> ToolResult:
    """
    Scrape a URL using Firecrawl API.
    
    Args:
        url: URL to scrape
        extract_main_content: Whether to extract main content only
        include_html: Whether to include HTML in response
        
    Returns:
        ToolResult with scraped content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "url": url,
            "extractorOptions": {
                "mode": "markdown" if extract_main_content else "html"
            },
            "pageOptions": {
                "includeHtml": include_html,
                "onlyMainContent": extract_main_content
            }
        }
        
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{FIRECRAWL_BASE_URL}/scrape", json=payload, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    result = data.get('data', {})
                    
                    # Extract the fields we surface to callers
                    content = result.get('content', '')
                    html = result.get('html', '') if include_html else None
                    metadata = result.get('metadata', {})
                    
                    return ToolResult(
                        success=True,
                        data={
                            'url': url,
                            'content': content,
                            'html': html,
                            'title': metadata.get('title', ''),
                            'description': metadata.get('description', ''),
                            'keywords': metadata.get('keywords', []),
                            'content_length': len(content)
                        }
                    )
                else:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
                    
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Web scraping failed: {str(e)}"
        )

async def crawl_website(
    url: str,
    max_pages: int = 5,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> ToolResult:
    """
    Crawl a website using Firecrawl API.
    
    Args:
        url: Base URL to crawl
        max_pages: Maximum number of pages to crawl
        include_paths: List of paths to include
        exclude_paths: List of paths to exclude
        
    Returns:
        ToolResult with crawled content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "url": url,
            "crawlerOptions": {
                "limit": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or []
            },
            "pageOptions": {
                "onlyMainContent": True
            }
        }
        
        async with aiohttp.ClientSession() as session:
            # The v0 crawl endpoint is asynchronous: the POST returns a job ID,
            # and the crawled pages are retrieved by polling the status endpoint.
            async with session.post(f"{FIRECRAWL_BASE_URL}/crawl", json=payload, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
                job = await response.json()
            
            job_id = job.get('jobId')
            if not job_id:
                return ToolResult(
                    success=False,
                    error="Crawl job was not created: no jobId in response"
                )
            
            # Poll until the crawl job completes or fails
            while True:
                async with session.get(f"{FIRECRAWL_BASE_URL}/crawl/status/{job_id}", headers=headers) as status_response:
                    if status_response.status != 200:
                        error_text = await status_response.text()
                        return ToolResult(
                            success=False,
                            error=f"HTTP {status_response.status}: {error_text}"
                        )
                    status = await status_response.json()
                
                if status.get('status') == 'completed':
                    pages = status.get('data', []) or []
                    break
                if status.get('status') == 'failed':
                    return ToolResult(success=False, error="Crawl job failed")
                await asyncio.sleep(2)
            
            # Normalize the crawled pages; v0 documents carry the page URL
            # in metadata.sourceURL
            formatted_pages = []
            for page in pages:
                page_metadata = page.get('metadata', {})
                formatted_pages.append({
                    'url': page_metadata.get('sourceURL', page.get('url', '')),
                    'content': page.get('content', ''),
                    'title': page_metadata.get('title', ''),
                    'description': page_metadata.get('description', '')
                })
            
            return ToolResult(
                success=True,
                data={
                    'base_url': url,
                    'pages_crawled': len(formatted_pages),
                    'pages': formatted_pages,
                    'total_content_length': sum(len(p['content']) for p in formatted_pages)
                }
            )
                    
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Website crawling failed: {str(e)}"
        )
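
# Minimal usage sketch, assuming a valid Firecrawl key is configured and the
# module is run inside its package (the relative imports above mean it cannot
# be executed as a standalone script; use `python -m <package>.<module>`).
# _demo is an illustrative helper, not part of the module's public API.
async def _demo() -> None:
    result = await scrape_url("https://example.com")
    if result.success:
        print(f"Scraped {result.data['content_length']} chars from {result.data['url']}")
    else:
        print(f"Scrape failed: {result.error}")


if __name__ == "__main__":
    asyncio.run(_demo())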