from typing import Optional, List

import aiohttp

from .utils import ToolResult
from ..config import config

FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v0"


async def scrape_url(
    url: str,
    extract_main_content: bool = True,
    include_html: bool = False
) -> ToolResult:
    """
    Scrape a URL using the Firecrawl API.

    Args:
        url: URL to scrape
        extract_main_content: Whether to extract main content only
        include_html: Whether to include HTML in the response

    Returns:
        ToolResult with scraped content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )

    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "extractorOptions": {
                "mode": "markdown" if extract_main_content else "html"
            },
            "pageOptions": {
                "includeHtml": include_html,
                "onlyMainContent": extract_main_content
            }
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{FIRECRAWL_BASE_URL}/scrape", json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Extract relevant data from the response envelope
                    content = data.get('data', {}).get('content', '')
                    html = data.get('data', {}).get('html', '') if include_html else None
                    metadata = data.get('data', {}).get('metadata', {})
                    return ToolResult(
                        success=True,
                        data={
                            'url': url,
                            'content': content,
                            'html': html,
                            'title': metadata.get('title', ''),
                            'description': metadata.get('description', ''),
                            'keywords': metadata.get('keywords', []),
                            'content_length': len(content)
                        }
                    )
                else:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Web scraping failed: {str(e)}"
        )
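
# Usage sketch (illustrative, not part of the original code): scrape_url is a
# coroutine, so it must be driven by an event loop. The package path below is
# a hypothetical assumption; adjust it to wherever this module actually lives.
#
#     import asyncio
#     from mypackage.tools.web_scrape import scrape_url  # hypothetical path
#
#     result = asyncio.run(scrape_url("https://example.com"))
#     print(result.data['title'] if result.success else result.error)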


async def crawl_website(
    url: str,
    max_pages: int = 5,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> ToolResult:
    """
    Crawl a website using the Firecrawl API.

    Args:
        url: Base URL to crawl
        max_pages: Maximum number of pages to crawl
        include_paths: List of paths to include
        exclude_paths: List of paths to exclude

    Returns:
        ToolResult with crawled content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )

    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "crawlerOptions": {
                "limit": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or []
            },
            "pageOptions": {
                "onlyMainContent": True
            }
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{FIRECRAWL_BASE_URL}/crawl", json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    # Extract crawled pages
                    pages = data.get('data', [])
                    formatted_pages = []
                    for page in pages:
                        formatted_page = {
                            'url': page.get('url', ''),
                            'content': page.get('content', ''),
                            'title': page.get('metadata', {}).get('title', ''),
                            'description': page.get('metadata', {}).get('description', '')
                        }
                        formatted_pages.append(formatted_page)
                    return ToolResult(
                        success=True,
                        data={
                            'base_url': url,
                            'pages_crawled': len(formatted_pages),
                            'pages': formatted_pages,
                            'total_content_length': sum(len(p['content']) for p in formatted_pages)
                        }
                    )
                else:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Website crawling failed: {str(e)}"
        )
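

# Minimal demo (an addition, not in the original file). Because this module
# uses relative imports, run it as part of its package, e.g.
# `python -m <package>.web_scrape`; the target URL here is illustrative.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Scrape one page, then crawl a few pages from the same site.
        scraped = await scrape_url("https://example.com")
        if scraped.success:
            print(f"Scraped {scraped.data['content_length']} chars from {scraped.data['url']}")
        else:
            print(f"Scrape failed: {scraped.error}")

        crawled = await crawl_website("https://example.com", max_pages=3)
        if crawled.success:
            for page in crawled.data['pages']:
                print(f"{page['url']}: {page['title']}")
        else:
            print(f"Crawl failed: {crawled.error}")

    asyncio.run(_demo())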