File size: 2,883 Bytes
3ae68d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import time
import requests
from typing import Optional
from urllib.parse import urlparse
from src.config import settings

def scrape_url(url: str, api_key: Optional[str] = None) -> str:
    """Scrape ``url`` through the Firecrawl API and return its markdown content.

    The API key is taken from ``api_key`` when given, otherwise from
    ``settings.firecrawl_api_key``. If both are absent, no ``Authorization``
    header is sent (Firecrawl keyless mode).

    Transient failures (network errors, HTTP 429 and HTTP 5xx responses) are
    retried up to three times with exponential backoff (1s, then 2s). Other
    HTTP error responses are not retried, since repeating the same request
    cannot succeed.

    Args:
        url: Target page to scrape. Must be an http(s) URL with a host.
        api_key: Optional Firecrawl API key overriding the configured one.

    Returns:
        The non-empty markdown string returned by Firecrawl.

    Raises:
        ValueError: If the URL is not a valid http(s) URL, authentication
            fails (HTTP 401), Firecrawl reports ``success`` as falsy, or the
            returned markdown is empty.
        RuntimeError: If the request still fails after all retries, the rate
            limit persists, Firecrawl answers with a non-retryable HTTP error,
            or the response body is not valid JSON.
    """
    # Security validation: enforce http/https to prevent internal file or
    # protocol attacks, and reject host-less URLs before spending an API call.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL protocol. Only HTTP and HTTPS schemes are supported.")
    if not parsed.hostname:
        raise ValueError("Invalid URL. A host name is required.")

    endpoint = "https://api.firecrawl.dev/v2/scrape"
    headers = {"Content-Type": "application/json"}

    # Use the passed key, or fall back to the environment configuration.
    key_to_use = api_key or settings.firecrawl_api_key
    if key_to_use:
        headers["Authorization"] = f"Bearer {key_to_use}"

    payload = {
        "url": url,
        "formats": ["markdown"],
    }

    max_retries = 3
    backoff = 1.0

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        # Only the network call lives in the try block, so that response
        # handling errors below are never mistaken for transport failures.
        try:
            response = requests.post(endpoint, json=payload, headers=headers, timeout=30)
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise RuntimeError(
                    f"Network request to Firecrawl failed after {max_retries} attempts: {str(e)}"
                ) from e
            time.sleep(backoff)
            backoff *= 2
            continue

        status = response.status_code

        # A bad key will not become valid on retry: fail immediately.
        if status == 401:
            raise ValueError("Firecrawl authentication failed. Please check your API key.")

        # Rate limiting and server-side errors are transient: back off and retry.
        if status == 429 or status >= 500:
            if is_last_attempt:
                if status == 429:
                    raise RuntimeError("Firecrawl rate limit exceeded. Max retries reached.")
                raise RuntimeError(
                    f"Network request to Firecrawl failed after {max_retries} attempts: "
                    f"HTTP {status}"
                )
            time.sleep(backoff)
            backoff *= 2
            continue

        # Any remaining 4xx is a client-side problem; retrying the identical
        # request would only waste time and API calls.
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            raise RuntimeError(f"Firecrawl request failed: {e}") from e

        # requests raises a ValueError subclass when the body is not JSON.
        try:
            data = response.json()
        except ValueError as e:
            raise RuntimeError("Firecrawl returned a response that is not valid JSON.") from e

        if not data.get("success"):
            error_msg = data.get("error", "Unknown error occurred during Firecrawl scraping.")
            raise ValueError(f"Firecrawl scraping failed: {error_msg}")

        # "data" may be present but null; treat that the same as missing.
        markdown_content = (data.get("data") or {}).get("markdown", "")
        if not markdown_content:
            raise ValueError("Firecrawl succeeded but returned empty markdown content.")

        return markdown_content

    # Defensive: every loop iteration returns, raises or retries, so this is
    # only reached if the retry bookkeeping above is changed incorrectly.
    raise RuntimeError("Firecrawl scraping failed due to retry exhaustion.")