Spaces:
Running
Running
File size: 2,883 Bytes
3ae68d6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | import time
import requests
from typing import Optional
from urllib.parse import urlparse
from src.config import settings
def scrape_url(url: str, api_key: Optional[str] = None) -> str:
    """
    Scrape the target URL through the Firecrawl API and return its markdown.

    If no api_key is provided, it falls back to settings.firecrawl_api_key.
    If both are absent, the request is sent without an Authorization header
    (Firecrawl keyless mode).

    Args:
        url: Absolute http(s) URL of the page to scrape.
        api_key: Optional Firecrawl API key overriding the configured one.

    Returns:
        The non-empty markdown content reported by Firecrawl.

    Raises:
        ValueError: If the URL is not an http/https URL with a host, if
            Firecrawl answers 401, if it reports an unsuccessful scrape, or
            if it returns empty markdown.
        RuntimeError: If the rate limit (HTTP 429) persists through every
            attempt, if Firecrawl rejects the request with another 4xx
            status, or if the request keeps failing after all attempts.
    """
    # Security validation: enforce http/https to prevent internal file or
    # protocol attacks, and require a host so that malformed inputs such as
    # "http://" are rejected locally instead of costing an API call.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL protocol. Only HTTP and HTTPS schemes are supported.")
    if not parsed.hostname:
        raise ValueError("Invalid URL. A host name is required.")

    endpoint = "https://api.firecrawl.dev/v2/scrape"
    headers = {
        "Content-Type": "application/json",
    }

    # Use the passed key, or fall back to the environment configuration.
    key_to_use = api_key or settings.firecrawl_api_key
    if key_to_use:
        headers["Authorization"] = f"Bearer {key_to_use}"

    payload = {
        "url": url,
        "formats": ["markdown"],
    }

    max_retries = 3
    backoff = 1.0

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(endpoint, json=payload, headers=headers, timeout=30)

            # Handle rate limiting with exponential backoff.
            if response.status_code == 429:
                if is_last_attempt:
                    raise RuntimeError("Firecrawl rate limit exceeded. Max retries reached.")
                time.sleep(backoff)
                backoff *= 2
                continue

            if response.status_code == 401:
                raise ValueError("Firecrawl authentication failed. Please check your API key.")

            response.raise_for_status()
            data = response.json()

            if not data.get("success"):
                error_msg = data.get("error", "Unknown error occurred during Firecrawl scraping.")
                raise ValueError(f"Firecrawl scraping failed: {error_msg}")

            # "data" may be present but null; treat that like a missing payload.
            markdown_content = (data.get("data") or {}).get("markdown", "")
            if not markdown_content:
                raise ValueError("Firecrawl succeeded but returned empty markdown content.")

            return markdown_content
        except requests.exceptions.RequestException as e:
            # A 4xx answer means the request itself was rejected; repeating
            # it unchanged cannot succeed, so fail immediately.
            failed_response = getattr(e, "response", None)
            status = getattr(failed_response, "status_code", None)
            if status is not None and 400 <= status < 500:
                raise RuntimeError(
                    f"Firecrawl rejected the request with HTTP {status}: {str(e)}"
                ) from e

            if is_last_attempt:
                raise RuntimeError(
                    f"Network request to Firecrawl failed after {max_retries} attempts: {str(e)}"
                ) from e
            time.sleep(backoff)
            backoff *= 2

    # Defensive fallback: every loop path above returns, raises or retries.
    raise RuntimeError("Firecrawl scraping failed due to retry exhaustion.")
|