Yash030's picture
deploy: Hugging Face Space clean release
3ae68d6
Raw
History Blame Contribute Delete
2.88 kB
import time
import requests
from typing import Optional
from urllib.parse import urlparse
from src.config import settings
def scrape_url(url: str, api_key: Optional[str] = None) -> str:
    """Scrape ``url`` through the Firecrawl API and return its markdown content.

    The API key is taken from ``api_key`` when given, otherwise from
    ``settings.firecrawl_api_key``. If both are empty, no ``Authorization``
    header is sent (Firecrawl keyless mode).

    The request is attempted up to three times. HTTP 429 responses and
    ``requests`` exceptions are retried with exponential backoff (1s, then 2s).

    Args:
        url: Absolute HTTP or HTTPS URL of the page to scrape.
        api_key: Optional Firecrawl API key that overrides the configured one.

    Returns:
        The non-empty markdown string found under ``data.markdown`` in the
        Firecrawl response.

    Raises:
        ValueError: If the URL scheme is not http/https, the URL has no
            hostname, the API answers HTTP 401, the response is not flagged
            as successful, or the returned markdown is empty.
        RuntimeError: If HTTP 429 persists through the last attempt, or the
            request still fails with a ``requests`` exception on the last
            attempt.
    """
    # Security validation: enforce http/https to prevent internal file or
    # protocol attacks, and reject URLs that carry no host at all (for example
    # "http://" or "https:///path") before anything is sent over the network.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL protocol. Only HTTP and HTTPS schemes are supported.")
    if not parsed.hostname:
        raise ValueError("Invalid URL. A hostname is required.")

    endpoint = "https://api.firecrawl.dev/v2/scrape"
    headers = {"Content-Type": "application/json"}

    # Use the passed key, or fall back to the environment configuration.
    key_to_use = api_key or settings.firecrawl_api_key
    if key_to_use:
        headers["Authorization"] = f"Bearer {key_to_use}"

    payload = {"url": url, "formats": ["markdown"]}

    max_retries = 3
    backoff = 1.0

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(endpoint, json=payload, headers=headers, timeout=30)

            # Handle rate limiting with backoff.
            if response.status_code == 429:
                if is_last_attempt:
                    raise RuntimeError("Firecrawl rate limit exceeded. Max retries reached.")
                time.sleep(backoff)
                backoff *= 2
                continue

            if response.status_code == 401:
                raise ValueError("Firecrawl authentication failed. Please check your API key.")

            # raise_for_status() raises requests.exceptions.HTTPError, which is
            # a RequestException, so other HTTP error statuses are retried by
            # the handler below as well.
            response.raise_for_status()
            data = response.json()

            if not data.get("success"):
                error_msg = data.get("error", "Unknown error occurred during Firecrawl scraping.")
                raise ValueError(f"Firecrawl scraping failed: {error_msg}")

            # "data" may be present but null; treat that like a missing payload
            # instead of failing with AttributeError.
            markdown_content = (data.get("data") or {}).get("markdown", "")
            if not markdown_content:
                raise ValueError("Firecrawl succeeded but returned empty markdown content.")

            return markdown_content
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise RuntimeError(
                    f"Network request to Firecrawl failed after {max_retries} attempts: {str(e)}"
                ) from e
            time.sleep(backoff)
            backoff *= 2

    # Defensive fallback: every path through the final attempt returns or raises.
    raise RuntimeError("Firecrawl scraping failed due to retry exhaustion.")