borderless / apis /firecrawl.py
spagestic's picture
Firecrawl scrape/crawl integration
f440f8f
Raw
History Blame Contribute Delete
5.49 kB
"""Firecrawl API client for scraping and crawling official web pages."""
from __future__ import annotations
import os
from functools import lru_cache
from typing import Any
from firecrawl import Firecrawl
from firecrawl.v2.types import ScrapeOptions
# Character budget that scrape_page passes to _truncate for its markdown.
SCRAPE_MARKDOWN_LIMIT = 8_000
# Character budget that _page_from_document passes to _truncate for each page.
CRAWL_PAGE_MARKDOWN_LIMIT = 4_000
def _truncate(text: str | None, limit: int) -> str:
if not text:
return ""
if len(text) <= limit:
return text
return text[:limit] + "\n… (truncated)"
@lru_cache(maxsize=1)
def _client() -> Firecrawl:
    """Build the Firecrawl client from the ``FIRECRAWL_API_KEY`` variable.

    The result is memoized by ``lru_cache``, so later calls reuse the same
    client object.

    Raises:
        RuntimeError: If ``FIRECRAWL_API_KEY`` is unset or empty.
    """
    key = os.environ.get("FIRECRAWL_API_KEY")
    if key:
        return Firecrawl(api_key=key)
    raise RuntimeError("FIRECRAWL_API_KEY is not set")
def _location(country: str | None) -> dict[str, Any] | None:
if not country:
return None
normalized = country.strip().upper()
if not normalized:
return None
return {"country": normalized, "languages": ["en"]}
def _metadata_dict(metadata: Any) -> dict[str, Any]:
if metadata is None:
return {}
if hasattr(metadata, "model_dump"):
return metadata.model_dump()
if isinstance(metadata, dict):
return metadata
return {}
def _page_from_document(doc: Any) -> dict[str, Any]:
    """Flatten a Firecrawl document into the page dict used by this module.

    Args:
        doc: Object read through its ``metadata`` and ``markdown``
            attributes; either attribute may be absent.

    Returns:
        A dict with ``url``, ``title``, ``markdown``, ``source_url``,
        ``status_code`` and ``description``. The URL is the first non-empty
        value among the ``source_url``, ``url`` and ``og_url`` metadata
        entries; markdown is truncated to ``CRAWL_PAGE_MARKDOWN_LIMIT``.
    """
    meta = _metadata_dict(getattr(doc, "metadata", None))
    origin = meta.get("source_url") or meta.get("url") or meta.get("og_url") or ""
    body = _truncate(getattr(doc, "markdown", None), CRAWL_PAGE_MARKDOWN_LIMIT)
    return dict(
        url=origin,
        title=meta.get("title") or "",
        markdown=body,
        source_url=origin,
        status_code=meta.get("status_code"),
        description=meta.get("description") or "",
    )
def scrape_page(
url: str,
*,
only_main_content: bool = True,
country: str | None = None,
languages: list[str] | None = None,
max_age: int | None = None,
timeout: int = 60,
) -> dict[str, Any]:
"""
Scrape a single URL and return normalized markdown content.
Args:
url: Full HTTPS URL to scrape.
only_main_content: Drop navigation, footer, and ads when possible.
country: Optional ISO-2 country code for geo-targeted content.
languages: Optional preferred languages; defaults from country or English.
max_age: Cache freshness window in milliseconds; ``None`` uses SDK default.
timeout: Request timeout in seconds.
"""
normalized_url = (url or "").strip()
if not normalized_url:
return {"error": "url is required", "url": url}
try:
location = _location(country)
if location and languages:
location["languages"] = languages
kwargs: dict[str, Any] = {
"formats": ["markdown"],
"only_main_content": only_main_content,
"timeout": timeout * 1000,
}
if location:
kwargs["location"] = location
if max_age is not None:
kwargs["max_age"] = max_age
doc = _client().scrape(normalized_url, **kwargs)
page = _page_from_document(doc)
page["url"] = normalized_url
page["markdown"] = _truncate(page["markdown"], SCRAPE_MARKDOWN_LIMIT)
return page
except Exception as exc:
return {"error": str(exc), "url": normalized_url}
def crawl_site(
url: str,
*,
limit: int = 10,
max_depth: int = 2,
include_paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
country: str | None = None,
poll_interval: int = 2,
timeout: int = 120,
) -> dict[str, Any]:
"""
Crawl a website starting from a URL and return normalized page summaries.
Args:
url: Starting URL for the crawl.
limit: Maximum number of pages to crawl.
max_depth: Maximum discovery depth from the start URL.
include_paths: Optional path prefixes to include.
exclude_paths: Optional path prefixes to exclude.
country: Optional ISO-2 country code for geo-targeted content.
poll_interval: Seconds between crawl status polls.
timeout: Maximum seconds to wait for crawl completion.
"""
normalized_url = (url or "").strip()
if not normalized_url:
return {"error": "url is required", "url": url}
try:
location = _location(country)
scrape_options = ScrapeOptions(
formats=["markdown"],
only_main_content=True,
location=location,
)
kwargs: dict[str, Any] = {
"limit": limit,
"max_discovery_depth": max_depth,
"scrape_options": scrape_options,
"poll_interval": poll_interval,
"timeout": timeout,
}
if include_paths:
kwargs["include_paths"] = include_paths
if exclude_paths:
kwargs["exclude_paths"] = exclude_paths
job = _client().crawl(normalized_url, **kwargs)
pages = [_page_from_document(doc) for doc in (job.data or [])]
return {
"start_url": normalized_url,
"pages_found": len(pages),
"status": getattr(job, "status", None),
"pages": pages,
}
except Exception as exc:
return {"error": str(exc), "start_url": normalized_url}