| """Firecrawl API client for scraping and crawling official web pages."""
|
|
|
| from __future__ import annotations
|
|
|
| import os
|
| from functools import lru_cache
|
| from typing import Any
|
|
|
| from firecrawl import Firecrawl
|
| from firecrawl.v2.types import ScrapeOptions
|
|
|
# Character budget applied to the markdown of a single-page scrape result.
SCRAPE_MARKDOWN_LIMIT: int = 8_000
# Character budget applied to the markdown of each page in a crawl result.
CRAWL_PAGE_MARKDOWN_LIMIT: int = 4_000
|
|
|
|
|
| def _truncate(text: str | None, limit: int) -> str:
|
| if not text:
|
| return ""
|
| if len(text) <= limit:
|
| return text
|
| return text[:limit] + "\n… (truncated)"
|
|
|
|
|
| @lru_cache(maxsize=1)
|
| def _client() -> Firecrawl:
|
| api_key = os.environ.get("FIRECRAWL_API_KEY")
|
| if not api_key:
|
| raise RuntimeError("FIRECRAWL_API_KEY is not set")
|
| return Firecrawl(api_key=api_key)
|
|
|
|
|
| def _location(country: str | None) -> dict[str, Any] | None:
|
| if not country:
|
| return None
|
| normalized = country.strip().upper()
|
| if not normalized:
|
| return None
|
| return {"country": normalized, "languages": ["en"]}
|
|
|
|
|
| def _metadata_dict(metadata: Any) -> dict[str, Any]:
|
| if metadata is None:
|
| return {}
|
| if hasattr(metadata, "model_dump"):
|
| return metadata.model_dump()
|
| if isinstance(metadata, dict):
|
| return metadata
|
| return {}
|
|
|
|
|
def _page_from_document(doc: Any) -> dict[str, Any]:
    """Flatten a scraped document into a page summary dict.

    Reads the ``metadata`` and ``markdown`` attributes of *doc* (either may be
    missing). The returned dict has the keys ``url``, ``title``, ``markdown``,
    ``source_url``, ``status_code`` and ``description``; the markdown is capped
    at ``CRAWL_PAGE_MARKDOWN_LIMIT`` characters and absent text fields become
    empty strings.
    """
    meta = _metadata_dict(getattr(doc, "metadata", None))
    # The first non-empty URL field wins; with none present the URL is "".
    resolved_url = next(
        (
            value
            for key in ("source_url", "url", "og_url")
            if (value := meta.get(key))
        ),
        "",
    )
    body = _truncate(getattr(doc, "markdown", None), CRAWL_PAGE_MARKDOWN_LIMIT)
    return dict(
        url=resolved_url,
        title=meta.get("title") or "",
        markdown=body,
        source_url=resolved_url,
        status_code=meta.get("status_code"),
        description=meta.get("description") or "",
    )
|
|
|
|
|
def scrape_page(
    url: str,
    *,
    only_main_content: bool = True,
    country: str | None = None,
    languages: list[str] | None = None,
    max_age: int | None = None,
    timeout: int = 60,
) -> dict[str, Any]:
    """
    Scrape a single URL and return normalized markdown content.

    Args:
        url: Full HTTPS URL to scrape; surrounding whitespace is stripped.
        only_main_content: Drop navigation, footer, and ads when possible.
        country: Optional ISO-2 country code for geo-targeted content.
        languages: Optional preferred languages. They replace the default
            ``["en"]`` only when ``country`` is also given; without a
            country no location is sent at all.
        max_age: Cache freshness window in milliseconds; ``None`` uses SDK default.
        timeout: Request timeout in seconds (multiplied by 1000 before being
            handed to the SDK).

    Returns:
        On success, a page dict with ``url``, ``title``, ``markdown``,
        ``source_url``, ``status_code`` and ``description``; the markdown is
        capped at ``SCRAPE_MARKDOWN_LIMIT`` characters. On failure, a dict
        with ``error`` and ``url`` — exceptions are reported, not raised.
    """
    normalized_url = (url or "").strip()
    if not normalized_url:
        return {"error": "url is required", "url": url}

    try:
        location = _location(country)
        if location and languages:
            location["languages"] = languages

        kwargs: dict[str, Any] = {
            "formats": ["markdown"],
            "only_main_content": only_main_content,
            "timeout": timeout * 1000,
        }
        if location:
            kwargs["location"] = location
        if max_age is not None:
            kwargs["max_age"] = max_age

        doc = _client().scrape(normalized_url, **kwargs)
        page = _page_from_document(doc)
        # Report the URL the caller asked for, not whatever the metadata says.
        page["url"] = normalized_url
        # _page_from_document already cut the markdown to the smaller crawl
        # limit, so re-truncating its output could never yield more than
        # that. Truncate the raw document markdown instead so the scrape
        # limit actually applies.
        page["markdown"] = _truncate(
            getattr(doc, "markdown", None), SCRAPE_MARKDOWN_LIMIT
        )
        return page
    except Exception as exc:
        # Deliberate best-effort boundary: callers receive an error dict.
        return {"error": str(exc), "url": normalized_url}
|
|
|
|
|
def crawl_site(
    url: str,
    *,
    limit: int = 10,
    max_depth: int = 2,
    include_paths: list[str] | None = None,
    exclude_paths: list[str] | None = None,
    country: str | None = None,
    poll_interval: int = 2,
    timeout: int = 120,
) -> dict[str, Any]:
    """
    Crawl a site from a starting URL and summarize every page returned.

    Args:
        url: Starting URL for the crawl; surrounding whitespace is stripped.
        limit: Maximum number of pages to crawl.
        max_depth: Maximum discovery depth, sent as ``max_discovery_depth``.
        include_paths: Optional path filters, forwarded only when non-empty.
        exclude_paths: Optional path filters, forwarded only when non-empty.
        country: Optional ISO-2 country code for geo-targeted content.
        poll_interval: Seconds between crawl status polls.
        timeout: Maximum seconds to wait for crawl completion.

    Returns:
        On success, a dict with ``start_url``, ``pages_found``, ``status`` and
        ``pages``. A blank URL yields ``{"error": ..., "url": ...}``; any
        exception during the crawl yields ``{"error": ..., "start_url": ...}``.
    """
    start_url = (url or "").strip()
    if not start_url:
        return {"error": "url is required", "url": url}

    try:
        crawl_kwargs: dict[str, Any] = {
            "limit": limit,
            "max_discovery_depth": max_depth,
            "scrape_options": ScrapeOptions(
                formats=["markdown"],
                only_main_content=True,
                location=_location(country),
            ),
            "poll_interval": poll_interval,
            "timeout": timeout,
        }
        # Path filters are attached only when the caller supplied a non-empty list.
        path_filters = {
            "include_paths": include_paths,
            "exclude_paths": exclude_paths,
        }
        crawl_kwargs.update(
            {name: paths for name, paths in path_filters.items() if paths}
        )

        job = _client().crawl(start_url, **crawl_kwargs)
        summaries = list(map(_page_from_document, job.data or []))
        return {
            "start_url": start_url,
            "pages_found": len(summaries),
            "status": getattr(job, "status", None),
            "pages": summaries,
        }
    except Exception as exc:
        return {"error": str(exc), "start_url": start_url}
|
|
|