"""Firecrawl API client for scraping and crawling official web pages.""" from __future__ import annotations import os from functools import lru_cache from typing import Any from firecrawl import Firecrawl from firecrawl.v2.types import ScrapeOptions SCRAPE_MARKDOWN_LIMIT = 8000 CRAWL_PAGE_MARKDOWN_LIMIT = 4000 def _truncate(text: str | None, limit: int) -> str: if not text: return "" if len(text) <= limit: return text return text[:limit] + "\n… (truncated)" @lru_cache(maxsize=1) def _client() -> Firecrawl: api_key = os.environ.get("FIRECRAWL_API_KEY") if not api_key: raise RuntimeError("FIRECRAWL_API_KEY is not set") return Firecrawl(api_key=api_key) def _location(country: str | None) -> dict[str, Any] | None: if not country: return None normalized = country.strip().upper() if not normalized: return None return {"country": normalized, "languages": ["en"]} def _metadata_dict(metadata: Any) -> dict[str, Any]: if metadata is None: return {} if hasattr(metadata, "model_dump"): return metadata.model_dump() if isinstance(metadata, dict): return metadata return {} def _page_from_document(doc: Any) -> dict[str, Any]: metadata = _metadata_dict(getattr(doc, "metadata", None)) source_url = metadata.get("source_url") or metadata.get("url") or metadata.get("og_url") return { "url": source_url or "", "title": metadata.get("title") or "", "markdown": _truncate(getattr(doc, "markdown", None), CRAWL_PAGE_MARKDOWN_LIMIT), "source_url": source_url or "", "status_code": metadata.get("status_code"), "description": metadata.get("description") or "", } def scrape_page( url: str, *, only_main_content: bool = True, country: str | None = None, languages: list[str] | None = None, max_age: int | None = None, timeout: int = 60, ) -> dict[str, Any]: """ Scrape a single URL and return normalized markdown content. Args: url: Full HTTPS URL to scrape. only_main_content: Drop navigation, footer, and ads when possible. country: Optional ISO-2 country code for geo-targeted content. languages: Optional preferred languages; defaults from country or English. max_age: Cache freshness window in milliseconds; ``None`` uses SDK default. timeout: Request timeout in seconds. """ normalized_url = (url or "").strip() if not normalized_url: return {"error": "url is required", "url": url} try: location = _location(country) if location and languages: location["languages"] = languages kwargs: dict[str, Any] = { "formats": ["markdown"], "only_main_content": only_main_content, "timeout": timeout * 1000, } if location: kwargs["location"] = location if max_age is not None: kwargs["max_age"] = max_age doc = _client().scrape(normalized_url, **kwargs) page = _page_from_document(doc) page["url"] = normalized_url page["markdown"] = _truncate(page["markdown"], SCRAPE_MARKDOWN_LIMIT) return page except Exception as exc: return {"error": str(exc), "url": normalized_url} def crawl_site( url: str, *, limit: int = 10, max_depth: int = 2, include_paths: list[str] | None = None, exclude_paths: list[str] | None = None, country: str | None = None, poll_interval: int = 2, timeout: int = 120, ) -> dict[str, Any]: """ Crawl a website starting from a URL and return normalized page summaries. Args: url: Starting URL for the crawl. limit: Maximum number of pages to crawl. max_depth: Maximum discovery depth from the start URL. include_paths: Optional path prefixes to include. exclude_paths: Optional path prefixes to exclude. country: Optional ISO-2 country code for geo-targeted content. poll_interval: Seconds between crawl status polls. timeout: Maximum seconds to wait for crawl completion. """ normalized_url = (url or "").strip() if not normalized_url: return {"error": "url is required", "url": url} try: location = _location(country) scrape_options = ScrapeOptions( formats=["markdown"], only_main_content=True, location=location, ) kwargs: dict[str, Any] = { "limit": limit, "max_discovery_depth": max_depth, "scrape_options": scrape_options, "poll_interval": poll_interval, "timeout": timeout, } if include_paths: kwargs["include_paths"] = include_paths if exclude_paths: kwargs["exclude_paths"] = exclude_paths job = _client().crawl(normalized_url, **kwargs) pages = [_page_from_document(doc) for doc in (job.data or [])] return { "start_url": normalized_url, "pages_found": len(pages), "status": getattr(job, "status", None), "pages": pages, } except Exception as exc: return {"error": str(exc), "start_url": normalized_url}