Spaces:

build-small-hackathon
/

borderless

Running

App Files Files Community

borderless / apis /firecrawl.py

spagestic

Firecrawl scrape/crawl integration

f440f8f 29 days ago

Raw

History Blame Contribute Delete

5.49 kB

	"""Firecrawl API client for scraping and crawling official web pages."""

	from __future__ import annotations

	import os
	from functools import lru_cache
	from typing import Any

	from firecrawl import Firecrawl
	from firecrawl.v2.types import ScrapeOptions

	# Character caps handed to _truncate: SCRAPE_MARKDOWN_LIMIT is passed by
	# scrape_page, CRAWL_PAGE_MARKDOWN_LIMIT by _page_from_document (per page).
	SCRAPE_MARKDOWN_LIMIT = 8000
	CRAWL_PAGE_MARKDOWN_LIMIT = 4000


	def _truncate(text: str | None, limit: int) -> str:
	    """Return ``text`` capped at ``limit`` characters.

	    Falsy input (``None`` or ``""``) yields ``""``. Text longer than ``limit``
	    is cut to its first ``limit`` characters and a truncation marker is
	    appended on a new line; anything shorter is returned unchanged.
	    """
	    if text and len(text) > limit:
	        return text[:limit] + "\n… (truncated)"
	    return text or ""


	@lru_cache(maxsize=1)
	def _client() -> Firecrawl:
	    """Build the Firecrawl client from the ``FIRECRAWL_API_KEY`` environment variable.

	    The result is memoized by ``lru_cache``, so later calls return the same
	    client instance.

	    Raises:
	        RuntimeError: If ``FIRECRAWL_API_KEY`` is unset or empty.
	    """
	    key = os.environ.get("FIRECRAWL_API_KEY")
	    if key:
	        return Firecrawl(api_key=key)
	    raise RuntimeError("FIRECRAWL_API_KEY is not set")


	def _location(country: str | None) -> dict[str, Any] | None:
	    """Build the location payload for ``country``, or ``None`` when it is blank.

	    The code is stripped and upper-cased; the returned dict always starts with
	    ``["en"]`` as its language list.
	    """
	    code = (country or "").strip().upper()
	    return {"country": code, "languages": ["en"]} if code else None


	def _metadata_dict(metadata: Any) -> dict[str, Any]:
	    """Coerce document metadata into a plain dict.

	    Objects exposing ``model_dump`` are dumped through it, dicts are returned
	    as-is (same object), and everything else — including ``None`` — becomes
	    an empty dict.
	    """
	    # The model_dump check comes first so it wins over the dict check.
	    if metadata is not None and hasattr(metadata, "model_dump"):
	        return metadata.model_dump()
	    return metadata if isinstance(metadata, dict) else {}


	def _page_from_document(doc: Any) -> dict[str, Any]:
	    """Flatten a scraped document into this module's page dict.

	    Reads ``doc.metadata`` and ``doc.markdown`` (both optional attributes).
	    The page URL is the first non-empty value among the metadata keys
	    ``source_url``, ``url`` and ``og_url``; it is reported under both ``url``
	    and ``source_url``. Markdown is clipped to ``CRAWL_PAGE_MARKDOWN_LIMIT``.
	    Missing text fields become ``""``; ``status_code`` is passed through and
	    may be ``None``.
	    """
	    meta = _metadata_dict(getattr(doc, "metadata", None))
	    page_url = meta.get("source_url") or meta.get("url") or meta.get("og_url") or ""
	    body = _truncate(getattr(doc, "markdown", None), CRAWL_PAGE_MARKDOWN_LIMIT)
	    return {
	        "url": page_url,
	        "title": meta.get("title") or "",
	        "markdown": body,
	        "source_url": page_url,
	        "status_code": meta.get("status_code"),
	        "description": meta.get("description") or "",
	    }


	def scrape_page(
	    url: str,
	    *,
	    only_main_content: bool = True,
	    country: str | None = None,
	    languages: list[str] | None = None,
	    max_age: int | None = None,
	    timeout: int = 60,
	) -> dict[str, Any]:
	    """
	    Scrape a single URL and return normalized markdown content.

	    Args:
	        url: Full HTTPS URL to scrape; surrounding whitespace is stripped.
	        only_main_content: Drop navigation, footer, and ads when possible.
	        country: Optional ISO-2 country code for geo-targeted content.
	        languages: Optional preferred languages. Only used when ``country`` is
	            given, where they replace the default ``["en"]``.
	        max_age: Cache freshness window in milliseconds; omitted from the
	            request when ``None``.
	        timeout: Request timeout in seconds (sent to the client multiplied by 1000).

	    Returns:
	        The page dict built by ``_page_from_document`` with ``url`` set to the
	        requested URL and ``markdown`` clipped to ``SCRAPE_MARKDOWN_LIMIT``.
	        On failure, ``{"error": <message>, "url": <url>}``; no exception is
	        raised to the caller.
	    """
	    normalized_url = (url or "").strip()
	    if not normalized_url:
	        return {"error": "url is required", "url": url}

	    try:
	        location = _location(country)
	        if location and languages:
	            location["languages"] = languages

	        kwargs: dict[str, Any] = {
	            "formats": ["markdown"],
	            "only_main_content": only_main_content,
	            "timeout": timeout * 1000,
	        }
	        if location:
	            kwargs["location"] = location
	        if max_age is not None:
	            kwargs["max_age"] = max_age

	        doc = _client().scrape(normalized_url, **kwargs)
	        page = _page_from_document(doc)
	        page["url"] = normalized_url
	        # _page_from_document has already clipped the markdown to the smaller
	        # per-page crawl limit, so re-truncating its output could never reach
	        # SCRAPE_MARKDOWN_LIMIT. Truncate the document's raw markdown instead.
	        page["markdown"] = _truncate(getattr(doc, "markdown", None), SCRAPE_MARKDOWN_LIMIT)
	        return page
	    except Exception as exc:
	        # Deliberate catch-all: callers get an error payload instead of an exception.
	        return {"error": str(exc), "url": normalized_url}


	def crawl_site(
	    url: str,
	    *,
	    limit: int = 10,
	    max_depth: int = 2,
	    include_paths: list[str] | None = None,
	    exclude_paths: list[str] | None = None,
	    country: str | None = None,
	    poll_interval: int = 2,
	    timeout: int = 120,
	) -> dict[str, Any]:
	    """
	    Crawl a site from a starting URL and summarize every page returned.

	    Args:
	        url: Starting URL for the crawl; surrounding whitespace is stripped.
	        limit: Maximum number of pages to crawl.
	        max_depth: Maximum discovery depth, sent as ``max_discovery_depth``.
	        include_paths: Optional path filters to include; omitted when empty.
	        exclude_paths: Optional path filters to exclude; omitted when empty.
	        country: Optional ISO-2 country code for geo-targeted content.
	        poll_interval: Seconds between crawl status polls.
	        timeout: Maximum seconds to wait for crawl completion.

	    Returns:
	        A dict with ``start_url``, ``pages_found``, ``status`` and ``pages``
	        (one ``_page_from_document`` dict per crawled document). A blank URL
	        yields ``{"error": ..., "url": ...}``; any other failure yields
	        ``{"error": ..., "start_url": ...}``. No exception is raised to the caller.
	    """
	    start_url = (url or "").strip()
	    if not start_url:
	        return {"error": "url is required", "url": url}

	    try:
	        geo = _location(country)
	        options = ScrapeOptions(
	            formats=["markdown"],
	            only_main_content=True,
	            location=geo,
	        )

	        crawl_args: dict[str, Any] = dict(
	            limit=limit,
	            max_discovery_depth=max_depth,
	            scrape_options=options,
	            poll_interval=poll_interval,
	            timeout=timeout,
	        )
	        # Path filters are forwarded only when the caller supplied a non-empty list.
	        optional_filters = (
	            ("include_paths", include_paths),
	            ("exclude_paths", exclude_paths),
	        )
	        for name, paths in optional_filters:
	            if paths:
	                crawl_args[name] = paths

	        job = _client().crawl(start_url, **crawl_args)
	        pages = list(map(_page_from_document, job.data or []))
	        return {
	            "start_url": start_url,
	            "pages_found": len(pages),
	            "status": getattr(job, "status", None),
	            "pages": pages,
	        }
	    except Exception as exc:
	        return {"error": str(exc), "start_url": start_url}