from __future__ import annotations import logging import time from typing import TYPE_CHECKING, Any import requests from langchain_core.documents import Document if TYPE_CHECKING: from open_webui.retrieval.web.main import SearchResult log = logging.getLogger(__name__) DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev' FIRECRAWL_RETRY_STATUS_CODES = {429, 500, 502, 503, 504} FIRECRAWL_MAX_RETRIES = 2 def build_firecrawl_url(base_url: str | None, path: str) -> str: base_url = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/') path = path.lstrip('/') if base_url.endswith('/v2'): return f'{base_url}/{path}' return f'{base_url}/v2/{path}' def build_firecrawl_headers(api_key: str | None) -> dict[str, str]: return { 'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key or ""}', } def get_firecrawl_timeout_seconds(timeout: Any) -> float | None: if timeout in (None, ''): return None try: timeout = float(timeout) except (TypeError, ValueError): return None return timeout if timeout > 0 else None def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None: timeout_seconds = get_firecrawl_timeout_seconds(timeout) if timeout_seconds is None: return None # Firecrawl v2 expects scrape timeouts in milliseconds. return min(300000, max(1000, int(timeout_seconds * 1000))) def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float: # Keep the local HTTP timeout slightly above Firecrawl's scrape timeout. return (get_firecrawl_timeout_seconds(timeout) or fallback) + 10 def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float: retry_after = headers.get('Retry-After') if headers else None if retry_after: try: return min(10.0, max(0.0, float(retry_after))) except (TypeError, ValueError): pass return min(8.0, float(2**attempt)) def request_firecrawl_json( method: str, url: str, *, headers: dict[str, str], json: dict[str, Any] | None = None, timeout: float | None = None, verify: bool = True, ) -> dict[str, Any]: last_error = None for attempt in range(FIRECRAWL_MAX_RETRIES + 1): try: response = requests.request( method, url, headers=headers, json=json, timeout=timeout, verify=verify, ) if response.status_code in FIRECRAWL_RETRY_STATUS_CODES and attempt < FIRECRAWL_MAX_RETRIES: delay = get_firecrawl_retry_delay(response.headers, attempt) log.warning( 'Firecrawl %s %s returned HTTP %s; retrying in %.1fs', method, url, response.status_code, delay, ) time.sleep(delay) continue response.raise_for_status() return response.json() except (requests.ConnectionError, requests.Timeout) as e: last_error = e if attempt >= FIRECRAWL_MAX_RETRIES: break delay = get_firecrawl_retry_delay(None, attempt) log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, delay, e) time.sleep(delay) if last_error: raise last_error raise RuntimeError(f'Firecrawl {method} {url} failed without a response') def get_firecrawl_result_url(result: dict[str, Any]) -> str: metadata = result.get('metadata') or {} return ( result.get('url') or result.get('link') or metadata.get('url') or metadata.get('sourceURL') or metadata.get('source_url') or '' ) def scrape_firecrawl_url( firecrawl_url: str, firecrawl_api_key: str, url: str, *, verify_ssl: bool = True, timeout: Any = None, params: dict[str, Any] | None = None, ) -> Document | None: payload = { 'url': url, 'formats': ['markdown'], 'skipTlsVerification': not verify_ssl, 'removeBase64Images': True, **(params or {}), } scrape_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout) if scrape_timeout_ms is not None: payload['timeout'] = scrape_timeout_ms response = request_firecrawl_json( 'POST', build_firecrawl_url(firecrawl_url, 'scrape'), headers=build_firecrawl_headers(firecrawl_api_key), json=payload, timeout=get_firecrawl_client_timeout_seconds(timeout), verify=verify_ssl, ) data = response.get('data') or {} content = data.get('markdown') or '' if not isinstance(content, str) or not content.strip(): return None metadata = data.get('metadata') or {} document_metadata = {'source': get_firecrawl_result_url(data) or url} if metadata.get('title'): document_metadata['title'] = metadata['title'] if metadata.get('description'): document_metadata['description'] = metadata['description'] return Document(page_content=content, metadata=document_metadata) def search_firecrawl( firecrawl_url: str, firecrawl_api_key: str, query: str, count: int, filter_list: list[str] | None = None, ) -> list[SearchResult]: try: response = request_firecrawl_json( 'POST', build_firecrawl_url(firecrawl_url, 'search'), headers=build_firecrawl_headers(firecrawl_api_key), json={ 'query': query, 'limit': count, 'timeout': count * 3000, 'ignoreInvalidURLs': True, }, timeout=count * 3 + 10, ) data = response.get('data') or {} results = data.get('web') or [] if filter_list: from open_webui.retrieval.web.main import get_filtered_results results = get_filtered_results(results, filter_list) from open_webui.retrieval.web.main import SearchResult search_results = [] for result in results[:count]: url = get_firecrawl_result_url(result) if not url: continue metadata = result.get('metadata') or {} search_results.append( SearchResult( link=url, title=result.get('title') or metadata.get('title'), snippet=result.get('description') or result.get('snippet') or metadata.get('description'), ) ) log.info(f'FireCrawl search results: {search_results}') return search_results except Exception as e: log.error(f'Error in FireCrawl search: {e}') return []