Spaces:
Paused
Paused
| { | |
| "openapi": "3.0.0", | |
| "info": { | |
| "title": "Firecrawl API", | |
| "version": "0.0.0", | |
| "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.", | |
| "contact": { | |
| "name": "Firecrawl Support", | |
| "url": "https://firecrawl.dev/support", | |
| "email": "support@firecrawl.dev" | |
| } | |
| }, | |
| "servers": [ | |
| { | |
| "url": "https://api.firecrawl.dev/v0" | |
| } | |
| ], | |
| "paths": { | |
| "/scrape": { | |
| "post": { | |
| "summary": "Scrape a single URL and optionally extract information using an LLM", | |
| "operationId": "scrapeAndExtractFromUrl", | |
| "tags": ["Scraping"], | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ], | |
| "requestBody": { | |
| "required": true, | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "url": { | |
| "type": "string", | |
| "format": "uri", | |
| "description": "The URL to scrape" | |
| }, | |
| "pageOptions": { | |
| "type": "object", | |
| "properties": { | |
| "headers": { | |
| "type": "object", | |
| "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." | |
| }, | |
| "includeHtml": { | |
| "type": "boolean", | |
| "description": "Include the HTML version of the content on page. Will output a html key in the response.", | |
| "default": false | |
| }, | |
| "includeRawHtml": { | |
| "type": "boolean", | |
| "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", | |
| "default": false | |
| }, | |
| "onlyIncludeTags": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "Only include tags, classes and ids from the page in the final output. Provide one selector per array item. Example: ['script', '.ad', '#footer']" | |
| }, | |
| "onlyMainContent": { | |
| "type": "boolean", | |
| "description": "Only return the main content of the page excluding headers, navs, footers, etc.", | |
| "default": false | |
| }, | |
| "removeTags": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "Tags, classes and ids to remove from the page. Provide one selector per array item. Example: ['script', '.ad', '#footer']" | |
| }, | |
| "replaceAllPathsWithAbsolutePaths": { | |
| "type": "boolean", | |
| "description": "Replace all relative paths with absolute paths for images and links", | |
| "default": false | |
| }, | |
| "screenshot": { | |
| "type": "boolean", | |
| "description": "Include a screenshot of the top of the page that you are scraping.", | |
| "default": false | |
| }, | |
| "fullPageScreenshot": { | |
| "type": "boolean", | |
| "description": "Include a full page screenshot of the page that you are scraping.", | |
| "default": false | |
| }, | |
| "waitFor": { | |
| "type": "integer", | |
| "description": "Number of milliseconds to wait for the page to load before fetching its content", | |
| "default": 0 | |
| } | |
| } | |
| }, | |
| "extractorOptions": { | |
| "type": "object", | |
| "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", | |
| "default": {}, | |
| "properties": { | |
| "mode": { | |
| "type": "string", | |
| "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], | |
| "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." | |
| }, | |
| "extractionPrompt": { | |
| "type": "string", | |
| "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." | |
| }, | |
| "extractionSchema": { | |
| "type": "object", | |
| "additionalProperties": true, | |
| "description": "The schema for the data to be extracted, required only for LLM extraction modes.", | |
| "required": [ | |
| "company_mission", | |
| "supports_sso", | |
| "is_open_source" | |
| ] | |
| } | |
| } | |
| }, | |
| "timeout": { | |
| "type": "integer", | |
| "description": "Timeout in milliseconds for the request", | |
| "default": 30000 | |
| } | |
| }, | |
| "required": ["url"] | |
| } | |
| } | |
| } | |
| }, | |
| "responses": { | |
| "200": { | |
| "description": "Successful response", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "$ref": "#/components/schemas/ScrapeResponse" | |
| } | |
| } | |
| } | |
| }, | |
| "402": { | |
| "description": "Payment required", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Payment required to access this resource." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "429": { | |
| "description": "Too many requests", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Request rate limit exceeded. Please wait and try again later." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "500": { | |
| "description": "Server error", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "An unexpected error occurred on the server." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "/crawl": { | |
| "post": { | |
| "summary": "Crawl multiple URLs based on options", | |
| "operationId": "crawlUrls", | |
| "tags": ["Crawling"], | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ], | |
| "requestBody": { | |
| "required": true, | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "url": { | |
| "type": "string", | |
| "format": "uri", | |
| "description": "The base URL to start crawling from" | |
| }, | |
| "crawlerOptions": { | |
| "type": "object", | |
| "properties": { | |
| "includes": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "URL patterns to include" | |
| }, | |
| "excludes": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "URL patterns to exclude" | |
| }, | |
| "generateImgAltText": { | |
| "type": "boolean", | |
| "description": "Generate alt text for images using LLMs (must have a paid plan)", | |
| "default": false | |
| }, | |
| "returnOnlyUrls": { | |
| "type": "boolean", | |
| "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", | |
| "default": false | |
| }, | |
| "maxDepth": { | |
| "type": "integer", | |
| "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern." | |
| }, | |
| "mode": { | |
| "type": "string", | |
| "enum": ["default", "fast"], | |
| "description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.", | |
| "default": "default" | |
| }, | |
| "ignoreSitemap": { | |
| "type": "boolean", | |
| "description": "Ignore the website sitemap when crawling", | |
| "default": false | |
| }, | |
| "limit": { | |
| "type": "integer", | |
| "description": "Maximum number of pages to crawl", | |
| "default": 10000 | |
| }, | |
| "allowBackwardCrawling": { | |
| "type": "boolean", | |
| "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", | |
| "default": false | |
| }, | |
| "allowExternalContentLinks": { | |
| "type": "boolean", | |
| "description": "Allows the crawler to follow links to external websites.", | |
| "default": false | |
| } | |
| } | |
| }, | |
| "pageOptions": { | |
| "type": "object", | |
| "properties": { | |
| "headers": { | |
| "type": "object", | |
| "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." | |
| }, | |
| "includeHtml": { | |
| "type": "boolean", | |
| "description": "Include the HTML version of the content on page. Will output a html key in the response.", | |
| "default": false | |
| }, | |
| "includeRawHtml": { | |
| "type": "boolean", | |
| "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", | |
| "default": false | |
| }, | |
| "onlyIncludeTags": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "Only include tags, classes and ids from the page in the final output. Provide one selector per array item. Example: ['script', '.ad', '#footer']" | |
| }, | |
| "onlyMainContent": { | |
| "type": "boolean", | |
| "description": "Only return the main content of the page excluding headers, navs, footers, etc.", | |
| "default": false | |
| }, | |
| "removeTags": { | |
| "type": "array", | |
| "items": { | |
| "type": "string" | |
| }, | |
| "description": "Tags, classes and ids to remove from the page. Provide one selector per array item. Example: ['script', '.ad', '#footer']" | |
| }, | |
| "replaceAllPathsWithAbsolutePaths": { | |
| "type": "boolean", | |
| "description": "Replace all relative paths with absolute paths for images and links", | |
| "default": false | |
| }, | |
| "screenshot": { | |
| "type": "boolean", | |
| "description": "Include a screenshot of the top of the page that you are scraping.", | |
| "default": false | |
| }, | |
| "fullPageScreenshot": { | |
| "type": "boolean", | |
| "description": "Include a full page screenshot of the page that you are scraping.", | |
| "default": false | |
| }, | |
| "waitFor": { | |
| "type": "integer", | |
| "description": "Number of milliseconds to wait for the page to load before fetching its content", | |
| "default": 0 | |
| } | |
| } | |
| } | |
| }, | |
| "required": ["url"] | |
| } | |
| } | |
| } | |
| }, | |
| "responses": { | |
| "200": { | |
| "description": "Successful response", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "$ref": "#/components/schemas/CrawlResponse" | |
| } | |
| } | |
| } | |
| }, | |
| "402": { | |
| "description": "Payment required", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Payment required to access this resource." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "429": { | |
| "description": "Too many requests", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Request rate limit exceeded. Please wait and try again later." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "500": { | |
| "description": "Server error", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "An unexpected error occurred on the server." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "/search": { | |
| "post": { | |
| "summary": "Search for a keyword on Google and return the top page results with markdown content for each page", | |
| "operationId": "searchGoogle", | |
| "tags": ["Search"], | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ], | |
| "requestBody": { | |
| "required": true, | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "query": { | |
| "type": "string", | |
| "description": "The query to search for" | |
| }, | |
| "pageOptions": { | |
| "type": "object", | |
| "properties": { | |
| "onlyMainContent": { | |
| "type": "boolean", | |
| "description": "Only return the main content of the page excluding headers, navs, footers, etc.", | |
| "default": false | |
| }, | |
| "fetchPageContent": { | |
| "type": "boolean", | |
| "description": "Fetch the content of each page. If false, defaults to a basic, fast SERP API.", | |
| "default": true | |
| }, | |
| "includeHtml": { | |
| "type": "boolean", | |
| "description": "Include the HTML version of the content on page. Will output a html key in the response.", | |
| "default": false | |
| }, | |
| "includeRawHtml": { | |
| "type": "boolean", | |
| "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", | |
| "default": false | |
| } | |
| } | |
| }, | |
| "searchOptions": { | |
| "type": "object", | |
| "properties": { | |
| "limit": { | |
| "type": "integer", | |
| "description": "Maximum number of results. Max is 20 during beta." | |
| } | |
| } | |
| } | |
| }, | |
| "required": ["query"] | |
| } | |
| } | |
| } | |
| }, | |
| "responses": { | |
| "200": { | |
| "description": "Successful response", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "$ref": "#/components/schemas/SearchResponse" | |
| } | |
| } | |
| } | |
| }, | |
| "402": { | |
| "description": "Payment required", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Payment required to access this resource." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "429": { | |
| "description": "Too many requests", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Request rate limit exceeded. Please wait and try again later." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "500": { | |
| "description": "Server error", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "An unexpected error occurred on the server." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "/crawl/status/{jobId}": { | |
| "get": { | |
| "tags": ["Crawling"], | |
| "summary": "Get the status of a crawl job", | |
| "operationId": "getCrawlStatus", | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ], | |
| "parameters": [ | |
| { | |
| "name": "jobId", | |
| "in": "path", | |
| "description": "ID of the crawl job", | |
| "required": true, | |
| "schema": { | |
| "type": "string" | |
| } | |
| } | |
| ], | |
| "responses": { | |
| "200": { | |
| "description": "Successful response", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "status": { | |
| "type": "string", | |
| "description": "Status of the job (completed, active, failed, paused)" | |
| }, | |
| "current": { | |
| "type": "integer", | |
| "description": "Current page number" | |
| }, | |
| "total": { | |
| "type": "integer", | |
| "description": "Total number of pages" | |
| }, | |
| "data": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/components/schemas/CrawlStatusResponseObj" | |
| }, | |
| "description": "Data returned from the job (null when it is in progress)" | |
| }, | |
| "partial_data": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/components/schemas/CrawlStatusResponseObj" | |
| }, | |
| "description": "Partial documents returned while the site is being crawled (streaming). **This feature is currently in alpha - expect breaking changes.** When a page is ready, it is appended to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. The array response holds a maximum of 50 items; the oldest item (top of the array) is removed when a new item is added." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "402": { | |
| "description": "Payment required", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Payment required to access this resource." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "429": { | |
| "description": "Too many requests", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Request rate limit exceeded. Please wait and try again later." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "500": { | |
| "description": "Server error", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "An unexpected error occurred on the server." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "/crawl/cancel/{jobId}": { | |
| "delete": { | |
| "tags": ["Crawling"], | |
| "summary": "Cancel a crawl job", | |
| "operationId": "cancelCrawlJob", | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ], | |
| "parameters": [ | |
| { | |
| "name": "jobId", | |
| "in": "path", | |
| "description": "ID of the crawl job", | |
| "required": true, | |
| "schema": { | |
| "type": "string" | |
| } | |
| } | |
| ], | |
| "responses": { | |
| "200": { | |
| "description": "Successful response", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "status": { | |
| "type": "string", | |
| "description": "Returns cancelled." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "402": { | |
| "description": "Payment required", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Payment required to access this resource." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "429": { | |
| "description": "Too many requests", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "Request rate limit exceeded. Please wait and try again later." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "500": { | |
| "description": "Server error", | |
| "content": { | |
| "application/json": { | |
| "schema": { | |
| "type": "object", | |
| "properties": { | |
| "error": { | |
| "type": "string", | |
| "example": "An unexpected error occurred on the server." | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "components": { | |
| "securitySchemes": { | |
| "bearerAuth": { | |
| "type": "http", | |
| "scheme": "bearer" | |
| } | |
| }, | |
| "schemas": { | |
| "ScrapeResponse": { | |
| "type": "object", | |
| "properties": { | |
| "success": { | |
| "type": "boolean" | |
| }, | |
| "data": { | |
| "type": "object", | |
| "properties": { | |
| "markdown": { | |
| "type": "string" | |
| }, | |
| "content": { | |
| "type": "string" | |
| }, | |
| "html": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "HTML version of the content on page if `includeHtml` is true" | |
| }, | |
| "rawHtml": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "Raw HTML content of the page if `includeRawHtml` is true" | |
| }, | |
| "metadata": { | |
| "type": "object", | |
| "properties": { | |
| "title": { | |
| "type": "string" | |
| }, | |
| "description": { | |
| "type": "string" | |
| }, | |
| "language": { | |
| "type": "string", | |
| "nullable": true | |
| }, | |
| "sourceURL": { | |
| "type": "string", | |
| "format": "uri" | |
| }, | |
| "<any other metadata> ": { | |
| "type": "string" | |
| }, | |
| "pageStatusCode": { | |
| "type": "integer", | |
| "description": "The status code of the page" | |
| }, | |
| "pageError": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "The error message of the page" | |
| } | |
| } | |
| }, | |
| "llm_extraction": { | |
| "type": "object", | |
| "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.", | |
| "nullable": true | |
| }, | |
| "warning": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction." | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "CrawlStatusResponseObj": { | |
| "type": "object", | |
| "properties": { | |
| "markdown": { | |
| "type": "string" | |
| }, | |
| "content": { | |
| "type": "string" | |
| }, | |
| "html": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "HTML version of the content on page if `includeHtml` is true" | |
| }, | |
| "rawHtml": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "Raw HTML content of the page if `includeRawHtml` is true" | |
| }, | |
| "index": { | |
| "type": "integer", | |
| "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from." | |
| }, | |
| "metadata": { | |
| "type": "object", | |
| "properties": { | |
| "title": { | |
| "type": "string" | |
| }, | |
| "description": { | |
| "type": "string" | |
| }, | |
| "language": { | |
| "type": "string", | |
| "nullable": true | |
| }, | |
| "sourceURL": { | |
| "type": "string", | |
| "format": "uri" | |
| }, | |
| "<any other metadata> ": { | |
| "type": "string" | |
| }, | |
| "pageStatusCode": { | |
| "type": "integer", | |
| "description": "The status code of the page" | |
| }, | |
| "pageError": { | |
| "type": "string", | |
| "nullable": true, | |
| "description": "The error message of the page" | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "SearchResponse": { | |
| "type": "object", | |
| "properties": { | |
| "success": { | |
| "type": "boolean" | |
| }, | |
| "data": { | |
| "type": "array", | |
| "items": { | |
| "type": "object", | |
| "properties": { | |
| "url": { | |
| "type": "string" | |
| }, | |
| "markdown": { | |
| "type": "string" | |
| }, | |
| "content": { | |
| "type": "string" | |
| }, | |
| "metadata": { | |
| "type": "object", | |
| "properties": { | |
| "title": { | |
| "type": "string" | |
| }, | |
| "description": { | |
| "type": "string" | |
| }, | |
| "language": { | |
| "type": "string", | |
| "nullable": true | |
| }, | |
| "sourceURL": { | |
| "type": "string", | |
| "format": "uri" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "CrawlResponse": { | |
| "type": "object", | |
| "properties": { | |
| "jobId": { | |
| "type": "string" | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "security": [ | |
| { | |
| "bearerAuth": [] | |
| } | |
| ] | |
| } |