Echo-AI-official committed on
Commit
0e759d2
·
verified ·
1 Parent(s): 5ca4a31

Upload 280 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +4 -0
  2. .env +24 -0
  3. .env.example +80 -0
  4. .env.local +15 -0
  5. .gitattributes +2 -35
  6. .gitignore +13 -0
  7. .prettierrc +3 -0
  8. Dockerfile +75 -0
  9. docker-entrypoint.sh +24 -0
  10. jest.config.js +8 -0
  11. jest.setup.js +1 -0
  12. openapi-v0.json +924 -0
  13. openapi.json +929 -0
  14. package.json +156 -0
  15. pnpm-lock.yaml +0 -0
  16. requests.http +127 -0
  17. sharedLibs/go-html-to-md/.gitignore +2 -0
  18. sharedLibs/go-html-to-md/README.md +7 -0
  19. sharedLibs/go-html-to-md/go.mod +16 -0
  20. sharedLibs/go-html-to-md/go.sum +64 -0
  21. sharedLibs/go-html-to-md/html-to-markdown.go +25 -0
  22. sharedLibs/html-transformer/.gitignore +1 -0
  23. sharedLibs/html-transformer/Cargo.lock +1235 -0
  24. sharedLibs/html-transformer/Cargo.toml +15 -0
  25. sharedLibs/html-transformer/src/lib.rs +394 -0
  26. src/__tests__/concurrency-limit.test.ts +209 -0
  27. src/__tests__/deep-research/unit/deep-research-redis.test.ts +135 -0
  28. src/__tests__/e2e_extract/index.test.ts +340 -0
  29. src/__tests__/e2e_full_withAuth/index.test.ts +1762 -0
  30. src/__tests__/e2e_map/index.test.ts +117 -0
  31. src/__tests__/e2e_noAuth/index.test.ts +212 -0
  32. src/__tests__/e2e_v1_withAuth/index.test.ts +1066 -0
  33. src/__tests__/e2e_v1_withAuth_all_params/index.test.ts +711 -0
  34. src/__tests__/e2e_withAuth/index.test.ts +862 -0
  35. src/__tests__/queue-concurrency-integration.test.ts +269 -0
  36. src/__tests__/snips/batch-scrape.test.ts +51 -0
  37. src/__tests__/snips/billing.test.ts +197 -0
  38. src/__tests__/snips/crawl.test.ts +75 -0
  39. src/__tests__/snips/extract.test.ts +59 -0
  40. src/__tests__/snips/lib.ts +273 -0
  41. src/__tests__/snips/map.test.ts +34 -0
  42. src/__tests__/snips/mocks/map-query-params.json +0 -0
  43. src/__tests__/snips/mocks/mocking-works-properly.json +107 -0
  44. src/__tests__/snips/scrape.test.ts +330 -0
  45. src/__tests__/snips/search.test.ts +23 -0
  46. src/__tests__/snips/utils/collect-mocks.js +14 -0
  47. src/control.ts +2 -0
  48. src/controllers/__tests__/crawl.test.ts +51 -0
  49. src/controllers/auth.ts +519 -0
  50. src/controllers/v0/admin/acuc-cache-clear.ts +24 -0
.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /node_modules/
2
+ /dist/
3
+ .env
4
+ *.csv
.env ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ./apps/api/.env
2
+
3
+ # ===== Required ENVS ======
4
+ NUM_WORKERS_PER_QUEUE=8
5
+ PORT=3002
6
+ HOST=0.0.0.0
7
+
8
+ # For running locally, use redis://localhost:6379
9
+ REDIS_URL=redis://localhost:6379
10
+
11
+ # For running locally, use redis://localhost:6379
12
+ REDIS_RATE_LIMIT_URL=redis://localhost:6379
13
+ PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html # Note: The docs mention this value for Docker self-hosting, its role in a pure local setup isn't fully detailed here but listed as required.
14
+
15
+ ## Keep DB authentication off for the basic setup
16
+ USE_DB_AUTHENTICATION=false
17
+
18
+ # ===== Optional ENVS ======
19
+ # You can leave the rest blank or commented out for the initial setup
20
+ # SUPABASE_ANON_TOKEN=
21
+ # SUPABASE_URL=
22
+ # SUPABASE_SERVICE_TOKEN=
23
+ # ... other optional variables ...
24
+ LOGGING_LEVEL=INFO # Default logging level
.env.example ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===== Required ENVS ======
2
+ NUM_WORKERS_PER_QUEUE=8
3
+ PORT=3002
4
+ HOST=0.0.0.0
5
+ REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
6
+ REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
7
+ PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
8
+
9
+ ## To turn on DB authentication, you need to set up supabase.
10
+ USE_DB_AUTHENTICATION=true
11
+
12
+ # ===== Optional ENVS ======
13
+
14
+ # SearchApi key. Head to https://searchapi.com/ to get your API key
15
+ SEARCHAPI_API_KEY=
16
+ # SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
17
+ SEARCHAPI_ENGINE=
18
+
19
+ # Supabase Setup (used to support DB authentication, advanced logging, etc.)
20
+ SUPABASE_ANON_TOKEN=
21
+ SUPABASE_URL=
22
+ SUPABASE_SERVICE_TOKEN=
23
+
24
+ # Other Optionals
25
+ # use if you've set up authentication and want to test with a real API key
26
+ TEST_API_KEY=
27
+ # set if you'd like to test the scraping rate limit
28
+ RATE_LIMIT_TEST_API_KEY_SCRAPE=
29
+ # set if you'd like to test the crawling rate limit
30
+ RATE_LIMIT_TEST_API_KEY_CRAWL=
31
+ # set if you'd like to use scraping Be to handle JS blocking
32
+ SCRAPING_BEE_API_KEY=
33
+ # add for LLM dependent features (image alt generation, etc.)
34
+ OPENAI_API_KEY=
35
+ BULL_AUTH_KEY=@
36
+ # set if you have a llamaparse key you'd like to use to parse pdfs
37
+ LLAMAPARSE_API_KEY=
38
+ # set if you'd like to send slack server health status messages
39
+ SLACK_WEBHOOK_URL=
40
+ # set if you'd like to send posthog events like job logs
41
+ POSTHOG_API_KEY=
42
+ # set if you'd like to send posthog events like job logs
43
+ POSTHOG_HOST=
44
+
45
+ STRIPE_PRICE_ID_STANDARD=
46
+ STRIPE_PRICE_ID_SCALE=
47
+ STRIPE_PRICE_ID_STARTER=
48
+ STRIPE_PRICE_ID_HOBBY=
49
+ STRIPE_PRICE_ID_HOBBY_YEARLY=
50
+ STRIPE_PRICE_ID_STANDARD_NEW=
51
+ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
52
+ STRIPE_PRICE_ID_GROWTH=
53
+ STRIPE_PRICE_ID_GROWTH_YEARLY=
54
+
55
+ # set if you'd like to use the fire engine closed beta
56
+ FIRE_ENGINE_BETA_URL=
57
+
58
+ # Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
59
+ PROXY_SERVER=
60
+ PROXY_USERNAME=
61
+ PROXY_PASSWORD=
62
+ # set if you'd like to block media requests to save proxy bandwidth
63
+ BLOCK_MEDIA=
64
+
65
+ # Set this to the URL of your webhook when using the self-hosted version of FireCrawl
66
+ SELF_HOSTED_WEBHOOK_URL=
67
+
68
+ # Resend API Key for transactional emails
69
+ RESEND_API_KEY=
70
+
71
+ # LOGGING_LEVEL determines the verbosity of logs that the system will output.
72
+ # Available levels are:
73
+ # NONE - No logs will be output.
74
+ # ERROR - For logging error messages that indicate a failure in a specific operation.
75
+ # WARN - For logging potentially harmful situations that are not necessarily errors.
76
+ # INFO - For logging informational messages that highlight the progress of the application.
77
+ # DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
78
+ # TRACE - For logging more detailed information than the DEBUG level.
79
+ # Set LOGGING_LEVEL to one of the above options to control logging output.
80
+ LOGGING_LEVEL=INFO
.env.local ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NUM_WORKERS_PER_QUEUE=8
2
+ PORT=
3
+ HOST=
4
+ SUPABASE_ANON_TOKEN=
5
+ SUPABASE_URL=
6
+ SUPABASE_SERVICE_TOKEN=
7
+ REDIS_URL=
8
+ REDIS_RATE_LIMIT_URL=
9
+ SCRAPING_BEE_API_KEY=
10
+ OPENAI_API_KEY=
11
+ ANTHROPIC_API_KEY=
12
+ BULL_AUTH_KEY=
13
+ LOGTAIL_KEY=
14
+ PLAYWRIGHT_MICROSERVICE_URL=
15
+ SEARCHAPI_API_KEY=
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /node_modules/
2
+ /dist/
3
+ .env
4
+ *.csv
5
+ dump.rdb
6
+ /mongo-data
7
+
8
+ /.next/
9
+
10
+ .rdb
11
+ .sentryclirc
12
+
13
+ .env.*
.prettierrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "trailingComma": "all"
3
+ }
Dockerfile ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile (place this in the root of your firecrawl project)
2
+
3
+ # 1. Base Image: Use a Node.js LTS version that includes build tools
4
+ FROM node:18
5
+
6
+ # 2. Environment Variables
7
+ ENV PNPM_HOME="/pnpm"
8
+ ENV PATH="$PNPM_HOME:$PATH"
9
+ # Prevent apt-get from asking questions
10
+ ENV DEBIAN_FRONTEND=noninteractive
11
+ # Set Node environment (can be overridden by supervisor conf or HF secrets)
12
+ ENV NODE_ENV=production
13
+
14
+ # 3. Install System Dependencies: Redis, Supervisor, Git, and utilities
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ redis-server \
17
+ supervisor \
18
+ git \
19
+ curl \
20
+ wget \
21
+ gnupg \
22
+ # Clean up apt cache
23
+ && rm -rf /var/lib/apt/lists/*
24
+
25
+ # 4. Install pnpm v9+ globally
26
+ RUN npm install -g pnpm@9
27
+
28
+ # 5. Set Application Directory
29
+ WORKDIR /usr/src/app
30
+
31
+ # 6. Copy Package Definitions & Install Dependencies (Leverages Docker cache)
32
+ # Copy root files first
33
+ COPY package.json pnpm-lock.yaml ./
34
+ # Copy workspace config if it exists (use wildcard *)
35
+ COPY pnpm-workspace.yaml* ./
36
+ # Copy the specific package.json for the api app
37
+ COPY apps/api/package.json ./apps/api/
38
+
39
+ # Install ALL monorepo dependencies using the lockfile (includes devDeps needed for playwright)
40
+ RUN pnpm install --frozen-lockfile
41
+
42
+ # 7. Install Playwright Browsers & Dependencies
43
+ # This command downloads browsers (e.g., Chromium) AND tries to install needed OS libraries.
44
+ # Run this using the 'api' package context, assuming playwright is its dependency.
45
+ # Specify the browser(s) you need (e.g., chromium). Check Firecrawl needs.
46
+ RUN pnpm --filter api exec playwright install --with-deps chromium
47
+
48
+ # 8. Copy Application Code
49
+ # Copy the rest of your Firecrawl project code into the image
50
+ COPY . .
51
+
52
+ # 9. Copy Supervisor Configuration
53
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
54
+
55
+ # 10. Configure Environment for Internal Communication (inside the container)
56
+ # These should match the values expected by the app when running internally
57
+ ENV PORT=3002 \
58
+ HOST=0.0.0.0 \
59
+ REDIS_URL=redis://localhost:6379 \
60
+ REDIS_RATE_LIMIT_URL=redis://localhost:6379 \
61
+ USE_DB_AUTHENTICATION=false \
62
+ # Add any other required non-secret ENVs here
63
+ LOGGING_LEVEL=INFO
64
+
65
+ # --- Configure Hugging Face Space specific settings ---
66
+ # Hugging Face will map its public port (e.g., 7860) to this internal port
67
+ EXPOSE 3002
68
+ # Health check endpoint (if Firecrawl has one, e.g., /test or /health)
69
+ # HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
70
+ # CMD curl -f http://localhost:3002/test || exit 1
71
+ # (Uncomment and adjust HEALTHCHECK if you know the correct endpoint)
72
+
73
+ # 11. Start Supervisor
74
+ # This command starts supervisord, which in turn starts redis, the api, and the worker(s) based on supervisord.conf
75
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
docker-entrypoint.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash -e
2
+
3
+ if [ "$UID" -eq 0 ]; then
4
+ set +e # disable failing on error
5
+ ulimit -n 65535
6
+ echo "NEW ULIMIT: $(ulimit -n)"
7
+ set -e # enable failing on error
8
+ else
9
+ echo ENTRYPOINT DID NOT RUN AS ROOT
10
+ fi
11
+
12
+ if [ "$FLY_PROCESS_GROUP" = "app" ]; then
13
+ echo "RUNNING app"
14
+ node --max-old-space-size=8192 dist/src/index.js
15
+ elif [ "$FLY_PROCESS_GROUP" = "worker" ]; then
16
+ echo "RUNNING worker"
17
+ node --max-old-space-size=8192 dist/src/services/queue-worker.js
18
+ elif [ "$FLY_PROCESS_GROUP" = "index-worker" ]; then
19
+ echo "RUNNING index worker"
20
+ node --max-old-space-size=8192 dist/src/services/indexing/index-worker.js
21
+ else
22
+ echo "NO FLY PROCESS GROUP"
23
+ node --max-old-space-size=8192 dist/src/index.js
24
+ fi
jest.config.js ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ module.exports = {
2
+ preset: "ts-jest",
3
+ testEnvironment: "node",
4
+ setupFiles: ["./jest.setup.js"],
5
+ // ignore dist folder root dir
6
+ modulePathIgnorePatterns: ["<rootDir>/dist/"],
7
+
8
+ };
jest.setup.js ADDED
@@ -0,0 +1 @@
 
 
1
+ // global.fetch = require('jest-fetch-mock');
openapi-v0.json ADDED
@@ -0,0 +1,924 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "openapi": "3.0.0",
3
+ "info": {
4
+ "title": "Firecrawl API",
5
+ "version": "0.0.0",
6
+ "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
7
+ "contact": {
8
+ "name": "Firecrawl Support",
9
+ "url": "https://firecrawl.dev/support",
10
+ "email": "support@firecrawl.dev"
11
+ }
12
+ },
13
+ "servers": [
14
+ {
15
+ "url": "https://api.firecrawl.dev/v0"
16
+ }
17
+ ],
18
+ "paths": {
19
+ "/scrape": {
20
+ "post": {
21
+ "summary": "Scrape a single URL and optionally extract information using an LLM",
22
+ "operationId": "scrapeAndExtractFromUrl",
23
+ "tags": ["Scraping"],
24
+ "security": [
25
+ {
26
+ "bearerAuth": []
27
+ }
28
+ ],
29
+ "requestBody": {
30
+ "required": true,
31
+ "content": {
32
+ "application/json": {
33
+ "schema": {
34
+ "type": "object",
35
+ "properties": {
36
+ "url": {
37
+ "type": "string",
38
+ "format": "uri",
39
+ "description": "The URL to scrape"
40
+ },
41
+ "pageOptions": {
42
+ "type": "object",
43
+ "properties": {
44
+ "headers": {
45
+ "type": "object",
46
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
47
+ },
48
+ "includeHtml": {
49
+ "type": "boolean",
50
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
51
+ "default": false
52
+ },
53
+ "includeRawHtml": {
54
+ "type": "boolean",
55
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
56
+ "default": false
57
+ },
58
+ "onlyIncludeTags": {
59
+ "type": "array",
60
+ "items": {
61
+ "type": "string"
62
+ },
63
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
64
+ },
65
+ "onlyMainContent": {
66
+ "type": "boolean",
67
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
68
+ "default": false
69
+ },
70
+ "removeTags": {
71
+ "type": "array",
72
+ "items": {
73
+ "type": "string"
74
+ },
75
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
76
+ },
77
+ "replaceAllPathsWithAbsolutePaths": {
78
+ "type": "boolean",
79
+ "description": "Replace all relative paths with absolute paths for images and links",
80
+ "default": false
81
+ },
82
+ "screenshot": {
83
+ "type": "boolean",
84
+ "description": "Include a screenshot of the top of the page that you are scraping.",
85
+ "default": false
86
+ },
87
+ "fullPageScreenshot": {
88
+ "type": "boolean",
89
+ "description": "Include a full page screenshot of the page that you are scraping.",
90
+ "default": false
91
+ },
92
+ "waitFor": {
93
+ "type": "integer",
94
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
95
+ "default": 0
96
+ }
97
+ }
98
+ },
99
+ "extractorOptions": {
100
+ "type": "object",
101
+ "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
102
+ "default": {},
103
+ "properties": {
104
+ "mode": {
105
+ "type": "string",
106
+ "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
107
+ "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
108
+ },
109
+ "extractionPrompt": {
110
+ "type": "string",
111
+ "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
112
+ },
113
+ "extractionSchema": {
114
+ "type": "object",
115
+ "additionalProperties": true,
116
+ "description": "The schema for the data to be extracted, required only for LLM extraction modes.",
117
+ "required": [
118
+ "company_mission",
119
+ "supports_sso",
120
+ "is_open_source"
121
+ ]
122
+ }
123
+ }
124
+ },
125
+ "timeout": {
126
+ "type": "integer",
127
+ "description": "Timeout in milliseconds for the request",
128
+ "default": 30000
129
+ }
130
+ },
131
+ "required": ["url"]
132
+ }
133
+ }
134
+ }
135
+ },
136
+ "responses": {
137
+ "200": {
138
+ "description": "Successful response",
139
+ "content": {
140
+ "application/json": {
141
+ "schema": {
142
+ "$ref": "#/components/schemas/ScrapeResponse"
143
+ }
144
+ }
145
+ }
146
+ },
147
+ "402": {
148
+ "description": "Payment required",
149
+ "content": {
150
+ "application/json": {
151
+ "schema": {
152
+ "type": "object",
153
+ "properties": {
154
+ "error": {
155
+ "type": "string",
156
+ "example": "Payment required to access this resource."
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }
162
+ },
163
+ "429": {
164
+ "description": "Too many requests",
165
+ "content": {
166
+ "application/json": {
167
+ "schema": {
168
+ "type": "object",
169
+ "properties": {
170
+ "error": {
171
+ "type": "string",
172
+ "example": "Request rate limit exceeded. Please wait and try again later."
173
+ }
174
+ }
175
+ }
176
+ }
177
+ }
178
+ },
179
+ "500": {
180
+ "description": "Server error",
181
+ "content": {
182
+ "application/json": {
183
+ "schema": {
184
+ "type": "object",
185
+ "properties": {
186
+ "error": {
187
+ "type": "string",
188
+ "example": "An unexpected error occurred on the server."
189
+ }
190
+ }
191
+ }
192
+ }
193
+ }
194
+ }
195
+ }
196
+ }
197
+ },
198
+ "/crawl": {
199
+ "post": {
200
+ "summary": "Crawl multiple URLs based on options",
201
+ "operationId": "crawlUrls",
202
+ "tags": ["Crawling"],
203
+ "security": [
204
+ {
205
+ "bearerAuth": []
206
+ }
207
+ ],
208
+ "requestBody": {
209
+ "required": true,
210
+ "content": {
211
+ "application/json": {
212
+ "schema": {
213
+ "type": "object",
214
+ "properties": {
215
+ "url": {
216
+ "type": "string",
217
+ "format": "uri",
218
+ "description": "The base URL to start crawling from"
219
+ },
220
+ "crawlerOptions": {
221
+ "type": "object",
222
+ "properties": {
223
+ "includes": {
224
+ "type": "array",
225
+ "items": {
226
+ "type": "string"
227
+ },
228
+ "description": "URL patterns to include"
229
+ },
230
+ "excludes": {
231
+ "type": "array",
232
+ "items": {
233
+ "type": "string"
234
+ },
235
+ "description": "URL patterns to exclude"
236
+ },
237
+ "generateImgAltText": {
238
+ "type": "boolean",
239
+ "description": "Generate alt text for images using LLMs (must have a paid plan)",
240
+ "default": false
241
+ },
242
+ "returnOnlyUrls": {
243
+ "type": "boolean",
244
+ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
245
+ "default": false
246
+ },
247
+ "maxDepth": {
248
+ "type": "integer",
249
+ "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
250
+ },
251
+ "mode": {
252
+ "type": "string",
253
+ "enum": ["default", "fast"],
254
+ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
255
+ "default": "default"
256
+ },
257
+ "ignoreSitemap": {
258
+ "type": "boolean",
259
+ "description": "Ignore the website sitemap when crawling",
260
+ "default": false
261
+ },
262
+ "limit": {
263
+ "type": "integer",
264
+ "description": "Maximum number of pages to crawl",
265
+ "default": 10000
266
+ },
267
+ "allowBackwardCrawling": {
268
+ "type": "boolean",
269
+ "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
270
+ "default": false
271
+ },
272
+ "allowExternalContentLinks": {
273
+ "type": "boolean",
274
+ "description": "Allows the crawler to follow links to external websites.",
275
+ "default": false
276
+ }
277
+ }
278
+ },
279
+ "pageOptions": {
280
+ "type": "object",
281
+ "properties": {
282
+ "headers": {
283
+ "type": "object",
284
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
285
+ },
286
+ "includeHtml": {
287
+ "type": "boolean",
288
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
289
+ "default": false
290
+ },
291
+ "includeRawHtml": {
292
+ "type": "boolean",
293
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
294
+ "default": false
295
+ },
296
+ "onlyIncludeTags": {
297
+ "type": "array",
298
+ "items": {
299
+ "type": "string"
300
+ },
301
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
302
+ },
303
+ "onlyMainContent": {
304
+ "type": "boolean",
305
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
306
+ "default": false
307
+ },
308
+ "removeTags": {
309
+ "type": "array",
310
+ "items": {
311
+ "type": "string"
312
+ },
313
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
314
+ },
315
+ "replaceAllPathsWithAbsolutePaths": {
316
+ "type": "boolean",
317
+ "description": "Replace all relative paths with absolute paths for images and links",
318
+ "default": false
319
+ },
320
+ "screenshot": {
321
+ "type": "boolean",
322
+ "description": "Include a screenshot of the top of the page that you are scraping.",
323
+ "default": false
324
+ },
325
+ "fullPageScreenshot": {
326
+ "type": "boolean",
327
+ "description": "Include a full page screenshot of the page that you are scraping.",
328
+ "default": false
329
+ },
330
+ "waitFor": {
331
+ "type": "integer",
332
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
333
+ "default": 0
334
+ }
335
+ }
336
+ }
337
+ },
338
+ "required": ["url"]
339
+ }
340
+ }
341
+ }
342
+ },
343
+ "responses": {
344
+ "200": {
345
+ "description": "Successful response",
346
+ "content": {
347
+ "application/json": {
348
+ "schema": {
349
+ "$ref": "#/components/schemas/CrawlResponse"
350
+ }
351
+ }
352
+ }
353
+ },
354
+ "402": {
355
+ "description": "Payment required",
356
+ "content": {
357
+ "application/json": {
358
+ "schema": {
359
+ "type": "object",
360
+ "properties": {
361
+ "error": {
362
+ "type": "string",
363
+ "example": "Payment required to access this resource."
364
+ }
365
+ }
366
+ }
367
+ }
368
+ }
369
+ },
370
+ "429": {
371
+ "description": "Too many requests",
372
+ "content": {
373
+ "application/json": {
374
+ "schema": {
375
+ "type": "object",
376
+ "properties": {
377
+ "error": {
378
+ "type": "string",
379
+ "example": "Request rate limit exceeded. Please wait and try again later."
380
+ }
381
+ }
382
+ }
383
+ }
384
+ }
385
+ },
386
+ "500": {
387
+ "description": "Server error",
388
+ "content": {
389
+ "application/json": {
390
+ "schema": {
391
+ "type": "object",
392
+ "properties": {
393
+ "error": {
394
+ "type": "string",
395
+ "example": "An unexpected error occurred on the server."
396
+ }
397
+ }
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+ }
404
+ },
405
+ "/search": {
406
+ "post": {
407
+ "summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
408
+ "operationId": "searchGoogle",
409
+ "tags": ["Search"],
410
+ "security": [
411
+ {
412
+ "bearerAuth": []
413
+ }
414
+ ],
415
+ "requestBody": {
416
+ "required": true,
417
+ "content": {
418
+ "application/json": {
419
+ "schema": {
420
+ "type": "object",
421
+ "properties": {
422
+ "query": {
423
+ "type": "string",
424
+ "format": "uri",
425
+ "description": "The query to search for"
426
+ },
427
+ "pageOptions": {
428
+ "type": "object",
429
+ "properties": {
430
+ "onlyMainContent": {
431
+ "type": "boolean",
432
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
433
+ "default": false
434
+ },
435
+ "fetchPageContent": {
436
+ "type": "boolean",
437
+ "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
438
+ "default": true
439
+ },
440
+ "includeHtml": {
441
+ "type": "boolean",
442
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
443
+ "default": false
444
+ },
445
+ "includeRawHtml": {
446
+ "type": "boolean",
447
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
448
+ "default": false
449
+ }
450
+ }
451
+ },
452
+ "searchOptions": {
453
+ "type": "object",
454
+ "properties": {
455
+ "limit": {
456
+ "type": "integer",
457
+ "description": "Maximum number of results. Max is 20 during beta."
458
+ }
459
+ }
460
+ }
461
+ },
462
+ "required": ["query"]
463
+ }
464
+ }
465
+ }
466
+ },
467
+ "responses": {
468
+ "200": {
469
+ "description": "Successful response",
470
+ "content": {
471
+ "application/json": {
472
+ "schema": {
473
+ "$ref": "#/components/schemas/SearchResponse"
474
+ }
475
+ }
476
+ }
477
+ },
478
+ "402": {
479
+ "description": "Payment required",
480
+ "content": {
481
+ "application/json": {
482
+ "schema": {
483
+ "type": "object",
484
+ "properties": {
485
+ "error": {
486
+ "type": "string",
487
+ "example": "Payment required to access this resource."
488
+ }
489
+ }
490
+ }
491
+ }
492
+ }
493
+ },
494
+ "429": {
495
+ "description": "Too many requests",
496
+ "content": {
497
+ "application/json": {
498
+ "schema": {
499
+ "type": "object",
500
+ "properties": {
501
+ "error": {
502
+ "type": "string",
503
+ "example": "Request rate limit exceeded. Please wait and try again later."
504
+ }
505
+ }
506
+ }
507
+ }
508
+ }
509
+ },
510
+ "500": {
511
+ "description": "Server error",
512
+ "content": {
513
+ "application/json": {
514
+ "schema": {
515
+ "type": "object",
516
+ "properties": {
517
+ "error": {
518
+ "type": "string",
519
+ "example": "An unexpected error occurred on the server."
520
+ }
521
+ }
522
+ }
523
+ }
524
+ }
525
+ }
526
+ }
527
+ }
528
+ },
529
+ "/crawl/status/{jobId}": {
530
+ "get": {
531
+ "tags": ["Crawl"],
532
+ "summary": "Get the status of a crawl job",
533
+ "operationId": "getCrawlStatus",
534
+ "security": [
535
+ {
536
+ "bearerAuth": []
537
+ }
538
+ ],
539
+ "parameters": [
540
+ {
541
+ "name": "jobId",
542
+ "in": "path",
543
+ "description": "ID of the crawl job",
544
+ "required": true,
545
+ "schema": {
546
+ "type": "string"
547
+ }
548
+ }
549
+ ],
550
+ "responses": {
551
+ "200": {
552
+ "description": "Successful response",
553
+ "content": {
554
+ "application/json": {
555
+ "schema": {
556
+ "type": "object",
557
+ "properties": {
558
+ "status": {
559
+ "type": "string",
560
+ "description": "Status of the job (completed, active, failed, paused)"
561
+ },
562
+ "current": {
563
+ "type": "integer",
564
+ "description": "Current page number"
565
+ },
566
+ "total": {
567
+ "type": "integer",
568
+ "description": "Total number of pages"
569
+ },
570
+ "data": {
571
+ "type": "array",
572
+ "items": {
573
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
574
+ },
575
+ "description": "Data returned from the job (null when it is in progress)"
576
+ },
577
+ "partial_data": {
578
+ "type": "array",
579
+ "items": {
580
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
581
+ },
582
+ "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
583
+ }
584
+ }
585
+ }
586
+ }
587
+ }
588
+ },
589
+ "402": {
590
+ "description": "Payment required",
591
+ "content": {
592
+ "application/json": {
593
+ "schema": {
594
+ "type": "object",
595
+ "properties": {
596
+ "error": {
597
+ "type": "string",
598
+ "example": "Payment required to access this resource."
599
+ }
600
+ }
601
+ }
602
+ }
603
+ }
604
+ },
605
+ "429": {
606
+ "description": "Too many requests",
607
+ "content": {
608
+ "application/json": {
609
+ "schema": {
610
+ "type": "object",
611
+ "properties": {
612
+ "error": {
613
+ "type": "string",
614
+ "example": "Request rate limit exceeded. Please wait and try again later."
615
+ }
616
+ }
617
+ }
618
+ }
619
+ }
620
+ },
621
+ "500": {
622
+ "description": "Server error",
623
+ "content": {
624
+ "application/json": {
625
+ "schema": {
626
+ "type": "object",
627
+ "properties": {
628
+ "error": {
629
+ "type": "string",
630
+ "example": "An unexpected error occurred on the server."
631
+ }
632
+ }
633
+ }
634
+ }
635
+ }
636
+ }
637
+ }
638
+ }
639
+ },
640
+ "/crawl/cancel/{jobId}": {
641
+ "delete": {
642
+ "tags": ["Crawl"],
643
+ "summary": "Cancel a crawl job",
644
+ "operationId": "cancelCrawlJob",
645
+ "security": [
646
+ {
647
+ "bearerAuth": []
648
+ }
649
+ ],
650
+ "parameters": [
651
+ {
652
+ "name": "jobId",
653
+ "in": "path",
654
+ "description": "ID of the crawl job",
655
+ "required": true,
656
+ "schema": {
657
+ "type": "string"
658
+ }
659
+ }
660
+ ],
661
+ "responses": {
662
+ "200": {
663
+ "description": "Successful response",
664
+ "content": {
665
+ "application/json": {
666
+ "schema": {
667
+ "type": "object",
668
+ "properties": {
669
+ "status": {
670
+ "type": "string",
671
+ "description": "Returns cancelled."
672
+ }
673
+ }
674
+ }
675
+ }
676
+ }
677
+ },
678
+ "402": {
679
+ "description": "Payment required",
680
+ "content": {
681
+ "application/json": {
682
+ "schema": {
683
+ "type": "object",
684
+ "properties": {
685
+ "error": {
686
+ "type": "string",
687
+ "example": "Payment required to access this resource."
688
+ }
689
+ }
690
+ }
691
+ }
692
+ }
693
+ },
694
+ "429": {
695
+ "description": "Too many requests",
696
+ "content": {
697
+ "application/json": {
698
+ "schema": {
699
+ "type": "object",
700
+ "properties": {
701
+ "error": {
702
+ "type": "string",
703
+ "example": "Request rate limit exceeded. Please wait and try again later."
704
+ }
705
+ }
706
+ }
707
+ }
708
+ }
709
+ },
710
+ "500": {
711
+ "description": "Server error",
712
+ "content": {
713
+ "application/json": {
714
+ "schema": {
715
+ "type": "object",
716
+ "properties": {
717
+ "error": {
718
+ "type": "string",
719
+ "example": "An unexpected error occurred on the server."
720
+ }
721
+ }
722
+ }
723
+ }
724
+ }
725
+ }
726
+ }
727
+ }
728
+ }
729
+ },
730
+ "components": {
731
+ "securitySchemes": {
732
+ "bearerAuth": {
733
+ "type": "http",
734
+ "scheme": "bearer"
735
+ }
736
+ },
737
+ "schemas": {
738
+ "ScrapeResponse": {
739
+ "type": "object",
740
+ "properties": {
741
+ "success": {
742
+ "type": "boolean"
743
+ },
744
+ "data": {
745
+ "type": "object",
746
+ "properties": {
747
+ "markdown": {
748
+ "type": "string"
749
+ },
750
+ "content": {
751
+ "type": "string"
752
+ },
753
+ "html": {
754
+ "type": "string",
755
+ "nullable": true,
756
+ "description": "HTML version of the content on page if `includeHtml` is true"
757
+ },
758
+ "rawHtml": {
759
+ "type": "string",
760
+ "nullable": true,
761
+ "description": "Raw HTML content of the page if `includeRawHtml` is true"
762
+ },
763
+ "metadata": {
764
+ "type": "object",
765
+ "properties": {
766
+ "title": {
767
+ "type": "string"
768
+ },
769
+ "description": {
770
+ "type": "string"
771
+ },
772
+ "language": {
773
+ "type": "string",
774
+ "nullable": true
775
+ },
776
+ "sourceURL": {
777
+ "type": "string",
778
+ "format": "uri"
779
+ },
780
+ "<any other metadata> ": {
781
+ "type": "string"
782
+ },
783
+ "pageStatusCode": {
784
+ "type": "integer",
785
+ "description": "The status code of the page"
786
+ },
787
+ "pageError": {
788
+ "type": "string",
789
+ "nullable": true,
790
+ "description": "The error message of the page"
791
+ }
792
+
793
+ }
794
+ },
795
+ "llm_extraction": {
796
+ "type": "object",
797
+ "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
798
+ "nullable": true
799
+ },
800
+ "warning": {
801
+ "type": "string",
802
+ "nullable": true,
803
+ "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
804
+ }
805
+ }
806
+ }
807
+ }
808
+ },
809
+ "CrawlStatusResponseObj": {
810
+ "type": "object",
811
+ "properties": {
812
+ "markdown": {
813
+ "type": "string"
814
+ },
815
+ "content": {
816
+ "type": "string"
817
+ },
818
+ "html": {
819
+ "type": "string",
820
+ "nullable": true,
821
+ "description": "HTML version of the content on page if `includeHtml` is true"
822
+ },
823
+ "rawHtml": {
824
+ "type": "string",
825
+ "nullable": true,
826
+ "description": "Raw HTML content of the page if `includeRawHtml` is true"
827
+ },
828
+ "index": {
829
+ "type": "integer",
830
+ "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
831
+ },
832
+ "metadata": {
833
+ "type": "object",
834
+ "properties": {
835
+ "title": {
836
+ "type": "string"
837
+ },
838
+ "description": {
839
+ "type": "string"
840
+ },
841
+ "language": {
842
+ "type": "string",
843
+ "nullable": true
844
+ },
845
+ "sourceURL": {
846
+ "type": "string",
847
+ "format": "uri"
848
+ },
849
+ "<any other metadata> ": {
850
+ "type": "string"
851
+ },
852
+ "pageStatusCode": {
853
+ "type": "integer",
854
+ "description": "The status code of the page"
855
+ },
856
+ "pageError": {
857
+ "type": "string",
858
+ "nullable": true,
859
+ "description": "The error message of the page"
860
+ }
861
+ }
862
+ }
863
+ }
864
+ },
865
+ "SearchResponse": {
866
+ "type": "object",
867
+ "properties": {
868
+ "success": {
869
+ "type": "boolean"
870
+ },
871
+ "data": {
872
+ "type": "array",
873
+ "items": {
874
+ "type": "object",
875
+ "properties": {
876
+ "url": {
877
+ "type": "string"
878
+ },
879
+ "markdown": {
880
+ "type": "string"
881
+ },
882
+ "content": {
883
+ "type": "string"
884
+ },
885
+ "metadata": {
886
+ "type": "object",
887
+ "properties": {
888
+ "title": {
889
+ "type": "string"
890
+ },
891
+ "description": {
892
+ "type": "string"
893
+ },
894
+ "language": {
895
+ "type": "string",
896
+ "nullable": true
897
+ },
898
+ "sourceURL": {
899
+ "type": "string",
900
+ "format": "uri"
901
+ }
902
+ }
903
+ }
904
+ }
905
+ }
906
+ }
907
+ }
908
+ },
909
+ "CrawlResponse": {
910
+ "type": "object",
911
+ "properties": {
912
+ "jobId": {
913
+ "type": "string"
914
+ }
915
+ }
916
+ }
917
+ }
918
+ },
919
+ "security": [
920
+ {
921
+ "bearerAuth": []
922
+ }
923
+ ]
924
+ }
openapi.json ADDED
@@ -0,0 +1,929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "openapi": "3.0.0",
3
+ "info": {
4
+ "title": "Firecrawl API",
5
+ "version": "1.0.0",
6
+ "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
7
+ "contact": {
8
+ "name": "Firecrawl Support",
9
+ "url": "https://firecrawl.dev/support",
10
+ "email": "support@firecrawl.dev"
11
+ }
12
+ },
13
+ "servers": [
14
+ {
15
+ "url": "https://api.firecrawl.dev/v0"
16
+ }
17
+ ],
18
+ "paths": {
19
+ "/scrape": {
20
+ "post": {
21
+ "summary": "Scrape a single URL",
22
+ "operationId": "scrape",
23
+ "tags": ["Scraping"],
24
+ "security": [
25
+ {
26
+ "bearerAuth": []
27
+ }
28
+ ],
29
+ "requestBody": {
30
+ "required": true,
31
+ "content": {
32
+ "application/json": {
33
+ "schema": {
34
+ "type": "object",
35
+ "properties": {
36
+ "url": {
37
+ "type": "string",
38
+ "format": "uri",
39
+ "description": "The URL to scrape"
40
+ },
41
+ "formats": {
42
+ "type": "array",
43
+ "items": {
44
+ "type": "string",
45
+ "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
46
+ },
47
+ "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
48
+ "default": ["markdown"]
49
+ },
50
+ "headers": {
51
+ "type": "object",
52
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
53
+ },
54
+ "includeTags": {
55
+ "type": "array",
56
+ "items": {
57
+ "type": "string"
58
+ },
59
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
60
+ },
61
+ "excludeTags": {
62
+ "type": "array",
63
+ "items": {
64
+ "type": "string"
65
+ },
66
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
67
+ },
68
+ "onlyMainContent": {
69
+ "type": "boolean",
70
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
71
+ "default": true
72
+ },
73
+ "timeout": {
74
+ "type": "integer",
75
+ "description": "Timeout in milliseconds for the request",
76
+ "default": 30000
77
+ },
78
+ "waitFor": {
79
+ "type": "integer",
80
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
81
+ "default": 0
82
+ }
83
+ },
84
+ "required": ["url"]
85
+ }
86
+ }
87
+ }
88
+ },
89
+ "responses": {
90
+ "200": {
91
+ "description": "Successful response",
92
+ "content": {
93
+ "application/json": {
94
+ "schema": {
95
+ "$ref": "#/components/schemas/ScrapeResponse"
96
+ }
97
+ }
98
+ }
99
+ },
100
+ "402": {
101
+ "description": "Payment required",
102
+ "content": {
103
+ "application/json": {
104
+ "schema": {
105
+ "type": "object",
106
+ "properties": {
107
+ "error": {
108
+ "type": "string",
109
+ "example": "Payment required to access this resource."
110
+ }
111
+ }
112
+ }
113
+ }
114
+ }
115
+ },
116
+ "429": {
117
+ "description": "Too many requests",
118
+ "content": {
119
+ "application/json": {
120
+ "schema": {
121
+ "type": "object",
122
+ "properties": {
123
+ "error": {
124
+ "type": "string",
125
+ "example": "Request rate limit exceeded. Please wait and try again later."
126
+ }
127
+ }
128
+ }
129
+ }
130
+ }
131
+ },
132
+ "500": {
133
+ "description": "Server error",
134
+ "content": {
135
+ "application/json": {
136
+ "schema": {
137
+ "type": "object",
138
+ "properties": {
139
+ "error": {
140
+ "type": "string",
141
+ "example": "An unexpected error occurred on the server."
142
+ }
143
+ }
144
+ }
145
+ }
146
+ }
147
+ }
148
+ }
149
+ }
150
+ },
151
+ "/crawl": {
152
+ "post": {
153
+ "summary": "Crawl multiple URLs based on options",
154
+ "operationId": "crawlUrls",
155
+ "tags": ["Crawling"],
156
+ "security": [
157
+ {
158
+ "bearerAuth": []
159
+ }
160
+ ],
161
+ "requestBody": {
162
+ "required": true,
163
+ "content": {
164
+ "application/json": {
165
+ "schema": {
166
+ "type": "object",
167
+ "properties": {
168
+ "url": {
169
+ "type": "string",
170
+ "format": "uri",
171
+ "description": "The base URL to start crawling from"
172
+ },
173
+ "crawlerOptions": {
174
+ "type": "object",
175
+ "properties": {
176
+ "includes": {
177
+ "type": "array",
178
+ "items": {
179
+ "type": "string"
180
+ },
181
+ "description": "URL patterns to include"
182
+ },
183
+ "excludes": {
184
+ "type": "array",
185
+ "items": {
186
+ "type": "string"
187
+ },
188
+ "description": "URL patterns to exclude"
189
+ },
190
+ "generateImgAltText": {
191
+ "type": "boolean",
192
+ "description": "Generate alt text for images using LLMs (must have a paid plan)",
193
+ "default": false
194
+ },
195
+ "returnOnlyUrls": {
196
+ "type": "boolean",
197
+ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
198
+ "default": false
199
+ },
200
+ "maxDepth": {
201
+ "type": "integer",
202
+ "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
203
+ },
204
+ "mode": {
205
+ "type": "string",
206
+ "enum": ["default", "fast"],
207
+ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
208
+ "default": "default"
209
+ },
210
+ "ignoreSitemap": {
211
+ "type": "boolean",
212
+ "description": "Ignore the website sitemap when crawling",
213
+ "default": false
214
+ },
215
+ "limit": {
216
+ "type": "integer",
217
+ "description": "Maximum number of pages to crawl",
218
+ "default": 10000
219
+ },
220
+ "allowBackwardCrawling": {
221
+ "type": "boolean",
222
+ "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
223
+ "default": false
224
+ },
225
+ "allowExternalContentLinks": {
226
+ "type": "boolean",
227
+ "description": "Allows the crawler to follow links to external websites.",
228
+ "default": false
229
+ }
230
+ }
231
+ },
232
+ "pageOptions": {
233
+ "type": "object",
234
+ "properties": {
235
+ "headers": {
236
+ "type": "object",
237
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
238
+ },
239
+ "includeHtml": {
240
+ "type": "boolean",
241
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
242
+ "default": false
243
+ },
244
+ "includeRawHtml": {
245
+ "type": "boolean",
246
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
247
+ "default": false
248
+ },
249
+ "onlyIncludeTags": {
250
+ "type": "array",
251
+ "items": {
252
+ "type": "string"
253
+ },
254
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
255
+ },
256
+ "onlyMainContent": {
257
+ "type": "boolean",
258
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
259
+ "default": false
260
+ },
261
+ "removeTags": {
262
+ "type": "array",
263
+ "items": {
264
+ "type": "string"
265
+ },
266
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
267
+ },
268
+ "replaceAllPathsWithAbsolutePaths": {
269
+ "type": "boolean",
270
+ "description": "Replace all relative paths with absolute paths for images and links",
271
+ "default": false
272
+ },
273
+ "screenshot": {
274
+ "type": "boolean",
275
+ "description": "Include a screenshot of the top of the page that you are scraping.",
276
+ "default": false
277
+ },
278
+ "fullPageScreenshot": {
279
+ "type": "boolean",
280
+ "description": "Include a full page screenshot of the page that you are scraping.",
281
+ "default": false
282
+ },
283
+ "waitFor": {
284
+ "type": "integer",
285
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
286
+ "default": 0
287
+ }
288
+ }
289
+ }
290
+ },
291
+ "required": ["url"]
292
+ }
293
+ }
294
+ }
295
+ },
296
+ "responses": {
297
+ "200": {
298
+ "description": "Successful response",
299
+ "content": {
300
+ "application/json": {
301
+ "schema": {
302
+ "$ref": "#/components/schemas/CrawlResponse"
303
+ }
304
+ }
305
+ }
306
+ },
307
+ "402": {
308
+ "description": "Payment required",
309
+ "content": {
310
+ "application/json": {
311
+ "schema": {
312
+ "type": "object",
313
+ "properties": {
314
+ "error": {
315
+ "type": "string",
316
+ "example": "Payment required to access this resource."
317
+ }
318
+ }
319
+ }
320
+ }
321
+ }
322
+ },
323
+ "429": {
324
+ "description": "Too many requests",
325
+ "content": {
326
+ "application/json": {
327
+ "schema": {
328
+ "type": "object",
329
+ "properties": {
330
+ "error": {
331
+ "type": "string",
332
+ "example": "Request rate limit exceeded. Please wait and try again later."
333
+ }
334
+ }
335
+ }
336
+ }
337
+ }
338
+ },
339
+ "500": {
340
+ "description": "Server error",
341
+ "content": {
342
+ "application/json": {
343
+ "schema": {
344
+ "type": "object",
345
+ "properties": {
346
+ "error": {
347
+ "type": "string",
348
+ "example": "An unexpected error occurred on the server."
349
+ }
350
+ }
351
+ }
352
+ }
353
+ }
354
+ }
355
+ }
356
+ }
357
+ },
358
+ "/search": {
359
+ "post": {
360
+ "summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
361
+ "operationId": "searchGoogle",
362
+ "tags": ["Search"],
363
+ "security": [
364
+ {
365
+ "bearerAuth": []
366
+ }
367
+ ],
368
+ "requestBody": {
369
+ "required": true,
370
+ "content": {
371
+ "application/json": {
372
+ "schema": {
373
+ "type": "object",
374
+ "properties": {
375
+ "query": {
376
+ "type": "string",
377
+ "format": "uri",
378
+ "description": "The query to search for"
379
+ },
380
+ "pageOptions": {
381
+ "type": "object",
382
+ "properties": {
383
+ "onlyMainContent": {
384
+ "type": "boolean",
385
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
386
+ "default": false
387
+ },
388
+ "fetchPageContent": {
389
+ "type": "boolean",
390
+ "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
391
+ "default": true
392
+ },
393
+ "includeHtml": {
394
+ "type": "boolean",
395
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
396
+ "default": false
397
+ },
398
+ "includeRawHtml": {
399
+ "type": "boolean",
400
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
401
+ "default": false
402
+ }
403
+ }
404
+ },
405
+ "searchOptions": {
406
+ "type": "object",
407
+ "properties": {
408
+ "limit": {
409
+ "type": "integer",
410
+ "description": "Maximum number of results. Max is 20 during beta."
411
+ }
412
+ }
413
+ }
414
+ },
415
+ "required": ["query"]
416
+ }
417
+ }
418
+ }
419
+ },
420
+ "responses": {
421
+ "200": {
422
+ "description": "Successful response",
423
+ "content": {
424
+ "application/json": {
425
+ "schema": {
426
+ "$ref": "#/components/schemas/SearchResponse"
427
+ }
428
+ }
429
+ }
430
+ },
431
+ "402": {
432
+ "description": "Payment required",
433
+ "content": {
434
+ "application/json": {
435
+ "schema": {
436
+ "type": "object",
437
+ "properties": {
438
+ "error": {
439
+ "type": "string",
440
+ "example": "Payment required to access this resource."
441
+ }
442
+ }
443
+ }
444
+ }
445
+ }
446
+ },
447
+ "429": {
448
+ "description": "Too many requests",
449
+ "content": {
450
+ "application/json": {
451
+ "schema": {
452
+ "type": "object",
453
+ "properties": {
454
+ "error": {
455
+ "type": "string",
456
+ "example": "Request rate limit exceeded. Please wait and try again later."
457
+ }
458
+ }
459
+ }
460
+ }
461
+ }
462
+ },
463
+ "500": {
464
+ "description": "Server error",
465
+ "content": {
466
+ "application/json": {
467
+ "schema": {
468
+ "type": "object",
469
+ "properties": {
470
+ "error": {
471
+ "type": "string",
472
+ "example": "An unexpected error occurred on the server."
473
+ }
474
+ }
475
+ }
476
+ }
477
+ }
478
+ }
479
+ }
480
+ }
481
+ },
482
+ "/crawl/status/{jobId}": {
483
+ "get": {
484
+ "tags": ["Crawl"],
485
+ "summary": "Get the status of a crawl job",
486
+ "operationId": "getCrawlStatus",
487
+ "security": [
488
+ {
489
+ "bearerAuth": []
490
+ }
491
+ ],
492
+ "parameters": [
493
+ {
494
+ "name": "jobId",
495
+ "in": "path",
496
+ "description": "ID of the crawl job",
497
+ "required": true,
498
+ "schema": {
499
+ "type": "string"
500
+ }
501
+ }
502
+ ],
503
+ "responses": {
504
+ "200": {
505
+ "description": "Successful response",
506
+ "content": {
507
+ "application/json": {
508
+ "schema": {
509
+ "type": "object",
510
+ "properties": {
511
+ "status": {
512
+ "type": "string",
513
+ "description": "Status of the job (completed, active, failed, paused)"
514
+ },
515
+ "current": {
516
+ "type": "integer",
517
+ "description": "Current page number"
518
+ },
519
+ "total": {
520
+ "type": "integer",
521
+ "description": "Total number of pages"
522
+ },
523
+ "data": {
524
+ "type": "array",
525
+ "items": {
526
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
527
+ },
528
+ "description": "Data returned from the job (null when it is in progress)"
529
+ },
530
+ "partial_data": {
531
+ "type": "array",
532
+ "items": {
533
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
534
+ },
535
+ "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
536
+ }
537
+ }
538
+ }
539
+ }
540
+ }
541
+ },
542
+ "402": {
543
+ "description": "Payment required",
544
+ "content": {
545
+ "application/json": {
546
+ "schema": {
547
+ "type": "object",
548
+ "properties": {
549
+ "error": {
550
+ "type": "string",
551
+ "example": "Payment required to access this resource."
552
+ }
553
+ }
554
+ }
555
+ }
556
+ }
557
+ },
558
+ "429": {
559
+ "description": "Too many requests",
560
+ "content": {
561
+ "application/json": {
562
+ "schema": {
563
+ "type": "object",
564
+ "properties": {
565
+ "error": {
566
+ "type": "string",
567
+ "example": "Request rate limit exceeded. Please wait and try again later."
568
+ }
569
+ }
570
+ }
571
+ }
572
+ }
573
+ },
574
+ "500": {
575
+ "description": "Server error",
576
+ "content": {
577
+ "application/json": {
578
+ "schema": {
579
+ "type": "object",
580
+ "properties": {
581
+ "error": {
582
+ "type": "string",
583
+ "example": "An unexpected error occurred on the server."
584
+ }
585
+ }
586
+ }
587
+ }
588
+ }
589
+ }
590
+ }
591
+ }
592
+ },
593
+ "/crawl/cancel/{jobId}": {
594
+ "delete": {
595
+ "tags": ["Crawl"],
596
+ "summary": "Cancel a crawl job",
597
+ "operationId": "cancelCrawlJob",
598
+ "security": [
599
+ {
600
+ "bearerAuth": []
601
+ }
602
+ ],
603
+ "parameters": [
604
+ {
605
+ "name": "jobId",
606
+ "in": "path",
607
+ "description": "ID of the crawl job",
608
+ "required": true,
609
+ "schema": {
610
+ "type": "string"
611
+ }
612
+ }
613
+ ],
614
+ "responses": {
615
+ "200": {
616
+ "description": "Successful response",
617
+ "content": {
618
+ "application/json": {
619
+ "schema": {
620
+ "type": "object",
621
+ "properties": {
622
+ "status": {
623
+ "type": "string",
624
+ "description": "Returns cancelled."
625
+ }
626
+ }
627
+ }
628
+ }
629
+ }
630
+ },
631
+ "402": {
632
+ "description": "Payment required",
633
+ "content": {
634
+ "application/json": {
635
+ "schema": {
636
+ "type": "object",
637
+ "properties": {
638
+ "error": {
639
+ "type": "string",
640
+ "example": "Payment required to access this resource."
641
+ }
642
+ }
643
+ }
644
+ }
645
+ }
646
+ },
647
+ "429": {
648
+ "description": "Too many requests",
649
+ "content": {
650
+ "application/json": {
651
+ "schema": {
652
+ "type": "object",
653
+ "properties": {
654
+ "error": {
655
+ "type": "string",
656
+ "example": "Request rate limit exceeded. Please wait and try again later."
657
+ }
658
+ }
659
+ }
660
+ }
661
+ }
662
+ },
663
+ "500": {
664
+ "description": "Server error",
665
+ "content": {
666
+ "application/json": {
667
+ "schema": {
668
+ "type": "object",
669
+ "properties": {
670
+ "error": {
671
+ "type": "string",
672
+ "example": "An unexpected error occurred on the server."
673
+ }
674
+ }
675
+ }
676
+ }
677
+ }
678
+ }
679
+ }
680
+ }
681
+ }
682
+ },
683
+ "components": {
684
+ "securitySchemes": {
685
+ "bearerAuth": {
686
+ "type": "http",
687
+ "scheme": "bearer"
688
+ }
689
+ },
690
+ "schemas": {
691
+ "ScrapeResponse": {
692
+ "type": "object",
693
+ "properties": {
694
+ "success": {
695
+ "type": "boolean"
696
+ },
697
+ "warning": {
698
+ "type": "string",
699
+ "nullable": true,
700
+ "description": "Warning message to let you know of any issues."
701
+ },
702
+ "data": {
703
+ "type": "object",
704
+ "properties": {
705
+ "markdown": {
706
+ "type": "string",
707
+ "nullable": true,
708
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
709
+ },
710
+ "html": {
711
+ "type": "string",
712
+ "nullable": true,
713
+ "description": "HTML version of the content on page if the `html` format was specified"
714
+ },
715
+ "rawHtml": {
716
+ "type": "string",
717
+ "nullable": true,
718
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
719
+ },
720
+ "links": {
721
+ "type": "array",
722
+ "items": {
723
+ "type": "string",
724
+ "format": "uri"
725
+ },
726
+ "nullable": true,
727
+ "description": "Links on the page if the `links` format was specified"
728
+ },
729
+ "screenshot": {
730
+ "type": "string",
731
+ "nullable": true,
732
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
733
+ },
734
+ "metadata": {
735
+ "type": "object",
736
+ "properties": {
737
+ "title": {
738
+ "type": "string"
739
+ },
740
+ "description": {
741
+ "type": "string"
742
+ },
743
+ "language": {
744
+ "type": "string",
745
+ "nullable": true
746
+ },
747
+ "sourceURL": {
748
+ "type": "string",
749
+ "format": "uri"
750
+ },
751
+ "<any other metadata> ": {
752
+ "type": "string"
753
+ },
754
+ "statusCode": {
755
+ "type": "integer",
756
+ "description": "The status code of the page"
757
+ },
758
+ "error": {
759
+ "type": "string",
760
+ "nullable": true,
761
+ "description": "The error message of the page"
762
+ }
763
+ }
764
+ }
765
+ }
766
+ }
767
+ }
768
+ },
769
+ "CrawlStatusResponseObj": {
770
+ "type": "object",
771
+ "properties": {
772
+ "markdown": {
773
+ "type": "string",
774
+ "nullable": true,
775
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
776
+ },
777
+ "html": {
778
+ "type": "string",
779
+ "nullable": true,
780
+ "description": "HTML version of the content on page if the `html` format was specified"
781
+ },
782
+ "rawHtml": {
783
+ "type": "string",
784
+ "nullable": true,
785
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
786
+ },
787
+ "links": {
788
+ "type": "array",
789
+ "items": {
790
+ "type": "string",
791
+ "format": "uri"
792
+ },
793
+ "nullable": true,
794
+ "description": "Links on the page if the `links` format was specified"
795
+ },
796
+ "screenshot": {
797
+ "type": "string",
798
+ "nullable": true,
799
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
800
+ },
801
+ "metadata": {
802
+ "type": "object",
803
+ "properties": {
804
+ "title": {
805
+ "type": "string"
806
+ },
807
+ "description": {
808
+ "type": "string"
809
+ },
810
+ "language": {
811
+ "type": "string",
812
+ "nullable": true
813
+ },
814
+ "sourceURL": {
815
+ "type": "string",
816
+ "format": "uri"
817
+ },
818
+ "<any other metadata> ": {
819
+ "type": "string"
820
+ },
821
+ "statusCode": {
822
+ "type": "integer",
823
+ "description": "The status code of the page"
824
+ },
825
+ "error": {
826
+ "type": "string",
827
+ "nullable": true,
828
+ "description": "The error message of the page"
829
+ }
830
+ }
831
+ }
832
+ }
833
+ },
834
+ "SearchResponse": {
835
+ "type": "object",
836
+ "properties": {
837
+ "success": {
838
+ "type": "boolean"
839
+ },
840
+ "data": {
841
+ "type": "array",
842
+ "items": {
843
+ "markdown": {
844
+ "type": "string",
845
+ "nullable": true,
846
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
847
+ },
848
+ "html": {
849
+ "type": "string",
850
+ "nullable": true,
851
+ "description": "HTML version of the content on page if the `html` format was specified"
852
+ },
853
+ "rawHtml": {
854
+ "type": "string",
855
+ "nullable": true,
856
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
857
+ },
858
+ "links": {
859
+ "type": "array",
860
+ "items": {
861
+ "type": "string",
862
+ "format": "uri"
863
+ },
864
+ "nullable": true,
865
+ "description": "Links on the page if the `links` format was specified"
866
+ },
867
+ "screenshot": {
868
+ "type": "string",
869
+ "nullable": true,
870
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
871
+ },
872
+ "metadata": {
873
+ "type": "object",
874
+ "properties": {
875
+ "title": {
876
+ "type": "string"
877
+ },
878
+ "description": {
879
+ "type": "string"
880
+ },
881
+ "language": {
882
+ "type": "string",
883
+ "nullable": true
884
+ },
885
+ "sourceURL": {
886
+ "type": "string",
887
+ "format": "uri"
888
+ },
889
+ "<any other metadata> ": {
890
+ "type": "string"
891
+ },
892
+ "statusCode": {
893
+ "type": "integer",
894
+ "description": "The status code of the page"
895
+ },
896
+ "error": {
897
+ "type": "string",
898
+ "nullable": true,
899
+ "description": "The error message of the page"
900
+ }
901
+ }
902
+ }
903
+ }
904
+ }
905
+ }
906
+ },
907
+ "CrawlResponse": {
908
+ "type": "object",
909
+ "properties": {
910
+ "success": {
911
+ "type": "boolean"
912
+ },
913
+ "id": {
914
+ "type": "string"
915
+ },
916
+ "url": {
917
+ "type": "string",
918
+ "format": "uri"
919
+ }
920
+ }
921
+ }
922
+ }
923
+ },
924
+ "security": [
925
+ {
926
+ "bearerAuth": []
927
+ }
928
+ ]
929
+ }
package.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "firecrawl-scraper-js",
3
+ "version": "1.0.0",
4
+ "description": "",
5
+ "main": "index.js",
6
+ "scripts": {
7
+ "start": "nodemon --exec ts-node src/index.ts",
8
+ "start:production": "tsc && node dist/src/index.js",
9
+ "format": "prettier --write \"src/**/*.(js|ts)\"",
10
+ "flyio": "node dist/src/index.js",
11
+ "start:dev": "nodemon --exec ts-node src/index.ts",
12
+ "build": "tsc",
13
+ "build:nosentry": "tsc",
14
+ "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
15
+ "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
16
+ "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
17
+ "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
18
+ "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
19
+ "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
20
+ "worker:production": "node dist/src/services/queue-worker.js",
21
+ "index-worker": "nodemon --exec ts-node src/services/indexing/index-worker.ts",
22
+ "index-worker:production": "node dist/src/services/indexing/index-worker.js",
23
+ "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
24
+ "mongo-docker-console": "docker exec -it mongodb mongosh",
25
+ "run-example": "npx ts-node src/example.ts",
26
+ "deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN) --depot=false",
27
+ "deploy:fly:staging": "fly deploy -c fly.staging.toml --depot=false",
28
+ "sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
29
+ },
30
+ "author": "",
31
+ "license": "ISC",
32
+ "devDependencies": {
33
+ "@jest/globals": "^29.7.0",
34
+ "@tsconfig/recommended": "^1.0.3",
35
+ "@types/body-parser": "^1.19.2",
36
+ "@types/cors": "^2.8.13",
37
+ "@types/escape-html": "^1.0.4",
38
+ "@types/express": "^4.17.21",
39
+ "@types/express-ws": "^3.0.5",
40
+ "@types/jest": "^29.5.12",
41
+ "@types/lodash": "^4.17.14",
42
+ "@types/node": "^20.14.1",
43
+ "@types/pdf-parse": "^1.1.4",
44
+ "@types/supertest": "^6.0.2",
45
+ "jest": "^29.6.3",
46
+ "jest-fetch-mock": "^3.0.3",
47
+ "nodemon": "^2.0.20",
48
+ "prettier": "^3.4.2",
49
+ "supertest": "^6.3.3",
50
+ "ts-jest": "^29.1.1",
51
+ "ts-node": "^10.9.1",
52
+ "typescript": "^5.8.3"
53
+ },
54
+ "dependencies": {
55
+ "@ai-sdk/anthropic": "^1.2.4",
56
+ "@ai-sdk/deepinfra": "^0.2.4",
57
+ "@ai-sdk/fireworks": "^0.2.4",
58
+ "@ai-sdk/google": "^1.2.3",
59
+ "@ai-sdk/google-vertex": "^2.2.15",
60
+ "@ai-sdk/groq": "^1.2.1",
61
+ "@ai-sdk/openai": "^1.3.12",
62
+ "@anthropic-ai/sdk": "^0.24.3",
63
+ "@apidevtools/json-schema-ref-parser": "^11.7.3",
64
+ "@brillout/import": "^0.2.2",
65
+ "@bull-board/api": "^5.20.5",
66
+ "@bull-board/express": "^5.20.5",
67
+ "@devil7softwares/pos": "^1.0.2",
68
+ "@dqbd/tiktoken": "^1.0.17",
69
+ "@google-cloud/storage": "^7.16.0",
70
+ "@nangohq/node": "^0.40.8",
71
+ "@openrouter/ai-sdk-provider": "^0.4.5",
72
+ "@pinecone-database/pinecone": "^4.0.0",
73
+ "@sentry/cli": "^2.33.1",
74
+ "@sentry/node": "^8.26.0",
75
+ "@sentry/profiling-node": "^8.26.0",
76
+ "@supabase/supabase-js": "^2.44.2",
77
+ "@types/ws": "^8.5.12",
78
+ "ai": "^4.3.4",
79
+ "ajv": "^8.16.0",
80
+ "async": "^3.2.5",
81
+ "async-mutex": "^0.5.0",
82
+ "axios": "^1.3.4",
83
+ "axios-retry": "^4.5.0",
84
+ "body-parser": "^1.20.1",
85
+ "bottleneck": "^2.19.5",
86
+ "bullmq": "^5.36.0",
87
+ "cacheable-lookup": "^6.1.0",
88
+ "cheerio": "^1.0.0-rc.12",
89
+ "cohere": "^1.1.1",
90
+ "cohere-ai": "^7.14.0",
91
+ "cors": "^2.8.5",
92
+ "cron-parser": "^4.9.0",
93
+ "date-fns": "^3.6.0",
94
+ "dotenv": "^16.3.1",
95
+ "dotenv-cli": "^7.4.2",
96
+ "escape-html": "^1.0.3",
97
+ "express": "^4.18.2",
98
+ "express-rate-limit": "^7.3.1",
99
+ "express-ws": "^5.0.2",
100
+ "git-diff": "^2.0.6",
101
+ "glob": "^10.4.2",
102
+ "gpt3-tokenizer": "^1.1.5",
103
+ "ioredis": "^5.4.1",
104
+ "ip-address": "^10.0.1",
105
+ "joplin-turndown-plugin-gfm": "^1.0.12",
106
+ "jsdom": "^26.0.0",
107
+ "json-schema-to-zod": "^2.3.0",
108
+ "keyword-extractor": "^0.0.28",
109
+ "koffi": "^2.9.0",
110
+ "languagedetect": "^2.0.0",
111
+ "lodash": "^4.17.21",
112
+ "logsnag": "^1.0.0",
113
+ "luxon": "^3.4.3",
114
+ "mammoth": "^1.7.2",
115
+ "marked": "^14.1.2",
116
+ "md5": "^2.3.0",
117
+ "moment": "^2.29.4",
118
+ "mongoose": "^8.4.4",
119
+ "natural": "^7.0.7",
120
+ "ollama-ai-provider": "^1.2.0",
121
+ "parse-diff": "^0.11.1",
122
+ "pdf-parse": "^1.1.1",
123
+ "pos": "^0.4.2",
124
+ "posthog-node": "^4.0.1",
125
+ "promptable": "^0.0.10",
126
+ "puppeteer": "^22.12.1",
127
+ "rate-limiter-flexible": "2.4.2",
128
+ "redlock": "5.0.0-beta.2",
129
+ "resend": "^3.4.0",
130
+ "robots-parser": "^3.0.1",
131
+ "scrapingbee": "^1.7.4",
132
+ "stripe": "^16.1.0",
133
+ "supabase": "^1.77.9",
134
+ "systeminformation": "^5.22.11",
135
+ "tldts": "^6.1.75",
136
+ "turndown": "^7.1.3",
137
+ "turndown-plugin-gfm": "^1.0.2",
138
+ "typesense": "^1.5.4",
139
+ "undici": "^6.20.1",
140
+ "unstructured-client": "^0.11.3",
141
+ "uuid": "^10.0.0",
142
+ "winston": "^3.14.2",
143
+ "winston-transport": "^4.8.0",
144
+ "wordpos": "^2.1.0",
145
+ "ws": "^8.18.0",
146
+ "xml2js": "^0.6.2",
147
+ "zod": "^3.24.2"
148
+ },
149
+ "nodemonConfig": {
150
+ "ignore": [
151
+ "*.docx",
152
+ "*.json",
153
+ "temp"
154
+ ]
155
+ }
156
+ }
pnpm-lock.yaml ADDED
The diff for this file is too large to render. See raw diff
 
requests.http ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pick your baseUrl here:
2
+ # @baseUrl = http://localhost:3002
3
+ @baseUrl = https://api.firecrawl.dev
4
+
5
+ ### Scrape Website
6
+ # @name scrape
7
+ POST {{baseUrl}}/v1/scrape HTTP/1.1
8
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
9
+ content-type: application/json
10
+
11
+ {
12
+ "url": "https://firecrawl.dev"
13
+ }
14
+
15
+ ### Crawl Website
16
+ # @name crawl
17
+ POST {{baseUrl}}/v1/crawl HTTP/1.1
18
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
19
+ content-type: application/json
20
+
21
+ {
22
+ "url":"https://firecrawl.dev"
23
+ }
24
+
25
+ ### Check Crawl Status
26
+ @crawlId = {{crawl.response.body.$.id}}
27
+ # @name crawlStatus
28
+ GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
29
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
30
+
31
+ ### Cancel Crawl
32
+ @crawlId = {{crawl.response.body.$.id}}
33
+ # @name cancelCrawl
34
+ DELETE {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
35
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
36
+
37
+ ### Extract website
38
+ # @name extract
39
+ POST {{baseUrl}}/v1/extract HTTP/1.1
40
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
41
+ content-type: application/json
42
+
43
+ {
44
+ "urls": ["https://firecrawl.dev"],
45
+ "schema": {
46
+ "type": "object",
47
+ "properties": {
48
+ "companyName": {
49
+ "type": "string"
50
+ },
51
+ "companyDescription": {
52
+ "type": "string"
53
+ }
54
+ }
55
+ },
56
+ "agent": {
57
+ "model": "fire-1"
58
+ },
59
+ "origin": "api-sdk"
60
+ }
61
+
62
+ ### Check Extract Status
63
+ @extractId = {{extract.response.body.$.id}}
64
+ # @name extractStatus
65
+ GET {{baseUrl}}/v1/extract/{{extractId}} HTTP/1.1
66
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
67
+
68
+ ### Batch Scrape Websites
69
+ # @name batchScrape
70
+ POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
71
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
72
+ content-type: application/json
73
+
74
+ {
75
+ "urls": [
76
+ "firecrawl.dev",
77
+ "mendable.ai"
78
+ ]
79
+ }
80
+
81
+ ### Check Batch Scrape Status
82
+ @batchScrapeId = {{batchScrape.response.body.$.id}}
83
+ # @name batchScrapeStatus
84
+ GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
85
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
86
+
87
+ ### Map Website
88
+ # @name map
89
+ POST {{baseUrl}}/v1/map HTTP/1.1
90
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
91
+ content-type: application/json
92
+
93
+ {
94
+ "url": "firecrawl.dev",
95
+ "sitemapOnly": true
96
+ }
97
+
98
+ ### Generate LLMs TXT
99
+ # @name generateLlmsTxt
100
+ POST {{baseUrl}}/v1/llmstxt HTTP/1.1
101
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
102
+ content-type: application/json
103
+
104
+ {
105
+ "url": "https://firecrawl.dev",
106
+ "maxUrls": 1,
107
+ "showFullText": false
108
+ }
109
+
110
+
111
+ ### Check Generate LLMs TXT Status
112
+ @generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
113
+ # @name generateLlmsTxtStatus
114
+ GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
115
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
116
+
117
+
118
+ ### Search
119
+ # @name search
120
+ POST {{baseUrl}}/v1/search HTTP/1.1
121
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
122
+ content-type: application/json
123
+
124
+ {
125
+ "query": "firecrawl",
126
+ "limit": 50
127
+ }
sharedLibs/go-html-to-md/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ html-to-markdown.so
2
+ html-to-markdown.h
sharedLibs/go-html-to-md/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ To build the go-html-to-md library, run the following command:
2
+
3
+ ```bash
4
+ cd apps/api/src/lib/go-html-to-md
5
+ go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
6
+ chmod +x html-to-markdown.so
7
+ ```
sharedLibs/go-html-to-md/go.mod ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module html-to-markdown.go
2
+
3
+ go 1.19
4
+
5
+ require github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d
6
+
7
+ require (
8
+ github.com/PuerkitoBio/goquery v1.9.2 // indirect
9
+ github.com/andybalholm/cascadia v1.3.2 // indirect
10
+ github.com/kr/pretty v0.3.0 // indirect
11
+ golang.org/x/net v0.25.0 // indirect
12
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
13
+ gopkg.in/yaml.v2 v2.4.0 // indirect
14
+ )
15
+
16
+ replace github.com/JohannesKaufmann/html-to-markdown => github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d
sharedLibs/go-html-to-md/go.sum ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
2
+ github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
3
+ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
4
+ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
5
+ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
6
+ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
7
+ github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
8
+ github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
9
+ github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
10
+ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
11
+ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
12
+ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
13
+ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
14
+ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
15
+ github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
16
+ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
17
+ github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
18
+ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
19
+ github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d h1:NBs5X/qGdcYalsplADJxPR5CjhMWo4PxcjJeIjXm2Ww=
20
+ github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d/go.mod h1:I2mfsDlV0RelCsTjeYh9mdXdwD2M70rA7LT/y2girik=
21
+ github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
22
+ github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
23
+ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
24
+ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
25
+ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
26
+ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
27
+ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
28
+ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
29
+ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
30
+ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
31
+ golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
32
+ golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
33
+ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
34
+ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
35
+ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
36
+ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
37
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
38
+ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
39
+ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
40
+ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
41
+ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
42
+ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
43
+ golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
44
+ golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
45
+ golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
46
+ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
47
+ golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
48
+ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
49
+ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
50
+ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
51
+ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
52
+ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
53
+ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
54
+ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
55
+ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
56
+ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
57
+ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
58
+ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
59
+ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
60
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
61
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
62
+ gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
63
+ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
64
+ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
sharedLibs/go-html-to-md/html-to-markdown.go ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package main
2
+
3
+ import (
4
+ "C"
5
+ // "log"
6
+
7
+ md "github.com/tomkosm/html-to-markdown"
8
+ "github.com/tomkosm/html-to-markdown/plugin"
9
+ )
10
+
11
+ //export ConvertHTMLToMarkdown
12
+ func ConvertHTMLToMarkdown(html *C.char) *C.char {
13
+ converter := md.NewConverter("", true, nil)
14
+ converter.Use(plugin.GitHubFlavored())
15
+
16
+ markdown, err := converter.ConvertString(C.GoString(html))
17
+ if err != nil {
18
+ // log.Fatal(err)
19
+ }
20
+ return C.CString(markdown)
21
+ }
22
+
23
+ func main() {
24
+ // This function is required for the main package
25
+ }
sharedLibs/html-transformer/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ target
sharedLibs/html-transformer/Cargo.lock ADDED
@@ -0,0 +1,1235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "allocator-api2"
7
+ version = "0.2.21"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
10
+
11
+ [[package]]
12
+ name = "autocfg"
13
+ version = "1.4.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
16
+
17
+ [[package]]
18
+ name = "bitflags"
19
+ version = "1.3.2"
20
+ source = "registry+https://github.com/rust-lang/crates.io-index"
21
+ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
22
+
23
+ [[package]]
24
+ name = "bitflags"
25
+ version = "2.8.0"
26
+ source = "registry+https://github.com/rust-lang/crates.io-index"
27
+ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
28
+
29
+ [[package]]
30
+ name = "byteorder"
31
+ version = "1.5.0"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
34
+
35
+ [[package]]
36
+ name = "cfg-if"
37
+ version = "1.0.0"
38
+ source = "registry+https://github.com/rust-lang/crates.io-index"
39
+ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
40
+
41
+ [[package]]
42
+ name = "convert_case"
43
+ version = "0.4.0"
44
+ source = "registry+https://github.com/rust-lang/crates.io-index"
45
+ checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
46
+
47
+ [[package]]
48
+ name = "cssparser"
49
+ version = "0.27.2"
50
+ source = "registry+https://github.com/rust-lang/crates.io-index"
51
+ checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
52
+ dependencies = [
53
+ "cssparser-macros",
54
+ "dtoa-short",
55
+ "itoa 0.4.8",
56
+ "matches",
57
+ "phf 0.8.0",
58
+ "proc-macro2",
59
+ "quote",
60
+ "smallvec",
61
+ "syn 1.0.109",
62
+ ]
63
+
64
+ [[package]]
65
+ name = "cssparser"
66
+ version = "0.29.6"
67
+ source = "registry+https://github.com/rust-lang/crates.io-index"
68
+ checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa"
69
+ dependencies = [
70
+ "cssparser-macros",
71
+ "dtoa-short",
72
+ "itoa 1.0.14",
73
+ "matches",
74
+ "phf 0.10.1",
75
+ "proc-macro2",
76
+ "quote",
77
+ "smallvec",
78
+ "syn 1.0.109",
79
+ ]
80
+
81
+ [[package]]
82
+ name = "cssparser-macros"
83
+ version = "0.6.1"
84
+ source = "registry+https://github.com/rust-lang/crates.io-index"
85
+ checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
86
+ dependencies = [
87
+ "quote",
88
+ "syn 2.0.96",
89
+ ]
90
+
91
+ [[package]]
92
+ name = "derive_more"
93
+ version = "0.99.18"
94
+ source = "registry+https://github.com/rust-lang/crates.io-index"
95
+ checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
96
+ dependencies = [
97
+ "convert_case",
98
+ "proc-macro2",
99
+ "quote",
100
+ "rustc_version",
101
+ "syn 2.0.96",
102
+ ]
103
+
104
+ [[package]]
105
+ name = "displaydoc"
106
+ version = "0.2.5"
107
+ source = "registry+https://github.com/rust-lang/crates.io-index"
108
+ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
109
+ dependencies = [
110
+ "proc-macro2",
111
+ "quote",
112
+ "syn 2.0.96",
113
+ ]
114
+
115
+ [[package]]
116
+ name = "dtoa"
117
+ version = "1.0.9"
118
+ source = "registry+https://github.com/rust-lang/crates.io-index"
119
+ checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
120
+
121
+ [[package]]
122
+ name = "dtoa-short"
123
+ version = "0.3.5"
124
+ source = "registry+https://github.com/rust-lang/crates.io-index"
125
+ checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
126
+ dependencies = [
127
+ "dtoa",
128
+ ]
129
+
130
+ [[package]]
131
+ name = "encoding_rs"
132
+ version = "0.8.35"
133
+ source = "registry+https://github.com/rust-lang/crates.io-index"
134
+ checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
135
+ dependencies = [
136
+ "cfg-if",
137
+ ]
138
+
139
+ [[package]]
140
+ name = "equivalent"
141
+ version = "1.0.1"
142
+ source = "registry+https://github.com/rust-lang/crates.io-index"
143
+ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
144
+
145
+ [[package]]
146
+ name = "foldhash"
147
+ version = "0.1.4"
148
+ source = "registry+https://github.com/rust-lang/crates.io-index"
149
+ checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
150
+
151
+ [[package]]
152
+ name = "form_urlencoded"
153
+ version = "1.2.1"
154
+ source = "registry+https://github.com/rust-lang/crates.io-index"
155
+ checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
156
+ dependencies = [
157
+ "percent-encoding",
158
+ ]
159
+
160
+ [[package]]
161
+ name = "futf"
162
+ version = "0.1.5"
163
+ source = "registry+https://github.com/rust-lang/crates.io-index"
164
+ checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
165
+ dependencies = [
166
+ "mac",
167
+ "new_debug_unreachable",
168
+ ]
169
+
170
+ [[package]]
171
+ name = "fxhash"
172
+ version = "0.2.1"
173
+ source = "registry+https://github.com/rust-lang/crates.io-index"
174
+ checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
175
+ dependencies = [
176
+ "byteorder",
177
+ ]
178
+
179
+ [[package]]
180
+ name = "getrandom"
181
+ version = "0.1.16"
182
+ source = "registry+https://github.com/rust-lang/crates.io-index"
183
+ checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
184
+ dependencies = [
185
+ "cfg-if",
186
+ "libc",
187
+ "wasi 0.9.0+wasi-snapshot-preview1",
188
+ ]
189
+
190
+ [[package]]
191
+ name = "getrandom"
192
+ version = "0.2.15"
193
+ source = "registry+https://github.com/rust-lang/crates.io-index"
194
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
195
+ dependencies = [
196
+ "cfg-if",
197
+ "libc",
198
+ "wasi 0.11.0+wasi-snapshot-preview1",
199
+ ]
200
+
201
+ [[package]]
202
+ name = "hashbrown"
203
+ version = "0.12.3"
204
+ source = "registry+https://github.com/rust-lang/crates.io-index"
205
+ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
206
+
207
+ [[package]]
208
+ name = "hashbrown"
209
+ version = "0.15.2"
210
+ source = "registry+https://github.com/rust-lang/crates.io-index"
211
+ checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
212
+ dependencies = [
213
+ "allocator-api2",
214
+ "equivalent",
215
+ "foldhash",
216
+ ]
217
+
218
+ [[package]]
219
+ name = "html-transformer"
220
+ version = "0.1.0"
221
+ dependencies = [
222
+ "kuchikiki",
223
+ "libc",
224
+ "lol_html",
225
+ "serde",
226
+ "serde_json",
227
+ "url",
228
+ ]
229
+
230
+ [[package]]
231
+ name = "html5ever"
232
+ version = "0.26.0"
233
+ source = "registry+https://github.com/rust-lang/crates.io-index"
234
+ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
235
+ dependencies = [
236
+ "log",
237
+ "mac",
238
+ "markup5ever",
239
+ "proc-macro2",
240
+ "quote",
241
+ "syn 1.0.109",
242
+ ]
243
+
244
+ [[package]]
245
+ name = "icu_collections"
246
+ version = "1.5.0"
247
+ source = "registry+https://github.com/rust-lang/crates.io-index"
248
+ checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
249
+ dependencies = [
250
+ "displaydoc",
251
+ "yoke",
252
+ "zerofrom",
253
+ "zerovec",
254
+ ]
255
+
256
+ [[package]]
257
+ name = "icu_locid"
258
+ version = "1.5.0"
259
+ source = "registry+https://github.com/rust-lang/crates.io-index"
260
+ checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
261
+ dependencies = [
262
+ "displaydoc",
263
+ "litemap",
264
+ "tinystr",
265
+ "writeable",
266
+ "zerovec",
267
+ ]
268
+
269
+ [[package]]
270
+ name = "icu_locid_transform"
271
+ version = "1.5.0"
272
+ source = "registry+https://github.com/rust-lang/crates.io-index"
273
+ checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
274
+ dependencies = [
275
+ "displaydoc",
276
+ "icu_locid",
277
+ "icu_locid_transform_data",
278
+ "icu_provider",
279
+ "tinystr",
280
+ "zerovec",
281
+ ]
282
+
283
+ [[package]]
284
+ name = "icu_locid_transform_data"
285
+ version = "1.5.0"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
288
+
289
+ [[package]]
290
+ name = "icu_normalizer"
291
+ version = "1.5.0"
292
+ source = "registry+https://github.com/rust-lang/crates.io-index"
293
+ checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
294
+ dependencies = [
295
+ "displaydoc",
296
+ "icu_collections",
297
+ "icu_normalizer_data",
298
+ "icu_properties",
299
+ "icu_provider",
300
+ "smallvec",
301
+ "utf16_iter",
302
+ "utf8_iter",
303
+ "write16",
304
+ "zerovec",
305
+ ]
306
+
307
+ [[package]]
308
+ name = "icu_normalizer_data"
309
+ version = "1.5.0"
310
+ source = "registry+https://github.com/rust-lang/crates.io-index"
311
+ checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
312
+
313
+ [[package]]
314
+ name = "icu_properties"
315
+ version = "1.5.1"
316
+ source = "registry+https://github.com/rust-lang/crates.io-index"
317
+ checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
318
+ dependencies = [
319
+ "displaydoc",
320
+ "icu_collections",
321
+ "icu_locid_transform",
322
+ "icu_properties_data",
323
+ "icu_provider",
324
+ "tinystr",
325
+ "zerovec",
326
+ ]
327
+
328
+ [[package]]
329
+ name = "icu_properties_data"
330
+ version = "1.5.0"
331
+ source = "registry+https://github.com/rust-lang/crates.io-index"
332
+ checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
333
+
334
+ [[package]]
335
+ name = "icu_provider"
336
+ version = "1.5.0"
337
+ source = "registry+https://github.com/rust-lang/crates.io-index"
338
+ checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
339
+ dependencies = [
340
+ "displaydoc",
341
+ "icu_locid",
342
+ "icu_provider_macros",
343
+ "stable_deref_trait",
344
+ "tinystr",
345
+ "writeable",
346
+ "yoke",
347
+ "zerofrom",
348
+ "zerovec",
349
+ ]
350
+
351
+ [[package]]
352
+ name = "icu_provider_macros"
353
+ version = "1.5.0"
354
+ source = "registry+https://github.com/rust-lang/crates.io-index"
355
+ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
356
+ dependencies = [
357
+ "proc-macro2",
358
+ "quote",
359
+ "syn 2.0.96",
360
+ ]
361
+
362
+ [[package]]
363
+ name = "idna"
364
+ version = "1.0.3"
365
+ source = "registry+https://github.com/rust-lang/crates.io-index"
366
+ checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
367
+ dependencies = [
368
+ "idna_adapter",
369
+ "smallvec",
370
+ "utf8_iter",
371
+ ]
372
+
373
+ [[package]]
374
+ name = "idna_adapter"
375
+ version = "1.2.0"
376
+ source = "registry+https://github.com/rust-lang/crates.io-index"
377
+ checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
378
+ dependencies = [
379
+ "icu_normalizer",
380
+ "icu_properties",
381
+ ]
382
+
383
+ [[package]]
384
+ name = "indexmap"
385
+ version = "1.9.3"
386
+ source = "registry+https://github.com/rust-lang/crates.io-index"
387
+ checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
388
+ dependencies = [
389
+ "autocfg",
390
+ "hashbrown 0.12.3",
391
+ ]
392
+
393
+ [[package]]
394
+ name = "itoa"
395
+ version = "0.4.8"
396
+ source = "registry+https://github.com/rust-lang/crates.io-index"
397
+ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
398
+
399
+ [[package]]
400
+ name = "itoa"
401
+ version = "1.0.14"
402
+ source = "registry+https://github.com/rust-lang/crates.io-index"
403
+ checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
404
+
405
+ [[package]]
406
+ name = "kuchikiki"
407
+ version = "0.8.2"
408
+ source = "registry+https://github.com/rust-lang/crates.io-index"
409
+ checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8"
410
+ dependencies = [
411
+ "cssparser 0.27.2",
412
+ "html5ever",
413
+ "indexmap",
414
+ "matches",
415
+ "selectors 0.22.0",
416
+ ]
417
+
418
+ [[package]]
419
+ name = "libc"
420
+ version = "0.2.169"
421
+ source = "registry+https://github.com/rust-lang/crates.io-index"
422
+ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
423
+
424
+ [[package]]
425
+ name = "litemap"
426
+ version = "0.7.4"
427
+ source = "registry+https://github.com/rust-lang/crates.io-index"
428
+ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
429
+
430
+ [[package]]
431
+ name = "lock_api"
432
+ version = "0.4.12"
433
+ source = "registry+https://github.com/rust-lang/crates.io-index"
434
+ checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
435
+ dependencies = [
436
+ "autocfg",
437
+ "scopeguard",
438
+ ]
439
+
440
+ [[package]]
441
+ name = "log"
442
+ version = "0.4.25"
443
+ source = "registry+https://github.com/rust-lang/crates.io-index"
444
+ checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
445
+
446
+ [[package]]
447
+ name = "lol_html"
448
+ version = "2.2.0"
449
+ source = "registry+https://github.com/rust-lang/crates.io-index"
450
+ checksum = "3b1058123f6262982b891dccc395cff0144d9439de366460b47fab719258b96e"
451
+ dependencies = [
452
+ "bitflags 2.8.0",
453
+ "cfg-if",
454
+ "cssparser 0.29.6",
455
+ "encoding_rs",
456
+ "hashbrown 0.15.2",
457
+ "memchr",
458
+ "mime",
459
+ "selectors 0.24.0",
460
+ "thiserror",
461
+ ]
462
+
463
+ [[package]]
464
+ name = "mac"
465
+ version = "0.1.1"
466
+ source = "registry+https://github.com/rust-lang/crates.io-index"
467
+ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
468
+
469
+ [[package]]
470
+ name = "markup5ever"
471
+ version = "0.11.0"
472
+ source = "registry+https://github.com/rust-lang/crates.io-index"
473
+ checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
474
+ dependencies = [
475
+ "log",
476
+ "phf 0.10.1",
477
+ "phf_codegen 0.10.0",
478
+ "string_cache",
479
+ "string_cache_codegen",
480
+ "tendril",
481
+ ]
482
+
483
+ [[package]]
484
+ name = "matches"
485
+ version = "0.1.10"
486
+ source = "registry+https://github.com/rust-lang/crates.io-index"
487
+ checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
488
+
489
+ [[package]]
490
+ name = "memchr"
491
+ version = "2.7.4"
492
+ source = "registry+https://github.com/rust-lang/crates.io-index"
493
+ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
494
+
495
+ [[package]]
496
+ name = "mime"
497
+ version = "0.3.17"
498
+ source = "registry+https://github.com/rust-lang/crates.io-index"
499
+ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
500
+
501
+ [[package]]
502
+ name = "new_debug_unreachable"
503
+ version = "1.0.6"
504
+ source = "registry+https://github.com/rust-lang/crates.io-index"
505
+ checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
506
+
507
+ [[package]]
508
+ name = "nodrop"
509
+ version = "0.1.14"
510
+ source = "registry+https://github.com/rust-lang/crates.io-index"
511
+ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
512
+
513
+ [[package]]
514
+ name = "once_cell"
515
+ version = "1.20.2"
516
+ source = "registry+https://github.com/rust-lang/crates.io-index"
517
+ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
518
+
519
+ [[package]]
520
+ name = "parking_lot"
521
+ version = "0.12.3"
522
+ source = "registry+https://github.com/rust-lang/crates.io-index"
523
+ checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
524
+ dependencies = [
525
+ "lock_api",
526
+ "parking_lot_core",
527
+ ]
528
+
529
+ [[package]]
530
+ name = "parking_lot_core"
531
+ version = "0.9.10"
532
+ source = "registry+https://github.com/rust-lang/crates.io-index"
533
+ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
534
+ dependencies = [
535
+ "cfg-if",
536
+ "libc",
537
+ "redox_syscall",
538
+ "smallvec",
539
+ "windows-targets",
540
+ ]
541
+
542
+ [[package]]
543
+ name = "percent-encoding"
544
+ version = "2.3.1"
545
+ source = "registry+https://github.com/rust-lang/crates.io-index"
546
+ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
547
+
548
+ [[package]]
549
+ name = "phf"
550
+ version = "0.8.0"
551
+ source = "registry+https://github.com/rust-lang/crates.io-index"
552
+ checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
553
+ dependencies = [
554
+ "phf_macros 0.8.0",
555
+ "phf_shared 0.8.0",
556
+ "proc-macro-hack",
557
+ ]
558
+
559
+ [[package]]
560
+ name = "phf"
561
+ version = "0.10.1"
562
+ source = "registry+https://github.com/rust-lang/crates.io-index"
563
+ checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
564
+ dependencies = [
565
+ "phf_macros 0.10.0",
566
+ "phf_shared 0.10.0",
567
+ "proc-macro-hack",
568
+ ]
569
+
570
+ [[package]]
571
+ name = "phf_codegen"
572
+ version = "0.8.0"
573
+ source = "registry+https://github.com/rust-lang/crates.io-index"
574
+ checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
575
+ dependencies = [
576
+ "phf_generator 0.8.0",
577
+ "phf_shared 0.8.0",
578
+ ]
579
+
580
+ [[package]]
581
+ name = "phf_codegen"
582
+ version = "0.10.0"
583
+ source = "registry+https://github.com/rust-lang/crates.io-index"
584
+ checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
585
+ dependencies = [
586
+ "phf_generator 0.10.0",
587
+ "phf_shared 0.10.0",
588
+ ]
589
+
590
+ [[package]]
591
+ name = "phf_generator"
592
+ version = "0.8.0"
593
+ source = "registry+https://github.com/rust-lang/crates.io-index"
594
+ checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
595
+ dependencies = [
596
+ "phf_shared 0.8.0",
597
+ "rand 0.7.3",
598
+ ]
599
+
600
+ [[package]]
601
+ name = "phf_generator"
602
+ version = "0.10.0"
603
+ source = "registry+https://github.com/rust-lang/crates.io-index"
604
+ checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
605
+ dependencies = [
606
+ "phf_shared 0.10.0",
607
+ "rand 0.8.5",
608
+ ]
609
+
610
+ [[package]]
611
+ name = "phf_macros"
612
+ version = "0.8.0"
613
+ source = "registry+https://github.com/rust-lang/crates.io-index"
614
+ checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
615
+ dependencies = [
616
+ "phf_generator 0.8.0",
617
+ "phf_shared 0.8.0",
618
+ "proc-macro-hack",
619
+ "proc-macro2",
620
+ "quote",
621
+ "syn 1.0.109",
622
+ ]
623
+
624
+ [[package]]
625
+ name = "phf_macros"
626
+ version = "0.10.0"
627
+ source = "registry+https://github.com/rust-lang/crates.io-index"
628
+ checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
629
+ dependencies = [
630
+ "phf_generator 0.10.0",
631
+ "phf_shared 0.10.0",
632
+ "proc-macro-hack",
633
+ "proc-macro2",
634
+ "quote",
635
+ "syn 1.0.109",
636
+ ]
637
+
638
+ [[package]]
639
+ name = "phf_shared"
640
+ version = "0.8.0"
641
+ source = "registry+https://github.com/rust-lang/crates.io-index"
642
+ checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
643
+ dependencies = [
644
+ "siphasher",
645
+ ]
646
+
647
+ [[package]]
648
+ name = "phf_shared"
649
+ version = "0.10.0"
650
+ source = "registry+https://github.com/rust-lang/crates.io-index"
651
+ checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
652
+ dependencies = [
653
+ "siphasher",
654
+ ]
655
+
656
+ [[package]]
657
+ name = "ppv-lite86"
658
+ version = "0.2.20"
659
+ source = "registry+https://github.com/rust-lang/crates.io-index"
660
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
661
+ dependencies = [
662
+ "zerocopy",
663
+ ]
664
+
665
+ [[package]]
666
+ name = "precomputed-hash"
667
+ version = "0.1.1"
668
+ source = "registry+https://github.com/rust-lang/crates.io-index"
669
+ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
670
+
671
+ [[package]]
672
+ name = "proc-macro-hack"
673
+ version = "0.5.20+deprecated"
674
+ source = "registry+https://github.com/rust-lang/crates.io-index"
675
+ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
676
+
677
+ [[package]]
678
+ name = "proc-macro2"
679
+ version = "1.0.93"
680
+ source = "registry+https://github.com/rust-lang/crates.io-index"
681
+ checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
682
+ dependencies = [
683
+ "unicode-ident",
684
+ ]
685
+
686
+ [[package]]
687
+ name = "quote"
688
+ version = "1.0.38"
689
+ source = "registry+https://github.com/rust-lang/crates.io-index"
690
+ checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
691
+ dependencies = [
692
+ "proc-macro2",
693
+ ]
694
+
695
+ [[package]]
696
+ name = "rand"
697
+ version = "0.7.3"
698
+ source = "registry+https://github.com/rust-lang/crates.io-index"
699
+ checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
700
+ dependencies = [
701
+ "getrandom 0.1.16",
702
+ "libc",
703
+ "rand_chacha 0.2.2",
704
+ "rand_core 0.5.1",
705
+ "rand_hc",
706
+ "rand_pcg",
707
+ ]
708
+
709
+ [[package]]
710
+ name = "rand"
711
+ version = "0.8.5"
712
+ source = "registry+https://github.com/rust-lang/crates.io-index"
713
+ checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
714
+ dependencies = [
715
+ "libc",
716
+ "rand_chacha 0.3.1",
717
+ "rand_core 0.6.4",
718
+ ]
719
+
720
+ [[package]]
721
+ name = "rand_chacha"
722
+ version = "0.2.2"
723
+ source = "registry+https://github.com/rust-lang/crates.io-index"
724
+ checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
725
+ dependencies = [
726
+ "ppv-lite86",
727
+ "rand_core 0.5.1",
728
+ ]
729
+
730
+ [[package]]
731
+ name = "rand_chacha"
732
+ version = "0.3.1"
733
+ source = "registry+https://github.com/rust-lang/crates.io-index"
734
+ checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
735
+ dependencies = [
736
+ "ppv-lite86",
737
+ "rand_core 0.6.4",
738
+ ]
739
+
740
+ [[package]]
741
+ name = "rand_core"
742
+ version = "0.5.1"
743
+ source = "registry+https://github.com/rust-lang/crates.io-index"
744
+ checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
745
+ dependencies = [
746
+ "getrandom 0.1.16",
747
+ ]
748
+
749
+ [[package]]
750
+ name = "rand_core"
751
+ version = "0.6.4"
752
+ source = "registry+https://github.com/rust-lang/crates.io-index"
753
+ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
754
+ dependencies = [
755
+ "getrandom 0.2.15",
756
+ ]
757
+
758
+ [[package]]
759
+ name = "rand_hc"
760
+ version = "0.2.0"
761
+ source = "registry+https://github.com/rust-lang/crates.io-index"
762
+ checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
763
+ dependencies = [
764
+ "rand_core 0.5.1",
765
+ ]
766
+
767
+ [[package]]
768
+ name = "rand_pcg"
769
+ version = "0.2.1"
770
+ source = "registry+https://github.com/rust-lang/crates.io-index"
771
+ checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
772
+ dependencies = [
773
+ "rand_core 0.5.1",
774
+ ]
775
+
776
+ [[package]]
777
+ name = "redox_syscall"
778
+ version = "0.5.8"
779
+ source = "registry+https://github.com/rust-lang/crates.io-index"
780
+ checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
781
+ dependencies = [
782
+ "bitflags 2.8.0",
783
+ ]
784
+
785
+ [[package]]
786
+ name = "rustc_version"
787
+ version = "0.4.1"
788
+ source = "registry+https://github.com/rust-lang/crates.io-index"
789
+ checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
790
+ dependencies = [
791
+ "semver",
792
+ ]
793
+
794
+ [[package]]
795
+ name = "ryu"
796
+ version = "1.0.18"
797
+ source = "registry+https://github.com/rust-lang/crates.io-index"
798
+ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
799
+
800
+ [[package]]
801
+ name = "scopeguard"
802
+ version = "1.2.0"
803
+ source = "registry+https://github.com/rust-lang/crates.io-index"
804
+ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
805
+
806
+ [[package]]
807
+ name = "selectors"
808
+ version = "0.22.0"
809
+ source = "registry+https://github.com/rust-lang/crates.io-index"
810
+ checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
811
+ dependencies = [
812
+ "bitflags 1.3.2",
813
+ "cssparser 0.27.2",
814
+ "derive_more",
815
+ "fxhash",
816
+ "log",
817
+ "matches",
818
+ "phf 0.8.0",
819
+ "phf_codegen 0.8.0",
820
+ "precomputed-hash",
821
+ "servo_arc 0.1.1",
822
+ "smallvec",
823
+ "thin-slice",
824
+ ]
825
+
826
+ [[package]]
827
+ name = "selectors"
828
+ version = "0.24.0"
829
+ source = "registry+https://github.com/rust-lang/crates.io-index"
830
+ checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
831
+ dependencies = [
832
+ "bitflags 1.3.2",
833
+ "cssparser 0.29.6",
834
+ "derive_more",
835
+ "fxhash",
836
+ "log",
837
+ "phf 0.8.0",
838
+ "phf_codegen 0.8.0",
839
+ "precomputed-hash",
840
+ "servo_arc 0.2.0",
841
+ "smallvec",
842
+ ]
843
+
844
+ [[package]]
845
+ name = "semver"
846
+ version = "1.0.25"
847
+ source = "registry+https://github.com/rust-lang/crates.io-index"
848
+ checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03"
849
+
850
+ [[package]]
851
+ name = "serde"
852
+ version = "1.0.217"
853
+ source = "registry+https://github.com/rust-lang/crates.io-index"
854
+ checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
855
+ dependencies = [
856
+ "serde_derive",
857
+ ]
858
+
859
+ [[package]]
860
+ name = "serde_derive"
861
+ version = "1.0.217"
862
+ source = "registry+https://github.com/rust-lang/crates.io-index"
863
+ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
864
+ dependencies = [
865
+ "proc-macro2",
866
+ "quote",
867
+ "syn 2.0.96",
868
+ ]
869
+
870
+ [[package]]
871
+ name = "serde_json"
872
+ version = "1.0.137"
873
+ source = "registry+https://github.com/rust-lang/crates.io-index"
874
+ checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
875
+ dependencies = [
876
+ "itoa 1.0.14",
877
+ "memchr",
878
+ "ryu",
879
+ "serde",
880
+ ]
881
+
882
+ [[package]]
883
+ name = "servo_arc"
884
+ version = "0.1.1"
885
+ source = "registry+https://github.com/rust-lang/crates.io-index"
886
+ checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
887
+ dependencies = [
888
+ "nodrop",
889
+ "stable_deref_trait",
890
+ ]
891
+
892
+ [[package]]
893
+ name = "servo_arc"
894
+ version = "0.2.0"
895
+ source = "registry+https://github.com/rust-lang/crates.io-index"
896
+ checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741"
897
+ dependencies = [
898
+ "nodrop",
899
+ "stable_deref_trait",
900
+ ]
901
+
902
+ [[package]]
903
+ name = "siphasher"
904
+ version = "0.3.11"
905
+ source = "registry+https://github.com/rust-lang/crates.io-index"
906
+ checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
907
+
908
+ [[package]]
909
+ name = "smallvec"
910
+ version = "1.13.2"
911
+ source = "registry+https://github.com/rust-lang/crates.io-index"
912
+ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
913
+
914
+ [[package]]
915
+ name = "stable_deref_trait"
916
+ version = "1.2.0"
917
+ source = "registry+https://github.com/rust-lang/crates.io-index"
918
+ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
919
+
920
+ [[package]]
921
+ name = "string_cache"
922
+ version = "0.8.7"
923
+ source = "registry+https://github.com/rust-lang/crates.io-index"
924
+ checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
925
+ dependencies = [
926
+ "new_debug_unreachable",
927
+ "once_cell",
928
+ "parking_lot",
929
+ "phf_shared 0.10.0",
930
+ "precomputed-hash",
931
+ "serde",
932
+ ]
933
+
934
+ [[package]]
935
+ name = "string_cache_codegen"
936
+ version = "0.5.2"
937
+ source = "registry+https://github.com/rust-lang/crates.io-index"
938
+ checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
939
+ dependencies = [
940
+ "phf_generator 0.10.0",
941
+ "phf_shared 0.10.0",
942
+ "proc-macro2",
943
+ "quote",
944
+ ]
945
+
946
+ [[package]]
947
+ name = "syn"
948
+ version = "1.0.109"
949
+ source = "registry+https://github.com/rust-lang/crates.io-index"
950
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
951
+ dependencies = [
952
+ "proc-macro2",
953
+ "quote",
954
+ "unicode-ident",
955
+ ]
956
+
957
+ [[package]]
958
+ name = "syn"
959
+ version = "2.0.96"
960
+ source = "registry+https://github.com/rust-lang/crates.io-index"
961
+ checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
962
+ dependencies = [
963
+ "proc-macro2",
964
+ "quote",
965
+ "unicode-ident",
966
+ ]
967
+
968
+ [[package]]
969
+ name = "synstructure"
970
+ version = "0.13.1"
971
+ source = "registry+https://github.com/rust-lang/crates.io-index"
972
+ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
973
+ dependencies = [
974
+ "proc-macro2",
975
+ "quote",
976
+ "syn 2.0.96",
977
+ ]
978
+
979
+ [[package]]
980
+ name = "tendril"
981
+ version = "0.4.3"
982
+ source = "registry+https://github.com/rust-lang/crates.io-index"
983
+ checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
984
+ dependencies = [
985
+ "futf",
986
+ "mac",
987
+ "utf-8",
988
+ ]
989
+
990
+ [[package]]
991
+ name = "thin-slice"
992
+ version = "0.1.1"
993
+ source = "registry+https://github.com/rust-lang/crates.io-index"
994
+ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
995
+
996
+ [[package]]
997
+ name = "thiserror"
998
+ version = "2.0.11"
999
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1000
+ checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
1001
+ dependencies = [
1002
+ "thiserror-impl",
1003
+ ]
1004
+
1005
+ [[package]]
1006
+ name = "thiserror-impl"
1007
+ version = "2.0.11"
1008
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1009
+ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
1010
+ dependencies = [
1011
+ "proc-macro2",
1012
+ "quote",
1013
+ "syn 2.0.96",
1014
+ ]
1015
+
1016
+ [[package]]
1017
+ name = "tinystr"
1018
+ version = "0.7.6"
1019
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1020
+ checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
1021
+ dependencies = [
1022
+ "displaydoc",
1023
+ "zerovec",
1024
+ ]
1025
+
1026
+ [[package]]
1027
+ name = "unicode-ident"
1028
+ version = "1.0.15"
1029
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1030
+ checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
1031
+
1032
+ [[package]]
1033
+ name = "url"
1034
+ version = "2.5.4"
1035
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1036
+ checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
1037
+ dependencies = [
1038
+ "form_urlencoded",
1039
+ "idna",
1040
+ "percent-encoding",
1041
+ ]
1042
+
1043
+ [[package]]
1044
+ name = "utf-8"
1045
+ version = "0.7.6"
1046
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1047
+ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
1048
+
1049
+ [[package]]
1050
+ name = "utf16_iter"
1051
+ version = "1.0.5"
1052
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1053
+ checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
1054
+
1055
+ [[package]]
1056
+ name = "utf8_iter"
1057
+ version = "1.0.4"
1058
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1059
+ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
1060
+
1061
+ [[package]]
1062
+ name = "wasi"
1063
+ version = "0.9.0+wasi-snapshot-preview1"
1064
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1065
+ checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
1066
+
1067
+ [[package]]
1068
+ name = "wasi"
1069
+ version = "0.11.0+wasi-snapshot-preview1"
1070
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1071
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1072
+
1073
+ [[package]]
1074
+ name = "windows-targets"
1075
+ version = "0.52.6"
1076
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1077
+ checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
1078
+ dependencies = [
1079
+ "windows_aarch64_gnullvm",
1080
+ "windows_aarch64_msvc",
1081
+ "windows_i686_gnu",
1082
+ "windows_i686_gnullvm",
1083
+ "windows_i686_msvc",
1084
+ "windows_x86_64_gnu",
1085
+ "windows_x86_64_gnullvm",
1086
+ "windows_x86_64_msvc",
1087
+ ]
1088
+
1089
+ [[package]]
1090
+ name = "windows_aarch64_gnullvm"
1091
+ version = "0.52.6"
1092
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1093
+ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
1094
+
1095
+ [[package]]
1096
+ name = "windows_aarch64_msvc"
1097
+ version = "0.52.6"
1098
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1099
+ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
1100
+
1101
+ [[package]]
1102
+ name = "windows_i686_gnu"
1103
+ version = "0.52.6"
1104
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1105
+ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
1106
+
1107
+ [[package]]
1108
+ name = "windows_i686_gnullvm"
1109
+ version = "0.52.6"
1110
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1111
+ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
1112
+
1113
+ [[package]]
1114
+ name = "windows_i686_msvc"
1115
+ version = "0.52.6"
1116
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1117
+ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
1118
+
1119
+ [[package]]
1120
+ name = "windows_x86_64_gnu"
1121
+ version = "0.52.6"
1122
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1123
+ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
1124
+
1125
+ [[package]]
1126
+ name = "windows_x86_64_gnullvm"
1127
+ version = "0.52.6"
1128
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1129
+ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
1130
+
1131
+ [[package]]
1132
+ name = "windows_x86_64_msvc"
1133
+ version = "0.52.6"
1134
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1135
+ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
1136
+
1137
+ [[package]]
1138
+ name = "write16"
1139
+ version = "1.0.0"
1140
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1141
+ checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
1142
+
1143
+ [[package]]
1144
+ name = "writeable"
1145
+ version = "0.5.5"
1146
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1147
+ checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1148
+
1149
+ [[package]]
1150
+ name = "yoke"
1151
+ version = "0.7.5"
1152
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1153
+ checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
1154
+ dependencies = [
1155
+ "serde",
1156
+ "stable_deref_trait",
1157
+ "yoke-derive",
1158
+ "zerofrom",
1159
+ ]
1160
+
1161
+ [[package]]
1162
+ name = "yoke-derive"
1163
+ version = "0.7.5"
1164
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1165
+ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
1166
+ dependencies = [
1167
+ "proc-macro2",
1168
+ "quote",
1169
+ "syn 2.0.96",
1170
+ "synstructure",
1171
+ ]
1172
+
1173
+ [[package]]
1174
+ name = "zerocopy"
1175
+ version = "0.7.35"
1176
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1177
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1178
+ dependencies = [
1179
+ "byteorder",
1180
+ "zerocopy-derive",
1181
+ ]
1182
+
1183
+ [[package]]
1184
+ name = "zerocopy-derive"
1185
+ version = "0.7.35"
1186
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1187
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
1188
+ dependencies = [
1189
+ "proc-macro2",
1190
+ "quote",
1191
+ "syn 2.0.96",
1192
+ ]
1193
+
1194
+ [[package]]
1195
+ name = "zerofrom"
1196
+ version = "0.1.5"
1197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1198
+ checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
1199
+ dependencies = [
1200
+ "zerofrom-derive",
1201
+ ]
1202
+
1203
+ [[package]]
1204
+ name = "zerofrom-derive"
1205
+ version = "0.1.5"
1206
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1207
+ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
1208
+ dependencies = [
1209
+ "proc-macro2",
1210
+ "quote",
1211
+ "syn 2.0.96",
1212
+ "synstructure",
1213
+ ]
1214
+
1215
+ [[package]]
1216
+ name = "zerovec"
1217
+ version = "0.10.4"
1218
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1219
+ checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
1220
+ dependencies = [
1221
+ "yoke",
1222
+ "zerofrom",
1223
+ "zerovec-derive",
1224
+ ]
1225
+
1226
+ [[package]]
1227
+ name = "zerovec-derive"
1228
+ version = "0.10.3"
1229
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1230
+ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
1231
+ dependencies = [
1232
+ "proc-macro2",
1233
+ "quote",
1234
+ "syn 2.0.96",
1235
+ ]
sharedLibs/html-transformer/Cargo.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "html-transformer"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ libc = "0.2.0"
8
+ lol_html = "2.2.0"
9
+ kuchikiki = "0.8.2"
10
+ serde = { version = "1.0", features = ["derive"] }
11
+ serde_json = "1.0"
12
+ url = "2.5.4"
13
+
14
+ [lib]
15
+ crate-type = ["cdylib"]
sharedLibs/html-transformer/src/lib.rs ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use std::{collections::HashMap, ffi::{CStr, CString}};
2
+
3
+ use kuchikiki::{parse_html, traits::TendrilSink};
4
+ use serde::Deserialize;
5
+ use serde_json::Value;
6
+ use url::Url;
7
+
8
+ /// Extracts links from HTML
9
+ ///
10
+ /// # Safety
11
+ /// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
12
+ #[no_mangle]
13
+ pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
14
+ let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
15
+
16
+ let document = parse_html().one(html);
17
+
18
+ let mut out: Vec<String> = Vec::new();
19
+
20
+ let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
21
+ for anchor in anchors {
22
+ let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
23
+
24
+ if href.starts_with("http:/") && !href.starts_with("http://") {
25
+ href = format!("http://{}", &href[6..]);
26
+ } else if href.starts_with("https:/") && !href.starts_with("https://") {
27
+ href = format!("https://{}", &href[7..]);
28
+ }
29
+
30
+ out.push(href);
31
+ }
32
+
33
+ CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
34
+ }
35
+
36
/// If a `<meta name="$meta_name">` tag exists, copies its `content`
/// attribute into `$out` (a `HashMap<String, Value>`) under key `$out_name`.
/// Only the first matching tag is considered; tags without `content` are
/// ignored.
macro_rules! insert_meta_name {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {
        if let Some(content) = $document
            .select(&format!("meta[name=\"{}\"]", $meta_name))
            .unwrap()
            .next()
            .and_then(|tag| {
                tag.attributes.borrow().get("content").map(|c| c.to_string())
            })
        {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    };
}
43
+
44
/// If a `<meta property="$meta_name">` tag exists, copies its `content`
/// attribute into `$out` (a `HashMap<String, Value>`) under key `$out_name`.
/// Mirrors `insert_meta_name!` but matches on the `property` attribute
/// (used by Open Graph / article tags). Only the first match is considered.
macro_rules! insert_meta_property {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {
        if let Some(content) = $document
            .select(&format!("meta[property=\"{}\"]", $meta_name))
            .unwrap()
            .next()
            .and_then(|tag| {
                tag.attributes.borrow().get("content").map(|c| c.to_string())
            })
        {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    };
}
51
+
52
/// Extracts metadata from HTML
///
/// # Safety
/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
#[no_mangle]
pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char {
    // NOTE(review): this unwrap panics on invalid UTF-8 input, and a panic
    // unwinding across the FFI boundary is undefined behavior — consider
    // returning a sentinel instead (as transform_html does for bad input).
    let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

    let document = parse_html().one(html);
    let mut out = HashMap::<String, Value>::new();

    // <title> text is stored first; the catch-all meta loop below explicitly
    // refuses to overwrite/convert the "title" key.
    if let Some(title) = document.select("title").unwrap().next() {
        out.insert("title".to_string(), Value::String(title.text_contents()));
    }
    // insert_meta_name!(out, document, "description", "description");

    // Favicon: prefer an exact rel="icon" match, then fall back to any rel
    // containing "icon" (e.g. "shortcut icon", "apple-touch-icon").
    if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
        .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
        .or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
            .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
        out.insert("favicon".to_string(), Value::String(favicon_link));
    }

    if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
        out.insert("language".to_string(), Value::String(lang));
    }

    // insert_meta_name!(out, document, "keywords", "keywords");
    // insert_meta_name!(out, document, "robots", "robots");
    // Well-known OpenGraph properties, remapped to camelCase output keys.
    insert_meta_property!(out, document, "og:title", "ogTitle");
    insert_meta_property!(out, document, "og:description", "ogDescription");
    insert_meta_property!(out, document, "og:url", "ogUrl");
    insert_meta_property!(out, document, "og:image", "ogImage");
    insert_meta_property!(out, document, "og:audio", "ogAudio");
    insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
    insert_meta_property!(out, document, "og:locale", "ogLocale");

    // og:locale:alternate may appear multiple times; collect every value
    // into a JSON array under the raw property name.
    for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
        let attrs = meta.attributes.borrow();

        if let Some(content) = attrs.get("content") {
            if let Some(v) = out.get_mut("og:locale:alternate") {
                match v {
                    Value::Array(x) => {
                        x.push(Value::String(content.to_string()));
                    },
                    // Key is only ever created as an array right below.
                    _ => unreachable!(),
                }
            } else {
                out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
            }
        }
    }

    insert_meta_property!(out, document, "og:site_name", "ogSiteName");
    insert_meta_property!(out, document, "og:video", "ogVideo");
    insert_meta_name!(out, document, "article:section", "articleSection");
    insert_meta_name!(out, document, "article:tag", "articleTag");
    insert_meta_property!(out, document, "article:published_time", "publishedTime");
    insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
    // Dublin Core metadata (both "dc." and "dcterms." spellings).
    insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
    insert_meta_name!(out, document, "dc.description", "dcDescription");
    insert_meta_name!(out, document, "dc.subject", "dcSubject");
    insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
    insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
    insert_meta_name!(out, document, "dc.type", "dcType");
    insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
    insert_meta_name!(out, document, "dc.date", "dcDate");
    insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
    insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");

    // Catch-all: every remaining <meta> with a name or property and a content
    // attribute is copied under its raw name. Repeated names are promoted from
    // a string to an array of strings.
    for meta in document.select("meta").unwrap() {
        let meta = meta.as_node().as_element().unwrap();
        let attrs = meta.attributes.borrow();

        if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
            if let Some(content) = attrs.get("content") {
                if let Some(v) = out.get(name) {
                    match v {
                        Value::String(_) => {
                            if name != "title" { // preserve title tag in metadata
                                out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
                            }
                        },
                        Value::Array(_) => {
                            match out.get_mut(name) {
                                Some(Value::Array(x)) => {
                                    x.push(Value::String(content.to_string()));
                                },
                                _ => unreachable!(),
                            }
                        },
                        _ => unreachable!(),
                    }
                } else {
                    out.insert(name.to_string(), Value::String(content.to_string()));
                }
            }
        }
    }

    CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
}
155
+
156
/// CSS selectors for boilerplate regions (navigation, footers, ads, popups,
/// social widgets, cookie banners, …) that are stripped from the document when
/// `only_main_content` is requested — unless a match contains one of
/// `FORCE_INCLUDE_MAIN_TAGS`.
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    "header",
    "footer",
    "nav",
    "aside",
    ".header",
    ".top",
    ".navbar",
    "#header",
    ".footer",
    ".bottom",
    "#footer",
    ".sidebar",
    ".side",
    ".aside",
    "#sidebar",
    ".modal",
    ".popup",
    "#modal",
    ".overlay",
    ".ad",
    ".ads",
    ".advert",
    "#ad",
    ".lang-selector",
    ".language",
    "#language-selector",
    ".social",
    ".social-media",
    ".social-links",
    "#social",
    ".menu",
    ".navigation",
    "#nav",
    ".breadcrumbs",
    "#breadcrumbs",
    ".share",
    "#share",
    ".widget",
    "#widget",
    ".cookie",
    "#cookie",
];
199
+
200
/// Selectors that override `EXCLUDE_NON_MAIN_TAGS`: an element matched for
/// removal is kept if any descendant matches one of these.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
    "#main",

    // swoogo event software as .widget in all of their content
    ".swoogo-cols",
    ".swoogo-text",
    ".swoogo-table-div",
    ".swoogo-space",
    ".swoogo-alert",
    ".swoogo-sponsors",
    ".swoogo-title",
    ".swoogo-tabs",
    ".swoogo-logo",
    ".swoogo-image",
    ".swoogo-button",
    ".swoogo-agenda",
];
217
+
218
/// Options payload for `transform_html`, deserialized from the JSON string
/// passed over FFI. Field names must match the JSON keys sent by the caller.
/// NOTE(review): "Tranform" is a typo for "Transform", but the struct name is
/// internal-only; renaming is safe if done consistently within this file.
#[derive(Deserialize)]
struct TranformHTMLOptions {
    // Raw HTML to clean up.
    html: String,
    // Base URL used to absolutize relative img src / anchor href values.
    url: String,
    // If non-empty, output contains ONLY elements matching these selectors.
    include_tags: Vec<String>,
    // Selectors whose matches are removed from the document.
    exclude_tags: Vec<String>,
    // When true, boilerplate (EXCLUDE_NON_MAIN_TAGS) is stripped as well.
    only_main_content: bool,
}
226
+
227
/// One candidate from an `<img srcset>` attribute.
struct ImageSource {
    // Candidate image URL.
    url: String,
    // Numeric part of the descriptor ("2x" -> 2, "480w" -> 480).
    size: i32,
    // True for density descriptors ("x"), false for width descriptors ("w").
    is_x: bool,
}
232
+
233
/// Core of `transform_html`: cleans the parsed document in place and returns
/// the serialized result. Steps, in order: optional include-tag filtering,
/// removal of head/meta/noscript/style/script, caller-specified exclusions,
/// optional boilerplate stripping, srcset resolution, and absolutization of
/// img src / anchor href against `opts.url`. Any selector or parse failure is
/// collapsed into `Err(())`, which the FFI wrapper reports as a sentinel.
fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
    let mut document = parse_html().one(opts.html);

    // include_tags: rebuild the document from only the matching subtrees,
    // appended under a fresh <div> root in selector order.
    if !opts.include_tags.is_empty() {
        let new_document = parse_html().one("<div></div>");
        let root = new_document.select_first("div")?;

        for x in opts.include_tags.iter() {
            let matching_nodes: Vec<_> = document.select(x)?.collect();
            for tag in matching_nodes {
                root.as_node().append(tag.as_node().clone());
            }
        }

        document = new_document;
    }

    // Drop non-content elements one at a time; select_first errs (ending the
    // loop) once no match remains.
    while let Ok(x) = document.select_first("head") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("meta") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("noscript") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("style") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("script") {
        x.as_node().detach();
    }

    for x in opts.exclude_tags.iter() {
        // TODO: implement weird version
        while let Ok(x) = document.select_first(x) {
            x.as_node().detach();
        }
    }

    // Boilerplate stripping: remove each EXCLUDE_NON_MAIN_TAGS match unless it
    // contains a FORCE_INCLUDE_MAIN_TAGS descendant.
    if opts.only_main_content {
        for x in EXCLUDE_NON_MAIN_TAGS.iter() {
            let x: Vec<_> = document.select(x)?.collect();
            for tag in x {
                if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(x).is_ok_and(|mut x| x.next().is_some())) {
                    tag.as_node().detach();
                }
            }
        }
    }

    // Resolve srcset: parse "url descriptor" pairs, defaulting a missing
    // descriptor to "1x", and point src at the largest candidate.
    // NOTE(review): "w" and "x" sizes are compared on the raw number without
    // unit normalization — confirm this ranking is intended for mixed srcsets.
    let srcset_images: Vec<_> = document.select("img[srcset]")?.collect();
    for img in srcset_images {
        let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.split(",").filter_map(|x| {
            let tok: Vec<&str> = x.trim().split(" ").collect();
            let tok_1 = if tok.len() > 1 && !tok[1].is_empty() {
                tok[1]
            } else {
                "1x"
            };
            if let Ok(parsed_size) = tok_1[..tok_1.len()-1].parse() {
                Some(ImageSource {
                    url: tok[0].to_string(),
                    size: parsed_size,
                    is_x: tok_1.ends_with("x")
                })
            } else {
                None
            }
        }).collect();

        // If every candidate is density-based, the plain src acts as the
        // implicit 1x entry.
        if sizes.iter().all(|x| x.is_x) {
            if let Some(src) = img.attributes.borrow().get("src").map(|x| x.to_string()) {
                sizes.push(ImageSource {
                    url: src,
                    size: 1,
                    is_x: true,
                });
            }
        }

        // Descending by size, so the largest candidate comes first.
        sizes.sort_by(|a, b| b.size.cmp(&a.size));

        if let Some(biggest) = sizes.first() {
            img.attributes.borrow_mut().insert("src", biggest.url.clone());
        }
    }

    let url = Url::parse(&opts.url).map_err(|_| ())?;

    // Absolutize image sources; unjoinable values are left untouched.
    let src_images: Vec<_> = document.select("img[src]")?.collect();
    for img in src_images {
        let old = img.attributes.borrow().get("src").map(|x| x.to_string()).ok_or(())?;
        if let Ok(new) = url.join(&old) {
            img.attributes.borrow_mut().insert("src", new.to_string());
        }
    }

    // Absolutize anchor targets the same way.
    let href_anchors: Vec<_> = document.select("a[href]")?.collect();
    for anchor in href_anchors {
        let old = anchor.attributes.borrow().get("href").map(|x| x.to_string()).ok_or(())?;
        if let Ok(new) = url.join(&old) {
            anchor.attributes.borrow_mut().insert("href", new.to_string());
        }
    }

    Ok(document.to_string())
}
345
+
346
+ /// Transforms rawHtml to html (formerly removeUnwantedElements)
347
+ ///
348
+ /// # Safety
349
+ /// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
350
+ #[no_mangle]
351
+ pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut libc::c_char {
352
+ let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
353
+ Ok(x) => x,
354
+ Err(_) => {
355
+ return CString::new("RUSTFC:ERROR").unwrap().into_raw();
356
+ }
357
+ };
358
+
359
+ let out = match _transform_html_inner(opts) {
360
+ Ok(x) => x,
361
+ Err(_) => "RUSTFC:ERROR".to_string(),
362
+ };
363
+
364
+ CString::new(out).unwrap().into_raw()
365
+ }
366
+
367
+ fn _get_inner_json(html: &str) -> Result<String, ()> {
368
+ Ok(parse_html().one(html).select_first("body")?.text_contents())
369
+ }
370
+
371
+ /// For JSON pages retrieved by browser engines, this function can be used to transform it back into valid JSON.
372
+ ///
373
+ /// # Safety
374
+ /// Input must be a C HTML string. Output will be an HTML string. Output string must be freed with free_string.
375
+ #[no_mangle]
376
+ pub unsafe extern "C" fn get_inner_json(html: *const libc::c_char) -> *mut libc::c_char {
377
+ let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
378
+
379
+ let out = match _get_inner_json(html) {
380
+ Ok(x) => x,
381
+ Err(_) => "RUSTFC:ERROR".to_string(),
382
+ };
383
+
384
+ CString::new(out).unwrap().into_raw()
385
+ }
386
+
387
+ /// Frees a string allocated in Rust-land.
388
+ ///
389
+ /// # Safety
390
+ /// ptr must be a non-freed string pointer returned by Rust code.
391
+ #[no_mangle]
392
+ pub unsafe extern "C" fn free_string(ptr: *mut libc::c_char) {
393
+ drop(unsafe { CString::from_raw(ptr) })
394
+ }
src/__tests__/concurrency-limit.test.ts ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../services/queue-service";
2
+ import {
3
+ cleanOldConcurrencyLimitEntries,
4
+ getConcurrencyLimitActiveJobs,
5
+ pushConcurrencyLimitActiveJob,
6
+ removeConcurrencyLimitActiveJob,
7
+ takeConcurrencyLimitedJob,
8
+ pushConcurrencyLimitedJob,
9
+ getConcurrencyQueueJobsCount,
10
+ ConcurrencyLimitedJob,
11
+ } from "../lib/concurrency-limit";
12
+
13
// Mock Redis client: replace the real ioredis connection with jest.fn() stubs
// for the sorted-set commands the concurrency-limit module uses, so the suite
// only asserts on keys/scores/members passed to Redis (no server needed).
jest.mock("../services/queue-service", () => ({
  redisConnection: {
    zremrangebyscore: jest.fn(),
    zrangebyscore: jest.fn(),
    zadd: jest.fn(),
    zrem: jest.fn(),
    zmpop: jest.fn(),
    zcard: jest.fn(),
  },
}));
24
+
25
+ describe("Concurrency Limit", () => {
26
+ const mockTeamId = "test-team-id";
27
+ const mockJobId = "test-job-id";
28
+ const mockNow = 1000000;
29
+
30
+ beforeEach(() => {
31
+ jest.clearAllMocks();
32
+ });
33
+
34
+ describe("cleanOldConcurrencyLimitEntries", () => {
35
+ it("should remove entries older than current timestamp", async () => {
36
+ await cleanOldConcurrencyLimitEntries(mockTeamId, mockNow);
37
+
38
+ expect(redisConnection.zremrangebyscore).toHaveBeenCalledWith(
39
+ "concurrency-limiter:test-team-id",
40
+ -Infinity,
41
+ mockNow
42
+ );
43
+ });
44
+ });
45
+
46
+ describe("getConcurrencyLimitActiveJobs", () => {
47
+ it("should return active jobs after given timestamp", async () => {
48
+ const mockActiveJobs = ["job1", "job2"];
49
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue(mockActiveJobs);
50
+
51
+ const result = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
52
+
53
+ expect(result).toEqual(mockActiveJobs);
54
+ expect(redisConnection.zrangebyscore).toHaveBeenCalledWith(
55
+ "concurrency-limiter:test-team-id",
56
+ mockNow,
57
+ Infinity
58
+ );
59
+ });
60
+
61
+ it("should return empty array when no active jobs", async () => {
62
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
63
+
64
+ const result = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
65
+
66
+ expect(result).toEqual([]);
67
+ });
68
+ });
69
+
70
+ describe("pushConcurrencyLimitActiveJob", () => {
71
+ it("should add job with expiration timestamp", async () => {
72
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJobId, 2 * 60 * 1000, mockNow);
73
+
74
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
75
+ "concurrency-limiter:test-team-id",
76
+ mockNow + 2 * 60 * 1000, // stalledJobTimeoutMs
77
+ mockJobId
78
+ );
79
+ });
80
+ });
81
+
82
+ describe("removeConcurrencyLimitActiveJob", () => {
83
+ it("should remove job from active jobs", async () => {
84
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJobId);
85
+
86
+ expect(redisConnection.zrem).toHaveBeenCalledWith(
87
+ "concurrency-limiter:test-team-id",
88
+ mockJobId
89
+ );
90
+ });
91
+ });
92
+
93
+ describe("Queue Operations", () => {
94
+ const mockJob: ConcurrencyLimitedJob = {
95
+ id: mockJobId,
96
+ data: { test: "data" },
97
+ opts: {},
98
+ priority: 1,
99
+ };
100
+
101
+ describe("takeConcurrencyLimitedJob", () => {
102
+ it("should return null when queue is empty", async () => {
103
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue(null);
104
+
105
+ const result = await takeConcurrencyLimitedJob(mockTeamId);
106
+
107
+ expect(result).toBeNull();
108
+ });
109
+
110
+ it("should return and remove the highest priority job", async () => {
111
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue([
112
+ "key",
113
+ [[JSON.stringify(mockJob)]],
114
+ ]);
115
+
116
+ const result = await takeConcurrencyLimitedJob(mockTeamId);
117
+
118
+ expect(result).toEqual(mockJob);
119
+ expect(redisConnection.zmpop).toHaveBeenCalledWith(
120
+ 1,
121
+ "concurrency-limit-queue:test-team-id",
122
+ "MIN"
123
+ );
124
+ });
125
+ });
126
+
127
+ describe("pushConcurrencyLimitedJob", () => {
128
+ it("should add job to queue with priority", async () => {
129
+ await pushConcurrencyLimitedJob(mockTeamId, mockJob);
130
+
131
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
132
+ "concurrency-limit-queue:test-team-id",
133
+ mockJob.priority,
134
+ JSON.stringify(mockJob)
135
+ );
136
+ });
137
+
138
+ it("should use default priority 1 when not specified", async () => {
139
+ const jobWithoutPriority = { ...mockJob };
140
+ delete jobWithoutPriority.priority;
141
+
142
+ await pushConcurrencyLimitedJob(mockTeamId, jobWithoutPriority);
143
+
144
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
145
+ "concurrency-limit-queue:test-team-id",
146
+ 1,
147
+ JSON.stringify(jobWithoutPriority)
148
+ );
149
+ });
150
+ });
151
+
152
+ describe("getConcurrencyQueueJobsCount", () => {
153
+ it("should return the number of jobs in queue", async () => {
154
+ const mockCount = 5;
155
+ (redisConnection.zcard as jest.Mock).mockResolvedValue(mockCount);
156
+
157
+ const result = await getConcurrencyQueueJobsCount(mockTeamId);
158
+
159
+ expect(result).toBe(mockCount);
160
+ expect(redisConnection.zcard).toHaveBeenCalledWith(
161
+ "concurrency-limit-queue:test-team-id"
162
+ );
163
+ });
164
+
165
+ it("should return 0 for empty queue", async () => {
166
+ (redisConnection.zcard as jest.Mock).mockResolvedValue(0);
167
+
168
+ const result = await getConcurrencyQueueJobsCount(mockTeamId);
169
+
170
+ expect(result).toBe(0);
171
+ });
172
+ });
173
+ });
174
+
175
+ describe("Integration Scenarios", () => {
176
+ it("should handle complete job lifecycle", async () => {
177
+ const mockJob: ConcurrencyLimitedJob = {
178
+ id: "lifecycle-test",
179
+ data: { test: "lifecycle" },
180
+ opts: {},
181
+ };
182
+
183
+ // Push job to queue
184
+ await pushConcurrencyLimitedJob(mockTeamId, mockJob);
185
+ expect(redisConnection.zadd).toHaveBeenCalled();
186
+
187
+ // Take job from queue
188
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue([
189
+ "key",
190
+ [[JSON.stringify(mockJob)]],
191
+ ]);
192
+ const takenJob = await takeConcurrencyLimitedJob(mockTeamId);
193
+ expect(takenJob).toEqual(mockJob);
194
+
195
+ // Add to active jobs
196
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJob.id, 2 * 60 * 1000, mockNow);
197
+ expect(redisConnection.zadd).toHaveBeenCalled();
198
+
199
+ // Verify active jobs
200
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([mockJob.id]);
201
+ const activeJobs = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
202
+ expect(activeJobs).toContain(mockJob.id);
203
+
204
+ // Remove from active jobs
205
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
206
+ expect(redisConnection.zrem).toHaveBeenCalled();
207
+ });
208
+ });
209
+ });
src/__tests__/deep-research/unit/deep-research-redis.test.ts ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../../../services/queue-service";
2
+ import {
3
+ saveDeepResearch,
4
+ getDeepResearch,
5
+ updateDeepResearch,
6
+ getDeepResearchExpiry,
7
+ StoredDeepResearch,
8
+ } from "../../../lib/deep-research/deep-research-redis";
9
+
10
+ jest.mock("../../../services/queue-service", () => ({
11
+ redisConnection: {
12
+ set: jest.fn(),
13
+ get: jest.fn(),
14
+ expire: jest.fn(),
15
+ pttl: jest.fn(),
16
+ },
17
+ }));
18
+
19
+ describe("Deep Research Redis Operations", () => {
20
+ const mockResearch: StoredDeepResearch = {
21
+ id: "test-id",
22
+ team_id: "team-1",
23
+ createdAt: Date.now(),
24
+ status: "processing",
25
+ currentDepth: 0,
26
+ maxDepth: 5,
27
+ completedSteps: 0,
28
+ totalExpectedSteps: 25,
29
+ findings: [],
30
+ sources: [],
31
+ activities: [],
32
+ summaries: [],
33
+ };
34
+
35
+ beforeEach(() => {
36
+ jest.clearAllMocks();
37
+ });
38
+
39
+ describe("saveDeepResearch", () => {
40
+ it("should save research data to Redis with TTL", async () => {
41
+ await saveDeepResearch("test-id", mockResearch);
42
+
43
+ expect(redisConnection.set).toHaveBeenCalledWith(
44
+ "deep-research:test-id",
45
+ JSON.stringify(mockResearch)
46
+ );
47
+ expect(redisConnection.expire).toHaveBeenCalledWith(
48
+ "deep-research:test-id",
49
+ 6 * 60 * 60
50
+ );
51
+ });
52
+ });
53
+
54
+ describe("getDeepResearch", () => {
55
+ it("should retrieve research data from Redis", async () => {
56
+ (redisConnection.get as jest.Mock).mockResolvedValue(
57
+ JSON.stringify(mockResearch)
58
+ );
59
+
60
+ const result = await getDeepResearch("test-id");
61
+ expect(result).toEqual(mockResearch);
62
+ expect(redisConnection.get).toHaveBeenCalledWith("deep-research:test-id");
63
+ });
64
+
65
+ it("should return null when research not found", async () => {
66
+ (redisConnection.get as jest.Mock).mockResolvedValue(null);
67
+
68
+ const result = await getDeepResearch("non-existent-id");
69
+ expect(result).toBeNull();
70
+ });
71
+ });
72
+
73
+ describe("updateDeepResearch", () => {
74
+ it("should update existing research with new data", async () => {
75
+ (redisConnection.get as jest.Mock).mockResolvedValue(
76
+ JSON.stringify(mockResearch)
77
+ );
78
+
79
+ const update = {
80
+ status: "completed" as const,
81
+ finalAnalysis: "Test analysis",
82
+ activities: [
83
+ {
84
+ type: "search" as const,
85
+ status: "complete" as const,
86
+ message: "New activity",
87
+ timestamp: new Date().toISOString(),
88
+ depth: 1,
89
+ },
90
+ ],
91
+ };
92
+
93
+ await updateDeepResearch("test-id", update);
94
+
95
+ const expectedUpdate = {
96
+ ...mockResearch,
97
+ ...update,
98
+ activities: [...mockResearch.activities, ...update.activities],
99
+ };
100
+
101
+ expect(redisConnection.set).toHaveBeenCalledWith(
102
+ "deep-research:test-id",
103
+ JSON.stringify(expectedUpdate)
104
+ );
105
+ expect(redisConnection.expire).toHaveBeenCalledWith(
106
+ "deep-research:test-id",
107
+ 6 * 60 * 60
108
+ );
109
+ });
110
+
111
+ it("should do nothing if research not found", async () => {
112
+ (redisConnection.get as jest.Mock).mockResolvedValue(null);
113
+
114
+ await updateDeepResearch("test-id", { status: "completed" });
115
+
116
+ expect(redisConnection.set).not.toHaveBeenCalled();
117
+ expect(redisConnection.expire).not.toHaveBeenCalled();
118
+ });
119
+ });
120
+
121
+ describe("getDeepResearchExpiry", () => {
122
+ it("should return correct expiry date", async () => {
123
+ const mockTTL = 3600000; // 1 hour in milliseconds
124
+ (redisConnection.pttl as jest.Mock).mockResolvedValue(mockTTL);
125
+
126
+ const result = await getDeepResearchExpiry("test-id");
127
+
128
+ expect(result).toBeInstanceOf(Date);
129
+ expect(result.getTime()).toBeCloseTo(
130
+ new Date().getTime() + mockTTL,
131
+ -2 // Allow 100ms precision
132
+ );
133
+ });
134
+ });
135
+ });
src/__tests__/e2e_extract/index.test.ts ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import {
4
+ FirecrawlCrawlResponse,
5
+ FirecrawlCrawlStatusResponse,
6
+ FirecrawlScrapeResponse,
7
+ } from "../../types";
8
+
9
dotenv.config();
// Target of all requests: a locally running API instance; auth comes from
// TEST_API_KEY in the environment.
const TEST_URL = "http://127.0.0.1:3002";

// End-to-end tests for POST /v1/extract against live websites. These depend on
// external page content and an LLM, so most assertions are tolerant: they
// count fuzzy matches ("gotItRight") and require only a minimum number.
describe("E2E Tests for Extract API Routes", () => {
  it.concurrent(
    "should return authors of blog posts on firecrawl.dev",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev/*"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: {
              authors: { type: "array", items: { type: "string" } },
            },
          },
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("authors");

      // Count known authors; require at least two hits to pass.
      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
        if (author.includes("Jon")) gotItRight++;
        if (author.includes("Wendong")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(1);
    },
    60000,
  );

  it.concurrent(
    "should return founders of firecrawl.dev (allowExternalLinks = true)",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["firecrawl.dev/*"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              founders: { type: "array", items: { type: "string" } },
            },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      console.log(response.body.data?.founders);
      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
        if (founder.includes("nick")) gotItRight++;
        if (founder.includes("eric")) gotItRight++;
        if (founder.includes("jon-noronha")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThanOrEqual(2);
    },
    60000,
  );

  it.concurrent(
    "should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev/*"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string",
            },
            required: ["items"],
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Firecrawl Example Creator")) gotItRight++;
        if (hiring.includes("Senior Frontend Engineer")) gotItRight++;
        if (hiring.includes("Technical Chief of Staff")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Founding Fullstack Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(2);
    },
    60000,
  );

  it.concurrent(
    "should return PCI DSS compliance for Fivetran",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com/*"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" },
            },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return Azure Data Connectors for Fivetran",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com/*"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" },
              },
            },
          },
        });

      // NOTE(review): assertions are commented out, so this case only logs the
      // response and can never fail — confirm whether it should be re-enabled
      // or marked .skip.
      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return Greenhouse Applicant Tracking System for Abnormal Security",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: [
            "https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003",
          ],
          prompt: "what applicant tracking system is this company using?",
          schema: {
            type: "object",
            properties: {
              isGreenhouseATS: { type: "boolean" },
              answer: { type: "string" },
            },
          },
          allowExternalLinks: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.isGreenhouseATS).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return mintlify api components",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://mintlify.com/docs/*"],
          prompt: "what are the 4 API components?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                component: { type: "string" },
              },
            },
            required: ["items"],
          },
          allowExternalLinks: true,
        });

      console.log(response.body.data?.items);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.items.length).toBe(4);
      let gotItRight = 0;
      for (const component of response.body.data?.items) {
        if (component.component.toLowerCase().includes("parameter"))
          gotItRight++;
        if (component.component.toLowerCase().includes("response"))
          gotItRight++;
        if (component.component.toLowerCase().includes("expandable"))
          gotItRight++;
        if (component.component.toLowerCase().includes("sticky")) gotItRight++;
        if (component.component.toLowerCase().includes("examples"))
          gotItRight++;
      }
      expect(gotItRight).toBeGreaterThan(2);
    },
    60000,
  );

  it.concurrent(
    "should return information about Eric Ciarla",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://ericciarla.com/"],
          prompt:
            "Who is Eric Ciarla? Where does he work? Where did he go to school?",
          schema: {
            type: "object",
            properties: {
              name: { type: "string" },
              work: { type: "string" },
              education: { type: "string" },
            },
            required: ["name", "work", "education"],
          },
          allowExternalLinks: true,
        });

      console.log(response.body.data);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.name).toBe("Eric Ciarla");
      expect(response.body.data?.work).toBeDefined();
      expect(response.body.data?.education).toBeDefined();
    },
    60000,
  );

  it.concurrent(
    "should extract information without a schema",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://docs.firecrawl.dev"],
          prompt: "What is the title and description of the page?",
        });

      console.log(response.body.data);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      // Without a schema the shape is model-chosen; only check it is a
      // non-empty object.
      expect(typeof response.body.data).toBe("object");
      expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
    },
    60000,
  );

  it.concurrent(
    "should extract information with scrapeOptions.waitFor",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/"],
          prompt: "What is the content right after the #content-1 id?",
          schema: {
            type: "object",
            properties: {
              content: { type: "string" },
            },
            required: ["content"],
          },
          scrapeOptions: {
            // Test page injects the target text after ~5s; wait 6s so the
            // scrape sees it.
            waitFor: 6000,
          }
        });

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(typeof response.body.data).toBe("object");
      expect(response.body.data?.content).toBeDefined();
      expect(response.body.data?.content).toBe("Content loaded after 5 seconds!");
    },
    60000,
  );
});
src/__tests__/e2e_full_withAuth/index.test.ts ADDED
@@ -0,0 +1,1762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import { v4 as uuidv4 } from "uuid";
4
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
5
+
6
+ dotenv.config();
7
+
8
+ // const TEST_URL = 'http://localhost:3002'
9
+ const TEST_URL = "http://127.0.0.1:3002";
10
+
11
+ describe("E2E Tests for API Routes", () => {
12
+ beforeAll(() => {
13
+ process.env.USE_DB_AUTHENTICATION = "true";
14
+ });
15
+
16
+ afterAll(() => {
17
+ delete process.env.USE_DB_AUTHENTICATION;
18
+ });
19
+ describe("GET /", () => {
20
+ it.concurrent("should return Hello, world! message", async () => {
21
+ const response = await request(TEST_URL).get("/");
22
+
23
+ expect(response.statusCode).toBe(200);
24
+ expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
25
+ });
26
+ });
27
+
28
+ describe("GET /test", () => {
29
+ it.concurrent("should return Hello, world! message", async () => {
30
+ const response = await request(TEST_URL).get("/test");
31
+ expect(response.statusCode).toBe(200);
32
+ expect(response.text).toContain("Hello, world!");
33
+ });
34
+ });
35
+
36
+ describe("POST /v0/scrape", () => {
37
+ it.concurrent("should require authorization", async () => {
38
+ const response = await request(TEST_URL).post("/v0/scrape");
39
+ expect(response.statusCode).toBe(401);
40
+ });
41
+
42
+ it.concurrent(
43
+ "should return an error response with an invalid API key",
44
+ async () => {
45
+ const response = await request(TEST_URL)
46
+ .post("/v0/scrape")
47
+ .set("Authorization", `Bearer invalid-api-key`)
48
+ .set("Content-Type", "application/json")
49
+ .send({ url: "https://firecrawl.dev" });
50
+ expect(response.statusCode).toBe(401);
51
+ },
52
+ );
53
+
54
+ it.concurrent("should return an error for a blocklisted URL", async () => {
55
+ const blocklistedUrl = "https://facebook.com/fake-test";
56
+ const response = await request(TEST_URL)
57
+ .post("/v0/scrape")
58
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
59
+ .set("Content-Type", "application/json")
60
+ .send({ url: blocklistedUrl });
61
+ expect(response.statusCode).toBe(403);
62
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
63
+ });
64
+
65
+
66
+ it.concurrent(
67
+ "should return a successful response with a valid API key",
68
+ async () => {
69
+ const response = await request(TEST_URL)
70
+ .post("/v0/scrape")
71
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
72
+ .set("Content-Type", "application/json")
73
+ .send({ url: "https://roastmywebsite.ai" });
74
+ expect(response.statusCode).toBe(200);
75
+ expect(response.body).toHaveProperty("data");
76
+ expect(response.body.data).toHaveProperty("content");
77
+ expect(response.body.data).toHaveProperty("markdown");
78
+ expect(response.body.data).toHaveProperty("metadata");
79
+ expect(response.body.data).not.toHaveProperty("html");
80
+ expect(response.body.data.content).toContain("_Roast_");
81
+ expect(response.body.data.metadata).toHaveProperty("title");
82
+ expect(response.body.data.metadata).toHaveProperty("description");
83
+ expect(response.body.data.metadata).toHaveProperty("keywords");
84
+ expect(response.body.data.metadata).toHaveProperty("robots");
85
+ expect(response.body.data.metadata).toHaveProperty("ogTitle");
86
+ expect(response.body.data.metadata).toHaveProperty("ogDescription");
87
+ expect(response.body.data.metadata).toHaveProperty("ogUrl");
88
+ expect(response.body.data.metadata).toHaveProperty("ogImage");
89
+ expect(response.body.data.metadata).toHaveProperty("ogLocaleAlternate");
90
+ expect(response.body.data.metadata).toHaveProperty("ogSiteName");
91
+ expect(response.body.data.metadata).toHaveProperty("sourceURL");
92
+ expect(response.body.data.metadata).toHaveProperty("pageStatusCode");
93
+ expect(response.body.data.metadata.pageError).toBeUndefined();
94
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
95
+ expect(response.body.data.metadata.description).toBe(
96
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
97
+ );
98
+ expect(response.body.data.metadata.keywords).toBe(
99
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
100
+ );
101
+ expect(response.body.data.metadata.robots).toBe("follow, index");
102
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
103
+ expect(response.body.data.metadata.ogDescription).toBe(
104
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
105
+ );
106
+ expect(response.body.data.metadata.ogUrl).toBe(
107
+ "https://www.roastmywebsite.ai",
108
+ );
109
+ expect(response.body.data.metadata.ogImage).toBe(
110
+ "https://www.roastmywebsite.ai/og.png",
111
+ );
112
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
113
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
114
+ expect(response.body.data.metadata.sourceURL).toBe(
115
+ "https://roastmywebsite.ai",
116
+ );
117
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
118
+ },
119
+ 30000,
120
+ ); // 30 seconds timeout
121
+
122
+ it.concurrent(
123
+ "should return a successful response with a valid API key and includeHtml set to true",
124
+ async () => {
125
+ const response = await request(TEST_URL)
126
+ .post("/v0/scrape")
127
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
128
+ .set("Content-Type", "application/json")
129
+ .send({
130
+ url: "https://roastmywebsite.ai",
131
+ pageOptions: { includeHtml: true },
132
+ });
133
+ expect(response.statusCode).toBe(200);
134
+ expect(response.body).toHaveProperty("data");
135
+ expect(response.body.data).toHaveProperty("content");
136
+ expect(response.body.data).toHaveProperty("markdown");
137
+ expect(response.body.data).toHaveProperty("html");
138
+ expect(response.body.data).toHaveProperty("metadata");
139
+ expect(response.body.data.content).toContain("_Roast_");
140
+ expect(response.body.data.markdown).toContain("_Roast_");
141
+ expect(response.body.data.html).toContain("<h1");
142
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
143
+ expect(response.body.data.metadata.pageError).toBeUndefined();
144
+ },
145
+ 30000,
146
+ ); // 30 seconds timeout
147
+
148
+ it.concurrent(
149
+ "should return a successful response with a valid API key and includeRawHtml set to true",
150
+ async () => {
151
+ const response = await request(TEST_URL)
152
+ .post("/v0/scrape")
153
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
154
+ .set("Content-Type", "application/json")
155
+ .send({
156
+ url: "https://roastmywebsite.ai",
157
+ pageOptions: { includeRawHtml: true },
158
+ });
159
+ expect(response.statusCode).toBe(200);
160
+ expect(response.body).toHaveProperty("data");
161
+ expect(response.body.data).toHaveProperty("content");
162
+ expect(response.body.data).toHaveProperty("markdown");
163
+ expect(response.body.data).toHaveProperty("rawHtml");
164
+ expect(response.body.data).toHaveProperty("metadata");
165
+ expect(response.body.data.content).toContain("_Roast_");
166
+ expect(response.body.data.markdown).toContain("_Roast_");
167
+ expect(response.body.data.rawHtml).toContain("<h1");
168
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
169
+ expect(response.body.data.metadata.pageError).toBeUndefined();
170
+ },
171
+ 30000,
172
+ ); // 30 seconds timeout
173
+
174
+ it.concurrent(
175
+ "should return a successful response for a valid scrape with PDF file",
176
+ async () => {
177
+ const response = await request(TEST_URL)
178
+ .post("/v0/scrape")
179
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
180
+ .set("Content-Type", "application/json")
181
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
182
+ await new Promise((r) => setTimeout(r, 6000));
183
+
184
+ expect(response.statusCode).toBe(200);
185
+ expect(response.body).toHaveProperty("data");
186
+ expect(response.body.data).toHaveProperty("content");
187
+ expect(response.body.data).toHaveProperty("metadata");
188
+ expect(response.body.data.content).toContain(
189
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
190
+ );
191
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
192
+ expect(response.body.data.metadata.pageError).toBeUndefined();
193
+ },
194
+ 60000,
195
+ ); // 60 seconds
196
+
197
+ it.concurrent(
198
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
199
+ async () => {
200
+ const response = await request(TEST_URL)
201
+ .post("/v0/scrape")
202
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
203
+ .set("Content-Type", "application/json")
204
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
205
+ await new Promise((r) => setTimeout(r, 6000));
206
+
207
+ expect(response.statusCode).toBe(200);
208
+ expect(response.body).toHaveProperty("data");
209
+ expect(response.body.data).toHaveProperty("content");
210
+ expect(response.body.data).toHaveProperty("metadata");
211
+ expect(response.body.data.content).toContain(
212
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
213
+ );
214
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
215
+ expect(response.body.data.metadata.pageError).toBeUndefined();
216
+ },
217
+ 60000,
218
+ ); // 60 seconds
219
+
220
+ it.concurrent(
221
+ "should return a successful response for a valid scrape with PDF file and parsePDF set to false",
222
+ async () => {
223
+ const response = await request(TEST_URL)
224
+ .post("/v0/scrape")
225
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
226
+ .set("Content-Type", "application/json")
227
+ .send({
228
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
229
+ pageOptions: { parsePDF: false },
230
+ });
231
+ await new Promise((r) => setTimeout(r, 6000));
232
+
233
+ expect(response.statusCode).toBe(200);
234
+ expect(response.body).toHaveProperty("data");
235
+ expect(response.body.data).toHaveProperty("content");
236
+ expect(response.body.data).toHaveProperty("metadata");
237
+ expect(response.body.data.content).toContain(
238
+ "/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj",
239
+ );
240
+ },
241
+ 60000,
242
+ ); // 60 seconds
243
+
244
+ it.concurrent(
245
+ "should return a successful response with a valid API key with removeTags option",
246
+ async () => {
247
+ const responseWithoutRemoveTags = await request(TEST_URL)
248
+ .post("/v0/scrape")
249
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
250
+ .set("Content-Type", "application/json")
251
+ .send({ url: "https://www.scrapethissite.com/" });
252
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
253
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
254
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
255
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
256
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
257
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
258
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
259
+ "Scrape This Site",
260
+ );
261
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
262
+ "Lessons and Videos",
263
+ ); // #footer
264
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
265
+ "[Sandbox](",
266
+ ); // .nav
267
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
268
+ "web scraping",
269
+ ); // strong
270
+
271
+ const response = await request(TEST_URL)
272
+ .post("/v0/scrape")
273
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
274
+ .set("Content-Type", "application/json")
275
+ .send({
276
+ url: "https://www.scrapethissite.com/",
277
+ pageOptions: { removeTags: [".nav", "#footer", "strong"] },
278
+ });
279
+ expect(response.statusCode).toBe(200);
280
+ expect(response.body).toHaveProperty("data");
281
+ expect(response.body.data).toHaveProperty("content");
282
+ expect(response.body.data).toHaveProperty("markdown");
283
+ expect(response.body.data).toHaveProperty("metadata");
284
+ expect(response.body.data).not.toHaveProperty("html");
285
+ expect(response.body.data.content).toContain("Scrape This Site");
286
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
287
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
288
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
289
+ },
290
+ 30000,
291
+ ); // 30 seconds timeout
292
+
293
+ // TODO: add this test back once we nail the waitFor option to be more deterministic
294
+ // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
295
+ // const startTime = Date.now();
296
+ // const response = await request(TEST_URL)
297
+ // .post("/v0/scrape")
298
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
299
+ // .set("Content-Type", "application/json")
300
+ // .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
301
+ // const endTime = Date.now();
302
+ // const duration = endTime - startTime;
303
+
304
+ // expect(response.statusCode).toBe(200);
305
+ // expect(response.body).toHaveProperty("data");
306
+ // expect(response.body.data).toHaveProperty("content");
307
+ // expect(response.body.data).toHaveProperty("markdown");
308
+ // expect(response.body.data).toHaveProperty("metadata");
309
+ // expect(response.body.data).not.toHaveProperty("html");
310
+ // expect(response.body.data.content).toContain("🔥 Firecrawl");
311
+ // expect(duration).toBeGreaterThanOrEqual(7000);
312
+ // }, 12000); // 12 seconds timeout
313
+
314
+ it.concurrent(
315
+ "should return a successful response for a scrape with 400 page",
316
+ async () => {
317
+ const response = await request(TEST_URL)
318
+ .post("/v0/scrape")
319
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
320
+ .set("Content-Type", "application/json")
321
+ .send({ url: "https://httpstat.us/400" });
322
+ await new Promise((r) => setTimeout(r, 5000));
323
+
324
+ expect(response.statusCode).toBe(200);
325
+ expect(response.body).toHaveProperty("data");
326
+ expect(response.body.data).toHaveProperty("content");
327
+ expect(response.body.data).toHaveProperty("metadata");
328
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
329
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
330
+ "bad request",
331
+ );
332
+ },
333
+ 60000,
334
+ ); // 60 seconds
335
+
336
+ it.concurrent(
337
+ "should return a successful response for a scrape with 401 page",
338
+ async () => {
339
+ const response = await request(TEST_URL)
340
+ .post("/v0/scrape")
341
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
342
+ .set("Content-Type", "application/json")
343
+ .send({ url: "https://httpstat.us/401" });
344
+ await new Promise((r) => setTimeout(r, 5000));
345
+
346
+ expect(response.statusCode).toBe(200);
347
+ expect(response.body).toHaveProperty("data");
348
+ expect(response.body.data).toHaveProperty("content");
349
+ expect(response.body.data).toHaveProperty("metadata");
350
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
351
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
352
+ "unauthorized",
353
+ );
354
+ },
355
+ 60000,
356
+ ); // 60 seconds
357
+
358
+ it.concurrent(
359
+ "should return a successful response for a scrape with 403 page",
360
+ async () => {
361
+ const response = await request(TEST_URL)
362
+ .post("/v0/scrape")
363
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
364
+ .set("Content-Type", "application/json")
365
+ .send({ url: "https://httpstat.us/403" });
366
+
367
+ await new Promise((r) => setTimeout(r, 5000));
368
+ expect(response.statusCode).toBe(200);
369
+ expect(response.body).toHaveProperty("data");
370
+ expect(response.body.data).toHaveProperty("content");
371
+ expect(response.body.data).toHaveProperty("metadata");
372
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
373
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
374
+ "forbidden",
375
+ );
376
+ },
377
+ 60000,
378
+ ); // 60 seconds
379
+
380
+ it.concurrent(
381
+ "should return a successful response for a scrape with 404 page",
382
+ async () => {
383
+ const response = await request(TEST_URL)
384
+ .post("/v0/scrape")
385
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
386
+ .set("Content-Type", "application/json")
387
+ .send({ url: "https://httpstat.us/404" });
388
+ await new Promise((r) => setTimeout(r, 5000));
389
+
390
+ expect(response.statusCode).toBe(200);
391
+ expect(response.body).toHaveProperty("data");
392
+ expect(response.body.data).toHaveProperty("content");
393
+ expect(response.body.data).toHaveProperty("metadata");
394
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
395
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
396
+ "not found",
397
+ );
398
+ },
399
+ 60000,
400
+ ); // 60 seconds
401
+
402
+ it.concurrent(
403
+ "should return a successful response for a scrape with 405 page",
404
+ async () => {
405
+ const response = await request(TEST_URL)
406
+ .post("/v0/scrape")
407
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
408
+ .set("Content-Type", "application/json")
409
+ .send({ url: "https://httpstat.us/405" });
410
+ await new Promise((r) => setTimeout(r, 5000));
411
+
412
+ expect(response.statusCode).toBe(200);
413
+ expect(response.body).toHaveProperty("data");
414
+ expect(response.body.data).toHaveProperty("content");
415
+ expect(response.body.data).toHaveProperty("metadata");
416
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
417
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
418
+ "method not allowed",
419
+ );
420
+ },
421
+ 60000,
422
+ ); // 60 seconds
423
+
424
+ it.concurrent(
425
+ "should return a successful response for a scrape with 500 page",
426
+ async () => {
427
+ const response = await request(TEST_URL)
428
+ .post("/v0/scrape")
429
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
430
+ .set("Content-Type", "application/json")
431
+ .send({ url: "https://httpstat.us/500" });
432
+ await new Promise((r) => setTimeout(r, 5000));
433
+
434
+ expect(response.statusCode).toBe(200);
435
+ expect(response.body).toHaveProperty("data");
436
+ expect(response.body.data).toHaveProperty("content");
437
+ expect(response.body.data).toHaveProperty("metadata");
438
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
439
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
440
+ "internal server error",
441
+ );
442
+ },
443
+ 60000,
444
+ ); // 60 seconds
445
+ });
446
+
447
+ describe("POST /v0/crawl", () => {
448
+ it.concurrent("should require authorization", async () => {
449
+ const response = await request(TEST_URL).post("/v0/crawl");
450
+ expect(response.statusCode).toBe(401);
451
+ });
452
+
453
+ it.concurrent(
454
+ "should return an error response with an invalid API key",
455
+ async () => {
456
+ const response = await request(TEST_URL)
457
+ .post("/v0/crawl")
458
+ .set("Authorization", `Bearer invalid-api-key`)
459
+ .set("Content-Type", "application/json")
460
+ .send({ url: "https://firecrawl.dev" });
461
+ expect(response.statusCode).toBe(401);
462
+ },
463
+ );
464
+
465
+ it.concurrent("should return an error for a blocklisted URL", async () => {
466
+ const blocklistedUrl = "https://twitter.com/fake-test";
467
+ const response = await request(TEST_URL)
468
+ .post("/v0/crawl")
469
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
470
+ .set("Content-Type", "application/json")
471
+ .send({ url: blocklistedUrl });
472
+ expect(response.statusCode).toBe(403);
473
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
474
+ });
475
+
476
+ it.concurrent(
477
+ "should return a successful response with a valid API key for crawl",
478
+ async () => {
479
+ const response = await request(TEST_URL)
480
+ .post("/v0/crawl")
481
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
482
+ .set("Content-Type", "application/json")
483
+ .send({ url: "https://firecrawl.dev" });
484
+ expect(response.statusCode).toBe(200);
485
+ expect(response.body).toHaveProperty("jobId");
486
+ expect(response.body.jobId).toMatch(
487
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
488
+ );
489
+ },
490
+ );
491
+ it.concurrent(
492
+ "should prevent duplicate requests using the same idempotency key",
493
+ async () => {
494
+ const uniqueIdempotencyKey = uuidv4();
495
+
496
+ // First request with the idempotency key
497
+ const firstResponse = await request(TEST_URL)
498
+ .post("/v0/crawl")
499
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
500
+ .set("Content-Type", "application/json")
501
+ .set("x-idempotency-key", uniqueIdempotencyKey)
502
+ .send({ url: "https://docs.firecrawl.dev" });
503
+
504
+ expect(firstResponse.statusCode).toBe(200);
505
+
506
+ // Second request with the same idempotency key
507
+ const secondResponse = await request(TEST_URL)
508
+ .post("/v0/crawl")
509
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
510
+ .set("Content-Type", "application/json")
511
+ .set("x-idempotency-key", uniqueIdempotencyKey)
512
+ .send({ url: "https://docs.firecrawl.dev" });
513
+
514
+ expect(secondResponse.statusCode).toBe(409);
515
+ expect(secondResponse.body.error).toBe("Idempotency key already used");
516
+ },
517
+ );
518
+
519
+ it.concurrent(
520
+ "should return a successful response with a valid API key and valid includes option",
521
+ async () => {
522
+ const crawlResponse = await request(TEST_URL)
523
+ .post("/v0/crawl")
524
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
525
+ .set("Content-Type", "application/json")
526
+ .send({
527
+ url: "https://mendable.ai",
528
+ limit: 10,
529
+ crawlerOptions: {
530
+ includes: ["blog/*"],
531
+ },
532
+ });
533
+
534
+ let response;
535
+ let isFinished = false;
536
+
537
+ while (!isFinished) {
538
+ response = await request(TEST_URL)
539
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
540
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
541
+
542
+ expect(response.statusCode).toBe(200);
543
+ expect(response.body).toHaveProperty("status");
544
+ isFinished = response.body.status === "completed";
545
+
546
+ if (!isFinished) {
547
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
548
+ }
549
+ }
550
+
551
+ const completedResponse = response;
552
+
553
+ const urls = completedResponse.body.data.map(
554
+ (item: any) => item.metadata?.sourceURL,
555
+ );
556
+ expect(urls.length).toBeGreaterThan(5);
557
+ urls.forEach((url: string) => {
558
+ expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
559
+ });
560
+
561
+ expect(completedResponse.statusCode).toBe(200);
562
+ expect(completedResponse.body).toHaveProperty("status");
563
+ expect(completedResponse.body.status).toBe("completed");
564
+ expect(completedResponse.body).toHaveProperty("data");
565
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
566
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
567
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
568
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
569
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
570
+ 200,
571
+ );
572
+ expect(
573
+ completedResponse.body.data[0].metadata.pageError,
574
+ ).toBeUndefined();
575
+ },
576
+ 60000,
577
+ ); // 60 seconds
578
+
579
+ it.concurrent(
580
+ "should return a successful response with a valid API key and valid excludes option",
581
+ async () => {
582
+ const crawlResponse = await request(TEST_URL)
583
+ .post("/v0/crawl")
584
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
585
+ .set("Content-Type", "application/json")
586
+ .send({
587
+ url: "https://mendable.ai",
588
+ limit: 10,
589
+ crawlerOptions: {
590
+ excludes: ["blog/*"],
591
+ },
592
+ });
593
+
594
+ let isFinished = false;
595
+ let response;
596
+
597
+ while (!isFinished) {
598
+ response = await request(TEST_URL)
599
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
600
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
601
+
602
+ expect(response.statusCode).toBe(200);
603
+ expect(response.body).toHaveProperty("status");
604
+ isFinished = response.body.status === "completed";
605
+
606
+ if (!isFinished) {
607
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
608
+ }
609
+ }
610
+
611
+ const completedResponse = response;
612
+
613
+ const urls = completedResponse.body.data.map(
614
+ (item: any) => item.metadata?.sourceURL,
615
+ );
616
+ expect(urls.length).toBeGreaterThan(5);
617
+ urls.forEach((url: string) => {
618
+ expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
619
+ });
620
+ },
621
+ 90000,
622
+ ); // 90 seconds
623
+
624
+ it.concurrent(
625
+ "should return a successful response with a valid API key and limit to 3",
626
+ async () => {
627
+ const crawlResponse = await request(TEST_URL)
628
+ .post("/v0/crawl")
629
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
630
+ .set("Content-Type", "application/json")
631
+ .send({
632
+ url: "https://mendable.ai",
633
+ crawlerOptions: { limit: 3 },
634
+ });
635
+
636
+ let isFinished = false;
637
+ let response;
638
+
639
+ while (!isFinished) {
640
+ response = await request(TEST_URL)
641
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
642
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
643
+
644
+ expect(response.statusCode).toBe(200);
645
+ expect(response.body).toHaveProperty("status");
646
+ isFinished = response.body.status === "completed";
647
+
648
+ if (!isFinished) {
649
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
650
+ }
651
+ }
652
+
653
+ const completedResponse = response;
654
+
655
+ expect(completedResponse.statusCode).toBe(200);
656
+ expect(completedResponse.body).toHaveProperty("status");
657
+ expect(completedResponse.body.status).toBe("completed");
658
+ expect(completedResponse.body).toHaveProperty("data");
659
+ expect(completedResponse.body.data.length).toBe(3);
660
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
661
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
662
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
663
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
664
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
665
+ 200,
666
+ );
667
+ expect(
668
+ completedResponse.body.data[0].metadata.pageError,
669
+ ).toBeUndefined();
670
+ },
671
+ 60000,
672
+ ); // 60 seconds
673
+
674
+ it.concurrent(
675
+ "should return a successful response with max depth option for a valid crawl job",
676
+ async () => {
677
+ const crawlResponse = await request(TEST_URL)
678
+ .post("/v0/crawl")
679
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
680
+ .set("Content-Type", "application/json")
681
+ .send({
682
+ url: "https://www.scrapethissite.com",
683
+ crawlerOptions: { maxDepth: 1 },
684
+ });
685
+ expect(crawlResponse.statusCode).toBe(200);
686
+
687
+ const response = await request(TEST_URL)
688
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
689
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
690
+ expect(response.statusCode).toBe(200);
691
+ expect(response.body).toHaveProperty("status");
692
+ expect(["active", "waiting"]).toContain(response.body.status);
693
+ // wait for 60 seconds
694
+ let isCompleted = false;
695
+ while (!isCompleted) {
696
+ const statusCheckResponse = await request(TEST_URL)
697
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
698
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
699
+ expect(statusCheckResponse.statusCode).toBe(200);
700
+ isCompleted = statusCheckResponse.body.status === "completed";
701
+ if (!isCompleted) {
702
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
703
+ }
704
+ }
705
+ const completedResponse = await request(TEST_URL)
706
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
707
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
708
+
709
+ expect(completedResponse.statusCode).toBe(200);
710
+ expect(completedResponse.body).toHaveProperty("status");
711
+ expect(completedResponse.body.status).toBe("completed");
712
+ expect(completedResponse.body).toHaveProperty("data");
713
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
714
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
715
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
716
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
717
+ 200,
718
+ );
719
+ expect(
720
+ completedResponse.body.data[0].metadata.pageError,
721
+ ).toBeUndefined();
722
+ const urls = completedResponse.body.data.map(
723
+ (item: any) => item.metadata?.sourceURL,
724
+ );
725
+ expect(urls.length).toBeGreaterThan(1);
726
+
727
+ // Check if all URLs have a maximum depth of 1
728
+ urls.forEach((url: string) => {
729
+ const pathSplits = new URL(url).pathname.split("/");
730
+ const depth =
731
+ pathSplits.length -
732
+ (pathSplits[0].length === 0 &&
733
+ pathSplits[pathSplits.length - 1].length === 0
734
+ ? 1
735
+ : 0);
736
+ expect(depth).toBeLessThanOrEqual(2);
737
+ });
738
+ },
739
+ 180000,
740
+ );
741
+
742
+ it.concurrent(
743
+ "should return a successful response with relative max depth option for a valid crawl job",
744
+ async () => {
745
+ const crawlResponse = await request(TEST_URL)
746
+ .post("/v0/crawl")
747
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
748
+ .set("Content-Type", "application/json")
749
+ .send({
750
+ url: "https://www.scrapethissite.com/pages/",
751
+ crawlerOptions: { maxDepth: 1 },
752
+ });
753
+ expect(crawlResponse.statusCode).toBe(200);
754
+
755
+ const response = await request(TEST_URL)
756
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
757
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
758
+ expect(response.statusCode).toBe(200);
759
+ expect(response.body).toHaveProperty("status");
760
+ expect(["active", "waiting"]).toContain(response.body.status);
761
+ // wait for 60 seconds
762
+ let isCompleted = false;
763
+ while (!isCompleted) {
764
+ const statusCheckResponse = await request(TEST_URL)
765
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
766
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
767
+ expect(statusCheckResponse.statusCode).toBe(200);
768
+ isCompleted = statusCheckResponse.body.status === "completed";
769
+ if (!isCompleted) {
770
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
771
+ }
772
+ }
773
+ const completedResponse = await request(TEST_URL)
774
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
775
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
776
+
777
+ expect(completedResponse.statusCode).toBe(200);
778
+ expect(completedResponse.body).toHaveProperty("status");
779
+ expect(completedResponse.body.status).toBe("completed");
780
+ expect(completedResponse.body).toHaveProperty("data");
781
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
782
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
783
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
784
+ const urls = completedResponse.body.data.map(
785
+ (item: any) => item.metadata?.sourceURL,
786
+ );
787
+ expect(urls.length).toBeGreaterThan(1);
788
+
789
+ // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
790
+ urls.forEach((url: string) => {
791
+ const pathSplits = new URL(url).pathname.split("/");
792
+ const depth =
793
+ pathSplits.length -
794
+ (pathSplits[0].length === 0 &&
795
+ pathSplits[pathSplits.length - 1].length === 0
796
+ ? 1
797
+ : 0);
798
+ expect(depth).toBeLessThanOrEqual(3);
799
+ });
800
+ },
801
+ 180000,
802
+ );
803
+
804
+ it.concurrent(
805
+ "should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero",
806
+ async () => {
807
+ const crawlResponse = await request(TEST_URL)
808
+ .post("/v0/crawl")
809
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
810
+ .set("Content-Type", "application/json")
811
+ .send({
812
+ url: "https://www.mendable.ai",
813
+ crawlerOptions: { maxDepth: 0 },
814
+ });
815
+ expect(crawlResponse.statusCode).toBe(200);
816
+
817
+ const response = await request(TEST_URL)
818
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
819
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
820
+ expect(response.statusCode).toBe(200);
821
+ expect(response.body).toHaveProperty("status");
822
+ expect(["active", "waiting"]).toContain(response.body.status);
823
+ // wait for 60 seconds
824
+ let isCompleted = false;
825
+ while (!isCompleted) {
826
+ const statusCheckResponse = await request(TEST_URL)
827
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
828
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
829
+ expect(statusCheckResponse.statusCode).toBe(200);
830
+ isCompleted = statusCheckResponse.body.status === "completed";
831
+ if (!isCompleted) {
832
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
833
+ }
834
+ }
835
+ const completedResponse = await request(TEST_URL)
836
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
837
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
838
+
839
+ const testurls = completedResponse.body.data.map(
840
+ (item: any) => item.metadata?.sourceURL,
841
+ );
842
+ //console.log(testurls)
843
+
844
+ expect(completedResponse.statusCode).toBe(200);
845
+ expect(completedResponse.body).toHaveProperty("status");
846
+ expect(completedResponse.body.status).toBe("completed");
847
+ expect(completedResponse.body).toHaveProperty("data");
848
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
849
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
850
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
851
+ const urls = completedResponse.body.data.map(
852
+ (item: any) => item.metadata?.sourceURL,
853
+ );
854
+ expect(urls.length).toBeGreaterThanOrEqual(1);
855
+
856
+ // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
857
+ urls.forEach((url: string) => {
858
+ const pathSplits = new URL(url).pathname.split("/");
859
+ const depth =
860
+ pathSplits.length -
861
+ (pathSplits[0].length === 0 &&
862
+ pathSplits[pathSplits.length - 1].length === 0
863
+ ? 1
864
+ : 0);
865
+ expect(depth).toBeLessThanOrEqual(1);
866
+ });
867
+ },
868
+ 180000,
869
+ );
870
+
871
+ // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
872
+ // const crawlResponse = await request(TEST_URL)
873
+ // .post("/v0/crawl")
874
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
875
+ // .set("Content-Type", "application/json")
876
+ // .send({
877
+ // url: "https://mendable.ai",
878
+ // crawlerOptions: { limit: 10 },
879
+ // });
880
+
881
+ // const response = await request(TEST_URL)
882
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
883
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
884
+ // expect(response.statusCode).toBe(200);
885
+ // expect(response.body).toHaveProperty("status");
886
+ // expect(response.body.status).toBe("active");
887
+
888
+ // let isCompleted = false;
889
+ // while (!isCompleted) {
890
+ // const statusCheckResponse = await request(TEST_URL)
891
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
892
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
893
+ // expect(statusCheckResponse.statusCode).toBe(200);
894
+ // isCompleted = statusCheckResponse.body.status === "completed";
895
+ // if (!isCompleted) {
896
+ // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
897
+ // }
898
+ // }
899
+
900
+ // const completedResponse = await request(TEST_URL)
901
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
902
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
903
+
904
+ // expect(completedResponse.statusCode).toBe(200);
905
+ // expect(completedResponse.body).toHaveProperty("status");
906
+ // expect(completedResponse.body.status).toBe("completed");
907
+ // expect(completedResponse.body).toHaveProperty("data");
908
+ // expect(completedResponse.body.data.length).toBe(10);
909
+ // expect(completedResponse.body.data[0]).toHaveProperty("content");
910
+ // expect(completedResponse.body.data[0]).toHaveProperty("markdown");
911
+ // expect(completedResponse.body.data[0]).toHaveProperty("metadata");
912
+ // expect(completedResponse.body.data[0].content).toContain("Mendable");
913
+ // expect(completedResponse.body.data[0].content).not.toContain("main menu");
914
+ // }, 60000); // 60 seconds
915
+
916
+ it.concurrent(
917
+ "should return a successful response for a valid crawl job with includeHtml set to true option",
918
+ async () => {
919
+ const crawlResponse = await request(TEST_URL)
920
+ .post("/v0/crawl")
921
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
922
+ .set("Content-Type", "application/json")
923
+ .send({
924
+ url: "https://roastmywebsite.ai",
925
+ pageOptions: { includeHtml: true },
926
+ });
927
+ expect(crawlResponse.statusCode).toBe(200);
928
+
929
+ const response = await request(TEST_URL)
930
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
931
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
932
+ expect(response.statusCode).toBe(200);
933
+ expect(response.body).toHaveProperty("status");
934
+ expect(["active", "waiting"]).toContain(response.body.status);
935
+
936
+ let isCompleted = false;
937
+ while (!isCompleted) {
938
+ const statusCheckResponse = await request(TEST_URL)
939
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
940
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
941
+ expect(statusCheckResponse.statusCode).toBe(200);
942
+ isCompleted = statusCheckResponse.body.status === "completed";
943
+ if (!isCompleted) {
944
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
945
+ }
946
+ }
947
+
948
+ const completedResponse = await request(TEST_URL)
949
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
950
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
951
+
952
+ expect(completedResponse.statusCode).toBe(200);
953
+ expect(completedResponse.body).toHaveProperty("status");
954
+ expect(completedResponse.body.status).toBe("completed");
955
+ expect(completedResponse.body).toHaveProperty("data");
956
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
957
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
958
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
959
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
960
+ 200,
961
+ );
962
+ expect(
963
+ completedResponse.body.data[0].metadata.pageError,
964
+ ).toBeUndefined();
965
+
966
+ // 120 seconds
967
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
968
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
969
+ expect(completedResponse.body.data[0].content).toContain("_Roast_");
970
+ expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
971
+ expect(completedResponse.body.data[0].html).toContain("<h1");
972
+
973
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
974
+ 200,
975
+ );
976
+ expect(
977
+ completedResponse.body.data[0].metadata.pageError,
978
+ ).toBeUndefined();
979
+ },
980
+ 180000,
981
+ );
982
+
983
+ it.concurrent(
984
+ "should crawl external content links when allowed",
985
+ async () => {
986
+ const crawlInitResponse = await request(TEST_URL)
987
+ .post("/v0/crawl")
988
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
989
+ .set("Content-Type", "application/json")
990
+ .send({
991
+ url: "https://mendable.ai",
992
+ crawlerOptions: {
993
+ allowExternalContentLinks: true,
994
+ ignoreSitemap: true,
995
+ returnOnlyUrls: true,
996
+ limit: 50,
997
+ },
998
+ });
999
+
1000
+ expect(crawlInitResponse.statusCode).toBe(200);
1001
+ expect(crawlInitResponse.body).toHaveProperty("jobId");
1002
+
1003
+ let crawlStatus: string = "scraping";
1004
+ let crawlData = [];
1005
+ while (crawlStatus !== "completed") {
1006
+ const statusResponse = await request(TEST_URL)
1007
+ .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
1008
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1009
+ crawlStatus = statusResponse.body.status;
1010
+ if (statusResponse.body.data) {
1011
+ crawlData = statusResponse.body.data;
1012
+ }
1013
+ if (crawlStatus !== "completed") {
1014
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1015
+ }
1016
+ }
1017
+ expect(crawlData.length).toBeGreaterThan(0);
1018
+ expect(crawlData).toEqual(
1019
+ expect.arrayContaining([
1020
+ expect.objectContaining({
1021
+ url: expect.stringContaining(
1022
+ "https://firecrawl.dev/?ref=mendable+banner",
1023
+ ),
1024
+ }),
1025
+ expect.objectContaining({
1026
+ url: expect.stringContaining("https://mendable.ai/pricing"),
1027
+ }),
1028
+ expect.objectContaining({
1029
+ url: expect.stringContaining("https://x.com/CalebPeffer"),
1030
+ }),
1031
+ ]),
1032
+ );
1033
+ },
1034
+ 180000,
1035
+ ); // 3 minutes timeout
1036
+ });
1037
+
1038
+ describe("POST /v0/crawlWebsitePreview", () => {
1039
+ it.concurrent("should require authorization", async () => {
1040
+ const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
1041
+ expect(response.statusCode).toBe(401);
1042
+ });
1043
+
1044
+ it.concurrent(
1045
+ "should return an error response with an invalid API key",
1046
+ async () => {
1047
+ const response = await request(TEST_URL)
1048
+ .post("/v0/crawlWebsitePreview")
1049
+ .set("Authorization", `Bearer invalid-api-key`)
1050
+ .set("Content-Type", "application/json")
1051
+ .send({ url: "https://firecrawl.dev" });
1052
+ expect(response.statusCode).toBe(401);
1053
+ },
1054
+ );
1055
+
1056
+ // it.concurrent("should return an error for a blocklisted URL", async () => {
1057
+ // const blocklistedUrl = "https://instagram.com/fake-test";
1058
+ // const response = await request(TEST_URL)
1059
+ // .post("/v0/crawlWebsitePreview")
1060
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1061
+ // .set("Content-Type", "application/json")
1062
+ // .send({ url: blocklistedUrl });
1063
+ // // is returning 429 instead of 403
1064
+ // expect(response.statusCode).toBe(403);
1065
+ // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
1066
+ // });
1067
+
1068
+ it.concurrent(
1069
+ "should return a timeout error when scraping takes longer than the specified timeout",
1070
+ async () => {
1071
+ const response = await request(TEST_URL)
1072
+ .post("/v0/scrape")
1073
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1074
+ .set("Content-Type", "application/json")
1075
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
1076
+
1077
+ expect(response.statusCode).toBe(408);
1078
+ },
1079
+ 3000,
1080
+ );
1081
+ });
1082
+
1083
+ describe("POST /v0/search", () => {
1084
+ it.concurrent("should require authorization", async () => {
1085
+ const response = await request(TEST_URL).post("/v0/search");
1086
+ expect(response.statusCode).toBe(401);
1087
+ });
1088
+
1089
+ it.concurrent(
1090
+ "should return an error response with an invalid API key",
1091
+ async () => {
1092
+ const response = await request(TEST_URL)
1093
+ .post("/v0/search")
1094
+ .set("Authorization", `Bearer invalid-api-key`)
1095
+ .set("Content-Type", "application/json")
1096
+ .send({ query: "test" });
1097
+ expect(response.statusCode).toBe(401);
1098
+ },
1099
+ );
1100
+
1101
+ it.concurrent(
1102
+ "should return a successful response with a valid API key for search",
1103
+ async () => {
1104
+ const response = await request(TEST_URL)
1105
+ .post("/v0/search")
1106
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1107
+ .set("Content-Type", "application/json")
1108
+ .send({ query: "test" });
1109
+ expect(response.statusCode).toBe(200);
1110
+ expect(response.body).toHaveProperty("success");
1111
+ expect(response.body.success).toBe(true);
1112
+ expect(response.body).toHaveProperty("data");
1113
+ },
1114
+ 30000,
1115
+ ); // 30 seconds timeout
1116
+ });
1117
+
1118
+ describe("GET /v0/crawl/status/:jobId", () => {
1119
+ it.concurrent("should require authorization", async () => {
1120
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
1121
+ expect(response.statusCode).toBe(401);
1122
+ });
1123
+
1124
+ it.concurrent(
1125
+ "should return an error response with an invalid API key",
1126
+ async () => {
1127
+ const response = await request(TEST_URL)
1128
+ .get("/v0/crawl/status/123")
1129
+ .set("Authorization", `Bearer invalid-api-key`);
1130
+ expect(response.statusCode).toBe(401);
1131
+ },
1132
+ );
1133
+
1134
+ it.concurrent(
1135
+ "should return Job not found for invalid job ID",
1136
+ async () => {
1137
+ const response = await request(TEST_URL)
1138
+ .get("/v0/crawl/status/invalidJobId")
1139
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1140
+ expect(response.statusCode).toBe(404);
1141
+ },
1142
+ );
1143
+
1144
+ it.concurrent(
1145
+ "should return a successful crawl status response for a valid crawl job",
1146
+ async () => {
1147
+ const crawlResponse = await request(TEST_URL)
1148
+ .post("/v0/crawl")
1149
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1150
+ .set("Content-Type", "application/json")
1151
+ .send({ url: "https://mendable.ai/blog" });
1152
+ expect(crawlResponse.statusCode).toBe(200);
1153
+
1154
+ let isCompleted = false;
1155
+ let completedResponse;
1156
+
1157
+ while (!isCompleted) {
1158
+ const response = await request(TEST_URL)
1159
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1160
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1161
+ expect(response.statusCode).toBe(200);
1162
+ expect(response.body).toHaveProperty("status");
1163
+
1164
+ if (response.body.status === "completed") {
1165
+ isCompleted = true;
1166
+ completedResponse = response;
1167
+ } else {
1168
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1169
+ }
1170
+ }
1171
+ expect(completedResponse.body).toHaveProperty("status");
1172
+ expect(completedResponse.body.status).toBe("completed");
1173
+ expect(completedResponse.body).toHaveProperty("data");
1174
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1175
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1176
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1177
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
1178
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1179
+ 200,
1180
+ );
1181
+ expect(
1182
+ completedResponse.body.data[0].metadata.pageError,
1183
+ ).toBeUndefined();
1184
+
1185
+ const childrenLinks = completedResponse.body.data.filter(
1186
+ (doc) =>
1187
+ doc.metadata &&
1188
+ doc.metadata.sourceURL &&
1189
+ doc.metadata.sourceURL.includes("mendable.ai/blog"),
1190
+ );
1191
+
1192
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
1193
+ },
1194
+ 180000,
1195
+ ); // 120 seconds
1196
+
1197
+ it.concurrent(
1198
+ "should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ",
1199
+ async () => {
1200
+ const crawlResponse = await request(TEST_URL)
1201
+ .post("/v0/crawl")
1202
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1203
+ .set("Content-Type", "application/json")
1204
+ .send({
1205
+ url: "https://arxiv.org/pdf/astro-ph/9301001",
1206
+ crawlerOptions: {
1207
+ limit: 10,
1208
+ excludes: [
1209
+ "list/*",
1210
+ "login",
1211
+ "abs/*",
1212
+ "static/*",
1213
+ "about/*",
1214
+ "archive/*",
1215
+ ],
1216
+ },
1217
+ });
1218
+ expect(crawlResponse.statusCode).toBe(200);
1219
+
1220
+ let isCompleted = false;
1221
+ let completedResponse;
1222
+
1223
+ while (!isCompleted) {
1224
+ const response = await request(TEST_URL)
1225
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1226
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1227
+ expect(response.statusCode).toBe(200);
1228
+ expect(response.body).toHaveProperty("status");
1229
+
1230
+ if (response.body.status === "completed") {
1231
+ isCompleted = true;
1232
+ completedResponse = response;
1233
+ } else {
1234
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1235
+ }
1236
+ }
1237
+ expect(completedResponse.body.status).toBe("completed");
1238
+ expect(completedResponse.body).toHaveProperty("data");
1239
+ expect(completedResponse.body.data.length).toEqual(1);
1240
+ expect(completedResponse.body.data).toEqual(
1241
+ expect.arrayContaining([
1242
+ expect.objectContaining({
1243
+ content: expect.stringContaining(
1244
+ "asymmetries might represent, for instance, preferred source orientations to our line of sight.",
1245
+ ),
1246
+ }),
1247
+ ]),
1248
+ );
1249
+
1250
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1251
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1252
+ 200,
1253
+ );
1254
+ expect(
1255
+ completedResponse.body.data[0].metadata.pageError,
1256
+ ).toBeUndefined();
1257
+ },
1258
+ 180000,
1259
+ ); // 120 seconds
1260
+
1261
+ it.concurrent(
1262
+ "should return a successful response for a valid crawl job with includeHtml set to true option (2)",
1263
+ async () => {
1264
+ const crawlResponse = await request(TEST_URL)
1265
+ .post("/v0/crawl")
1266
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1267
+ .set("Content-Type", "application/json")
1268
+ .send({
1269
+ url: "https://roastmywebsite.ai",
1270
+ pageOptions: { includeHtml: true },
1271
+ });
1272
+ expect(crawlResponse.statusCode).toBe(200);
1273
+
1274
+ const response = await request(TEST_URL)
1275
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1276
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1277
+ expect(response.statusCode).toBe(200);
1278
+ expect(response.body).toHaveProperty("status");
1279
+ expect(["active", "waiting"]).toContain(response.body.status);
1280
+
1281
+ let isFinished = false;
1282
+ let completedResponse;
1283
+
1284
+ while (!isFinished) {
1285
+ const response = await request(TEST_URL)
1286
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1287
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1288
+ expect(response.statusCode).toBe(200);
1289
+ expect(response.body).toHaveProperty("status");
1290
+
1291
+ if (response.body.status === "completed") {
1292
+ isFinished = true;
1293
+ completedResponse = response;
1294
+ } else {
1295
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1296
+ }
1297
+ }
1298
+
1299
+ expect(completedResponse.statusCode).toBe(200);
1300
+ expect(completedResponse.body).toHaveProperty("status");
1301
+ expect(completedResponse.body.status).toBe("completed");
1302
+ expect(completedResponse.body).toHaveProperty("data");
1303
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1304
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1305
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1306
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
1307
+ expect(completedResponse.body.data[0].content).toContain("_Roast_");
1308
+ expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
1309
+ expect(completedResponse.body.data[0].html).toContain("<h1");
1310
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1311
+ 200,
1312
+ );
1313
+ expect(
1314
+ completedResponse.body.data[0].metadata.pageError,
1315
+ ).toBeUndefined();
1316
+ },
1317
+ 60000,
1318
+ );
1319
+ }); // 60 seconds
1320
+
1321
+ it.concurrent(
1322
+ "should return a successful response for a valid crawl job with allowBackwardCrawling set to true option",
1323
+ async () => {
1324
+ const crawlResponse = await request(TEST_URL)
1325
+ .post("/v0/crawl")
1326
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1327
+ .set("Content-Type", "application/json")
1328
+ .send({
1329
+ url: "https://mendable.ai/blog",
1330
+ pageOptions: { includeHtml: true },
1331
+ crawlerOptions: { allowBackwardCrawling: true },
1332
+ });
1333
+ expect(crawlResponse.statusCode).toBe(200);
1334
+
1335
+ let isFinished = false;
1336
+ let completedResponse;
1337
+
1338
+ while (!isFinished) {
1339
+ const response = await request(TEST_URL)
1340
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1341
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1342
+ expect(response.statusCode).toBe(200);
1343
+ expect(response.body).toHaveProperty("status");
1344
+
1345
+ if (response.body.status === "completed") {
1346
+ isFinished = true;
1347
+ completedResponse = response;
1348
+ } else {
1349
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1350
+ }
1351
+ }
1352
+
1353
+ expect(completedResponse.statusCode).toBe(200);
1354
+ expect(completedResponse.body).toHaveProperty("status");
1355
+ expect(completedResponse.body.status).toBe("completed");
1356
+ expect(completedResponse.body).toHaveProperty("data");
1357
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1358
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1359
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1360
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
1361
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
1362
+ expect(completedResponse.body.data[0].markdown).toContain("Mendable");
1363
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
1364
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
1365
+
1366
+ const onlyChildrenLinks = completedResponse.body.data.filter((doc) => {
1367
+ return (
1368
+ doc.metadata &&
1369
+ doc.metadata.sourceURL &&
1370
+ doc.metadata.sourceURL.includes("mendable.ai/blog")
1371
+ );
1372
+ });
1373
+
1374
+ expect(completedResponse.body.data.length).toBeGreaterThan(
1375
+ onlyChildrenLinks.length,
1376
+ );
1377
+ },
1378
+ 60000,
1379
+ );
1380
+
1381
+ it.concurrent(
1382
+ "If someone cancels a crawl job, it should turn into failed status",
1383
+ async () => {
1384
+ const crawlResponse = await request(TEST_URL)
1385
+ .post("/v0/crawl")
1386
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1387
+ .set("Content-Type", "application/json")
1388
+ .send({ url: "https://jestjs.io" });
1389
+
1390
+ expect(crawlResponse.statusCode).toBe(200);
1391
+
1392
+ await new Promise((r) => setTimeout(r, 20000));
1393
+
1394
+ const responseCancel = await request(TEST_URL)
1395
+ .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
1396
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1397
+ expect(responseCancel.statusCode).toBe(200);
1398
+ expect(responseCancel.body).toHaveProperty("status");
1399
+ expect(responseCancel.body.status).toBe("cancelled");
1400
+
1401
+ await new Promise((r) => setTimeout(r, 10000));
1402
+ const completedResponse = await request(TEST_URL)
1403
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1404
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1405
+
1406
+ expect(completedResponse.statusCode).toBe(200);
1407
+ expect(completedResponse.body).toHaveProperty("status");
1408
+ expect(completedResponse.body.status).toBe("failed");
1409
+ expect(completedResponse.body).toHaveProperty("data");
1410
+ expect(completedResponse.body.data).toBeNull();
1411
+ expect(completedResponse.body).toHaveProperty("partial_data");
1412
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
1413
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
1414
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
1415
+ expect(
1416
+ completedResponse.body.partial_data[0].metadata.pageStatusCode,
1417
+ ).toBe(200);
1418
+ expect(
1419
+ completedResponse.body.partial_data[0].metadata.pageError,
1420
+ ).toBeUndefined();
1421
+ },
1422
+ 60000,
1423
+ ); // 60 seconds
1424
+
1425
+ describe("POST /v0/scrape with LLM Extraction", () => {
1426
+ it.concurrent(
1427
+ "should extract data using LLM extraction mode",
1428
+ async () => {
1429
+ const response = await request(TEST_URL)
1430
+ .post("/v0/scrape")
1431
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1432
+ .set("Content-Type", "application/json")
1433
+ .send({
1434
+ url: "https://mendable.ai",
1435
+ pageOptions: {
1436
+ onlyMainContent: true,
1437
+ },
1438
+ extractorOptions: {
1439
+ mode: "llm-extraction",
1440
+ extractionPrompt:
1441
+ "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
1442
+ extractionSchema: {
1443
+ type: "object",
1444
+ properties: {
1445
+ company_mission: {
1446
+ type: "string",
1447
+ },
1448
+ supports_sso: {
1449
+ type: "boolean",
1450
+ },
1451
+ is_open_source: {
1452
+ type: "boolean",
1453
+ },
1454
+ },
1455
+ required: ["company_mission", "supports_sso", "is_open_source"],
1456
+ },
1457
+ },
1458
+ });
1459
+
1460
+ // Ensure that the job was successfully created before proceeding with LLM extraction
1461
+ expect(response.statusCode).toBe(200);
1462
+
1463
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
1464
+ let llmExtraction = response.body.data.llm_extraction;
1465
+
1466
+ // Check if the llm_extraction object has the required properties with correct types and values
1467
+ expect(llmExtraction).toHaveProperty("company_mission");
1468
+ expect(typeof llmExtraction.company_mission).toBe("string");
1469
+ expect(llmExtraction).toHaveProperty("supports_sso");
1470
+ expect(llmExtraction.supports_sso).toBe(true);
1471
+ expect(typeof llmExtraction.supports_sso).toBe("boolean");
1472
+ expect(llmExtraction).toHaveProperty("is_open_source");
1473
+ expect(llmExtraction.is_open_source).toBe(false);
1474
+ expect(typeof llmExtraction.is_open_source).toBe("boolean");
1475
+ },
1476
+ 60000,
1477
+ ); // 60 secs
1478
+
1479
+ it.concurrent(
1480
+ "should extract data using LLM extraction mode with RawHtml",
1481
+ async () => {
1482
+ const response = await request(TEST_URL)
1483
+ .post("/v0/scrape")
1484
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1485
+ .set("Content-Type", "application/json")
1486
+ .send({
1487
+ url: "https://mendable.ai",
1488
+
1489
+ extractorOptions: {
1490
+ mode: "llm-extraction-from-raw-html",
1491
+ extractionPrompt:
1492
+ "Based on the information on the page, what are the primary and secondary CTA buttons?",
1493
+ extractionSchema: {
1494
+ type: "object",
1495
+ properties: {
1496
+ primary_cta: {
1497
+ type: "string",
1498
+ },
1499
+ secondary_cta: {
1500
+ type: "string",
1501
+ },
1502
+ },
1503
+ required: ["primary_cta", "secondary_cta"],
1504
+ },
1505
+ },
1506
+ });
1507
+
1508
+ // Ensure that the job was successfully created before proceeding with LLM extraction
1509
+ expect(response.statusCode).toBe(200);
1510
+
1511
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
1512
+ let llmExtraction = response.body.data.llm_extraction;
1513
+
1514
+ // Check if the llm_extraction object has the required properties with correct types and values
1515
+ expect(llmExtraction).toHaveProperty("primary_cta");
1516
+ expect(typeof llmExtraction.primary_cta).toBe("string");
1517
+ expect(llmExtraction).toHaveProperty("secondary_cta");
1518
+ expect(typeof llmExtraction.secondary_cta).toBe("string");
1519
+ },
1520
+ 60000,
1521
+ ); // 60 secs
1522
+ });
1523
+
1524
+ // describe("POST /v0/scrape for Top 100 Companies", () => {
1525
+ // it.concurrent("should extract data for the top 100 companies", async () => {
1526
+ // const response = await request(TEST_URL)
1527
+ // .post("/v0/scrape")
1528
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1529
+ // .set("Content-Type", "application/json")
1530
+ // .send({
1531
+ // url: "https://companiesmarketcap.com/",
1532
+ // pageOptions: {
1533
+ // onlyMainContent: true
1534
+ // },
1535
+ // extractorOptions: {
1536
+ // mode: "llm-extraction",
1537
+ // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
1538
+ // extractionSchema: {
1539
+ // type: "object",
1540
+ // properties: {
1541
+ // companies: {
1542
+ // type: "array",
1543
+ // items: {
1544
+ // type: "object",
1545
+ // properties: {
1546
+ // rank: { type: "number" },
1547
+ // name: { type: "string" },
1548
+ // marketCap: { type: "string" },
1549
+ // price: { type: "string" },
1550
+ // todayChange: { type: "string" }
1551
+ // },
1552
+ // required: ["rank", "name", "marketCap", "price", "todayChange"]
1553
+ // }
1554
+ // }
1555
+ // },
1556
+ // required: ["companies"]
1557
+ // }
1558
+ // }
1559
+ // });
1560
+
1561
+ // // Print the response body to the console for debugging purposes
1562
+ // console.log("Response companies:", response.body.data.llm_extraction.companies);
1563
+
1564
+ // // Check if the response has the correct structure and data types
1565
+ // expect(response.status).toBe(200);
1566
+ // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
1567
+ // expect(response.body.data.llm_extraction.companies.length).toBe(40);
1568
+
1569
+ // // Sample check for the first company
1570
+ // const firstCompany = response.body.data.llm_extraction.companies[0];
1571
+ // expect(firstCompany).toHaveProperty("name");
1572
+ // expect(typeof firstCompany.name).toBe("string");
1573
+ // expect(firstCompany).toHaveProperty("marketCap");
1574
+ // expect(typeof firstCompany.marketCap).toBe("string");
1575
+ // expect(firstCompany).toHaveProperty("price");
1576
+ // expect(typeof firstCompany.price).toBe("string");
1577
+ // expect(firstCompany).toHaveProperty("todayChange");
1578
+ // expect(typeof firstCompany.todayChange).toBe("string");
1579
+ // }, 120000); // 120 secs
1580
+ // });
1581
+
1582
+ describe("POST /v0/crawl with fast mode", () => {
1583
+ it.concurrent(
1584
+ "should complete the crawl under 20 seconds",
1585
+ async () => {
1586
+ const startTime = Date.now();
1587
+
1588
+ const crawlResponse = await request(TEST_URL)
1589
+ .post("/v0/crawl")
1590
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1591
+ .set("Content-Type", "application/json")
1592
+ .send({
1593
+ url: "https://flutterbricks.com",
1594
+ crawlerOptions: {
1595
+ mode: "fast",
1596
+ },
1597
+ });
1598
+
1599
+ expect(crawlResponse.statusCode).toBe(200);
1600
+
1601
+ const jobId = crawlResponse.body.jobId;
1602
+ let statusResponse;
1603
+ let isFinished = false;
1604
+
1605
+ while (!isFinished) {
1606
+ statusResponse = await request(TEST_URL)
1607
+ .get(`/v0/crawl/status/${jobId}`)
1608
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1609
+
1610
+ expect(statusResponse.statusCode).toBe(200);
1611
+ isFinished = statusResponse.body.status === "completed";
1612
+
1613
+ if (!isFinished) {
1614
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1615
+ }
1616
+ }
1617
+
1618
+ // const endTime = Date.now();
1619
+ // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
1620
+
1621
+ // console.log(`Time elapsed: ${timeElapsed} seconds`);
1622
+
1623
+ expect(statusResponse.body.status).toBe("completed");
1624
+ expect(statusResponse.body).toHaveProperty("data");
1625
+ expect(statusResponse.body.data[0]).toHaveProperty("content");
1626
+ expect(statusResponse.body.data[0]).toHaveProperty("markdown");
1627
+ expect(statusResponse.body.data[0]).toHaveProperty("metadata");
1628
+ expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
1629
+ expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
1630
+
1631
+ const results = statusResponse.body.data;
1632
+ // results.forEach((result, i) => {
1633
+ // console.log(result.metadata.sourceURL);
1634
+ // });
1635
+ expect(results.length).toBeGreaterThanOrEqual(10);
1636
+ expect(results.length).toBeLessThanOrEqual(15);
1637
+ },
1638
+ 20000,
1639
+ );
1640
+
1641
+ // it.concurrent("should complete the crawl in more than 10 seconds", async () => {
1642
+ // const startTime = Date.now();
1643
+
1644
+ // const crawlResponse = await request(TEST_URL)
1645
+ // .post("/v0/crawl")
1646
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1647
+ // .set("Content-Type", "application/json")
1648
+ // .send({
1649
+ // url: "https://flutterbricks.com",
1650
+ // });
1651
+
1652
+ // expect(crawlResponse.statusCode).toBe(200);
1653
+
1654
+ // const jobId = crawlResponse.body.jobId;
1655
+ // let statusResponse;
1656
+ // let isFinished = false;
1657
+
1658
+ // while (!isFinished) {
1659
+ // statusResponse = await request(TEST_URL)
1660
+ // .get(`/v0/crawl/status/${jobId}`)
1661
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1662
+
1663
+ // expect(statusResponse.statusCode).toBe(200);
1664
+ // isFinished = statusResponse.body.status === "completed";
1665
+
1666
+ // if (!isFinished) {
1667
+ // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1668
+ // }
1669
+ // }
1670
+
1671
+ // const endTime = Date.now();
1672
+ // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
1673
+
1674
+ // console.log(`Time elapsed: ${timeElapsed} seconds`);
1675
+
1676
+ // expect(statusResponse.body.status).toBe("completed");
1677
+ // expect(statusResponse.body).toHaveProperty("data");
1678
+ // expect(statusResponse.body.data[0]).toHaveProperty("content");
1679
+ // expect(statusResponse.body.data[0]).toHaveProperty("markdown");
1680
+ // const results = statusResponse.body.data;
1681
+ // // results.forEach((result, i) => {
1682
+ // // console.log(result.metadata.sourceURL);
1683
+ // // });
1684
+ // expect(results.length).toBeGreaterThanOrEqual(10);
1685
+ // expect(results.length).toBeLessThanOrEqual(15);
1686
+
1687
+ // }, 50000);// 15 seconds timeout to account for network delays
1688
+ });
1689
+
1690
+ describe("GET /is-production", () => {
1691
+ it.concurrent("should return the production status", async () => {
1692
+ const response = await request(TEST_URL).get("/is-production");
1693
+ expect(response.statusCode).toBe(200);
1694
+ expect(response.body).toHaveProperty("isProduction");
1695
+ });
1696
+ });
1697
+
1698
+ describe("Rate Limiter", () => {
1699
+ it.concurrent(
1700
+ "should return 429 when rate limit is exceeded for preview token",
1701
+ async () => {
1702
+ for (let i = 0; i < 5; i++) {
1703
+ const response = await request(TEST_URL)
1704
+ .post("/v0/scrape")
1705
+ .set("Authorization", `Bearer ${process.env.PREVIEW_TOKEN}`)
1706
+ .set("Content-Type", "application/json")
1707
+ .send({ url: "https://www.scrapethissite.com" });
1708
+
1709
+ expect(response.statusCode).toBe(200);
1710
+ }
1711
+ const response = await request(TEST_URL)
1712
+ .post("/v0/scrape")
1713
+ .set("Authorization", `Bearer ${process.env.PREVIEW_TOKEN}`)
1714
+ .set("Content-Type", "application/json")
1715
+ .send({ url: "https://www.scrapethissite.com" });
1716
+
1717
+ expect(response.statusCode).toBe(429);
1718
+ },
1719
+ 90000,
1720
+ );
1721
+ });
1722
+
1723
+ // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
1724
+ // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
1725
+ // const response = await request(TEST_URL)
1726
+ // .post("/v0/scrape")
1727
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1728
+ // .set("Content-Type", "application/json")
1729
+ // .send({ url: "https://www.scrapethissite.com" });
1730
+
1731
+ // expect(response.statusCode).toBe(200);
1732
+ // }
1733
+
1734
+ // const response = await request(TEST_URL)
1735
+ // .post("/v0/scrape")
1736
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1737
+ // .set("Content-Type", "application/json")
1738
+ // .send({ url: "https://www.scrapethissite.com" });
1739
+
1740
+ // expect(response.statusCode).toBe(429);
1741
+ // }, 60000);
1742
+
1743
+ // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
1744
+ // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
1745
+ // const response = await request(TEST_URL)
1746
+ // .post("/v0/crawl")
1747
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1748
+ // .set("Content-Type", "application/json")
1749
+ // .send({ url: "https://www.scrapethissite.com" });
1750
+
1751
+ // expect(response.statusCode).toBe(200);
1752
+ // }
1753
+
1754
+ // const response = await request(TEST_URL)
1755
+ // .post("/v0/crawl")
1756
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1757
+ // .set("Content-Type", "application/json")
1758
+ // .send({ url: "https://www.scrapethissite.com" });
1759
+
1760
+ // expect(response.statusCode).toBe(429);
1761
+ // }, 60000);
1762
+ });
src/__tests__/e2e_map/index.test.ts ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+
4
+ dotenv.config();
5
+ const TEST_URL = "http://127.0.0.1:3002";
6
+
7
+ describe("E2E Tests for Map API Routes", () => {
8
+ it.concurrent(
9
+ "(feat-search)should return links containing 'smart-crawl'",
10
+ async () => {
11
+ const response = await request(TEST_URL)
12
+ .post("/v1/map")
13
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
14
+ .set("Content-Type", "application/json")
15
+ .send({
16
+ url: "https://firecrawl.dev",
17
+ sitemapOnly: false,
18
+ search: "smart-crawl",
19
+ });
20
+
21
+ console.log(response.body);
22
+ expect(response.statusCode).toBe(200);
23
+ expect(response.body).toHaveProperty("links");
24
+ expect(response.body.links.length).toBeGreaterThan(0);
25
+ expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
26
+ },
27
+ 60000,
28
+ );
29
+
30
+ it.concurrent(
31
+ "(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
32
+ async () => {
33
+ const response = await request(TEST_URL)
34
+ .post("/v1/map")
35
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
36
+ .set("Content-Type", "application/json")
37
+ .send({
38
+ url: "https://firecrawl.dev",
39
+ sitemapOnly: false,
40
+ includeSubdomains: true,
41
+ });
42
+
43
+ console.log(response.body);
44
+ expect(response.statusCode).toBe(200);
45
+ expect(response.body).toHaveProperty("links");
46
+ expect(response.body.links.length).toBeGreaterThan(0);
47
+ expect(response.body.links[response.body.links.length - 1]).toContain(
48
+ "docs.firecrawl.dev",
49
+ );
50
+ },
51
+ 60000,
52
+ );
53
+
54
+ it.concurrent(
55
+ "(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
56
+ async () => {
57
+ const response = await request(TEST_URL)
58
+ .post("/v1/map")
59
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
60
+ .set("Content-Type", "application/json")
61
+ .send({
62
+ url: "https://firecrawl.dev",
63
+ sitemapOnly: true,
64
+ });
65
+
66
+ console.log(response.body);
67
+ expect(response.statusCode).toBe(200);
68
+ expect(response.body).toHaveProperty("links");
69
+ expect(response.body.links.length).toBeGreaterThan(0);
70
+ expect(response.body.links[response.body.links.length - 1]).not.toContain(
71
+ "docs.firecrawl.dev",
72
+ );
73
+ },
74
+ 60000,
75
+ );
76
+
77
+ it.concurrent(
78
+ "(feat-limit) should return mapped links for firecrawl.dev with a limit",
79
+ async () => {
80
+ const response = await request(TEST_URL)
81
+ .post("/v1/map")
82
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
83
+ .set("Content-Type", "application/json")
84
+ .send({
85
+ url: "https://firecrawl.dev",
86
+ sitemapOnly: false,
87
+ limit: 10,
88
+ });
89
+
90
+ console.log(response.body);
91
+ expect(response.statusCode).toBe(200);
92
+ expect(response.body).toHaveProperty("links");
93
+ expect(response.body.links.length).toBeLessThanOrEqual(10);
94
+ },
95
+ 60000,
96
+ );
97
+
98
+ it.concurrent(
99
+ "(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
100
+ async () => {
101
+ const response = await request(TEST_URL)
102
+ .post("/v1/map")
103
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
104
+ .set("Content-Type", "application/json")
105
+ .send({
106
+ url: "https://geekflare.com/sitemap_index.xml",
107
+ sitemapOnly: true,
108
+ });
109
+
110
+ console.log(response.body);
111
+ expect(response.statusCode).toBe(200);
112
+ expect(response.body).toHaveProperty("links");
113
+ expect(response.body.links.length).toBeGreaterThan(1900);
114
+ },
115
+ 60000,
116
+ );
117
+ });
src/__tests__/e2e_noAuth/index.test.ts ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+
7
+ dotenv.config();
8
+
9
+ const TEST_URL = "http://127.0.0.1:3002";
10
+
11
+ describe("E2E Tests for API Routes with No Authentication", () => {
12
+ let originalEnv: NodeJS.ProcessEnv;
13
+
14
+ // save original process.env
15
+ beforeAll(() => {
16
+ originalEnv = { ...process.env };
17
+ process.env.USE_DB_AUTHENTICATION = "false";
18
+ process.env.SUPABASE_ANON_TOKEN = "";
19
+ process.env.SUPABASE_URL = "";
20
+ process.env.SUPABASE_SERVICE_TOKEN = "";
21
+ process.env.SCRAPING_BEE_API_KEY = "";
22
+ process.env.OPENAI_API_KEY = "";
23
+ process.env.BULL_AUTH_KEY = "";
24
+ process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
25
+ process.env.LLAMAPARSE_API_KEY = "";
26
+ process.env.TEST_API_KEY = "";
27
+ process.env.POSTHOG_API_KEY = "";
28
+ process.env.POSTHOG_HOST = "";
29
+ });
30
+
31
+ // restore original process.env
32
+ afterAll(() => {
33
+ process.env = originalEnv;
34
+ });
35
+
36
+ describe("GET /", () => {
37
+ it("should return Hello, world! message", async () => {
38
+ const response = await request(TEST_URL).get("/");
39
+ expect(response.statusCode).toBe(200);
40
+ expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
41
+ });
42
+ });
43
+
44
+ describe("GET /test", () => {
45
+ it("should return Hello, world! message", async () => {
46
+ const response = await request(TEST_URL).get("/test");
47
+ expect(response.statusCode).toBe(200);
48
+ expect(response.text).toContain("Hello, world!");
49
+ });
50
+ });
51
+
52
+ describe("POST /v0/scrape", () => {
53
+ it("should not require authorization", async () => {
54
+ const response = await request(TEST_URL).post("/v0/scrape");
55
+ expect(response.statusCode).not.toBe(401);
56
+ });
57
+
58
+ it("should return an error for a blocklisted URL without requiring authorization", async () => {
59
+ const blocklistedUrl = "https://facebook.com/fake-test";
60
+ const response = await request(TEST_URL)
61
+ .post("/v0/scrape")
62
+ .set("Content-Type", "application/json")
63
+ .send({ url: blocklistedUrl });
64
+ expect(response.statusCode).toBe(403);
65
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
66
+ });
67
+
68
+ it("should return a successful response", async () => {
69
+ const response = await request(TEST_URL)
70
+ .post("/v0/scrape")
71
+ .set("Content-Type", "application/json")
72
+ .send({ url: "https://firecrawl.dev" });
73
+ expect(response.statusCode).toBe(200);
74
+ }, 10000); // 10 seconds timeout
75
+ });
76
+
77
+ describe("POST /v0/crawl", () => {
78
+ it("should not require authorization", async () => {
79
+ const response = await request(TEST_URL).post("/v0/crawl");
80
+ expect(response.statusCode).not.toBe(401);
81
+ });
82
+
83
+ it("should return an error for a blocklisted URL", async () => {
84
+ const blocklistedUrl = "https://twitter.com/fake-test";
85
+ const response = await request(TEST_URL)
86
+ .post("/v0/crawl")
87
+ .set("Content-Type", "application/json")
88
+ .send({ url: blocklistedUrl });
89
+ expect(response.statusCode).toBe(403);
90
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
91
+ });
92
+
93
+ it("should return a successful response", async () => {
94
+ const response = await request(TEST_URL)
95
+ .post("/v0/crawl")
96
+ .set("Content-Type", "application/json")
97
+ .send({ url: "https://firecrawl.dev" });
98
+ expect(response.statusCode).toBe(200);
99
+ expect(response.body).toHaveProperty("jobId");
100
+ expect(response.body.jobId).toMatch(
101
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
102
+ );
103
+ });
104
+ });
105
+
106
+ describe("POST /v0/crawlWebsitePreview", () => {
107
+ it("should not require authorization", async () => {
108
+ const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
109
+ expect(response.statusCode).not.toBe(401);
110
+ });
111
+
112
+ it("should return an error for a blocklisted URL", async () => {
113
+ const blocklistedUrl = "https://instagram.com/fake-test";
114
+ const response = await request(TEST_URL)
115
+ .post("/v0/crawlWebsitePreview")
116
+ .set("Content-Type", "application/json")
117
+ .send({ url: blocklistedUrl });
118
+ expect(response.statusCode).toBe(403);
119
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
120
+ });
121
+
122
+ it("should return a successful response", async () => {
123
+ const response = await request(TEST_URL)
124
+ .post("/v0/crawlWebsitePreview")
125
+ .set("Content-Type", "application/json")
126
+ .send({ url: "https://firecrawl.dev" });
127
+ expect(response.statusCode).toBe(200);
128
+ expect(response.body).toHaveProperty("jobId");
129
+ expect(response.body.jobId).toMatch(
130
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
131
+ );
132
+ });
133
+ });
134
+
135
+ describe("POST /v0/search", () => {
136
+ it("should require not authorization", async () => {
137
+ const response = await request(TEST_URL).post("/v0/search");
138
+ expect(response.statusCode).not.toBe(401);
139
+ });
140
+
141
+ it("should return no error response with an invalid API key", async () => {
142
+ const response = await request(TEST_URL)
143
+ .post("/v0/search")
144
+ .set("Authorization", `Bearer invalid-api-key`)
145
+ .set("Content-Type", "application/json")
146
+ .send({ query: "test" });
147
+ expect(response.statusCode).not.toBe(401);
148
+ });
149
+
150
+ it("should return a successful response without a valid API key", async () => {
151
+ const response = await request(TEST_URL)
152
+ .post("/v0/search")
153
+ .set("Content-Type", "application/json")
154
+ .send({ query: "test" });
155
+ expect(response.statusCode).toBe(200);
156
+ expect(response.body).toHaveProperty("success");
157
+ expect(response.body.success).toBe(true);
158
+ expect(response.body).toHaveProperty("data");
159
+ }, 20000);
160
+ });
161
+
162
+ describe("GET /v0/crawl/status/:jobId", () => {
163
+ it("should not require authorization", async () => {
164
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
165
+ expect(response.statusCode).not.toBe(401);
166
+ });
167
+
168
+ it("should return Job not found for invalid job ID", async () => {
169
+ const response = await request(TEST_URL).get(
170
+ "/v0/crawl/status/invalidJobId",
171
+ );
172
+ expect(response.statusCode).toBe(404);
173
+ });
174
+
175
+ it("should return a successful response for a valid crawl job", async () => {
176
+ const crawlResponse = await request(TEST_URL)
177
+ .post("/v0/crawl")
178
+ .set("Content-Type", "application/json")
179
+ .send({ url: "https://firecrawl.dev" });
180
+ expect(crawlResponse.statusCode).toBe(200);
181
+
182
+ const response = await request(TEST_URL).get(
183
+ `/v0/crawl/status/${crawlResponse.body.jobId}`,
184
+ );
185
+ expect(response.statusCode).toBe(200);
186
+ expect(response.body).toHaveProperty("status");
187
+ expect(response.body.status).toBe("active");
188
+
189
+ // wait for 30 seconds
190
+ await new Promise((r) => setTimeout(r, 30000));
191
+
192
+ const completedResponse = await request(TEST_URL).get(
193
+ `/v0/crawl/status/${crawlResponse.body.jobId}`,
194
+ );
195
+ expect(completedResponse.statusCode).toBe(200);
196
+ expect(completedResponse.body).toHaveProperty("status");
197
+ expect(completedResponse.body.status).toBe("completed");
198
+ expect(completedResponse.body).toHaveProperty("data");
199
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
200
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
201
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
202
+ }, 60000); // 60 seconds
203
+ });
204
+
205
+ describe("GET /is-production", () => {
206
+ it("should return the production status", async () => {
207
+ const response = await request(TEST_URL).get("/is-production");
208
+ expect(response.statusCode).toBe(200);
209
+ expect(response.body).toHaveProperty("isProduction");
210
+ });
211
+ });
212
+ });
src/__tests__/e2e_v1_withAuth/index.test.ts ADDED
@@ -0,0 +1,1066 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import { configDotenv } from "dotenv";
3
+ import { ScrapeRequestInput } from "../../controllers/v1/types";
4
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
5
+
6
+ configDotenv();
7
+ const TEST_URL = "http://127.0.0.1:3002";
8
+
9
+ describe("E2E Tests for v1 API Routes", () => {
10
+ beforeAll(() => {
11
+ process.env.USE_DB_AUTHENTICATION = "true";
12
+ });
13
+
14
+ afterAll(() => {
15
+ delete process.env.USE_DB_AUTHENTICATION;
16
+ });
17
+
18
+ describe("GET /is-production", () => {
19
+ it.concurrent("should return the production status", async () => {
20
+ const response: any = await request(TEST_URL).get("/is-production");
21
+
22
+ console.log(
23
+ "process.env.USE_DB_AUTHENTICATION",
24
+ process.env.USE_DB_AUTHENTICATION,
25
+ );
26
+ console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
27
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
28
+ console.log("!!useDbAuthentication", !!useDbAuthentication);
29
+ console.log("!useDbAuthentication", !useDbAuthentication);
30
+
31
+ expect(response.statusCode).toBe(200);
32
+ expect(response.body).toHaveProperty("isProduction");
33
+ });
34
+ });
35
+
36
+ describe("POST /v1/scrape", () => {
37
+ it.concurrent("should require authorization", async () => {
38
+ const response: any = await request(TEST_URL)
39
+ .post("/v1/scrape")
40
+ .send({ url: "https://firecrawl.dev" });
41
+
42
+ expect(response.statusCode).toBe(401);
43
+ });
44
+
45
+ it.concurrent("should throw error for blocklisted URL", async () => {
46
+ const scrapeRequest: ScrapeRequestInput = {
47
+ url: "https://facebook.com/fake-test",
48
+ };
49
+
50
+ const response = await request(TEST_URL)
51
+ .post("/v1/scrape")
52
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
53
+ .set("Content-Type", "application/json")
54
+ .send(scrapeRequest);
55
+
56
+ expect(response.statusCode).toBe(403);
57
+ expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
58
+ });
59
+
60
+ it.concurrent(
61
+ "should return an error response with an invalid API key",
62
+ async () => {
63
+ const response: any = await request(TEST_URL)
64
+ .post("/v1/scrape")
65
+ .set("Authorization", `Bearer invalid-api-key`)
66
+ .set("Content-Type", "application/json")
67
+ .send({ url: "https://firecrawl.dev" });
68
+ expect(response.statusCode).toBe(401);
69
+ },
70
+ );
71
+
72
+ it.concurrent(
73
+ "should return a successful response with a valid API key",
74
+ async () => {
75
+ const scrapeRequest: ScrapeRequestInput = {
76
+ url: "https://roastmywebsite.ai",
77
+ };
78
+
79
+ const response: any = await request(TEST_URL)
80
+ .post("/v1/scrape")
81
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
82
+ .set("Content-Type", "application/json")
83
+ .send(scrapeRequest);
84
+
85
+ expect(response.statusCode).toBe(200);
86
+
87
+ if (!("data" in response.body)) {
88
+ throw new Error("Expected response body to have 'data' property");
89
+ }
90
+ expect(response.body.data).not.toHaveProperty("content");
91
+ expect(response.body.data).toHaveProperty("markdown");
92
+ expect(response.body.data).toHaveProperty("metadata");
93
+ expect(response.body.data).not.toHaveProperty("html");
94
+ expect(response.body.data.markdown).toContain("_Roast_");
95
+ expect(response.body.data.metadata.error).toBeUndefined();
96
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
97
+ expect(response.body.data.metadata.description).toBe(
98
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
99
+ );
100
+ expect(response.body.data.metadata.keywords).toBe(
101
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
102
+ );
103
+ expect(response.body.data.metadata.robots).toBe("follow, index");
104
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
105
+ expect(response.body.data.metadata.ogDescription).toBe(
106
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
107
+ );
108
+ expect(response.body.data.metadata.ogUrl).toBe(
109
+ "https://www.roastmywebsite.ai",
110
+ );
111
+ expect(response.body.data.metadata.ogImage).toBe(
112
+ "https://www.roastmywebsite.ai/og.png",
113
+ );
114
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
115
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
116
+ expect(response.body.data.metadata.sourceURL).toBe(
117
+ "https://roastmywebsite.ai",
118
+ );
119
+ expect(response.body.data.metadata.statusCode).toBe(200);
120
+ },
121
+ 30000,
122
+ ); // 30 seconds timeout
123
+
124
+ it.concurrent(
125
+ "should return a successful response with a valid API key",
126
+ async () => {
127
+ const scrapeRequest: ScrapeRequestInput = {
128
+ url: "https://arxiv.org/abs/2410.04840",
129
+ };
130
+
131
+ const response: any = await request(TEST_URL)
132
+ .post("/v1/scrape")
133
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
134
+ .set("Content-Type", "application/json")
135
+ .send(scrapeRequest);
136
+
137
+ expect(response.statusCode).toBe(200);
138
+
139
+ if (!("data" in response.body)) {
140
+ throw new Error("Expected response body to have 'data' property");
141
+ }
142
+ expect(response.body.data).not.toHaveProperty("content");
143
+ expect(response.body.data).toHaveProperty("markdown");
144
+ expect(response.body.data).toHaveProperty("metadata");
145
+ expect(response.body.data).not.toHaveProperty("html");
146
+ expect(response.body.data.markdown).toContain("Strong Model Collapse");
147
+ expect(response.body.data.metadata.error).toBeUndefined();
148
+ expect(response.body.data.metadata.description).toContain(
149
+ "Abstract page for arXiv paper 2410.04840: Strong Model Collapse",
150
+ );
151
+ expect(response.body.data.metadata.citation_title).toBe(
152
+ "Strong Model Collapse",
153
+ );
154
+ expect(response.body.data.metadata.citation_author).toEqual([
155
+ "Dohmatob, Elvis",
156
+ "Feng, Yunzhen",
157
+ "Subramonian, Arjun",
158
+ "Kempe, Julia",
159
+ ]);
160
+ expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
161
+ expect(response.body.data.metadata.citation_online_date).toBe(
162
+ "2024/10/08",
163
+ );
164
+ expect(response.body.data.metadata.citation_pdf_url).toBe(
165
+ "http://arxiv.org/pdf/2410.04840",
166
+ );
167
+ expect(response.body.data.metadata.citation_arxiv_id).toBe(
168
+ "2410.04840",
169
+ );
170
+ expect(response.body.data.metadata.citation_abstract).toContain(
171
+ "Within the scaling laws paradigm",
172
+ );
173
+ expect(response.body.data.metadata.sourceURL).toBe(
174
+ "https://arxiv.org/abs/2410.04840",
175
+ );
176
+ expect(response.body.data.metadata.statusCode).toBe(200);
177
+ },
178
+ 30000,
179
+ );
180
+ it.concurrent(
181
+ "should return a successful response with a valid API key and includeHtml set to true",
182
+ async () => {
183
+ const scrapeRequest: ScrapeRequestInput = {
184
+ url: "https://roastmywebsite.ai",
185
+ formats: ["markdown", "html"],
186
+ };
187
+
188
+ const response: any = await request(TEST_URL)
189
+ .post("/v1/scrape")
190
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
191
+ .set("Content-Type", "application/json")
192
+ .send(scrapeRequest);
193
+
194
+ expect(response.statusCode).toBe(200);
195
+ expect(response.body).toHaveProperty("data");
196
+ if (!("data" in response.body)) {
197
+ throw new Error("Expected response body to have 'data' property");
198
+ }
199
+ expect(response.body.data).toHaveProperty("markdown");
200
+ expect(response.body.data).toHaveProperty("html");
201
+ expect(response.body.data).toHaveProperty("metadata");
202
+ expect(response.body.data.markdown).toContain("_Roast_");
203
+ expect(response.body.data.html).toContain("<h1");
204
+ expect(response.body.data.metadata.statusCode).toBe(200);
205
+ expect(response.body.data.metadata.error).toBeUndefined();
206
+ },
207
+ 30000,
208
+ );
209
+ it.concurrent(
210
+ "should return a successful response for a valid scrape with PDF file",
211
+ async () => {
212
+ const scrapeRequest: ScrapeRequestInput = {
213
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
214
+ // formats: ["markdown", "html"],
215
+ };
216
+ const response: any = await request(TEST_URL)
217
+ .post("/v1/scrape")
218
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
219
+ .set("Content-Type", "application/json")
220
+ .send(scrapeRequest);
221
+ await new Promise((r) => setTimeout(r, 6000));
222
+
223
+ expect(response.statusCode).toBe(200);
224
+ expect(response.body).toHaveProperty("data");
225
+ if (!("data" in response.body)) {
226
+ throw new Error("Expected response body to have 'data' property");
227
+ }
228
+ expect(response.body.data).toHaveProperty("metadata");
229
+ expect(response.body.data.markdown).toContain(
230
+ "Broad Line Radio Galaxy",
231
+ );
232
+ expect(response.body.data.metadata.statusCode).toBe(200);
233
+ expect(response.body.data.metadata.error).toBeUndefined();
234
+ },
235
+ 60000,
236
+ );
237
+
238
+ it.concurrent(
239
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
240
+ async () => {
241
+ const scrapeRequest: ScrapeRequestInput = {
242
+ url: "https://arxiv.org/pdf/astro-ph/9301001",
243
+ };
244
+ const response: any = await request(TEST_URL)
245
+ .post("/v1/scrape")
246
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
247
+ .set("Content-Type", "application/json")
248
+ .send(scrapeRequest);
249
+ await new Promise((r) => setTimeout(r, 6000));
250
+
251
+ expect(response.statusCode).toBe(200);
252
+ expect(response.body).toHaveProperty("data");
253
+ if (!("data" in response.body)) {
254
+ throw new Error("Expected response body to have 'data' property");
255
+ }
256
+ expect(response.body.data).toHaveProperty("markdown");
257
+ expect(response.body.data).toHaveProperty("metadata");
258
+ expect(response.body.data.markdown).toContain(
259
+ "Broad Line Radio Galaxy",
260
+ );
261
+ expect(response.body.data.metadata.statusCode).toBe(200);
262
+ expect(response.body.data.metadata.error).toBeUndefined();
263
+ },
264
+ 60000,
265
+ );
266
+
267
+ it.concurrent(
268
+ "should return a successful response with a valid API key with removeTags option",
269
+ async () => {
270
+ const scrapeRequest: ScrapeRequestInput = {
271
+ url: "https://www.scrapethissite.com/",
272
+ onlyMainContent: false, // default is true
273
+ };
274
+ const responseWithoutRemoveTags: any = await request(TEST_URL)
275
+ .post("/v1/scrape")
276
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
277
+ .set("Content-Type", "application/json")
278
+ .send(scrapeRequest);
279
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
280
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
281
+
282
+ if (!("data" in responseWithoutRemoveTags.body)) {
283
+ throw new Error("Expected response body to have 'data' property");
284
+ }
285
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
286
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
287
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
288
+ expect(responseWithoutRemoveTags.body.data.markdown).toContain(
289
+ "[FAQ](/faq/)",
290
+ ); // .nav
291
+ expect(responseWithoutRemoveTags.body.data.markdown).toContain(
292
+ "Hartley Brody 2023",
293
+ ); // #footer
294
+
295
+ const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
296
+ url: "https://www.scrapethissite.com/",
297
+ excludeTags: [".nav", "#footer", "strong"],
298
+ onlyMainContent: false, // default is true
299
+ };
300
+ const response: any = await request(TEST_URL)
301
+ .post("/v1/scrape")
302
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
303
+ .set("Content-Type", "application/json")
304
+ .send(scrapeRequestWithRemoveTags);
305
+
306
+ expect(response.statusCode).toBe(200);
307
+ expect(response.body).toHaveProperty("data");
308
+ if (!("data" in response.body)) {
309
+ throw new Error("Expected response body to have 'data' property");
310
+ }
311
+ expect(response.body.data).toHaveProperty("markdown");
312
+ expect(response.body.data).toHaveProperty("metadata");
313
+ expect(response.body.data).not.toHaveProperty("html");
314
+ expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
315
+ expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
316
+ },
317
+ 30000,
318
+ );
319
+
320
+ it.concurrent(
321
+ "should return a successful response for a scrape with 400 page",
322
+ async () => {
323
+ const response: any = await request(TEST_URL)
324
+ .post("/v1/scrape")
325
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
326
+ .set("Content-Type", "application/json")
327
+ .send({ url: "https://httpstat.us/400" });
328
+ await new Promise((r) => setTimeout(r, 5000));
329
+
330
+ expect(response.statusCode).toBe(200);
331
+ expect(response.body).toHaveProperty("data");
332
+ if (!("data" in response.body)) {
333
+ throw new Error("Expected response body to have 'data' property");
334
+ }
335
+ expect(response.body.data).toHaveProperty("markdown");
336
+ expect(response.body.data).toHaveProperty("metadata");
337
+ expect(response.body.data.metadata.statusCode).toBe(400);
338
+ },
339
+ 60000,
340
+ );
341
+
342
+ it.concurrent(
343
+ "should return a successful response for a scrape with 401 page",
344
+ async () => {
345
+ const response: any = await request(TEST_URL)
346
+ .post("/v1/scrape")
347
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
348
+ .set("Content-Type", "application/json")
349
+ .send({ url: "https://httpstat.us/401" });
350
+ await new Promise((r) => setTimeout(r, 5000));
351
+
352
+ expect(response.statusCode).toBe(200);
353
+ expect(response.body).toHaveProperty("data");
354
+ if (!("data" in response.body)) {
355
+ throw new Error("Expected response body to have 'data' property");
356
+ }
357
+ expect(response.body.data).toHaveProperty("markdown");
358
+ expect(response.body.data).toHaveProperty("metadata");
359
+ expect(response.body.data.metadata.statusCode).toBe(401);
360
+ },
361
+ 60000,
362
+ );
363
+
364
+ // Removed it as we want to retry fallback to the next scraper
365
+ // it.concurrent('should return a successful response for a scrape with 403 page', async () => {
366
+ // const response: any = await request(TEST_URL)
367
+ // .post('/v1/scrape')
368
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
369
+ // .set('Content-Type', 'application/json')
370
+ // .send({ url: 'https://httpstat.us/403' });
371
+ // await new Promise((r) => setTimeout(r, 5000));
372
+
373
+ // expect(response.statusCode).toBe(200);
374
+ // expect(response.body).toHaveProperty('data');
375
+ // if (!("data" in response.body)) {
376
+ // throw new Error("Expected response body to have 'data' property");
377
+ // }
378
+ // expect(response.body.data).toHaveProperty('markdown');
379
+ // expect(response.body.data).toHaveProperty('metadata');
380
+ // expect(response.body.data.metadata.statusCode).toBe(403);
381
+ // }, 60000);
382
+
383
+ it.concurrent(
384
+ "should return a successful response for a scrape with 404 page",
385
+ async () => {
386
+ const response: any = await request(TEST_URL)
387
+ .post("/v1/scrape")
388
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
389
+ .set("Content-Type", "application/json")
390
+ .send({ url: "https://httpstat.us/404" });
391
+ await new Promise((r) => setTimeout(r, 5000));
392
+
393
+ expect(response.statusCode).toBe(200);
394
+ expect(response.body).toHaveProperty("data");
395
+ if (!("data" in response.body)) {
396
+ throw new Error("Expected response body to have 'data' property");
397
+ }
398
+ expect(response.body.data).toHaveProperty("markdown");
399
+ expect(response.body.data).toHaveProperty("metadata");
400
+ expect(response.body.data.metadata.statusCode).toBe(404);
401
+ },
402
+ 60000,
403
+ );
404
+
405
+ // it.concurrent('should return a successful response for a scrape with 405 page', async () => {
406
+ // const response: any = await request(TEST_URL)
407
+ // .post('/v1/scrape')
408
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
409
+ // .set('Content-Type', 'application/json')
410
+ // .send({ url: 'https://httpstat.us/405' });
411
+ // await new Promise((r) => setTimeout(r, 5000));
412
+
413
+ // expect(response.statusCode).toBe(200);
414
+ // expect(response.body).toHaveProperty('data');
415
+ // if (!("data" in response.body)) {
416
+ // throw new Error("Expected response body to have 'data' property");
417
+ // }
418
+ // expect(response.body.data).toHaveProperty('markdown');
419
+ // expect(response.body.data).toHaveProperty('metadata');
420
+ // expect(response.body.data.metadata.statusCode).toBe(405);
421
+ // }, 60000);
422
+
423
+ // it.concurrent('should return a successful response for a scrape with 500 page', async () => {
424
+ // const response: any = await request(TEST_URL)
425
+ // .post('/v1/scrape')
426
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
427
+ // .set('Content-Type', 'application/json')
428
+ // .send({ url: 'https://httpstat.us/500' });
429
+ // await new Promise((r) => setTimeout(r, 5000));
430
+
431
+ // expect(response.statusCode).toBe(200);
432
+ // expect(response.body).toHaveProperty('data');
433
+ // if (!("data" in response.body)) {
434
+ // throw new Error("Expected response body to have 'data' property");
435
+ // }
436
+ // expect(response.body.data).toHaveProperty('markdown');
437
+ // expect(response.body.data).toHaveProperty('metadata');
438
+ // expect(response.body.data.metadata.statusCode).toBe(500);
439
+ // }, 60000);
440
+
441
+ it.concurrent(
442
+ "should return a timeout error when scraping takes longer than the specified timeout",
443
+ async () => {
444
+ const response: any = await request(TEST_URL)
445
+ .post("/v1/scrape")
446
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
447
+ .set("Content-Type", "application/json")
448
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
449
+
450
+ expect(response.statusCode).toBe(408);
451
+ },
452
+ 3000,
453
+ );
454
+
455
+ it.concurrent(
456
+ "should return a successful response with a valid API key and includeHtml set to true",
457
+ async () => {
458
+ const scrapeRequest: ScrapeRequestInput = {
459
+ url: "https://roastmywebsite.ai",
460
+ formats: ["html", "rawHtml"],
461
+ };
462
+
463
+ const response: any = await request(TEST_URL)
464
+ .post("/v1/scrape")
465
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
466
+ .set("Content-Type", "application/json")
467
+ .send(scrapeRequest);
468
+
469
+ expect(response.statusCode).toBe(200);
470
+ expect(response.body).toHaveProperty("data");
471
+ if (!("data" in response.body)) {
472
+ throw new Error("Expected response body to have 'data' property");
473
+ }
474
+ expect(response.body.data).not.toHaveProperty("markdown");
475
+ expect(response.body.data).toHaveProperty("html");
476
+ expect(response.body.data).toHaveProperty("rawHtml");
477
+ expect(response.body.data).toHaveProperty("metadata");
478
+ expect(response.body.data.html).toContain("<h1");
479
+ expect(response.body.data.rawHtml).toContain("<html");
480
+ expect(response.body.data.metadata.statusCode).toBe(200);
481
+ expect(response.body.data.metadata.error).toBeUndefined();
482
+ },
483
+ 30000,
484
+ );
485
+
486
+ it.concurrent(
487
+ "should return a successful response with waitFor",
488
+ async () => {
489
+ const scrapeRequest: ScrapeRequestInput = {
490
+ url: "https://ycombinator.com/companies",
491
+ formats: ["markdown"],
492
+ waitFor: 8000,
493
+ };
494
+
495
+ const response: any = await request(TEST_URL)
496
+ .post("/v1/scrape")
497
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
498
+ .set("Content-Type", "application/json")
499
+ .send(scrapeRequest);
500
+
501
+ expect(response.statusCode).toBe(200);
502
+ expect(response.body).toHaveProperty("data");
503
+ if (!("data" in response.body)) {
504
+ throw new Error("Expected response body to have 'data' property");
505
+ }
506
+ expect(response.body.data).toHaveProperty("markdown");
507
+ expect(response.body.data).not.toHaveProperty("html");
508
+ expect(response.body.data).not.toHaveProperty("links");
509
+ expect(response.body.data).not.toHaveProperty("rawHtml");
510
+ expect(response.body.data).toHaveProperty("metadata");
511
+ expect(response.body.data.markdown).toContain("PagerDuty");
512
+ expect(response.body.data.metadata.statusCode).toBe(200);
513
+ expect(response.body.data.metadata.error).toBeUndefined();
514
+ },
515
+ 30000,
516
+ );
517
+
518
+ it.concurrent(
519
+ "should return a successful response with a valid links on page",
520
+ async () => {
521
+ const scrapeRequest: ScrapeRequestInput = {
522
+ url: "https://roastmywebsite.ai",
523
+ formats: ["links"],
524
+ };
525
+
526
+ const response: any = await request(TEST_URL)
527
+ .post("/v1/scrape")
528
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
529
+ .set("Content-Type", "application/json")
530
+ .send(scrapeRequest);
531
+
532
+ expect(response.statusCode).toBe(200);
533
+ expect(response.body).toHaveProperty("data");
534
+ if (!("data" in response.body)) {
535
+ throw new Error("Expected response body to have 'data' property");
536
+ }
537
+ expect(response.body.data).not.toHaveProperty("html");
538
+ expect(response.body.data).not.toHaveProperty("rawHtml");
539
+ expect(response.body.data).toHaveProperty("links");
540
+ expect(response.body.data).toHaveProperty("metadata");
541
+ expect(response.body.data.links).toContain("https://firecrawl.dev");
542
+ expect(response.body.data.metadata.statusCode).toBe(200);
543
+ expect(response.body.data.metadata.error).toBeUndefined();
544
+ },
545
+ 30000,
546
+ );
547
+ });
548
+
549
+ describe("POST /v1/map", () => {
550
+ it.concurrent("should require authorization", async () => {
551
+ const response: any = await request(TEST_URL)
552
+ .post("/v1/map")
553
+ .send({ url: "https://firecrawl.dev" });
554
+ expect(response.statusCode).toBe(401);
555
+ });
556
+
557
+ it.concurrent(
558
+ "should return an error response with an invalid API key",
559
+ async () => {
560
+ const response: any = await request(TEST_URL)
561
+ .post("/v1/map")
562
+ .set("Authorization", `Bearer invalid-api-key`)
563
+ .set("Content-Type", "application/json")
564
+ .send({ url: "https://firecrawl.dev" });
565
+ expect(response.statusCode).toBe(401);
566
+ },
567
+ );
568
+
569
+ it.concurrent(
570
+ "should return a successful response with a valid API key",
571
+ async () => {
572
+ const mapRequest = {
573
+ url: "https://roastmywebsite.ai",
574
+ };
575
+
576
+ const response: any = await request(TEST_URL)
577
+ .post("/v1/map")
578
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
579
+ .set("Content-Type", "application/json")
580
+ .send(mapRequest);
581
+
582
+ expect(response.statusCode).toBe(200);
583
+ expect(response.body).toHaveProperty("success", true);
584
+ expect(response.body).toHaveProperty("links");
585
+ if (!("links" in response.body)) {
586
+ throw new Error("Expected response body to have 'links' property");
587
+ }
588
+ const links = response.body.links as unknown[];
589
+ expect(Array.isArray(links)).toBe(true);
590
+ expect(links.length).toBeGreaterThan(0);
591
+ },
592
+ );
593
+
594
+ it.concurrent(
595
+ "should return a successful response with a valid API key and search",
596
+ async () => {
597
+ const mapRequest = {
598
+ url: "https://usemotion.com",
599
+ search: "pricing",
600
+ };
601
+
602
+ const response: any = await request(TEST_URL)
603
+ .post("/v1/map")
604
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
605
+ .set("Content-Type", "application/json")
606
+ .send(mapRequest);
607
+
608
+ expect(response.statusCode).toBe(200);
609
+ expect(response.body).toHaveProperty("success", true);
610
+ expect(response.body).toHaveProperty("links");
611
+ if (!("links" in response.body)) {
612
+ throw new Error("Expected response body to have 'links' property");
613
+ }
614
+ const links = response.body.links as unknown[];
615
+ expect(Array.isArray(links)).toBe(true);
616
+ expect(links.length).toBeGreaterThan(0);
617
+ expect(links[0]).toContain("usemotion.com/pricing");
618
+ },
619
+ );
620
+
621
+ it.concurrent(
622
+ "should return a successful response with a valid API key and search and allowSubdomains",
623
+ async () => {
624
+ const mapRequest = {
625
+ url: "https://firecrawl.dev",
626
+ search: "docs",
627
+ includeSubdomains: true,
628
+ };
629
+
630
+ const response: any = await request(TEST_URL)
631
+ .post("/v1/map")
632
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
633
+ .set("Content-Type", "application/json")
634
+ .send(mapRequest);
635
+
636
+ expect(response.statusCode).toBe(200);
637
+ expect(response.body).toHaveProperty("success", true);
638
+ expect(response.body).toHaveProperty("links");
639
+ if (!("links" in response.body)) {
640
+ throw new Error("Expected response body to have 'links' property");
641
+ }
642
+ const links = response.body.links as unknown[];
643
+ expect(Array.isArray(links)).toBe(true);
644
+ expect(links.length).toBeGreaterThan(0);
645
+
646
+ const containsDocsFirecrawlDev = links.some((link: string) =>
647
+ link.includes("docs.firecrawl.dev"),
648
+ );
649
+ expect(containsDocsFirecrawlDev).toBe(true);
650
+ },
651
+ );
652
+
653
+ it.concurrent(
654
+ "should return a successful response with a valid API key and search and allowSubdomains and www",
655
+ async () => {
656
+ const mapRequest = {
657
+ url: "https://www.firecrawl.dev",
658
+ search: "docs",
659
+ includeSubdomains: true,
660
+ };
661
+
662
+ const response: any = await request(TEST_URL)
663
+ .post("/v1/map")
664
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
665
+ .set("Content-Type", "application/json")
666
+ .send(mapRequest);
667
+
668
+ expect(response.statusCode).toBe(200);
669
+ expect(response.body).toHaveProperty("success", true);
670
+ expect(response.body).toHaveProperty("links");
671
+ if (!("links" in response.body)) {
672
+ throw new Error("Expected response body to have 'links' property");
673
+ }
674
+ const links = response.body.links as unknown[];
675
+ expect(Array.isArray(links)).toBe(true);
676
+ expect(links.length).toBeGreaterThan(0);
677
+
678
+ const containsDocsFirecrawlDev = links.some((link: string) =>
679
+ link.includes("docs.firecrawl.dev"),
680
+ );
681
+ expect(containsDocsFirecrawlDev).toBe(true);
682
+ },
683
+ 10000,
684
+ );
685
+
686
+ it.concurrent(
687
+ "should return a successful response with a valid API key and search and not allowSubdomains and www",
688
+ async () => {
689
+ const mapRequest = {
690
+ url: "https://www.firecrawl.dev",
691
+ search: "docs",
692
+ includeSubdomains: false,
693
+ };
694
+
695
+ const response: any = await request(TEST_URL)
696
+ .post("/v1/map")
697
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
698
+ .set("Content-Type", "application/json")
699
+ .send(mapRequest);
700
+
701
+ expect(response.statusCode).toBe(200);
702
+ expect(response.body).toHaveProperty("success", true);
703
+ expect(response.body).toHaveProperty("links");
704
+ if (!("links" in response.body)) {
705
+ throw new Error("Expected response body to have 'links' property");
706
+ }
707
+ const links = response.body.links as unknown[];
708
+ expect(Array.isArray(links)).toBe(true);
709
+ expect(links.length).toBeGreaterThan(0);
710
+ expect(links[0]).not.toContain("docs.firecrawl.dev");
711
+ },
712
+ );
713
+
714
+ it.concurrent("should return an error for invalid URL", async () => {
715
+ const mapRequest = {
716
+ url: "invalid-url",
717
+ includeSubdomains: true,
718
+ search: "test",
719
+ };
720
+
721
+ const response: any = await request(TEST_URL)
722
+ .post("/v1/map")
723
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
724
+ .set("Content-Type", "application/json")
725
+ .send(mapRequest);
726
+
727
+ expect(response.statusCode).toBe(400);
728
+ expect(response.body).toHaveProperty("success", false);
729
+ expect(response.body).toHaveProperty("error");
730
+ });
731
+ });
732
+
733
+ describe("POST /v1/crawl", () => {
734
+ it.concurrent("should require authorization", async () => {
735
+ const response: any = await request(TEST_URL)
736
+ .post("/v1/crawl")
737
+ .send({ url: "https://firecrawl.dev" });
738
+ expect(response.statusCode).toBe(401);
739
+ });
740
+
741
+ it.concurrent("should throw error for blocklisted URL", async () => {
742
+ const scrapeRequest: ScrapeRequestInput = {
743
+ url: "https://facebook.com/fake-test",
744
+ };
745
+
746
+ const response = await request(TEST_URL)
747
+ .post("/v1/crawl")
748
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
749
+ .set("Content-Type", "application/json")
750
+ .send(scrapeRequest);
751
+
752
+ expect(response.statusCode).toBe(403);
753
+ expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
754
+ });
755
+
756
+ it.concurrent(
757
+ "should return an error response with an invalid API key",
758
+ async () => {
759
+ const response: any = await request(TEST_URL)
760
+ .post("/v1/crawl")
761
+ .set("Authorization", `Bearer invalid-api-key`)
762
+ .set("Content-Type", "application/json")
763
+ .send({ url: "https://firecrawl.dev" });
764
+ expect(response.statusCode).toBe(401);
765
+ },
766
+ );
767
+
768
+ it.concurrent("should return a successful response", async () => {
769
+ const response = await request(TEST_URL)
770
+ .post("/v1/crawl")
771
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
772
+ .set("Content-Type", "application/json")
773
+ .send({ url: "https://firecrawl.dev" });
774
+
775
+ expect(response.statusCode).toBe(200);
776
+ expect(response.body).toHaveProperty("id");
777
+ expect(response.body.id).toMatch(
778
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
779
+ );
780
+ expect(response.body).toHaveProperty("success", true);
781
+ expect(response.body).toHaveProperty("url");
782
+ expect(response.body.url).toContain("/v1/crawl/");
783
+ });
784
+
785
+ it.concurrent(
786
+ "should return a successful response with a valid API key and valid includes option",
787
+ async () => {
788
+ const crawlResponse = await request(TEST_URL)
789
+ .post("/v1/crawl")
790
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
791
+ .set("Content-Type", "application/json")
792
+ .send({
793
+ url: "https://firecrawl.dev",
794
+ limit: 40,
795
+ includePaths: ["blog/*"],
796
+ });
797
+
798
+ let response;
799
+ let isFinished = false;
800
+
801
+ while (!isFinished) {
802
+ response = await request(TEST_URL)
803
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
804
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
805
+
806
+ expect(response.statusCode).toBe(200);
807
+ expect(response.body).toHaveProperty("status");
808
+ isFinished = response.body.status === "completed";
809
+
810
+ if (!isFinished) {
811
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
812
+ }
813
+ }
814
+
815
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
816
+ const completedResponse = await request(TEST_URL)
817
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
818
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
819
+
820
+ const urls = completedResponse.body.data.map(
821
+ (item: any) => item.metadata?.sourceURL,
822
+ );
823
+ expect(urls.length).toBeGreaterThan(5);
824
+ urls.forEach((url: string) => {
825
+ expect(url).toContain("firecrawl.dev/blog");
826
+ });
827
+
828
+ expect(completedResponse.statusCode).toBe(200);
829
+ expect(completedResponse.body).toHaveProperty("status");
830
+ expect(completedResponse.body.status).toBe("completed");
831
+ expect(completedResponse.body).toHaveProperty("data");
832
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
833
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
834
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
835
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
836
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
837
+ },
838
+ 180000,
839
+ ); // 180 seconds
840
+
841
+ it.concurrent(
842
+ "should return a successful response with a valid API key and valid excludes option",
843
+ async () => {
844
+ const crawlResponse = await request(TEST_URL)
845
+ .post("/v1/crawl")
846
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
847
+ .set("Content-Type", "application/json")
848
+ .send({
849
+ url: "https://firecrawl.dev",
850
+ limit: 40,
851
+ excludePaths: ["blog/*"],
852
+ });
853
+
854
+ let isFinished = false;
855
+ let response;
856
+
857
+ while (!isFinished) {
858
+ response = await request(TEST_URL)
859
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
860
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
861
+
862
+ expect(response.statusCode).toBe(200);
863
+ expect(response.body).toHaveProperty("status");
864
+ isFinished = response.body.status === "completed";
865
+
866
+ if (!isFinished) {
867
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
868
+ }
869
+ }
870
+
871
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
872
+ const completedResponse = await request(TEST_URL)
873
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
874
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
875
+
876
+ const urls = completedResponse.body.data.map(
877
+ (item: any) => item.metadata?.sourceURL,
878
+ );
879
+ expect(urls.length).toBeGreaterThan(3);
880
+ urls.forEach((url: string) => {
881
+ expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
882
+ });
883
+ },
884
+ 90000,
885
+ ); // 90 seconds
886
+
887
+ it.concurrent(
888
+ "should return a successful response with max depth option for a valid crawl job",
889
+ async () => {
890
+ const crawlResponse = await request(TEST_URL)
891
+ .post("/v1/crawl")
892
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
893
+ .set("Content-Type", "application/json")
894
+ .send({
895
+ url: "https://www.scrapethissite.com",
896
+ maxDepth: 1,
897
+ });
898
+ expect(crawlResponse.statusCode).toBe(200);
899
+
900
+ const response = await request(TEST_URL)
901
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
902
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
903
+ expect(response.statusCode).toBe(200);
904
+ expect(response.body).toHaveProperty("status");
905
+ expect(["active", "waiting", "completed", "scraping"]).toContain(
906
+ response.body.status,
907
+ );
908
+ // wait for 60 seconds
909
+ let isCompleted = false;
910
+ while (!isCompleted) {
911
+ const statusCheckResponse = await request(TEST_URL)
912
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
913
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
914
+ expect(statusCheckResponse.statusCode).toBe(200);
915
+ isCompleted = statusCheckResponse.body.status === "completed";
916
+ if (!isCompleted) {
917
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
918
+ }
919
+ }
920
+ const completedResponse = await request(TEST_URL)
921
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
922
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
923
+
924
+ expect(completedResponse.statusCode).toBe(200);
925
+ expect(completedResponse.body).toHaveProperty("status");
926
+ expect(completedResponse.body.status).toBe("completed");
927
+ expect(completedResponse.body).toHaveProperty("data");
928
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content");
929
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
930
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
931
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
932
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
933
+ const urls = completedResponse.body.data.map(
934
+ (item: any) => item.metadata?.sourceURL,
935
+ );
936
+ expect(urls.length).toBeGreaterThan(1);
937
+
938
+ // Check if all URLs have a maximum depth of 1
939
+ urls.forEach((url: string) => {
940
+ const pathSplits = new URL(url).pathname.split("/");
941
+ const depth =
942
+ pathSplits.length -
943
+ (pathSplits[0].length === 0 &&
944
+ pathSplits[pathSplits.length - 1].length === 0
945
+ ? 1
946
+ : 0);
947
+ expect(depth).toBeLessThanOrEqual(2);
948
+ });
949
+ },
950
+ 180000,
951
+ );
952
+ });
953
+
954
+ describe("GET /v1/crawl/:jobId", () => {
955
+ it.concurrent("should require authorization", async () => {
956
+ const response = await request(TEST_URL).get("/v1/crawl/123");
957
+ expect(response.statusCode).toBe(401);
958
+ });
959
+
960
+ it.concurrent(
961
+ "should return an error response with an invalid API key",
962
+ async () => {
963
+ const response = await request(TEST_URL)
964
+ .get("/v1/crawl/123")
965
+ .set("Authorization", `Bearer invalid-api-key`);
966
+ expect(response.statusCode).toBe(401);
967
+ },
968
+ );
969
+
970
+ it.concurrent(
971
+ "should return Job not found for invalid job ID",
972
+ async () => {
973
+ const response = await request(TEST_URL)
974
+ .get("/v1/crawl/invalidJobId")
975
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
976
+ expect(response.statusCode).toBe(404);
977
+ },
978
+ );
979
+
980
+ it.concurrent(
981
+ "should return a successful crawl status response for a valid crawl job",
982
+ async () => {
983
+ const crawlResponse = await request(TEST_URL)
984
+ .post("/v1/crawl")
985
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
986
+ .set("Content-Type", "application/json")
987
+ .send({ url: "https://docs.firecrawl.dev" });
988
+ expect(crawlResponse.statusCode).toBe(200);
989
+
990
+ let isCompleted = false;
991
+
992
+ while (!isCompleted) {
993
+ const response = await request(TEST_URL)
994
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
995
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
996
+ expect(response.statusCode).toBe(200);
997
+ expect(response.body).toHaveProperty("status");
998
+
999
+ if (response.body.status === "completed") {
1000
+ isCompleted = true;
1001
+ } else {
1002
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1003
+ }
1004
+ }
1005
+
1006
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
1007
+ const completedResponse = await request(TEST_URL)
1008
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
1009
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1010
+
1011
+ expect(completedResponse.body).toHaveProperty("status");
1012
+ expect(completedResponse.body.status).toBe("completed");
1013
+ expect(completedResponse.body).toHaveProperty("data");
1014
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content");
1015
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1016
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1017
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
1018
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
1019
+
1020
+ const childrenLinks = completedResponse.body.data.filter(
1021
+ (doc) => doc.metadata && doc.metadata.sourceURL,
1022
+ );
1023
+
1024
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
1025
+ },
1026
+ 180000,
1027
+ ); // 120 seconds
1028
+
1029
+ it.concurrent(
1030
+ "If someone cancels a crawl job, it should turn into failed status",
1031
+ async () => {
1032
+ const crawlResponse = await request(TEST_URL)
1033
+ .post("/v1/crawl")
1034
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1035
+ .set("Content-Type", "application/json")
1036
+ .send({ url: "https://docs.firecrawl.dev", limit: 10 });
1037
+
1038
+ expect(crawlResponse.statusCode).toBe(200);
1039
+
1040
+ await new Promise((r) => setTimeout(r, 10000));
1041
+
1042
+ const responseCancel = await request(TEST_URL)
1043
+ .delete(`/v1/crawl/${crawlResponse.body.id}`)
1044
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1045
+ expect(responseCancel.statusCode).toBe(200);
1046
+ expect(responseCancel.body).toHaveProperty("status");
1047
+ expect(responseCancel.body.status).toBe("cancelled");
1048
+
1049
+ await new Promise((r) => setTimeout(r, 10000));
1050
+ const completedResponse = await request(TEST_URL)
1051
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
1052
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1053
+
1054
+ expect(completedResponse.statusCode).toBe(200);
1055
+ expect(completedResponse.body).toHaveProperty("status");
1056
+ expect(completedResponse.body.status).toBe("cancelled");
1057
+ expect(completedResponse.body).toHaveProperty("data");
1058
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1059
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1060
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
1061
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
1062
+ },
1063
+ 60000,
1064
+ ); // 60 seconds
1065
+ });
1066
+ });
src/__tests__/e2e_v1_withAuth_all_params/index.test.ts ADDED
@@ -0,0 +1,711 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import { configDotenv } from "dotenv";
3
+ import { ScrapeRequest } from "../../controllers/v1/types";
4
+
5
+ configDotenv();
6
+ const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
7
+ const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
8
+
9
+ describe("E2E Tests for v1 API Routes", () => {
10
+ it.concurrent(
11
+ "should return a successful response for a scrape with 403 page",
12
+ async () => {
13
+ const response: any = await request(FIRECRAWL_API_URL)
14
+ .post("/v1/scrape")
15
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
16
+ .set("Content-Type", "application/json")
17
+ .send({ url: "https://httpstat.us/403" });
18
+
19
+ expect(response.statusCode).toBe(200);
20
+ expect(response.body).toHaveProperty("data");
21
+ if (!("data" in response.body)) {
22
+ throw new Error("Expected response body to have 'data' property");
23
+ }
24
+ expect(response.body.data).toHaveProperty("markdown");
25
+ expect(response.body.data).toHaveProperty("metadata");
26
+ expect(response.body.data.metadata.statusCode).toBe(403);
27
+ },
28
+ 30000,
29
+ );
30
+
31
+ it.concurrent(
32
+ "should handle 'formats:markdown (default)' parameter correctly",
33
+ async () => {
34
+ const scrapeRequest = {
35
+ url: E2E_TEST_SERVER_URL,
36
+ } as ScrapeRequest;
37
+
38
+ const response: any = await request(FIRECRAWL_API_URL)
39
+ .post("/v1/scrape")
40
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
41
+ .set("Content-Type", "application/json")
42
+ .send(scrapeRequest);
43
+
44
+ expect(response.statusCode).toBe(200);
45
+ expect(response.body).toHaveProperty("data");
46
+ if (!("data" in response.body)) {
47
+ throw new Error("Expected response body to have 'data' property");
48
+ }
49
+
50
+ expect(response.body.data).toHaveProperty("markdown");
51
+
52
+ expect(response.body.data.markdown).toContain(
53
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
54
+ );
55
+ expect(response.body.data.markdown).toContain(
56
+ "Content with id #content-1",
57
+ );
58
+ // expect(response.body.data.markdown).toContain("Loading...");
59
+ expect(response.body.data.markdown).toContain("Click me!");
60
+ expect(response.body.data.markdown).toContain(
61
+ "Power your AI apps with clean data crawled from any website. It's also open-source.",
62
+ ); // firecrawl.dev inside an iframe
63
+ expect(response.body.data.markdown).toContain(
64
+ "This content loads only when you see it. Don't blink! 👼",
65
+ ); // the browser always scroll to the bottom
66
+ expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
67
+ expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
68
+ expect(response.body.data.markdown).not.toContain(
69
+ "This content is only visible on mobile",
70
+ );
71
+ },
72
+ 30000,
73
+ );
74
+
75
+ it.concurrent(
76
+ "should handle 'formats:html' parameter correctly",
77
+ async () => {
78
+ const scrapeRequest = {
79
+ url: E2E_TEST_SERVER_URL,
80
+ formats: ["html"],
81
+ } as ScrapeRequest;
82
+
83
+ const response: any = await request(FIRECRAWL_API_URL)
84
+ .post("/v1/scrape")
85
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
86
+ .set("Content-Type", "application/json")
87
+ .send(scrapeRequest);
88
+
89
+ expect(response.statusCode).toBe(200);
90
+ expect(response.body).toHaveProperty("data");
91
+ if (!("data" in response.body)) {
92
+ throw new Error("Expected response body to have 'data' property");
93
+ }
94
+
95
+ expect(response.body.data).not.toHaveProperty("markdown");
96
+ expect(response.body.data).toHaveProperty("html");
97
+
98
+ expect(response.body.data.html).not.toContain(
99
+ '<header class="row-start-1" style="">Header</header>',
100
+ );
101
+ expect(response.body.data.html).toContain(
102
+ '<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>',
103
+ );
104
+ },
105
+ 30000,
106
+ );
107
+
108
+ it.concurrent(
109
+ "should handle 'rawHtml' in 'formats' parameter correctly",
110
+ async () => {
111
+ const scrapeRequest = {
112
+ url: E2E_TEST_SERVER_URL,
113
+ formats: ["rawHtml"],
114
+ } as ScrapeRequest;
115
+
116
+ const response: any = await request(FIRECRAWL_API_URL)
117
+ .post("/v1/scrape")
118
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
119
+ .set("Content-Type", "application/json")
120
+ .send(scrapeRequest);
121
+
122
+ expect(response.statusCode).toBe(200);
123
+ expect(response.body).toHaveProperty("data");
124
+ if (!("data" in response.body)) {
125
+ throw new Error("Expected response body to have 'data' property");
126
+ }
127
+
128
+ expect(response.body.data).not.toHaveProperty("markdown");
129
+ expect(response.body.data).toHaveProperty("rawHtml");
130
+
131
+ expect(response.body.data.rawHtml).toContain(
132
+ ">This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
133
+ );
134
+ expect(response.body.data.rawHtml).toContain(">Header</header>");
135
+ },
136
+ 30000,
137
+ );
138
+
139
+ // - TODO: tests for links
140
+ // - TODO: tests for screenshot
141
+ // - TODO: tests for screenshot@fullPage
142
+
143
+ it.concurrent(
144
+ "should handle 'headers' parameter correctly",
145
+ async () => {
146
+ // @ts-ignore
147
+ const scrapeRequest = {
148
+ url: E2E_TEST_SERVER_URL,
149
+ headers: { "e2e-header-test": "firecrawl" },
150
+ } as ScrapeRequest;
151
+
152
+ const response: any = await request(FIRECRAWL_API_URL)
153
+ .post("/v1/scrape")
154
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
155
+ .set("Content-Type", "application/json")
156
+ .send(scrapeRequest);
157
+
158
+ expect(response.statusCode).toBe(200);
159
+ expect(response.body).toHaveProperty("data");
160
+ if (!("data" in response.body)) {
161
+ throw new Error("Expected response body to have 'data' property");
162
+ }
163
+
164
+ expect(response.body.data.markdown).toContain(
165
+ "e2e-header-test: firecrawl",
166
+ );
167
+ },
168
+ 30000,
169
+ );
170
+
171
+ it.concurrent(
172
+ "should handle 'includeTags' parameter correctly",
173
+ async () => {
174
+ const scrapeRequest = {
175
+ url: E2E_TEST_SERVER_URL,
176
+ includeTags: ["#content-1"],
177
+ } as ScrapeRequest;
178
+
179
+ const response: any = await request(FIRECRAWL_API_URL)
180
+ .post("/v1/scrape")
181
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
182
+ .set("Content-Type", "application/json")
183
+ .send(scrapeRequest);
184
+
185
+ expect(response.statusCode).toBe(200);
186
+ expect(response.body).toHaveProperty("data");
187
+ if (!("data" in response.body)) {
188
+ throw new Error("Expected response body to have 'data' property");
189
+ }
190
+
191
+ expect(response.body.data.markdown).not.toContain(
192
+ "<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
193
+ );
194
+ expect(response.body.data.markdown).toContain(
195
+ "Content with id #content-1",
196
+ );
197
+ },
198
+ 30000,
199
+ );
200
+
201
+ it.concurrent(
202
+ "should handle 'excludeTags' parameter correctly",
203
+ async () => {
204
+ const scrapeRequest = {
205
+ url: E2E_TEST_SERVER_URL,
206
+ excludeTags: ["#content-1"],
207
+ } as ScrapeRequest;
208
+
209
+ const response: any = await request(FIRECRAWL_API_URL)
210
+ .post("/v1/scrape")
211
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
212
+ .set("Content-Type", "application/json")
213
+ .send(scrapeRequest);
214
+
215
+ expect(response.statusCode).toBe(200);
216
+ expect(response.body).toHaveProperty("data");
217
+ if (!("data" in response.body)) {
218
+ throw new Error("Expected response body to have 'data' property");
219
+ }
220
+
221
+ expect(response.body.data.markdown).toContain(
222
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
223
+ );
224
+ expect(response.body.data.markdown).not.toContain(
225
+ "Content with id #content-1",
226
+ );
227
+ },
228
+ 30000,
229
+ );
230
+
231
+ it.concurrent(
232
+ "should handle 'onlyMainContent' parameter correctly",
233
+ async () => {
234
+ const scrapeRequest = {
235
+ url: E2E_TEST_SERVER_URL,
236
+ formats: ["html", "markdown"],
237
+ onlyMainContent: false,
238
+ } as ScrapeRequest;
239
+
240
+ const response: any = await request(FIRECRAWL_API_URL)
241
+ .post("/v1/scrape")
242
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
243
+ .set("Content-Type", "application/json")
244
+ .send(scrapeRequest);
245
+
246
+ expect(response.statusCode).toBe(200);
247
+ expect(response.body).toHaveProperty("data");
248
+ if (!("data" in response.body)) {
249
+ throw new Error("Expected response body to have 'data' property");
250
+ }
251
+
252
+ expect(response.body.data.markdown).toContain(
253
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
254
+ );
255
+ expect(response.body.data.html).toContain(
256
+ '<header class="row-start-1" style="">Header</header>',
257
+ );
258
+ },
259
+ 30000,
260
+ );
261
+
262
+ it.concurrent(
263
+ "should handle 'timeout' parameter correctly",
264
+ async () => {
265
+ const scrapeRequest = {
266
+ url: E2E_TEST_SERVER_URL,
267
+ timeout: 500,
268
+ } as ScrapeRequest;
269
+
270
+ const response: any = await request(FIRECRAWL_API_URL)
271
+ .post("/v1/scrape")
272
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
273
+ .set("Content-Type", "application/json")
274
+ .send(scrapeRequest);
275
+
276
+ expect(response.statusCode).toBe(408);
277
+
278
+ if (!("error" in response.body)) {
279
+ throw new Error("Expected response body to have 'error' property");
280
+ }
281
+ expect(response.body.error).toBe("Request timed out");
282
+ expect(response.body.success).toBe(false);
283
+ },
284
+ 30000,
285
+ );
286
+
287
+ it.concurrent(
288
+ "should handle 'mobile' parameter correctly",
289
+ async () => {
290
+ const scrapeRequest = {
291
+ url: E2E_TEST_SERVER_URL,
292
+ mobile: true,
293
+ } as ScrapeRequest;
294
+
295
+ const response: any = await request(FIRECRAWL_API_URL)
296
+ .post("/v1/scrape")
297
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
298
+ .set("Content-Type", "application/json")
299
+ .send(scrapeRequest);
300
+
301
+ expect(response.statusCode).toBe(200);
302
+
303
+ if (!("data" in response.body)) {
304
+ throw new Error("Expected response body to have 'data' property");
305
+ }
306
+ expect(response.body.data.markdown).toContain(
307
+ "This content is only visible on mobile",
308
+ );
309
+ },
310
+ 30000,
311
+ );
312
+
313
+ it.concurrent(
314
+ "should handle 'parsePDF' parameter correctly",
315
+ async () => {
316
+ const response: any = await request(FIRECRAWL_API_URL)
317
+ .post("/v1/scrape")
318
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
319
+ .set("Content-Type", "application/json")
320
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
321
+ await new Promise((r) => setTimeout(r, 6000));
322
+
323
+ expect(response.statusCode).toBe(200);
324
+ expect(response.body).toHaveProperty("data");
325
+ if (!("data" in response.body)) {
326
+ throw new Error("Expected response body to have 'data' property");
327
+ }
328
+
329
+ expect(response.body.data.markdown).toContain(
330
+ "arXiv:astro-ph/9301001v1 7 Jan 1993",
331
+ );
332
+ expect(response.body.data.markdown).not.toContain(
333
+ "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
334
+ );
335
+
336
+ const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
337
+ .post("/v1/scrape")
338
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
339
+ .set("Content-Type", "application/json")
340
+ .send({
341
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
342
+ parsePDF: false,
343
+ });
344
+ await new Promise((r) => setTimeout(r, 6000));
345
+
346
+ expect(responseNoParsePDF.statusCode).toBe(200);
347
+ expect(responseNoParsePDF.body).toHaveProperty("data");
348
+ if (!("data" in responseNoParsePDF.body)) {
349
+ throw new Error("Expected response body to have 'data' property");
350
+ }
351
+ expect(responseNoParsePDF.body.data.markdown).toContain(
352
+ "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
353
+ );
354
+ },
355
+ 30000,
356
+ );
357
+
358
+ // it.concurrent("should handle 'location' parameter correctly",
359
+ // async () => {
360
+ // const scrapeRequest: ScrapeRequest = {
361
+ // url: "https://roastmywebsite.ai",
362
+ // location: {
363
+ // country: "US",
364
+ // languages: ["en"]
365
+ // }
366
+ // };
367
+
368
+ // const response: any = await request(FIRECRAWL_API_URL)
369
+ // .post("/v1/scrape")
370
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
371
+ // .set("Content-Type", "application/json")
372
+ // .send(scrapeRequest);
373
+
374
+ // expect(response.statusCode).toBe(200);
375
+ // // Add assertions to verify location is handled correctly
376
+ // },
377
+ // 30000);
378
+
379
+ it.concurrent(
380
+ "should handle 'skipTlsVerification' parameter correctly",
381
+ async () => {
382
+ const scrapeRequest = {
383
+ url: "https://expired.badssl.com/",
384
+ timeout: 120000,
385
+ } as ScrapeRequest;
386
+
387
+ const response: any = await request(FIRECRAWL_API_URL)
388
+ .post("/v1/scrape")
389
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
390
+ .set("Content-Type", "application/json")
391
+ .send(scrapeRequest);
392
+ console.log("Error1a");
393
+ // console.log(response.body)
394
+ expect(response.statusCode).toBe(200);
395
+ if (!("data" in response.body)) {
396
+ throw new Error("Expected response body to have 'data' property");
397
+ }
398
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
399
+ console.log("Error?");
400
+
401
+ const scrapeRequestWithSkipTlsVerification = {
402
+ url: "https://expired.badssl.com/",
403
+ skipTlsVerification: true,
404
+ timeout: 120000,
405
+ } as ScrapeRequest;
406
+
407
+ const responseWithSkipTlsVerification: any = await request(
408
+ FIRECRAWL_API_URL,
409
+ )
410
+ .post("/v1/scrape")
411
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
412
+ .set("Content-Type", "application/json")
413
+ .send(scrapeRequestWithSkipTlsVerification);
414
+
415
+ console.log("Error1b");
416
+ // console.log(responseWithSkipTlsVerification.body)
417
+ expect(responseWithSkipTlsVerification.statusCode).toBe(200);
418
+ if (!("data" in responseWithSkipTlsVerification.body)) {
419
+ throw new Error("Expected response body to have 'data' property");
420
+ }
421
+ // console.log(responseWithSkipTlsVerification.body.data)
422
+ expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
423
+ "badssl.com",
424
+ );
425
+ },
426
+ 60000,
427
+ );
428
+
429
+ it.concurrent(
430
+ "should handle 'removeBase64Images' parameter correctly",
431
+ async () => {
432
+ const scrapeRequest = {
433
+ url: E2E_TEST_SERVER_URL,
434
+ removeBase64Images: true,
435
+ } as ScrapeRequest;
436
+
437
+ const response: any = await request(FIRECRAWL_API_URL)
438
+ .post("/v1/scrape")
439
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
440
+ .set("Content-Type", "application/json")
441
+ .send(scrapeRequest);
442
+
443
+ expect(response.statusCode).toBe(200);
444
+ if (!("data" in response.body)) {
445
+ throw new Error("Expected response body to have 'data' property");
446
+ }
447
+ // console.log(response.body.data.markdown)
448
+ // - TODO: not working for every image
449
+ // expect(response.body.data.markdown).toContain("Image-Removed");
450
+ },
451
+ 30000,
452
+ );
453
+
454
+ it.concurrent(
455
+ "should handle 'action wait' parameter correctly",
456
+ async () => {
457
+ const scrapeRequest = {
458
+ url: E2E_TEST_SERVER_URL,
459
+ actions: [
460
+ {
461
+ type: "wait",
462
+ milliseconds: 10000,
463
+ },
464
+ ],
465
+ } as ScrapeRequest;
466
+
467
+ const response: any = await request(FIRECRAWL_API_URL)
468
+ .post("/v1/scrape")
469
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
470
+ .set("Content-Type", "application/json")
471
+ .send(scrapeRequest);
472
+
473
+ expect(response.statusCode).toBe(200);
474
+ if (!("data" in response.body)) {
475
+ throw new Error("Expected response body to have 'data' property");
476
+ }
477
+ expect(response.body.data.markdown).not.toContain("Loading...");
478
+ expect(response.body.data.markdown).toContain(
479
+ "Content loaded after 5 seconds!",
480
+ );
481
+ },
482
+ 30000,
483
+ );
484
+
485
+ // screenshot
486
+ it.concurrent(
487
+ "should handle 'action screenshot' parameter correctly",
488
+ async () => {
489
+ const scrapeRequest = {
490
+ url: E2E_TEST_SERVER_URL,
491
+ actions: [
492
+ {
493
+ type: "screenshot",
494
+ },
495
+ ],
496
+ } as ScrapeRequest;
497
+
498
+ const response: any = await request(FIRECRAWL_API_URL)
499
+ .post("/v1/scrape")
500
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
501
+ .set("Content-Type", "application/json")
502
+ .send(scrapeRequest);
503
+
504
+ expect(response.statusCode).toBe(200);
505
+ if (!("data" in response.body)) {
506
+ throw new Error("Expected response body to have 'data' property");
507
+ }
508
+ if (!response.body.data.actions?.screenshots) {
509
+ throw new Error("Expected response body to have screenshots array");
510
+ }
511
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
512
+ 0,
513
+ );
514
+ expect(response.body.data.actions.screenshots[0]).toContain(
515
+ "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
516
+ );
517
+
518
+ // TODO compare screenshot with expected screenshot
519
+ },
520
+ 30000,
521
+ );
522
+
523
+ it.concurrent(
524
+ "should handle 'action screenshot@fullPage' parameter correctly",
525
+ async () => {
526
+ const scrapeRequest = {
527
+ url: E2E_TEST_SERVER_URL,
528
+ actions: [
529
+ {
530
+ type: "screenshot",
531
+ fullPage: true,
532
+ },
533
+ {
534
+ type: "scrape",
535
+ },
536
+ ],
537
+ } as ScrapeRequest;
538
+
539
+ const response: any = await request(FIRECRAWL_API_URL)
540
+ .post("/v1/scrape")
541
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
542
+ .set("Content-Type", "application/json")
543
+ .send(scrapeRequest);
544
+
545
+ expect(response.statusCode).toBe(200);
546
+ if (!("data" in response.body)) {
547
+ throw new Error("Expected response body to have 'data' property");
548
+ }
549
+ // console.log(response.body.data.actions?.screenshots[0])
550
+ if (!response.body.data.actions?.screenshots) {
551
+ throw new Error("Expected response body to have screenshots array");
552
+ }
553
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
554
+ 0,
555
+ );
556
+ expect(response.body.data.actions.screenshots[0]).toContain(
557
+ "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
558
+ );
559
+
560
+ if (!response.body.data.actions?.scrapes) {
561
+ throw new Error("Expected response body to have scrapes array");
562
+ }
563
+ expect(response.body.data.actions.scrapes[0].url).toBe(
564
+ "https://firecrawl-e2e-test.vercel.app/",
565
+ );
566
+ expect(response.body.data.actions.scrapes[0].html).toContain(
567
+ "This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
568
+ );
569
+ // TODO compare screenshot with expected full page screenshot
570
+ },
571
+ 30000,
572
+ );
573
+
574
+ it.concurrent(
575
+ "should handle 'action click' parameter correctly",
576
+ async () => {
577
+ const scrapeRequest = {
578
+ url: E2E_TEST_SERVER_URL,
579
+ actions: [
580
+ {
581
+ type: "click",
582
+ selector: "#click-me",
583
+ },
584
+ ],
585
+ } as ScrapeRequest;
586
+
587
+ const response: any = await request(FIRECRAWL_API_URL)
588
+ .post("/v1/scrape")
589
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
590
+ .set("Content-Type", "application/json")
591
+ .send(scrapeRequest);
592
+
593
+ expect(response.statusCode).toBe(200);
594
+ if (!("data" in response.body)) {
595
+ throw new Error("Expected response body to have 'data' property");
596
+ }
597
+ expect(response.body.data.markdown).not.toContain("Click me!");
598
+ expect(response.body.data.markdown).toContain(
599
+ "Text changed after click!",
600
+ );
601
+ },
602
+ 30000,
603
+ );
604
+
605
+ it.concurrent(
606
+ "should handle 'action write' parameter correctly",
607
+ async () => {
608
+ const scrapeRequest = {
609
+ url: E2E_TEST_SERVER_URL,
610
+ formats: ["html"],
611
+ actions: [
612
+ {
613
+ type: "click",
614
+ selector: "#input-1",
615
+ },
616
+ {
617
+ type: "write",
618
+ text: "Hello, world!",
619
+ },
620
+ ],
621
+ } as ScrapeRequest;
622
+
623
+ const response: any = await request(FIRECRAWL_API_URL)
624
+ .post("/v1/scrape")
625
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
626
+ .set("Content-Type", "application/json")
627
+ .send(scrapeRequest);
628
+
629
+ expect(response.statusCode).toBe(200);
630
+ if (!("data" in response.body)) {
631
+ throw new Error("Expected response body to have 'data' property");
632
+ }
633
+
634
+ // TODO: fix this test (need to fix fire-engine first)
635
+ // uncomment the following line:
636
+ // expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
637
+ },
638
+ 30000,
639
+ );
640
+
641
+ // TODO: fix this test (need to fix fire-engine first)
642
+ it.concurrent(
643
+ "should handle 'action pressKey' parameter correctly",
644
+ async () => {
645
+ const scrapeRequest = {
646
+ url: E2E_TEST_SERVER_URL,
647
+ formats: ["markdown"],
648
+ actions: [
649
+ {
650
+ type: "press",
651
+ key: "ArrowDown",
652
+ },
653
+ ],
654
+ } as ScrapeRequest;
655
+
656
+ const response: any = await request(FIRECRAWL_API_URL)
657
+ .post("/v1/scrape")
658
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
659
+ .set("Content-Type", "application/json")
660
+ .send(scrapeRequest);
661
+
662
+ // // TODO: fix this test (need to fix fire-engine first)
663
+ // // right now response.body is: { success: false, error: '(Internal server error) - null' }
664
+ // expect(response.statusCode).toBe(200);
665
+ // if (!("data" in response.body)) {
666
+ // throw new Error("Expected response body to have 'data' property");
667
+ // }
668
+ // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
669
+ },
670
+ 30000,
671
+ );
672
+
673
+ // TODO: fix this test (need to fix fire-engine first)
674
+ it.concurrent(
675
+ "should handle 'action scroll' parameter correctly",
676
+ async () => {
677
+ const scrapeRequest = {
678
+ url: E2E_TEST_SERVER_URL,
679
+ formats: ["markdown"],
680
+ actions: [
681
+ {
682
+ type: "click",
683
+ selector: "#scroll-bottom-loader",
684
+ },
685
+ {
686
+ type: "scroll",
687
+ direction: "down",
688
+ amount: 2000,
689
+ },
690
+ ],
691
+ } as ScrapeRequest;
692
+
693
+ const response: any = await request(FIRECRAWL_API_URL)
694
+ .post("/v1/scrape")
695
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
696
+ .set("Content-Type", "application/json")
697
+ .send(scrapeRequest);
698
+
699
+ // TODO: uncomment this tests
700
+ // expect(response.statusCode).toBe(200);
701
+ // if (!("data" in response.body)) {
702
+ // throw new Error("Expected response body to have 'data' property");
703
+ // }
704
+ //
705
+ // expect(response.body.data.markdown).toContain("You have reached the bottom!")
706
+ },
707
+ 30000,
708
+ );
709
+
710
+ // TODO: test scrape action
711
+ });
src/__tests__/e2e_withAuth/index.test.ts ADDED
@@ -0,0 +1,862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+
4
+ dotenv.config();
5
+ const TEST_URL = "http://127.0.0.1:3002";
6
+
7
+ describe("E2E Tests for v0 API Routes", () => {
8
+ beforeAll(() => {
9
+ process.env.USE_DB_AUTHENTICATION = "true";
10
+ });
11
+
12
+ afterAll(() => {
13
+ delete process.env.USE_DB_AUTHENTICATION;
14
+ });
15
+
16
+ describe("GET /is-production", () => {
17
+ it.concurrent("should return the production status", async () => {
18
+ const response = await request(TEST_URL).get("/is-production");
19
+ expect(response.statusCode).toBe(200);
20
+ expect(response.body).toHaveProperty("isProduction");
21
+ });
22
+ });
23
+
24
+ describe("POST /v0/scrape", () => {
25
+ it.concurrent("should require authorization", async () => {
26
+ const response: any = await request(TEST_URL).post("/v0/scrape");
27
+ expect(response.statusCode).toBe(401);
28
+ });
29
+
30
+ it.concurrent(
31
+ "should return an error response with an invalid API key",
32
+ async () => {
33
+ const response: any = await request(TEST_URL)
34
+ .post("/v0/scrape")
35
+ .set("Authorization", `Bearer invalid-api-key`)
36
+ .set("Content-Type", "application/json")
37
+ .send({ url: "https://firecrawl.dev" });
38
+ expect(response.statusCode).toBe(401);
39
+ },
40
+ );
41
+
42
+ it.concurrent(
43
+ "should return a successful response with a valid API key",
44
+ async () => {
45
+ const response: any = await request(TEST_URL)
46
+ .post("/v0/scrape")
47
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
48
+ .set("Content-Type", "application/json")
49
+ .send({ url: "https://roastmywebsite.ai" });
50
+ expect(response.statusCode).toBe(200);
51
+ expect(response.body).toHaveProperty("data");
52
+ expect(response.body.data).toHaveProperty("content");
53
+ expect(response.body.data).toHaveProperty("markdown");
54
+ expect(response.body.data).toHaveProperty("metadata");
55
+ expect(response.body.data).not.toHaveProperty("html");
56
+ expect(response.body.data.content).toContain("_Roast_");
57
+ expect(response.body.data.metadata.pageError).toBeUndefined();
58
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
59
+ expect(response.body.data.metadata.description).toBe(
60
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
61
+ );
62
+ expect(response.body.data.metadata.keywords).toBe(
63
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
64
+ );
65
+ expect(response.body.data.metadata.robots).toBe("follow, index");
66
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
67
+ expect(response.body.data.metadata.ogDescription).toBe(
68
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
69
+ );
70
+ expect(response.body.data.metadata.ogUrl).toBe(
71
+ "https://www.roastmywebsite.ai",
72
+ );
73
+ expect(response.body.data.metadata.ogImage).toBe(
74
+ "https://www.roastmywebsite.ai/og.png",
75
+ );
76
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
77
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
78
+ expect(response.body.data.metadata.sourceURL).toBe(
79
+ "https://roastmywebsite.ai",
80
+ );
81
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
82
+ },
83
+ 30000,
84
+ ); // 30 seconds timeout
85
+
86
+ it.concurrent(
87
+ "should return a successful response with a valid API key and includeHtml set to true",
88
+ async () => {
89
+ const response: any = await request(TEST_URL)
90
+ .post("/v0/scrape")
91
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
92
+ .set("Content-Type", "application/json")
93
+ .send({
94
+ url: "https://roastmywebsite.ai",
95
+ pageOptions: { includeHtml: true },
96
+ });
97
+ expect(response.statusCode).toBe(200);
98
+ expect(response.body).toHaveProperty("data");
99
+ expect(response.body.data).toHaveProperty("content");
100
+ expect(response.body.data).toHaveProperty("markdown");
101
+ expect(response.body.data).toHaveProperty("html");
102
+ expect(response.body.data).toHaveProperty("metadata");
103
+ expect(response.body.data.content).toContain("_Roast_");
104
+ expect(response.body.data.markdown).toContain("_Roast_");
105
+ expect(response.body.data.html).toContain("<h1");
106
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
107
+ expect(response.body.data.metadata.pageError).toBeUndefined();
108
+ },
109
+ 30000,
110
+ ); // 30 seconds timeout
111
+
112
+ it.concurrent(
113
+ "should return a successful response for a valid scrape with PDF file",
114
+ async () => {
115
+ const response: any = await request(TEST_URL)
116
+ .post("/v0/scrape")
117
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
118
+ .set("Content-Type", "application/json")
119
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
120
+ await new Promise((r) => setTimeout(r, 6000));
121
+
122
+ expect(response.statusCode).toBe(200);
123
+ expect(response.body).toHaveProperty("data");
124
+ expect(response.body.data).toHaveProperty("content");
125
+ expect(response.body.data).toHaveProperty("metadata");
126
+ expect(response.body.data.content).toContain(
127
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
128
+ );
129
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
130
+ expect(response.body.data.metadata.pageError).toBeUndefined();
131
+ },
132
+ 60000,
133
+ ); // 60 seconds
134
+
135
+ it.concurrent(
136
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
137
+ async () => {
138
+ const response: any = await request(TEST_URL)
139
+ .post("/v0/scrape")
140
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
141
+ .set("Content-Type", "application/json")
142
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
143
+ await new Promise((r) => setTimeout(r, 6000));
144
+
145
+ expect(response.statusCode).toBe(200);
146
+ expect(response.body).toHaveProperty("data");
147
+ expect(response.body.data).toHaveProperty("content");
148
+ expect(response.body.data).toHaveProperty("metadata");
149
+ expect(response.body.data.content).toContain(
150
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
151
+ );
152
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
153
+ expect(response.body.data.metadata.pageError).toBeUndefined();
154
+ },
155
+ 60000,
156
+ ); // 60 seconds
157
+
158
+ it.concurrent(
159
+ "should return a successful response with a valid API key with removeTags option",
160
+ async () => {
161
+ const responseWithoutRemoveTags: any = await request(TEST_URL)
162
+ .post("/v0/scrape")
163
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
164
+ .set("Content-Type", "application/json")
165
+ .send({ url: "https://www.scrapethissite.com/" });
166
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
167
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
168
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
169
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
170
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
171
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
172
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
173
+ "Scrape This Site",
174
+ );
175
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
176
+ "Lessons and Videos",
177
+ ); // #footer
178
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
179
+ "[Sandbox](",
180
+ ); // .nav
181
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
182
+ "web scraping",
183
+ ); // strong
184
+
185
+ const response: any = await request(TEST_URL)
186
+ .post("/v0/scrape")
187
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
188
+ .set("Content-Type", "application/json")
189
+ .send({
190
+ url: "https://www.scrapethissite.com/",
191
+ pageOptions: { removeTags: [".nav", "#footer", "strong"] },
192
+ });
193
+ expect(response.statusCode).toBe(200);
194
+ expect(response.body).toHaveProperty("data");
195
+ expect(response.body.data).toHaveProperty("content");
196
+ expect(response.body.data).toHaveProperty("markdown");
197
+ expect(response.body.data).toHaveProperty("metadata");
198
+ expect(response.body.data).not.toHaveProperty("html");
199
+ expect(response.body.data.content).toContain("Scrape This Site");
200
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
201
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
202
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
203
+ },
204
+ 30000,
205
+ ); // 30 seconds timeout
206
+
207
+ it.concurrent(
208
+ "should return a successful response for a scrape with 400 page",
209
+ async () => {
210
+ const response: any = await request(TEST_URL)
211
+ .post("/v0/scrape")
212
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
213
+ .set("Content-Type", "application/json")
214
+ .send({ url: "https://httpstat.us/400" });
215
+ await new Promise((r) => setTimeout(r, 5000));
216
+
217
+ expect(response.statusCode).toBe(200);
218
+ expect(response.body).toHaveProperty("data");
219
+ expect(response.body.data).toHaveProperty("content");
220
+ expect(response.body.data).toHaveProperty("metadata");
221
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
222
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
223
+ "bad request",
224
+ );
225
+ },
226
+ 60000,
227
+ ); // 60 seconds
228
+
229
+ it.concurrent(
230
+ "should return a successful response for a scrape with 401 page",
231
+ async () => {
232
+ const response: any = await request(TEST_URL)
233
+ .post("/v0/scrape")
234
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
235
+ .set("Content-Type", "application/json")
236
+ .send({ url: "https://httpstat.us/401" });
237
+ await new Promise((r) => setTimeout(r, 5000));
238
+
239
+ expect(response.statusCode).toBe(200);
240
+ expect(response.body).toHaveProperty("data");
241
+ expect(response.body.data).toHaveProperty("content");
242
+ expect(response.body.data).toHaveProperty("metadata");
243
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
244
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
245
+ "unauthorized",
246
+ );
247
+ },
248
+ 60000,
249
+ ); // 60 seconds
250
+
251
+ it.concurrent(
252
+ "should return a successful response for a scrape with 403 page",
253
+ async () => {
254
+ const response: any = await request(TEST_URL)
255
+ .post("/v0/scrape")
256
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
257
+ .set("Content-Type", "application/json")
258
+ .send({ url: "https://httpstat.us/403" });
259
+
260
+ await new Promise((r) => setTimeout(r, 5000));
261
+ expect(response.statusCode).toBe(200);
262
+ expect(response.body).toHaveProperty("data");
263
+ expect(response.body.data).toHaveProperty("content");
264
+ expect(response.body.data).toHaveProperty("metadata");
265
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
266
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
267
+ "forbidden",
268
+ );
269
+ },
270
+ 60000,
271
+ ); // 60 seconds
272
+
273
+ it.concurrent(
274
+ "should return a successful response for a scrape with 404 page",
275
+ async () => {
276
+ const response: any = await request(TEST_URL)
277
+ .post("/v0/scrape")
278
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
279
+ .set("Content-Type", "application/json")
280
+ .send({ url: "https://httpstat.us/404" });
281
+ await new Promise((r) => setTimeout(r, 5000));
282
+
283
+ expect(response.statusCode).toBe(200);
284
+ expect(response.body).toHaveProperty("data");
285
+ expect(response.body.data).toHaveProperty("content");
286
+ expect(response.body.data).toHaveProperty("metadata");
287
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
288
+ },
289
+ 60000,
290
+ ); // 60 seconds
291
+
292
+ it.concurrent(
293
+ "should return a successful response for a scrape with 405 page",
294
+ async () => {
295
+ const response = await request(TEST_URL)
296
+ .post("/v0/scrape")
297
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
298
+ .set("Content-Type", "application/json")
299
+ .send({ url: "https://httpstat.us/405" });
300
+ await new Promise((r) => setTimeout(r, 5000));
301
+
302
+ expect(response.statusCode).toBe(200);
303
+ expect(response.body).toHaveProperty("data");
304
+ expect(response.body.data).toHaveProperty("content");
305
+ expect(response.body.data).toHaveProperty("metadata");
306
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
307
+ },
308
+ 60000,
309
+ ); // 60 seconds
310
+
311
+ it.concurrent(
312
+ "should return a successful response for a scrape with 500 page",
313
+ async () => {
314
+ const response: any = await request(TEST_URL)
315
+ .post("/v0/scrape")
316
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
317
+ .set("Content-Type", "application/json")
318
+ .send({ url: "https://httpstat.us/500" });
319
+ await new Promise((r) => setTimeout(r, 5000));
320
+
321
+ expect(response.statusCode).toBe(200);
322
+ expect(response.body).toHaveProperty("data");
323
+ expect(response.body.data).toHaveProperty("content");
324
+ expect(response.body.data).toHaveProperty("metadata");
325
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
326
+ },
327
+ 60000,
328
+ ); // 60 seconds
329
+ });
330
+
331
+ describe("POST /v0/crawl", () => {
332
+ it.concurrent("should require authorization", async () => {
333
+ const response: any = await request(TEST_URL).post("/v0/crawl");
334
+ expect(response.statusCode).toBe(401);
335
+ });
336
+
337
+ it.concurrent(
338
+ "should return an error response with an invalid API key",
339
+ async () => {
340
+ const response: any = await request(TEST_URL)
341
+ .post("/v0/crawl")
342
+ .set("Authorization", `Bearer invalid-api-key`)
343
+ .set("Content-Type", "application/json")
344
+ .send({ url: "https://firecrawl.dev" });
345
+ expect(response.statusCode).toBe(401);
346
+ },
347
+ );
348
+
349
+ it.concurrent(
350
+ "should return a successful response with a valid API key for crawl",
351
+ async () => {
352
+ const response: any = await request(TEST_URL)
353
+ .post("/v0/crawl")
354
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
355
+ .set("Content-Type", "application/json")
356
+ .send({ url: "https://firecrawl.dev" });
357
+ expect(response.statusCode).toBe(200);
358
+ expect(response.body).toHaveProperty("jobId");
359
+ expect(response.body.jobId).toMatch(
360
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
361
+ );
362
+ },
363
+ );
364
+
365
+ it.concurrent(
366
+ "should return a successful response with a valid API key and valid includes option",
367
+ async () => {
368
+ const crawlResponse: any = await request(TEST_URL)
369
+ .post("/v0/crawl")
370
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
371
+ .set("Content-Type", "application/json")
372
+ .send({
373
+ url: "https://mendable.ai",
374
+ limit: 10,
375
+ crawlerOptions: {
376
+ includes: ["blog/*"],
377
+ },
378
+ });
379
+
380
+ let response: any;
381
+ let isFinished = false;
382
+
383
+ while (!isFinished) {
384
+ response = await request(TEST_URL)
385
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
386
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
387
+
388
+ expect(response.statusCode).toBe(200);
389
+ expect(response.body).toHaveProperty("status");
390
+ isFinished = response.body.status === "completed";
391
+
392
+ if (!isFinished) {
393
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
394
+ }
395
+ }
396
+
397
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
398
+ const completedResponse = await request(TEST_URL)
399
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
400
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
401
+
402
+ const urls = completedResponse.body.data.map(
403
+ (item: any) => item.metadata?.sourceURL,
404
+ );
405
+ expect(urls.length).toBeGreaterThan(5);
406
+ urls.forEach((url: string) => {
407
+ expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
408
+ });
409
+
410
+ expect(completedResponse.statusCode).toBe(200);
411
+ expect(completedResponse.body).toHaveProperty("status");
412
+ expect(completedResponse.body.status).toBe("completed");
413
+ expect(completedResponse.body).toHaveProperty("data");
414
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
415
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
416
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
417
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
418
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
419
+ 200,
420
+ );
421
+ expect(
422
+ completedResponse.body.data[0].metadata.pageError,
423
+ ).toBeUndefined();
424
+ },
425
+ 180000,
426
+ ); // 180 seconds
427
+
428
+ it.concurrent(
429
+ "should return a successful response with a valid API key and valid excludes option",
430
+ async () => {
431
+ const crawlResponse: any = await request(TEST_URL)
432
+ .post("/v0/crawl")
433
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
434
+ .set("Content-Type", "application/json")
435
+ .send({
436
+ url: "https://mendable.ai",
437
+ limit: 10,
438
+ crawlerOptions: {
439
+ excludes: ["blog/*"],
440
+ },
441
+ });
442
+
443
+ let isFinished = false;
444
+ let response: any;
445
+
446
+ while (!isFinished) {
447
+ response = await request(TEST_URL)
448
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
449
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
450
+
451
+ expect(response.statusCode).toBe(200);
452
+ expect(response.body).toHaveProperty("status");
453
+ isFinished = response.body.status === "completed";
454
+
455
+ if (!isFinished) {
456
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
457
+ }
458
+ }
459
+
460
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
461
+ const completedResponse: any = await request(TEST_URL)
462
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
463
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
464
+
465
+ const urls = completedResponse.body.data.map(
466
+ (item: any) => item.metadata?.sourceURL,
467
+ );
468
+ expect(urls.length).toBeGreaterThan(5);
469
+ urls.forEach((url: string) => {
470
+ expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
471
+ });
472
+ },
473
+ 90000,
474
+ ); // 90 seconds
475
+
476
+ it.concurrent(
477
+ "should return a successful response with max depth option for a valid crawl job",
478
+ async () => {
479
+ const crawlResponse: any = await request(TEST_URL)
480
+ .post("/v0/crawl")
481
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
482
+ .set("Content-Type", "application/json")
483
+ .send({
484
+ url: "https://www.scrapethissite.com",
485
+ crawlerOptions: { maxDepth: 1 },
486
+ });
487
+ expect(crawlResponse.statusCode).toBe(200);
488
+
489
+ const response: any = await request(TEST_URL)
490
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
491
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
492
+ expect(response.statusCode).toBe(200);
493
+ expect(response.body).toHaveProperty("status");
494
+ expect(["active", "waiting"]).toContain(response.body.status);
495
+ // wait for 60 seconds
496
+ let isCompleted = false;
497
+ while (!isCompleted) {
498
+ const statusCheckResponse = await request(TEST_URL)
499
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
500
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
501
+ expect(statusCheckResponse.statusCode).toBe(200);
502
+ isCompleted = statusCheckResponse.body.status === "completed";
503
+ if (!isCompleted) {
504
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
505
+ }
506
+ }
507
+ const completedResponse: any = await request(TEST_URL)
508
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
509
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
510
+
511
+ expect(completedResponse.statusCode).toBe(200);
512
+ expect(completedResponse.body).toHaveProperty("status");
513
+ expect(completedResponse.body.status).toBe("completed");
514
+ expect(completedResponse.body).toHaveProperty("data");
515
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
516
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
517
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
518
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
519
+ 200,
520
+ );
521
+ expect(
522
+ completedResponse.body.data[0].metadata.pageError,
523
+ ).toBeUndefined();
524
+ const urls = completedResponse.body.data.map(
525
+ (item: any) => item.metadata?.sourceURL,
526
+ );
527
+ expect(urls.length).toBeGreaterThan(1);
528
+
529
+ // Check if all URLs have a maximum depth of 1
530
+ urls.forEach((url: string) => {
531
+ const pathSplits = new URL(url).pathname.split("/");
532
+ const depth =
533
+ pathSplits.length -
534
+ (pathSplits[0].length === 0 &&
535
+ pathSplits[pathSplits.length - 1].length === 0
536
+ ? 1
537
+ : 0);
538
+ expect(depth).toBeLessThanOrEqual(2);
539
+ });
540
+ },
541
+ 180000,
542
+ );
543
+ });
544
+
545
+ describe("POST /v0/crawlWebsitePreview", () => {
546
+ it.concurrent("should require authorization", async () => {
547
+ const response: any = await request(TEST_URL).post(
548
+ "/v0/crawlWebsitePreview",
549
+ );
550
+ expect(response.statusCode).toBe(401);
551
+ });
552
+
553
+ it.concurrent(
554
+ "should return an error response with an invalid API key",
555
+ async () => {
556
+ const response: any = await request(TEST_URL)
557
+ .post("/v0/crawlWebsitePreview")
558
+ .set("Authorization", `Bearer invalid-api-key`)
559
+ .set("Content-Type", "application/json")
560
+ .send({ url: "https://firecrawl.dev" });
561
+ expect(response.statusCode).toBe(401);
562
+ },
563
+ );
564
+
565
+ it.concurrent(
566
+ "should return a timeout error when scraping takes longer than the specified timeout",
567
+ async () => {
568
+ const response: any = await request(TEST_URL)
569
+ .post("/v0/scrape")
570
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
571
+ .set("Content-Type", "application/json")
572
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
573
+
574
+ expect(response.statusCode).toBe(408);
575
+ },
576
+ 3000,
577
+ );
578
+ });
579
+
580
+ describe("POST /v0/search", () => {
581
+ it.concurrent("should require authorization", async () => {
582
+ const response = await request(TEST_URL).post("/v0/search");
583
+ expect(response.statusCode).toBe(401);
584
+ });
585
+
586
+ it.concurrent(
587
+ "should return an error response with an invalid API key",
588
+ async () => {
589
+ const response = await request(TEST_URL)
590
+ .post("/v0/search")
591
+ .set("Authorization", `Bearer invalid-api-key`)
592
+ .set("Content-Type", "application/json")
593
+ .send({ query: "test" });
594
+ expect(response.statusCode).toBe(401);
595
+ },
596
+ );
597
+
598
+ it.concurrent(
599
+ "should return a successful response with a valid API key for search",
600
+ async () => {
601
+ const response = await request(TEST_URL)
602
+ .post("/v0/search")
603
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
604
+ .set("Content-Type", "application/json")
605
+ .send({ query: "test" });
606
+ expect(response.statusCode).toBe(200);
607
+ expect(response.body).toHaveProperty("success");
608
+ expect(response.body.success).toBe(true);
609
+ expect(response.body).toHaveProperty("data");
610
+ },
611
+ 60000,
612
+ ); // 60 seconds timeout
613
+ });
614
+
615
+ describe("GET /v0/crawl/status/:jobId", () => {
616
+ it.concurrent("should require authorization", async () => {
617
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
618
+ expect(response.statusCode).toBe(401);
619
+ });
620
+
621
+ it.concurrent(
622
+ "should return an error response with an invalid API key",
623
+ async () => {
624
+ const response = await request(TEST_URL)
625
+ .get("/v0/crawl/status/123")
626
+ .set("Authorization", `Bearer invalid-api-key`);
627
+ expect(response.statusCode).toBe(401);
628
+ },
629
+ );
630
+
631
+ it.concurrent(
632
+ "should return Job not found for invalid job ID",
633
+ async () => {
634
+ const response = await request(TEST_URL)
635
+ .get("/v0/crawl/status/invalidJobId")
636
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
637
+ expect(response.statusCode).toBe(404);
638
+ },
639
+ );
640
+
641
+ it.concurrent(
642
+ "should return a successful crawl status response for a valid crawl job",
643
+ async () => {
644
+ const crawlResponse = await request(TEST_URL)
645
+ .post("/v0/crawl")
646
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
647
+ .set("Content-Type", "application/json")
648
+ .send({ url: "https://firecrawl.dev/blog" });
649
+ expect(crawlResponse.statusCode).toBe(200);
650
+
651
+ let isCompleted = false;
652
+
653
+ while (!isCompleted) {
654
+ const response = await request(TEST_URL)
655
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
656
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
657
+ expect(response.statusCode).toBe(200);
658
+ expect(response.body).toHaveProperty("status");
659
+
660
+ if (response.body.status === "completed") {
661
+ isCompleted = true;
662
+ } else {
663
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
664
+ }
665
+ }
666
+
667
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
668
+ const completedResponse = await request(TEST_URL)
669
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
670
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
671
+
672
+ expect(completedResponse.body).toHaveProperty("status");
673
+ expect(completedResponse.body.status).toBe("completed");
674
+ expect(completedResponse.body).toHaveProperty("data");
675
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
676
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
677
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
678
+ expect(completedResponse.body.data[0].content).toContain("Firecrawl");
679
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
680
+ 200,
681
+ );
682
+ expect(
683
+ completedResponse.body.data[0].metadata.pageError,
684
+ ).toBeUndefined();
685
+
686
+ const childrenLinks = completedResponse.body.data.filter(
687
+ (doc) =>
688
+ doc.metadata &&
689
+ doc.metadata.sourceURL &&
690
+ doc.metadata.sourceURL.includes("firecrawl.dev/blog"),
691
+ );
692
+
693
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
694
+ },
695
+ 180000,
696
+ ); // 120 seconds
697
+
698
+ // TODO: review the test below
699
+ // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
700
+ // const crawlResponse = await request(TEST_URL)
701
+ // .post('/v0/crawl')
702
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
703
+ // .set('Content-Type', 'application/json')
704
+ // .send({ url: 'https://arxiv.org/list/astro-ph/1993-01',
705
+ // crawlerOptions: {
706
+ // limit: 10,
707
+ // returnOnlyUrls: true
708
+ // }});
709
+ // expect(crawlResponse.statusCode).toBe(200);
710
+
711
+ // let isCompleted = false;
712
+ // let completedResponse;
713
+
714
+ // while (!isCompleted) {
715
+ // const response = await request(TEST_URL)
716
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
717
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
718
+ // expect(response.statusCode).toBe(200);
719
+ // expect(response.body).toHaveProperty('status');
720
+
721
+ // if (response.body.status === 'completed') {
722
+ // isCompleted = true;
723
+ // completedResponse = response;
724
+ // } else {
725
+ // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
726
+ // }
727
+ // }
728
+ // expect(completedResponse.body.status).toBe('completed');
729
+ // expect(completedResponse.body).toHaveProperty('data');
730
+ // expect(completedResponse.body.data.length).toEqual(1);
731
+ // expect(completedResponse.body.data).toEqual(
732
+ // expect.arrayContaining([
733
+ // expect.objectContaining({
734
+ // content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
735
+ // })
736
+ // ])
737
+ // );
738
+
739
+ // expect(completedResponse.body.data[0]).toHaveProperty("metadata");
740
+ // expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
741
+ // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
742
+ // }, 180000); // 120 seconds
743
+
744
+ it.concurrent(
745
+ "If someone cancels a crawl job, it should turn into failed status",
746
+ async () => {
747
+ const crawlResponse = await request(TEST_URL)
748
+ .post("/v0/crawl")
749
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
750
+ .set("Content-Type", "application/json")
751
+ .send({
752
+ url: "https://docs.tatum.io",
753
+ crawlerOptions: { limit: 200 },
754
+ });
755
+
756
+ expect(crawlResponse.statusCode).toBe(200);
757
+
758
+ await new Promise((r) => setTimeout(r, 10000));
759
+
760
+ const responseCancel = await request(TEST_URL)
761
+ .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
762
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
763
+ expect(responseCancel.statusCode).toBe(200);
764
+ expect(responseCancel.body).toHaveProperty("status");
765
+ expect(responseCancel.body.status).toBe("cancelled");
766
+
767
+ await new Promise((r) => setTimeout(r, 10000));
768
+ const completedResponse = await request(TEST_URL)
769
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
770
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
771
+ .maxResponseSize(4000000000);
772
+
773
+ expect(completedResponse.statusCode).toBe(200);
774
+ expect(completedResponse.body).toHaveProperty("status");
775
+ expect(completedResponse.body.status).toBe("failed");
776
+ expect(completedResponse.body).toHaveProperty("data");
777
+
778
+ let isNullOrEmptyArray = false;
779
+ if (
780
+ completedResponse.body.data === null ||
781
+ completedResponse.body.data.length === 0
782
+ ) {
783
+ isNullOrEmptyArray = true;
784
+ }
785
+ expect(isNullOrEmptyArray).toBe(true);
786
+ expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
787
+ expect(completedResponse.body).toHaveProperty("partial_data");
788
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
789
+ "content",
790
+ );
791
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
792
+ "markdown",
793
+ );
794
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
795
+ "metadata",
796
+ );
797
+ expect(
798
+ completedResponse.body.partial_data[0].metadata.pageStatusCode,
799
+ ).toBe(200);
800
+ expect(
801
+ completedResponse.body.partial_data[0].metadata.pageError,
802
+ ).toBeUndefined();
803
+ },
804
+ 60000,
805
+ ); // 60 seconds
806
+ });
807
+
808
+ describe("POST /v0/scrape with LLM Extraction", () => {
809
+ it.concurrent(
810
+ "should extract data using LLM extraction mode",
811
+ async () => {
812
+ const response = await request(TEST_URL)
813
+ .post("/v0/scrape")
814
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
815
+ .set("Content-Type", "application/json")
816
+ .send({
817
+ url: "https://mendable.ai",
818
+ pageOptions: {
819
+ onlyMainContent: true,
820
+ },
821
+ extractorOptions: {
822
+ mode: "llm-extraction",
823
+ extractionPrompt:
824
+ "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
825
+ extractionSchema: {
826
+ type: "object",
827
+ properties: {
828
+ company_mission: {
829
+ type: "string",
830
+ },
831
+ supports_sso: {
832
+ type: "boolean",
833
+ },
834
+ is_open_source: {
835
+ type: "boolean",
836
+ },
837
+ },
838
+ required: ["company_mission", "supports_sso", "is_open_source"],
839
+ },
840
+ },
841
+ });
842
+
843
+ // Ensure that the job was successfully created before proceeding with LLM extraction
844
+ expect(response.statusCode).toBe(200);
845
+
846
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
847
+ let llmExtraction = response.body.data.llm_extraction;
848
+
849
+ // Check if the llm_extraction object has the required properties with correct types and values
850
+ expect(llmExtraction).toHaveProperty("company_mission");
851
+ expect(typeof llmExtraction.company_mission).toBe("string");
852
+ expect(llmExtraction).toHaveProperty("supports_sso");
853
+ expect(llmExtraction.supports_sso).toBe(true);
854
+ expect(typeof llmExtraction.supports_sso).toBe("boolean");
855
+ expect(llmExtraction).toHaveProperty("is_open_source");
856
+ expect(llmExtraction.is_open_source).toBe(false);
857
+ expect(typeof llmExtraction.is_open_source).toBe("boolean");
858
+ },
859
+ 60000,
860
+ ); // 60 secs
861
+ });
862
+ });
src/__tests__/queue-concurrency-integration.test.ts ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../services/queue-service";
2
+ import { addScrapeJob, addScrapeJobs } from "../services/queue-jobs";
3
+ import {
4
+ cleanOldConcurrencyLimitEntries,
5
+ pushConcurrencyLimitActiveJob,
6
+ takeConcurrencyLimitedJob,
7
+ removeConcurrencyLimitActiveJob,
8
+ } from "../lib/concurrency-limit";
9
+ import { WebScraperOptions } from "../types";
10
+ import { getACUCTeam } from "../controllers/auth";
11
+
12
+ // Mock all the dependencies
13
+ const mockAdd = jest.fn();
14
+ jest.mock("../services/queue-service", () => ({
15
+ redisConnection: {
16
+ zremrangebyscore: jest.fn(),
17
+ zrangebyscore: jest.fn(),
18
+ zadd: jest.fn(),
19
+ zrem: jest.fn(),
20
+ zmpop: jest.fn(),
21
+ zcard: jest.fn(),
22
+ smembers: jest.fn(),
23
+ },
24
+ getScrapeQueue: jest.fn(() => ({
25
+ add: mockAdd,
26
+ })),
27
+ }));
28
+
29
+ jest.mock("uuid", () => ({
30
+ v4: jest.fn(() => "mock-uuid"),
31
+ }));
32
+
33
+ describe("Queue Concurrency Integration", () => {
34
+ const mockTeamId = "test-team-id";
35
+ const mockNow = Date.now();
36
+
37
+ const defaultScrapeOptions = {
38
+ formats: ["markdown"] as (
39
+ | "markdown"
40
+ | "html"
41
+ | "rawHtml"
42
+ | "links"
43
+ | "screenshot"
44
+ | "screenshot@fullPage"
45
+ | "extract"
46
+ | "json"
47
+ )[],
48
+ onlyMainContent: true,
49
+ waitFor: 0,
50
+ mobile: false,
51
+ parsePDF: false,
52
+ timeout: 30000,
53
+ extract: {
54
+ mode: "llm" as const,
55
+ systemPrompt: "test",
56
+ schema: {},
57
+ },
58
+ extractOptions: { mode: "llm" as const, systemPrompt: "test" },
59
+ javascript: true,
60
+ headers: {},
61
+ cookies: [],
62
+ blockResources: true,
63
+ skipTlsVerification: false,
64
+ removeBase64Images: true,
65
+ fastMode: false,
66
+ blockAds: true,
67
+ };
68
+
69
+ beforeEach(() => {
70
+ jest.clearAllMocks();
71
+ jest.spyOn(Date, "now").mockImplementation(() => mockNow);
72
+ });
73
+
74
+ describe("Single Job Addition", () => {
75
+ const mockWebScraperOptions: WebScraperOptions = {
76
+ url: "https://test.com",
77
+ mode: "single_urls",
78
+ team_id: mockTeamId,
79
+ scrapeOptions: defaultScrapeOptions,
80
+ crawlerOptions: null,
81
+ };
82
+
83
+ it("should add job directly to BullMQ when under concurrency limit", async () => {
84
+ // Mock current active jobs to be under limit
85
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
86
+
87
+ await addScrapeJob(mockWebScraperOptions);
88
+
89
+ // Should have checked concurrency
90
+ expect(redisConnection.zrangebyscore).toHaveBeenCalled();
91
+
92
+ // Should have added to BullMQ
93
+ expect(mockAdd).toHaveBeenCalled();
94
+
95
+ // Should have added to active jobs
96
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
97
+ expect.stringContaining("concurrency-limiter"),
98
+ expect.any(Number),
99
+ expect.any(String),
100
+ );
101
+ });
102
+
103
+ it("should add job to concurrency queue when at concurrency limit", async () => {
104
+ // Mock current active jobs to be at limit
105
+ (getACUCTeam as jest.Mock).mockResolvedValue({
106
+ concurrency: 15,
107
+ } as any);
108
+ const activeJobs = Array(15).fill("active-job");
109
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue(
110
+ activeJobs,
111
+ );
112
+
113
+ await addScrapeJob(mockWebScraperOptions);
114
+
115
+ // Should have checked concurrency
116
+ expect(redisConnection.zrangebyscore).toHaveBeenCalled();
117
+
118
+ // Should NOT have added to BullMQ
119
+ expect(mockAdd).not.toHaveBeenCalled();
120
+
121
+ // Should have added to concurrency queue
122
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
123
+ expect.stringContaining("concurrency-limit-queue"),
124
+ expect.any(Number),
125
+ expect.stringContaining("mock-uuid"),
126
+ );
127
+ });
128
+ });
129
+
130
+ describe("Batch Job Addition", () => {
131
+ const createMockJobs = (count: number) =>
132
+ Array(count)
133
+ .fill(null)
134
+ .map((_, i) => ({
135
+ data: {
136
+ url: `https://test${i}.com`,
137
+ mode: "single_urls",
138
+ team_id: mockTeamId,
139
+ scrapeOptions: defaultScrapeOptions,
140
+ } as WebScraperOptions,
141
+ opts: {
142
+ jobId: `job-${i}`,
143
+ priority: 1,
144
+ },
145
+ }));
146
+
147
+ it("should handle batch jobs respecting concurrency limits", async () => {
148
+ const maxConcurrency = 15;
149
+ (getACUCTeam as jest.Mock).mockResolvedValue({
150
+ concurrency: maxConcurrency,
151
+ } as any);
152
+ const totalJobs = maxConcurrency + 5; // Some jobs should go to queue
153
+ const mockJobs = createMockJobs(totalJobs);
154
+
155
+ // Mock current active jobs to be empty
156
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
157
+
158
+ await addScrapeJobs(mockJobs);
159
+
160
+ // Should have added maxConcurrency jobs to BullMQ
161
+ expect(mockAdd).toHaveBeenCalledTimes(maxConcurrency);
162
+
163
+ // Should have added remaining jobs to concurrency queue
164
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
165
+ expect.stringContaining("concurrency-limit-queue"),
166
+ expect.any(Number),
167
+ expect.any(String),
168
+ );
169
+ });
170
+
171
+ it("should handle empty job array", async () => {
172
+ const result = await addScrapeJobs([]);
173
+ expect(result).toBe(true);
174
+ expect(mockAdd).not.toHaveBeenCalled();
175
+ expect(redisConnection.zadd).not.toHaveBeenCalled();
176
+ });
177
+ });
178
+
179
+ describe("Queue Worker Integration", () => {
180
+ it("should process next queued job when active job completes", async () => {
181
+ const mockJob = {
182
+ id: "test-job",
183
+ data: {
184
+ team_id: mockTeamId,
185
+ },
186
+ };
187
+
188
+ // Mock a queued job
189
+ const queuedJob = {
190
+ id: "queued-job",
191
+ data: { test: "data" },
192
+ opts: {},
193
+ };
194
+ (redisConnection.zmpop as jest.Mock).mockResolvedValueOnce([
195
+ "key",
196
+ [[JSON.stringify(queuedJob)]],
197
+ ]);
198
+
199
+ // Simulate job completion in worker
200
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
201
+ await cleanOldConcurrencyLimitEntries(mockTeamId);
202
+
203
+ const nextJob = await takeConcurrencyLimitedJob(mockTeamId);
204
+
205
+ // Should have taken next job from queue
206
+ expect(nextJob).toEqual(queuedJob);
207
+
208
+ // Should have added new job to active jobs
209
+ await pushConcurrencyLimitActiveJob(mockTeamId, nextJob!.id, 2 * 60 * 1000);
210
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
211
+ expect.stringContaining("concurrency-limiter"),
212
+ expect.any(Number),
213
+ nextJob!.id,
214
+ );
215
+ });
216
+
217
+ it("should handle job failure and cleanup", async () => {
218
+ const mockJob = {
219
+ id: "failing-job",
220
+ data: {
221
+ team_id: mockTeamId,
222
+ },
223
+ };
224
+
225
+ // Add job to active jobs
226
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJob.id, 2 * 60 * 1000);
227
+
228
+ // Simulate job failure and cleanup
229
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
230
+ await cleanOldConcurrencyLimitEntries(mockTeamId);
231
+
232
+ // Verify job was removed from active jobs
233
+ expect(redisConnection.zrem).toHaveBeenCalledWith(
234
+ expect.stringContaining("concurrency-limiter"),
235
+ mockJob.id,
236
+ );
237
+ });
238
+ });
239
+
240
+ describe("Edge Cases", () => {
241
+ it("should handle stalled jobs cleanup", async () => {
242
+ const stalledTime = mockNow - 3 * 60 * 1000; // 3 minutes ago
243
+
244
+ // Mock stalled jobs in Redis
245
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValueOnce([
246
+ "stalled-job",
247
+ ]);
248
+
249
+ await cleanOldConcurrencyLimitEntries(mockTeamId, mockNow);
250
+
251
+ // Should have cleaned up stalled jobs
252
+ expect(redisConnection.zremrangebyscore).toHaveBeenCalledWith(
253
+ expect.stringContaining("concurrency-limiter"),
254
+ -Infinity,
255
+ mockNow,
256
+ );
257
+ });
258
+
259
+ it("should handle race conditions in job queue processing", async () => {
260
+ // Mock a race condition where job is taken by another worker
261
+ (redisConnection.zmpop as jest.Mock).mockResolvedValueOnce(null);
262
+
263
+ const nextJob = await takeConcurrencyLimitedJob(mockTeamId);
264
+
265
+ // Should handle gracefully when no job is available
266
+ expect(nextJob).toBeNull();
267
+ });
268
+ });
269
+ });
src/__tests__/snips/batch-scrape.test.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { batchScrape } from "./lib";
2
+
3
+ describe("Batch scrape tests", () => {
4
+ it.concurrent("works", async () => {
5
+ const response = await batchScrape({
6
+ urls: ["http://firecrawl.dev"]
7
+ });
8
+
9
+ expect(response.body.data[0]).toHaveProperty("markdown");
10
+ expect(response.body.data[0].markdown).toContain("Firecrawl");
11
+ }, 180000);
12
+
13
+ if (!process.env.TEST_SUITE_SELF_HOSTED) {
14
+ describe("JSON format", () => {
15
+ it.concurrent("works", async () => {
16
+ const response = await batchScrape({
17
+ urls: ["http://firecrawl.dev"],
18
+ formats: ["json"],
19
+ jsonOptions: {
20
+ prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
21
+ schema: {
22
+ type: "object",
23
+ properties: {
24
+ company_mission: {
25
+ type: "string",
26
+ },
27
+ supports_sso: {
28
+ type: "boolean",
29
+ },
30
+ is_open_source: {
31
+ type: "boolean",
32
+ },
33
+ },
34
+ required: ["company_mission", "supports_sso", "is_open_source"],
35
+ },
36
+ },
37
+ });
38
+
39
+ expect(response.body.data[0]).toHaveProperty("json");
40
+ expect(response.body.data[0].json).toHaveProperty("company_mission");
41
+ expect(typeof response.body.data[0].json.company_mission).toBe("string");
42
+ expect(response.body.data[0].json).toHaveProperty("supports_sso");
43
+ expect(response.body.data[0].json.supports_sso).toBe(false);
44
+ expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
45
+ expect(response.body.data[0].json).toHaveProperty("is_open_source");
46
+ expect(response.body.data[0].json.is_open_source).toBe(true);
47
+ expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
48
+ }, 180000);
49
+ });
50
+ }
51
+ });
src/__tests__/snips/billing.test.ts ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib";
2
+
3
+ // const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms));
4
+ // const sleepForBatchBilling = () => sleep(20000);
5
+
6
+ // beforeAll(async () => {
7
+ // // Wait for previous test runs to stop billing processing
8
+ // if (!process.env.TEST_SUITE_SELF_HOSTED) {
9
+ // await sleep(40000);
10
+ // }
11
+ // }, 50000);
12
+
13
+ // describe("Billing tests", () => {
14
+ // if (process.env.TEST_SUITE_SELF_HOSTED) {
15
+ // it("dummy", () => {
16
+ // expect(true).toBe(true);
17
+ // });
18
+ // } else {
19
+ // it("bills scrape correctly", async () => {
20
+ // const rc1 = (await creditUsage()).remaining_credits;
21
+
22
+ // // Run all scrape operations in parallel with Promise.all
23
+ // await Promise.all([
24
+ // // scrape 1: regular fc.dev scrape (1 credit)
25
+ // scrape({
26
+ // url: "https://firecrawl.dev"
27
+ // }),
28
+
29
+ // // scrape 1.1: regular fc.dev scrape (1 credit)
30
+ // scrape({
31
+ // url: "https://firecrawl.dev"
32
+ // }),
33
+
34
+ // // scrape 2: fc.dev with json (5 credits)
35
+ // scrape({
36
+ // url: "https://firecrawl.dev",
37
+ // formats: ["json"],
38
+ // jsonOptions: {
39
+ // schema: {
40
+ // type: "object",
41
+ // properties: {
42
+ // is_open_source: { type: "boolean" },
43
+ // },
44
+ // required: ["is_open_source"],
45
+ // },
46
+ // },
47
+ // })
48
+ // ]);
49
+
50
+ // // sum: 7 credits
51
+
52
+ // await sleepForBatchBilling();
53
+
54
+ // const rc2 = (await creditUsage()).remaining_credits;
55
+
56
+ // expect(rc1 - rc2).toBe(7);
57
+ // }, 120000);
58
+
59
+ // it("bills batch scrape correctly", async () => {
60
+ // const rc1 = (await creditUsage()).remaining_credits;
61
+
62
+ // // Run both scrape operations in parallel with Promise.all
63
+ // const [scrape1, scrape2] = await Promise.all([
64
+ // // scrape 1: regular batch scrape with failing domain (2 credits)
65
+ // batchScrape({
66
+ // urls: [
67
+ // "https://firecrawl.dev",
68
+ // "https://mendable.ai",
69
+ // "https://thisdomaindoesnotexistandwillfail.fcr",
70
+ // ],
71
+ // }),
72
+
73
+ // // scrape 2: batch scrape with json (10 credits)
74
+ // batchScrape({
75
+ // urls: [
76
+ // "https://firecrawl.dev",
77
+ // "https://mendable.ai",
78
+ // "https://thisdomaindoesnotexistandwillfail.fcr",
79
+ // ],
80
+ // formats: ["json"],
81
+ // jsonOptions: {
82
+ // schema: {
83
+ // type: "object",
84
+ // properties: {
85
+ // four_word_summary: { type: "string" },
86
+ // },
87
+ // required: ["four_word_summary"],
88
+ // },
89
+ // },
90
+ // })
91
+ // ]);
92
+
93
+ // // sum: 12 credits
94
+
95
+ // await sleepForBatchBilling();
96
+
97
+ // const rc2 = (await creditUsage()).remaining_credits;
98
+
99
+ // expect(rc1 - rc2).toBe(12);
100
+ // }, 600000);
101
+
102
+ // it("bills crawl correctly", async () => {
103
+ // const rc1 = (await creditUsage()).remaining_credits;
104
+
105
+ // // Run both crawl operations in parallel with Promise.all
106
+ // const [crawl1, crawl2] = await Promise.all([
107
+ // // crawl 1: regular fc.dev crawl (x credits)
108
+ // crawl({
109
+ // url: "https://firecrawl.dev",
110
+ // }),
111
+
112
+ // // crawl 2: fc.dev crawl with json (5y credits)
113
+ // crawl({
114
+ // url: "https://firecrawl.dev",
115
+ // scrapeOptions: {
116
+ // formats: ["json"],
117
+ // jsonOptions: {
118
+ // schema: {
119
+ // type: "object",
120
+ // properties: {
121
+ // four_word_summary: { type: "string" },
122
+ // },
123
+ // required: ["four_word_summary"],
124
+ // },
125
+ // },
126
+ // }
127
+ // })
128
+ // ]);
129
+
130
+ // expect(crawl1.success).toBe(true);
131
+ // expect(crawl2.success).toBe(true);
132
+
133
+ // // sum: x+5y credits
134
+
135
+ // await sleepForBatchBilling();
136
+
137
+ // const rc2 = (await creditUsage()).remaining_credits;
138
+
139
+ // if (crawl1.success && crawl2.success) {
140
+ // expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
141
+ // }
142
+ // }, 600000);
143
+
144
+ // it("bills map correctly", async () => {
145
+ // const rc1 = (await creditUsage()).remaining_credits;
146
+ // await map({ url: "https://firecrawl.dev" });
147
+ // await sleepForBatchBilling();
148
+ // const rc2 = (await creditUsage()).remaining_credits;
149
+ // expect(rc1 - rc2).toBe(1);
150
+ // }, 60000);
151
+
152
+ // it("bills search correctly", async () => {
153
+ // const rc1 = (await creditUsage()).remaining_credits;
154
+
155
+ // const results = await search({
156
+ // query: "firecrawl"
157
+ // });
158
+
159
+ // await sleepForBatchBilling();
160
+
161
+ // const rc2 = (await creditUsage()).remaining_credits;
162
+
163
+ // expect(rc1 - rc2).toBe(results.length);
164
+ // }, 60000);
165
+
166
+ // it("bills extract correctly", async () => {
167
+ // const rc1 = (await tokenUsage()).remaining_tokens;
168
+
169
+ // await extract({
170
+ // urls: ["https://firecrawl.dev"],
171
+ // schema: {
172
+ // "type": "object",
173
+ // "properties": {
174
+ // "is_open_source": {
175
+ // "type": "boolean"
176
+ // }
177
+ // },
178
+ // "required": [
179
+ // "is_open_source"
180
+ // ]
181
+ // },
182
+ // origin: "api-sdk",
183
+ // });
184
+
185
+ // await sleepForBatchBilling();
186
+
187
+ // const rc2 = (await tokenUsage()).remaining_tokens;
188
+
189
+ // expect(rc1 - rc2).toBe(305);
190
+ // }, 300000);
191
+ // }
192
+ // });
193
+
194
+ // temporarily disabled
195
+ it("is mocked", () => {
196
+ expect(true).toBe(true);
197
+ });
src/__tests__/snips/crawl.test.ts ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { crawl } from "./lib";
2
+
3
+ describe("Crawl tests", () => {
4
+ it.concurrent("works", async () => {
5
+ await crawl({
6
+ url: "https://firecrawl.dev",
7
+ limit: 10,
8
+ });
9
+ }, 120000);
10
+
11
+ it.concurrent("filters URLs properly", async () => {
12
+ const res = await crawl({
13
+ url: "https://firecrawl.dev/pricing",
14
+ includePaths: ["^/pricing$"],
15
+ limit: 10,
16
+ });
17
+
18
+ expect(res.success).toBe(true);
19
+ if (res.success) {
20
+ expect(res.completed).toBe(1);
21
+ expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
22
+ }
23
+ }, 120000);
24
+
25
+ it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
26
+ const res = await crawl({
27
+ url: "https://firecrawl.dev/pricing",
28
+ includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
29
+ regexOnFullURL: true,
30
+ limit: 10,
31
+ });
32
+
33
+ expect(res.success).toBe(true);
34
+ if (res.success) {
35
+ expect(res.completed).toBe(1);
36
+ expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
37
+ }
38
+ }, 120000);
39
+
40
+ // TEMP: Flaky
41
+ // it.concurrent("discovers URLs properly when origin is not included", async () => {
42
+ // const res = await crawl({
43
+ // url: "https://firecrawl.dev",
44
+ // includePaths: ["^/blog"],
45
+ // ignoreSitemap: true,
46
+ // limit: 10,
47
+ // });
48
+
49
+ // expect(res.success).toBe(true);
50
+ // if (res.success) {
51
+ // expect(res.data.length).toBeGreaterThan(1);
52
+ // for (const page of res.data) {
53
+ // expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
54
+ // }
55
+ // }
56
+ // }, 300000);
57
+
58
+ // TEMP: Flaky
59
+ // it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
60
+ // const res = await crawl({
61
+ // url: "https://firecrawl.dev",
62
+ // ignoreSitemap: true,
63
+ // maxDiscoveryDepth: 1,
64
+ // limit: 10,
65
+ // });
66
+
67
+ // expect(res.success).toBe(true);
68
+ // if (res.success) {
69
+ // expect(res.data.length).toBeGreaterThan(1);
70
+ // for (const page of res.data) {
71
+ // expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
72
+ // }
73
+ // }
74
+ // }, 300000);
75
+ });
src/__tests__/snips/extract.test.ts ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { extract } from "./lib";
2
+
3
+ describe("Extract tests", () => {
4
+ if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {
5
+ it.concurrent("works", async () => {
6
+ const res = await extract({
7
+ urls: ["https://firecrawl.dev"],
8
+ schema: {
9
+ "type": "object",
10
+ "properties": {
11
+ "company_mission": {
12
+ "type": "string"
13
+ },
14
+ "is_open_source": {
15
+ "type": "boolean"
16
+ }
17
+ },
18
+ "required": [
19
+ "company_mission",
20
+ "is_open_source"
21
+ ]
22
+ },
23
+ origin: "api-sdk",
24
+ });
25
+
26
+ expect(res.data).toHaveProperty("company_mission");
27
+ expect(typeof res.data.company_mission).toBe("string")
28
+ expect(res.data).toHaveProperty("is_open_source");
29
+ expect(typeof res.data.is_open_source).toBe("boolean");
30
+ expect(res.data.is_open_source).toBe(true);
31
+ }, 60000);
32
+
33
+ it.concurrent("works with unsupported JSON schema parameters", async () => {
34
+ const res = await extract({
35
+ urls: ["https://firecrawl.dev"],
36
+ schema: {
37
+ "type": "object",
38
+ "properties": {
39
+ "company_name": {
40
+ "type": "string",
41
+ "pattern": "^[a-zA-Z0-9]+$"
42
+ },
43
+ },
44
+ "required": [
45
+ "company_name"
46
+ ]
47
+ },
48
+ origin: "api-sdk",
49
+ });
50
+
51
+ expect(res.data).toHaveProperty("company_name");
52
+ expect(typeof res.data.company_name).toBe("string")
53
+ }, 60000);
54
+ } else {
55
+ it.concurrent("dummy test", () => {
56
+ expect(true).toBe(true);
57
+ });
58
+ }
59
+ });
src/__tests__/snips/lib.ts ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { configDotenv } from "dotenv";
2
+ configDotenv();
3
+
4
+ import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
5
+ import request from "supertest";
6
+
7
+ // =========================================
8
+ // Configuration
9
+ // =========================================
10
+
11
+ const TEST_URL = "http://127.0.0.1:3002";
12
+
13
+ // =========================================
14
+ // Scrape API
15
+ // =========================================
16
+
17
+ async function scrapeRaw(body: ScrapeRequestInput) {
18
+ return await request(TEST_URL)
19
+ .post("/v1/scrape")
20
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
21
+ .set("Content-Type", "application/json")
22
+ .send(body);
23
+ }
24
+
25
+ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
26
+ expect(response.statusCode).toBe(200);
27
+ expect(response.body.success).toBe(true);
28
+ expect(typeof response.body.data).toBe("object");
29
+ }
30
+
31
+ export async function scrape(body: ScrapeRequestInput): Promise<Document> {
32
+ const raw = await scrapeRaw(body);
33
+ expectScrapeToSucceed(raw);
34
+ return raw.body.data;
35
+ }
36
+
37
+ export async function scrapeStatusRaw(jobId: string) {
38
+ return await request(TEST_URL)
39
+ .get("/v1/scrape/" + encodeURIComponent(jobId))
40
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
41
+ .send();
42
+ }
43
+
44
+ export async function scrapeStatus(jobId: string): Promise<Document> {
45
+ const raw = await scrapeStatusRaw(jobId);
46
+ expect(raw.statusCode).toBe(200);
47
+ expect(raw.body.success).toBe(true);
48
+ expect(typeof raw.body.data).toBe("object");
49
+ expect(raw.body.data).not.toBeNull();
50
+ expect(raw.body.data).toBeDefined();
51
+ return raw.body.data;
52
+ }
53
+
54
+ // =========================================
55
+ // Crawl API
56
+ // =========================================
57
+
58
+ async function crawlStart(body: CrawlRequestInput) {
59
+ return await request(TEST_URL)
60
+ .post("/v1/crawl")
61
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
62
+ .set("Content-Type", "application/json")
63
+ .send(body);
64
+ }
65
+
66
+ async function crawlStatus(id: string) {
67
+ return await request(TEST_URL)
68
+ .get("/v1/crawl/" + encodeURIComponent(id))
69
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
70
+ .send();
71
+ }
72
+
73
+ function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
74
+ expect(response.statusCode).toBe(200);
75
+ expect(response.body.success).toBe(true);
76
+ expect(typeof response.body.id).toBe("string");
77
+ }
78
+
79
+ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
80
+ expect(response.statusCode).toBe(200);
81
+ expect(response.body.success).toBe(true);
82
+ expect(typeof response.body.status).toBe("string");
83
+ expect(response.body.status).toBe("completed");
84
+ expect(response.body).toHaveProperty("data");
85
+ expect(Array.isArray(response.body.data)).toBe(true);
86
+ expect(response.body.data.length).toBeGreaterThan(0);
87
+ }
88
+
89
+ export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
90
+ const cs = await crawlStart(body);
91
+ expectCrawlStartToSucceed(cs);
92
+
93
+ let x;
94
+
95
+ do {
96
+ x = await crawlStatus(cs.body.id);
97
+ expect(x.statusCode).toBe(200);
98
+ expect(typeof x.body.status).toBe("string");
99
+ } while (x.body.status === "scraping");
100
+
101
+ expectCrawlToSucceed(x);
102
+ return x.body;
103
+ }
104
+
105
+ // =========================================
106
+ // Batch Scrape API
107
+ // =========================================
108
+
109
+ async function batchScrapeStart(body: BatchScrapeRequestInput) {
110
+ return await request(TEST_URL)
111
+ .post("/v1/batch/scrape")
112
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
113
+ .set("Content-Type", "application/json")
114
+ .send(body);
115
+ }
116
+
117
+ async function batchScrapeStatus(id: string) {
118
+ return await request(TEST_URL)
119
+ .get("/v1/batch/scrape/" + encodeURIComponent(id))
120
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
121
+ .send();
122
+ }
123
+
124
+ function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
125
+ expect(response.statusCode).toBe(200);
126
+ expect(response.body.success).toBe(true);
127
+ expect(typeof response.body.id).toBe("string");
128
+ }
129
+
130
+ function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
131
+ expect(response.statusCode).toBe(200);
132
+ expect(response.body.success).toBe(true);
133
+ expect(typeof response.body.status).toBe("string");
134
+ expect(response.body.status).toBe("completed");
135
+ expect(response.body).toHaveProperty("data");
136
+ expect(Array.isArray(response.body.data)).toBe(true);
137
+ expect(response.body.data.length).toBeGreaterThan(0);
138
+ }
139
+
140
+ export async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
141
+ const bss = await batchScrapeStart(body);
142
+ expectBatchScrapeStartToSucceed(bss);
143
+
144
+ let x;
145
+
146
+ do {
147
+ x = await batchScrapeStatus(bss.body.id);
148
+ expect(x.statusCode).toBe(200);
149
+ expect(typeof x.body.status).toBe("string");
150
+ } while (x.body.status === "scraping");
151
+
152
+ expectBatchScrapeToSucceed(x);
153
+ return x;
154
+ }
155
+
156
+ // =========================================
157
+ // Map API
158
+ // =========================================
159
+
160
+ export async function map(body: MapRequestInput) {
161
+ return await request(TEST_URL)
162
+ .post("/v1/map")
163
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
164
+ .set("Content-Type", "application/json")
165
+ .send(body);
166
+ }
167
+
168
+ export function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
169
+ expect(response.statusCode).toBe(200);
170
+ expect(response.body.success).toBe(true);
171
+ expect(Array.isArray(response.body.links)).toBe(true);
172
+ expect(response.body.links.length).toBeGreaterThan(0);
173
+ }
174
+
175
+ // =========================================
176
+ // Extract API
177
+ // =========================================
178
+
179
+ async function extractStart(body: ExtractRequestInput) {
180
+ return await request(TEST_URL)
181
+ .post("/v1/extract")
182
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
183
+ .set("Content-Type", "application/json")
184
+ .send(body);
185
+ }
186
+
187
+ async function extractStatus(id: string) {
188
+ return await request(TEST_URL)
189
+ .get("/v1/extract/" + encodeURIComponent(id))
190
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
191
+ .send();
192
+ }
193
+
194
+
195
+ function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
196
+ expect(response.statusCode).toBe(200);
197
+ expect(response.body.success).toBe(true);
198
+ expect(typeof response.body.id).toBe("string");
199
+ }
200
+
201
+ function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
202
+ expect(response.statusCode).toBe(200);
203
+ expect(response.body.success).toBe(true);
204
+ expect(typeof response.body.status).toBe("string");
205
+ expect(response.body.status).toBe("completed");
206
+ expect(response.body).toHaveProperty("data");
207
+ }
208
+
209
+ export async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
210
+ const es = await extractStart(body);
211
+ expectExtractStartToSucceed(es);
212
+
213
+ let x;
214
+
215
+ do {
216
+ x = await extractStatus(es.body.id);
217
+ expect(x.statusCode).toBe(200);
218
+ expect(typeof x.body.status).toBe("string");
219
+ } while (x.body.status === "processing");
220
+
221
+ expectExtractToSucceed(x);
222
+ return x.body;
223
+ }
224
+
225
+ // =========================================
226
+ // Search API
227
+ // =========================================
228
+
229
+ async function searchRaw(body: SearchRequestInput) {
230
+ return await request(TEST_URL)
231
+ .post("/v1/search")
232
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
233
+ .set("Content-Type", "application/json")
234
+ .send(body);
235
+ }
236
+
237
+ function expectSearchToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
238
+ expect(response.statusCode).toBe(200);
239
+ expect(response.body.success).toBe(true);
240
+ expect(typeof response.body.data).toBe("object");
241
+ expect(Array.isArray(response.body.data)).toBe(true);
242
+ expect(response.body.data.length).toBeGreaterThan(0);
243
+ }
244
+
245
+ export async function search(body: SearchRequestInput): Promise<Document[]> {
246
+ const raw = await searchRaw(body);
247
+ expectSearchToSucceed(raw);
248
+ return raw.body.data;
249
+ }
250
+
251
+ // =========================================
252
+ // Billing API
253
+ // =========================================
254
+
255
+ export async function creditUsage(): Promise<{ remaining_credits: number }> {
256
+ const req = (await request(TEST_URL)
257
+ .get("/v1/team/credit-usage")
258
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
259
+ .set("Content-Type", "application/json"));
260
+
261
+ if (req.status !== 200) {
262
+ throw req.body;
263
+ }
264
+
265
+ return req.body.data;
266
+ }
267
+
268
+ export async function tokenUsage(): Promise<{ remaining_tokens: number }> {
269
+ return (await request(TEST_URL)
270
+ .get("/v1/team/token-usage")
271
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
272
+ .set("Content-Type", "application/json")).body.data;
273
+ }
src/__tests__/snips/map.test.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { expectMapToSucceed, map } from "./lib";
2
+
3
+ describe("Map tests", () => {
4
+ it.concurrent("basic map succeeds", async () => {
5
+ const response = await map({
6
+ url: "http://firecrawl.dev",
7
+ });
8
+
9
+ expectMapToSucceed(response);
10
+ }, 10000);
11
+
12
+ it.concurrent("times out properly", async () => {
13
+ const response = await map({
14
+ url: "http://firecrawl.dev",
15
+ timeout: 1
16
+ });
17
+
18
+ expect(response.statusCode).toBe(408);
19
+ expect(response.body.success).toBe(false);
20
+ expect(response.body.error).toBe("Request timed out");
21
+ }, 10000);
22
+
23
+ it.concurrent("handles query parameters correctly", async () => {
24
+ let response = await map({
25
+ url: "https://www.hfea.gov.uk",
26
+ sitemapOnly: true,
27
+ useMock: "map-query-params",
28
+ });
29
+
30
+ expect(response.statusCode).toBe(200);
31
+ expect(response.body.success).toBe(true);
32
+ expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
33
+ }, 60000);
34
+ });
src/__tests__/snips/mocks/map-query-params.json ADDED
The diff for this file is too large to render. See raw diff
 
src/__tests__/snips/mocks/mocking-works-properly.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "time": 1735911273239,
4
+ "options": {
5
+ "url": "<fire-engine>/scrape",
6
+ "method": "POST",
7
+ "body": {
8
+ "url": "http://firecrawl.dev",
9
+ "engine": "chrome-cdp",
10
+ "instantReturn": true,
11
+ "skipTlsVerification": false,
12
+ "priority": 10,
13
+ "mobile": false,
14
+ "timeout": 15000
15
+ },
16
+ "headers": {},
17
+ "ignoreResponse": false,
18
+ "ignoreFailure": false,
19
+ "tryCount": 3
20
+ },
21
+ "result": {
22
+ "status": 200,
23
+ "headers": {},
24
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp\",\"processing\":true}"
25
+ }
26
+ },
27
+ {
28
+ "time": 1735911273354,
29
+ "options": {
30
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
31
+ "method": "GET",
32
+ "headers": {},
33
+ "ignoreResponse": false,
34
+ "ignoreFailure": false,
35
+ "tryCount": 1
36
+ },
37
+ "result": {
38
+ "status": 200,
39
+ "headers": {},
40
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"prioritized\",\"processing\":true}"
41
+ }
42
+ },
43
+ {
44
+ "time": 1735911273720,
45
+ "options": {
46
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
47
+ "method": "GET",
48
+ "headers": {},
49
+ "ignoreResponse": false,
50
+ "ignoreFailure": false,
51
+ "tryCount": 1
52
+ },
53
+ "result": {
54
+ "status": 200,
55
+ "headers": {},
56
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
57
+ }
58
+ },
59
+ {
60
+ "time": 1735911274092,
61
+ "options": {
62
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
63
+ "method": "GET",
64
+ "headers": {},
65
+ "ignoreResponse": false,
66
+ "ignoreFailure": false,
67
+ "tryCount": 1
68
+ },
69
+ "result": {
70
+ "status": 200,
71
+ "headers": {},
72
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
73
+ }
74
+ },
75
+ {
76
+ "time": 1735911274467,
77
+ "options": {
78
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
79
+ "method": "GET",
80
+ "headers": {},
81
+ "ignoreResponse": false,
82
+ "ignoreFailure": false,
83
+ "tryCount": 1
84
+ },
85
+ "result": {
86
+ "status": 200,
87
+ "headers": {},
88
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
89
+ }
90
+ },
91
+ {
92
+ "time": 1735911274947,
93
+ "options": {
94
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
95
+ "method": "GET",
96
+ "headers": {},
97
+ "ignoreResponse": false,
98
+ "ignoreFailure": false,
99
+ "tryCount": 1
100
+ },
101
+ "result": {
102
+ "status": 200,
103
+ "headers": {},
104
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"completed\",\"processing\":false,\"timeTaken\":1.204,\"content\":\"<!DOCTYPE html><html lang=\\\"en\\\"><body><p>this is fake data coming from the mocking system!</p></body></html>\",\"url\":\"https://www.firecrawl.dev/\",\"screenshots\":[],\"actionContent\":[],\"pageStatusCode\":200,\"responseHeaders\":{\"X-DNS-Prefetch-Control\":\"off\",\"age\":\"0\",\"cache-control\":\"private, no-cache, no-store, max-age=0, must-revalidate\",\"content-encoding\":\"br\",\"content-type\":\"text/html; charset=utf-8\",\"date\":\"Fri, 03 Jan 2025 13:34:34 GMT\",\"link\":\"</_next/static/media/171883e03d2067b6-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/a34f9d1faa5f3315-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/c4c7b0ec92b72e30-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\"\",\"permissions-policy\":\"keyboard-map=(), attribution-reporting=(), run-ad-auction=(), private-state-token-redemption=(), private-state-token-issuance=(), join-ad-interest-group=(), idle-detection=(), compute-pressure=(), browsing-topics=()\",\"server\":\"Vercel\",\"strict-transport-security\":\"max-age=63072000\",\"vary\":\"RSC, Next-Router-State-Tree, Next-Router-Prefetch\",\"x-matched-path\":\"/\",\"x-powered-by\":\"Next.js\",\"x-vercel-cache\":\"MISS\",\"x-vercel-id\":\"iad1::iad1::bs88l-1735911273932-1f7bba7a8b45\"},\"invalidTlsCert\":false,\"file\":null}"
105
+ }
106
+ }
107
+ ]
src/__tests__/snips/scrape.test.ts ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { scrape, scrapeStatus } from "./lib";

// End-to-end tests for the scrape API. Suffixes in describe titles mark
// infrastructure dependencies: "f-e dependant" = requires fire-engine,
// "sb dependant" = requires the screenshot backend. Several groups are
// gated on env vars so the suite also runs against self-hosted setups.
describe("Scrape tests", () => {
  it.concurrent("mocking works properly", async () => {
    // depends on falsified mock mocking-works-properly
    // this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
    // that as its actual markdown output

    const response = await scrape({
      url: "http://firecrawl.dev",
      useMock: "mocking-works-properly",
    });

    expect(response.markdown).toBe(
      "this is fake data coming from the mocking system!",
    );
  }, 30000);

  it.concurrent("works", async () => {
    const response = await scrape({
      url: "http://firecrawl.dev"
    });

    expect(response.markdown).toContain("Firecrawl");
  }, 30000);

  it.concurrent("scrape status works", async () => {
    const response = await scrape({
      url: "http://firecrawl.dev"
    });

    expect(response.markdown).toContain("Firecrawl");

    // The status endpoint must replay the exact document that was scraped.
    const status = await scrapeStatus(response.metadata.scrapeId!);
    expect(JSON.stringify(status)).toBe(JSON.stringify(response));
  }, 60000);

  it.concurrent("handles non-UTF-8 encodings", async () => {
    // Shift-JIS page; verifies charset detection/transcoding.
    const response = await scrape({
      url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
    });

    expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
  }, 30000);

  if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
    it.concurrent("self-hosted proxy works", async () => {
      const response = await scrape({
        url: "https://icanhazip.com"
      });

      // icanhazip echoes the caller's IP; it must match the proxy host.
      expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
    }, 30000);
  }

  if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
    it.concurrent("waitFor works", async () => {
      const response = await scrape({
        url: "http://firecrawl.dev",
        waitFor: 2000,
      });

      expect(response.markdown).toContain("Firecrawl");
    }, 30000);
  }

  describe("JSON scrape support", () => {
    it.concurrent("returns parseable JSON", async () => {
      const response = await scrape({
        url: "https://jsonplaceholder.typicode.com/todos/1",
        formats: ["rawHtml"],
      });

      const obj = JSON.parse(response.rawHtml!);
      expect(obj.id).toBe(1);
    }, 30000);
  });

  if (!process.env.TEST_SUITE_SELF_HOSTED) {
    // describe("Ad blocking (f-e dependant)", () => {
    //   it.concurrent("blocks ads by default", async () => {
    //     const response = await scrape({
    //       url: "https://www.allrecipes.com/recipe/18185/yum/",
    //     });

    //     expect(response.markdown).not.toContain(".g.doubleclick.net/");
    //   }, 30000);

    //   it.concurrent("doesn't block ads if explicitly disabled", async () => {
    //     const response = await scrape({
    //       url: "https://www.allrecipes.com/recipe/18185/yum/",
    //       blockAds: false,
    //     });

    //     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
    //   }, 30000);
    // });

    describe("Change Tracking format", () => {
      it.concurrent("works", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
      }, 30000);

      it.concurrent("includes git diff when requested", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["git-diff"]
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        // diff fields are only present when the page actually changed
        // since the previous scrape.
        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.diff).toBeDefined();
          expect(response.changeTracking?.diff?.text).toBeDefined();
          expect(response.changeTracking?.diff?.json).toBeDefined();
          expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array);
        }
      }, 30000);

      it.concurrent("includes structured output when requested", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["json"],
            prompt: "Summarize the changes between the previous and current content",
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.json).toBeDefined();
        }
      }, 30000);

      it.concurrent("supports schema-based extraction for change tracking", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["json"],
            schema: {
              type: "object",
              properties: {
                pricing: {
                  type: "object",
                  properties: {
                    amount: { type: "number" },
                    currency: { type: "string" }
                  }
                },
                features: {
                  type: "array",
                  items: { type: "string" }
                }
              }
            }
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        // Each schema field is reported as an { old, new } pair when changed.
        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.json).toBeDefined();
          if (response.changeTracking?.json.pricing) {
            expect(response.changeTracking?.json.pricing).toHaveProperty("old");
            expect(response.changeTracking?.json.pricing).toHaveProperty("new");
          }
          if (response.changeTracking?.json.features) {
            expect(response.changeTracking?.json.features).toHaveProperty("old");
            expect(response.changeTracking?.json.features).toHaveProperty("new");
          }
        }
      }, 30000);

      it.concurrent("supports both git-diff and structured modes together", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["git-diff", "json"],
            schema: {
              type: "object",
              properties: {
                summary: { type: "string" },
                changes: { type: "array", items: { type: "string" } }
              }
            }
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.diff).toBeDefined();
          expect(response.changeTracking?.diff?.text).toBeDefined();
          expect(response.changeTracking?.diff?.json).toBeDefined();

          expect(response.changeTracking?.json).toBeDefined();
          expect(response.changeTracking?.json).toHaveProperty("summary");
          expect(response.changeTracking?.json).toHaveProperty("changes");
        }
      }, 30000);
    });

    describe("Location API (f-e dependant)", () => {
      it.concurrent("works without specifying an explicit location", async () => {
        await scrape({
          url: "https://iplocation.com",
        });
      }, 30000);

      it.concurrent("works with country US", async () => {
        const response = await scrape({
          url: "https://iplocation.com",
          location: { country: "US" },
        });

        expect(response.markdown).toContain("| Country | United States |");
      }, 30000);
    });

    describe("Screenshot (f-e/sb dependant)", () => {
      it.concurrent("screenshot format works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["screenshot"]
        });

        expect(typeof response.screenshot).toBe("string");
      }, 30000);

      it.concurrent("screenshot@fullPage format works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["screenshot@fullPage"]
        });

        expect(typeof response.screenshot).toBe("string");
      }, 30000);
    });

    describe("Proxy API (f-e dependant)", () => {
      it.concurrent("undefined works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
        });
      }, 30000);

      it.concurrent("basic works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
          proxy: "basic",
        });
      }, 30000);

      it.concurrent("stealth works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
          proxy: "stealth",
          timeout: 120000,
        });
      }, 130000);
    });

    // Temporarily disabled, too flaky
    // describe("PDF (f-e dependant)", () => {
    //   it.concurrent("works for PDFs behind anti-bot", async () => {
    //     const response = await scrape({
    //       url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
    //     });

    //     expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
    //   }, 60000);
    // });
  }

  if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {
    describe("JSON format", () => {
      it.concurrent("works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["json"],
          jsonOptions: {
            prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
            schema: {
              type: "object",
              properties: {
                company_mission: {
                  type: "string",
                },
                supports_sso: {
                  type: "boolean",
                },
                is_open_source: {
                  type: "boolean",
                },
              },
              required: ["company_mission", "supports_sso", "is_open_source"],
            },
          },
        });

        expect(response).toHaveProperty("json");
        expect(response.json).toHaveProperty("company_mission");
        expect(typeof response.json.company_mission).toBe("string");
        expect(response.json).toHaveProperty("supports_sso");
        expect(response.json.supports_sso).toBe(false);
        expect(typeof response.json.supports_sso).toBe("boolean");
        expect(response.json).toHaveProperty("is_open_source");
        expect(response.json.is_open_source).toBe(true);
        expect(typeof response.json.is_open_source).toBe("boolean");
      }, 30000);
    });
  }
});
src/__tests__/snips/search.test.ts ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { search } from "./lib";
2
+
3
+ describe("Search tests", () => {
4
+ it.concurrent("works", async () => {
5
+ await search({
6
+ query: "firecrawl"
7
+ });
8
+ }, 60000);
9
+
10
+ it.concurrent("works with scrape", async () => {
11
+ const res = await search({
12
+ query: "firecrawl",
13
+ limit: 5,
14
+ scrapeOptions: {
15
+ formats: ["markdown"],
16
+ },
17
+ });
18
+
19
+ for (const doc of res) {
20
+ expect(doc.markdown).toBeDefined();
21
+ }
22
+ }, 60000);
23
+ });
src/__tests__/snips/utils/collect-mocks.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const path = require("path");
const fs = require("fs");

// Gathers every per-request mock file written by scrapeURL into a single
// named fixture: node collect-mocks.js <fixture-name>
const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");

const contents = fs
  .readdirSync(mocksDirPath)
  .map((file) =>
    JSON.parse(fs.readFileSync(path.join(mocksDirPath, file), "utf8")),
  );

const outputPath = path.join(
  __dirname,
  "../mocks/" + process.argv[2] + ".json",
);
fs.writeFileSync(outputPath, JSON.stringify(contents, undefined, 4));
src/control.ts ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ // ! IN CASE OPENAI goes down, then activate the fallback -> true
2
+ export const is_fallback = false;
src/controllers/__tests__/crawl.test.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { crawlController } from "../v0/crawl";
2
+ import { Request, Response } from "express";
3
+ import { authenticateUser } from "../auth"; // Ensure this import is correct
4
+ import { createIdempotencyKey } from "../../services/idempotency/create";
5
+ import { validateIdempotencyKey } from "../../services/idempotency/validate";
6
+ import { v4 as uuidv4 } from "uuid";
7
+
8
+ jest.mock("../auth", () => ({
9
+ authenticateUser: jest.fn().mockResolvedValue({
10
+ success: true,
11
+ team_id: "team123",
12
+ error: null,
13
+ status: 200,
14
+ }),
15
+ reduce: jest.fn(),
16
+ }));
17
+ jest.mock("../../services/idempotency/validate");
18
+
19
+ describe("crawlController", () => {
20
+ it("should prevent duplicate requests using the same idempotency key", async () => {
21
+ const req = {
22
+ headers: {
23
+ "x-idempotency-key": await uuidv4(),
24
+ Authorization: `Bearer ${process.env.TEST_API_KEY}`,
25
+ },
26
+ body: {
27
+ url: "https://mendable.ai",
28
+ },
29
+ } as unknown as Request;
30
+ const res = {
31
+ status: jest.fn().mockReturnThis(),
32
+ json: jest.fn(),
33
+ } as unknown as Response;
34
+
35
+ // Mock the idempotency key validation to return false for the second call
36
+ (validateIdempotencyKey as jest.Mock)
37
+ .mockResolvedValueOnce(true)
38
+ .mockResolvedValueOnce(false);
39
+
40
+ // First request should succeed
41
+ await crawlController(req, res);
42
+ expect(res.status).not.toHaveBeenCalledWith(409);
43
+
44
+ // Second request with the same key should fail
45
+ await crawlController(req, res);
46
+ expect(res.status).toHaveBeenCalledWith(409);
47
+ expect(res.json).toHaveBeenCalledWith({
48
+ error: "Idempotency key already used",
49
+ });
50
+ });
51
+ });
src/controllers/auth.ts ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { parseApi } from "../lib/parseApi";
2
+ import { getRateLimiter } from "../services/rate-limiter";
3
+ import {
4
+ AuthResponse,
5
+ NotificationType,
6
+ RateLimiterMode,
7
+ } from "../types";
8
+ import { supabase_rr_service, supabase_service } from "../services/supabase";
9
+ import { withAuth } from "../lib/withAuth";
10
+ import { RateLimiterRedis } from "rate-limiter-flexible";
11
+ import { sendNotification } from "../services/notification/email_notification";
12
+ import { logger } from "../lib/logger";
13
+ import { redlock } from "../services/redlock";
14
+ import { deleteKey, getValue } from "../services/redis";
15
+ import { setValue } from "../services/redis";
16
+ import { validate } from "uuid";
17
+ import * as Sentry from "@sentry/node";
18
+ import { AuthCreditUsageChunk, AuthCreditUsageChunkFromTeam } from "./v1/types";
19
+ // const { data, error } = await supabase_service
20
+ // .from('api_keys')
21
+ // .select(`
22
+ // key,
23
+ // team_id,
24
+ // teams (
25
+ // subscriptions (
26
+ // price_id
27
+ // )
28
+ // )
29
+ // `)
30
+ // .eq('key', normalizedApi)
31
+ // .limit(1)
32
+ // .single();
33
+ function normalizedApiIsUuid(potentialUuid: string): boolean {
34
+ // Check if the string is a valid UUID
35
+ return validate(potentialUuid);
36
+ }
37
+
38
/**
 * Writes the cached ACUC (auth + credit usage chunk) for an API key, under a
 * distributed lock so concurrent writers do not clobber each other.
 *
 * @param api_key    raw API key the cache entry belongs to
 * @param is_extract selects the "extract" vs "scrape" cache variant
 * @param acuc       the new value, or an updater function that receives the
 *                   currently-cached value (parsed, possibly null) and returns
 *                   the value to store — returning null skips the write
 *
 * Errors (including lock acquisition failures) are logged, never thrown.
 */
export async function setCachedACUC(
  api_key: string,
  is_extract: boolean,
  acuc:
    | AuthCreditUsageChunk
    | null
    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null),
) {
  const cacheKeyACUC = `acuc_${api_key}_${is_extract ? "extract" : "scrape"}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        // Resolve the updater against whatever is currently cached.
        acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null"));

        if (acuc === null) {
          // Updater declined to write; still surface a lost lock.
          if (signal.aborted) {
            throw signal.error;
          }

          return;
        }
      }

      // Abort if the redlock lease was lost before we write.
      if (signal.aborted) {
        throw signal.error;
      }

      // Cache for 10 minutes. - mogery
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
}
74
+
75
+ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsageChunk = (team_id, is_extract) => ({
76
+ api_key: "preview",
77
+ team_id,
78
+ sub_id: "bypass",
79
+ sub_current_period_start: new Date().toISOString(),
80
+ sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(),
81
+ sub_user_id: "bypass",
82
+ price_id: "bypass",
83
+ rate_limits: {
84
+ crawl: 2,
85
+ scrape: 10,
86
+ extract: 10,
87
+ search: 5,
88
+ map: 5,
89
+ preview: 5,
90
+ crawlStatus: 500,
91
+ extractStatus: 500,
92
+ extractAgentPreview: 1,
93
+ scrapeAgentPreview: 5,
94
+ },
95
+ price_credits: 99999999,
96
+ credits_used: 0,
97
+ coupon_credits: 99999999,
98
+ adjusted_credits_used: 0,
99
+ remaining_credits: 99999999,
100
+ total_credits_sum: 99999999,
101
+ plan_priority: {
102
+ bucketLimit: 25,
103
+ planModifier: 0.1,
104
+ },
105
+ concurrency: is_extract ? 200 : 2,
106
+ is_extract,
107
+ });
108
+
109
+ const mockACUC: () => AuthCreditUsageChunk = () => ({
110
+ api_key: "bypass",
111
+ team_id: "bypass",
112
+ sub_id: "bypass",
113
+ sub_current_period_start: new Date().toISOString(),
114
+ sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(),
115
+ sub_user_id: "bypass",
116
+ price_id: "bypass",
117
+ rate_limits: {
118
+ crawl: 99999999,
119
+ scrape: 99999999,
120
+ extract: 99999999,
121
+ search: 99999999,
122
+ map: 99999999,
123
+ preview: 99999999,
124
+ crawlStatus: 99999999,
125
+ extractStatus: 99999999,
126
+ extractAgentPreview: 99999999,
127
+ scrapeAgentPreview: 99999999,
128
+ },
129
+ price_credits: 99999999,
130
+ credits_used: 0,
131
+ coupon_credits: 99999999,
132
+ adjusted_credits_used: 0,
133
+ remaining_credits: 99999999,
134
+ total_credits_sum: 99999999,
135
+ plan_priority: {
136
+ bucketLimit: 25,
137
+ planModifier: 0.1,
138
+ },
139
+ concurrency: 99999999,
140
+ is_extract: false,
141
+ });
142
+
143
+ export async function getACUC(
144
+ api_key: string,
145
+ cacheOnly = false,
146
+ useCache = true,
147
+ mode?: RateLimiterMode,
148
+ ): Promise<AuthCreditUsageChunk | null> {
149
+ let isExtract =
150
+ mode === RateLimiterMode.Extract ||
151
+ mode === RateLimiterMode.ExtractStatus;
152
+
153
+ if (api_key === process.env.PREVIEW_TOKEN) {
154
+ const acuc = mockPreviewACUC(api_key, isExtract);
155
+ acuc.is_extract = isExtract;
156
+ return acuc;
157
+ }
158
+
159
+ if (process.env.USE_DB_AUTHENTICATION !== "true") {
160
+ const acuc = mockACUC();
161
+ acuc.is_extract = isExtract;
162
+ return acuc;
163
+ }
164
+
165
+ const cacheKeyACUC = `acuc_${api_key}_${isExtract ? "extract" : "scrape"}`;
166
+
167
+ if (useCache) {
168
+ const cachedACUC = await getValue(cacheKeyACUC);
169
+ if (cachedACUC !== null) {
170
+ return JSON.parse(cachedACUC);
171
+ }
172
+ }
173
+
174
+ if (!cacheOnly) {
175
+ let data;
176
+ let error;
177
+ let retries = 0;
178
+ const maxRetries = 5;
179
+ while (retries < maxRetries) {
180
+ const client =
181
+ Math.random() > (2/3) ? supabase_rr_service : supabase_service;
182
+ ({ data, error } = await client.rpc(
183
+ "auth_credit_usage_chunk_30",
184
+ { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
185
+ { get: true },
186
+ ));
187
+
188
+ if (!error) {
189
+ break;
190
+ }
191
+
192
+ logger.warn(
193
+ `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
194
+ { error }
195
+ );
196
+ retries++;
197
+ if (retries === maxRetries) {
198
+ throw new Error(
199
+ "Failed to retrieve authentication and credit usage data after 3 attempts: " +
200
+ JSON.stringify(error),
201
+ );
202
+ }
203
+
204
+ // Wait for a short time before retrying
205
+ await new Promise((resolve) => setTimeout(resolve, 200));
206
+ }
207
+
208
+ const chunk: AuthCreditUsageChunk | null =
209
+ data.length === 0 ? null : data[0].team_id === null ? null : data[0];
210
+
211
+ // NOTE: Should we cache null chunks? - mogery
212
+ if (chunk !== null && useCache) {
213
+ setCachedACUC(api_key, isExtract, chunk);
214
+ }
215
+
216
+ return chunk ? { ...chunk, is_extract: isExtract } : null;
217
+ } else {
218
+ return null;
219
+ }
220
+ }
221
+
222
/**
 * Team-keyed twin of setCachedACUC: writes the cached ACUC for a team id
 * under a distributed lock.
 *
 * @param team_id    team the cache entry belongs to
 * @param is_extract selects the "extract" vs "scrape" cache variant
 * @param acuc       the new value, or an updater function receiving the
 *                   currently-cached value (parsed, possibly null);
 *                   returning null skips the write
 *
 * Errors are logged, never thrown.
 */
export async function setCachedACUCTeam(
  team_id: string,
  is_extract: boolean,
  acuc:
    | AuthCreditUsageChunkFromTeam
    | null
    | ((acuc: AuthCreditUsageChunkFromTeam) => AuthCreditUsageChunkFromTeam | null),
) {
  const cacheKeyACUC = `acuc_team_${team_id}_${is_extract ? "extract" : "scrape"}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        // Resolve the updater against whatever is currently cached.
        acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null"));

        if (acuc === null) {
          // Updater declined to write; still surface a lost lock.
          if (signal.aborted) {
            throw signal.error;
          }

          return;
        }
      }

      // Abort if the redlock lease was lost before we write.
      if (signal.aborted) {
        throw signal.error;
      }

      // Cache for 10 minutes. - mogery
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
}
258
+
259
+ export async function getACUCTeam(
260
+ team_id: string,
261
+ cacheOnly = false,
262
+ useCache = true,
263
+ mode?: RateLimiterMode,
264
+ ): Promise<AuthCreditUsageChunkFromTeam | null> {
265
+ let isExtract =
266
+ mode === RateLimiterMode.Extract ||
267
+ mode === RateLimiterMode.ExtractStatus;
268
+
269
+ if (team_id.startsWith("preview")) {
270
+ const acuc = mockPreviewACUC(team_id, isExtract);
271
+ return acuc;
272
+ }
273
+
274
+ if (process.env.USE_DB_AUTHENTICATION !== "true") {
275
+ const acuc = mockACUC();
276
+ acuc.is_extract = isExtract;
277
+ return acuc;
278
+ }
279
+
280
+ const cacheKeyACUC = `acuc_team_${team_id}_${isExtract ? "extract" : "scrape"}`;
281
+
282
+ if (useCache) {
283
+ const cachedACUC = await getValue(cacheKeyACUC);
284
+ if (cachedACUC !== null) {
285
+ return JSON.parse(cachedACUC);
286
+ }
287
+ }
288
+
289
+ if (!cacheOnly) {
290
+ let data;
291
+ let error;
292
+ let retries = 0;
293
+ const maxRetries = 5;
294
+
295
+ while (retries < maxRetries) {
296
+ const client =
297
+ Math.random() > (2/3) ? supabase_rr_service : supabase_service;
298
+ ({ data, error } = await client.rpc(
299
+ "auth_credit_usage_chunk_30_from_team",
300
+ { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
301
+ { get: true },
302
+ ));
303
+
304
+ if (!error) {
305
+ break;
306
+ }
307
+
308
+ logger.warn(
309
+ `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
310
+ { error }
311
+ );
312
+ retries++;
313
+ if (retries === maxRetries) {
314
+ throw new Error(
315
+ "Failed to retrieve authentication and credit usage data after 3 attempts: " +
316
+ JSON.stringify(error),
317
+ );
318
+ }
319
+
320
+ // Wait for a short time before retrying
321
+ await new Promise((resolve) => setTimeout(resolve, 200));
322
+ }
323
+
324
+ const chunk: AuthCreditUsageChunk | null =
325
+ data.length === 0 ? null : data[0].team_id === null ? null : data[0];
326
+
327
+ // NOTE: Should we cache null chunks? - mogery
328
+ if (chunk !== null && useCache) {
329
+ setCachedACUCTeam(team_id, isExtract, chunk);
330
+ }
331
+
332
+ return chunk ? { ...chunk, is_extract: isExtract } : null;
333
+ } else {
334
+ return null;
335
+ }
336
+ }
337
+
338
+ export async function clearACUC(api_key: string): Promise<void> {
339
+ // Delete cache for all rate limiter modes
340
+ const modes = [true, false];
341
+ await Promise.all(
342
+ modes.map(async (mode) => {
343
+ const cacheKey = `acuc_${api_key}_${mode ? "extract" : "scrape"}`;
344
+ await deleteKey(cacheKey);
345
+ }),
346
+ );
347
+
348
+ // Also clear the base cache key
349
+ await deleteKey(`acuc_${api_key}`);
350
+ }
351
+
352
+ export async function clearACUCTeam(team_id: string): Promise<void> {
353
+ // Delete cache for all rate limiter modes
354
+ const modes = [true, false];
355
+ await Promise.all(
356
+ modes.map(async (mode) => {
357
+ const cacheKey = `acuc_team_${team_id}_${mode ? "extract" : "scrape"}`;
358
+ await deleteKey(cacheKey);
359
+ }),
360
+ );
361
+
362
+ // Also clear the base cache key
363
+ await deleteKey(`acuc_team_${team_id}`);
364
+ }
365
+
366
/**
 * Express-facing authentication entry point. Wraps supaAuthenticateUser with
 * withAuth, supplying the bypass response used when full auth is not in
 * effect (see withAuth for the exact condition).
 */
export async function authenticateUser(
  req,
  res,
  mode?: RateLimiterMode,
): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser, {
    success: true,
    chunk: null,
    team_id: "bypass",
  })(req, res, mode);
}
377
+
378
/**
 * Full authentication + rate limiting for an incoming request.
 *
 * Token sources: the Authorization header, or (for websocket clients) the
 * sec-websocket-protocol header. Preview tokens are rate-limited per
 * IP+token; real API keys are validated as UUIDs, resolved to an ACUC via
 * getACUC, and rate-limited per team.
 *
 * @returns success with team_id/chunk, or a failure with an HTTP status
 *          (401 for bad/missing tokens, 429 when the rate limit is hit)
 */
export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode,
): Promise<AuthResponse> {
  const authHeader =
    req.headers.authorization ??
    (req.headers["sec-websocket-protocol"]
      ? `Bearer ${req.headers["sec-websocket-protocol"]}`
      : null);
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
  }
  const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
  if (!token) {
    return {
      success: false,
      error: "Unauthorized: Token missing",
      status: 401,
    };
  }

  // Client IP, preferring the preview/proxy headers over the socket address.
  const incomingIP = (req.headers["x-preview-ip"] || req.headers["x-forwarded-for"] ||
    req.socket.remoteAddress) as string;
  // Preview traffic is rate-limited per IP+token pair.
  const iptoken = incomingIP + token;

  let rateLimiter: RateLimiterRedis;
  let subscriptionData: { team_id: string} | null = null;
  let normalizedApi: string;

  let teamId: string | null = null;
  let priceId: string | null = null;
  let chunk: AuthCreditUsageChunk | null = null;
  // Hard-disabled legacy playground token.
  if (token == "this_is_just_a_preview_token") {
    throw new Error(
      "Unauthenticated Playground calls are temporarily disabled due to abuse. Please sign up.",
    );
  }
  if (token == process.env.PREVIEW_TOKEN) {
    // Preview token: pick a limiter by mode; no ACUC lookup is performed.
    if (mode == RateLimiterMode.CrawlStatus) {
      rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
    } else if (mode == RateLimiterMode.ExtractStatus) {
      rateLimiter = getRateLimiter(RateLimiterMode.ExtractStatus, token);
    } else {
      rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
    }
    teamId = `preview_${iptoken}`;
  } else {
    // Real API key: must be a UUID after normalization.
    normalizedApi = parseApi(token);
    if (!normalizedApiIsUuid(normalizedApi)) {
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    chunk = await getACUC(normalizedApi, false, true, mode);

    if (chunk === null) {
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    teamId = chunk.team_id;
    priceId = chunk.price_id;

    subscriptionData = {
      team_id: teamId,
    };
    // Rate limits come from the team's plan, keyed by the requested mode.
    rateLimiter = getRateLimiter(
      mode ?? RateLimiterMode.Crawl,
      chunk.rate_limits,
    );
  }

  // Consume rate-limit points against the team (or IP+token for previews).
  const team_endpoint_token =
    token === process.env.PREVIEW_TOKEN ? iptoken : teamId;

  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
    logger.error(`Rate limit exceeded: ${rateLimiterRes}`, {
      teamId,
      priceId,
      mode,
      rateLimits: chunk?.rate_limits,
      rateLimiterRes,
    });
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

    // We can only send a rate limit email every 7 days, send notification already has the date in between checking
    const startDate = new Date();
    const endDate = new Date();
    endDate.setDate(endDate.getDate() + 7);

    // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());

    return {
      success: false,
      error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }

  // Preview tokens succeed early for the allowed modes, with no chunk.
  if (
    token === process.env.PREVIEW_TOKEN &&
    (mode === RateLimiterMode.Scrape ||
      mode === RateLimiterMode.Preview ||
      mode === RateLimiterMode.Map ||
      mode === RateLimiterMode.Crawl ||
      mode === RateLimiterMode.CrawlStatus ||
      mode === RateLimiterMode.Extract ||
      mode === RateLimiterMode.Search)
  ) {
    return {
      success: true,
      team_id: `preview_${iptoken}`,
      chunk: null,
    };
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){
    //   return { success: true, team_id: "preview" };
    // }
    // if(process.env.ENV !== "production") {
    //   return { success: true, team_id: "preview" };
    // }

    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
  }

  return {
    success: true,
    team_id: teamId ?? undefined,
    chunk,
  };
}
src/controllers/v0/admin/acuc-cache-clear.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Request, Response } from "express";
2
+ import { supabase_service } from "../../../services/supabase";
3
+ import { clearACUC, clearACUCTeam } from "../../auth";
4
+ import { logger } from "../../../lib/logger";
5
+
6
+ export async function acucCacheClearController(req: Request, res: Response) {
7
+ try {
8
+ const team_id: string = req.body.team_id;
9
+
10
+ const keys = await supabase_service
11
+ .from("api_keys")
12
+ .select("*")
13
+ .eq("team_id", team_id);
14
+
15
+ await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
16
+ await clearACUCTeam(team_id);
17
+
18
+ logger.info(`ACUC cache cleared for team ${team_id}`);
19
+ res.json({ ok: true });
20
+ } catch (error) {
21
+ logger.error(`Error clearing ACUC cache via API route: ${error}`);
22
+ res.status(500).json({ error: "Internal server error" });
23
+ }
24
+ }