Echo-AI-official committed on
Commit
0e759d2
·
verified ·
1 Parent(s): 5ca4a31

Upload 280 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +4 -0
  2. .env +24 -0
  3. .env.example +80 -0
  4. .env.local +15 -0
  5. .gitattributes +2 -35
  6. .gitignore +13 -0
  7. .prettierrc +3 -0
  8. Dockerfile +75 -0
  9. docker-entrypoint.sh +24 -0
  10. jest.config.js +8 -0
  11. jest.setup.js +1 -0
  12. openapi-v0.json +924 -0
  13. openapi.json +929 -0
  14. package.json +156 -0
  15. pnpm-lock.yaml +0 -0
  16. requests.http +127 -0
  17. sharedLibs/go-html-to-md/.gitignore +2 -0
  18. sharedLibs/go-html-to-md/README.md +7 -0
  19. sharedLibs/go-html-to-md/go.mod +16 -0
  20. sharedLibs/go-html-to-md/go.sum +64 -0
  21. sharedLibs/go-html-to-md/html-to-markdown.go +25 -0
  22. sharedLibs/html-transformer/.gitignore +1 -0
  23. sharedLibs/html-transformer/Cargo.lock +1235 -0
  24. sharedLibs/html-transformer/Cargo.toml +15 -0
  25. sharedLibs/html-transformer/src/lib.rs +394 -0
  26. src/__tests__/concurrency-limit.test.ts +209 -0
  27. src/__tests__/deep-research/unit/deep-research-redis.test.ts +135 -0
  28. src/__tests__/e2e_extract/index.test.ts +340 -0
  29. src/__tests__/e2e_full_withAuth/index.test.ts +1762 -0
  30. src/__tests__/e2e_map/index.test.ts +117 -0
  31. src/__tests__/e2e_noAuth/index.test.ts +212 -0
  32. src/__tests__/e2e_v1_withAuth/index.test.ts +1066 -0
  33. src/__tests__/e2e_v1_withAuth_all_params/index.test.ts +711 -0
  34. src/__tests__/e2e_withAuth/index.test.ts +862 -0
  35. src/__tests__/queue-concurrency-integration.test.ts +269 -0
  36. src/__tests__/snips/batch-scrape.test.ts +51 -0
  37. src/__tests__/snips/billing.test.ts +197 -0
  38. src/__tests__/snips/crawl.test.ts +75 -0
  39. src/__tests__/snips/extract.test.ts +59 -0
  40. src/__tests__/snips/lib.ts +273 -0
  41. src/__tests__/snips/map.test.ts +34 -0
  42. src/__tests__/snips/mocks/map-query-params.json +0 -0
  43. src/__tests__/snips/mocks/mocking-works-properly.json +107 -0
  44. src/__tests__/snips/scrape.test.ts +330 -0
  45. src/__tests__/snips/search.test.ts +23 -0
  46. src/__tests__/snips/utils/collect-mocks.js +14 -0
  47. src/control.ts +2 -0
  48. src/controllers/__tests__/crawl.test.ts +51 -0
  49. src/controllers/auth.ts +519 -0
  50. src/controllers/v0/admin/acuc-cache-clear.ts +24 -0
.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /node_modules/
2
+ /dist/
3
+ .env
4
+ *.csv
.env ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ./apps/api/.env
2
+
3
+ # ===== Required ENVS ======
4
+ NUM_WORKERS_PER_QUEUE=8
5
+ PORT=3002
6
+ HOST=0.0.0.0
7
+
8
+ # For running locally, use redis://localhost:6379
9
+ REDIS_URL=redis://localhost:6379
10
+
11
+ # For running locally, use redis://localhost:6379
12
+ REDIS_RATE_LIMIT_URL=redis://localhost:6379
13
+ PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html # Note: The docs mention this value for Docker self-hosting, its role in a pure local setup isn't fully detailed here but listed as required.
14
+
15
+ ## Keep DB authentication off for the basic setup
16
+ USE_DB_AUTHENTICATION=false
17
+
18
+ # ===== Optional ENVS ======
19
+ # You can leave the rest blank or commented out for the initial setup
20
+ # SUPABASE_ANON_TOKEN=
21
+ # SUPABASE_URL=
22
+ # SUPABASE_SERVICE_TOKEN=
23
+ # ... other optional variables ...
24
+ LOGGING_LEVEL=INFO # Default logging level
.env.example ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===== Required ENVS ======
2
+ NUM_WORKERS_PER_QUEUE=8
3
+ PORT=3002
4
+ HOST=0.0.0.0
5
+ REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
6
+ REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
7
+ PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
8
+
9
+ ## To turn on DB authentication, you need to set up supabase.
10
+ USE_DB_AUTHENTICATION=true
11
+
12
+ # ===== Optional ENVS ======
13
+
14
+ # SearchApi key. Head to https://searchapi.com/ to get your API key
15
+ SEARCHAPI_API_KEY=
16
+ # SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
17
+ SEARCHAPI_ENGINE=
18
+
19
+ # Supabase Setup (used to support DB authentication, advanced logging, etc.)
20
+ SUPABASE_ANON_TOKEN=
21
+ SUPABASE_URL=
22
+ SUPABASE_SERVICE_TOKEN=
23
+
24
+ # Other Optionals
25
+ # use if you've set up authentication and want to test with a real API key
26
+ TEST_API_KEY=
27
+ # set if you'd like to test the scraping rate limit
28
+ RATE_LIMIT_TEST_API_KEY_SCRAPE=
29
+ # set if you'd like to test the crawling rate limit
30
+ RATE_LIMIT_TEST_API_KEY_CRAWL=
31
+ # set if you'd like to use scraping Be to handle JS blocking
32
+ SCRAPING_BEE_API_KEY=
33
+ # add for LLM dependent features (image alt generation, etc.)
34
+ OPENAI_API_KEY=
35
+ BULL_AUTH_KEY=@
36
+ # set if you have a llamaparse key you'd like to use to parse pdfs
37
+ LLAMAPARSE_API_KEY=
38
+ # set if you'd like to send slack server health status messages
39
+ SLACK_WEBHOOK_URL=
40
+ # set if you'd like to send posthog events like job logs
41
+ POSTHOG_API_KEY=
42
+ # set if you'd like to send posthog events like job logs
43
+ POSTHOG_HOST=
44
+
45
+ STRIPE_PRICE_ID_STANDARD=
46
+ STRIPE_PRICE_ID_SCALE=
47
+ STRIPE_PRICE_ID_STARTER=
48
+ STRIPE_PRICE_ID_HOBBY=
49
+ STRIPE_PRICE_ID_HOBBY_YEARLY=
50
+ STRIPE_PRICE_ID_STANDARD_NEW=
51
+ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
52
+ STRIPE_PRICE_ID_GROWTH=
53
+ STRIPE_PRICE_ID_GROWTH_YEARLY=
54
+
55
+ # set if you'd like to use the fire engine closed beta
56
+ FIRE_ENGINE_BETA_URL=
57
+
58
+ # Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
59
+ PROXY_SERVER=
60
+ PROXY_USERNAME=
61
+ PROXY_PASSWORD=
62
+ # set if you'd like to block media requests to save proxy bandwidth
63
+ BLOCK_MEDIA=
64
+
65
+ # Set this to the URL of your webhook when using the self-hosted version of FireCrawl
66
+ SELF_HOSTED_WEBHOOK_URL=
67
+
68
+ # Resend API Key for transactional emails
69
+ RESEND_API_KEY=
70
+
71
+ # LOGGING_LEVEL determines the verbosity of logs that the system will output.
72
+ # Available levels are:
73
+ # NONE - No logs will be output.
74
+ # ERROR - For logging error messages that indicate a failure in a specific operation.
75
+ # WARN - For logging potentially harmful situations that are not necessarily errors.
76
+ # INFO - For logging informational messages that highlight the progress of the application.
77
+ # DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
78
+ # TRACE - For logging more detailed information than the DEBUG level.
79
+ # Set LOGGING_LEVEL to one of the above options to control logging output.
80
+ LOGGING_LEVEL=INFO
.env.local ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NUM_WORKERS_PER_QUEUE=8
2
+ PORT=
3
+ HOST=
4
+ SUPABASE_ANON_TOKEN=
5
+ SUPABASE_URL=
6
+ SUPABASE_SERVICE_TOKEN=
7
+ REDIS_URL=
8
+ REDIS_RATE_LIMIT_URL=
9
+ SCRAPING_BEE_API_KEY=
10
+ OPENAI_API_KEY=
11
+ ANTHROPIC_API_KEY=
12
+ BULL_AUTH_KEY=
13
+ LOGTAIL_KEY=
14
+ PLAYWRIGHT_MICROSERVICE_URL=
15
+ SEARCHAPI_API_KEY=
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /node_modules/
2
+ /dist/
3
+ .env
4
+ *.csv
5
+ dump.rdb
6
+ /mongo-data
7
+
8
+ /.next/
9
+
10
+ .rdb
11
+ .sentryclirc
12
+
13
+ .env.*
.prettierrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "trailingComma": "all"
3
+ }
Dockerfile ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile (place this in the root of your firecrawl project)
2
+
3
+ # 1. Base Image: Use a Node.js LTS version that includes build tools
4
+ FROM node:18
5
+
6
+ # 2. Environment Variables
7
+ ENV PNPM_HOME="/pnpm"
8
+ ENV PATH="$PNPM_HOME:$PATH"
9
+ # Prevent apt-get from asking questions
10
+ ENV DEBIAN_FRONTEND=noninteractive
11
+ # Set Node environment (can be overridden by supervisor conf or HF secrets)
12
+ ENV NODE_ENV=production
13
+
14
+ # 3. Install System Dependencies: Redis, Supervisor, Git, and utilities
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ redis-server \
17
+ supervisor \
18
+ git \
19
+ curl \
20
+ wget \
21
+ gnupg \
22
+ # Clean up apt cache
23
+ && rm -rf /var/lib/apt/lists/*
24
+
25
+ # 4. Install pnpm v9+ globally
26
+ RUN npm install -g pnpm@9
27
+
28
+ # 5. Set Application Directory
29
+ WORKDIR /usr/src/app
30
+
31
+ # 6. Copy Package Definitions & Install Dependencies (Leverages Docker cache)
32
+ # Copy root files first
33
+ COPY package.json pnpm-lock.yaml ./
34
+ # Copy workspace config if it exists (use wildcard *)
35
+ COPY pnpm-workspace.yaml* ./
36
+ # Copy the specific package.json for the api app
37
+ COPY apps/api/package.json ./apps/api/
38
+
39
+ # Install ALL monorepo dependencies using the lockfile (includes devDeps needed for playwright)
40
+ RUN pnpm install --frozen-lockfile
41
+
42
+ # 7. Install Playwright Browsers & Dependencies
43
+ # This command downloads browsers (e.g., Chromium) AND tries to install needed OS libraries.
44
+ # Run this using the 'api' package context, assuming playwright is its dependency.
45
+ # Specify the browser(s) you need (e.g., chromium). Check Firecrawl needs.
46
+ RUN pnpm --filter api exec playwright install --with-deps chromium
47
+
48
+ # 8. Copy Application Code
49
+ # Copy the rest of your Firecrawl project code into the image
50
+ COPY . .
51
+
52
+ # 9. Copy Supervisor Configuration
53
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
54
+
55
+ # 10. Configure Environment for Internal Communication (inside the container)
56
+ # These should match the values expected by the app when running internally
57
+ ENV PORT=3002 \
58
+ HOST=0.0.0.0 \
59
+ REDIS_URL=redis://localhost:6379 \
60
+ REDIS_RATE_LIMIT_URL=redis://localhost:6379 \
61
+ USE_DB_AUTHENTICATION=false \
62
+ # Add any other required non-secret ENVs here
63
+ LOGGING_LEVEL=INFO
64
+
65
+ # --- Configure Hugging Face Space specific settings ---
66
+ # Hugging Face will map its public port (e.g., 7860) to this internal port
67
+ EXPOSE 3002
68
+ # Health check endpoint (if Firecrawl has one, e.g., /test or /health)
69
+ # HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
70
+ # CMD curl -f http://localhost:3002/test || exit 1
71
+ # (Uncomment and adjust HEALTHCHECK if you know the correct endpoint)
72
+
73
+ # 11. Start Supervisor
74
+ # This command starts supervisord, which in turn starts redis, the api, and the worker(s) based on supervisord.conf
75
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
docker-entrypoint.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash -e
2
+
3
+ if [ "$UID" -eq 0 ]; then
4
+ set +e # disable failing on error
5
+ ulimit -n 65535
6
+ echo "NEW ULIMIT: $(ulimit -n)"
7
+ set -e # enable failing on error
8
+ else
9
+ echo ENTRYPOINT DID NOT RUN AS ROOT
10
+ fi
11
+
12
+ if [ "$FLY_PROCESS_GROUP" = "app" ]; then
13
+ echo "RUNNING app"
14
+ node --max-old-space-size=8192 dist/src/index.js
15
+ elif [ "$FLY_PROCESS_GROUP" = "worker" ]; then
16
+ echo "RUNNING worker"
17
+ node --max-old-space-size=8192 dist/src/services/queue-worker.js
18
+ elif [ "$FLY_PROCESS_GROUP" = "index-worker" ]; then
19
+ echo "RUNNING index worker"
20
+ node --max-old-space-size=8192 dist/src/services/indexing/index-worker.js
21
+ else
22
+ echo "NO FLY PROCESS GROUP"
23
+ node --max-old-space-size=8192 dist/src/index.js
24
+ fi
jest.config.js ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ module.exports = {
2
+ preset: "ts-jest",
3
+ testEnvironment: "node",
4
+ setupFiles: ["./jest.setup.js"],
5
+ // ignore dist folder root dir
6
+ modulePathIgnorePatterns: ["<rootDir>/dist/"],
7
+
8
+ };
jest.setup.js ADDED
@@ -0,0 +1 @@
 
 
1
+ // global.fetch = require('jest-fetch-mock');
openapi-v0.json ADDED
@@ -0,0 +1,924 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "openapi": "3.0.0",
3
+ "info": {
4
+ "title": "Firecrawl API",
5
+ "version": "0.0.0",
6
+ "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
7
+ "contact": {
8
+ "name": "Firecrawl Support",
9
+ "url": "https://firecrawl.dev/support",
10
+ "email": "support@firecrawl.dev"
11
+ }
12
+ },
13
+ "servers": [
14
+ {
15
+ "url": "https://api.firecrawl.dev/v0"
16
+ }
17
+ ],
18
+ "paths": {
19
+ "/scrape": {
20
+ "post": {
21
+ "summary": "Scrape a single URL and optionally extract information using an LLM",
22
+ "operationId": "scrapeAndExtractFromUrl",
23
+ "tags": ["Scraping"],
24
+ "security": [
25
+ {
26
+ "bearerAuth": []
27
+ }
28
+ ],
29
+ "requestBody": {
30
+ "required": true,
31
+ "content": {
32
+ "application/json": {
33
+ "schema": {
34
+ "type": "object",
35
+ "properties": {
36
+ "url": {
37
+ "type": "string",
38
+ "format": "uri",
39
+ "description": "The URL to scrape"
40
+ },
41
+ "pageOptions": {
42
+ "type": "object",
43
+ "properties": {
44
+ "headers": {
45
+ "type": "object",
46
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
47
+ },
48
+ "includeHtml": {
49
+ "type": "boolean",
50
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
51
+ "default": false
52
+ },
53
+ "includeRawHtml": {
54
+ "type": "boolean",
55
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
56
+ "default": false
57
+ },
58
+ "onlyIncludeTags": {
59
+ "type": "array",
60
+ "items": {
61
+ "type": "string"
62
+ },
63
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
64
+ },
65
+ "onlyMainContent": {
66
+ "type": "boolean",
67
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
68
+ "default": false
69
+ },
70
+ "removeTags": {
71
+ "type": "array",
72
+ "items": {
73
+ "type": "string"
74
+ },
75
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
76
+ },
77
+ "replaceAllPathsWithAbsolutePaths": {
78
+ "type": "boolean",
79
+ "description": "Replace all relative paths with absolute paths for images and links",
80
+ "default": false
81
+ },
82
+ "screenshot": {
83
+ "type": "boolean",
84
+ "description": "Include a screenshot of the top of the page that you are scraping.",
85
+ "default": false
86
+ },
87
+ "fullPageScreenshot": {
88
+ "type": "boolean",
89
+ "description": "Include a full page screenshot of the page that you are scraping.",
90
+ "default": false
91
+ },
92
+ "waitFor": {
93
+ "type": "integer",
94
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
95
+ "default": 0
96
+ }
97
+ }
98
+ },
99
+ "extractorOptions": {
100
+ "type": "object",
101
+ "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
102
+ "default": {},
103
+ "properties": {
104
+ "mode": {
105
+ "type": "string",
106
+ "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
107
+ "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
108
+ },
109
+ "extractionPrompt": {
110
+ "type": "string",
111
+ "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
112
+ },
113
+ "extractionSchema": {
114
+ "type": "object",
115
+ "additionalProperties": true,
116
+ "description": "The schema for the data to be extracted, required only for LLM extraction modes.",
117
+ "required": [
118
+ "company_mission",
119
+ "supports_sso",
120
+ "is_open_source"
121
+ ]
122
+ }
123
+ }
124
+ },
125
+ "timeout": {
126
+ "type": "integer",
127
+ "description": "Timeout in milliseconds for the request",
128
+ "default": 30000
129
+ }
130
+ },
131
+ "required": ["url"]
132
+ }
133
+ }
134
+ }
135
+ },
136
+ "responses": {
137
+ "200": {
138
+ "description": "Successful response",
139
+ "content": {
140
+ "application/json": {
141
+ "schema": {
142
+ "$ref": "#/components/schemas/ScrapeResponse"
143
+ }
144
+ }
145
+ }
146
+ },
147
+ "402": {
148
+ "description": "Payment required",
149
+ "content": {
150
+ "application/json": {
151
+ "schema": {
152
+ "type": "object",
153
+ "properties": {
154
+ "error": {
155
+ "type": "string",
156
+ "example": "Payment required to access this resource."
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }
162
+ },
163
+ "429": {
164
+ "description": "Too many requests",
165
+ "content": {
166
+ "application/json": {
167
+ "schema": {
168
+ "type": "object",
169
+ "properties": {
170
+ "error": {
171
+ "type": "string",
172
+ "example": "Request rate limit exceeded. Please wait and try again later."
173
+ }
174
+ }
175
+ }
176
+ }
177
+ }
178
+ },
179
+ "500": {
180
+ "description": "Server error",
181
+ "content": {
182
+ "application/json": {
183
+ "schema": {
184
+ "type": "object",
185
+ "properties": {
186
+ "error": {
187
+ "type": "string",
188
+ "example": "An unexpected error occurred on the server."
189
+ }
190
+ }
191
+ }
192
+ }
193
+ }
194
+ }
195
+ }
196
+ }
197
+ },
198
+ "/crawl": {
199
+ "post": {
200
+ "summary": "Crawl multiple URLs based on options",
201
+ "operationId": "crawlUrls",
202
+ "tags": ["Crawling"],
203
+ "security": [
204
+ {
205
+ "bearerAuth": []
206
+ }
207
+ ],
208
+ "requestBody": {
209
+ "required": true,
210
+ "content": {
211
+ "application/json": {
212
+ "schema": {
213
+ "type": "object",
214
+ "properties": {
215
+ "url": {
216
+ "type": "string",
217
+ "format": "uri",
218
+ "description": "The base URL to start crawling from"
219
+ },
220
+ "crawlerOptions": {
221
+ "type": "object",
222
+ "properties": {
223
+ "includes": {
224
+ "type": "array",
225
+ "items": {
226
+ "type": "string"
227
+ },
228
+ "description": "URL patterns to include"
229
+ },
230
+ "excludes": {
231
+ "type": "array",
232
+ "items": {
233
+ "type": "string"
234
+ },
235
+ "description": "URL patterns to exclude"
236
+ },
237
+ "generateImgAltText": {
238
+ "type": "boolean",
239
+ "description": "Generate alt text for images using LLMs (must have a paid plan)",
240
+ "default": false
241
+ },
242
+ "returnOnlyUrls": {
243
+ "type": "boolean",
244
+ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
245
+ "default": false
246
+ },
247
+ "maxDepth": {
248
+ "type": "integer",
249
+ "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
250
+ },
251
+ "mode": {
252
+ "type": "string",
253
+ "enum": ["default", "fast"],
254
+ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
255
+ "default": "default"
256
+ },
257
+ "ignoreSitemap": {
258
+ "type": "boolean",
259
+ "description": "Ignore the website sitemap when crawling",
260
+ "default": false
261
+ },
262
+ "limit": {
263
+ "type": "integer",
264
+ "description": "Maximum number of pages to crawl",
265
+ "default": 10000
266
+ },
267
+ "allowBackwardCrawling": {
268
+ "type": "boolean",
269
+ "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
270
+ "default": false
271
+ },
272
+ "allowExternalContentLinks": {
273
+ "type": "boolean",
274
+ "description": "Allows the crawler to follow links to external websites.",
275
+ "default": false
276
+ }
277
+ }
278
+ },
279
+ "pageOptions": {
280
+ "type": "object",
281
+ "properties": {
282
+ "headers": {
283
+ "type": "object",
284
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
285
+ },
286
+ "includeHtml": {
287
+ "type": "boolean",
288
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
289
+ "default": false
290
+ },
291
+ "includeRawHtml": {
292
+ "type": "boolean",
293
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
294
+ "default": false
295
+ },
296
+ "onlyIncludeTags": {
297
+ "type": "array",
298
+ "items": {
299
+ "type": "string"
300
+ },
301
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
302
+ },
303
+ "onlyMainContent": {
304
+ "type": "boolean",
305
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
306
+ "default": false
307
+ },
308
+ "removeTags": {
309
+ "type": "array",
310
+ "items": {
311
+ "type": "string"
312
+ },
313
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
314
+ },
315
+ "replaceAllPathsWithAbsolutePaths": {
316
+ "type": "boolean",
317
+ "description": "Replace all relative paths with absolute paths for images and links",
318
+ "default": false
319
+ },
320
+ "screenshot": {
321
+ "type": "boolean",
322
+ "description": "Include a screenshot of the top of the page that you are scraping.",
323
+ "default": false
324
+ },
325
+ "fullPageScreenshot": {
326
+ "type": "boolean",
327
+ "description": "Include a full page screenshot of the page that you are scraping.",
328
+ "default": false
329
+ },
330
+ "waitFor": {
331
+ "type": "integer",
332
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
333
+ "default": 0
334
+ }
335
+ }
336
+ }
337
+ },
338
+ "required": ["url"]
339
+ }
340
+ }
341
+ }
342
+ },
343
+ "responses": {
344
+ "200": {
345
+ "description": "Successful response",
346
+ "content": {
347
+ "application/json": {
348
+ "schema": {
349
+ "$ref": "#/components/schemas/CrawlResponse"
350
+ }
351
+ }
352
+ }
353
+ },
354
+ "402": {
355
+ "description": "Payment required",
356
+ "content": {
357
+ "application/json": {
358
+ "schema": {
359
+ "type": "object",
360
+ "properties": {
361
+ "error": {
362
+ "type": "string",
363
+ "example": "Payment required to access this resource."
364
+ }
365
+ }
366
+ }
367
+ }
368
+ }
369
+ },
370
+ "429": {
371
+ "description": "Too many requests",
372
+ "content": {
373
+ "application/json": {
374
+ "schema": {
375
+ "type": "object",
376
+ "properties": {
377
+ "error": {
378
+ "type": "string",
379
+ "example": "Request rate limit exceeded. Please wait and try again later."
380
+ }
381
+ }
382
+ }
383
+ }
384
+ }
385
+ },
386
+ "500": {
387
+ "description": "Server error",
388
+ "content": {
389
+ "application/json": {
390
+ "schema": {
391
+ "type": "object",
392
+ "properties": {
393
+ "error": {
394
+ "type": "string",
395
+ "example": "An unexpected error occurred on the server."
396
+ }
397
+ }
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+ }
404
+ },
405
+ "/search": {
406
+ "post": {
407
+ "summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
408
+ "operationId": "searchGoogle",
409
+ "tags": ["Search"],
410
+ "security": [
411
+ {
412
+ "bearerAuth": []
413
+ }
414
+ ],
415
+ "requestBody": {
416
+ "required": true,
417
+ "content": {
418
+ "application/json": {
419
+ "schema": {
420
+ "type": "object",
421
+ "properties": {
422
+ "query": {
423
+ "type": "string",
424
+ "format": "uri",
425
+ "description": "The query to search for"
426
+ },
427
+ "pageOptions": {
428
+ "type": "object",
429
+ "properties": {
430
+ "onlyMainContent": {
431
+ "type": "boolean",
432
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
433
+ "default": false
434
+ },
435
+ "fetchPageContent": {
436
+ "type": "boolean",
437
+ "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
438
+ "default": true
439
+ },
440
+ "includeHtml": {
441
+ "type": "boolean",
442
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
443
+ "default": false
444
+ },
445
+ "includeRawHtml": {
446
+ "type": "boolean",
447
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
448
+ "default": false
449
+ }
450
+ }
451
+ },
452
+ "searchOptions": {
453
+ "type": "object",
454
+ "properties": {
455
+ "limit": {
456
+ "type": "integer",
457
+ "description": "Maximum number of results. Max is 20 during beta."
458
+ }
459
+ }
460
+ }
461
+ },
462
+ "required": ["query"]
463
+ }
464
+ }
465
+ }
466
+ },
467
+ "responses": {
468
+ "200": {
469
+ "description": "Successful response",
470
+ "content": {
471
+ "application/json": {
472
+ "schema": {
473
+ "$ref": "#/components/schemas/SearchResponse"
474
+ }
475
+ }
476
+ }
477
+ },
478
+ "402": {
479
+ "description": "Payment required",
480
+ "content": {
481
+ "application/json": {
482
+ "schema": {
483
+ "type": "object",
484
+ "properties": {
485
+ "error": {
486
+ "type": "string",
487
+ "example": "Payment required to access this resource."
488
+ }
489
+ }
490
+ }
491
+ }
492
+ }
493
+ },
494
+ "429": {
495
+ "description": "Too many requests",
496
+ "content": {
497
+ "application/json": {
498
+ "schema": {
499
+ "type": "object",
500
+ "properties": {
501
+ "error": {
502
+ "type": "string",
503
+ "example": "Request rate limit exceeded. Please wait and try again later."
504
+ }
505
+ }
506
+ }
507
+ }
508
+ }
509
+ },
510
+ "500": {
511
+ "description": "Server error",
512
+ "content": {
513
+ "application/json": {
514
+ "schema": {
515
+ "type": "object",
516
+ "properties": {
517
+ "error": {
518
+ "type": "string",
519
+ "example": "An unexpected error occurred on the server."
520
+ }
521
+ }
522
+ }
523
+ }
524
+ }
525
+ }
526
+ }
527
+ }
528
+ },
529
+ "/crawl/status/{jobId}": {
530
+ "get": {
531
+ "tags": ["Crawl"],
532
+ "summary": "Get the status of a crawl job",
533
+ "operationId": "getCrawlStatus",
534
+ "security": [
535
+ {
536
+ "bearerAuth": []
537
+ }
538
+ ],
539
+ "parameters": [
540
+ {
541
+ "name": "jobId",
542
+ "in": "path",
543
+ "description": "ID of the crawl job",
544
+ "required": true,
545
+ "schema": {
546
+ "type": "string"
547
+ }
548
+ }
549
+ ],
550
+ "responses": {
551
+ "200": {
552
+ "description": "Successful response",
553
+ "content": {
554
+ "application/json": {
555
+ "schema": {
556
+ "type": "object",
557
+ "properties": {
558
+ "status": {
559
+ "type": "string",
560
+ "description": "Status of the job (completed, active, failed, paused)"
561
+ },
562
+ "current": {
563
+ "type": "integer",
564
+ "description": "Current page number"
565
+ },
566
+ "total": {
567
+ "type": "integer",
568
+ "description": "Total number of pages"
569
+ },
570
+ "data": {
571
+ "type": "array",
572
+ "items": {
573
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
574
+ },
575
+ "description": "Data returned from the job (null when it is in progress)"
576
+ },
577
+ "partial_data": {
578
+ "type": "array",
579
+ "items": {
580
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
581
+ },
582
+ "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
583
+ }
584
+ }
585
+ }
586
+ }
587
+ }
588
+ },
589
+ "402": {
590
+ "description": "Payment required",
591
+ "content": {
592
+ "application/json": {
593
+ "schema": {
594
+ "type": "object",
595
+ "properties": {
596
+ "error": {
597
+ "type": "string",
598
+ "example": "Payment required to access this resource."
599
+ }
600
+ }
601
+ }
602
+ }
603
+ }
604
+ },
605
+ "429": {
606
+ "description": "Too many requests",
607
+ "content": {
608
+ "application/json": {
609
+ "schema": {
610
+ "type": "object",
611
+ "properties": {
612
+ "error": {
613
+ "type": "string",
614
+ "example": "Request rate limit exceeded. Please wait and try again later."
615
+ }
616
+ }
617
+ }
618
+ }
619
+ }
620
+ },
621
+ "500": {
622
+ "description": "Server error",
623
+ "content": {
624
+ "application/json": {
625
+ "schema": {
626
+ "type": "object",
627
+ "properties": {
628
+ "error": {
629
+ "type": "string",
630
+ "example": "An unexpected error occurred on the server."
631
+ }
632
+ }
633
+ }
634
+ }
635
+ }
636
+ }
637
+ }
638
+ }
639
+ },
640
+ "/crawl/cancel/{jobId}": {
641
+ "delete": {
642
+ "tags": ["Crawl"],
643
+ "summary": "Cancel a crawl job",
644
+ "operationId": "cancelCrawlJob",
645
+ "security": [
646
+ {
647
+ "bearerAuth": []
648
+ }
649
+ ],
650
+ "parameters": [
651
+ {
652
+ "name": "jobId",
653
+ "in": "path",
654
+ "description": "ID of the crawl job",
655
+ "required": true,
656
+ "schema": {
657
+ "type": "string"
658
+ }
659
+ }
660
+ ],
661
+ "responses": {
662
+ "200": {
663
+ "description": "Successful response",
664
+ "content": {
665
+ "application/json": {
666
+ "schema": {
667
+ "type": "object",
668
+ "properties": {
669
+ "status": {
670
+ "type": "string",
671
+ "description": "Returns cancelled."
672
+ }
673
+ }
674
+ }
675
+ }
676
+ }
677
+ },
678
+ "402": {
679
+ "description": "Payment required",
680
+ "content": {
681
+ "application/json": {
682
+ "schema": {
683
+ "type": "object",
684
+ "properties": {
685
+ "error": {
686
+ "type": "string",
687
+ "example": "Payment required to access this resource."
688
+ }
689
+ }
690
+ }
691
+ }
692
+ }
693
+ },
694
+ "429": {
695
+ "description": "Too many requests",
696
+ "content": {
697
+ "application/json": {
698
+ "schema": {
699
+ "type": "object",
700
+ "properties": {
701
+ "error": {
702
+ "type": "string",
703
+ "example": "Request rate limit exceeded. Please wait and try again later."
704
+ }
705
+ }
706
+ }
707
+ }
708
+ }
709
+ },
710
+ "500": {
711
+ "description": "Server error",
712
+ "content": {
713
+ "application/json": {
714
+ "schema": {
715
+ "type": "object",
716
+ "properties": {
717
+ "error": {
718
+ "type": "string",
719
+ "example": "An unexpected error occurred on the server."
720
+ }
721
+ }
722
+ }
723
+ }
724
+ }
725
+ }
726
+ }
727
+ }
728
+ }
729
+ },
730
+ "components": {
731
+ "securitySchemes": {
732
+ "bearerAuth": {
733
+ "type": "http",
734
+ "scheme": "bearer"
735
+ }
736
+ },
737
+ "schemas": {
738
+ "ScrapeResponse": {
739
+ "type": "object",
740
+ "properties": {
741
+ "success": {
742
+ "type": "boolean"
743
+ },
744
+ "data": {
745
+ "type": "object",
746
+ "properties": {
747
+ "markdown": {
748
+ "type": "string"
749
+ },
750
+ "content": {
751
+ "type": "string"
752
+ },
753
+ "html": {
754
+ "type": "string",
755
+ "nullable": true,
756
+ "description": "HTML version of the content on page if `includeHtml` is true"
757
+ },
758
+ "rawHtml": {
759
+ "type": "string",
760
+ "nullable": true,
761
+ "description": "Raw HTML content of the page if `includeRawHtml` is true"
762
+ },
763
+ "metadata": {
764
+ "type": "object",
765
+ "properties": {
766
+ "title": {
767
+ "type": "string"
768
+ },
769
+ "description": {
770
+ "type": "string"
771
+ },
772
+ "language": {
773
+ "type": "string",
774
+ "nullable": true
775
+ },
776
+ "sourceURL": {
777
+ "type": "string",
778
+ "format": "uri"
779
+ },
780
+ "<any other metadata> ": {
781
+ "type": "string"
782
+ },
783
+ "pageStatusCode": {
784
+ "type": "integer",
785
+ "description": "The status code of the page"
786
+ },
787
+ "pageError": {
788
+ "type": "string",
789
+ "nullable": true,
790
+ "description": "The error message of the page"
791
+ }
792
+
793
+ }
794
+ },
795
+ "llm_extraction": {
796
+ "type": "object",
797
+ "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
798
+ "nullable": true
799
+ },
800
+ "warning": {
801
+ "type": "string",
802
+ "nullable": true,
803
+ "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
804
+ }
805
+ }
806
+ }
807
+ }
808
+ },
809
+ "CrawlStatusResponseObj": {
810
+ "type": "object",
811
+ "properties": {
812
+ "markdown": {
813
+ "type": "string"
814
+ },
815
+ "content": {
816
+ "type": "string"
817
+ },
818
+ "html": {
819
+ "type": "string",
820
+ "nullable": true,
821
+ "description": "HTML version of the content on page if `includeHtml` is true"
822
+ },
823
+ "rawHtml": {
824
+ "type": "string",
825
+ "nullable": true,
826
+ "description": "Raw HTML content of the page if `includeRawHtml` is true"
827
+ },
828
+ "index": {
829
+ "type": "integer",
830
+ "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
831
+ },
832
+ "metadata": {
833
+ "type": "object",
834
+ "properties": {
835
+ "title": {
836
+ "type": "string"
837
+ },
838
+ "description": {
839
+ "type": "string"
840
+ },
841
+ "language": {
842
+ "type": "string",
843
+ "nullable": true
844
+ },
845
+ "sourceURL": {
846
+ "type": "string",
847
+ "format": "uri"
848
+ },
849
+ "<any other metadata> ": {
850
+ "type": "string"
851
+ },
852
+ "pageStatusCode": {
853
+ "type": "integer",
854
+ "description": "The status code of the page"
855
+ },
856
+ "pageError": {
857
+ "type": "string",
858
+ "nullable": true,
859
+ "description": "The error message of the page"
860
+ }
861
+ }
862
+ }
863
+ }
864
+ },
865
+ "SearchResponse": {
866
+ "type": "object",
867
+ "properties": {
868
+ "success": {
869
+ "type": "boolean"
870
+ },
871
+ "data": {
872
+ "type": "array",
873
+ "items": {
874
+ "type": "object",
875
+ "properties": {
876
+ "url": {
877
+ "type": "string"
878
+ },
879
+ "markdown": {
880
+ "type": "string"
881
+ },
882
+ "content": {
883
+ "type": "string"
884
+ },
885
+ "metadata": {
886
+ "type": "object",
887
+ "properties": {
888
+ "title": {
889
+ "type": "string"
890
+ },
891
+ "description": {
892
+ "type": "string"
893
+ },
894
+ "language": {
895
+ "type": "string",
896
+ "nullable": true
897
+ },
898
+ "sourceURL": {
899
+ "type": "string",
900
+ "format": "uri"
901
+ }
902
+ }
903
+ }
904
+ }
905
+ }
906
+ }
907
+ }
908
+ },
909
+ "CrawlResponse": {
910
+ "type": "object",
911
+ "properties": {
912
+ "jobId": {
913
+ "type": "string"
914
+ }
915
+ }
916
+ }
917
+ }
918
+ },
919
+ "security": [
920
+ {
921
+ "bearerAuth": []
922
+ }
923
+ ]
924
+ }
openapi.json ADDED
@@ -0,0 +1,929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "openapi": "3.0.0",
3
+ "info": {
4
+ "title": "Firecrawl API",
5
+ "version": "1.0.0",
6
+ "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
7
+ "contact": {
8
+ "name": "Firecrawl Support",
9
+ "url": "https://firecrawl.dev/support",
10
+ "email": "support@firecrawl.dev"
11
+ }
12
+ },
13
+ "servers": [
14
+ {
15
+ "url": "https://api.firecrawl.dev/v0"
16
+ }
17
+ ],
18
+ "paths": {
19
+ "/scrape": {
20
+ "post": {
21
+ "summary": "Scrape a single URL",
22
+ "operationId": "scrape",
23
+ "tags": ["Scraping"],
24
+ "security": [
25
+ {
26
+ "bearerAuth": []
27
+ }
28
+ ],
29
+ "requestBody": {
30
+ "required": true,
31
+ "content": {
32
+ "application/json": {
33
+ "schema": {
34
+ "type": "object",
35
+ "properties": {
36
+ "url": {
37
+ "type": "string",
38
+ "format": "uri",
39
+ "description": "The URL to scrape"
40
+ },
41
+ "formats": {
42
+ "type": "array",
43
+ "items": {
44
+ "type": "string",
45
+ "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
46
+ },
47
+ "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
48
+ "default": ["markdown"]
49
+ },
50
+ "headers": {
51
+ "type": "object",
52
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
53
+ },
54
+ "includeTags": {
55
+ "type": "array",
56
+ "items": {
57
+ "type": "string"
58
+ },
59
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
60
+ },
61
+ "excludeTags": {
62
+ "type": "array",
63
+ "items": {
64
+ "type": "string"
65
+ },
66
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
67
+ },
68
+ "onlyMainContent": {
69
+ "type": "boolean",
70
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
71
+ "default": true
72
+ },
73
+ "timeout": {
74
+ "type": "integer",
75
+ "description": "Timeout in milliseconds for the request",
76
+ "default": 30000
77
+ },
78
+ "waitFor": {
79
+ "type": "integer",
80
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
81
+ "default": 0
82
+ }
83
+ },
84
+ "required": ["url"]
85
+ }
86
+ }
87
+ }
88
+ },
89
+ "responses": {
90
+ "200": {
91
+ "description": "Successful response",
92
+ "content": {
93
+ "application/json": {
94
+ "schema": {
95
+ "$ref": "#/components/schemas/ScrapeResponse"
96
+ }
97
+ }
98
+ }
99
+ },
100
+ "402": {
101
+ "description": "Payment required",
102
+ "content": {
103
+ "application/json": {
104
+ "schema": {
105
+ "type": "object",
106
+ "properties": {
107
+ "error": {
108
+ "type": "string",
109
+ "example": "Payment required to access this resource."
110
+ }
111
+ }
112
+ }
113
+ }
114
+ }
115
+ },
116
+ "429": {
117
+ "description": "Too many requests",
118
+ "content": {
119
+ "application/json": {
120
+ "schema": {
121
+ "type": "object",
122
+ "properties": {
123
+ "error": {
124
+ "type": "string",
125
+ "example": "Request rate limit exceeded. Please wait and try again later."
126
+ }
127
+ }
128
+ }
129
+ }
130
+ }
131
+ },
132
+ "500": {
133
+ "description": "Server error",
134
+ "content": {
135
+ "application/json": {
136
+ "schema": {
137
+ "type": "object",
138
+ "properties": {
139
+ "error": {
140
+ "type": "string",
141
+ "example": "An unexpected error occurred on the server."
142
+ }
143
+ }
144
+ }
145
+ }
146
+ }
147
+ }
148
+ }
149
+ }
150
+ },
151
+ "/crawl": {
152
+ "post": {
153
+ "summary": "Crawl multiple URLs based on options",
154
+ "operationId": "crawlUrls",
155
+ "tags": ["Crawling"],
156
+ "security": [
157
+ {
158
+ "bearerAuth": []
159
+ }
160
+ ],
161
+ "requestBody": {
162
+ "required": true,
163
+ "content": {
164
+ "application/json": {
165
+ "schema": {
166
+ "type": "object",
167
+ "properties": {
168
+ "url": {
169
+ "type": "string",
170
+ "format": "uri",
171
+ "description": "The base URL to start crawling from"
172
+ },
173
+ "crawlerOptions": {
174
+ "type": "object",
175
+ "properties": {
176
+ "includes": {
177
+ "type": "array",
178
+ "items": {
179
+ "type": "string"
180
+ },
181
+ "description": "URL patterns to include"
182
+ },
183
+ "excludes": {
184
+ "type": "array",
185
+ "items": {
186
+ "type": "string"
187
+ },
188
+ "description": "URL patterns to exclude"
189
+ },
190
+ "generateImgAltText": {
191
+ "type": "boolean",
192
+ "description": "Generate alt text for images using LLMs (must have a paid plan)",
193
+ "default": false
194
+ },
195
+ "returnOnlyUrls": {
196
+ "type": "boolean",
197
+ "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
198
+ "default": false
199
+ },
200
+ "maxDepth": {
201
+ "type": "integer",
202
+ "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
203
+ },
204
+ "mode": {
205
+ "type": "string",
206
+ "enum": ["default", "fast"],
207
+ "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
208
+ "default": "default"
209
+ },
210
+ "ignoreSitemap": {
211
+ "type": "boolean",
212
+ "description": "Ignore the website sitemap when crawling",
213
+ "default": false
214
+ },
215
+ "limit": {
216
+ "type": "integer",
217
+ "description": "Maximum number of pages to crawl",
218
+ "default": 10000
219
+ },
220
+ "allowBackwardCrawling": {
221
+ "type": "boolean",
222
+ "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
223
+ "default": false
224
+ },
225
+ "allowExternalContentLinks": {
226
+ "type": "boolean",
227
+ "description": "Allows the crawler to follow links to external websites.",
228
+ "default": false
229
+ }
230
+ }
231
+ },
232
+ "pageOptions": {
233
+ "type": "object",
234
+ "properties": {
235
+ "headers": {
236
+ "type": "object",
237
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
238
+ },
239
+ "includeHtml": {
240
+ "type": "boolean",
241
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
242
+ "default": false
243
+ },
244
+ "includeRawHtml": {
245
+ "type": "boolean",
246
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
247
+ "default": false
248
+ },
249
+ "onlyIncludeTags": {
250
+ "type": "array",
251
+ "items": {
252
+ "type": "string"
253
+ },
254
+ "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
255
+ },
256
+ "onlyMainContent": {
257
+ "type": "boolean",
258
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
259
+ "default": false
260
+ },
261
+ "removeTags": {
262
+ "type": "array",
263
+ "items": {
264
+ "type": "string"
265
+ },
266
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
267
+ },
268
+ "replaceAllPathsWithAbsolutePaths": {
269
+ "type": "boolean",
270
+ "description": "Replace all relative paths with absolute paths for images and links",
271
+ "default": false
272
+ },
273
+ "screenshot": {
274
+ "type": "boolean",
275
+ "description": "Include a screenshot of the top of the page that you are scraping.",
276
+ "default": false
277
+ },
278
+ "fullPageScreenshot": {
279
+ "type": "boolean",
280
+ "description": "Include a full page screenshot of the page that you are scraping.",
281
+ "default": false
282
+ },
283
+ "waitFor": {
284
+ "type": "integer",
285
+ "description": "Wait x amount of milliseconds for the page to load to fetch content",
286
+ "default": 0
287
+ }
288
+ }
289
+ }
290
+ },
291
+ "required": ["url"]
292
+ }
293
+ }
294
+ }
295
+ },
296
+ "responses": {
297
+ "200": {
298
+ "description": "Successful response",
299
+ "content": {
300
+ "application/json": {
301
+ "schema": {
302
+ "$ref": "#/components/schemas/CrawlResponse"
303
+ }
304
+ }
305
+ }
306
+ },
307
+ "402": {
308
+ "description": "Payment required",
309
+ "content": {
310
+ "application/json": {
311
+ "schema": {
312
+ "type": "object",
313
+ "properties": {
314
+ "error": {
315
+ "type": "string",
316
+ "example": "Payment required to access this resource."
317
+ }
318
+ }
319
+ }
320
+ }
321
+ }
322
+ },
323
+ "429": {
324
+ "description": "Too many requests",
325
+ "content": {
326
+ "application/json": {
327
+ "schema": {
328
+ "type": "object",
329
+ "properties": {
330
+ "error": {
331
+ "type": "string",
332
+ "example": "Request rate limit exceeded. Please wait and try again later."
333
+ }
334
+ }
335
+ }
336
+ }
337
+ }
338
+ },
339
+ "500": {
340
+ "description": "Server error",
341
+ "content": {
342
+ "application/json": {
343
+ "schema": {
344
+ "type": "object",
345
+ "properties": {
346
+ "error": {
347
+ "type": "string",
348
+ "example": "An unexpected error occurred on the server."
349
+ }
350
+ }
351
+ }
352
+ }
353
+ }
354
+ }
355
+ }
356
+ }
357
+ },
358
+ "/search": {
359
+ "post": {
360
+ "summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
361
+ "operationId": "searchGoogle",
362
+ "tags": ["Search"],
363
+ "security": [
364
+ {
365
+ "bearerAuth": []
366
+ }
367
+ ],
368
+ "requestBody": {
369
+ "required": true,
370
+ "content": {
371
+ "application/json": {
372
+ "schema": {
373
+ "type": "object",
374
+ "properties": {
375
+ "query": {
376
+ "type": "string",
377
+ "format": "uri",
378
+ "description": "The query to search for"
379
+ },
380
+ "pageOptions": {
381
+ "type": "object",
382
+ "properties": {
383
+ "onlyMainContent": {
384
+ "type": "boolean",
385
+ "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
386
+ "default": false
387
+ },
388
+ "fetchPageContent": {
389
+ "type": "boolean",
390
+ "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
391
+ "default": true
392
+ },
393
+ "includeHtml": {
394
+ "type": "boolean",
395
+ "description": "Include the HTML version of the content on page. Will output a html key in the response.",
396
+ "default": false
397
+ },
398
+ "includeRawHtml": {
399
+ "type": "boolean",
400
+ "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
401
+ "default": false
402
+ }
403
+ }
404
+ },
405
+ "searchOptions": {
406
+ "type": "object",
407
+ "properties": {
408
+ "limit": {
409
+ "type": "integer",
410
+ "description": "Maximum number of results. Max is 20 during beta."
411
+ }
412
+ }
413
+ }
414
+ },
415
+ "required": ["query"]
416
+ }
417
+ }
418
+ }
419
+ },
420
+ "responses": {
421
+ "200": {
422
+ "description": "Successful response",
423
+ "content": {
424
+ "application/json": {
425
+ "schema": {
426
+ "$ref": "#/components/schemas/SearchResponse"
427
+ }
428
+ }
429
+ }
430
+ },
431
+ "402": {
432
+ "description": "Payment required",
433
+ "content": {
434
+ "application/json": {
435
+ "schema": {
436
+ "type": "object",
437
+ "properties": {
438
+ "error": {
439
+ "type": "string",
440
+ "example": "Payment required to access this resource."
441
+ }
442
+ }
443
+ }
444
+ }
445
+ }
446
+ },
447
+ "429": {
448
+ "description": "Too many requests",
449
+ "content": {
450
+ "application/json": {
451
+ "schema": {
452
+ "type": "object",
453
+ "properties": {
454
+ "error": {
455
+ "type": "string",
456
+ "example": "Request rate limit exceeded. Please wait and try again later."
457
+ }
458
+ }
459
+ }
460
+ }
461
+ }
462
+ },
463
+ "500": {
464
+ "description": "Server error",
465
+ "content": {
466
+ "application/json": {
467
+ "schema": {
468
+ "type": "object",
469
+ "properties": {
470
+ "error": {
471
+ "type": "string",
472
+ "example": "An unexpected error occurred on the server."
473
+ }
474
+ }
475
+ }
476
+ }
477
+ }
478
+ }
479
+ }
480
+ }
481
+ },
482
+ "/crawl/status/{jobId}": {
483
+ "get": {
484
+ "tags": ["Crawl"],
485
+ "summary": "Get the status of a crawl job",
486
+ "operationId": "getCrawlStatus",
487
+ "security": [
488
+ {
489
+ "bearerAuth": []
490
+ }
491
+ ],
492
+ "parameters": [
493
+ {
494
+ "name": "jobId",
495
+ "in": "path",
496
+ "description": "ID of the crawl job",
497
+ "required": true,
498
+ "schema": {
499
+ "type": "string"
500
+ }
501
+ }
502
+ ],
503
+ "responses": {
504
+ "200": {
505
+ "description": "Successful response",
506
+ "content": {
507
+ "application/json": {
508
+ "schema": {
509
+ "type": "object",
510
+ "properties": {
511
+ "status": {
512
+ "type": "string",
513
+ "description": "Status of the job (completed, active, failed, paused)"
514
+ },
515
+ "current": {
516
+ "type": "integer",
517
+ "description": "Current page number"
518
+ },
519
+ "total": {
520
+ "type": "integer",
521
+ "description": "Total number of pages"
522
+ },
523
+ "data": {
524
+ "type": "array",
525
+ "items": {
526
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
527
+ },
528
+ "description": "Data returned from the job (null when it is in progress)"
529
+ },
530
+ "partial_data": {
531
+ "type": "array",
532
+ "items": {
533
+ "$ref": "#/components/schemas/CrawlStatusResponseObj"
534
+ },
535
+ "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
536
+ }
537
+ }
538
+ }
539
+ }
540
+ }
541
+ },
542
+ "402": {
543
+ "description": "Payment required",
544
+ "content": {
545
+ "application/json": {
546
+ "schema": {
547
+ "type": "object",
548
+ "properties": {
549
+ "error": {
550
+ "type": "string",
551
+ "example": "Payment required to access this resource."
552
+ }
553
+ }
554
+ }
555
+ }
556
+ }
557
+ },
558
+ "429": {
559
+ "description": "Too many requests",
560
+ "content": {
561
+ "application/json": {
562
+ "schema": {
563
+ "type": "object",
564
+ "properties": {
565
+ "error": {
566
+ "type": "string",
567
+ "example": "Request rate limit exceeded. Please wait and try again later."
568
+ }
569
+ }
570
+ }
571
+ }
572
+ }
573
+ },
574
+ "500": {
575
+ "description": "Server error",
576
+ "content": {
577
+ "application/json": {
578
+ "schema": {
579
+ "type": "object",
580
+ "properties": {
581
+ "error": {
582
+ "type": "string",
583
+ "example": "An unexpected error occurred on the server."
584
+ }
585
+ }
586
+ }
587
+ }
588
+ }
589
+ }
590
+ }
591
+ }
592
+ },
593
+ "/crawl/cancel/{jobId}": {
594
+ "delete": {
595
+ "tags": ["Crawl"],
596
+ "summary": "Cancel a crawl job",
597
+ "operationId": "cancelCrawlJob",
598
+ "security": [
599
+ {
600
+ "bearerAuth": []
601
+ }
602
+ ],
603
+ "parameters": [
604
+ {
605
+ "name": "jobId",
606
+ "in": "path",
607
+ "description": "ID of the crawl job",
608
+ "required": true,
609
+ "schema": {
610
+ "type": "string"
611
+ }
612
+ }
613
+ ],
614
+ "responses": {
615
+ "200": {
616
+ "description": "Successful response",
617
+ "content": {
618
+ "application/json": {
619
+ "schema": {
620
+ "type": "object",
621
+ "properties": {
622
+ "status": {
623
+ "type": "string",
624
+ "description": "Returns cancelled."
625
+ }
626
+ }
627
+ }
628
+ }
629
+ }
630
+ },
631
+ "402": {
632
+ "description": "Payment required",
633
+ "content": {
634
+ "application/json": {
635
+ "schema": {
636
+ "type": "object",
637
+ "properties": {
638
+ "error": {
639
+ "type": "string",
640
+ "example": "Payment required to access this resource."
641
+ }
642
+ }
643
+ }
644
+ }
645
+ }
646
+ },
647
+ "429": {
648
+ "description": "Too many requests",
649
+ "content": {
650
+ "application/json": {
651
+ "schema": {
652
+ "type": "object",
653
+ "properties": {
654
+ "error": {
655
+ "type": "string",
656
+ "example": "Request rate limit exceeded. Please wait and try again later."
657
+ }
658
+ }
659
+ }
660
+ }
661
+ }
662
+ },
663
+ "500": {
664
+ "description": "Server error",
665
+ "content": {
666
+ "application/json": {
667
+ "schema": {
668
+ "type": "object",
669
+ "properties": {
670
+ "error": {
671
+ "type": "string",
672
+ "example": "An unexpected error occurred on the server."
673
+ }
674
+ }
675
+ }
676
+ }
677
+ }
678
+ }
679
+ }
680
+ }
681
+ }
682
+ },
683
+ "components": {
684
+ "securitySchemes": {
685
+ "bearerAuth": {
686
+ "type": "http",
687
+ "scheme": "bearer"
688
+ }
689
+ },
690
+ "schemas": {
691
+ "ScrapeResponse": {
692
+ "type": "object",
693
+ "properties": {
694
+ "success": {
695
+ "type": "boolean"
696
+ },
697
+ "warning": {
698
+ "type": "string",
699
+ "nullable": true,
700
+ "description": "Warning message to let you know of any issues."
701
+ },
702
+ "data": {
703
+ "type": "object",
704
+ "properties": {
705
+ "markdown": {
706
+ "type": "string",
707
+ "nullable": true,
708
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
709
+ },
710
+ "html": {
711
+ "type": "string",
712
+ "nullable": true,
713
+ "description": "HTML version of the content on page if the `html` format was specified"
714
+ },
715
+ "rawHtml": {
716
+ "type": "string",
717
+ "nullable": true,
718
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
719
+ },
720
+ "links": {
721
+ "type": "array",
722
+ "items": {
723
+ "type": "string",
724
+ "format": "uri"
725
+ },
726
+ "nullable": true,
727
+ "description": "Links on the page if the `links` format was specified"
728
+ },
729
+ "screenshot": {
730
+ "type": "string",
731
+ "nullable": true,
732
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
733
+ },
734
+ "metadata": {
735
+ "type": "object",
736
+ "properties": {
737
+ "title": {
738
+ "type": "string"
739
+ },
740
+ "description": {
741
+ "type": "string"
742
+ },
743
+ "language": {
744
+ "type": "string",
745
+ "nullable": true
746
+ },
747
+ "sourceURL": {
748
+ "type": "string",
749
+ "format": "uri"
750
+ },
751
+ "<any other metadata> ": {
752
+ "type": "string"
753
+ },
754
+ "statusCode": {
755
+ "type": "integer",
756
+ "description": "The status code of the page"
757
+ },
758
+ "error": {
759
+ "type": "string",
760
+ "nullable": true,
761
+ "description": "The error message of the page"
762
+ }
763
+ }
764
+ }
765
+ }
766
+ }
767
+ }
768
+ },
769
+ "CrawlStatusResponseObj": {
770
+ "type": "object",
771
+ "properties": {
772
+ "markdown": {
773
+ "type": "string",
774
+ "nullable": true,
775
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
776
+ },
777
+ "html": {
778
+ "type": "string",
779
+ "nullable": true,
780
+ "description": "HTML version of the content on page if the `html` format was specified"
781
+ },
782
+ "rawHtml": {
783
+ "type": "string",
784
+ "nullable": true,
785
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
786
+ },
787
+ "links": {
788
+ "type": "array",
789
+ "items": {
790
+ "type": "string",
791
+ "format": "uri"
792
+ },
793
+ "nullable": true,
794
+ "description": "Links on the page if the `links` format was specified"
795
+ },
796
+ "screenshot": {
797
+ "type": "string",
798
+ "nullable": true,
799
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
800
+ },
801
+ "metadata": {
802
+ "type": "object",
803
+ "properties": {
804
+ "title": {
805
+ "type": "string"
806
+ },
807
+ "description": {
808
+ "type": "string"
809
+ },
810
+ "language": {
811
+ "type": "string",
812
+ "nullable": true
813
+ },
814
+ "sourceURL": {
815
+ "type": "string",
816
+ "format": "uri"
817
+ },
818
+ "<any other metadata> ": {
819
+ "type": "string"
820
+ },
821
+ "statusCode": {
822
+ "type": "integer",
823
+ "description": "The status code of the page"
824
+ },
825
+ "error": {
826
+ "type": "string",
827
+ "nullable": true,
828
+ "description": "The error message of the page"
829
+ }
830
+ }
831
+ }
832
+ }
833
+ },
834
+ "SearchResponse": {
835
+ "type": "object",
836
+ "properties": {
837
+ "success": {
838
+ "type": "boolean"
839
+ },
840
+ "data": {
841
+ "type": "array",
842
+ "items": {
843
+ "markdown": {
844
+ "type": "string",
845
+ "nullable": true,
846
+ "description": "Markdown content of the page if the `markdown` format was specified (default)"
847
+ },
848
+ "html": {
849
+ "type": "string",
850
+ "nullable": true,
851
+ "description": "HTML version of the content on page if the `html` format was specified"
852
+ },
853
+ "rawHtml": {
854
+ "type": "string",
855
+ "nullable": true,
856
+ "description": "Raw HTML content of the page if the `rawHtml` format was specified"
857
+ },
858
+ "links": {
859
+ "type": "array",
860
+ "items": {
861
+ "type": "string",
862
+ "format": "uri"
863
+ },
864
+ "nullable": true,
865
+ "description": "Links on the page if the `links` format was specified"
866
+ },
867
+ "screenshot": {
868
+ "type": "string",
869
+ "nullable": true,
870
+ "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
871
+ },
872
+ "metadata": {
873
+ "type": "object",
874
+ "properties": {
875
+ "title": {
876
+ "type": "string"
877
+ },
878
+ "description": {
879
+ "type": "string"
880
+ },
881
+ "language": {
882
+ "type": "string",
883
+ "nullable": true
884
+ },
885
+ "sourceURL": {
886
+ "type": "string",
887
+ "format": "uri"
888
+ },
889
+ "<any other metadata> ": {
890
+ "type": "string"
891
+ },
892
+ "statusCode": {
893
+ "type": "integer",
894
+ "description": "The status code of the page"
895
+ },
896
+ "error": {
897
+ "type": "string",
898
+ "nullable": true,
899
+ "description": "The error message of the page"
900
+ }
901
+ }
902
+ }
903
+ }
904
+ }
905
+ }
906
+ },
907
+ "CrawlResponse": {
908
+ "type": "object",
909
+ "properties": {
910
+ "success": {
911
+ "type": "boolean"
912
+ },
913
+ "id": {
914
+ "type": "string"
915
+ },
916
+ "url": {
917
+ "type": "string",
918
+ "format": "uri"
919
+ }
920
+ }
921
+ }
922
+ }
923
+ },
924
+ "security": [
925
+ {
926
+ "bearerAuth": []
927
+ }
928
+ ]
929
+ }
package.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "firecrawl-scraper-js",
3
+ "version": "1.0.0",
4
+ "description": "",
5
+ "main": "index.js",
6
+ "scripts": {
7
+ "start": "nodemon --exec ts-node src/index.ts",
8
+ "start:production": "tsc && node dist/src/index.js",
9
+ "format": "prettier --write \"src/**/*.(js|ts)\"",
10
+ "flyio": "node dist/src/index.js",
11
+ "start:dev": "nodemon --exec ts-node src/index.ts",
12
+ "build": "tsc",
13
+ "build:nosentry": "tsc",
14
+ "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
15
+ "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
16
+ "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
17
+ "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
18
+ "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
19
+ "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
20
+ "worker:production": "node dist/src/services/queue-worker.js",
21
+ "index-worker": "nodemon --exec ts-node src/services/indexing/index-worker.ts",
22
+ "index-worker:production": "node dist/src/services/indexing/index-worker.js",
23
+ "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
24
+ "mongo-docker-console": "docker exec -it mongodb mongosh",
25
+ "run-example": "npx ts-node src/example.ts",
26
+ "deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN) --depot=false",
27
+ "deploy:fly:staging": "fly deploy -c fly.staging.toml --depot=false",
28
+ "sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
29
+ },
30
+ "author": "",
31
+ "license": "ISC",
32
+ "devDependencies": {
33
+ "@jest/globals": "^29.7.0",
34
+ "@tsconfig/recommended": "^1.0.3",
35
+ "@types/body-parser": "^1.19.2",
36
+ "@types/cors": "^2.8.13",
37
+ "@types/escape-html": "^1.0.4",
38
+ "@types/express": "^4.17.21",
39
+ "@types/express-ws": "^3.0.5",
40
+ "@types/jest": "^29.5.12",
41
+ "@types/lodash": "^4.17.14",
42
+ "@types/node": "^20.14.1",
43
+ "@types/pdf-parse": "^1.1.4",
44
+ "@types/supertest": "^6.0.2",
45
+ "jest": "^29.6.3",
46
+ "jest-fetch-mock": "^3.0.3",
47
+ "nodemon": "^2.0.20",
48
+ "prettier": "^3.4.2",
49
+ "supertest": "^6.3.3",
50
+ "ts-jest": "^29.1.1",
51
+ "ts-node": "^10.9.1",
52
+ "typescript": "^5.8.3"
53
+ },
54
+ "dependencies": {
55
+ "@ai-sdk/anthropic": "^1.2.4",
56
+ "@ai-sdk/deepinfra": "^0.2.4",
57
+ "@ai-sdk/fireworks": "^0.2.4",
58
+ "@ai-sdk/google": "^1.2.3",
59
+ "@ai-sdk/google-vertex": "^2.2.15",
60
+ "@ai-sdk/groq": "^1.2.1",
61
+ "@ai-sdk/openai": "^1.3.12",
62
+ "@anthropic-ai/sdk": "^0.24.3",
63
+ "@apidevtools/json-schema-ref-parser": "^11.7.3",
64
+ "@brillout/import": "^0.2.2",
65
+ "@bull-board/api": "^5.20.5",
66
+ "@bull-board/express": "^5.20.5",
67
+ "@devil7softwares/pos": "^1.0.2",
68
+ "@dqbd/tiktoken": "^1.0.17",
69
+ "@google-cloud/storage": "^7.16.0",
70
+ "@nangohq/node": "^0.40.8",
71
+ "@openrouter/ai-sdk-provider": "^0.4.5",
72
+ "@pinecone-database/pinecone": "^4.0.0",
73
+ "@sentry/cli": "^2.33.1",
74
+ "@sentry/node": "^8.26.0",
75
+ "@sentry/profiling-node": "^8.26.0",
76
+ "@supabase/supabase-js": "^2.44.2",
77
+ "@types/ws": "^8.5.12",
78
+ "ai": "^4.3.4",
79
+ "ajv": "^8.16.0",
80
+ "async": "^3.2.5",
81
+ "async-mutex": "^0.5.0",
82
+ "axios": "^1.3.4",
83
+ "axios-retry": "^4.5.0",
84
+ "body-parser": "^1.20.1",
85
+ "bottleneck": "^2.19.5",
86
+ "bullmq": "^5.36.0",
87
+ "cacheable-lookup": "^6.1.0",
88
+ "cheerio": "^1.0.0-rc.12",
89
+ "cohere": "^1.1.1",
90
+ "cohere-ai": "^7.14.0",
91
+ "cors": "^2.8.5",
92
+ "cron-parser": "^4.9.0",
93
+ "date-fns": "^3.6.0",
94
+ "dotenv": "^16.3.1",
95
+ "dotenv-cli": "^7.4.2",
96
+ "escape-html": "^1.0.3",
97
+ "express": "^4.18.2",
98
+ "express-rate-limit": "^7.3.1",
99
+ "express-ws": "^5.0.2",
100
+ "git-diff": "^2.0.6",
101
+ "glob": "^10.4.2",
102
+ "gpt3-tokenizer": "^1.1.5",
103
+ "ioredis": "^5.4.1",
104
+ "ip-address": "^10.0.1",
105
+ "joplin-turndown-plugin-gfm": "^1.0.12",
106
+ "jsdom": "^26.0.0",
107
+ "json-schema-to-zod": "^2.3.0",
108
+ "keyword-extractor": "^0.0.28",
109
+ "koffi": "^2.9.0",
110
+ "languagedetect": "^2.0.0",
111
+ "lodash": "^4.17.21",
112
+ "logsnag": "^1.0.0",
113
+ "luxon": "^3.4.3",
114
+ "mammoth": "^1.7.2",
115
+ "marked": "^14.1.2",
116
+ "md5": "^2.3.0",
117
+ "moment": "^2.29.4",
118
+ "mongoose": "^8.4.4",
119
+ "natural": "^7.0.7",
120
+ "ollama-ai-provider": "^1.2.0",
121
+ "parse-diff": "^0.11.1",
122
+ "pdf-parse": "^1.1.1",
123
+ "pos": "^0.4.2",
124
+ "posthog-node": "^4.0.1",
125
+ "promptable": "^0.0.10",
126
+ "puppeteer": "^22.12.1",
127
+ "rate-limiter-flexible": "2.4.2",
128
+ "redlock": "5.0.0-beta.2",
129
+ "resend": "^3.4.0",
130
+ "robots-parser": "^3.0.1",
131
+ "scrapingbee": "^1.7.4",
132
+ "stripe": "^16.1.0",
133
+ "supabase": "^1.77.9",
134
+ "systeminformation": "^5.22.11",
135
+ "tldts": "^6.1.75",
136
+ "turndown": "^7.1.3",
137
+ "turndown-plugin-gfm": "^1.0.2",
138
+ "typesense": "^1.5.4",
139
+ "undici": "^6.20.1",
140
+ "unstructured-client": "^0.11.3",
141
+ "uuid": "^10.0.0",
142
+ "winston": "^3.14.2",
143
+ "winston-transport": "^4.8.0",
144
+ "wordpos": "^2.1.0",
145
+ "ws": "^8.18.0",
146
+ "xml2js": "^0.6.2",
147
+ "zod": "^3.24.2"
148
+ },
149
+ "nodemonConfig": {
150
+ "ignore": [
151
+ "*.docx",
152
+ "*.json",
153
+ "temp"
154
+ ]
155
+ }
156
+ }
pnpm-lock.yaml ADDED
The diff for this file is too large to render. See raw diff
 
requests.http ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pick your baseUrl here:
2
+ # @baseUrl = http://localhost:3002
3
+ @baseUrl = https://api.firecrawl.dev
4
+
5
+ ### Scrape Website
6
+ # @name scrape
7
+ POST {{baseUrl}}/v1/scrape HTTP/1.1
8
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
9
+ content-type: application/json
10
+
11
+ {
12
+ "url": "https://firecrawl.dev"
13
+ }
14
+
15
+ ### Crawl Website
16
+ # @name crawl
17
+ POST {{baseUrl}}/v1/crawl HTTP/1.1
18
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
19
+ content-type: application/json
20
+
21
+ {
22
+ "url":"https://firecrawl.dev"
23
+ }
24
+
25
+ ### Check Crawl Status
26
+ @crawlId = {{crawl.response.body.$.id}}
27
+ # @name crawlStatus
28
+ GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
29
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
30
+
31
+ ### Cancel Crawl
32
+ @crawlId = {{crawl.response.body.$.id}}
33
+ # @name cancelCrawl
34
+ DELETE {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
35
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
36
+
37
+ ### Extract website
38
+ # @name extract
39
+ POST {{baseUrl}}/v1/extract HTTP/1.1
40
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
41
+ content-type: application/json
42
+
43
+ {
44
+ "urls": ["https://firecrawl.dev"],
45
+ "schema": {
46
+ "type": "object",
47
+ "properties": {
48
+ "companyName": {
49
+ "type": "string"
50
+ },
51
+ "companyDescription": {
52
+ "type": "string"
53
+ }
54
+ }
55
+ },
56
+ "agent": {
57
+ "model": "fire-1"
58
+ },
59
+ "origin": "api-sdk"
60
+ }
61
+
62
+ ### Check Extract Status
63
+ @extractId = {{extract.response.body.$.id}}
64
+ # @name extractStatus
65
+ GET {{baseUrl}}/v1/extract/{{extractId}} HTTP/1.1
66
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
67
+
68
+ ### Batch Scrape Websites
69
+ # @name batchScrape
70
+ POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
71
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
72
+ content-type: application/json
73
+
74
+ {
75
+ "urls": [
76
+ "firecrawl.dev",
77
+ "mendable.ai"
78
+ ]
79
+ }
80
+
81
+ ### Check Batch Scrape Status
82
+ @batchScrapeId = {{batchScrape.response.body.$.id}}
83
+ # @name batchScrapeStatus
84
+ GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
85
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
86
+
87
+ ### Map Website
88
+ # @name map
89
+ POST {{baseUrl}}/v1/map HTTP/1.1
90
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
91
+ content-type: application/json
92
+
93
+ {
94
+ "url": "firecrawl.dev",
95
+ "sitemapOnly": true
96
+ }
97
+
98
+ ### Generate LLMs TXT
99
+ # @name generateLlmsTxt
100
+ POST {{baseUrl}}/v1/llmstxt HTTP/1.1
101
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
102
+ content-type: application/json
103
+
104
+ {
105
+ "url": "https://firecrawl.dev",
106
+ "maxUrls": 1,
107
+ "showFullText": false
108
+ }
109
+
110
+
111
+ ### Check Generate LLMs TXT Status
112
+ @generateLlmsTxtId = {{generateLlmsTxt.response.body.$.id}}
113
+ # @name generateLlmsTxtStatus
114
+ GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
115
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
116
+
117
+
118
+ ### Search
119
+ # @name search
120
+ POST {{baseUrl}}/v1/search HTTP/1.1
121
+ Authorization: Bearer {{$dotenv TEST_API_KEY}}
122
+ content-type: application/json
123
+
124
+ {
125
+ "query": "firecrawl",
126
+ "limit": 50
127
+ }
sharedLibs/go-html-to-md/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ html-to-markdown.so
2
+ html-to-markdown.h
sharedLibs/go-html-to-md/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ To build the go-html-to-md library, run the following command:
2
+
3
+ ```bash
4
+ cd apps/api/src/lib/go-html-to-md
5
+ go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
6
+ chmod +x html-to-markdown.so
7
+ ```
sharedLibs/go-html-to-md/go.mod ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module html-to-markdown.go
2
+
3
+ go 1.19
4
+
5
+ require github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d
6
+
7
+ require (
8
+ github.com/PuerkitoBio/goquery v1.9.2 // indirect
9
+ github.com/andybalholm/cascadia v1.3.2 // indirect
10
+ github.com/kr/pretty v0.3.0 // indirect
11
+ golang.org/x/net v0.25.0 // indirect
12
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
13
+ gopkg.in/yaml.v2 v2.4.0 // indirect
14
+ )
15
+
16
+ replace github.com/JohannesKaufmann/html-to-markdown => github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d
sharedLibs/go-html-to-md/go.sum ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
2
+ github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
3
+ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
4
+ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
5
+ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
6
+ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
7
+ github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
8
+ github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
9
+ github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
10
+ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
11
+ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
12
+ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
13
+ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
14
+ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
15
+ github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
16
+ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
17
+ github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
18
+ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
19
+ github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d h1:NBs5X/qGdcYalsplADJxPR5CjhMWo4PxcjJeIjXm2Ww=
20
+ github.com/tomkosm/html-to-markdown v0.0.0-20250128162844-2f19490e042d/go.mod h1:I2mfsDlV0RelCsTjeYh9mdXdwD2M70rA7LT/y2girik=
21
+ github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
22
+ github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
23
+ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
24
+ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
25
+ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
26
+ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
27
+ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
28
+ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
29
+ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
30
+ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
31
+ golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
32
+ golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
33
+ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
34
+ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
35
+ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
36
+ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
37
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
38
+ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
39
+ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
40
+ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
41
+ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
42
+ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
43
+ golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
44
+ golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
45
+ golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
46
+ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
47
+ golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
48
+ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
49
+ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
50
+ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
51
+ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
52
+ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
53
+ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
54
+ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
55
+ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
56
+ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
57
+ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
58
+ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
59
+ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
60
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
61
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
62
+ gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
63
+ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
64
+ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
sharedLibs/go-html-to-md/html-to-markdown.go ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package main
2
+
3
+ import (
4
+ "C"
5
+ // "log"
6
+
7
+ md "github.com/tomkosm/html-to-markdown"
8
+ "github.com/tomkosm/html-to-markdown/plugin"
9
+ )
10
+
11
+ //export ConvertHTMLToMarkdown
12
+ func ConvertHTMLToMarkdown(html *C.char) *C.char {
13
+ converter := md.NewConverter("", true, nil)
14
+ converter.Use(plugin.GitHubFlavored())
15
+
16
+ markdown, err := converter.ConvertString(C.GoString(html))
17
+ if err != nil {
18
+ // log.Fatal(err)
19
+ }
20
+ return C.CString(markdown)
21
+ }
22
+
23
+ func main() {
24
+ // This function is required for the main package
25
+ }
sharedLibs/html-transformer/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ target
sharedLibs/html-transformer/Cargo.lock ADDED
@@ -0,0 +1,1235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "allocator-api2"
7
+ version = "0.2.21"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
10
+
11
+ [[package]]
12
+ name = "autocfg"
13
+ version = "1.4.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
16
+
17
+ [[package]]
18
+ name = "bitflags"
19
+ version = "1.3.2"
20
+ source = "registry+https://github.com/rust-lang/crates.io-index"
21
+ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
22
+
23
+ [[package]]
24
+ name = "bitflags"
25
+ version = "2.8.0"
26
+ source = "registry+https://github.com/rust-lang/crates.io-index"
27
+ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
28
+
29
+ [[package]]
30
+ name = "byteorder"
31
+ version = "1.5.0"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
34
+
35
+ [[package]]
36
+ name = "cfg-if"
37
+ version = "1.0.0"
38
+ source = "registry+https://github.com/rust-lang/crates.io-index"
39
+ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
40
+
41
+ [[package]]
42
+ name = "convert_case"
43
+ version = "0.4.0"
44
+ source = "registry+https://github.com/rust-lang/crates.io-index"
45
+ checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
46
+
47
+ [[package]]
48
+ name = "cssparser"
49
+ version = "0.27.2"
50
+ source = "registry+https://github.com/rust-lang/crates.io-index"
51
+ checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
52
+ dependencies = [
53
+ "cssparser-macros",
54
+ "dtoa-short",
55
+ "itoa 0.4.8",
56
+ "matches",
57
+ "phf 0.8.0",
58
+ "proc-macro2",
59
+ "quote",
60
+ "smallvec",
61
+ "syn 1.0.109",
62
+ ]
63
+
64
+ [[package]]
65
+ name = "cssparser"
66
+ version = "0.29.6"
67
+ source = "registry+https://github.com/rust-lang/crates.io-index"
68
+ checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa"
69
+ dependencies = [
70
+ "cssparser-macros",
71
+ "dtoa-short",
72
+ "itoa 1.0.14",
73
+ "matches",
74
+ "phf 0.10.1",
75
+ "proc-macro2",
76
+ "quote",
77
+ "smallvec",
78
+ "syn 1.0.109",
79
+ ]
80
+
81
+ [[package]]
82
+ name = "cssparser-macros"
83
+ version = "0.6.1"
84
+ source = "registry+https://github.com/rust-lang/crates.io-index"
85
+ checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
86
+ dependencies = [
87
+ "quote",
88
+ "syn 2.0.96",
89
+ ]
90
+
91
+ [[package]]
92
+ name = "derive_more"
93
+ version = "0.99.18"
94
+ source = "registry+https://github.com/rust-lang/crates.io-index"
95
+ checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
96
+ dependencies = [
97
+ "convert_case",
98
+ "proc-macro2",
99
+ "quote",
100
+ "rustc_version",
101
+ "syn 2.0.96",
102
+ ]
103
+
104
+ [[package]]
105
+ name = "displaydoc"
106
+ version = "0.2.5"
107
+ source = "registry+https://github.com/rust-lang/crates.io-index"
108
+ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
109
+ dependencies = [
110
+ "proc-macro2",
111
+ "quote",
112
+ "syn 2.0.96",
113
+ ]
114
+
115
+ [[package]]
116
+ name = "dtoa"
117
+ version = "1.0.9"
118
+ source = "registry+https://github.com/rust-lang/crates.io-index"
119
+ checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
120
+
121
+ [[package]]
122
+ name = "dtoa-short"
123
+ version = "0.3.5"
124
+ source = "registry+https://github.com/rust-lang/crates.io-index"
125
+ checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
126
+ dependencies = [
127
+ "dtoa",
128
+ ]
129
+
130
+ [[package]]
131
+ name = "encoding_rs"
132
+ version = "0.8.35"
133
+ source = "registry+https://github.com/rust-lang/crates.io-index"
134
+ checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
135
+ dependencies = [
136
+ "cfg-if",
137
+ ]
138
+
139
+ [[package]]
140
+ name = "equivalent"
141
+ version = "1.0.1"
142
+ source = "registry+https://github.com/rust-lang/crates.io-index"
143
+ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
144
+
145
+ [[package]]
146
+ name = "foldhash"
147
+ version = "0.1.4"
148
+ source = "registry+https://github.com/rust-lang/crates.io-index"
149
+ checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
150
+
151
+ [[package]]
152
+ name = "form_urlencoded"
153
+ version = "1.2.1"
154
+ source = "registry+https://github.com/rust-lang/crates.io-index"
155
+ checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
156
+ dependencies = [
157
+ "percent-encoding",
158
+ ]
159
+
160
+ [[package]]
161
+ name = "futf"
162
+ version = "0.1.5"
163
+ source = "registry+https://github.com/rust-lang/crates.io-index"
164
+ checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
165
+ dependencies = [
166
+ "mac",
167
+ "new_debug_unreachable",
168
+ ]
169
+
170
+ [[package]]
171
+ name = "fxhash"
172
+ version = "0.2.1"
173
+ source = "registry+https://github.com/rust-lang/crates.io-index"
174
+ checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
175
+ dependencies = [
176
+ "byteorder",
177
+ ]
178
+
179
+ [[package]]
180
+ name = "getrandom"
181
+ version = "0.1.16"
182
+ source = "registry+https://github.com/rust-lang/crates.io-index"
183
+ checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
184
+ dependencies = [
185
+ "cfg-if",
186
+ "libc",
187
+ "wasi 0.9.0+wasi-snapshot-preview1",
188
+ ]
189
+
190
+ [[package]]
191
+ name = "getrandom"
192
+ version = "0.2.15"
193
+ source = "registry+https://github.com/rust-lang/crates.io-index"
194
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
195
+ dependencies = [
196
+ "cfg-if",
197
+ "libc",
198
+ "wasi 0.11.0+wasi-snapshot-preview1",
199
+ ]
200
+
201
+ [[package]]
202
+ name = "hashbrown"
203
+ version = "0.12.3"
204
+ source = "registry+https://github.com/rust-lang/crates.io-index"
205
+ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
206
+
207
+ [[package]]
208
+ name = "hashbrown"
209
+ version = "0.15.2"
210
+ source = "registry+https://github.com/rust-lang/crates.io-index"
211
+ checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
212
+ dependencies = [
213
+ "allocator-api2",
214
+ "equivalent",
215
+ "foldhash",
216
+ ]
217
+
218
+ [[package]]
219
+ name = "html-transformer"
220
+ version = "0.1.0"
221
+ dependencies = [
222
+ "kuchikiki",
223
+ "libc",
224
+ "lol_html",
225
+ "serde",
226
+ "serde_json",
227
+ "url",
228
+ ]
229
+
230
+ [[package]]
231
+ name = "html5ever"
232
+ version = "0.26.0"
233
+ source = "registry+https://github.com/rust-lang/crates.io-index"
234
+ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
235
+ dependencies = [
236
+ "log",
237
+ "mac",
238
+ "markup5ever",
239
+ "proc-macro2",
240
+ "quote",
241
+ "syn 1.0.109",
242
+ ]
243
+
244
+ [[package]]
245
+ name = "icu_collections"
246
+ version = "1.5.0"
247
+ source = "registry+https://github.com/rust-lang/crates.io-index"
248
+ checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
249
+ dependencies = [
250
+ "displaydoc",
251
+ "yoke",
252
+ "zerofrom",
253
+ "zerovec",
254
+ ]
255
+
256
+ [[package]]
257
+ name = "icu_locid"
258
+ version = "1.5.0"
259
+ source = "registry+https://github.com/rust-lang/crates.io-index"
260
+ checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
261
+ dependencies = [
262
+ "displaydoc",
263
+ "litemap",
264
+ "tinystr",
265
+ "writeable",
266
+ "zerovec",
267
+ ]
268
+
269
+ [[package]]
270
+ name = "icu_locid_transform"
271
+ version = "1.5.0"
272
+ source = "registry+https://github.com/rust-lang/crates.io-index"
273
+ checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
274
+ dependencies = [
275
+ "displaydoc",
276
+ "icu_locid",
277
+ "icu_locid_transform_data",
278
+ "icu_provider",
279
+ "tinystr",
280
+ "zerovec",
281
+ ]
282
+
283
+ [[package]]
284
+ name = "icu_locid_transform_data"
285
+ version = "1.5.0"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
288
+
289
+ [[package]]
290
+ name = "icu_normalizer"
291
+ version = "1.5.0"
292
+ source = "registry+https://github.com/rust-lang/crates.io-index"
293
+ checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
294
+ dependencies = [
295
+ "displaydoc",
296
+ "icu_collections",
297
+ "icu_normalizer_data",
298
+ "icu_properties",
299
+ "icu_provider",
300
+ "smallvec",
301
+ "utf16_iter",
302
+ "utf8_iter",
303
+ "write16",
304
+ "zerovec",
305
+ ]
306
+
307
+ [[package]]
308
+ name = "icu_normalizer_data"
309
+ version = "1.5.0"
310
+ source = "registry+https://github.com/rust-lang/crates.io-index"
311
+ checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
312
+
313
+ [[package]]
314
+ name = "icu_properties"
315
+ version = "1.5.1"
316
+ source = "registry+https://github.com/rust-lang/crates.io-index"
317
+ checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
318
+ dependencies = [
319
+ "displaydoc",
320
+ "icu_collections",
321
+ "icu_locid_transform",
322
+ "icu_properties_data",
323
+ "icu_provider",
324
+ "tinystr",
325
+ "zerovec",
326
+ ]
327
+
328
+ [[package]]
329
+ name = "icu_properties_data"
330
+ version = "1.5.0"
331
+ source = "registry+https://github.com/rust-lang/crates.io-index"
332
+ checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
333
+
334
+ [[package]]
335
+ name = "icu_provider"
336
+ version = "1.5.0"
337
+ source = "registry+https://github.com/rust-lang/crates.io-index"
338
+ checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
339
+ dependencies = [
340
+ "displaydoc",
341
+ "icu_locid",
342
+ "icu_provider_macros",
343
+ "stable_deref_trait",
344
+ "tinystr",
345
+ "writeable",
346
+ "yoke",
347
+ "zerofrom",
348
+ "zerovec",
349
+ ]
350
+
351
+ [[package]]
352
+ name = "icu_provider_macros"
353
+ version = "1.5.0"
354
+ source = "registry+https://github.com/rust-lang/crates.io-index"
355
+ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
356
+ dependencies = [
357
+ "proc-macro2",
358
+ "quote",
359
+ "syn 2.0.96",
360
+ ]
361
+
362
+ [[package]]
363
+ name = "idna"
364
+ version = "1.0.3"
365
+ source = "registry+https://github.com/rust-lang/crates.io-index"
366
+ checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
367
+ dependencies = [
368
+ "idna_adapter",
369
+ "smallvec",
370
+ "utf8_iter",
371
+ ]
372
+
373
+ [[package]]
374
+ name = "idna_adapter"
375
+ version = "1.2.0"
376
+ source = "registry+https://github.com/rust-lang/crates.io-index"
377
+ checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
378
+ dependencies = [
379
+ "icu_normalizer",
380
+ "icu_properties",
381
+ ]
382
+
383
+ [[package]]
384
+ name = "indexmap"
385
+ version = "1.9.3"
386
+ source = "registry+https://github.com/rust-lang/crates.io-index"
387
+ checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
388
+ dependencies = [
389
+ "autocfg",
390
+ "hashbrown 0.12.3",
391
+ ]
392
+
393
+ [[package]]
394
+ name = "itoa"
395
+ version = "0.4.8"
396
+ source = "registry+https://github.com/rust-lang/crates.io-index"
397
+ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
398
+
399
+ [[package]]
400
+ name = "itoa"
401
+ version = "1.0.14"
402
+ source = "registry+https://github.com/rust-lang/crates.io-index"
403
+ checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
404
+
405
+ [[package]]
406
+ name = "kuchikiki"
407
+ version = "0.8.2"
408
+ source = "registry+https://github.com/rust-lang/crates.io-index"
409
+ checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8"
410
+ dependencies = [
411
+ "cssparser 0.27.2",
412
+ "html5ever",
413
+ "indexmap",
414
+ "matches",
415
+ "selectors 0.22.0",
416
+ ]
417
+
418
+ [[package]]
419
+ name = "libc"
420
+ version = "0.2.169"
421
+ source = "registry+https://github.com/rust-lang/crates.io-index"
422
+ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
423
+
424
+ [[package]]
425
+ name = "litemap"
426
+ version = "0.7.4"
427
+ source = "registry+https://github.com/rust-lang/crates.io-index"
428
+ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
429
+
430
+ [[package]]
431
+ name = "lock_api"
432
+ version = "0.4.12"
433
+ source = "registry+https://github.com/rust-lang/crates.io-index"
434
+ checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
435
+ dependencies = [
436
+ "autocfg",
437
+ "scopeguard",
438
+ ]
439
+
440
+ [[package]]
441
+ name = "log"
442
+ version = "0.4.25"
443
+ source = "registry+https://github.com/rust-lang/crates.io-index"
444
+ checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
445
+
446
+ [[package]]
447
+ name = "lol_html"
448
+ version = "2.2.0"
449
+ source = "registry+https://github.com/rust-lang/crates.io-index"
450
+ checksum = "3b1058123f6262982b891dccc395cff0144d9439de366460b47fab719258b96e"
451
+ dependencies = [
452
+ "bitflags 2.8.0",
453
+ "cfg-if",
454
+ "cssparser 0.29.6",
455
+ "encoding_rs",
456
+ "hashbrown 0.15.2",
457
+ "memchr",
458
+ "mime",
459
+ "selectors 0.24.0",
460
+ "thiserror",
461
+ ]
462
+
463
+ [[package]]
464
+ name = "mac"
465
+ version = "0.1.1"
466
+ source = "registry+https://github.com/rust-lang/crates.io-index"
467
+ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
468
+
469
+ [[package]]
470
+ name = "markup5ever"
471
+ version = "0.11.0"
472
+ source = "registry+https://github.com/rust-lang/crates.io-index"
473
+ checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
474
+ dependencies = [
475
+ "log",
476
+ "phf 0.10.1",
477
+ "phf_codegen 0.10.0",
478
+ "string_cache",
479
+ "string_cache_codegen",
480
+ "tendril",
481
+ ]
482
+
483
+ [[package]]
484
+ name = "matches"
485
+ version = "0.1.10"
486
+ source = "registry+https://github.com/rust-lang/crates.io-index"
487
+ checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
488
+
489
+ [[package]]
490
+ name = "memchr"
491
+ version = "2.7.4"
492
+ source = "registry+https://github.com/rust-lang/crates.io-index"
493
+ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
494
+
495
+ [[package]]
496
+ name = "mime"
497
+ version = "0.3.17"
498
+ source = "registry+https://github.com/rust-lang/crates.io-index"
499
+ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
500
+
501
+ [[package]]
502
+ name = "new_debug_unreachable"
503
+ version = "1.0.6"
504
+ source = "registry+https://github.com/rust-lang/crates.io-index"
505
+ checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
506
+
507
+ [[package]]
508
+ name = "nodrop"
509
+ version = "0.1.14"
510
+ source = "registry+https://github.com/rust-lang/crates.io-index"
511
+ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
512
+
513
+ [[package]]
514
+ name = "once_cell"
515
+ version = "1.20.2"
516
+ source = "registry+https://github.com/rust-lang/crates.io-index"
517
+ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
518
+
519
+ [[package]]
520
+ name = "parking_lot"
521
+ version = "0.12.3"
522
+ source = "registry+https://github.com/rust-lang/crates.io-index"
523
+ checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
524
+ dependencies = [
525
+ "lock_api",
526
+ "parking_lot_core",
527
+ ]
528
+
529
+ [[package]]
530
+ name = "parking_lot_core"
531
+ version = "0.9.10"
532
+ source = "registry+https://github.com/rust-lang/crates.io-index"
533
+ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
534
+ dependencies = [
535
+ "cfg-if",
536
+ "libc",
537
+ "redox_syscall",
538
+ "smallvec",
539
+ "windows-targets",
540
+ ]
541
+
542
+ [[package]]
543
+ name = "percent-encoding"
544
+ version = "2.3.1"
545
+ source = "registry+https://github.com/rust-lang/crates.io-index"
546
+ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
547
+
548
+ [[package]]
549
+ name = "phf"
550
+ version = "0.8.0"
551
+ source = "registry+https://github.com/rust-lang/crates.io-index"
552
+ checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
553
+ dependencies = [
554
+ "phf_macros 0.8.0",
555
+ "phf_shared 0.8.0",
556
+ "proc-macro-hack",
557
+ ]
558
+
559
+ [[package]]
560
+ name = "phf"
561
+ version = "0.10.1"
562
+ source = "registry+https://github.com/rust-lang/crates.io-index"
563
+ checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
564
+ dependencies = [
565
+ "phf_macros 0.10.0",
566
+ "phf_shared 0.10.0",
567
+ "proc-macro-hack",
568
+ ]
569
+
570
+ [[package]]
571
+ name = "phf_codegen"
572
+ version = "0.8.0"
573
+ source = "registry+https://github.com/rust-lang/crates.io-index"
574
+ checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
575
+ dependencies = [
576
+ "phf_generator 0.8.0",
577
+ "phf_shared 0.8.0",
578
+ ]
579
+
580
+ [[package]]
581
+ name = "phf_codegen"
582
+ version = "0.10.0"
583
+ source = "registry+https://github.com/rust-lang/crates.io-index"
584
+ checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
585
+ dependencies = [
586
+ "phf_generator 0.10.0",
587
+ "phf_shared 0.10.0",
588
+ ]
589
+
590
+ [[package]]
591
+ name = "phf_generator"
592
+ version = "0.8.0"
593
+ source = "registry+https://github.com/rust-lang/crates.io-index"
594
+ checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
595
+ dependencies = [
596
+ "phf_shared 0.8.0",
597
+ "rand 0.7.3",
598
+ ]
599
+
600
+ [[package]]
601
+ name = "phf_generator"
602
+ version = "0.10.0"
603
+ source = "registry+https://github.com/rust-lang/crates.io-index"
604
+ checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
605
+ dependencies = [
606
+ "phf_shared 0.10.0",
607
+ "rand 0.8.5",
608
+ ]
609
+
610
+ [[package]]
611
+ name = "phf_macros"
612
+ version = "0.8.0"
613
+ source = "registry+https://github.com/rust-lang/crates.io-index"
614
+ checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
615
+ dependencies = [
616
+ "phf_generator 0.8.0",
617
+ "phf_shared 0.8.0",
618
+ "proc-macro-hack",
619
+ "proc-macro2",
620
+ "quote",
621
+ "syn 1.0.109",
622
+ ]
623
+
624
+ [[package]]
625
+ name = "phf_macros"
626
+ version = "0.10.0"
627
+ source = "registry+https://github.com/rust-lang/crates.io-index"
628
+ checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
629
+ dependencies = [
630
+ "phf_generator 0.10.0",
631
+ "phf_shared 0.10.0",
632
+ "proc-macro-hack",
633
+ "proc-macro2",
634
+ "quote",
635
+ "syn 1.0.109",
636
+ ]
637
+
638
+ [[package]]
639
+ name = "phf_shared"
640
+ version = "0.8.0"
641
+ source = "registry+https://github.com/rust-lang/crates.io-index"
642
+ checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
643
+ dependencies = [
644
+ "siphasher",
645
+ ]
646
+
647
+ [[package]]
648
+ name = "phf_shared"
649
+ version = "0.10.0"
650
+ source = "registry+https://github.com/rust-lang/crates.io-index"
651
+ checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
652
+ dependencies = [
653
+ "siphasher",
654
+ ]
655
+
656
+ [[package]]
657
+ name = "ppv-lite86"
658
+ version = "0.2.20"
659
+ source = "registry+https://github.com/rust-lang/crates.io-index"
660
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
661
+ dependencies = [
662
+ "zerocopy",
663
+ ]
664
+
665
+ [[package]]
666
+ name = "precomputed-hash"
667
+ version = "0.1.1"
668
+ source = "registry+https://github.com/rust-lang/crates.io-index"
669
+ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
670
+
671
+ [[package]]
672
+ name = "proc-macro-hack"
673
+ version = "0.5.20+deprecated"
674
+ source = "registry+https://github.com/rust-lang/crates.io-index"
675
+ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
676
+
677
+ [[package]]
678
+ name = "proc-macro2"
679
+ version = "1.0.93"
680
+ source = "registry+https://github.com/rust-lang/crates.io-index"
681
+ checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
682
+ dependencies = [
683
+ "unicode-ident",
684
+ ]
685
+
686
+ [[package]]
687
+ name = "quote"
688
+ version = "1.0.38"
689
+ source = "registry+https://github.com/rust-lang/crates.io-index"
690
+ checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
691
+ dependencies = [
692
+ "proc-macro2",
693
+ ]
694
+
695
+ [[package]]
696
+ name = "rand"
697
+ version = "0.7.3"
698
+ source = "registry+https://github.com/rust-lang/crates.io-index"
699
+ checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
700
+ dependencies = [
701
+ "getrandom 0.1.16",
702
+ "libc",
703
+ "rand_chacha 0.2.2",
704
+ "rand_core 0.5.1",
705
+ "rand_hc",
706
+ "rand_pcg",
707
+ ]
708
+
709
+ [[package]]
710
+ name = "rand"
711
+ version = "0.8.5"
712
+ source = "registry+https://github.com/rust-lang/crates.io-index"
713
+ checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
714
+ dependencies = [
715
+ "libc",
716
+ "rand_chacha 0.3.1",
717
+ "rand_core 0.6.4",
718
+ ]
719
+
720
+ [[package]]
721
+ name = "rand_chacha"
722
+ version = "0.2.2"
723
+ source = "registry+https://github.com/rust-lang/crates.io-index"
724
+ checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
725
+ dependencies = [
726
+ "ppv-lite86",
727
+ "rand_core 0.5.1",
728
+ ]
729
+
730
+ [[package]]
731
+ name = "rand_chacha"
732
+ version = "0.3.1"
733
+ source = "registry+https://github.com/rust-lang/crates.io-index"
734
+ checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
735
+ dependencies = [
736
+ "ppv-lite86",
737
+ "rand_core 0.6.4",
738
+ ]
739
+
740
+ [[package]]
741
+ name = "rand_core"
742
+ version = "0.5.1"
743
+ source = "registry+https://github.com/rust-lang/crates.io-index"
744
+ checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
745
+ dependencies = [
746
+ "getrandom 0.1.16",
747
+ ]
748
+
749
+ [[package]]
750
+ name = "rand_core"
751
+ version = "0.6.4"
752
+ source = "registry+https://github.com/rust-lang/crates.io-index"
753
+ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
754
+ dependencies = [
755
+ "getrandom 0.2.15",
756
+ ]
757
+
758
+ [[package]]
759
+ name = "rand_hc"
760
+ version = "0.2.0"
761
+ source = "registry+https://github.com/rust-lang/crates.io-index"
762
+ checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
763
+ dependencies = [
764
+ "rand_core 0.5.1",
765
+ ]
766
+
767
+ [[package]]
768
+ name = "rand_pcg"
769
+ version = "0.2.1"
770
+ source = "registry+https://github.com/rust-lang/crates.io-index"
771
+ checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
772
+ dependencies = [
773
+ "rand_core 0.5.1",
774
+ ]
775
+
776
+ [[package]]
777
+ name = "redox_syscall"
778
+ version = "0.5.8"
779
+ source = "registry+https://github.com/rust-lang/crates.io-index"
780
+ checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
781
+ dependencies = [
782
+ "bitflags 2.8.0",
783
+ ]
784
+
785
+ [[package]]
786
+ name = "rustc_version"
787
+ version = "0.4.1"
788
+ source = "registry+https://github.com/rust-lang/crates.io-index"
789
+ checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
790
+ dependencies = [
791
+ "semver",
792
+ ]
793
+
794
+ [[package]]
795
+ name = "ryu"
796
+ version = "1.0.18"
797
+ source = "registry+https://github.com/rust-lang/crates.io-index"
798
+ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
799
+
800
+ [[package]]
801
+ name = "scopeguard"
802
+ version = "1.2.0"
803
+ source = "registry+https://github.com/rust-lang/crates.io-index"
804
+ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
805
+
806
+ [[package]]
807
+ name = "selectors"
808
+ version = "0.22.0"
809
+ source = "registry+https://github.com/rust-lang/crates.io-index"
810
+ checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
811
+ dependencies = [
812
+ "bitflags 1.3.2",
813
+ "cssparser 0.27.2",
814
+ "derive_more",
815
+ "fxhash",
816
+ "log",
817
+ "matches",
818
+ "phf 0.8.0",
819
+ "phf_codegen 0.8.0",
820
+ "precomputed-hash",
821
+ "servo_arc 0.1.1",
822
+ "smallvec",
823
+ "thin-slice",
824
+ ]
825
+
826
+ [[package]]
827
+ name = "selectors"
828
+ version = "0.24.0"
829
+ source = "registry+https://github.com/rust-lang/crates.io-index"
830
+ checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
831
+ dependencies = [
832
+ "bitflags 1.3.2",
833
+ "cssparser 0.29.6",
834
+ "derive_more",
835
+ "fxhash",
836
+ "log",
837
+ "phf 0.8.0",
838
+ "phf_codegen 0.8.0",
839
+ "precomputed-hash",
840
+ "servo_arc 0.2.0",
841
+ "smallvec",
842
+ ]
843
+
844
+ [[package]]
845
+ name = "semver"
846
+ version = "1.0.25"
847
+ source = "registry+https://github.com/rust-lang/crates.io-index"
848
+ checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03"
849
+
850
+ [[package]]
851
+ name = "serde"
852
+ version = "1.0.217"
853
+ source = "registry+https://github.com/rust-lang/crates.io-index"
854
+ checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
855
+ dependencies = [
856
+ "serde_derive",
857
+ ]
858
+
859
+ [[package]]
860
+ name = "serde_derive"
861
+ version = "1.0.217"
862
+ source = "registry+https://github.com/rust-lang/crates.io-index"
863
+ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
864
+ dependencies = [
865
+ "proc-macro2",
866
+ "quote",
867
+ "syn 2.0.96",
868
+ ]
869
+
870
+ [[package]]
871
+ name = "serde_json"
872
+ version = "1.0.137"
873
+ source = "registry+https://github.com/rust-lang/crates.io-index"
874
+ checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
875
+ dependencies = [
876
+ "itoa 1.0.14",
877
+ "memchr",
878
+ "ryu",
879
+ "serde",
880
+ ]
881
+
882
+ [[package]]
883
+ name = "servo_arc"
884
+ version = "0.1.1"
885
+ source = "registry+https://github.com/rust-lang/crates.io-index"
886
+ checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
887
+ dependencies = [
888
+ "nodrop",
889
+ "stable_deref_trait",
890
+ ]
891
+
892
+ [[package]]
893
+ name = "servo_arc"
894
+ version = "0.2.0"
895
+ source = "registry+https://github.com/rust-lang/crates.io-index"
896
+ checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741"
897
+ dependencies = [
898
+ "nodrop",
899
+ "stable_deref_trait",
900
+ ]
901
+
902
+ [[package]]
903
+ name = "siphasher"
904
+ version = "0.3.11"
905
+ source = "registry+https://github.com/rust-lang/crates.io-index"
906
+ checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
907
+
908
+ [[package]]
909
+ name = "smallvec"
910
+ version = "1.13.2"
911
+ source = "registry+https://github.com/rust-lang/crates.io-index"
912
+ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
913
+
914
+ [[package]]
915
+ name = "stable_deref_trait"
916
+ version = "1.2.0"
917
+ source = "registry+https://github.com/rust-lang/crates.io-index"
918
+ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
919
+
920
+ [[package]]
921
+ name = "string_cache"
922
+ version = "0.8.7"
923
+ source = "registry+https://github.com/rust-lang/crates.io-index"
924
+ checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
925
+ dependencies = [
926
+ "new_debug_unreachable",
927
+ "once_cell",
928
+ "parking_lot",
929
+ "phf_shared 0.10.0",
930
+ "precomputed-hash",
931
+ "serde",
932
+ ]
933
+
934
+ [[package]]
935
+ name = "string_cache_codegen"
936
+ version = "0.5.2"
937
+ source = "registry+https://github.com/rust-lang/crates.io-index"
938
+ checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
939
+ dependencies = [
940
+ "phf_generator 0.10.0",
941
+ "phf_shared 0.10.0",
942
+ "proc-macro2",
943
+ "quote",
944
+ ]
945
+
946
+ [[package]]
947
+ name = "syn"
948
+ version = "1.0.109"
949
+ source = "registry+https://github.com/rust-lang/crates.io-index"
950
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
951
+ dependencies = [
952
+ "proc-macro2",
953
+ "quote",
954
+ "unicode-ident",
955
+ ]
956
+
957
+ [[package]]
958
+ name = "syn"
959
+ version = "2.0.96"
960
+ source = "registry+https://github.com/rust-lang/crates.io-index"
961
+ checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
962
+ dependencies = [
963
+ "proc-macro2",
964
+ "quote",
965
+ "unicode-ident",
966
+ ]
967
+
968
+ [[package]]
969
+ name = "synstructure"
970
+ version = "0.13.1"
971
+ source = "registry+https://github.com/rust-lang/crates.io-index"
972
+ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
973
+ dependencies = [
974
+ "proc-macro2",
975
+ "quote",
976
+ "syn 2.0.96",
977
+ ]
978
+
979
+ [[package]]
980
+ name = "tendril"
981
+ version = "0.4.3"
982
+ source = "registry+https://github.com/rust-lang/crates.io-index"
983
+ checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
984
+ dependencies = [
985
+ "futf",
986
+ "mac",
987
+ "utf-8",
988
+ ]
989
+
990
+ [[package]]
991
+ name = "thin-slice"
992
+ version = "0.1.1"
993
+ source = "registry+https://github.com/rust-lang/crates.io-index"
994
+ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
995
+
996
+ [[package]]
997
+ name = "thiserror"
998
+ version = "2.0.11"
999
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1000
+ checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
1001
+ dependencies = [
1002
+ "thiserror-impl",
1003
+ ]
1004
+
1005
+ [[package]]
1006
+ name = "thiserror-impl"
1007
+ version = "2.0.11"
1008
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1009
+ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
1010
+ dependencies = [
1011
+ "proc-macro2",
1012
+ "quote",
1013
+ "syn 2.0.96",
1014
+ ]
1015
+
1016
+ [[package]]
1017
+ name = "tinystr"
1018
+ version = "0.7.6"
1019
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1020
+ checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
1021
+ dependencies = [
1022
+ "displaydoc",
1023
+ "zerovec",
1024
+ ]
1025
+
1026
+ [[package]]
1027
+ name = "unicode-ident"
1028
+ version = "1.0.15"
1029
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1030
+ checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
1031
+
1032
+ [[package]]
1033
+ name = "url"
1034
+ version = "2.5.4"
1035
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1036
+ checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
1037
+ dependencies = [
1038
+ "form_urlencoded",
1039
+ "idna",
1040
+ "percent-encoding",
1041
+ ]
1042
+
1043
+ [[package]]
1044
+ name = "utf-8"
1045
+ version = "0.7.6"
1046
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1047
+ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
1048
+
1049
+ [[package]]
1050
+ name = "utf16_iter"
1051
+ version = "1.0.5"
1052
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1053
+ checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
1054
+
1055
+ [[package]]
1056
+ name = "utf8_iter"
1057
+ version = "1.0.4"
1058
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1059
+ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
1060
+
1061
+ [[package]]
1062
+ name = "wasi"
1063
+ version = "0.9.0+wasi-snapshot-preview1"
1064
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1065
+ checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
1066
+
1067
+ [[package]]
1068
+ name = "wasi"
1069
+ version = "0.11.0+wasi-snapshot-preview1"
1070
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1071
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1072
+
1073
+ [[package]]
1074
+ name = "windows-targets"
1075
+ version = "0.52.6"
1076
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1077
+ checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
1078
+ dependencies = [
1079
+ "windows_aarch64_gnullvm",
1080
+ "windows_aarch64_msvc",
1081
+ "windows_i686_gnu",
1082
+ "windows_i686_gnullvm",
1083
+ "windows_i686_msvc",
1084
+ "windows_x86_64_gnu",
1085
+ "windows_x86_64_gnullvm",
1086
+ "windows_x86_64_msvc",
1087
+ ]
1088
+
1089
+ [[package]]
1090
+ name = "windows_aarch64_gnullvm"
1091
+ version = "0.52.6"
1092
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1093
+ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
1094
+
1095
+ [[package]]
1096
+ name = "windows_aarch64_msvc"
1097
+ version = "0.52.6"
1098
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1099
+ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
1100
+
1101
+ [[package]]
1102
+ name = "windows_i686_gnu"
1103
+ version = "0.52.6"
1104
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1105
+ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
1106
+
1107
+ [[package]]
1108
+ name = "windows_i686_gnullvm"
1109
+ version = "0.52.6"
1110
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1111
+ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
1112
+
1113
+ [[package]]
1114
+ name = "windows_i686_msvc"
1115
+ version = "0.52.6"
1116
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1117
+ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
1118
+
1119
+ [[package]]
1120
+ name = "windows_x86_64_gnu"
1121
+ version = "0.52.6"
1122
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1123
+ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
1124
+
1125
+ [[package]]
1126
+ name = "windows_x86_64_gnullvm"
1127
+ version = "0.52.6"
1128
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1129
+ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
1130
+
1131
+ [[package]]
1132
+ name = "windows_x86_64_msvc"
1133
+ version = "0.52.6"
1134
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1135
+ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
1136
+
1137
+ [[package]]
1138
+ name = "write16"
1139
+ version = "1.0.0"
1140
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1141
+ checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
1142
+
1143
+ [[package]]
1144
+ name = "writeable"
1145
+ version = "0.5.5"
1146
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1147
+ checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1148
+
1149
+ [[package]]
1150
+ name = "yoke"
1151
+ version = "0.7.5"
1152
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1153
+ checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
1154
+ dependencies = [
1155
+ "serde",
1156
+ "stable_deref_trait",
1157
+ "yoke-derive",
1158
+ "zerofrom",
1159
+ ]
1160
+
1161
+ [[package]]
1162
+ name = "yoke-derive"
1163
+ version = "0.7.5"
1164
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1165
+ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
1166
+ dependencies = [
1167
+ "proc-macro2",
1168
+ "quote",
1169
+ "syn 2.0.96",
1170
+ "synstructure",
1171
+ ]
1172
+
1173
+ [[package]]
1174
+ name = "zerocopy"
1175
+ version = "0.7.35"
1176
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1177
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1178
+ dependencies = [
1179
+ "byteorder",
1180
+ "zerocopy-derive",
1181
+ ]
1182
+
1183
+ [[package]]
1184
+ name = "zerocopy-derive"
1185
+ version = "0.7.35"
1186
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1187
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
1188
+ dependencies = [
1189
+ "proc-macro2",
1190
+ "quote",
1191
+ "syn 2.0.96",
1192
+ ]
1193
+
1194
+ [[package]]
1195
+ name = "zerofrom"
1196
+ version = "0.1.5"
1197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1198
+ checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
1199
+ dependencies = [
1200
+ "zerofrom-derive",
1201
+ ]
1202
+
1203
+ [[package]]
1204
+ name = "zerofrom-derive"
1205
+ version = "0.1.5"
1206
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1207
+ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
1208
+ dependencies = [
1209
+ "proc-macro2",
1210
+ "quote",
1211
+ "syn 2.0.96",
1212
+ "synstructure",
1213
+ ]
1214
+
1215
+ [[package]]
1216
+ name = "zerovec"
1217
+ version = "0.10.4"
1218
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1219
+ checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
1220
+ dependencies = [
1221
+ "yoke",
1222
+ "zerofrom",
1223
+ "zerovec-derive",
1224
+ ]
1225
+
1226
+ [[package]]
1227
+ name = "zerovec-derive"
1228
+ version = "0.10.3"
1229
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1230
+ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
1231
+ dependencies = [
1232
+ "proc-macro2",
1233
+ "quote",
1234
+ "syn 2.0.96",
1235
+ ]
sharedLibs/html-transformer/Cargo.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "html-transformer"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ libc = "0.2.0"
8
+ lol_html = "2.2.0"
9
+ kuchikiki = "0.8.2"
10
+ serde = { version = "1.0", features = ["derive"] }
11
+ serde_json = "1.0"
12
+ url = "2.5.4"
13
+
14
+ [lib]
15
+ crate-type = ["cdylib"]
sharedLibs/html-transformer/src/lib.rs ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use std::{collections::HashMap, ffi::{CStr, CString}};
2
+
3
+ use kuchikiki::{parse_html, traits::TendrilSink};
4
+ use serde::Deserialize;
5
+ use serde_json::Value;
6
+ use url::Url;
7
+
8
+ /// Extracts links from HTML
9
+ ///
10
+ /// # Safety
11
+ /// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
12
+ #[no_mangle]
13
+ pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
14
+ let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
15
+
16
+ let document = parse_html().one(html);
17
+
18
+ let mut out: Vec<String> = Vec::new();
19
+
20
+ let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
21
+ for anchor in anchors {
22
+ let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
23
+
24
+ if href.starts_with("http:/") && !href.starts_with("http://") {
25
+ href = format!("http://{}", &href[6..]);
26
+ } else if href.starts_with("https:/") && !href.starts_with("https://") {
27
+ href = format!("https://{}", &href[7..]);
28
+ }
29
+
30
+ out.push(href);
31
+ }
32
+
33
+ CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
34
+ }
35
+
36
/// If a `<meta name="$meta_name">` tag exists, copies its `content`
/// attribute into `$out` (a `HashMap<String, Value>`) under key `$out_name`.
/// Only the first matching tag is considered; tags without `content` are
/// ignored.
macro_rules! insert_meta_name {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {
        if let Some(content) = $document
            .select(&format!("meta[name=\"{}\"]", $meta_name))
            .unwrap()
            .next()
            .and_then(|tag| {
                tag.attributes.borrow().get("content").map(|c| c.to_string())
            })
        {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    };
}
43
+
44
/// If a `<meta property="$meta_name">` tag exists, copies its `content`
/// attribute into `$out` (a `HashMap<String, Value>`) under key `$out_name`.
/// Mirrors `insert_meta_name!` but matches on the `property` attribute
/// (used by Open Graph / article tags). Only the first match is considered.
macro_rules! insert_meta_property {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {
        if let Some(content) = $document
            .select(&format!("meta[property=\"{}\"]", $meta_name))
            .unwrap()
            .next()
            .and_then(|tag| {
                tag.attributes.borrow().get("content").map(|c| c.to_string())
            })
        {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    };
}
51
+
52
/// Extracts metadata from HTML
///
/// # Safety
/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
#[no_mangle]
pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char {
    // NOTE(review): this unwrap panics on invalid UTF-8 input, and a panic
    // unwinding across the FFI boundary is undefined behavior — consider
    // returning a sentinel instead (as transform_html does for bad input).
    let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

    let document = parse_html().one(html);
    let mut out = HashMap::<String, Value>::new();

    // <title> text is stored first; the catch-all meta loop below explicitly
    // refuses to overwrite/convert the "title" key.
    if let Some(title) = document.select("title").unwrap().next() {
        out.insert("title".to_string(), Value::String(title.text_contents()));
    }
    // insert_meta_name!(out, document, "description", "description");

    // Favicon: prefer an exact rel="icon" match, then fall back to any rel
    // containing "icon" (e.g. "shortcut icon", "apple-touch-icon").
    if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
        .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
        .or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
            .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
        out.insert("favicon".to_string(), Value::String(favicon_link));
    }

    if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
        out.insert("language".to_string(), Value::String(lang));
    }

    // insert_meta_name!(out, document, "keywords", "keywords");
    // insert_meta_name!(out, document, "robots", "robots");
    // Well-known OpenGraph properties, remapped to camelCase output keys.
    insert_meta_property!(out, document, "og:title", "ogTitle");
    insert_meta_property!(out, document, "og:description", "ogDescription");
    insert_meta_property!(out, document, "og:url", "ogUrl");
    insert_meta_property!(out, document, "og:image", "ogImage");
    insert_meta_property!(out, document, "og:audio", "ogAudio");
    insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
    insert_meta_property!(out, document, "og:locale", "ogLocale");

    // og:locale:alternate may appear multiple times; collect every value
    // into a JSON array under the raw property name.
    for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
        let attrs = meta.attributes.borrow();

        if let Some(content) = attrs.get("content") {
            if let Some(v) = out.get_mut("og:locale:alternate") {
                match v {
                    Value::Array(x) => {
                        x.push(Value::String(content.to_string()));
                    },
                    // Key is only ever created as an array right below.
                    _ => unreachable!(),
                }
            } else {
                out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
            }
        }
    }

    insert_meta_property!(out, document, "og:site_name", "ogSiteName");
    insert_meta_property!(out, document, "og:video", "ogVideo");
    insert_meta_name!(out, document, "article:section", "articleSection");
    insert_meta_name!(out, document, "article:tag", "articleTag");
    insert_meta_property!(out, document, "article:published_time", "publishedTime");
    insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
    // Dublin Core metadata (both "dc." and "dcterms." spellings).
    insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
    insert_meta_name!(out, document, "dc.description", "dcDescription");
    insert_meta_name!(out, document, "dc.subject", "dcSubject");
    insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
    insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
    insert_meta_name!(out, document, "dc.type", "dcType");
    insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
    insert_meta_name!(out, document, "dc.date", "dcDate");
    insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
    insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");

    // Catch-all: every remaining <meta> with a name or property and a content
    // attribute is copied under its raw name. Repeated names are promoted from
    // a string to an array of strings.
    for meta in document.select("meta").unwrap() {
        let meta = meta.as_node().as_element().unwrap();
        let attrs = meta.attributes.borrow();

        if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
            if let Some(content) = attrs.get("content") {
                if let Some(v) = out.get(name) {
                    match v {
                        Value::String(_) => {
                            if name != "title" { // preserve title tag in metadata
                                out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
                            }
                        },
                        Value::Array(_) => {
                            match out.get_mut(name) {
                                Some(Value::Array(x)) => {
                                    x.push(Value::String(content.to_string()));
                                },
                                _ => unreachable!(),
                            }
                        },
                        _ => unreachable!(),
                    }
                } else {
                    out.insert(name.to_string(), Value::String(content.to_string()));
                }
            }
        }
    }

    CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
}
155
+
156
/// CSS selectors for boilerplate regions (navigation, footers, ads, popups,
/// social widgets, cookie banners, …) that are stripped from the document when
/// `only_main_content` is requested — unless a match contains one of
/// `FORCE_INCLUDE_MAIN_TAGS`.
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    "header",
    "footer",
    "nav",
    "aside",
    ".header",
    ".top",
    ".navbar",
    "#header",
    ".footer",
    ".bottom",
    "#footer",
    ".sidebar",
    ".side",
    ".aside",
    "#sidebar",
    ".modal",
    ".popup",
    "#modal",
    ".overlay",
    ".ad",
    ".ads",
    ".advert",
    "#ad",
    ".lang-selector",
    ".language",
    "#language-selector",
    ".social",
    ".social-media",
    ".social-links",
    "#social",
    ".menu",
    ".navigation",
    "#nav",
    ".breadcrumbs",
    "#breadcrumbs",
    ".share",
    "#share",
    ".widget",
    "#widget",
    ".cookie",
    "#cookie",
];
199
+
200
/// Selectors that override `EXCLUDE_NON_MAIN_TAGS`: an element matched for
/// removal is kept if any descendant matches one of these.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
    "#main",

    // swoogo event software as .widget in all of their content
    ".swoogo-cols",
    ".swoogo-text",
    ".swoogo-table-div",
    ".swoogo-space",
    ".swoogo-alert",
    ".swoogo-sponsors",
    ".swoogo-title",
    ".swoogo-tabs",
    ".swoogo-logo",
    ".swoogo-image",
    ".swoogo-button",
    ".swoogo-agenda",
];
217
+
218
/// Options payload for `transform_html`, deserialized from the JSON string
/// passed over FFI. Field names must match the JSON keys sent by the caller.
/// NOTE(review): "Tranform" is a typo for "Transform", but the struct name is
/// internal-only; renaming is safe if done consistently within this file.
#[derive(Deserialize)]
struct TranformHTMLOptions {
    // Raw HTML to clean up.
    html: String,
    // Base URL used to absolutize relative img src / anchor href values.
    url: String,
    // If non-empty, output contains ONLY elements matching these selectors.
    include_tags: Vec<String>,
    // Selectors whose matches are removed from the document.
    exclude_tags: Vec<String>,
    // When true, boilerplate (EXCLUDE_NON_MAIN_TAGS) is stripped as well.
    only_main_content: bool,
}
226
+
227
/// One candidate from an `<img srcset>` attribute.
struct ImageSource {
    // Candidate image URL.
    url: String,
    // Numeric part of the descriptor ("2x" -> 2, "480w" -> 480).
    size: i32,
    // True for density descriptors ("x"), false for width descriptors ("w").
    is_x: bool,
}
232
+
233
/// Core of `transform_html`: cleans the parsed document in place and returns
/// the serialized result. Steps, in order: optional include-tag filtering,
/// removal of head/meta/noscript/style/script, caller-specified exclusions,
/// optional boilerplate stripping, srcset resolution, and absolutization of
/// img src / anchor href against `opts.url`. Any selector or parse failure is
/// collapsed into `Err(())`, which the FFI wrapper reports as a sentinel.
fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
    let mut document = parse_html().one(opts.html);

    // include_tags: rebuild the document from only the matching subtrees,
    // appended under a fresh <div> root in selector order.
    if !opts.include_tags.is_empty() {
        let new_document = parse_html().one("<div></div>");
        let root = new_document.select_first("div")?;

        for x in opts.include_tags.iter() {
            let matching_nodes: Vec<_> = document.select(x)?.collect();
            for tag in matching_nodes {
                root.as_node().append(tag.as_node().clone());
            }
        }

        document = new_document;
    }

    // Drop non-content elements one at a time; select_first errs (ending the
    // loop) once no match remains.
    while let Ok(x) = document.select_first("head") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("meta") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("noscript") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("style") {
        x.as_node().detach();
    }

    while let Ok(x) = document.select_first("script") {
        x.as_node().detach();
    }

    for x in opts.exclude_tags.iter() {
        // TODO: implement weird version
        while let Ok(x) = document.select_first(x) {
            x.as_node().detach();
        }
    }

    // Boilerplate stripping: remove each EXCLUDE_NON_MAIN_TAGS match unless it
    // contains a FORCE_INCLUDE_MAIN_TAGS descendant.
    if opts.only_main_content {
        for x in EXCLUDE_NON_MAIN_TAGS.iter() {
            let x: Vec<_> = document.select(x)?.collect();
            for tag in x {
                if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(x).is_ok_and(|mut x| x.next().is_some())) {
                    tag.as_node().detach();
                }
            }
        }
    }

    // Resolve srcset: parse "url descriptor" pairs, defaulting a missing
    // descriptor to "1x", and point src at the largest candidate.
    // NOTE(review): "w" and "x" sizes are compared on the raw number without
    // unit normalization — confirm this ranking is intended for mixed srcsets.
    let srcset_images: Vec<_> = document.select("img[srcset]")?.collect();
    for img in srcset_images {
        let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.split(",").filter_map(|x| {
            let tok: Vec<&str> = x.trim().split(" ").collect();
            let tok_1 = if tok.len() > 1 && !tok[1].is_empty() {
                tok[1]
            } else {
                "1x"
            };
            if let Ok(parsed_size) = tok_1[..tok_1.len()-1].parse() {
                Some(ImageSource {
                    url: tok[0].to_string(),
                    size: parsed_size,
                    is_x: tok_1.ends_with("x")
                })
            } else {
                None
            }
        }).collect();

        // If every candidate is density-based, the plain src acts as the
        // implicit 1x entry.
        if sizes.iter().all(|x| x.is_x) {
            if let Some(src) = img.attributes.borrow().get("src").map(|x| x.to_string()) {
                sizes.push(ImageSource {
                    url: src,
                    size: 1,
                    is_x: true,
                });
            }
        }

        // Descending by size, so the largest candidate comes first.
        sizes.sort_by(|a, b| b.size.cmp(&a.size));

        if let Some(biggest) = sizes.first() {
            img.attributes.borrow_mut().insert("src", biggest.url.clone());
        }
    }

    let url = Url::parse(&opts.url).map_err(|_| ())?;

    // Absolutize image sources; unjoinable values are left untouched.
    let src_images: Vec<_> = document.select("img[src]")?.collect();
    for img in src_images {
        let old = img.attributes.borrow().get("src").map(|x| x.to_string()).ok_or(())?;
        if let Ok(new) = url.join(&old) {
            img.attributes.borrow_mut().insert("src", new.to_string());
        }
    }

    // Absolutize anchor targets the same way.
    let href_anchors: Vec<_> = document.select("a[href]")?.collect();
    for anchor in href_anchors {
        let old = anchor.attributes.borrow().get("href").map(|x| x.to_string()).ok_or(())?;
        if let Ok(new) = url.join(&old) {
            anchor.attributes.borrow_mut().insert("href", new.to_string());
        }
    }

    Ok(document.to_string())
}
345
+
346
+ /// Transforms rawHtml to html (formerly removeUnwantedElements)
347
+ ///
348
+ /// # Safety
349
+ /// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
350
+ #[no_mangle]
351
+ pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut libc::c_char {
352
+ let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
353
+ Ok(x) => x,
354
+ Err(_) => {
355
+ return CString::new("RUSTFC:ERROR").unwrap().into_raw();
356
+ }
357
+ };
358
+
359
+ let out = match _transform_html_inner(opts) {
360
+ Ok(x) => x,
361
+ Err(_) => "RUSTFC:ERROR".to_string(),
362
+ };
363
+
364
+ CString::new(out).unwrap().into_raw()
365
+ }
366
+
367
+ fn _get_inner_json(html: &str) -> Result<String, ()> {
368
+ Ok(parse_html().one(html).select_first("body")?.text_contents())
369
+ }
370
+
371
+ /// For JSON pages retrieved by browser engines, this function can be used to transform it back into valid JSON.
372
+ ///
373
+ /// # Safety
374
+ /// Input must be a C HTML string. Output will be an HTML string. Output string must be freed with free_string.
375
+ #[no_mangle]
376
+ pub unsafe extern "C" fn get_inner_json(html: *const libc::c_char) -> *mut libc::c_char {
377
+ let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
378
+
379
+ let out = match _get_inner_json(html) {
380
+ Ok(x) => x,
381
+ Err(_) => "RUSTFC:ERROR".to_string(),
382
+ };
383
+
384
+ CString::new(out).unwrap().into_raw()
385
+ }
386
+
387
+ /// Frees a string allocated in Rust-land.
388
+ ///
389
+ /// # Safety
390
+ /// ptr must be a non-freed string pointer returned by Rust code.
391
+ #[no_mangle]
392
+ pub unsafe extern "C" fn free_string(ptr: *mut libc::c_char) {
393
+ drop(unsafe { CString::from_raw(ptr) })
394
+ }
src/__tests__/concurrency-limit.test.ts ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../services/queue-service";
2
+ import {
3
+ cleanOldConcurrencyLimitEntries,
4
+ getConcurrencyLimitActiveJobs,
5
+ pushConcurrencyLimitActiveJob,
6
+ removeConcurrencyLimitActiveJob,
7
+ takeConcurrencyLimitedJob,
8
+ pushConcurrencyLimitedJob,
9
+ getConcurrencyQueueJobsCount,
10
+ ConcurrencyLimitedJob,
11
+ } from "../lib/concurrency-limit";
12
+
13
// Mock Redis client: replace the real ioredis connection with jest.fn() stubs
// for the sorted-set commands the concurrency-limit module uses, so the suite
// only asserts on keys/scores/members passed to Redis (no server needed).
jest.mock("../services/queue-service", () => ({
  redisConnection: {
    zremrangebyscore: jest.fn(),
    zrangebyscore: jest.fn(),
    zadd: jest.fn(),
    zrem: jest.fn(),
    zmpop: jest.fn(),
    zcard: jest.fn(),
  },
}));
24
+
25
+ describe("Concurrency Limit", () => {
26
+ const mockTeamId = "test-team-id";
27
+ const mockJobId = "test-job-id";
28
+ const mockNow = 1000000;
29
+
30
+ beforeEach(() => {
31
+ jest.clearAllMocks();
32
+ });
33
+
34
+ describe("cleanOldConcurrencyLimitEntries", () => {
35
+ it("should remove entries older than current timestamp", async () => {
36
+ await cleanOldConcurrencyLimitEntries(mockTeamId, mockNow);
37
+
38
+ expect(redisConnection.zremrangebyscore).toHaveBeenCalledWith(
39
+ "concurrency-limiter:test-team-id",
40
+ -Infinity,
41
+ mockNow
42
+ );
43
+ });
44
+ });
45
+
46
+ describe("getConcurrencyLimitActiveJobs", () => {
47
+ it("should return active jobs after given timestamp", async () => {
48
+ const mockActiveJobs = ["job1", "job2"];
49
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue(mockActiveJobs);
50
+
51
+ const result = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
52
+
53
+ expect(result).toEqual(mockActiveJobs);
54
+ expect(redisConnection.zrangebyscore).toHaveBeenCalledWith(
55
+ "concurrency-limiter:test-team-id",
56
+ mockNow,
57
+ Infinity
58
+ );
59
+ });
60
+
61
+ it("should return empty array when no active jobs", async () => {
62
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
63
+
64
+ const result = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
65
+
66
+ expect(result).toEqual([]);
67
+ });
68
+ });
69
+
70
+ describe("pushConcurrencyLimitActiveJob", () => {
71
+ it("should add job with expiration timestamp", async () => {
72
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJobId, 2 * 60 * 1000, mockNow);
73
+
74
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
75
+ "concurrency-limiter:test-team-id",
76
+ mockNow + 2 * 60 * 1000, // stalledJobTimeoutMs
77
+ mockJobId
78
+ );
79
+ });
80
+ });
81
+
82
+ describe("removeConcurrencyLimitActiveJob", () => {
83
+ it("should remove job from active jobs", async () => {
84
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJobId);
85
+
86
+ expect(redisConnection.zrem).toHaveBeenCalledWith(
87
+ "concurrency-limiter:test-team-id",
88
+ mockJobId
89
+ );
90
+ });
91
+ });
92
+
93
+ describe("Queue Operations", () => {
94
+ const mockJob: ConcurrencyLimitedJob = {
95
+ id: mockJobId,
96
+ data: { test: "data" },
97
+ opts: {},
98
+ priority: 1,
99
+ };
100
+
101
+ describe("takeConcurrencyLimitedJob", () => {
102
+ it("should return null when queue is empty", async () => {
103
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue(null);
104
+
105
+ const result = await takeConcurrencyLimitedJob(mockTeamId);
106
+
107
+ expect(result).toBeNull();
108
+ });
109
+
110
+ it("should return and remove the highest priority job", async () => {
111
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue([
112
+ "key",
113
+ [[JSON.stringify(mockJob)]],
114
+ ]);
115
+
116
+ const result = await takeConcurrencyLimitedJob(mockTeamId);
117
+
118
+ expect(result).toEqual(mockJob);
119
+ expect(redisConnection.zmpop).toHaveBeenCalledWith(
120
+ 1,
121
+ "concurrency-limit-queue:test-team-id",
122
+ "MIN"
123
+ );
124
+ });
125
+ });
126
+
127
+ describe("pushConcurrencyLimitedJob", () => {
128
+ it("should add job to queue with priority", async () => {
129
+ await pushConcurrencyLimitedJob(mockTeamId, mockJob);
130
+
131
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
132
+ "concurrency-limit-queue:test-team-id",
133
+ mockJob.priority,
134
+ JSON.stringify(mockJob)
135
+ );
136
+ });
137
+
138
+ it("should use default priority 1 when not specified", async () => {
139
+ const jobWithoutPriority = { ...mockJob };
140
+ delete jobWithoutPriority.priority;
141
+
142
+ await pushConcurrencyLimitedJob(mockTeamId, jobWithoutPriority);
143
+
144
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
145
+ "concurrency-limit-queue:test-team-id",
146
+ 1,
147
+ JSON.stringify(jobWithoutPriority)
148
+ );
149
+ });
150
+ });
151
+
152
+ describe("getConcurrencyQueueJobsCount", () => {
153
+ it("should return the number of jobs in queue", async () => {
154
+ const mockCount = 5;
155
+ (redisConnection.zcard as jest.Mock).mockResolvedValue(mockCount);
156
+
157
+ const result = await getConcurrencyQueueJobsCount(mockTeamId);
158
+
159
+ expect(result).toBe(mockCount);
160
+ expect(redisConnection.zcard).toHaveBeenCalledWith(
161
+ "concurrency-limit-queue:test-team-id"
162
+ );
163
+ });
164
+
165
+ it("should return 0 for empty queue", async () => {
166
+ (redisConnection.zcard as jest.Mock).mockResolvedValue(0);
167
+
168
+ const result = await getConcurrencyQueueJobsCount(mockTeamId);
169
+
170
+ expect(result).toBe(0);
171
+ });
172
+ });
173
+ });
174
+
175
+ describe("Integration Scenarios", () => {
176
+ it("should handle complete job lifecycle", async () => {
177
+ const mockJob: ConcurrencyLimitedJob = {
178
+ id: "lifecycle-test",
179
+ data: { test: "lifecycle" },
180
+ opts: {},
181
+ };
182
+
183
+ // Push job to queue
184
+ await pushConcurrencyLimitedJob(mockTeamId, mockJob);
185
+ expect(redisConnection.zadd).toHaveBeenCalled();
186
+
187
+ // Take job from queue
188
+ (redisConnection.zmpop as jest.Mock).mockResolvedValue([
189
+ "key",
190
+ [[JSON.stringify(mockJob)]],
191
+ ]);
192
+ const takenJob = await takeConcurrencyLimitedJob(mockTeamId);
193
+ expect(takenJob).toEqual(mockJob);
194
+
195
+ // Add to active jobs
196
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJob.id, 2 * 60 * 1000, mockNow);
197
+ expect(redisConnection.zadd).toHaveBeenCalled();
198
+
199
+ // Verify active jobs
200
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([mockJob.id]);
201
+ const activeJobs = await getConcurrencyLimitActiveJobs(mockTeamId, mockNow);
202
+ expect(activeJobs).toContain(mockJob.id);
203
+
204
+ // Remove from active jobs
205
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
206
+ expect(redisConnection.zrem).toHaveBeenCalled();
207
+ });
208
+ });
209
+ });
src/__tests__/deep-research/unit/deep-research-redis.test.ts ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../../../services/queue-service";
2
+ import {
3
+ saveDeepResearch,
4
+ getDeepResearch,
5
+ updateDeepResearch,
6
+ getDeepResearchExpiry,
7
+ StoredDeepResearch,
8
+ } from "../../../lib/deep-research/deep-research-redis";
9
+
10
+ jest.mock("../../../services/queue-service", () => ({
11
+ redisConnection: {
12
+ set: jest.fn(),
13
+ get: jest.fn(),
14
+ expire: jest.fn(),
15
+ pttl: jest.fn(),
16
+ },
17
+ }));
18
+
19
+ describe("Deep Research Redis Operations", () => {
20
+ const mockResearch: StoredDeepResearch = {
21
+ id: "test-id",
22
+ team_id: "team-1",
23
+ createdAt: Date.now(),
24
+ status: "processing",
25
+ currentDepth: 0,
26
+ maxDepth: 5,
27
+ completedSteps: 0,
28
+ totalExpectedSteps: 25,
29
+ findings: [],
30
+ sources: [],
31
+ activities: [],
32
+ summaries: [],
33
+ };
34
+
35
+ beforeEach(() => {
36
+ jest.clearAllMocks();
37
+ });
38
+
39
+ describe("saveDeepResearch", () => {
40
+ it("should save research data to Redis with TTL", async () => {
41
+ await saveDeepResearch("test-id", mockResearch);
42
+
43
+ expect(redisConnection.set).toHaveBeenCalledWith(
44
+ "deep-research:test-id",
45
+ JSON.stringify(mockResearch)
46
+ );
47
+ expect(redisConnection.expire).toHaveBeenCalledWith(
48
+ "deep-research:test-id",
49
+ 6 * 60 * 60
50
+ );
51
+ });
52
+ });
53
+
54
+ describe("getDeepResearch", () => {
55
+ it("should retrieve research data from Redis", async () => {
56
+ (redisConnection.get as jest.Mock).mockResolvedValue(
57
+ JSON.stringify(mockResearch)
58
+ );
59
+
60
+ const result = await getDeepResearch("test-id");
61
+ expect(result).toEqual(mockResearch);
62
+ expect(redisConnection.get).toHaveBeenCalledWith("deep-research:test-id");
63
+ });
64
+
65
+ it("should return null when research not found", async () => {
66
+ (redisConnection.get as jest.Mock).mockResolvedValue(null);
67
+
68
+ const result = await getDeepResearch("non-existent-id");
69
+ expect(result).toBeNull();
70
+ });
71
+ });
72
+
73
+ describe("updateDeepResearch", () => {
74
+ it("should update existing research with new data", async () => {
75
+ (redisConnection.get as jest.Mock).mockResolvedValue(
76
+ JSON.stringify(mockResearch)
77
+ );
78
+
79
+ const update = {
80
+ status: "completed" as const,
81
+ finalAnalysis: "Test analysis",
82
+ activities: [
83
+ {
84
+ type: "search" as const,
85
+ status: "complete" as const,
86
+ message: "New activity",
87
+ timestamp: new Date().toISOString(),
88
+ depth: 1,
89
+ },
90
+ ],
91
+ };
92
+
93
+ await updateDeepResearch("test-id", update);
94
+
95
+ const expectedUpdate = {
96
+ ...mockResearch,
97
+ ...update,
98
+ activities: [...mockResearch.activities, ...update.activities],
99
+ };
100
+
101
+ expect(redisConnection.set).toHaveBeenCalledWith(
102
+ "deep-research:test-id",
103
+ JSON.stringify(expectedUpdate)
104
+ );
105
+ expect(redisConnection.expire).toHaveBeenCalledWith(
106
+ "deep-research:test-id",
107
+ 6 * 60 * 60
108
+ );
109
+ });
110
+
111
+ it("should do nothing if research not found", async () => {
112
+ (redisConnection.get as jest.Mock).mockResolvedValue(null);
113
+
114
+ await updateDeepResearch("test-id", { status: "completed" });
115
+
116
+ expect(redisConnection.set).not.toHaveBeenCalled();
117
+ expect(redisConnection.expire).not.toHaveBeenCalled();
118
+ });
119
+ });
120
+
121
+ describe("getDeepResearchExpiry", () => {
122
+ it("should return correct expiry date", async () => {
123
+ const mockTTL = 3600000; // 1 hour in milliseconds
124
+ (redisConnection.pttl as jest.Mock).mockResolvedValue(mockTTL);
125
+
126
+ const result = await getDeepResearchExpiry("test-id");
127
+
128
+ expect(result).toBeInstanceOf(Date);
129
+ expect(result.getTime()).toBeCloseTo(
130
+ new Date().getTime() + mockTTL,
131
+ -2 // Allow 100ms precision
132
+ );
133
+ });
134
+ });
135
+ });
src/__tests__/e2e_extract/index.test.ts ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import {
4
+ FirecrawlCrawlResponse,
5
+ FirecrawlCrawlStatusResponse,
6
+ FirecrawlScrapeResponse,
7
+ } from "../../types";
8
+
9
dotenv.config();
// Target of all requests: a locally running API instance; auth comes from
// TEST_API_KEY in the environment.
const TEST_URL = "http://127.0.0.1:3002";

// End-to-end tests for POST /v1/extract against live websites. These depend on
// external page content and an LLM, so most assertions are tolerant: they
// count fuzzy matches ("gotItRight") and require only a minimum number.
describe("E2E Tests for Extract API Routes", () => {
  it.concurrent(
    "should return authors of blog posts on firecrawl.dev",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev/*"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: {
              authors: { type: "array", items: { type: "string" } },
            },
          },
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("authors");

      // Count known authors; require at least two hits to pass.
      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
        if (author.includes("Jon")) gotItRight++;
        if (author.includes("Wendong")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(1);
    },
    60000,
  );

  it.concurrent(
    "should return founders of firecrawl.dev (allowExternalLinks = true)",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["firecrawl.dev/*"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              founders: { type: "array", items: { type: "string" } },
            },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      console.log(response.body.data?.founders);
      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
        if (founder.includes("nick")) gotItRight++;
        if (founder.includes("eric")) gotItRight++;
        if (founder.includes("jon-noronha")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThanOrEqual(2);
    },
    60000,
  );

  it.concurrent(
    "should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev/*"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string",
            },
            required: ["items"],
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Firecrawl Example Creator")) gotItRight++;
        if (hiring.includes("Senior Frontend Engineer")) gotItRight++;
        if (hiring.includes("Technical Chief of Staff")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Founding Fullstack Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(2);
    },
    60000,
  );

  it.concurrent(
    "should return PCI DSS compliance for Fivetran",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com/*"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" },
            },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return Azure Data Connectors for Fivetran",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com/*"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" },
              },
            },
          },
        });

      // NOTE(review): assertions are commented out, so this case only logs the
      // response and can never fail — confirm whether it should be re-enabled
      // or marked .skip.
      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return Greenhouse Applicant Tracking System for Abnormal Security",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: [
            "https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003",
          ],
          prompt: "what applicant tracking system is this company using?",
          schema: {
            type: "object",
            properties: {
              isGreenhouseATS: { type: "boolean" },
              answer: { type: "string" },
            },
          },
          allowExternalLinks: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.isGreenhouseATS).toBe(true);
    },
    60000,
  );

  it.concurrent(
    "should return mintlify api components",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://mintlify.com/docs/*"],
          prompt: "what are the 4 API components?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                component: { type: "string" },
              },
            },
            required: ["items"],
          },
          allowExternalLinks: true,
        });

      console.log(response.body.data?.items);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.items.length).toBe(4);
      let gotItRight = 0;
      for (const component of response.body.data?.items) {
        if (component.component.toLowerCase().includes("parameter"))
          gotItRight++;
        if (component.component.toLowerCase().includes("response"))
          gotItRight++;
        if (component.component.toLowerCase().includes("expandable"))
          gotItRight++;
        if (component.component.toLowerCase().includes("sticky")) gotItRight++;
        if (component.component.toLowerCase().includes("examples"))
          gotItRight++;
      }
      expect(gotItRight).toBeGreaterThan(2);
    },
    60000,
  );

  it.concurrent(
    "should return information about Eric Ciarla",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://ericciarla.com/"],
          prompt:
            "Who is Eric Ciarla? Where does he work? Where did he go to school?",
          schema: {
            type: "object",
            properties: {
              name: { type: "string" },
              work: { type: "string" },
              education: { type: "string" },
            },
            required: ["name", "work", "education"],
          },
          allowExternalLinks: true,
        });

      console.log(response.body.data);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.name).toBe("Eric Ciarla");
      expect(response.body.data?.work).toBeDefined();
      expect(response.body.data?.education).toBeDefined();
    },
    60000,
  );

  it.concurrent(
    "should extract information without a schema",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://docs.firecrawl.dev"],
          prompt: "What is the title and description of the page?",
        });

      console.log(response.body.data);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      // Without a schema the shape is model-chosen; only check it is a
      // non-empty object.
      expect(typeof response.body.data).toBe("object");
      expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
    },
    60000,
  );

  it.concurrent(
    "should extract information with scrapeOptions.waitFor",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/"],
          prompt: "What is the content right after the #content-1 id?",
          schema: {
            type: "object",
            properties: {
              content: { type: "string" },
            },
            required: ["content"],
          },
          scrapeOptions: {
            // Test page injects the target text after ~5s; wait 6s so the
            // scrape sees it.
            waitFor: 6000,
          }
        });

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(typeof response.body.data).toBe("object");
      expect(response.body.data?.content).toBeDefined();
      expect(response.body.data?.content).toBe("Content loaded after 5 seconds!");
    },
    60000,
  );
});
src/__tests__/e2e_full_withAuth/index.test.ts ADDED
@@ -0,0 +1,1762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import { v4 as uuidv4 } from "uuid";
4
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
5
+
6
+ dotenv.config();
7
+
8
+ // const TEST_URL = 'http://localhost:3002'
9
+ const TEST_URL = "http://127.0.0.1:3002";
10
+
11
+ describe("E2E Tests for API Routes", () => {
12
+ beforeAll(() => {
13
+ process.env.USE_DB_AUTHENTICATION = "true";
14
+ });
15
+
16
+ afterAll(() => {
17
+ delete process.env.USE_DB_AUTHENTICATION;
18
+ });
19
+ describe("GET /", () => {
20
+ it.concurrent("should return Hello, world! message", async () => {
21
+ const response = await request(TEST_URL).get("/");
22
+
23
+ expect(response.statusCode).toBe(200);
24
+ expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
25
+ });
26
+ });
27
+
28
+ describe("GET /test", () => {
29
+ it.concurrent("should return Hello, world! message", async () => {
30
+ const response = await request(TEST_URL).get("/test");
31
+ expect(response.statusCode).toBe(200);
32
+ expect(response.text).toContain("Hello, world!");
33
+ });
34
+ });
35
+
36
+ describe("POST /v0/scrape", () => {
37
+ it.concurrent("should require authorization", async () => {
38
+ const response = await request(TEST_URL).post("/v0/scrape");
39
+ expect(response.statusCode).toBe(401);
40
+ });
41
+
42
+ it.concurrent(
43
+ "should return an error response with an invalid API key",
44
+ async () => {
45
+ const response = await request(TEST_URL)
46
+ .post("/v0/scrape")
47
+ .set("Authorization", `Bearer invalid-api-key`)
48
+ .set("Content-Type", "application/json")
49
+ .send({ url: "https://firecrawl.dev" });
50
+ expect(response.statusCode).toBe(401);
51
+ },
52
+ );
53
+
54
+ it.concurrent("should return an error for a blocklisted URL", async () => {
55
+ const blocklistedUrl = "https://facebook.com/fake-test";
56
+ const response = await request(TEST_URL)
57
+ .post("/v0/scrape")
58
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
59
+ .set("Content-Type", "application/json")
60
+ .send({ url: blocklistedUrl });
61
+ expect(response.statusCode).toBe(403);
62
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
63
+ });
64
+
65
+
66
+ it.concurrent(
67
+ "should return a successful response with a valid API key",
68
+ async () => {
69
+ const response = await request(TEST_URL)
70
+ .post("/v0/scrape")
71
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
72
+ .set("Content-Type", "application/json")
73
+ .send({ url: "https://roastmywebsite.ai" });
74
+ expect(response.statusCode).toBe(200);
75
+ expect(response.body).toHaveProperty("data");
76
+ expect(response.body.data).toHaveProperty("content");
77
+ expect(response.body.data).toHaveProperty("markdown");
78
+ expect(response.body.data).toHaveProperty("metadata");
79
+ expect(response.body.data).not.toHaveProperty("html");
80
+ expect(response.body.data.content).toContain("_Roast_");
81
+ expect(response.body.data.metadata).toHaveProperty("title");
82
+ expect(response.body.data.metadata).toHaveProperty("description");
83
+ expect(response.body.data.metadata).toHaveProperty("keywords");
84
+ expect(response.body.data.metadata).toHaveProperty("robots");
85
+ expect(response.body.data.metadata).toHaveProperty("ogTitle");
86
+ expect(response.body.data.metadata).toHaveProperty("ogDescription");
87
+ expect(response.body.data.metadata).toHaveProperty("ogUrl");
88
+ expect(response.body.data.metadata).toHaveProperty("ogImage");
89
+ expect(response.body.data.metadata).toHaveProperty("ogLocaleAlternate");
90
+ expect(response.body.data.metadata).toHaveProperty("ogSiteName");
91
+ expect(response.body.data.metadata).toHaveProperty("sourceURL");
92
+ expect(response.body.data.metadata).toHaveProperty("pageStatusCode");
93
+ expect(response.body.data.metadata.pageError).toBeUndefined();
94
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
95
+ expect(response.body.data.metadata.description).toBe(
96
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
97
+ );
98
+ expect(response.body.data.metadata.keywords).toBe(
99
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
100
+ );
101
+ expect(response.body.data.metadata.robots).toBe("follow, index");
102
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
103
+ expect(response.body.data.metadata.ogDescription).toBe(
104
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
105
+ );
106
+ expect(response.body.data.metadata.ogUrl).toBe(
107
+ "https://www.roastmywebsite.ai",
108
+ );
109
+ expect(response.body.data.metadata.ogImage).toBe(
110
+ "https://www.roastmywebsite.ai/og.png",
111
+ );
112
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
113
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
114
+ expect(response.body.data.metadata.sourceURL).toBe(
115
+ "https://roastmywebsite.ai",
116
+ );
117
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
118
+ },
119
+ 30000,
120
+ ); // 30 seconds timeout
121
+
122
+ it.concurrent(
123
+ "should return a successful response with a valid API key and includeHtml set to true",
124
+ async () => {
125
+ const response = await request(TEST_URL)
126
+ .post("/v0/scrape")
127
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
128
+ .set("Content-Type", "application/json")
129
+ .send({
130
+ url: "https://roastmywebsite.ai",
131
+ pageOptions: { includeHtml: true },
132
+ });
133
+ expect(response.statusCode).toBe(200);
134
+ expect(response.body).toHaveProperty("data");
135
+ expect(response.body.data).toHaveProperty("content");
136
+ expect(response.body.data).toHaveProperty("markdown");
137
+ expect(response.body.data).toHaveProperty("html");
138
+ expect(response.body.data).toHaveProperty("metadata");
139
+ expect(response.body.data.content).toContain("_Roast_");
140
+ expect(response.body.data.markdown).toContain("_Roast_");
141
+ expect(response.body.data.html).toContain("<h1");
142
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
143
+ expect(response.body.data.metadata.pageError).toBeUndefined();
144
+ },
145
+ 30000,
146
+ ); // 30 seconds timeout
147
+
148
+ it.concurrent(
149
+ "should return a successful response with a valid API key and includeRawHtml set to true",
150
+ async () => {
151
+ const response = await request(TEST_URL)
152
+ .post("/v0/scrape")
153
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
154
+ .set("Content-Type", "application/json")
155
+ .send({
156
+ url: "https://roastmywebsite.ai",
157
+ pageOptions: { includeRawHtml: true },
158
+ });
159
+ expect(response.statusCode).toBe(200);
160
+ expect(response.body).toHaveProperty("data");
161
+ expect(response.body.data).toHaveProperty("content");
162
+ expect(response.body.data).toHaveProperty("markdown");
163
+ expect(response.body.data).toHaveProperty("rawHtml");
164
+ expect(response.body.data).toHaveProperty("metadata");
165
+ expect(response.body.data.content).toContain("_Roast_");
166
+ expect(response.body.data.markdown).toContain("_Roast_");
167
+ expect(response.body.data.rawHtml).toContain("<h1");
168
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
169
+ expect(response.body.data.metadata.pageError).toBeUndefined();
170
+ },
171
+ 30000,
172
+ ); // 30 seconds timeout
173
+
174
+ it.concurrent(
175
+ "should return a successful response for a valid scrape with PDF file",
176
+ async () => {
177
+ const response = await request(TEST_URL)
178
+ .post("/v0/scrape")
179
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
180
+ .set("Content-Type", "application/json")
181
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
182
+ await new Promise((r) => setTimeout(r, 6000));
183
+
184
+ expect(response.statusCode).toBe(200);
185
+ expect(response.body).toHaveProperty("data");
186
+ expect(response.body.data).toHaveProperty("content");
187
+ expect(response.body.data).toHaveProperty("metadata");
188
+ expect(response.body.data.content).toContain(
189
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
190
+ );
191
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
192
+ expect(response.body.data.metadata.pageError).toBeUndefined();
193
+ },
194
+ 60000,
195
+ ); // 60 seconds
196
+
197
+ it.concurrent(
198
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
199
+ async () => {
200
+ const response = await request(TEST_URL)
201
+ .post("/v0/scrape")
202
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
203
+ .set("Content-Type", "application/json")
204
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
205
+ await new Promise((r) => setTimeout(r, 6000));
206
+
207
+ expect(response.statusCode).toBe(200);
208
+ expect(response.body).toHaveProperty("data");
209
+ expect(response.body.data).toHaveProperty("content");
210
+ expect(response.body.data).toHaveProperty("metadata");
211
+ expect(response.body.data.content).toContain(
212
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
213
+ );
214
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
215
+ expect(response.body.data.metadata.pageError).toBeUndefined();
216
+ },
217
+ 60000,
218
+ ); // 60 seconds
219
+
220
+ it.concurrent(
221
+ "should return a successful response for a valid scrape with PDF file and parsePDF set to false",
222
+ async () => {
223
+ const response = await request(TEST_URL)
224
+ .post("/v0/scrape")
225
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
226
+ .set("Content-Type", "application/json")
227
+ .send({
228
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
229
+ pageOptions: { parsePDF: false },
230
+ });
231
+ await new Promise((r) => setTimeout(r, 6000));
232
+
233
+ expect(response.statusCode).toBe(200);
234
+ expect(response.body).toHaveProperty("data");
235
+ expect(response.body.data).toHaveProperty("content");
236
+ expect(response.body.data).toHaveProperty("metadata");
237
+ expect(response.body.data.content).toContain(
238
+ "/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj",
239
+ );
240
+ },
241
+ 60000,
242
+ ); // 60 seconds
243
+
244
+ it.concurrent(
245
+ "should return a successful response with a valid API key with removeTags option",
246
+ async () => {
247
+ const responseWithoutRemoveTags = await request(TEST_URL)
248
+ .post("/v0/scrape")
249
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
250
+ .set("Content-Type", "application/json")
251
+ .send({ url: "https://www.scrapethissite.com/" });
252
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
253
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
254
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
255
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
256
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
257
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
258
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
259
+ "Scrape This Site",
260
+ );
261
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
262
+ "Lessons and Videos",
263
+ ); // #footer
264
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
265
+ "[Sandbox](",
266
+ ); // .nav
267
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
268
+ "web scraping",
269
+ ); // strong
270
+
271
+ const response = await request(TEST_URL)
272
+ .post("/v0/scrape")
273
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
274
+ .set("Content-Type", "application/json")
275
+ .send({
276
+ url: "https://www.scrapethissite.com/",
277
+ pageOptions: { removeTags: [".nav", "#footer", "strong"] },
278
+ });
279
+ expect(response.statusCode).toBe(200);
280
+ expect(response.body).toHaveProperty("data");
281
+ expect(response.body.data).toHaveProperty("content");
282
+ expect(response.body.data).toHaveProperty("markdown");
283
+ expect(response.body.data).toHaveProperty("metadata");
284
+ expect(response.body.data).not.toHaveProperty("html");
285
+ expect(response.body.data.content).toContain("Scrape This Site");
286
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
287
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
288
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
289
+ },
290
+ 30000,
291
+ ); // 30 seconds timeout
292
+
293
+ // TODO: add this test back once we nail the waitFor option to be more deterministic
294
+ // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
295
+ // const startTime = Date.now();
296
+ // const response = await request(TEST_URL)
297
+ // .post("/v0/scrape")
298
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
299
+ // .set("Content-Type", "application/json")
300
+ // .send({ url: "https://firecrawl.dev", pageOptions: { waitFor: 7000 } });
301
+ // const endTime = Date.now();
302
+ // const duration = endTime - startTime;
303
+
304
+ // expect(response.statusCode).toBe(200);
305
+ // expect(response.body).toHaveProperty("data");
306
+ // expect(response.body.data).toHaveProperty("content");
307
+ // expect(response.body.data).toHaveProperty("markdown");
308
+ // expect(response.body.data).toHaveProperty("metadata");
309
+ // expect(response.body.data).not.toHaveProperty("html");
310
+ // expect(response.body.data.content).toContain("🔥 Firecrawl");
311
+ // expect(duration).toBeGreaterThanOrEqual(7000);
312
+ // }, 12000); // 12 seconds timeout
313
+
314
+ it.concurrent(
315
+ "should return a successful response for a scrape with 400 page",
316
+ async () => {
317
+ const response = await request(TEST_URL)
318
+ .post("/v0/scrape")
319
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
320
+ .set("Content-Type", "application/json")
321
+ .send({ url: "https://httpstat.us/400" });
322
+ await new Promise((r) => setTimeout(r, 5000));
323
+
324
+ expect(response.statusCode).toBe(200);
325
+ expect(response.body).toHaveProperty("data");
326
+ expect(response.body.data).toHaveProperty("content");
327
+ expect(response.body.data).toHaveProperty("metadata");
328
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
329
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
330
+ "bad request",
331
+ );
332
+ },
333
+ 60000,
334
+ ); // 60 seconds
335
+
336
+ it.concurrent(
337
+ "should return a successful response for a scrape with 401 page",
338
+ async () => {
339
+ const response = await request(TEST_URL)
340
+ .post("/v0/scrape")
341
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
342
+ .set("Content-Type", "application/json")
343
+ .send({ url: "https://httpstat.us/401" });
344
+ await new Promise((r) => setTimeout(r, 5000));
345
+
346
+ expect(response.statusCode).toBe(200);
347
+ expect(response.body).toHaveProperty("data");
348
+ expect(response.body.data).toHaveProperty("content");
349
+ expect(response.body.data).toHaveProperty("metadata");
350
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
351
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
352
+ "unauthorized",
353
+ );
354
+ },
355
+ 60000,
356
+ ); // 60 seconds
357
+
358
+ it.concurrent(
359
+ "should return a successful response for a scrape with 403 page",
360
+ async () => {
361
+ const response = await request(TEST_URL)
362
+ .post("/v0/scrape")
363
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
364
+ .set("Content-Type", "application/json")
365
+ .send({ url: "https://httpstat.us/403" });
366
+
367
+ await new Promise((r) => setTimeout(r, 5000));
368
+ expect(response.statusCode).toBe(200);
369
+ expect(response.body).toHaveProperty("data");
370
+ expect(response.body.data).toHaveProperty("content");
371
+ expect(response.body.data).toHaveProperty("metadata");
372
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
373
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
374
+ "forbidden",
375
+ );
376
+ },
377
+ 60000,
378
+ ); // 60 seconds
379
+
380
+ it.concurrent(
381
+ "should return a successful response for a scrape with 404 page",
382
+ async () => {
383
+ const response = await request(TEST_URL)
384
+ .post("/v0/scrape")
385
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
386
+ .set("Content-Type", "application/json")
387
+ .send({ url: "https://httpstat.us/404" });
388
+ await new Promise((r) => setTimeout(r, 5000));
389
+
390
+ expect(response.statusCode).toBe(200);
391
+ expect(response.body).toHaveProperty("data");
392
+ expect(response.body.data).toHaveProperty("content");
393
+ expect(response.body.data).toHaveProperty("metadata");
394
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
395
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
396
+ "not found",
397
+ );
398
+ },
399
+ 60000,
400
+ ); // 60 seconds
401
+
402
+ it.concurrent(
403
+ "should return a successful response for a scrape with 405 page",
404
+ async () => {
405
+ const response = await request(TEST_URL)
406
+ .post("/v0/scrape")
407
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
408
+ .set("Content-Type", "application/json")
409
+ .send({ url: "https://httpstat.us/405" });
410
+ await new Promise((r) => setTimeout(r, 5000));
411
+
412
+ expect(response.statusCode).toBe(200);
413
+ expect(response.body).toHaveProperty("data");
414
+ expect(response.body.data).toHaveProperty("content");
415
+ expect(response.body.data).toHaveProperty("metadata");
416
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
417
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
418
+ "method not allowed",
419
+ );
420
+ },
421
+ 60000,
422
+ ); // 60 seconds
423
+
424
+ it.concurrent(
425
+ "should return a successful response for a scrape with 500 page",
426
+ async () => {
427
+ const response = await request(TEST_URL)
428
+ .post("/v0/scrape")
429
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
430
+ .set("Content-Type", "application/json")
431
+ .send({ url: "https://httpstat.us/500" });
432
+ await new Promise((r) => setTimeout(r, 5000));
433
+
434
+ expect(response.statusCode).toBe(200);
435
+ expect(response.body).toHaveProperty("data");
436
+ expect(response.body.data).toHaveProperty("content");
437
+ expect(response.body.data).toHaveProperty("metadata");
438
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
439
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
440
+ "internal server error",
441
+ );
442
+ },
443
+ 60000,
444
+ ); // 60 seconds
445
+ });
446
+
447
+ describe("POST /v0/crawl", () => {
448
+ it.concurrent("should require authorization", async () => {
449
+ const response = await request(TEST_URL).post("/v0/crawl");
450
+ expect(response.statusCode).toBe(401);
451
+ });
452
+
453
+ it.concurrent(
454
+ "should return an error response with an invalid API key",
455
+ async () => {
456
+ const response = await request(TEST_URL)
457
+ .post("/v0/crawl")
458
+ .set("Authorization", `Bearer invalid-api-key`)
459
+ .set("Content-Type", "application/json")
460
+ .send({ url: "https://firecrawl.dev" });
461
+ expect(response.statusCode).toBe(401);
462
+ },
463
+ );
464
+
465
+ it.concurrent("should return an error for a blocklisted URL", async () => {
466
+ const blocklistedUrl = "https://twitter.com/fake-test";
467
+ const response = await request(TEST_URL)
468
+ .post("/v0/crawl")
469
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
470
+ .set("Content-Type", "application/json")
471
+ .send({ url: blocklistedUrl });
472
+ expect(response.statusCode).toBe(403);
473
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
474
+ });
475
+
476
+ it.concurrent(
477
+ "should return a successful response with a valid API key for crawl",
478
+ async () => {
479
+ const response = await request(TEST_URL)
480
+ .post("/v0/crawl")
481
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
482
+ .set("Content-Type", "application/json")
483
+ .send({ url: "https://firecrawl.dev" });
484
+ expect(response.statusCode).toBe(200);
485
+ expect(response.body).toHaveProperty("jobId");
486
+ expect(response.body.jobId).toMatch(
487
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
488
+ );
489
+ },
490
+ );
491
+ it.concurrent(
492
+ "should prevent duplicate requests using the same idempotency key",
493
+ async () => {
494
+ const uniqueIdempotencyKey = uuidv4();
495
+
496
+ // First request with the idempotency key
497
+ const firstResponse = await request(TEST_URL)
498
+ .post("/v0/crawl")
499
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
500
+ .set("Content-Type", "application/json")
501
+ .set("x-idempotency-key", uniqueIdempotencyKey)
502
+ .send({ url: "https://docs.firecrawl.dev" });
503
+
504
+ expect(firstResponse.statusCode).toBe(200);
505
+
506
+ // Second request with the same idempotency key
507
+ const secondResponse = await request(TEST_URL)
508
+ .post("/v0/crawl")
509
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
510
+ .set("Content-Type", "application/json")
511
+ .set("x-idempotency-key", uniqueIdempotencyKey)
512
+ .send({ url: "https://docs.firecrawl.dev" });
513
+
514
+ expect(secondResponse.statusCode).toBe(409);
515
+ expect(secondResponse.body.error).toBe("Idempotency key already used");
516
+ },
517
+ );
518
+
519
+ it.concurrent(
520
+ "should return a successful response with a valid API key and valid includes option",
521
+ async () => {
522
+ const crawlResponse = await request(TEST_URL)
523
+ .post("/v0/crawl")
524
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
525
+ .set("Content-Type", "application/json")
526
+ .send({
527
+ url: "https://mendable.ai",
528
+ limit: 10,
529
+ crawlerOptions: {
530
+ includes: ["blog/*"],
531
+ },
532
+ });
533
+
534
+ let response;
535
+ let isFinished = false;
536
+
537
+ while (!isFinished) {
538
+ response = await request(TEST_URL)
539
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
540
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
541
+
542
+ expect(response.statusCode).toBe(200);
543
+ expect(response.body).toHaveProperty("status");
544
+ isFinished = response.body.status === "completed";
545
+
546
+ if (!isFinished) {
547
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
548
+ }
549
+ }
550
+
551
+ const completedResponse = response;
552
+
553
+ const urls = completedResponse.body.data.map(
554
+ (item: any) => item.metadata?.sourceURL,
555
+ );
556
+ expect(urls.length).toBeGreaterThan(5);
557
+ urls.forEach((url: string) => {
558
+ expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
559
+ });
560
+
561
+ expect(completedResponse.statusCode).toBe(200);
562
+ expect(completedResponse.body).toHaveProperty("status");
563
+ expect(completedResponse.body.status).toBe("completed");
564
+ expect(completedResponse.body).toHaveProperty("data");
565
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
566
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
567
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
568
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
569
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
570
+ 200,
571
+ );
572
+ expect(
573
+ completedResponse.body.data[0].metadata.pageError,
574
+ ).toBeUndefined();
575
+ },
576
+ 60000,
577
+ ); // 60 seconds
578
+
579
+ it.concurrent(
580
+ "should return a successful response with a valid API key and valid excludes option",
581
+ async () => {
582
+ const crawlResponse = await request(TEST_URL)
583
+ .post("/v0/crawl")
584
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
585
+ .set("Content-Type", "application/json")
586
+ .send({
587
+ url: "https://mendable.ai",
588
+ limit: 10,
589
+ crawlerOptions: {
590
+ excludes: ["blog/*"],
591
+ },
592
+ });
593
+
594
+ let isFinished = false;
595
+ let response;
596
+
597
+ while (!isFinished) {
598
+ response = await request(TEST_URL)
599
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
600
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
601
+
602
+ expect(response.statusCode).toBe(200);
603
+ expect(response.body).toHaveProperty("status");
604
+ isFinished = response.body.status === "completed";
605
+
606
+ if (!isFinished) {
607
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
608
+ }
609
+ }
610
+
611
+ const completedResponse = response;
612
+
613
+ const urls = completedResponse.body.data.map(
614
+ (item: any) => item.metadata?.sourceURL,
615
+ );
616
+ expect(urls.length).toBeGreaterThan(5);
617
+ urls.forEach((url: string) => {
618
+ expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
619
+ });
620
+ },
621
+ 90000,
622
+ ); // 90 seconds
623
+
624
+ it.concurrent(
625
+ "should return a successful response with a valid API key and limit to 3",
626
+ async () => {
627
+ const crawlResponse = await request(TEST_URL)
628
+ .post("/v0/crawl")
629
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
630
+ .set("Content-Type", "application/json")
631
+ .send({
632
+ url: "https://mendable.ai",
633
+ crawlerOptions: { limit: 3 },
634
+ });
635
+
636
+ let isFinished = false;
637
+ let response;
638
+
639
+ while (!isFinished) {
640
+ response = await request(TEST_URL)
641
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
642
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
643
+
644
+ expect(response.statusCode).toBe(200);
645
+ expect(response.body).toHaveProperty("status");
646
+ isFinished = response.body.status === "completed";
647
+
648
+ if (!isFinished) {
649
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
650
+ }
651
+ }
652
+
653
+ const completedResponse = response;
654
+
655
+ expect(completedResponse.statusCode).toBe(200);
656
+ expect(completedResponse.body).toHaveProperty("status");
657
+ expect(completedResponse.body.status).toBe("completed");
658
+ expect(completedResponse.body).toHaveProperty("data");
659
+ expect(completedResponse.body.data.length).toBe(3);
660
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
661
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
662
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
663
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
664
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
665
+ 200,
666
+ );
667
+ expect(
668
+ completedResponse.body.data[0].metadata.pageError,
669
+ ).toBeUndefined();
670
+ },
671
+ 60000,
672
+ ); // 60 seconds
673
+
674
+ it.concurrent(
675
+ "should return a successful response with max depth option for a valid crawl job",
676
+ async () => {
677
+ const crawlResponse = await request(TEST_URL)
678
+ .post("/v0/crawl")
679
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
680
+ .set("Content-Type", "application/json")
681
+ .send({
682
+ url: "https://www.scrapethissite.com",
683
+ crawlerOptions: { maxDepth: 1 },
684
+ });
685
+ expect(crawlResponse.statusCode).toBe(200);
686
+
687
+ const response = await request(TEST_URL)
688
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
689
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
690
+ expect(response.statusCode).toBe(200);
691
+ expect(response.body).toHaveProperty("status");
692
+ expect(["active", "waiting"]).toContain(response.body.status);
693
+ // wait for 60 seconds
694
+ let isCompleted = false;
695
+ while (!isCompleted) {
696
+ const statusCheckResponse = await request(TEST_URL)
697
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
698
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
699
+ expect(statusCheckResponse.statusCode).toBe(200);
700
+ isCompleted = statusCheckResponse.body.status === "completed";
701
+ if (!isCompleted) {
702
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
703
+ }
704
+ }
705
+ const completedResponse = await request(TEST_URL)
706
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
707
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
708
+
709
+ expect(completedResponse.statusCode).toBe(200);
710
+ expect(completedResponse.body).toHaveProperty("status");
711
+ expect(completedResponse.body.status).toBe("completed");
712
+ expect(completedResponse.body).toHaveProperty("data");
713
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
714
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
715
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
716
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
717
+ 200,
718
+ );
719
+ expect(
720
+ completedResponse.body.data[0].metadata.pageError,
721
+ ).toBeUndefined();
722
+ const urls = completedResponse.body.data.map(
723
+ (item: any) => item.metadata?.sourceURL,
724
+ );
725
+ expect(urls.length).toBeGreaterThan(1);
726
+
727
+ // Check if all URLs have a maximum depth of 1
728
+ urls.forEach((url: string) => {
729
+ const pathSplits = new URL(url).pathname.split("/");
730
+ const depth =
731
+ pathSplits.length -
732
+ (pathSplits[0].length === 0 &&
733
+ pathSplits[pathSplits.length - 1].length === 0
734
+ ? 1
735
+ : 0);
736
+ expect(depth).toBeLessThanOrEqual(2);
737
+ });
738
+ },
739
+ 180000,
740
+ );
741
+
742
+ it.concurrent(
743
+ "should return a successful response with relative max depth option for a valid crawl job",
744
+ async () => {
745
+ const crawlResponse = await request(TEST_URL)
746
+ .post("/v0/crawl")
747
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
748
+ .set("Content-Type", "application/json")
749
+ .send({
750
+ url: "https://www.scrapethissite.com/pages/",
751
+ crawlerOptions: { maxDepth: 1 },
752
+ });
753
+ expect(crawlResponse.statusCode).toBe(200);
754
+
755
+ const response = await request(TEST_URL)
756
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
757
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
758
+ expect(response.statusCode).toBe(200);
759
+ expect(response.body).toHaveProperty("status");
760
+ expect(["active", "waiting"]).toContain(response.body.status);
761
+ // wait for 60 seconds
762
+ let isCompleted = false;
763
+ while (!isCompleted) {
764
+ const statusCheckResponse = await request(TEST_URL)
765
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
766
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
767
+ expect(statusCheckResponse.statusCode).toBe(200);
768
+ isCompleted = statusCheckResponse.body.status === "completed";
769
+ if (!isCompleted) {
770
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
771
+ }
772
+ }
773
+ const completedResponse = await request(TEST_URL)
774
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
775
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
776
+
777
+ expect(completedResponse.statusCode).toBe(200);
778
+ expect(completedResponse.body).toHaveProperty("status");
779
+ expect(completedResponse.body.status).toBe("completed");
780
+ expect(completedResponse.body).toHaveProperty("data");
781
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
782
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
783
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
784
+ const urls = completedResponse.body.data.map(
785
+ (item: any) => item.metadata?.sourceURL,
786
+ );
787
+ expect(urls.length).toBeGreaterThan(1);
788
+
789
+ // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
790
+ urls.forEach((url: string) => {
791
+ const pathSplits = new URL(url).pathname.split("/");
792
+ const depth =
793
+ pathSplits.length -
794
+ (pathSplits[0].length === 0 &&
795
+ pathSplits[pathSplits.length - 1].length === 0
796
+ ? 1
797
+ : 0);
798
+ expect(depth).toBeLessThanOrEqual(3);
799
+ });
800
+ },
801
+ 180000,
802
+ );
803
+
804
+ it.concurrent(
805
+ "should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero",
806
+ async () => {
807
+ const crawlResponse = await request(TEST_URL)
808
+ .post("/v0/crawl")
809
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
810
+ .set("Content-Type", "application/json")
811
+ .send({
812
+ url: "https://www.mendable.ai",
813
+ crawlerOptions: { maxDepth: 0 },
814
+ });
815
+ expect(crawlResponse.statusCode).toBe(200);
816
+
817
+ const response = await request(TEST_URL)
818
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
819
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
820
+ expect(response.statusCode).toBe(200);
821
+ expect(response.body).toHaveProperty("status");
822
+ expect(["active", "waiting"]).toContain(response.body.status);
823
+ // wait for 60 seconds
824
+ let isCompleted = false;
825
+ while (!isCompleted) {
826
+ const statusCheckResponse = await request(TEST_URL)
827
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
828
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
829
+ expect(statusCheckResponse.statusCode).toBe(200);
830
+ isCompleted = statusCheckResponse.body.status === "completed";
831
+ if (!isCompleted) {
832
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
833
+ }
834
+ }
835
+ const completedResponse = await request(TEST_URL)
836
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
837
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
838
+
839
+ const testurls = completedResponse.body.data.map(
840
+ (item: any) => item.metadata?.sourceURL,
841
+ );
842
+ //console.log(testurls)
843
+
844
+ expect(completedResponse.statusCode).toBe(200);
845
+ expect(completedResponse.body).toHaveProperty("status");
846
+ expect(completedResponse.body.status).toBe("completed");
847
+ expect(completedResponse.body).toHaveProperty("data");
848
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
849
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
850
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
851
+ const urls = completedResponse.body.data.map(
852
+ (item: any) => item.metadata?.sourceURL,
853
+ );
854
+ expect(urls.length).toBeGreaterThanOrEqual(1);
855
+
856
+ // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1
857
+ urls.forEach((url: string) => {
858
+ const pathSplits = new URL(url).pathname.split("/");
859
+ const depth =
860
+ pathSplits.length -
861
+ (pathSplits[0].length === 0 &&
862
+ pathSplits[pathSplits.length - 1].length === 0
863
+ ? 1
864
+ : 0);
865
+ expect(depth).toBeLessThanOrEqual(1);
866
+ });
867
+ },
868
+ 180000,
869
+ );
870
+
871
+ // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
872
+ // const crawlResponse = await request(TEST_URL)
873
+ // .post("/v0/crawl")
874
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
875
+ // .set("Content-Type", "application/json")
876
+ // .send({
877
+ // url: "https://mendable.ai",
878
+ // crawlerOptions: { limit: 10 },
879
+ // });
880
+
881
+ // const response = await request(TEST_URL)
882
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
883
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
884
+ // expect(response.statusCode).toBe(200);
885
+ // expect(response.body).toHaveProperty("status");
886
+ // expect(response.body.status).toBe("active");
887
+
888
+ // let isCompleted = false;
889
+ // while (!isCompleted) {
890
+ // const statusCheckResponse = await request(TEST_URL)
891
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
892
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
893
+ // expect(statusCheckResponse.statusCode).toBe(200);
894
+ // isCompleted = statusCheckResponse.body.status === "completed";
895
+ // if (!isCompleted) {
896
+ // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
897
+ // }
898
+ // }
899
+
900
+ // const completedResponse = await request(TEST_URL)
901
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
902
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
903
+
904
+ // expect(completedResponse.statusCode).toBe(200);
905
+ // expect(completedResponse.body).toHaveProperty("status");
906
+ // expect(completedResponse.body.status).toBe("completed");
907
+ // expect(completedResponse.body).toHaveProperty("data");
908
+ // expect(completedResponse.body.data.length).toBe(10);
909
+ // expect(completedResponse.body.data[0]).toHaveProperty("content");
910
+ // expect(completedResponse.body.data[0]).toHaveProperty("markdown");
911
+ // expect(completedResponse.body.data[0]).toHaveProperty("metadata");
912
+ // expect(completedResponse.body.data[0].content).toContain("Mendable");
913
+ // expect(completedResponse.body.data[0].content).not.toContain("main menu");
914
+ // }, 60000); // 60 seconds
915
+
916
+ it.concurrent(
917
+ "should return a successful response for a valid crawl job with includeHtml set to true option",
918
+ async () => {
919
+ const crawlResponse = await request(TEST_URL)
920
+ .post("/v0/crawl")
921
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
922
+ .set("Content-Type", "application/json")
923
+ .send({
924
+ url: "https://roastmywebsite.ai",
925
+ pageOptions: { includeHtml: true },
926
+ });
927
+ expect(crawlResponse.statusCode).toBe(200);
928
+
929
+ const response = await request(TEST_URL)
930
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
931
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
932
+ expect(response.statusCode).toBe(200);
933
+ expect(response.body).toHaveProperty("status");
934
+ expect(["active", "waiting"]).toContain(response.body.status);
935
+
936
+ let isCompleted = false;
937
+ while (!isCompleted) {
938
+ const statusCheckResponse = await request(TEST_URL)
939
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
940
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
941
+ expect(statusCheckResponse.statusCode).toBe(200);
942
+ isCompleted = statusCheckResponse.body.status === "completed";
943
+ if (!isCompleted) {
944
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
945
+ }
946
+ }
947
+
948
+ const completedResponse = await request(TEST_URL)
949
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
950
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
951
+
952
+ expect(completedResponse.statusCode).toBe(200);
953
+ expect(completedResponse.body).toHaveProperty("status");
954
+ expect(completedResponse.body.status).toBe("completed");
955
+ expect(completedResponse.body).toHaveProperty("data");
956
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
957
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
958
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
959
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
960
+ 200,
961
+ );
962
+ expect(
963
+ completedResponse.body.data[0].metadata.pageError,
964
+ ).toBeUndefined();
965
+
966
+ // 120 seconds
967
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
968
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
969
+ expect(completedResponse.body.data[0].content).toContain("_Roast_");
970
+ expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
971
+ expect(completedResponse.body.data[0].html).toContain("<h1");
972
+
973
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
974
+ 200,
975
+ );
976
+ expect(
977
+ completedResponse.body.data[0].metadata.pageError,
978
+ ).toBeUndefined();
979
+ },
980
+ 180000,
981
+ );
982
+
983
+ it.concurrent(
984
+ "should crawl external content links when allowed",
985
+ async () => {
986
+ const crawlInitResponse = await request(TEST_URL)
987
+ .post("/v0/crawl")
988
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
989
+ .set("Content-Type", "application/json")
990
+ .send({
991
+ url: "https://mendable.ai",
992
+ crawlerOptions: {
993
+ allowExternalContentLinks: true,
994
+ ignoreSitemap: true,
995
+ returnOnlyUrls: true,
996
+ limit: 50,
997
+ },
998
+ });
999
+
1000
+ expect(crawlInitResponse.statusCode).toBe(200);
1001
+ expect(crawlInitResponse.body).toHaveProperty("jobId");
1002
+
1003
+ let crawlStatus: string = "scraping";
1004
+ let crawlData = [];
1005
+ while (crawlStatus !== "completed") {
1006
+ const statusResponse = await request(TEST_URL)
1007
+ .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
1008
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1009
+ crawlStatus = statusResponse.body.status;
1010
+ if (statusResponse.body.data) {
1011
+ crawlData = statusResponse.body.data;
1012
+ }
1013
+ if (crawlStatus !== "completed") {
1014
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1015
+ }
1016
+ }
1017
+ expect(crawlData.length).toBeGreaterThan(0);
1018
+ expect(crawlData).toEqual(
1019
+ expect.arrayContaining([
1020
+ expect.objectContaining({
1021
+ url: expect.stringContaining(
1022
+ "https://firecrawl.dev/?ref=mendable+banner",
1023
+ ),
1024
+ }),
1025
+ expect.objectContaining({
1026
+ url: expect.stringContaining("https://mendable.ai/pricing"),
1027
+ }),
1028
+ expect.objectContaining({
1029
+ url: expect.stringContaining("https://x.com/CalebPeffer"),
1030
+ }),
1031
+ ]),
1032
+ );
1033
+ },
1034
+ 180000,
1035
+ ); // 3 minutes timeout
1036
+ });
1037
+
1038
+ describe("POST /v0/crawlWebsitePreview", () => {
1039
+ it.concurrent("should require authorization", async () => {
1040
+ const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
1041
+ expect(response.statusCode).toBe(401);
1042
+ });
1043
+
1044
+ it.concurrent(
1045
+ "should return an error response with an invalid API key",
1046
+ async () => {
1047
+ const response = await request(TEST_URL)
1048
+ .post("/v0/crawlWebsitePreview")
1049
+ .set("Authorization", `Bearer invalid-api-key`)
1050
+ .set("Content-Type", "application/json")
1051
+ .send({ url: "https://firecrawl.dev" });
1052
+ expect(response.statusCode).toBe(401);
1053
+ },
1054
+ );
1055
+
1056
+ // it.concurrent("should return an error for a blocklisted URL", async () => {
1057
+ // const blocklistedUrl = "https://instagram.com/fake-test";
1058
+ // const response = await request(TEST_URL)
1059
+ // .post("/v0/crawlWebsitePreview")
1060
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1061
+ // .set("Content-Type", "application/json")
1062
+ // .send({ url: blocklistedUrl });
1063
+ // // is returning 429 instead of 403
1064
+ // expect(response.statusCode).toBe(403);
1065
+ // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
1066
+ // });
1067
+
1068
+ it.concurrent(
1069
+ "should return a timeout error when scraping takes longer than the specified timeout",
1070
+ async () => {
1071
+ const response = await request(TEST_URL)
1072
+ .post("/v0/scrape")
1073
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1074
+ .set("Content-Type", "application/json")
1075
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
1076
+
1077
+ expect(response.statusCode).toBe(408);
1078
+ },
1079
+ 3000,
1080
+ );
1081
+ });
1082
+
1083
+ describe("POST /v0/search", () => {
1084
+ it.concurrent("should require authorization", async () => {
1085
+ const response = await request(TEST_URL).post("/v0/search");
1086
+ expect(response.statusCode).toBe(401);
1087
+ });
1088
+
1089
+ it.concurrent(
1090
+ "should return an error response with an invalid API key",
1091
+ async () => {
1092
+ const response = await request(TEST_URL)
1093
+ .post("/v0/search")
1094
+ .set("Authorization", `Bearer invalid-api-key`)
1095
+ .set("Content-Type", "application/json")
1096
+ .send({ query: "test" });
1097
+ expect(response.statusCode).toBe(401);
1098
+ },
1099
+ );
1100
+
1101
+ it.concurrent(
1102
+ "should return a successful response with a valid API key for search",
1103
+ async () => {
1104
+ const response = await request(TEST_URL)
1105
+ .post("/v0/search")
1106
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1107
+ .set("Content-Type", "application/json")
1108
+ .send({ query: "test" });
1109
+ expect(response.statusCode).toBe(200);
1110
+ expect(response.body).toHaveProperty("success");
1111
+ expect(response.body.success).toBe(true);
1112
+ expect(response.body).toHaveProperty("data");
1113
+ },
1114
+ 30000,
1115
+ ); // 30 seconds timeout
1116
+ });
1117
+
1118
+ describe("GET /v0/crawl/status/:jobId", () => {
1119
+ it.concurrent("should require authorization", async () => {
1120
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
1121
+ expect(response.statusCode).toBe(401);
1122
+ });
1123
+
1124
+ it.concurrent(
1125
+ "should return an error response with an invalid API key",
1126
+ async () => {
1127
+ const response = await request(TEST_URL)
1128
+ .get("/v0/crawl/status/123")
1129
+ .set("Authorization", `Bearer invalid-api-key`);
1130
+ expect(response.statusCode).toBe(401);
1131
+ },
1132
+ );
1133
+
1134
+ it.concurrent(
1135
+ "should return Job not found for invalid job ID",
1136
+ async () => {
1137
+ const response = await request(TEST_URL)
1138
+ .get("/v0/crawl/status/invalidJobId")
1139
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1140
+ expect(response.statusCode).toBe(404);
1141
+ },
1142
+ );
1143
+
1144
+ it.concurrent(
1145
+ "should return a successful crawl status response for a valid crawl job",
1146
+ async () => {
1147
+ const crawlResponse = await request(TEST_URL)
1148
+ .post("/v0/crawl")
1149
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1150
+ .set("Content-Type", "application/json")
1151
+ .send({ url: "https://mendable.ai/blog" });
1152
+ expect(crawlResponse.statusCode).toBe(200);
1153
+
1154
+ let isCompleted = false;
1155
+ let completedResponse;
1156
+
1157
+ while (!isCompleted) {
1158
+ const response = await request(TEST_URL)
1159
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1160
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1161
+ expect(response.statusCode).toBe(200);
1162
+ expect(response.body).toHaveProperty("status");
1163
+
1164
+ if (response.body.status === "completed") {
1165
+ isCompleted = true;
1166
+ completedResponse = response;
1167
+ } else {
1168
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1169
+ }
1170
+ }
1171
+ expect(completedResponse.body).toHaveProperty("status");
1172
+ expect(completedResponse.body.status).toBe("completed");
1173
+ expect(completedResponse.body).toHaveProperty("data");
1174
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1175
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1176
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1177
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
1178
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1179
+ 200,
1180
+ );
1181
+ expect(
1182
+ completedResponse.body.data[0].metadata.pageError,
1183
+ ).toBeUndefined();
1184
+
1185
+ const childrenLinks = completedResponse.body.data.filter(
1186
+ (doc) =>
1187
+ doc.metadata &&
1188
+ doc.metadata.sourceURL &&
1189
+ doc.metadata.sourceURL.includes("mendable.ai/blog"),
1190
+ );
1191
+
1192
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
1193
+ },
1194
+ 180000,
1195
+ ); // 120 seconds
1196
+
1197
+ it.concurrent(
1198
+ "should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ",
1199
+ async () => {
1200
+ const crawlResponse = await request(TEST_URL)
1201
+ .post("/v0/crawl")
1202
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1203
+ .set("Content-Type", "application/json")
1204
+ .send({
1205
+ url: "https://arxiv.org/pdf/astro-ph/9301001",
1206
+ crawlerOptions: {
1207
+ limit: 10,
1208
+ excludes: [
1209
+ "list/*",
1210
+ "login",
1211
+ "abs/*",
1212
+ "static/*",
1213
+ "about/*",
1214
+ "archive/*",
1215
+ ],
1216
+ },
1217
+ });
1218
+ expect(crawlResponse.statusCode).toBe(200);
1219
+
1220
+ let isCompleted = false;
1221
+ let completedResponse;
1222
+
1223
+ while (!isCompleted) {
1224
+ const response = await request(TEST_URL)
1225
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1226
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1227
+ expect(response.statusCode).toBe(200);
1228
+ expect(response.body).toHaveProperty("status");
1229
+
1230
+ if (response.body.status === "completed") {
1231
+ isCompleted = true;
1232
+ completedResponse = response;
1233
+ } else {
1234
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1235
+ }
1236
+ }
1237
+ expect(completedResponse.body.status).toBe("completed");
1238
+ expect(completedResponse.body).toHaveProperty("data");
1239
+ expect(completedResponse.body.data.length).toEqual(1);
1240
+ expect(completedResponse.body.data).toEqual(
1241
+ expect.arrayContaining([
1242
+ expect.objectContaining({
1243
+ content: expect.stringContaining(
1244
+ "asymmetries might represent, for instance, preferred source orientations to our line of sight.",
1245
+ ),
1246
+ }),
1247
+ ]),
1248
+ );
1249
+
1250
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1251
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1252
+ 200,
1253
+ );
1254
+ expect(
1255
+ completedResponse.body.data[0].metadata.pageError,
1256
+ ).toBeUndefined();
1257
+ },
1258
+ 180000,
1259
+ ); // 120 seconds
1260
+
1261
+ it.concurrent(
1262
+ "should return a successful response for a valid crawl job with includeHtml set to true option (2)",
1263
+ async () => {
1264
+ const crawlResponse = await request(TEST_URL)
1265
+ .post("/v0/crawl")
1266
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1267
+ .set("Content-Type", "application/json")
1268
+ .send({
1269
+ url: "https://roastmywebsite.ai",
1270
+ pageOptions: { includeHtml: true },
1271
+ });
1272
+ expect(crawlResponse.statusCode).toBe(200);
1273
+
1274
+ const response = await request(TEST_URL)
1275
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1276
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1277
+ expect(response.statusCode).toBe(200);
1278
+ expect(response.body).toHaveProperty("status");
1279
+ expect(["active", "waiting"]).toContain(response.body.status);
1280
+
1281
+ let isFinished = false;
1282
+ let completedResponse;
1283
+
1284
+ while (!isFinished) {
1285
+ const response = await request(TEST_URL)
1286
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1287
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1288
+ expect(response.statusCode).toBe(200);
1289
+ expect(response.body).toHaveProperty("status");
1290
+
1291
+ if (response.body.status === "completed") {
1292
+ isFinished = true;
1293
+ completedResponse = response;
1294
+ } else {
1295
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1296
+ }
1297
+ }
1298
+
1299
+ expect(completedResponse.statusCode).toBe(200);
1300
+ expect(completedResponse.body).toHaveProperty("status");
1301
+ expect(completedResponse.body.status).toBe("completed");
1302
+ expect(completedResponse.body).toHaveProperty("data");
1303
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1304
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1305
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1306
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
1307
+ expect(completedResponse.body.data[0].content).toContain("_Roast_");
1308
+ expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
1309
+ expect(completedResponse.body.data[0].html).toContain("<h1");
1310
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
1311
+ 200,
1312
+ );
1313
+ expect(
1314
+ completedResponse.body.data[0].metadata.pageError,
1315
+ ).toBeUndefined();
1316
+ },
1317
+ 60000,
1318
+ );
1319
+ }); // 60 seconds
1320
+
1321
+ it.concurrent(
1322
+ "should return a successful response for a valid crawl job with allowBackwardCrawling set to true option",
1323
+ async () => {
1324
+ const crawlResponse = await request(TEST_URL)
1325
+ .post("/v0/crawl")
1326
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1327
+ .set("Content-Type", "application/json")
1328
+ .send({
1329
+ url: "https://mendable.ai/blog",
1330
+ pageOptions: { includeHtml: true },
1331
+ crawlerOptions: { allowBackwardCrawling: true },
1332
+ });
1333
+ expect(crawlResponse.statusCode).toBe(200);
1334
+
1335
+ let isFinished = false;
1336
+ let completedResponse;
1337
+
1338
+ while (!isFinished) {
1339
+ const response = await request(TEST_URL)
1340
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1341
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1342
+ expect(response.statusCode).toBe(200);
1343
+ expect(response.body).toHaveProperty("status");
1344
+
1345
+ if (response.body.status === "completed") {
1346
+ isFinished = true;
1347
+ completedResponse = response;
1348
+ } else {
1349
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1350
+ }
1351
+ }
1352
+
1353
+ expect(completedResponse.statusCode).toBe(200);
1354
+ expect(completedResponse.body).toHaveProperty("status");
1355
+ expect(completedResponse.body.status).toBe("completed");
1356
+ expect(completedResponse.body).toHaveProperty("data");
1357
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
1358
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1359
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1360
+ expect(completedResponse.body.data[0]).toHaveProperty("html");
1361
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
1362
+ expect(completedResponse.body.data[0].markdown).toContain("Mendable");
1363
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
1364
+ expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
1365
+
1366
+ const onlyChildrenLinks = completedResponse.body.data.filter((doc) => {
1367
+ return (
1368
+ doc.metadata &&
1369
+ doc.metadata.sourceURL &&
1370
+ doc.metadata.sourceURL.includes("mendable.ai/blog")
1371
+ );
1372
+ });
1373
+
1374
+ expect(completedResponse.body.data.length).toBeGreaterThan(
1375
+ onlyChildrenLinks.length,
1376
+ );
1377
+ },
1378
+ 60000,
1379
+ );
1380
+
1381
+ it.concurrent(
1382
+ "If someone cancels a crawl job, it should turn into failed status",
1383
+ async () => {
1384
+ const crawlResponse = await request(TEST_URL)
1385
+ .post("/v0/crawl")
1386
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1387
+ .set("Content-Type", "application/json")
1388
+ .send({ url: "https://jestjs.io" });
1389
+
1390
+ expect(crawlResponse.statusCode).toBe(200);
1391
+
1392
+ await new Promise((r) => setTimeout(r, 20000));
1393
+
1394
+ const responseCancel = await request(TEST_URL)
1395
+ .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
1396
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1397
+ expect(responseCancel.statusCode).toBe(200);
1398
+ expect(responseCancel.body).toHaveProperty("status");
1399
+ expect(responseCancel.body.status).toBe("cancelled");
1400
+
1401
+ await new Promise((r) => setTimeout(r, 10000));
1402
+ const completedResponse = await request(TEST_URL)
1403
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
1404
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1405
+
1406
+ expect(completedResponse.statusCode).toBe(200);
1407
+ expect(completedResponse.body).toHaveProperty("status");
1408
+ expect(completedResponse.body.status).toBe("failed");
1409
+ expect(completedResponse.body).toHaveProperty("data");
1410
+ expect(completedResponse.body.data).toBeNull();
1411
+ expect(completedResponse.body).toHaveProperty("partial_data");
1412
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
1413
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
1414
+ expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
1415
+ expect(
1416
+ completedResponse.body.partial_data[0].metadata.pageStatusCode,
1417
+ ).toBe(200);
1418
+ expect(
1419
+ completedResponse.body.partial_data[0].metadata.pageError,
1420
+ ).toBeUndefined();
1421
+ },
1422
+ 60000,
1423
+ ); // 60 seconds
1424
+
1425
+ describe("POST /v0/scrape with LLM Extraction", () => {
1426
+ it.concurrent(
1427
+ "should extract data using LLM extraction mode",
1428
+ async () => {
1429
+ const response = await request(TEST_URL)
1430
+ .post("/v0/scrape")
1431
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1432
+ .set("Content-Type", "application/json")
1433
+ .send({
1434
+ url: "https://mendable.ai",
1435
+ pageOptions: {
1436
+ onlyMainContent: true,
1437
+ },
1438
+ extractorOptions: {
1439
+ mode: "llm-extraction",
1440
+ extractionPrompt:
1441
+ "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
1442
+ extractionSchema: {
1443
+ type: "object",
1444
+ properties: {
1445
+ company_mission: {
1446
+ type: "string",
1447
+ },
1448
+ supports_sso: {
1449
+ type: "boolean",
1450
+ },
1451
+ is_open_source: {
1452
+ type: "boolean",
1453
+ },
1454
+ },
1455
+ required: ["company_mission", "supports_sso", "is_open_source"],
1456
+ },
1457
+ },
1458
+ });
1459
+
1460
+ // Ensure that the job was successfully created before proceeding with LLM extraction
1461
+ expect(response.statusCode).toBe(200);
1462
+
1463
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
1464
+ let llmExtraction = response.body.data.llm_extraction;
1465
+
1466
+ // Check if the llm_extraction object has the required properties with correct types and values
1467
+ expect(llmExtraction).toHaveProperty("company_mission");
1468
+ expect(typeof llmExtraction.company_mission).toBe("string");
1469
+ expect(llmExtraction).toHaveProperty("supports_sso");
1470
+ expect(llmExtraction.supports_sso).toBe(true);
1471
+ expect(typeof llmExtraction.supports_sso).toBe("boolean");
1472
+ expect(llmExtraction).toHaveProperty("is_open_source");
1473
+ expect(llmExtraction.is_open_source).toBe(false);
1474
+ expect(typeof llmExtraction.is_open_source).toBe("boolean");
1475
+ },
1476
+ 60000,
1477
+ ); // 60 secs
1478
+
1479
+ it.concurrent(
1480
+ "should extract data using LLM extraction mode with RawHtml",
1481
+ async () => {
1482
+ const response = await request(TEST_URL)
1483
+ .post("/v0/scrape")
1484
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1485
+ .set("Content-Type", "application/json")
1486
+ .send({
1487
+ url: "https://mendable.ai",
1488
+
1489
+ extractorOptions: {
1490
+ mode: "llm-extraction-from-raw-html",
1491
+ extractionPrompt:
1492
+ "Based on the information on the page, what are the primary and secondary CTA buttons?",
1493
+ extractionSchema: {
1494
+ type: "object",
1495
+ properties: {
1496
+ primary_cta: {
1497
+ type: "string",
1498
+ },
1499
+ secondary_cta: {
1500
+ type: "string",
1501
+ },
1502
+ },
1503
+ required: ["primary_cta", "secondary_cta"],
1504
+ },
1505
+ },
1506
+ });
1507
+
1508
+ // Ensure that the job was successfully created before proceeding with LLM extraction
1509
+ expect(response.statusCode).toBe(200);
1510
+
1511
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
1512
+ let llmExtraction = response.body.data.llm_extraction;
1513
+
1514
+ // Check if the llm_extraction object has the required properties with correct types and values
1515
+ expect(llmExtraction).toHaveProperty("primary_cta");
1516
+ expect(typeof llmExtraction.primary_cta).toBe("string");
1517
+ expect(llmExtraction).toHaveProperty("secondary_cta");
1518
+ expect(typeof llmExtraction.secondary_cta).toBe("string");
1519
+ },
1520
+ 60000,
1521
+ ); // 60 secs
1522
+ });
1523
+
1524
+ // describe("POST /v0/scrape for Top 100 Companies", () => {
1525
+ // it.concurrent("should extract data for the top 100 companies", async () => {
1526
+ // const response = await request(TEST_URL)
1527
+ // .post("/v0/scrape")
1528
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1529
+ // .set("Content-Type", "application/json")
1530
+ // .send({
1531
+ // url: "https://companiesmarketcap.com/",
1532
+ // pageOptions: {
1533
+ // onlyMainContent: true
1534
+ // },
1535
+ // extractorOptions: {
1536
+ // mode: "llm-extraction",
1537
+ // extractionPrompt: "Extract the name, market cap, price, and today's change for the top 20 companies listed on the page.",
1538
+ // extractionSchema: {
1539
+ // type: "object",
1540
+ // properties: {
1541
+ // companies: {
1542
+ // type: "array",
1543
+ // items: {
1544
+ // type: "object",
1545
+ // properties: {
1546
+ // rank: { type: "number" },
1547
+ // name: { type: "string" },
1548
+ // marketCap: { type: "string" },
1549
+ // price: { type: "string" },
1550
+ // todayChange: { type: "string" }
1551
+ // },
1552
+ // required: ["rank", "name", "marketCap", "price", "todayChange"]
1553
+ // }
1554
+ // }
1555
+ // },
1556
+ // required: ["companies"]
1557
+ // }
1558
+ // }
1559
+ // });
1560
+
1561
+ // // Print the response body to the console for debugging purposes
1562
+ // console.log("Response companies:", response.body.data.llm_extraction.companies);
1563
+
1564
+ // // Check if the response has the correct structure and data types
1565
+ // expect(response.status).toBe(200);
1566
+ // expect(Array.isArray(response.body.data.llm_extraction.companies)).toBe(true);
1567
+ // expect(response.body.data.llm_extraction.companies.length).toBe(40);
1568
+
1569
+ // // Sample check for the first company
1570
+ // const firstCompany = response.body.data.llm_extraction.companies[0];
1571
+ // expect(firstCompany).toHaveProperty("name");
1572
+ // expect(typeof firstCompany.name).toBe("string");
1573
+ // expect(firstCompany).toHaveProperty("marketCap");
1574
+ // expect(typeof firstCompany.marketCap).toBe("string");
1575
+ // expect(firstCompany).toHaveProperty("price");
1576
+ // expect(typeof firstCompany.price).toBe("string");
1577
+ // expect(firstCompany).toHaveProperty("todayChange");
1578
+ // expect(typeof firstCompany.todayChange).toBe("string");
1579
+ // }, 120000); // 120 secs
1580
+ // });
1581
+
1582
+ describe("POST /v0/crawl with fast mode", () => {
1583
+ it.concurrent(
1584
+ "should complete the crawl under 20 seconds",
1585
+ async () => {
1586
+ const startTime = Date.now();
1587
+
1588
+ const crawlResponse = await request(TEST_URL)
1589
+ .post("/v0/crawl")
1590
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1591
+ .set("Content-Type", "application/json")
1592
+ .send({
1593
+ url: "https://flutterbricks.com",
1594
+ crawlerOptions: {
1595
+ mode: "fast",
1596
+ },
1597
+ });
1598
+
1599
+ expect(crawlResponse.statusCode).toBe(200);
1600
+
1601
+ const jobId = crawlResponse.body.jobId;
1602
+ let statusResponse;
1603
+ let isFinished = false;
1604
+
1605
+ while (!isFinished) {
1606
+ statusResponse = await request(TEST_URL)
1607
+ .get(`/v0/crawl/status/${jobId}`)
1608
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1609
+
1610
+ expect(statusResponse.statusCode).toBe(200);
1611
+ isFinished = statusResponse.body.status === "completed";
1612
+
1613
+ if (!isFinished) {
1614
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1615
+ }
1616
+ }
1617
+
1618
+ // const endTime = Date.now();
1619
+ // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
1620
+
1621
+ // console.log(`Time elapsed: ${timeElapsed} seconds`);
1622
+
1623
+ expect(statusResponse.body.status).toBe("completed");
1624
+ expect(statusResponse.body).toHaveProperty("data");
1625
+ expect(statusResponse.body.data[0]).toHaveProperty("content");
1626
+ expect(statusResponse.body.data[0]).toHaveProperty("markdown");
1627
+ expect(statusResponse.body.data[0]).toHaveProperty("metadata");
1628
+ expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
1629
+ expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
1630
+
1631
+ const results = statusResponse.body.data;
1632
+ // results.forEach((result, i) => {
1633
+ // console.log(result.metadata.sourceURL);
1634
+ // });
1635
+ expect(results.length).toBeGreaterThanOrEqual(10);
1636
+ expect(results.length).toBeLessThanOrEqual(15);
1637
+ },
1638
+ 20000,
1639
+ );
1640
+
1641
+ // it.concurrent("should complete the crawl in more than 10 seconds", async () => {
1642
+ // const startTime = Date.now();
1643
+
1644
+ // const crawlResponse = await request(TEST_URL)
1645
+ // .post("/v0/crawl")
1646
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1647
+ // .set("Content-Type", "application/json")
1648
+ // .send({
1649
+ // url: "https://flutterbricks.com",
1650
+ // });
1651
+
1652
+ // expect(crawlResponse.statusCode).toBe(200);
1653
+
1654
+ // const jobId = crawlResponse.body.jobId;
1655
+ // let statusResponse;
1656
+ // let isFinished = false;
1657
+
1658
+ // while (!isFinished) {
1659
+ // statusResponse = await request(TEST_URL)
1660
+ // .get(`/v0/crawl/status/${jobId}`)
1661
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1662
+
1663
+ // expect(statusResponse.statusCode).toBe(200);
1664
+ // isFinished = statusResponse.body.status === "completed";
1665
+
1666
+ // if (!isFinished) {
1667
+ // await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
1668
+ // }
1669
+ // }
1670
+
1671
+ // const endTime = Date.now();
1672
+ // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
1673
+
1674
+ // console.log(`Time elapsed: ${timeElapsed} seconds`);
1675
+
1676
+ // expect(statusResponse.body.status).toBe("completed");
1677
+ // expect(statusResponse.body).toHaveProperty("data");
1678
+ // expect(statusResponse.body.data[0]).toHaveProperty("content");
1679
+ // expect(statusResponse.body.data[0]).toHaveProperty("markdown");
1680
+ // const results = statusResponse.body.data;
1681
+ // // results.forEach((result, i) => {
1682
+ // // console.log(result.metadata.sourceURL);
1683
+ // // });
1684
+ // expect(results.length).toBeGreaterThanOrEqual(10);
1685
+ // expect(results.length).toBeLessThanOrEqual(15);
1686
+
1687
+ // }, 50000);// 15 seconds timeout to account for network delays
1688
+ });
1689
+
1690
+ describe("GET /is-production", () => {
1691
+ it.concurrent("should return the production status", async () => {
1692
+ const response = await request(TEST_URL).get("/is-production");
1693
+ expect(response.statusCode).toBe(200);
1694
+ expect(response.body).toHaveProperty("isProduction");
1695
+ });
1696
+ });
1697
+
1698
+ describe("Rate Limiter", () => {
1699
+ it.concurrent(
1700
+ "should return 429 when rate limit is exceeded for preview token",
1701
+ async () => {
1702
+ for (let i = 0; i < 5; i++) {
1703
+ const response = await request(TEST_URL)
1704
+ .post("/v0/scrape")
1705
+ .set("Authorization", `Bearer ${process.env.PREVIEW_TOKEN}`)
1706
+ .set("Content-Type", "application/json")
1707
+ .send({ url: "https://www.scrapethissite.com" });
1708
+
1709
+ expect(response.statusCode).toBe(200);
1710
+ }
1711
+ const response = await request(TEST_URL)
1712
+ .post("/v0/scrape")
1713
+ .set("Authorization", `Bearer ${process.env.PREVIEW_TOKEN}`)
1714
+ .set("Content-Type", "application/json")
1715
+ .send({ url: "https://www.scrapethissite.com" });
1716
+
1717
+ expect(response.statusCode).toBe(429);
1718
+ },
1719
+ 90000,
1720
+ );
1721
+ });
1722
+
1723
+ // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
1724
+ // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
1725
+ // const response = await request(TEST_URL)
1726
+ // .post("/v0/scrape")
1727
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1728
+ // .set("Content-Type", "application/json")
1729
+ // .send({ url: "https://www.scrapethissite.com" });
1730
+
1731
+ // expect(response.statusCode).toBe(200);
1732
+ // }
1733
+
1734
+ // const response = await request(TEST_URL)
1735
+ // .post("/v0/scrape")
1736
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1737
+ // .set("Content-Type", "application/json")
1738
+ // .send({ url: "https://www.scrapethissite.com" });
1739
+
1740
+ // expect(response.statusCode).toBe(429);
1741
+ // }, 60000);
1742
+
1743
+ // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => {
1744
+ // for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
1745
+ // const response = await request(TEST_URL)
1746
+ // .post("/v0/crawl")
1747
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1748
+ // .set("Content-Type", "application/json")
1749
+ // .send({ url: "https://www.scrapethissite.com" });
1750
+
1751
+ // expect(response.statusCode).toBe(200);
1752
+ // }
1753
+
1754
+ // const response = await request(TEST_URL)
1755
+ // .post("/v0/crawl")
1756
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1757
+ // .set("Content-Type", "application/json")
1758
+ // .send({ url: "https://www.scrapethissite.com" });
1759
+
1760
+ // expect(response.statusCode).toBe(429);
1761
+ // }, 60000);
1762
+ });
src/__tests__/e2e_map/index.test.ts ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+
4
+ dotenv.config();
5
+ const TEST_URL = "http://127.0.0.1:3002";
6
+
7
+ describe("E2E Tests for Map API Routes", () => {
8
+ it.concurrent(
9
+ "(feat-search)should return links containing 'smart-crawl'",
10
+ async () => {
11
+ const response = await request(TEST_URL)
12
+ .post("/v1/map")
13
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
14
+ .set("Content-Type", "application/json")
15
+ .send({
16
+ url: "https://firecrawl.dev",
17
+ sitemapOnly: false,
18
+ search: "smart-crawl",
19
+ });
20
+
21
+ console.log(response.body);
22
+ expect(response.statusCode).toBe(200);
23
+ expect(response.body).toHaveProperty("links");
24
+ expect(response.body.links.length).toBeGreaterThan(0);
25
+ expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
26
+ },
27
+ 60000,
28
+ );
29
+
30
+ it.concurrent(
31
+ "(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
32
+ async () => {
33
+ const response = await request(TEST_URL)
34
+ .post("/v1/map")
35
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
36
+ .set("Content-Type", "application/json")
37
+ .send({
38
+ url: "https://firecrawl.dev",
39
+ sitemapOnly: false,
40
+ includeSubdomains: true,
41
+ });
42
+
43
+ console.log(response.body);
44
+ expect(response.statusCode).toBe(200);
45
+ expect(response.body).toHaveProperty("links");
46
+ expect(response.body.links.length).toBeGreaterThan(0);
47
+ expect(response.body.links[response.body.links.length - 1]).toContain(
48
+ "docs.firecrawl.dev",
49
+ );
50
+ },
51
+ 60000,
52
+ );
53
+
54
+ it.concurrent(
55
+ "(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
56
+ async () => {
57
+ const response = await request(TEST_URL)
58
+ .post("/v1/map")
59
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
60
+ .set("Content-Type", "application/json")
61
+ .send({
62
+ url: "https://firecrawl.dev",
63
+ sitemapOnly: true,
64
+ });
65
+
66
+ console.log(response.body);
67
+ expect(response.statusCode).toBe(200);
68
+ expect(response.body).toHaveProperty("links");
69
+ expect(response.body.links.length).toBeGreaterThan(0);
70
+ expect(response.body.links[response.body.links.length - 1]).not.toContain(
71
+ "docs.firecrawl.dev",
72
+ );
73
+ },
74
+ 60000,
75
+ );
76
+
77
+ it.concurrent(
78
+ "(feat-limit) should return mapped links for firecrawl.dev with a limit",
79
+ async () => {
80
+ const response = await request(TEST_URL)
81
+ .post("/v1/map")
82
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
83
+ .set("Content-Type", "application/json")
84
+ .send({
85
+ url: "https://firecrawl.dev",
86
+ sitemapOnly: false,
87
+ limit: 10,
88
+ });
89
+
90
+ console.log(response.body);
91
+ expect(response.statusCode).toBe(200);
92
+ expect(response.body).toHaveProperty("links");
93
+ expect(response.body.links.length).toBeLessThanOrEqual(10);
94
+ },
95
+ 60000,
96
+ );
97
+
98
+ it.concurrent(
99
+ "(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
100
+ async () => {
101
+ const response = await request(TEST_URL)
102
+ .post("/v1/map")
103
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
104
+ .set("Content-Type", "application/json")
105
+ .send({
106
+ url: "https://geekflare.com/sitemap_index.xml",
107
+ sitemapOnly: true,
108
+ });
109
+
110
+ console.log(response.body);
111
+ expect(response.statusCode).toBe(200);
112
+ expect(response.body).toHaveProperty("links");
113
+ expect(response.body.links.length).toBeGreaterThan(1900);
114
+ },
115
+ 60000,
116
+ );
117
+ });
src/__tests__/e2e_noAuth/index.test.ts ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+
7
+ dotenv.config();
8
+
9
+ const TEST_URL = "http://127.0.0.1:3002";
10
+
11
+ describe("E2E Tests for API Routes with No Authentication", () => {
12
+ let originalEnv: NodeJS.ProcessEnv;
13
+
14
+ // save original process.env
15
+ beforeAll(() => {
16
+ originalEnv = { ...process.env };
17
+ process.env.USE_DB_AUTHENTICATION = "false";
18
+ process.env.SUPABASE_ANON_TOKEN = "";
19
+ process.env.SUPABASE_URL = "";
20
+ process.env.SUPABASE_SERVICE_TOKEN = "";
21
+ process.env.SCRAPING_BEE_API_KEY = "";
22
+ process.env.OPENAI_API_KEY = "";
23
+ process.env.BULL_AUTH_KEY = "";
24
+ process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
25
+ process.env.LLAMAPARSE_API_KEY = "";
26
+ process.env.TEST_API_KEY = "";
27
+ process.env.POSTHOG_API_KEY = "";
28
+ process.env.POSTHOG_HOST = "";
29
+ });
30
+
31
+ // restore original process.env
32
+ afterAll(() => {
33
+ process.env = originalEnv;
34
+ });
35
+
36
+ describe("GET /", () => {
37
+ it("should return Hello, world! message", async () => {
38
+ const response = await request(TEST_URL).get("/");
39
+ expect(response.statusCode).toBe(200);
40
+ expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io");
41
+ });
42
+ });
43
+
44
+ describe("GET /test", () => {
45
+ it("should return Hello, world! message", async () => {
46
+ const response = await request(TEST_URL).get("/test");
47
+ expect(response.statusCode).toBe(200);
48
+ expect(response.text).toContain("Hello, world!");
49
+ });
50
+ });
51
+
52
+ describe("POST /v0/scrape", () => {
53
+ it("should not require authorization", async () => {
54
+ const response = await request(TEST_URL).post("/v0/scrape");
55
+ expect(response.statusCode).not.toBe(401);
56
+ });
57
+
58
+ it("should return an error for a blocklisted URL without requiring authorization", async () => {
59
+ const blocklistedUrl = "https://facebook.com/fake-test";
60
+ const response = await request(TEST_URL)
61
+ .post("/v0/scrape")
62
+ .set("Content-Type", "application/json")
63
+ .send({ url: blocklistedUrl });
64
+ expect(response.statusCode).toBe(403);
65
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
66
+ });
67
+
68
+ it("should return a successful response", async () => {
69
+ const response = await request(TEST_URL)
70
+ .post("/v0/scrape")
71
+ .set("Content-Type", "application/json")
72
+ .send({ url: "https://firecrawl.dev" });
73
+ expect(response.statusCode).toBe(200);
74
+ }, 10000); // 10 seconds timeout
75
+ });
76
+
77
+ describe("POST /v0/crawl", () => {
78
+ it("should not require authorization", async () => {
79
+ const response = await request(TEST_URL).post("/v0/crawl");
80
+ expect(response.statusCode).not.toBe(401);
81
+ });
82
+
83
+ it("should return an error for a blocklisted URL", async () => {
84
+ const blocklistedUrl = "https://twitter.com/fake-test";
85
+ const response = await request(TEST_URL)
86
+ .post("/v0/crawl")
87
+ .set("Content-Type", "application/json")
88
+ .send({ url: blocklistedUrl });
89
+ expect(response.statusCode).toBe(403);
90
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
91
+ });
92
+
93
+ it("should return a successful response", async () => {
94
+ const response = await request(TEST_URL)
95
+ .post("/v0/crawl")
96
+ .set("Content-Type", "application/json")
97
+ .send({ url: "https://firecrawl.dev" });
98
+ expect(response.statusCode).toBe(200);
99
+ expect(response.body).toHaveProperty("jobId");
100
+ expect(response.body.jobId).toMatch(
101
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
102
+ );
103
+ });
104
+ });
105
+
106
+ describe("POST /v0/crawlWebsitePreview", () => {
107
+ it("should not require authorization", async () => {
108
+ const response = await request(TEST_URL).post("/v0/crawlWebsitePreview");
109
+ expect(response.statusCode).not.toBe(401);
110
+ });
111
+
112
+ it("should return an error for a blocklisted URL", async () => {
113
+ const blocklistedUrl = "https://instagram.com/fake-test";
114
+ const response = await request(TEST_URL)
115
+ .post("/v0/crawlWebsitePreview")
116
+ .set("Content-Type", "application/json")
117
+ .send({ url: blocklistedUrl });
118
+ expect(response.statusCode).toBe(403);
119
+ expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE);
120
+ });
121
+
122
+ it("should return a successful response", async () => {
123
+ const response = await request(TEST_URL)
124
+ .post("/v0/crawlWebsitePreview")
125
+ .set("Content-Type", "application/json")
126
+ .send({ url: "https://firecrawl.dev" });
127
+ expect(response.statusCode).toBe(200);
128
+ expect(response.body).toHaveProperty("jobId");
129
+ expect(response.body.jobId).toMatch(
130
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
131
+ );
132
+ });
133
+ });
134
+
135
+ describe("POST /v0/search", () => {
136
+ it("should require not authorization", async () => {
137
+ const response = await request(TEST_URL).post("/v0/search");
138
+ expect(response.statusCode).not.toBe(401);
139
+ });
140
+
141
+ it("should return no error response with an invalid API key", async () => {
142
+ const response = await request(TEST_URL)
143
+ .post("/v0/search")
144
+ .set("Authorization", `Bearer invalid-api-key`)
145
+ .set("Content-Type", "application/json")
146
+ .send({ query: "test" });
147
+ expect(response.statusCode).not.toBe(401);
148
+ });
149
+
150
+ it("should return a successful response without a valid API key", async () => {
151
+ const response = await request(TEST_URL)
152
+ .post("/v0/search")
153
+ .set("Content-Type", "application/json")
154
+ .send({ query: "test" });
155
+ expect(response.statusCode).toBe(200);
156
+ expect(response.body).toHaveProperty("success");
157
+ expect(response.body.success).toBe(true);
158
+ expect(response.body).toHaveProperty("data");
159
+ }, 20000);
160
+ });
161
+
162
+ describe("GET /v0/crawl/status/:jobId", () => {
163
+ it("should not require authorization", async () => {
164
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
165
+ expect(response.statusCode).not.toBe(401);
166
+ });
167
+
168
+ it("should return Job not found for invalid job ID", async () => {
169
+ const response = await request(TEST_URL).get(
170
+ "/v0/crawl/status/invalidJobId",
171
+ );
172
+ expect(response.statusCode).toBe(404);
173
+ });
174
+
175
+ it("should return a successful response for a valid crawl job", async () => {
176
+ const crawlResponse = await request(TEST_URL)
177
+ .post("/v0/crawl")
178
+ .set("Content-Type", "application/json")
179
+ .send({ url: "https://firecrawl.dev" });
180
+ expect(crawlResponse.statusCode).toBe(200);
181
+
182
+ const response = await request(TEST_URL).get(
183
+ `/v0/crawl/status/${crawlResponse.body.jobId}`,
184
+ );
185
+ expect(response.statusCode).toBe(200);
186
+ expect(response.body).toHaveProperty("status");
187
+ expect(response.body.status).toBe("active");
188
+
189
+ // wait for 30 seconds
190
+ await new Promise((r) => setTimeout(r, 30000));
191
+
192
+ const completedResponse = await request(TEST_URL).get(
193
+ `/v0/crawl/status/${crawlResponse.body.jobId}`,
194
+ );
195
+ expect(completedResponse.statusCode).toBe(200);
196
+ expect(completedResponse.body).toHaveProperty("status");
197
+ expect(completedResponse.body.status).toBe("completed");
198
+ expect(completedResponse.body).toHaveProperty("data");
199
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
200
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
201
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
202
+ }, 60000); // 60 seconds
203
+ });
204
+
205
+ describe("GET /is-production", () => {
206
+ it("should return the production status", async () => {
207
+ const response = await request(TEST_URL).get("/is-production");
208
+ expect(response.statusCode).toBe(200);
209
+ expect(response.body).toHaveProperty("isProduction");
210
+ });
211
+ });
212
+ });
src/__tests__/e2e_v1_withAuth/index.test.ts ADDED
@@ -0,0 +1,1066 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import { configDotenv } from "dotenv";
3
+ import { ScrapeRequestInput } from "../../controllers/v1/types";
4
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
5
+
6
+ configDotenv();
7
+ const TEST_URL = "http://127.0.0.1:3002";
8
+
9
+ describe("E2E Tests for v1 API Routes", () => {
10
+ beforeAll(() => {
11
+ process.env.USE_DB_AUTHENTICATION = "true";
12
+ });
13
+
14
+ afterAll(() => {
15
+ delete process.env.USE_DB_AUTHENTICATION;
16
+ });
17
+
18
+ describe("GET /is-production", () => {
19
+ it.concurrent("should return the production status", async () => {
20
+ const response: any = await request(TEST_URL).get("/is-production");
21
+
22
+ console.log(
23
+ "process.env.USE_DB_AUTHENTICATION",
24
+ process.env.USE_DB_AUTHENTICATION,
25
+ );
26
+ console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
27
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
28
+ console.log("!!useDbAuthentication", !!useDbAuthentication);
29
+ console.log("!useDbAuthentication", !useDbAuthentication);
30
+
31
+ expect(response.statusCode).toBe(200);
32
+ expect(response.body).toHaveProperty("isProduction");
33
+ });
34
+ });
35
+
36
+ describe("POST /v1/scrape", () => {
37
+ it.concurrent("should require authorization", async () => {
38
+ const response: any = await request(TEST_URL)
39
+ .post("/v1/scrape")
40
+ .send({ url: "https://firecrawl.dev" });
41
+
42
+ expect(response.statusCode).toBe(401);
43
+ });
44
+
45
+ it.concurrent("should throw error for blocklisted URL", async () => {
46
+ const scrapeRequest: ScrapeRequestInput = {
47
+ url: "https://facebook.com/fake-test",
48
+ };
49
+
50
+ const response = await request(TEST_URL)
51
+ .post("/v1/scrape")
52
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
53
+ .set("Content-Type", "application/json")
54
+ .send(scrapeRequest);
55
+
56
+ expect(response.statusCode).toBe(403);
57
+ expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
58
+ });
59
+
60
+ it.concurrent(
61
+ "should return an error response with an invalid API key",
62
+ async () => {
63
+ const response: any = await request(TEST_URL)
64
+ .post("/v1/scrape")
65
+ .set("Authorization", `Bearer invalid-api-key`)
66
+ .set("Content-Type", "application/json")
67
+ .send({ url: "https://firecrawl.dev" });
68
+ expect(response.statusCode).toBe(401);
69
+ },
70
+ );
71
+
72
+ it.concurrent(
73
+ "should return a successful response with a valid API key",
74
+ async () => {
75
+ const scrapeRequest: ScrapeRequestInput = {
76
+ url: "https://roastmywebsite.ai",
77
+ };
78
+
79
+ const response: any = await request(TEST_URL)
80
+ .post("/v1/scrape")
81
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
82
+ .set("Content-Type", "application/json")
83
+ .send(scrapeRequest);
84
+
85
+ expect(response.statusCode).toBe(200);
86
+
87
+ if (!("data" in response.body)) {
88
+ throw new Error("Expected response body to have 'data' property");
89
+ }
90
+ expect(response.body.data).not.toHaveProperty("content");
91
+ expect(response.body.data).toHaveProperty("markdown");
92
+ expect(response.body.data).toHaveProperty("metadata");
93
+ expect(response.body.data).not.toHaveProperty("html");
94
+ expect(response.body.data.markdown).toContain("_Roast_");
95
+ expect(response.body.data.metadata.error).toBeUndefined();
96
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
97
+ expect(response.body.data.metadata.description).toBe(
98
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
99
+ );
100
+ expect(response.body.data.metadata.keywords).toBe(
101
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
102
+ );
103
+ expect(response.body.data.metadata.robots).toBe("follow, index");
104
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
105
+ expect(response.body.data.metadata.ogDescription).toBe(
106
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
107
+ );
108
+ expect(response.body.data.metadata.ogUrl).toBe(
109
+ "https://www.roastmywebsite.ai",
110
+ );
111
+ expect(response.body.data.metadata.ogImage).toBe(
112
+ "https://www.roastmywebsite.ai/og.png",
113
+ );
114
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
115
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
116
+ expect(response.body.data.metadata.sourceURL).toBe(
117
+ "https://roastmywebsite.ai",
118
+ );
119
+ expect(response.body.data.metadata.statusCode).toBe(200);
120
+ },
121
+ 30000,
122
+ ); // 30 seconds timeout
123
+
124
+ it.concurrent(
125
+ "should return a successful response with a valid API key",
126
+ async () => {
127
+ const scrapeRequest: ScrapeRequestInput = {
128
+ url: "https://arxiv.org/abs/2410.04840",
129
+ };
130
+
131
+ const response: any = await request(TEST_URL)
132
+ .post("/v1/scrape")
133
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
134
+ .set("Content-Type", "application/json")
135
+ .send(scrapeRequest);
136
+
137
+ expect(response.statusCode).toBe(200);
138
+
139
+ if (!("data" in response.body)) {
140
+ throw new Error("Expected response body to have 'data' property");
141
+ }
142
+ expect(response.body.data).not.toHaveProperty("content");
143
+ expect(response.body.data).toHaveProperty("markdown");
144
+ expect(response.body.data).toHaveProperty("metadata");
145
+ expect(response.body.data).not.toHaveProperty("html");
146
+ expect(response.body.data.markdown).toContain("Strong Model Collapse");
147
+ expect(response.body.data.metadata.error).toBeUndefined();
148
+ expect(response.body.data.metadata.description).toContain(
149
+ "Abstract page for arXiv paper 2410.04840: Strong Model Collapse",
150
+ );
151
+ expect(response.body.data.metadata.citation_title).toBe(
152
+ "Strong Model Collapse",
153
+ );
154
+ expect(response.body.data.metadata.citation_author).toEqual([
155
+ "Dohmatob, Elvis",
156
+ "Feng, Yunzhen",
157
+ "Subramonian, Arjun",
158
+ "Kempe, Julia",
159
+ ]);
160
+ expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
161
+ expect(response.body.data.metadata.citation_online_date).toBe(
162
+ "2024/10/08",
163
+ );
164
+ expect(response.body.data.metadata.citation_pdf_url).toBe(
165
+ "http://arxiv.org/pdf/2410.04840",
166
+ );
167
+ expect(response.body.data.metadata.citation_arxiv_id).toBe(
168
+ "2410.04840",
169
+ );
170
+ expect(response.body.data.metadata.citation_abstract).toContain(
171
+ "Within the scaling laws paradigm",
172
+ );
173
+ expect(response.body.data.metadata.sourceURL).toBe(
174
+ "https://arxiv.org/abs/2410.04840",
175
+ );
176
+ expect(response.body.data.metadata.statusCode).toBe(200);
177
+ },
178
+ 30000,
179
+ );
180
+ it.concurrent(
181
+ "should return a successful response with a valid API key and includeHtml set to true",
182
+ async () => {
183
+ const scrapeRequest: ScrapeRequestInput = {
184
+ url: "https://roastmywebsite.ai",
185
+ formats: ["markdown", "html"],
186
+ };
187
+
188
+ const response: any = await request(TEST_URL)
189
+ .post("/v1/scrape")
190
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
191
+ .set("Content-Type", "application/json")
192
+ .send(scrapeRequest);
193
+
194
+ expect(response.statusCode).toBe(200);
195
+ expect(response.body).toHaveProperty("data");
196
+ if (!("data" in response.body)) {
197
+ throw new Error("Expected response body to have 'data' property");
198
+ }
199
+ expect(response.body.data).toHaveProperty("markdown");
200
+ expect(response.body.data).toHaveProperty("html");
201
+ expect(response.body.data).toHaveProperty("metadata");
202
+ expect(response.body.data.markdown).toContain("_Roast_");
203
+ expect(response.body.data.html).toContain("<h1");
204
+ expect(response.body.data.metadata.statusCode).toBe(200);
205
+ expect(response.body.data.metadata.error).toBeUndefined();
206
+ },
207
+ 30000,
208
+ );
209
+ it.concurrent(
210
+ "should return a successful response for a valid scrape with PDF file",
211
+ async () => {
212
+ const scrapeRequest: ScrapeRequestInput = {
213
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
214
+ // formats: ["markdown", "html"],
215
+ };
216
+ const response: any = await request(TEST_URL)
217
+ .post("/v1/scrape")
218
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
219
+ .set("Content-Type", "application/json")
220
+ .send(scrapeRequest);
221
+ await new Promise((r) => setTimeout(r, 6000));
222
+
223
+ expect(response.statusCode).toBe(200);
224
+ expect(response.body).toHaveProperty("data");
225
+ if (!("data" in response.body)) {
226
+ throw new Error("Expected response body to have 'data' property");
227
+ }
228
+ expect(response.body.data).toHaveProperty("metadata");
229
+ expect(response.body.data.markdown).toContain(
230
+ "Broad Line Radio Galaxy",
231
+ );
232
+ expect(response.body.data.metadata.statusCode).toBe(200);
233
+ expect(response.body.data.metadata.error).toBeUndefined();
234
+ },
235
+ 60000,
236
+ );
237
+
238
+ it.concurrent(
239
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
240
+ async () => {
241
+ const scrapeRequest: ScrapeRequestInput = {
242
+ url: "https://arxiv.org/pdf/astro-ph/9301001",
243
+ };
244
+ const response: any = await request(TEST_URL)
245
+ .post("/v1/scrape")
246
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
247
+ .set("Content-Type", "application/json")
248
+ .send(scrapeRequest);
249
+ await new Promise((r) => setTimeout(r, 6000));
250
+
251
+ expect(response.statusCode).toBe(200);
252
+ expect(response.body).toHaveProperty("data");
253
+ if (!("data" in response.body)) {
254
+ throw new Error("Expected response body to have 'data' property");
255
+ }
256
+ expect(response.body.data).toHaveProperty("markdown");
257
+ expect(response.body.data).toHaveProperty("metadata");
258
+ expect(response.body.data.markdown).toContain(
259
+ "Broad Line Radio Galaxy",
260
+ );
261
+ expect(response.body.data.metadata.statusCode).toBe(200);
262
+ expect(response.body.data.metadata.error).toBeUndefined();
263
+ },
264
+ 60000,
265
+ );
266
+
267
+ it.concurrent(
268
+ "should return a successful response with a valid API key with removeTags option",
269
+ async () => {
270
+ const scrapeRequest: ScrapeRequestInput = {
271
+ url: "https://www.scrapethissite.com/",
272
+ onlyMainContent: false, // default is true
273
+ };
274
+ const responseWithoutRemoveTags: any = await request(TEST_URL)
275
+ .post("/v1/scrape")
276
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
277
+ .set("Content-Type", "application/json")
278
+ .send(scrapeRequest);
279
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
280
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
281
+
282
+ if (!("data" in responseWithoutRemoveTags.body)) {
283
+ throw new Error("Expected response body to have 'data' property");
284
+ }
285
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
286
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
287
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
288
+ expect(responseWithoutRemoveTags.body.data.markdown).toContain(
289
+ "[FAQ](/faq/)",
290
+ ); // .nav
291
+ expect(responseWithoutRemoveTags.body.data.markdown).toContain(
292
+ "Hartley Brody 2023",
293
+ ); // #footer
294
+
295
+ const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
296
+ url: "https://www.scrapethissite.com/",
297
+ excludeTags: [".nav", "#footer", "strong"],
298
+ onlyMainContent: false, // default is true
299
+ };
300
+ const response: any = await request(TEST_URL)
301
+ .post("/v1/scrape")
302
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
303
+ .set("Content-Type", "application/json")
304
+ .send(scrapeRequestWithRemoveTags);
305
+
306
+ expect(response.statusCode).toBe(200);
307
+ expect(response.body).toHaveProperty("data");
308
+ if (!("data" in response.body)) {
309
+ throw new Error("Expected response body to have 'data' property");
310
+ }
311
+ expect(response.body.data).toHaveProperty("markdown");
312
+ expect(response.body.data).toHaveProperty("metadata");
313
+ expect(response.body.data).not.toHaveProperty("html");
314
+ expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
315
+ expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
316
+ },
317
+ 30000,
318
+ );
319
+
320
+ it.concurrent(
321
+ "should return a successful response for a scrape with 400 page",
322
+ async () => {
323
+ const response: any = await request(TEST_URL)
324
+ .post("/v1/scrape")
325
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
326
+ .set("Content-Type", "application/json")
327
+ .send({ url: "https://httpstat.us/400" });
328
+ await new Promise((r) => setTimeout(r, 5000));
329
+
330
+ expect(response.statusCode).toBe(200);
331
+ expect(response.body).toHaveProperty("data");
332
+ if (!("data" in response.body)) {
333
+ throw new Error("Expected response body to have 'data' property");
334
+ }
335
+ expect(response.body.data).toHaveProperty("markdown");
336
+ expect(response.body.data).toHaveProperty("metadata");
337
+ expect(response.body.data.metadata.statusCode).toBe(400);
338
+ },
339
+ 60000,
340
+ );
341
+
342
+ it.concurrent(
343
+ "should return a successful response for a scrape with 401 page",
344
+ async () => {
345
+ const response: any = await request(TEST_URL)
346
+ .post("/v1/scrape")
347
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
348
+ .set("Content-Type", "application/json")
349
+ .send({ url: "https://httpstat.us/401" });
350
+ await new Promise((r) => setTimeout(r, 5000));
351
+
352
+ expect(response.statusCode).toBe(200);
353
+ expect(response.body).toHaveProperty("data");
354
+ if (!("data" in response.body)) {
355
+ throw new Error("Expected response body to have 'data' property");
356
+ }
357
+ expect(response.body.data).toHaveProperty("markdown");
358
+ expect(response.body.data).toHaveProperty("metadata");
359
+ expect(response.body.data.metadata.statusCode).toBe(401);
360
+ },
361
+ 60000,
362
+ );
363
+
364
+ // Removed it as we want to retry fallback to the next scraper
365
+ // it.concurrent('should return a successful response for a scrape with 403 page', async () => {
366
+ // const response: any = await request(TEST_URL)
367
+ // .post('/v1/scrape')
368
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
369
+ // .set('Content-Type', 'application/json')
370
+ // .send({ url: 'https://httpstat.us/403' });
371
+ // await new Promise((r) => setTimeout(r, 5000));
372
+
373
+ // expect(response.statusCode).toBe(200);
374
+ // expect(response.body).toHaveProperty('data');
375
+ // if (!("data" in response.body)) {
376
+ // throw new Error("Expected response body to have 'data' property");
377
+ // }
378
+ // expect(response.body.data).toHaveProperty('markdown');
379
+ // expect(response.body.data).toHaveProperty('metadata');
380
+ // expect(response.body.data.metadata.statusCode).toBe(403);
381
+ // }, 60000);
382
+
383
+ it.concurrent(
384
+ "should return a successful response for a scrape with 404 page",
385
+ async () => {
386
+ const response: any = await request(TEST_URL)
387
+ .post("/v1/scrape")
388
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
389
+ .set("Content-Type", "application/json")
390
+ .send({ url: "https://httpstat.us/404" });
391
+ await new Promise((r) => setTimeout(r, 5000));
392
+
393
+ expect(response.statusCode).toBe(200);
394
+ expect(response.body).toHaveProperty("data");
395
+ if (!("data" in response.body)) {
396
+ throw new Error("Expected response body to have 'data' property");
397
+ }
398
+ expect(response.body.data).toHaveProperty("markdown");
399
+ expect(response.body.data).toHaveProperty("metadata");
400
+ expect(response.body.data.metadata.statusCode).toBe(404);
401
+ },
402
+ 60000,
403
+ );
404
+
405
+ // it.concurrent('should return a successful response for a scrape with 405 page', async () => {
406
+ // const response: any = await request(TEST_URL)
407
+ // .post('/v1/scrape')
408
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
409
+ // .set('Content-Type', 'application/json')
410
+ // .send({ url: 'https://httpstat.us/405' });
411
+ // await new Promise((r) => setTimeout(r, 5000));
412
+
413
+ // expect(response.statusCode).toBe(200);
414
+ // expect(response.body).toHaveProperty('data');
415
+ // if (!("data" in response.body)) {
416
+ // throw new Error("Expected response body to have 'data' property");
417
+ // }
418
+ // expect(response.body.data).toHaveProperty('markdown');
419
+ // expect(response.body.data).toHaveProperty('metadata');
420
+ // expect(response.body.data.metadata.statusCode).toBe(405);
421
+ // }, 60000);
422
+
423
+ // it.concurrent('should return a successful response for a scrape with 500 page', async () => {
424
+ // const response: any = await request(TEST_URL)
425
+ // .post('/v1/scrape')
426
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
427
+ // .set('Content-Type', 'application/json')
428
+ // .send({ url: 'https://httpstat.us/500' });
429
+ // await new Promise((r) => setTimeout(r, 5000));
430
+
431
+ // expect(response.statusCode).toBe(200);
432
+ // expect(response.body).toHaveProperty('data');
433
+ // if (!("data" in response.body)) {
434
+ // throw new Error("Expected response body to have 'data' property");
435
+ // }
436
+ // expect(response.body.data).toHaveProperty('markdown');
437
+ // expect(response.body.data).toHaveProperty('metadata');
438
+ // expect(response.body.data.metadata.statusCode).toBe(500);
439
+ // }, 60000);
440
+
441
+ it.concurrent(
442
+ "should return a timeout error when scraping takes longer than the specified timeout",
443
+ async () => {
444
+ const response: any = await request(TEST_URL)
445
+ .post("/v1/scrape")
446
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
447
+ .set("Content-Type", "application/json")
448
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
449
+
450
+ expect(response.statusCode).toBe(408);
451
+ },
452
+ 3000,
453
+ );
454
+
455
+ it.concurrent(
456
+ "should return a successful response with a valid API key and includeHtml set to true",
457
+ async () => {
458
+ const scrapeRequest: ScrapeRequestInput = {
459
+ url: "https://roastmywebsite.ai",
460
+ formats: ["html", "rawHtml"],
461
+ };
462
+
463
+ const response: any = await request(TEST_URL)
464
+ .post("/v1/scrape")
465
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
466
+ .set("Content-Type", "application/json")
467
+ .send(scrapeRequest);
468
+
469
+ expect(response.statusCode).toBe(200);
470
+ expect(response.body).toHaveProperty("data");
471
+ if (!("data" in response.body)) {
472
+ throw new Error("Expected response body to have 'data' property");
473
+ }
474
+ expect(response.body.data).not.toHaveProperty("markdown");
475
+ expect(response.body.data).toHaveProperty("html");
476
+ expect(response.body.data).toHaveProperty("rawHtml");
477
+ expect(response.body.data).toHaveProperty("metadata");
478
+ expect(response.body.data.html).toContain("<h1");
479
+ expect(response.body.data.rawHtml).toContain("<html");
480
+ expect(response.body.data.metadata.statusCode).toBe(200);
481
+ expect(response.body.data.metadata.error).toBeUndefined();
482
+ },
483
+ 30000,
484
+ );
485
+
486
+ it.concurrent(
487
+ "should return a successful response with waitFor",
488
+ async () => {
489
+ const scrapeRequest: ScrapeRequestInput = {
490
+ url: "https://ycombinator.com/companies",
491
+ formats: ["markdown"],
492
+ waitFor: 8000,
493
+ };
494
+
495
+ const response: any = await request(TEST_URL)
496
+ .post("/v1/scrape")
497
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
498
+ .set("Content-Type", "application/json")
499
+ .send(scrapeRequest);
500
+
501
+ expect(response.statusCode).toBe(200);
502
+ expect(response.body).toHaveProperty("data");
503
+ if (!("data" in response.body)) {
504
+ throw new Error("Expected response body to have 'data' property");
505
+ }
506
+ expect(response.body.data).toHaveProperty("markdown");
507
+ expect(response.body.data).not.toHaveProperty("html");
508
+ expect(response.body.data).not.toHaveProperty("links");
509
+ expect(response.body.data).not.toHaveProperty("rawHtml");
510
+ expect(response.body.data).toHaveProperty("metadata");
511
+ expect(response.body.data.markdown).toContain("PagerDuty");
512
+ expect(response.body.data.metadata.statusCode).toBe(200);
513
+ expect(response.body.data.metadata.error).toBeUndefined();
514
+ },
515
+ 30000,
516
+ );
517
+
518
+ it.concurrent(
519
+ "should return a successful response with a valid links on page",
520
+ async () => {
521
+ const scrapeRequest: ScrapeRequestInput = {
522
+ url: "https://roastmywebsite.ai",
523
+ formats: ["links"],
524
+ };
525
+
526
+ const response: any = await request(TEST_URL)
527
+ .post("/v1/scrape")
528
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
529
+ .set("Content-Type", "application/json")
530
+ .send(scrapeRequest);
531
+
532
+ expect(response.statusCode).toBe(200);
533
+ expect(response.body).toHaveProperty("data");
534
+ if (!("data" in response.body)) {
535
+ throw new Error("Expected response body to have 'data' property");
536
+ }
537
+ expect(response.body.data).not.toHaveProperty("html");
538
+ expect(response.body.data).not.toHaveProperty("rawHtml");
539
+ expect(response.body.data).toHaveProperty("links");
540
+ expect(response.body.data).toHaveProperty("metadata");
541
+ expect(response.body.data.links).toContain("https://firecrawl.dev");
542
+ expect(response.body.data.metadata.statusCode).toBe(200);
543
+ expect(response.body.data.metadata.error).toBeUndefined();
544
+ },
545
+ 30000,
546
+ );
547
+ });
548
+
549
+ describe("POST /v1/map", () => {
550
+ it.concurrent("should require authorization", async () => {
551
+ const response: any = await request(TEST_URL)
552
+ .post("/v1/map")
553
+ .send({ url: "https://firecrawl.dev" });
554
+ expect(response.statusCode).toBe(401);
555
+ });
556
+
557
+ it.concurrent(
558
+ "should return an error response with an invalid API key",
559
+ async () => {
560
+ const response: any = await request(TEST_URL)
561
+ .post("/v1/map")
562
+ .set("Authorization", `Bearer invalid-api-key`)
563
+ .set("Content-Type", "application/json")
564
+ .send({ url: "https://firecrawl.dev" });
565
+ expect(response.statusCode).toBe(401);
566
+ },
567
+ );
568
+
569
+ it.concurrent(
570
+ "should return a successful response with a valid API key",
571
+ async () => {
572
+ const mapRequest = {
573
+ url: "https://roastmywebsite.ai",
574
+ };
575
+
576
+ const response: any = await request(TEST_URL)
577
+ .post("/v1/map")
578
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
579
+ .set("Content-Type", "application/json")
580
+ .send(mapRequest);
581
+
582
+ expect(response.statusCode).toBe(200);
583
+ expect(response.body).toHaveProperty("success", true);
584
+ expect(response.body).toHaveProperty("links");
585
+ if (!("links" in response.body)) {
586
+ throw new Error("Expected response body to have 'links' property");
587
+ }
588
+ const links = response.body.links as unknown[];
589
+ expect(Array.isArray(links)).toBe(true);
590
+ expect(links.length).toBeGreaterThan(0);
591
+ },
592
+ );
593
+
594
+ it.concurrent(
595
+ "should return a successful response with a valid API key and search",
596
+ async () => {
597
+ const mapRequest = {
598
+ url: "https://usemotion.com",
599
+ search: "pricing",
600
+ };
601
+
602
+ const response: any = await request(TEST_URL)
603
+ .post("/v1/map")
604
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
605
+ .set("Content-Type", "application/json")
606
+ .send(mapRequest);
607
+
608
+ expect(response.statusCode).toBe(200);
609
+ expect(response.body).toHaveProperty("success", true);
610
+ expect(response.body).toHaveProperty("links");
611
+ if (!("links" in response.body)) {
612
+ throw new Error("Expected response body to have 'links' property");
613
+ }
614
+ const links = response.body.links as unknown[];
615
+ expect(Array.isArray(links)).toBe(true);
616
+ expect(links.length).toBeGreaterThan(0);
617
+ expect(links[0]).toContain("usemotion.com/pricing");
618
+ },
619
+ );
620
+
621
+ it.concurrent(
622
+ "should return a successful response with a valid API key and search and allowSubdomains",
623
+ async () => {
624
+ const mapRequest = {
625
+ url: "https://firecrawl.dev",
626
+ search: "docs",
627
+ includeSubdomains: true,
628
+ };
629
+
630
+ const response: any = await request(TEST_URL)
631
+ .post("/v1/map")
632
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
633
+ .set("Content-Type", "application/json")
634
+ .send(mapRequest);
635
+
636
+ expect(response.statusCode).toBe(200);
637
+ expect(response.body).toHaveProperty("success", true);
638
+ expect(response.body).toHaveProperty("links");
639
+ if (!("links" in response.body)) {
640
+ throw new Error("Expected response body to have 'links' property");
641
+ }
642
+ const links = response.body.links as unknown[];
643
+ expect(Array.isArray(links)).toBe(true);
644
+ expect(links.length).toBeGreaterThan(0);
645
+
646
+ const containsDocsFirecrawlDev = links.some((link: string) =>
647
+ link.includes("docs.firecrawl.dev"),
648
+ );
649
+ expect(containsDocsFirecrawlDev).toBe(true);
650
+ },
651
+ );
652
+
653
+ it.concurrent(
654
+ "should return a successful response with a valid API key and search and allowSubdomains and www",
655
+ async () => {
656
+ const mapRequest = {
657
+ url: "https://www.firecrawl.dev",
658
+ search: "docs",
659
+ includeSubdomains: true,
660
+ };
661
+
662
+ const response: any = await request(TEST_URL)
663
+ .post("/v1/map")
664
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
665
+ .set("Content-Type", "application/json")
666
+ .send(mapRequest);
667
+
668
+ expect(response.statusCode).toBe(200);
669
+ expect(response.body).toHaveProperty("success", true);
670
+ expect(response.body).toHaveProperty("links");
671
+ if (!("links" in response.body)) {
672
+ throw new Error("Expected response body to have 'links' property");
673
+ }
674
+ const links = response.body.links as unknown[];
675
+ expect(Array.isArray(links)).toBe(true);
676
+ expect(links.length).toBeGreaterThan(0);
677
+
678
+ const containsDocsFirecrawlDev = links.some((link: string) =>
679
+ link.includes("docs.firecrawl.dev"),
680
+ );
681
+ expect(containsDocsFirecrawlDev).toBe(true);
682
+ },
683
+ 10000,
684
+ );
685
+
686
+ it.concurrent(
687
+ "should return a successful response with a valid API key and search and not allowSubdomains and www",
688
+ async () => {
689
+ const mapRequest = {
690
+ url: "https://www.firecrawl.dev",
691
+ search: "docs",
692
+ includeSubdomains: false,
693
+ };
694
+
695
+ const response: any = await request(TEST_URL)
696
+ .post("/v1/map")
697
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
698
+ .set("Content-Type", "application/json")
699
+ .send(mapRequest);
700
+
701
+ expect(response.statusCode).toBe(200);
702
+ expect(response.body).toHaveProperty("success", true);
703
+ expect(response.body).toHaveProperty("links");
704
+ if (!("links" in response.body)) {
705
+ throw new Error("Expected response body to have 'links' property");
706
+ }
707
+ const links = response.body.links as unknown[];
708
+ expect(Array.isArray(links)).toBe(true);
709
+ expect(links.length).toBeGreaterThan(0);
710
+ expect(links[0]).not.toContain("docs.firecrawl.dev");
711
+ },
712
+ );
713
+
714
+ it.concurrent("should return an error for invalid URL", async () => {
715
+ const mapRequest = {
716
+ url: "invalid-url",
717
+ includeSubdomains: true,
718
+ search: "test",
719
+ };
720
+
721
+ const response: any = await request(TEST_URL)
722
+ .post("/v1/map")
723
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
724
+ .set("Content-Type", "application/json")
725
+ .send(mapRequest);
726
+
727
+ expect(response.statusCode).toBe(400);
728
+ expect(response.body).toHaveProperty("success", false);
729
+ expect(response.body).toHaveProperty("error");
730
+ });
731
+ });
732
+
733
+ describe("POST /v1/crawl", () => {
734
+ it.concurrent("should require authorization", async () => {
735
+ const response: any = await request(TEST_URL)
736
+ .post("/v1/crawl")
737
+ .send({ url: "https://firecrawl.dev" });
738
+ expect(response.statusCode).toBe(401);
739
+ });
740
+
741
+ it.concurrent("should throw error for blocklisted URL", async () => {
742
+ const scrapeRequest: ScrapeRequestInput = {
743
+ url: "https://facebook.com/fake-test",
744
+ };
745
+
746
+ const response = await request(TEST_URL)
747
+ .post("/v1/crawl")
748
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
749
+ .set("Content-Type", "application/json")
750
+ .send(scrapeRequest);
751
+
752
+ expect(response.statusCode).toBe(403);
753
+ expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE);
754
+ });
755
+
756
+ it.concurrent(
757
+ "should return an error response with an invalid API key",
758
+ async () => {
759
+ const response: any = await request(TEST_URL)
760
+ .post("/v1/crawl")
761
+ .set("Authorization", `Bearer invalid-api-key`)
762
+ .set("Content-Type", "application/json")
763
+ .send({ url: "https://firecrawl.dev" });
764
+ expect(response.statusCode).toBe(401);
765
+ },
766
+ );
767
+
768
+ it.concurrent("should return a successful response", async () => {
769
+ const response = await request(TEST_URL)
770
+ .post("/v1/crawl")
771
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
772
+ .set("Content-Type", "application/json")
773
+ .send({ url: "https://firecrawl.dev" });
774
+
775
+ expect(response.statusCode).toBe(200);
776
+ expect(response.body).toHaveProperty("id");
777
+ expect(response.body.id).toMatch(
778
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
779
+ );
780
+ expect(response.body).toHaveProperty("success", true);
781
+ expect(response.body).toHaveProperty("url");
782
+ expect(response.body.url).toContain("/v1/crawl/");
783
+ });
784
+
785
+ it.concurrent(
786
+ "should return a successful response with a valid API key and valid includes option",
787
+ async () => {
788
+ const crawlResponse = await request(TEST_URL)
789
+ .post("/v1/crawl")
790
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
791
+ .set("Content-Type", "application/json")
792
+ .send({
793
+ url: "https://firecrawl.dev",
794
+ limit: 40,
795
+ includePaths: ["blog/*"],
796
+ });
797
+
798
+ let response;
799
+ let isFinished = false;
800
+
801
+ while (!isFinished) {
802
+ response = await request(TEST_URL)
803
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
804
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
805
+
806
+ expect(response.statusCode).toBe(200);
807
+ expect(response.body).toHaveProperty("status");
808
+ isFinished = response.body.status === "completed";
809
+
810
+ if (!isFinished) {
811
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
812
+ }
813
+ }
814
+
815
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
816
+ const completedResponse = await request(TEST_URL)
817
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
818
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
819
+
820
+ const urls = completedResponse.body.data.map(
821
+ (item: any) => item.metadata?.sourceURL,
822
+ );
823
+ expect(urls.length).toBeGreaterThan(5);
824
+ urls.forEach((url: string) => {
825
+ expect(url).toContain("firecrawl.dev/blog");
826
+ });
827
+
828
+ expect(completedResponse.statusCode).toBe(200);
829
+ expect(completedResponse.body).toHaveProperty("status");
830
+ expect(completedResponse.body.status).toBe("completed");
831
+ expect(completedResponse.body).toHaveProperty("data");
832
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
833
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
834
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
835
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
836
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
837
+ },
838
+ 180000,
839
+ ); // 180 seconds
840
+
841
+ it.concurrent(
842
+ "should return a successful response with a valid API key and valid excludes option",
843
+ async () => {
844
+ const crawlResponse = await request(TEST_URL)
845
+ .post("/v1/crawl")
846
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
847
+ .set("Content-Type", "application/json")
848
+ .send({
849
+ url: "https://firecrawl.dev",
850
+ limit: 40,
851
+ excludePaths: ["blog/*"],
852
+ });
853
+
854
+ let isFinished = false;
855
+ let response;
856
+
857
+ while (!isFinished) {
858
+ response = await request(TEST_URL)
859
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
860
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
861
+
862
+ expect(response.statusCode).toBe(200);
863
+ expect(response.body).toHaveProperty("status");
864
+ isFinished = response.body.status === "completed";
865
+
866
+ if (!isFinished) {
867
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
868
+ }
869
+ }
870
+
871
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
872
+ const completedResponse = await request(TEST_URL)
873
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
874
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
875
+
876
+ const urls = completedResponse.body.data.map(
877
+ (item: any) => item.metadata?.sourceURL,
878
+ );
879
+ expect(urls.length).toBeGreaterThan(3);
880
+ urls.forEach((url: string) => {
881
+ expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
882
+ });
883
+ },
884
+ 90000,
885
+ ); // 90 seconds
886
+
887
+ it.concurrent(
888
+ "should return a successful response with max depth option for a valid crawl job",
889
+ async () => {
890
+ const crawlResponse = await request(TEST_URL)
891
+ .post("/v1/crawl")
892
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
893
+ .set("Content-Type", "application/json")
894
+ .send({
895
+ url: "https://www.scrapethissite.com",
896
+ maxDepth: 1,
897
+ });
898
+ expect(crawlResponse.statusCode).toBe(200);
899
+
900
+ const response = await request(TEST_URL)
901
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
902
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
903
+ expect(response.statusCode).toBe(200);
904
+ expect(response.body).toHaveProperty("status");
905
+ expect(["active", "waiting", "completed", "scraping"]).toContain(
906
+ response.body.status,
907
+ );
908
+ // wait for 60 seconds
909
+ let isCompleted = false;
910
+ while (!isCompleted) {
911
+ const statusCheckResponse = await request(TEST_URL)
912
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
913
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
914
+ expect(statusCheckResponse.statusCode).toBe(200);
915
+ isCompleted = statusCheckResponse.body.status === "completed";
916
+ if (!isCompleted) {
917
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
918
+ }
919
+ }
920
+ const completedResponse = await request(TEST_URL)
921
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
922
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
923
+
924
+ expect(completedResponse.statusCode).toBe(200);
925
+ expect(completedResponse.body).toHaveProperty("status");
926
+ expect(completedResponse.body.status).toBe("completed");
927
+ expect(completedResponse.body).toHaveProperty("data");
928
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content");
929
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
930
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
931
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
932
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
933
+ const urls = completedResponse.body.data.map(
934
+ (item: any) => item.metadata?.sourceURL,
935
+ );
936
+ expect(urls.length).toBeGreaterThan(1);
937
+
938
+ // Check if all URLs have a maximum depth of 1
939
+ urls.forEach((url: string) => {
940
+ const pathSplits = new URL(url).pathname.split("/");
941
+ const depth =
942
+ pathSplits.length -
943
+ (pathSplits[0].length === 0 &&
944
+ pathSplits[pathSplits.length - 1].length === 0
945
+ ? 1
946
+ : 0);
947
+ expect(depth).toBeLessThanOrEqual(2);
948
+ });
949
+ },
950
+ 180000,
951
+ );
952
+ });
953
+
954
+ describe("GET /v1/crawl/:jobId", () => {
955
+ it.concurrent("should require authorization", async () => {
956
+ const response = await request(TEST_URL).get("/v1/crawl/123");
957
+ expect(response.statusCode).toBe(401);
958
+ });
959
+
960
+ it.concurrent(
961
+ "should return an error response with an invalid API key",
962
+ async () => {
963
+ const response = await request(TEST_URL)
964
+ .get("/v1/crawl/123")
965
+ .set("Authorization", `Bearer invalid-api-key`);
966
+ expect(response.statusCode).toBe(401);
967
+ },
968
+ );
969
+
970
+ it.concurrent(
971
+ "should return Job not found for invalid job ID",
972
+ async () => {
973
+ const response = await request(TEST_URL)
974
+ .get("/v1/crawl/invalidJobId")
975
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
976
+ expect(response.statusCode).toBe(404);
977
+ },
978
+ );
979
+
980
+ it.concurrent(
981
+ "should return a successful crawl status response for a valid crawl job",
982
+ async () => {
983
+ const crawlResponse = await request(TEST_URL)
984
+ .post("/v1/crawl")
985
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
986
+ .set("Content-Type", "application/json")
987
+ .send({ url: "https://docs.firecrawl.dev" });
988
+ expect(crawlResponse.statusCode).toBe(200);
989
+
990
+ let isCompleted = false;
991
+
992
+ while (!isCompleted) {
993
+ const response = await request(TEST_URL)
994
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
995
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
996
+ expect(response.statusCode).toBe(200);
997
+ expect(response.body).toHaveProperty("status");
998
+
999
+ if (response.body.status === "completed") {
1000
+ isCompleted = true;
1001
+ } else {
1002
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
1003
+ }
1004
+ }
1005
+
1006
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
1007
+ const completedResponse = await request(TEST_URL)
1008
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
1009
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1010
+
1011
+ expect(completedResponse.body).toHaveProperty("status");
1012
+ expect(completedResponse.body.status).toBe("completed");
1013
+ expect(completedResponse.body).toHaveProperty("data");
1014
+ expect(completedResponse.body.data[0]).not.toHaveProperty("content");
1015
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1016
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1017
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
1018
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
1019
+
1020
+ const childrenLinks = completedResponse.body.data.filter(
1021
+ (doc) => doc.metadata && doc.metadata.sourceURL,
1022
+ );
1023
+
1024
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
1025
+ },
1026
+ 180000,
1027
+ ); // 120 seconds
1028
+
1029
+ it.concurrent(
1030
+ "If someone cancels a crawl job, it should turn into failed status",
1031
+ async () => {
1032
+ const crawlResponse = await request(TEST_URL)
1033
+ .post("/v1/crawl")
1034
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
1035
+ .set("Content-Type", "application/json")
1036
+ .send({ url: "https://docs.firecrawl.dev", limit: 10 });
1037
+
1038
+ expect(crawlResponse.statusCode).toBe(200);
1039
+
1040
+ await new Promise((r) => setTimeout(r, 10000));
1041
+
1042
+ const responseCancel = await request(TEST_URL)
1043
+ .delete(`/v1/crawl/${crawlResponse.body.id}`)
1044
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1045
+ expect(responseCancel.statusCode).toBe(200);
1046
+ expect(responseCancel.body).toHaveProperty("status");
1047
+ expect(responseCancel.body.status).toBe("cancelled");
1048
+
1049
+ await new Promise((r) => setTimeout(r, 10000));
1050
+ const completedResponse = await request(TEST_URL)
1051
+ .get(`/v1/crawl/${crawlResponse.body.id}`)
1052
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
1053
+
1054
+ expect(completedResponse.statusCode).toBe(200);
1055
+ expect(completedResponse.body).toHaveProperty("status");
1056
+ expect(completedResponse.body.status).toBe("cancelled");
1057
+ expect(completedResponse.body).toHaveProperty("data");
1058
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
1059
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
1060
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
1061
+ expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
1062
+ },
1063
+ 60000,
1064
+ ); // 60 seconds
1065
+ });
1066
+ });
src/__tests__/e2e_v1_withAuth_all_params/index.test.ts ADDED
@@ -0,0 +1,711 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import { configDotenv } from "dotenv";
3
+ import { ScrapeRequest } from "../../controllers/v1/types";
4
+
5
+ configDotenv();
6
+ const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
7
+ const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
8
+
9
+ describe("E2E Tests for v1 API Routes", () => {
10
+ it.concurrent(
11
+ "should return a successful response for a scrape with 403 page",
12
+ async () => {
13
+ const response: any = await request(FIRECRAWL_API_URL)
14
+ .post("/v1/scrape")
15
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
16
+ .set("Content-Type", "application/json")
17
+ .send({ url: "https://httpstat.us/403" });
18
+
19
+ expect(response.statusCode).toBe(200);
20
+ expect(response.body).toHaveProperty("data");
21
+ if (!("data" in response.body)) {
22
+ throw new Error("Expected response body to have 'data' property");
23
+ }
24
+ expect(response.body.data).toHaveProperty("markdown");
25
+ expect(response.body.data).toHaveProperty("metadata");
26
+ expect(response.body.data.metadata.statusCode).toBe(403);
27
+ },
28
+ 30000,
29
+ );
30
+
31
+ it.concurrent(
32
+ "should handle 'formats:markdown (default)' parameter correctly",
33
+ async () => {
34
+ const scrapeRequest = {
35
+ url: E2E_TEST_SERVER_URL,
36
+ } as ScrapeRequest;
37
+
38
+ const response: any = await request(FIRECRAWL_API_URL)
39
+ .post("/v1/scrape")
40
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
41
+ .set("Content-Type", "application/json")
42
+ .send(scrapeRequest);
43
+
44
+ expect(response.statusCode).toBe(200);
45
+ expect(response.body).toHaveProperty("data");
46
+ if (!("data" in response.body)) {
47
+ throw new Error("Expected response body to have 'data' property");
48
+ }
49
+
50
+ expect(response.body.data).toHaveProperty("markdown");
51
+
52
+ expect(response.body.data.markdown).toContain(
53
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
54
+ );
55
+ expect(response.body.data.markdown).toContain(
56
+ "Content with id #content-1",
57
+ );
58
+ // expect(response.body.data.markdown).toContain("Loading...");
59
+ expect(response.body.data.markdown).toContain("Click me!");
60
+ expect(response.body.data.markdown).toContain(
61
+ "Power your AI apps with clean data crawled from any website. It's also open-source.",
62
+ ); // firecrawl.dev inside an iframe
63
+ expect(response.body.data.markdown).toContain(
64
+ "This content loads only when you see it. Don't blink! 👼",
65
+ ); // the browser always scroll to the bottom
66
+ expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
67
+ expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
68
+ expect(response.body.data.markdown).not.toContain(
69
+ "This content is only visible on mobile",
70
+ );
71
+ },
72
+ 30000,
73
+ );
74
+
75
+ it.concurrent(
76
+ "should handle 'formats:html' parameter correctly",
77
+ async () => {
78
+ const scrapeRequest = {
79
+ url: E2E_TEST_SERVER_URL,
80
+ formats: ["html"],
81
+ } as ScrapeRequest;
82
+
83
+ const response: any = await request(FIRECRAWL_API_URL)
84
+ .post("/v1/scrape")
85
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
86
+ .set("Content-Type", "application/json")
87
+ .send(scrapeRequest);
88
+
89
+ expect(response.statusCode).toBe(200);
90
+ expect(response.body).toHaveProperty("data");
91
+ if (!("data" in response.body)) {
92
+ throw new Error("Expected response body to have 'data' property");
93
+ }
94
+
95
+ expect(response.body.data).not.toHaveProperty("markdown");
96
+ expect(response.body.data).toHaveProperty("html");
97
+
98
+ expect(response.body.data.html).not.toContain(
99
+ '<header class="row-start-1" style="">Header</header>',
100
+ );
101
+ expect(response.body.data.html).toContain(
102
+ '<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>',
103
+ );
104
+ },
105
+ 30000,
106
+ );
107
+
108
+ it.concurrent(
109
+ "should handle 'rawHtml' in 'formats' parameter correctly",
110
+ async () => {
111
+ const scrapeRequest = {
112
+ url: E2E_TEST_SERVER_URL,
113
+ formats: ["rawHtml"],
114
+ } as ScrapeRequest;
115
+
116
+ const response: any = await request(FIRECRAWL_API_URL)
117
+ .post("/v1/scrape")
118
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
119
+ .set("Content-Type", "application/json")
120
+ .send(scrapeRequest);
121
+
122
+ expect(response.statusCode).toBe(200);
123
+ expect(response.body).toHaveProperty("data");
124
+ if (!("data" in response.body)) {
125
+ throw new Error("Expected response body to have 'data' property");
126
+ }
127
+
128
+ expect(response.body.data).not.toHaveProperty("markdown");
129
+ expect(response.body.data).toHaveProperty("rawHtml");
130
+
131
+ expect(response.body.data.rawHtml).toContain(
132
+ ">This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
133
+ );
134
+ expect(response.body.data.rawHtml).toContain(">Header</header>");
135
+ },
136
+ 30000,
137
+ );
138
+
139
+ // - TODO: tests for links
140
+ // - TODO: tests for screenshot
141
+ // - TODO: tests for screenshot@fullPage
142
+
143
+ it.concurrent(
144
+ "should handle 'headers' parameter correctly",
145
+ async () => {
146
+ // @ts-ignore
147
+ const scrapeRequest = {
148
+ url: E2E_TEST_SERVER_URL,
149
+ headers: { "e2e-header-test": "firecrawl" },
150
+ } as ScrapeRequest;
151
+
152
+ const response: any = await request(FIRECRAWL_API_URL)
153
+ .post("/v1/scrape")
154
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
155
+ .set("Content-Type", "application/json")
156
+ .send(scrapeRequest);
157
+
158
+ expect(response.statusCode).toBe(200);
159
+ expect(response.body).toHaveProperty("data");
160
+ if (!("data" in response.body)) {
161
+ throw new Error("Expected response body to have 'data' property");
162
+ }
163
+
164
+ expect(response.body.data.markdown).toContain(
165
+ "e2e-header-test: firecrawl",
166
+ );
167
+ },
168
+ 30000,
169
+ );
170
+
171
+ it.concurrent(
172
+ "should handle 'includeTags' parameter correctly",
173
+ async () => {
174
+ const scrapeRequest = {
175
+ url: E2E_TEST_SERVER_URL,
176
+ includeTags: ["#content-1"],
177
+ } as ScrapeRequest;
178
+
179
+ const response: any = await request(FIRECRAWL_API_URL)
180
+ .post("/v1/scrape")
181
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
182
+ .set("Content-Type", "application/json")
183
+ .send(scrapeRequest);
184
+
185
+ expect(response.statusCode).toBe(200);
186
+ expect(response.body).toHaveProperty("data");
187
+ if (!("data" in response.body)) {
188
+ throw new Error("Expected response body to have 'data' property");
189
+ }
190
+
191
+ expect(response.body.data.markdown).not.toContain(
192
+ "<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
193
+ );
194
+ expect(response.body.data.markdown).toContain(
195
+ "Content with id #content-1",
196
+ );
197
+ },
198
+ 30000,
199
+ );
200
+
201
+ it.concurrent(
202
+ "should handle 'excludeTags' parameter correctly",
203
+ async () => {
204
+ const scrapeRequest = {
205
+ url: E2E_TEST_SERVER_URL,
206
+ excludeTags: ["#content-1"],
207
+ } as ScrapeRequest;
208
+
209
+ const response: any = await request(FIRECRAWL_API_URL)
210
+ .post("/v1/scrape")
211
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
212
+ .set("Content-Type", "application/json")
213
+ .send(scrapeRequest);
214
+
215
+ expect(response.statusCode).toBe(200);
216
+ expect(response.body).toHaveProperty("data");
217
+ if (!("data" in response.body)) {
218
+ throw new Error("Expected response body to have 'data' property");
219
+ }
220
+
221
+ expect(response.body.data.markdown).toContain(
222
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
223
+ );
224
+ expect(response.body.data.markdown).not.toContain(
225
+ "Content with id #content-1",
226
+ );
227
+ },
228
+ 30000,
229
+ );
230
+
231
+ it.concurrent(
232
+ "should handle 'onlyMainContent' parameter correctly",
233
+ async () => {
234
+ const scrapeRequest = {
235
+ url: E2E_TEST_SERVER_URL,
236
+ formats: ["html", "markdown"],
237
+ onlyMainContent: false,
238
+ } as ScrapeRequest;
239
+
240
+ const response: any = await request(FIRECRAWL_API_URL)
241
+ .post("/v1/scrape")
242
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
243
+ .set("Content-Type", "application/json")
244
+ .send(scrapeRequest);
245
+
246
+ expect(response.statusCode).toBe(200);
247
+ expect(response.body).toHaveProperty("data");
248
+ if (!("data" in response.body)) {
249
+ throw new Error("Expected response body to have 'data' property");
250
+ }
251
+
252
+ expect(response.body.data.markdown).toContain(
253
+ "This page is used for end-to-end (e2e) testing with Firecrawl.",
254
+ );
255
+ expect(response.body.data.html).toContain(
256
+ '<header class="row-start-1" style="">Header</header>',
257
+ );
258
+ },
259
+ 30000,
260
+ );
261
+
262
+ it.concurrent(
263
+ "should handle 'timeout' parameter correctly",
264
+ async () => {
265
+ const scrapeRequest = {
266
+ url: E2E_TEST_SERVER_URL,
267
+ timeout: 500,
268
+ } as ScrapeRequest;
269
+
270
+ const response: any = await request(FIRECRAWL_API_URL)
271
+ .post("/v1/scrape")
272
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
273
+ .set("Content-Type", "application/json")
274
+ .send(scrapeRequest);
275
+
276
+ expect(response.statusCode).toBe(408);
277
+
278
+ if (!("error" in response.body)) {
279
+ throw new Error("Expected response body to have 'error' property");
280
+ }
281
+ expect(response.body.error).toBe("Request timed out");
282
+ expect(response.body.success).toBe(false);
283
+ },
284
+ 30000,
285
+ );
286
+
287
+ it.concurrent(
288
+ "should handle 'mobile' parameter correctly",
289
+ async () => {
290
+ const scrapeRequest = {
291
+ url: E2E_TEST_SERVER_URL,
292
+ mobile: true,
293
+ } as ScrapeRequest;
294
+
295
+ const response: any = await request(FIRECRAWL_API_URL)
296
+ .post("/v1/scrape")
297
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
298
+ .set("Content-Type", "application/json")
299
+ .send(scrapeRequest);
300
+
301
+ expect(response.statusCode).toBe(200);
302
+
303
+ if (!("data" in response.body)) {
304
+ throw new Error("Expected response body to have 'data' property");
305
+ }
306
+ expect(response.body.data.markdown).toContain(
307
+ "This content is only visible on mobile",
308
+ );
309
+ },
310
+ 30000,
311
+ );
312
+
313
+ it.concurrent(
314
+ "should handle 'parsePDF' parameter correctly",
315
+ async () => {
316
+ const response: any = await request(FIRECRAWL_API_URL)
317
+ .post("/v1/scrape")
318
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
319
+ .set("Content-Type", "application/json")
320
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
321
+ await new Promise((r) => setTimeout(r, 6000));
322
+
323
+ expect(response.statusCode).toBe(200);
324
+ expect(response.body).toHaveProperty("data");
325
+ if (!("data" in response.body)) {
326
+ throw new Error("Expected response body to have 'data' property");
327
+ }
328
+
329
+ expect(response.body.data.markdown).toContain(
330
+ "arXiv:astro-ph/9301001v1 7 Jan 1993",
331
+ );
332
+ expect(response.body.data.markdown).not.toContain(
333
+ "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
334
+ );
335
+
336
+ const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
337
+ .post("/v1/scrape")
338
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
339
+ .set("Content-Type", "application/json")
340
+ .send({
341
+ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
342
+ parsePDF: false,
343
+ });
344
+ await new Promise((r) => setTimeout(r, 6000));
345
+
346
+ expect(responseNoParsePDF.statusCode).toBe(200);
347
+ expect(responseNoParsePDF.body).toHaveProperty("data");
348
+ if (!("data" in responseNoParsePDF.body)) {
349
+ throw new Error("Expected response body to have 'data' property");
350
+ }
351
+ expect(responseNoParsePDF.body.data.markdown).toContain(
352
+ "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
353
+ );
354
+ },
355
+ 30000,
356
+ );
357
+
358
+ // it.concurrent("should handle 'location' parameter correctly",
359
+ // async () => {
360
+ // const scrapeRequest: ScrapeRequest = {
361
+ // url: "https://roastmywebsite.ai",
362
+ // location: {
363
+ // country: "US",
364
+ // languages: ["en"]
365
+ // }
366
+ // };
367
+
368
+ // const response: any = await request(FIRECRAWL_API_URL)
369
+ // .post("/v1/scrape")
370
+ // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
371
+ // .set("Content-Type", "application/json")
372
+ // .send(scrapeRequest);
373
+
374
+ // expect(response.statusCode).toBe(200);
375
+ // // Add assertions to verify location is handled correctly
376
+ // },
377
+ // 30000);
378
+
379
+ it.concurrent(
380
+ "should handle 'skipTlsVerification' parameter correctly",
381
+ async () => {
382
+ const scrapeRequest = {
383
+ url: "https://expired.badssl.com/",
384
+ timeout: 120000,
385
+ } as ScrapeRequest;
386
+
387
+ const response: any = await request(FIRECRAWL_API_URL)
388
+ .post("/v1/scrape")
389
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
390
+ .set("Content-Type", "application/json")
391
+ .send(scrapeRequest);
392
+ console.log("Error1a");
393
+ // console.log(response.body)
394
+ expect(response.statusCode).toBe(200);
395
+ if (!("data" in response.body)) {
396
+ throw new Error("Expected response body to have 'data' property");
397
+ }
398
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
399
+ console.log("Error?");
400
+
401
+ const scrapeRequestWithSkipTlsVerification = {
402
+ url: "https://expired.badssl.com/",
403
+ skipTlsVerification: true,
404
+ timeout: 120000,
405
+ } as ScrapeRequest;
406
+
407
+ const responseWithSkipTlsVerification: any = await request(
408
+ FIRECRAWL_API_URL,
409
+ )
410
+ .post("/v1/scrape")
411
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
412
+ .set("Content-Type", "application/json")
413
+ .send(scrapeRequestWithSkipTlsVerification);
414
+
415
+ console.log("Error1b");
416
+ // console.log(responseWithSkipTlsVerification.body)
417
+ expect(responseWithSkipTlsVerification.statusCode).toBe(200);
418
+ if (!("data" in responseWithSkipTlsVerification.body)) {
419
+ throw new Error("Expected response body to have 'data' property");
420
+ }
421
+ // console.log(responseWithSkipTlsVerification.body.data)
422
+ expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
423
+ "badssl.com",
424
+ );
425
+ },
426
+ 60000,
427
+ );
428
+
429
+ it.concurrent(
430
+ "should handle 'removeBase64Images' parameter correctly",
431
+ async () => {
432
+ const scrapeRequest = {
433
+ url: E2E_TEST_SERVER_URL,
434
+ removeBase64Images: true,
435
+ } as ScrapeRequest;
436
+
437
+ const response: any = await request(FIRECRAWL_API_URL)
438
+ .post("/v1/scrape")
439
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
440
+ .set("Content-Type", "application/json")
441
+ .send(scrapeRequest);
442
+
443
+ expect(response.statusCode).toBe(200);
444
+ if (!("data" in response.body)) {
445
+ throw new Error("Expected response body to have 'data' property");
446
+ }
447
+ // console.log(response.body.data.markdown)
448
+ // - TODO: not working for every image
449
+ // expect(response.body.data.markdown).toContain("Image-Removed");
450
+ },
451
+ 30000,
452
+ );
453
+
454
+ it.concurrent(
455
+ "should handle 'action wait' parameter correctly",
456
+ async () => {
457
+ const scrapeRequest = {
458
+ url: E2E_TEST_SERVER_URL,
459
+ actions: [
460
+ {
461
+ type: "wait",
462
+ milliseconds: 10000,
463
+ },
464
+ ],
465
+ } as ScrapeRequest;
466
+
467
+ const response: any = await request(FIRECRAWL_API_URL)
468
+ .post("/v1/scrape")
469
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
470
+ .set("Content-Type", "application/json")
471
+ .send(scrapeRequest);
472
+
473
+ expect(response.statusCode).toBe(200);
474
+ if (!("data" in response.body)) {
475
+ throw new Error("Expected response body to have 'data' property");
476
+ }
477
+ expect(response.body.data.markdown).not.toContain("Loading...");
478
+ expect(response.body.data.markdown).toContain(
479
+ "Content loaded after 5 seconds!",
480
+ );
481
+ },
482
+ 30000,
483
+ );
484
+
485
+ // screenshot
486
+ it.concurrent(
487
+ "should handle 'action screenshot' parameter correctly",
488
+ async () => {
489
+ const scrapeRequest = {
490
+ url: E2E_TEST_SERVER_URL,
491
+ actions: [
492
+ {
493
+ type: "screenshot",
494
+ },
495
+ ],
496
+ } as ScrapeRequest;
497
+
498
+ const response: any = await request(FIRECRAWL_API_URL)
499
+ .post("/v1/scrape")
500
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
501
+ .set("Content-Type", "application/json")
502
+ .send(scrapeRequest);
503
+
504
+ expect(response.statusCode).toBe(200);
505
+ if (!("data" in response.body)) {
506
+ throw new Error("Expected response body to have 'data' property");
507
+ }
508
+ if (!response.body.data.actions?.screenshots) {
509
+ throw new Error("Expected response body to have screenshots array");
510
+ }
511
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
512
+ 0,
513
+ );
514
+ expect(response.body.data.actions.screenshots[0]).toContain(
515
+ "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
516
+ );
517
+
518
+ // TODO compare screenshot with expected screenshot
519
+ },
520
+ 30000,
521
+ );
522
+
523
+ it.concurrent(
524
+ "should handle 'action screenshot@fullPage' parameter correctly",
525
+ async () => {
526
+ const scrapeRequest = {
527
+ url: E2E_TEST_SERVER_URL,
528
+ actions: [
529
+ {
530
+ type: "screenshot",
531
+ fullPage: true,
532
+ },
533
+ {
534
+ type: "scrape",
535
+ },
536
+ ],
537
+ } as ScrapeRequest;
538
+
539
+ const response: any = await request(FIRECRAWL_API_URL)
540
+ .post("/v1/scrape")
541
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
542
+ .set("Content-Type", "application/json")
543
+ .send(scrapeRequest);
544
+
545
+ expect(response.statusCode).toBe(200);
546
+ if (!("data" in response.body)) {
547
+ throw new Error("Expected response body to have 'data' property");
548
+ }
549
+ // console.log(response.body.data.actions?.screenshots[0])
550
+ if (!response.body.data.actions?.screenshots) {
551
+ throw new Error("Expected response body to have screenshots array");
552
+ }
553
+ expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
554
+ 0,
555
+ );
556
+ expect(response.body.data.actions.screenshots[0]).toContain(
557
+ "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
558
+ );
559
+
560
+ if (!response.body.data.actions?.scrapes) {
561
+ throw new Error("Expected response body to have scrapes array");
562
+ }
563
+ expect(response.body.data.actions.scrapes[0].url).toBe(
564
+ "https://firecrawl-e2e-test.vercel.app/",
565
+ );
566
+ expect(response.body.data.actions.scrapes[0].html).toContain(
567
+ "This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
568
+ );
569
+ // TODO compare screenshot with expected full page screenshot
570
+ },
571
+ 30000,
572
+ );
573
+
574
+ it.concurrent(
575
+ "should handle 'action click' parameter correctly",
576
+ async () => {
577
+ const scrapeRequest = {
578
+ url: E2E_TEST_SERVER_URL,
579
+ actions: [
580
+ {
581
+ type: "click",
582
+ selector: "#click-me",
583
+ },
584
+ ],
585
+ } as ScrapeRequest;
586
+
587
+ const response: any = await request(FIRECRAWL_API_URL)
588
+ .post("/v1/scrape")
589
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
590
+ .set("Content-Type", "application/json")
591
+ .send(scrapeRequest);
592
+
593
+ expect(response.statusCode).toBe(200);
594
+ if (!("data" in response.body)) {
595
+ throw new Error("Expected response body to have 'data' property");
596
+ }
597
+ expect(response.body.data.markdown).not.toContain("Click me!");
598
+ expect(response.body.data.markdown).toContain(
599
+ "Text changed after click!",
600
+ );
601
+ },
602
+ 30000,
603
+ );
604
+
605
+ it.concurrent(
606
+ "should handle 'action write' parameter correctly",
607
+ async () => {
608
+ const scrapeRequest = {
609
+ url: E2E_TEST_SERVER_URL,
610
+ formats: ["html"],
611
+ actions: [
612
+ {
613
+ type: "click",
614
+ selector: "#input-1",
615
+ },
616
+ {
617
+ type: "write",
618
+ text: "Hello, world!",
619
+ },
620
+ ],
621
+ } as ScrapeRequest;
622
+
623
+ const response: any = await request(FIRECRAWL_API_URL)
624
+ .post("/v1/scrape")
625
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
626
+ .set("Content-Type", "application/json")
627
+ .send(scrapeRequest);
628
+
629
+ expect(response.statusCode).toBe(200);
630
+ if (!("data" in response.body)) {
631
+ throw new Error("Expected response body to have 'data' property");
632
+ }
633
+
634
+ // TODO: fix this test (need to fix fire-engine first)
635
+ // uncomment the following line:
636
+ // expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
637
+ },
638
+ 30000,
639
+ );
640
+
641
+ // TODO: fix this test (need to fix fire-engine first)
642
+ it.concurrent(
643
+ "should handle 'action pressKey' parameter correctly",
644
+ async () => {
645
+ const scrapeRequest = {
646
+ url: E2E_TEST_SERVER_URL,
647
+ formats: ["markdown"],
648
+ actions: [
649
+ {
650
+ type: "press",
651
+ key: "ArrowDown",
652
+ },
653
+ ],
654
+ } as ScrapeRequest;
655
+
656
+ const response: any = await request(FIRECRAWL_API_URL)
657
+ .post("/v1/scrape")
658
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
659
+ .set("Content-Type", "application/json")
660
+ .send(scrapeRequest);
661
+
662
+ // // TODO: fix this test (need to fix fire-engine first)
663
+ // // right now response.body is: { success: false, error: '(Internal server error) - null' }
664
+ // expect(response.statusCode).toBe(200);
665
+ // if (!("data" in response.body)) {
666
+ // throw new Error("Expected response body to have 'data' property");
667
+ // }
668
+ // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
669
+ },
670
+ 30000,
671
+ );
672
+
673
+ // TODO: fix this test (need to fix fire-engine first)
674
+ it.concurrent(
675
+ "should handle 'action scroll' parameter correctly",
676
+ async () => {
677
+ const scrapeRequest = {
678
+ url: E2E_TEST_SERVER_URL,
679
+ formats: ["markdown"],
680
+ actions: [
681
+ {
682
+ type: "click",
683
+ selector: "#scroll-bottom-loader",
684
+ },
685
+ {
686
+ type: "scroll",
687
+ direction: "down",
688
+ amount: 2000,
689
+ },
690
+ ],
691
+ } as ScrapeRequest;
692
+
693
+ const response: any = await request(FIRECRAWL_API_URL)
694
+ .post("/v1/scrape")
695
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
696
+ .set("Content-Type", "application/json")
697
+ .send(scrapeRequest);
698
+
699
+ // TODO: uncomment this tests
700
+ // expect(response.statusCode).toBe(200);
701
+ // if (!("data" in response.body)) {
702
+ // throw new Error("Expected response body to have 'data' property");
703
+ // }
704
+ //
705
+ // expect(response.body.data.markdown).toContain("You have reached the bottom!")
706
+ },
707
+ 30000,
708
+ );
709
+
710
+ // TODO: test scrape action
711
+ });
src/__tests__/e2e_withAuth/index.test.ts ADDED
@@ -0,0 +1,862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import request from "supertest";
2
+ import dotenv from "dotenv";
3
+
4
+ dotenv.config();
5
+ const TEST_URL = "http://127.0.0.1:3002";
6
+
7
+ describe("E2E Tests for v0 API Routes", () => {
8
+ beforeAll(() => {
9
+ process.env.USE_DB_AUTHENTICATION = "true";
10
+ });
11
+
12
+ afterAll(() => {
13
+ delete process.env.USE_DB_AUTHENTICATION;
14
+ });
15
+
16
+ describe("GET /is-production", () => {
17
+ it.concurrent("should return the production status", async () => {
18
+ const response = await request(TEST_URL).get("/is-production");
19
+ expect(response.statusCode).toBe(200);
20
+ expect(response.body).toHaveProperty("isProduction");
21
+ });
22
+ });
23
+
24
+ describe("POST /v0/scrape", () => {
25
+ it.concurrent("should require authorization", async () => {
26
+ const response: any = await request(TEST_URL).post("/v0/scrape");
27
+ expect(response.statusCode).toBe(401);
28
+ });
29
+
30
+ it.concurrent(
31
+ "should return an error response with an invalid API key",
32
+ async () => {
33
+ const response: any = await request(TEST_URL)
34
+ .post("/v0/scrape")
35
+ .set("Authorization", `Bearer invalid-api-key`)
36
+ .set("Content-Type", "application/json")
37
+ .send({ url: "https://firecrawl.dev" });
38
+ expect(response.statusCode).toBe(401);
39
+ },
40
+ );
41
+
42
+ it.concurrent(
43
+ "should return a successful response with a valid API key",
44
+ async () => {
45
+ const response: any = await request(TEST_URL)
46
+ .post("/v0/scrape")
47
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
48
+ .set("Content-Type", "application/json")
49
+ .send({ url: "https://roastmywebsite.ai" });
50
+ expect(response.statusCode).toBe(200);
51
+ expect(response.body).toHaveProperty("data");
52
+ expect(response.body.data).toHaveProperty("content");
53
+ expect(response.body.data).toHaveProperty("markdown");
54
+ expect(response.body.data).toHaveProperty("metadata");
55
+ expect(response.body.data).not.toHaveProperty("html");
56
+ expect(response.body.data.content).toContain("_Roast_");
57
+ expect(response.body.data.metadata.pageError).toBeUndefined();
58
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
59
+ expect(response.body.data.metadata.description).toBe(
60
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
61
+ );
62
+ expect(response.body.data.metadata.keywords).toBe(
63
+ "Roast My Website,Roast,Website,GitHub,Firecrawl",
64
+ );
65
+ expect(response.body.data.metadata.robots).toBe("follow, index");
66
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
67
+ expect(response.body.data.metadata.ogDescription).toBe(
68
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
69
+ );
70
+ expect(response.body.data.metadata.ogUrl).toBe(
71
+ "https://www.roastmywebsite.ai",
72
+ );
73
+ expect(response.body.data.metadata.ogImage).toBe(
74
+ "https://www.roastmywebsite.ai/og.png",
75
+ );
76
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
77
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
78
+ expect(response.body.data.metadata.sourceURL).toBe(
79
+ "https://roastmywebsite.ai",
80
+ );
81
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
82
+ },
83
+ 30000,
84
+ ); // 30 seconds timeout
85
+
86
+ it.concurrent(
87
+ "should return a successful response with a valid API key and includeHtml set to true",
88
+ async () => {
89
+ const response: any = await request(TEST_URL)
90
+ .post("/v0/scrape")
91
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
92
+ .set("Content-Type", "application/json")
93
+ .send({
94
+ url: "https://roastmywebsite.ai",
95
+ pageOptions: { includeHtml: true },
96
+ });
97
+ expect(response.statusCode).toBe(200);
98
+ expect(response.body).toHaveProperty("data");
99
+ expect(response.body.data).toHaveProperty("content");
100
+ expect(response.body.data).toHaveProperty("markdown");
101
+ expect(response.body.data).toHaveProperty("html");
102
+ expect(response.body.data).toHaveProperty("metadata");
103
+ expect(response.body.data.content).toContain("_Roast_");
104
+ expect(response.body.data.markdown).toContain("_Roast_");
105
+ expect(response.body.data.html).toContain("<h1");
106
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
107
+ expect(response.body.data.metadata.pageError).toBeUndefined();
108
+ },
109
+ 30000,
110
+ ); // 30 seconds timeout
111
+
112
+ it.concurrent(
113
+ "should return a successful response for a valid scrape with PDF file",
114
+ async () => {
115
+ const response: any = await request(TEST_URL)
116
+ .post("/v0/scrape")
117
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
118
+ .set("Content-Type", "application/json")
119
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
120
+ await new Promise((r) => setTimeout(r, 6000));
121
+
122
+ expect(response.statusCode).toBe(200);
123
+ expect(response.body).toHaveProperty("data");
124
+ expect(response.body.data).toHaveProperty("content");
125
+ expect(response.body.data).toHaveProperty("metadata");
126
+ expect(response.body.data.content).toContain(
127
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
128
+ );
129
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
130
+ expect(response.body.data.metadata.pageError).toBeUndefined();
131
+ },
132
+ 60000,
133
+ ); // 60 seconds
134
+
135
+ it.concurrent(
136
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
137
+ async () => {
138
+ const response: any = await request(TEST_URL)
139
+ .post("/v0/scrape")
140
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
141
+ .set("Content-Type", "application/json")
142
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
143
+ await new Promise((r) => setTimeout(r, 6000));
144
+
145
+ expect(response.statusCode).toBe(200);
146
+ expect(response.body).toHaveProperty("data");
147
+ expect(response.body.data).toHaveProperty("content");
148
+ expect(response.body.data).toHaveProperty("metadata");
149
+ expect(response.body.data.content).toContain(
150
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy",
151
+ );
152
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
153
+ expect(response.body.data.metadata.pageError).toBeUndefined();
154
+ },
155
+ 60000,
156
+ ); // 60 seconds
157
+
158
+ it.concurrent(
159
+ "should return a successful response with a valid API key with removeTags option",
160
+ async () => {
161
+ const responseWithoutRemoveTags: any = await request(TEST_URL)
162
+ .post("/v0/scrape")
163
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
164
+ .set("Content-Type", "application/json")
165
+ .send({ url: "https://www.scrapethissite.com/" });
166
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
167
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
168
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
169
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
170
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
171
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
172
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
173
+ "Scrape This Site",
174
+ );
175
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
176
+ "Lessons and Videos",
177
+ ); // #footer
178
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
179
+ "[Sandbox](",
180
+ ); // .nav
181
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
182
+ "web scraping",
183
+ ); // strong
184
+
185
+ const response: any = await request(TEST_URL)
186
+ .post("/v0/scrape")
187
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
188
+ .set("Content-Type", "application/json")
189
+ .send({
190
+ url: "https://www.scrapethissite.com/",
191
+ pageOptions: { removeTags: [".nav", "#footer", "strong"] },
192
+ });
193
+ expect(response.statusCode).toBe(200);
194
+ expect(response.body).toHaveProperty("data");
195
+ expect(response.body.data).toHaveProperty("content");
196
+ expect(response.body.data).toHaveProperty("markdown");
197
+ expect(response.body.data).toHaveProperty("metadata");
198
+ expect(response.body.data).not.toHaveProperty("html");
199
+ expect(response.body.data.content).toContain("Scrape This Site");
200
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
201
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
202
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
203
+ },
204
+ 30000,
205
+ ); // 30 seconds timeout
206
+
207
+ it.concurrent(
208
+ "should return a successful response for a scrape with 400 page",
209
+ async () => {
210
+ const response: any = await request(TEST_URL)
211
+ .post("/v0/scrape")
212
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
213
+ .set("Content-Type", "application/json")
214
+ .send({ url: "https://httpstat.us/400" });
215
+ await new Promise((r) => setTimeout(r, 5000));
216
+
217
+ expect(response.statusCode).toBe(200);
218
+ expect(response.body).toHaveProperty("data");
219
+ expect(response.body.data).toHaveProperty("content");
220
+ expect(response.body.data).toHaveProperty("metadata");
221
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
222
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
223
+ "bad request",
224
+ );
225
+ },
226
+ 60000,
227
+ ); // 60 seconds
228
+
229
+ it.concurrent(
230
+ "should return a successful response for a scrape with 401 page",
231
+ async () => {
232
+ const response: any = await request(TEST_URL)
233
+ .post("/v0/scrape")
234
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
235
+ .set("Content-Type", "application/json")
236
+ .send({ url: "https://httpstat.us/401" });
237
+ await new Promise((r) => setTimeout(r, 5000));
238
+
239
+ expect(response.statusCode).toBe(200);
240
+ expect(response.body).toHaveProperty("data");
241
+ expect(response.body.data).toHaveProperty("content");
242
+ expect(response.body.data).toHaveProperty("metadata");
243
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
244
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
245
+ "unauthorized",
246
+ );
247
+ },
248
+ 60000,
249
+ ); // 60 seconds
250
+
251
+ it.concurrent(
252
+ "should return a successful response for a scrape with 403 page",
253
+ async () => {
254
+ const response: any = await request(TEST_URL)
255
+ .post("/v0/scrape")
256
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
257
+ .set("Content-Type", "application/json")
258
+ .send({ url: "https://httpstat.us/403" });
259
+
260
+ await new Promise((r) => setTimeout(r, 5000));
261
+ expect(response.statusCode).toBe(200);
262
+ expect(response.body).toHaveProperty("data");
263
+ expect(response.body.data).toHaveProperty("content");
264
+ expect(response.body.data).toHaveProperty("metadata");
265
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
266
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
267
+ "forbidden",
268
+ );
269
+ },
270
+ 60000,
271
+ ); // 60 seconds
272
+
273
+ it.concurrent(
274
+ "should return a successful response for a scrape with 404 page",
275
+ async () => {
276
+ const response: any = await request(TEST_URL)
277
+ .post("/v0/scrape")
278
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
279
+ .set("Content-Type", "application/json")
280
+ .send({ url: "https://httpstat.us/404" });
281
+ await new Promise((r) => setTimeout(r, 5000));
282
+
283
+ expect(response.statusCode).toBe(200);
284
+ expect(response.body).toHaveProperty("data");
285
+ expect(response.body.data).toHaveProperty("content");
286
+ expect(response.body.data).toHaveProperty("metadata");
287
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
288
+ },
289
+ 60000,
290
+ ); // 60 seconds
291
+
292
+ it.concurrent(
293
+ "should return a successful response for a scrape with 405 page",
294
+ async () => {
295
+ const response = await request(TEST_URL)
296
+ .post("/v0/scrape")
297
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
298
+ .set("Content-Type", "application/json")
299
+ .send({ url: "https://httpstat.us/405" });
300
+ await new Promise((r) => setTimeout(r, 5000));
301
+
302
+ expect(response.statusCode).toBe(200);
303
+ expect(response.body).toHaveProperty("data");
304
+ expect(response.body.data).toHaveProperty("content");
305
+ expect(response.body.data).toHaveProperty("metadata");
306
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
307
+ },
308
+ 60000,
309
+ ); // 60 seconds
310
+
311
+ it.concurrent(
312
+ "should return a successful response for a scrape with 500 page",
313
+ async () => {
314
+ const response: any = await request(TEST_URL)
315
+ .post("/v0/scrape")
316
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
317
+ .set("Content-Type", "application/json")
318
+ .send({ url: "https://httpstat.us/500" });
319
+ await new Promise((r) => setTimeout(r, 5000));
320
+
321
+ expect(response.statusCode).toBe(200);
322
+ expect(response.body).toHaveProperty("data");
323
+ expect(response.body.data).toHaveProperty("content");
324
+ expect(response.body.data).toHaveProperty("metadata");
325
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
326
+ },
327
+ 60000,
328
+ ); // 60 seconds
329
+ });
330
+
331
+ describe("POST /v0/crawl", () => {
332
+ it.concurrent("should require authorization", async () => {
333
+ const response: any = await request(TEST_URL).post("/v0/crawl");
334
+ expect(response.statusCode).toBe(401);
335
+ });
336
+
337
+ it.concurrent(
338
+ "should return an error response with an invalid API key",
339
+ async () => {
340
+ const response: any = await request(TEST_URL)
341
+ .post("/v0/crawl")
342
+ .set("Authorization", `Bearer invalid-api-key`)
343
+ .set("Content-Type", "application/json")
344
+ .send({ url: "https://firecrawl.dev" });
345
+ expect(response.statusCode).toBe(401);
346
+ },
347
+ );
348
+
349
+ it.concurrent(
350
+ "should return a successful response with a valid API key for crawl",
351
+ async () => {
352
+ const response: any = await request(TEST_URL)
353
+ .post("/v0/crawl")
354
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
355
+ .set("Content-Type", "application/json")
356
+ .send({ url: "https://firecrawl.dev" });
357
+ expect(response.statusCode).toBe(200);
358
+ expect(response.body).toHaveProperty("jobId");
359
+ expect(response.body.jobId).toMatch(
360
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
361
+ );
362
+ },
363
+ );
364
+
365
+ it.concurrent(
366
+ "should return a successful response with a valid API key and valid includes option",
367
+ async () => {
368
+ const crawlResponse: any = await request(TEST_URL)
369
+ .post("/v0/crawl")
370
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
371
+ .set("Content-Type", "application/json")
372
+ .send({
373
+ url: "https://mendable.ai",
374
+ limit: 10,
375
+ crawlerOptions: {
376
+ includes: ["blog/*"],
377
+ },
378
+ });
379
+
380
+ let response: any;
381
+ let isFinished = false;
382
+
383
+ while (!isFinished) {
384
+ response = await request(TEST_URL)
385
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
386
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
387
+
388
+ expect(response.statusCode).toBe(200);
389
+ expect(response.body).toHaveProperty("status");
390
+ isFinished = response.body.status === "completed";
391
+
392
+ if (!isFinished) {
393
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
394
+ }
395
+ }
396
+
397
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
398
+ const completedResponse = await request(TEST_URL)
399
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
400
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
401
+
402
+ const urls = completedResponse.body.data.map(
403
+ (item: any) => item.metadata?.sourceURL,
404
+ );
405
+ expect(urls.length).toBeGreaterThan(5);
406
+ urls.forEach((url: string) => {
407
+ expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
408
+ });
409
+
410
+ expect(completedResponse.statusCode).toBe(200);
411
+ expect(completedResponse.body).toHaveProperty("status");
412
+ expect(completedResponse.body.status).toBe("completed");
413
+ expect(completedResponse.body).toHaveProperty("data");
414
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
415
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
416
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
417
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
418
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
419
+ 200,
420
+ );
421
+ expect(
422
+ completedResponse.body.data[0].metadata.pageError,
423
+ ).toBeUndefined();
424
+ },
425
+ 180000,
426
+ ); // 180 seconds
427
+
428
+ it.concurrent(
429
+ "should return a successful response with a valid API key and valid excludes option",
430
+ async () => {
431
+ const crawlResponse: any = await request(TEST_URL)
432
+ .post("/v0/crawl")
433
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
434
+ .set("Content-Type", "application/json")
435
+ .send({
436
+ url: "https://mendable.ai",
437
+ limit: 10,
438
+ crawlerOptions: {
439
+ excludes: ["blog/*"],
440
+ },
441
+ });
442
+
443
+ let isFinished = false;
444
+ let response: any;
445
+
446
+ while (!isFinished) {
447
+ response = await request(TEST_URL)
448
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
449
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
450
+
451
+ expect(response.statusCode).toBe(200);
452
+ expect(response.body).toHaveProperty("status");
453
+ isFinished = response.body.status === "completed";
454
+
455
+ if (!isFinished) {
456
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
457
+ }
458
+ }
459
+
460
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
461
+ const completedResponse: any = await request(TEST_URL)
462
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
463
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
464
+
465
+ const urls = completedResponse.body.data.map(
466
+ (item: any) => item.metadata?.sourceURL,
467
+ );
468
+ expect(urls.length).toBeGreaterThan(5);
469
+ urls.forEach((url: string) => {
470
+ expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
471
+ });
472
+ },
473
+ 90000,
474
+ ); // 90 seconds
475
+
476
+ it.concurrent(
477
+ "should return a successful response with max depth option for a valid crawl job",
478
+ async () => {
479
+ const crawlResponse: any = await request(TEST_URL)
480
+ .post("/v0/crawl")
481
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
482
+ .set("Content-Type", "application/json")
483
+ .send({
484
+ url: "https://www.scrapethissite.com",
485
+ crawlerOptions: { maxDepth: 1 },
486
+ });
487
+ expect(crawlResponse.statusCode).toBe(200);
488
+
489
+ const response: any = await request(TEST_URL)
490
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
491
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
492
+ expect(response.statusCode).toBe(200);
493
+ expect(response.body).toHaveProperty("status");
494
+ expect(["active", "waiting"]).toContain(response.body.status);
495
+ // wait for 60 seconds
496
+ let isCompleted = false;
497
+ while (!isCompleted) {
498
+ const statusCheckResponse = await request(TEST_URL)
499
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
500
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
501
+ expect(statusCheckResponse.statusCode).toBe(200);
502
+ isCompleted = statusCheckResponse.body.status === "completed";
503
+ if (!isCompleted) {
504
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
505
+ }
506
+ }
507
+ const completedResponse: any = await request(TEST_URL)
508
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
509
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
510
+
511
+ expect(completedResponse.statusCode).toBe(200);
512
+ expect(completedResponse.body).toHaveProperty("status");
513
+ expect(completedResponse.body.status).toBe("completed");
514
+ expect(completedResponse.body).toHaveProperty("data");
515
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
516
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
517
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
518
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
519
+ 200,
520
+ );
521
+ expect(
522
+ completedResponse.body.data[0].metadata.pageError,
523
+ ).toBeUndefined();
524
+ const urls = completedResponse.body.data.map(
525
+ (item: any) => item.metadata?.sourceURL,
526
+ );
527
+ expect(urls.length).toBeGreaterThan(1);
528
+
529
+ // Check if all URLs have a maximum depth of 1
530
+ urls.forEach((url: string) => {
531
+ const pathSplits = new URL(url).pathname.split("/");
532
+ const depth =
533
+ pathSplits.length -
534
+ (pathSplits[0].length === 0 &&
535
+ pathSplits[pathSplits.length - 1].length === 0
536
+ ? 1
537
+ : 0);
538
+ expect(depth).toBeLessThanOrEqual(2);
539
+ });
540
+ },
541
+ 180000,
542
+ );
543
+ });
544
+
545
+ describe("POST /v0/crawlWebsitePreview", () => {
546
+ it.concurrent("should require authorization", async () => {
547
+ const response: any = await request(TEST_URL).post(
548
+ "/v0/crawlWebsitePreview",
549
+ );
550
+ expect(response.statusCode).toBe(401);
551
+ });
552
+
553
+ it.concurrent(
554
+ "should return an error response with an invalid API key",
555
+ async () => {
556
+ const response: any = await request(TEST_URL)
557
+ .post("/v0/crawlWebsitePreview")
558
+ .set("Authorization", `Bearer invalid-api-key`)
559
+ .set("Content-Type", "application/json")
560
+ .send({ url: "https://firecrawl.dev" });
561
+ expect(response.statusCode).toBe(401);
562
+ },
563
+ );
564
+
565
+ it.concurrent(
566
+ "should return a timeout error when scraping takes longer than the specified timeout",
567
+ async () => {
568
+ const response: any = await request(TEST_URL)
569
+ .post("/v0/scrape")
570
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
571
+ .set("Content-Type", "application/json")
572
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
573
+
574
+ expect(response.statusCode).toBe(408);
575
+ },
576
+ 3000,
577
+ );
578
+ });
579
+
580
+ describe("POST /v0/search", () => {
581
+ it.concurrent("should require authorization", async () => {
582
+ const response = await request(TEST_URL).post("/v0/search");
583
+ expect(response.statusCode).toBe(401);
584
+ });
585
+
586
+ it.concurrent(
587
+ "should return an error response with an invalid API key",
588
+ async () => {
589
+ const response = await request(TEST_URL)
590
+ .post("/v0/search")
591
+ .set("Authorization", `Bearer invalid-api-key`)
592
+ .set("Content-Type", "application/json")
593
+ .send({ query: "test" });
594
+ expect(response.statusCode).toBe(401);
595
+ },
596
+ );
597
+
598
+ it.concurrent(
599
+ "should return a successful response with a valid API key for search",
600
+ async () => {
601
+ const response = await request(TEST_URL)
602
+ .post("/v0/search")
603
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
604
+ .set("Content-Type", "application/json")
605
+ .send({ query: "test" });
606
+ expect(response.statusCode).toBe(200);
607
+ expect(response.body).toHaveProperty("success");
608
+ expect(response.body.success).toBe(true);
609
+ expect(response.body).toHaveProperty("data");
610
+ },
611
+ 60000,
612
+ ); // 60 seconds timeout
613
+ });
614
+
615
+ describe("GET /v0/crawl/status/:jobId", () => {
616
+ it.concurrent("should require authorization", async () => {
617
+ const response = await request(TEST_URL).get("/v0/crawl/status/123");
618
+ expect(response.statusCode).toBe(401);
619
+ });
620
+
621
+ it.concurrent(
622
+ "should return an error response with an invalid API key",
623
+ async () => {
624
+ const response = await request(TEST_URL)
625
+ .get("/v0/crawl/status/123")
626
+ .set("Authorization", `Bearer invalid-api-key`);
627
+ expect(response.statusCode).toBe(401);
628
+ },
629
+ );
630
+
631
+ it.concurrent(
632
+ "should return Job not found for invalid job ID",
633
+ async () => {
634
+ const response = await request(TEST_URL)
635
+ .get("/v0/crawl/status/invalidJobId")
636
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
637
+ expect(response.statusCode).toBe(404);
638
+ },
639
+ );
640
+
641
+ it.concurrent(
642
+ "should return a successful crawl status response for a valid crawl job",
643
+ async () => {
644
+ const crawlResponse = await request(TEST_URL)
645
+ .post("/v0/crawl")
646
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
647
+ .set("Content-Type", "application/json")
648
+ .send({ url: "https://firecrawl.dev/blog" });
649
+ expect(crawlResponse.statusCode).toBe(200);
650
+
651
+ let isCompleted = false;
652
+
653
+ while (!isCompleted) {
654
+ const response = await request(TEST_URL)
655
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
656
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
657
+ expect(response.statusCode).toBe(200);
658
+ expect(response.body).toHaveProperty("status");
659
+
660
+ if (response.body.status === "completed") {
661
+ isCompleted = true;
662
+ } else {
663
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
664
+ }
665
+ }
666
+
667
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
668
+ const completedResponse = await request(TEST_URL)
669
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
670
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
671
+
672
+ expect(completedResponse.body).toHaveProperty("status");
673
+ expect(completedResponse.body.status).toBe("completed");
674
+ expect(completedResponse.body).toHaveProperty("data");
675
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
676
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
677
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
678
+ expect(completedResponse.body.data[0].content).toContain("Firecrawl");
679
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
680
+ 200,
681
+ );
682
+ expect(
683
+ completedResponse.body.data[0].metadata.pageError,
684
+ ).toBeUndefined();
685
+
686
+ const childrenLinks = completedResponse.body.data.filter(
687
+ (doc) =>
688
+ doc.metadata &&
689
+ doc.metadata.sourceURL &&
690
+ doc.metadata.sourceURL.includes("firecrawl.dev/blog"),
691
+ );
692
+
693
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
694
+ },
695
+ 180000,
696
+ ); // 120 seconds
697
+
698
+ // TODO: review the test below
699
+ // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
700
+ // const crawlResponse = await request(TEST_URL)
701
+ // .post('/v0/crawl')
702
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
703
+ // .set('Content-Type', 'application/json')
704
+ // .send({ url: 'https://arxiv.org/list/astro-ph/1993-01',
705
+ // crawlerOptions: {
706
+ // limit: 10,
707
+ // returnOnlyUrls: true
708
+ // }});
709
+ // expect(crawlResponse.statusCode).toBe(200);
710
+
711
+ // let isCompleted = false;
712
+ // let completedResponse;
713
+
714
+ // while (!isCompleted) {
715
+ // const response = await request(TEST_URL)
716
+ // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
717
+ // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
718
+ // expect(response.statusCode).toBe(200);
719
+ // expect(response.body).toHaveProperty('status');
720
+
721
+ // if (response.body.status === 'completed') {
722
+ // isCompleted = true;
723
+ // completedResponse = response;
724
+ // } else {
725
+ // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
726
+ // }
727
+ // }
728
+ // expect(completedResponse.body.status).toBe('completed');
729
+ // expect(completedResponse.body).toHaveProperty('data');
730
+ // expect(completedResponse.body.data.length).toEqual(1);
731
+ // expect(completedResponse.body.data).toEqual(
732
+ // expect.arrayContaining([
733
+ // expect.objectContaining({
734
+ // content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
735
+ // })
736
+ // ])
737
+ // );
738
+
739
+ // expect(completedResponse.body.data[0]).toHaveProperty("metadata");
740
+ // expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
741
+ // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
742
+ // }, 180000); // 120 seconds
743
+
744
+ it.concurrent(
745
+ "If someone cancels a crawl job, it should turn into failed status",
746
+ async () => {
747
+ const crawlResponse = await request(TEST_URL)
748
+ .post("/v0/crawl")
749
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
750
+ .set("Content-Type", "application/json")
751
+ .send({
752
+ url: "https://docs.tatum.io",
753
+ crawlerOptions: { limit: 200 },
754
+ });
755
+
756
+ expect(crawlResponse.statusCode).toBe(200);
757
+
758
+ await new Promise((r) => setTimeout(r, 10000));
759
+
760
+ const responseCancel = await request(TEST_URL)
761
+ .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
762
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
763
+ expect(responseCancel.statusCode).toBe(200);
764
+ expect(responseCancel.body).toHaveProperty("status");
765
+ expect(responseCancel.body.status).toBe("cancelled");
766
+
767
+ await new Promise((r) => setTimeout(r, 10000));
768
+ const completedResponse = await request(TEST_URL)
769
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
770
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
771
+ .maxResponseSize(4000000000);
772
+
773
+ expect(completedResponse.statusCode).toBe(200);
774
+ expect(completedResponse.body).toHaveProperty("status");
775
+ expect(completedResponse.body.status).toBe("failed");
776
+ expect(completedResponse.body).toHaveProperty("data");
777
+
778
+ let isNullOrEmptyArray = false;
779
+ if (
780
+ completedResponse.body.data === null ||
781
+ completedResponse.body.data.length === 0
782
+ ) {
783
+ isNullOrEmptyArray = true;
784
+ }
785
+ expect(isNullOrEmptyArray).toBe(true);
786
+ expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
787
+ expect(completedResponse.body).toHaveProperty("partial_data");
788
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
789
+ "content",
790
+ );
791
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
792
+ "markdown",
793
+ );
794
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
795
+ "metadata",
796
+ );
797
+ expect(
798
+ completedResponse.body.partial_data[0].metadata.pageStatusCode,
799
+ ).toBe(200);
800
+ expect(
801
+ completedResponse.body.partial_data[0].metadata.pageError,
802
+ ).toBeUndefined();
803
+ },
804
+ 60000,
805
+ ); // 60 seconds
806
+ });
807
+
808
+ describe("POST /v0/scrape with LLM Extraction", () => {
809
+ it.concurrent(
810
+ "should extract data using LLM extraction mode",
811
+ async () => {
812
+ const response = await request(TEST_URL)
813
+ .post("/v0/scrape")
814
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
815
+ .set("Content-Type", "application/json")
816
+ .send({
817
+ url: "https://mendable.ai",
818
+ pageOptions: {
819
+ onlyMainContent: true,
820
+ },
821
+ extractorOptions: {
822
+ mode: "llm-extraction",
823
+ extractionPrompt:
824
+ "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
825
+ extractionSchema: {
826
+ type: "object",
827
+ properties: {
828
+ company_mission: {
829
+ type: "string",
830
+ },
831
+ supports_sso: {
832
+ type: "boolean",
833
+ },
834
+ is_open_source: {
835
+ type: "boolean",
836
+ },
837
+ },
838
+ required: ["company_mission", "supports_sso", "is_open_source"],
839
+ },
840
+ },
841
+ });
842
+
843
+ // Ensure that the job was successfully created before proceeding with LLM extraction
844
+ expect(response.statusCode).toBe(200);
845
+
846
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
847
+ let llmExtraction = response.body.data.llm_extraction;
848
+
849
+ // Check if the llm_extraction object has the required properties with correct types and values
850
+ expect(llmExtraction).toHaveProperty("company_mission");
851
+ expect(typeof llmExtraction.company_mission).toBe("string");
852
+ expect(llmExtraction).toHaveProperty("supports_sso");
853
+ expect(llmExtraction.supports_sso).toBe(true);
854
+ expect(typeof llmExtraction.supports_sso).toBe("boolean");
855
+ expect(llmExtraction).toHaveProperty("is_open_source");
856
+ expect(llmExtraction.is_open_source).toBe(false);
857
+ expect(typeof llmExtraction.is_open_source).toBe("boolean");
858
+ },
859
+ 60000,
860
+ ); // 60 secs
861
+ });
862
+ });
src/__tests__/queue-concurrency-integration.test.ts ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { redisConnection } from "../services/queue-service";
2
+ import { addScrapeJob, addScrapeJobs } from "../services/queue-jobs";
3
+ import {
4
+ cleanOldConcurrencyLimitEntries,
5
+ pushConcurrencyLimitActiveJob,
6
+ takeConcurrencyLimitedJob,
7
+ removeConcurrencyLimitActiveJob,
8
+ } from "../lib/concurrency-limit";
9
+ import { WebScraperOptions } from "../types";
10
+ import { getACUCTeam } from "../controllers/auth";
11
+
12
+ // Mock all the dependencies
13
+ const mockAdd = jest.fn();
14
+ jest.mock("../services/queue-service", () => ({
15
+ redisConnection: {
16
+ zremrangebyscore: jest.fn(),
17
+ zrangebyscore: jest.fn(),
18
+ zadd: jest.fn(),
19
+ zrem: jest.fn(),
20
+ zmpop: jest.fn(),
21
+ zcard: jest.fn(),
22
+ smembers: jest.fn(),
23
+ },
24
+ getScrapeQueue: jest.fn(() => ({
25
+ add: mockAdd,
26
+ })),
27
+ }));
28
+
29
+ jest.mock("uuid", () => ({
30
+ v4: jest.fn(() => "mock-uuid"),
31
+ }));
32
+
33
+ describe("Queue Concurrency Integration", () => {
34
+ const mockTeamId = "test-team-id";
35
+ const mockNow = Date.now();
36
+
37
+ const defaultScrapeOptions = {
38
+ formats: ["markdown"] as (
39
+ | "markdown"
40
+ | "html"
41
+ | "rawHtml"
42
+ | "links"
43
+ | "screenshot"
44
+ | "screenshot@fullPage"
45
+ | "extract"
46
+ | "json"
47
+ )[],
48
+ onlyMainContent: true,
49
+ waitFor: 0,
50
+ mobile: false,
51
+ parsePDF: false,
52
+ timeout: 30000,
53
+ extract: {
54
+ mode: "llm" as const,
55
+ systemPrompt: "test",
56
+ schema: {},
57
+ },
58
+ extractOptions: { mode: "llm" as const, systemPrompt: "test" },
59
+ javascript: true,
60
+ headers: {},
61
+ cookies: [],
62
+ blockResources: true,
63
+ skipTlsVerification: false,
64
+ removeBase64Images: true,
65
+ fastMode: false,
66
+ blockAds: true,
67
+ };
68
+
69
+ beforeEach(() => {
70
+ jest.clearAllMocks();
71
+ jest.spyOn(Date, "now").mockImplementation(() => mockNow);
72
+ });
73
+
74
+ describe("Single Job Addition", () => {
75
+ const mockWebScraperOptions: WebScraperOptions = {
76
+ url: "https://test.com",
77
+ mode: "single_urls",
78
+ team_id: mockTeamId,
79
+ scrapeOptions: defaultScrapeOptions,
80
+ crawlerOptions: null,
81
+ };
82
+
83
+ it("should add job directly to BullMQ when under concurrency limit", async () => {
84
+ // Mock current active jobs to be under limit
85
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
86
+
87
+ await addScrapeJob(mockWebScraperOptions);
88
+
89
+ // Should have checked concurrency
90
+ expect(redisConnection.zrangebyscore).toHaveBeenCalled();
91
+
92
+ // Should have added to BullMQ
93
+ expect(mockAdd).toHaveBeenCalled();
94
+
95
+ // Should have added to active jobs
96
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
97
+ expect.stringContaining("concurrency-limiter"),
98
+ expect.any(Number),
99
+ expect.any(String),
100
+ );
101
+ });
102
+
103
+ it("should add job to concurrency queue when at concurrency limit", async () => {
104
+ // Mock current active jobs to be at limit
105
+ (getACUCTeam as jest.Mock).mockResolvedValue({
106
+ concurrency: 15,
107
+ } as any);
108
+ const activeJobs = Array(15).fill("active-job");
109
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue(
110
+ activeJobs,
111
+ );
112
+
113
+ await addScrapeJob(mockWebScraperOptions);
114
+
115
+ // Should have checked concurrency
116
+ expect(redisConnection.zrangebyscore).toHaveBeenCalled();
117
+
118
+ // Should NOT have added to BullMQ
119
+ expect(mockAdd).not.toHaveBeenCalled();
120
+
121
+ // Should have added to concurrency queue
122
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
123
+ expect.stringContaining("concurrency-limit-queue"),
124
+ expect.any(Number),
125
+ expect.stringContaining("mock-uuid"),
126
+ );
127
+ });
128
+ });
129
+
130
+ describe("Batch Job Addition", () => {
131
+ const createMockJobs = (count: number) =>
132
+ Array(count)
133
+ .fill(null)
134
+ .map((_, i) => ({
135
+ data: {
136
+ url: `https://test${i}.com`,
137
+ mode: "single_urls",
138
+ team_id: mockTeamId,
139
+ scrapeOptions: defaultScrapeOptions,
140
+ } as WebScraperOptions,
141
+ opts: {
142
+ jobId: `job-${i}`,
143
+ priority: 1,
144
+ },
145
+ }));
146
+
147
+ it("should handle batch jobs respecting concurrency limits", async () => {
148
+ const maxConcurrency = 15;
149
+ (getACUCTeam as jest.Mock).mockResolvedValue({
150
+ concurrency: maxConcurrency,
151
+ } as any);
152
+ const totalJobs = maxConcurrency + 5; // Some jobs should go to queue
153
+ const mockJobs = createMockJobs(totalJobs);
154
+
155
+ // Mock current active jobs to be empty
156
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValue([]);
157
+
158
+ await addScrapeJobs(mockJobs);
159
+
160
+ // Should have added maxConcurrency jobs to BullMQ
161
+ expect(mockAdd).toHaveBeenCalledTimes(maxConcurrency);
162
+
163
+ // Should have added remaining jobs to concurrency queue
164
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
165
+ expect.stringContaining("concurrency-limit-queue"),
166
+ expect.any(Number),
167
+ expect.any(String),
168
+ );
169
+ });
170
+
171
+ it("should handle empty job array", async () => {
172
+ const result = await addScrapeJobs([]);
173
+ expect(result).toBe(true);
174
+ expect(mockAdd).not.toHaveBeenCalled();
175
+ expect(redisConnection.zadd).not.toHaveBeenCalled();
176
+ });
177
+ });
178
+
179
+ describe("Queue Worker Integration", () => {
180
+ it("should process next queued job when active job completes", async () => {
181
+ const mockJob = {
182
+ id: "test-job",
183
+ data: {
184
+ team_id: mockTeamId,
185
+ },
186
+ };
187
+
188
+ // Mock a queued job
189
+ const queuedJob = {
190
+ id: "queued-job",
191
+ data: { test: "data" },
192
+ opts: {},
193
+ };
194
+ (redisConnection.zmpop as jest.Mock).mockResolvedValueOnce([
195
+ "key",
196
+ [[JSON.stringify(queuedJob)]],
197
+ ]);
198
+
199
+ // Simulate job completion in worker
200
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
201
+ await cleanOldConcurrencyLimitEntries(mockTeamId);
202
+
203
+ const nextJob = await takeConcurrencyLimitedJob(mockTeamId);
204
+
205
+ // Should have taken next job from queue
206
+ expect(nextJob).toEqual(queuedJob);
207
+
208
+ // Should have added new job to active jobs
209
+ await pushConcurrencyLimitActiveJob(mockTeamId, nextJob!.id, 2 * 60 * 1000);
210
+ expect(redisConnection.zadd).toHaveBeenCalledWith(
211
+ expect.stringContaining("concurrency-limiter"),
212
+ expect.any(Number),
213
+ nextJob!.id,
214
+ );
215
+ });
216
+
217
+ it("should handle job failure and cleanup", async () => {
218
+ const mockJob = {
219
+ id: "failing-job",
220
+ data: {
221
+ team_id: mockTeamId,
222
+ },
223
+ };
224
+
225
+ // Add job to active jobs
226
+ await pushConcurrencyLimitActiveJob(mockTeamId, mockJob.id, 2 * 60 * 1000);
227
+
228
+ // Simulate job failure and cleanup
229
+ await removeConcurrencyLimitActiveJob(mockTeamId, mockJob.id);
230
+ await cleanOldConcurrencyLimitEntries(mockTeamId);
231
+
232
+ // Verify job was removed from active jobs
233
+ expect(redisConnection.zrem).toHaveBeenCalledWith(
234
+ expect.stringContaining("concurrency-limiter"),
235
+ mockJob.id,
236
+ );
237
+ });
238
+ });
239
+
240
+ describe("Edge Cases", () => {
241
+ it("should handle stalled jobs cleanup", async () => {
242
+ const stalledTime = mockNow - 3 * 60 * 1000; // 3 minutes ago
243
+
244
+ // Mock stalled jobs in Redis
245
+ (redisConnection.zrangebyscore as jest.Mock).mockResolvedValueOnce([
246
+ "stalled-job",
247
+ ]);
248
+
249
+ await cleanOldConcurrencyLimitEntries(mockTeamId, mockNow);
250
+
251
+ // Should have cleaned up stalled jobs
252
+ expect(redisConnection.zremrangebyscore).toHaveBeenCalledWith(
253
+ expect.stringContaining("concurrency-limiter"),
254
+ -Infinity,
255
+ mockNow,
256
+ );
257
+ });
258
+
259
+ it("should handle race conditions in job queue processing", async () => {
260
+ // Mock a race condition where job is taken by another worker
261
+ (redisConnection.zmpop as jest.Mock).mockResolvedValueOnce(null);
262
+
263
+ const nextJob = await takeConcurrencyLimitedJob(mockTeamId);
264
+
265
+ // Should handle gracefully when no job is available
266
+ expect(nextJob).toBeNull();
267
+ });
268
+ });
269
+ });
src/__tests__/snips/batch-scrape.test.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { batchScrape } from "./lib";
2
+
3
+ describe("Batch scrape tests", () => {
4
+ it.concurrent("works", async () => {
5
+ const response = await batchScrape({
6
+ urls: ["http://firecrawl.dev"]
7
+ });
8
+
9
+ expect(response.body.data[0]).toHaveProperty("markdown");
10
+ expect(response.body.data[0].markdown).toContain("Firecrawl");
11
+ }, 180000);
12
+
13
+ if (!process.env.TEST_SUITE_SELF_HOSTED) {
14
+ describe("JSON format", () => {
15
+ it.concurrent("works", async () => {
16
+ const response = await batchScrape({
17
+ urls: ["http://firecrawl.dev"],
18
+ formats: ["json"],
19
+ jsonOptions: {
20
+ prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
21
+ schema: {
22
+ type: "object",
23
+ properties: {
24
+ company_mission: {
25
+ type: "string",
26
+ },
27
+ supports_sso: {
28
+ type: "boolean",
29
+ },
30
+ is_open_source: {
31
+ type: "boolean",
32
+ },
33
+ },
34
+ required: ["company_mission", "supports_sso", "is_open_source"],
35
+ },
36
+ },
37
+ });
38
+
39
+ expect(response.body.data[0]).toHaveProperty("json");
40
+ expect(response.body.data[0].json).toHaveProperty("company_mission");
41
+ expect(typeof response.body.data[0].json.company_mission).toBe("string");
42
+ expect(response.body.data[0].json).toHaveProperty("supports_sso");
43
+ expect(response.body.data[0].json.supports_sso).toBe(false);
44
+ expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
45
+ expect(response.body.data[0].json).toHaveProperty("is_open_source");
46
+ expect(response.body.data[0].json.is_open_source).toBe(true);
47
+ expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
48
+ }, 180000);
49
+ });
50
+ }
51
+ });
src/__tests__/snips/billing.test.ts ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib";
2
+
3
+ // const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms));
4
+ // const sleepForBatchBilling = () => sleep(20000);
5
+
6
+ // beforeAll(async () => {
7
+ // // Wait for previous test runs to stop billing processing
8
+ // if (!process.env.TEST_SUITE_SELF_HOSTED) {
9
+ // await sleep(40000);
10
+ // }
11
+ // }, 50000);
12
+
13
+ // describe("Billing tests", () => {
14
+ // if (process.env.TEST_SUITE_SELF_HOSTED) {
15
+ // it("dummy", () => {
16
+ // expect(true).toBe(true);
17
+ // });
18
+ // } else {
19
+ // it("bills scrape correctly", async () => {
20
+ // const rc1 = (await creditUsage()).remaining_credits;
21
+
22
+ // // Run all scrape operations in parallel with Promise.all
23
+ // await Promise.all([
24
+ // // scrape 1: regular fc.dev scrape (1 credit)
25
+ // scrape({
26
+ // url: "https://firecrawl.dev"
27
+ // }),
28
+
29
+ // // scrape 1.1: regular fc.dev scrape (1 credit)
30
+ // scrape({
31
+ // url: "https://firecrawl.dev"
32
+ // }),
33
+
34
+ // // scrape 2: fc.dev with json (5 credits)
35
+ // scrape({
36
+ // url: "https://firecrawl.dev",
37
+ // formats: ["json"],
38
+ // jsonOptions: {
39
+ // schema: {
40
+ // type: "object",
41
+ // properties: {
42
+ // is_open_source: { type: "boolean" },
43
+ // },
44
+ // required: ["is_open_source"],
45
+ // },
46
+ // },
47
+ // })
48
+ // ]);
49
+
50
+ // // sum: 7 credits
51
+
52
+ // await sleepForBatchBilling();
53
+
54
+ // const rc2 = (await creditUsage()).remaining_credits;
55
+
56
+ // expect(rc1 - rc2).toBe(7);
57
+ // }, 120000);
58
+
59
+ // it("bills batch scrape correctly", async () => {
60
+ // const rc1 = (await creditUsage()).remaining_credits;
61
+
62
+ // // Run both scrape operations in parallel with Promise.all
63
+ // const [scrape1, scrape2] = await Promise.all([
64
+ // // scrape 1: regular batch scrape with failing domain (2 credits)
65
+ // batchScrape({
66
+ // urls: [
67
+ // "https://firecrawl.dev",
68
+ // "https://mendable.ai",
69
+ // "https://thisdomaindoesnotexistandwillfail.fcr",
70
+ // ],
71
+ // }),
72
+
73
+ // // scrape 2: batch scrape with json (10 credits)
74
+ // batchScrape({
75
+ // urls: [
76
+ // "https://firecrawl.dev",
77
+ // "https://mendable.ai",
78
+ // "https://thisdomaindoesnotexistandwillfail.fcr",
79
+ // ],
80
+ // formats: ["json"],
81
+ // jsonOptions: {
82
+ // schema: {
83
+ // type: "object",
84
+ // properties: {
85
+ // four_word_summary: { type: "string" },
86
+ // },
87
+ // required: ["four_word_summary"],
88
+ // },
89
+ // },
90
+ // })
91
+ // ]);
92
+
93
+ // // sum: 12 credits
94
+
95
+ // await sleepForBatchBilling();
96
+
97
+ // const rc2 = (await creditUsage()).remaining_credits;
98
+
99
+ // expect(rc1 - rc2).toBe(12);
100
+ // }, 600000);
101
+
102
+ // it("bills crawl correctly", async () => {
103
+ // const rc1 = (await creditUsage()).remaining_credits;
104
+
105
+ // // Run both crawl operations in parallel with Promise.all
106
+ // const [crawl1, crawl2] = await Promise.all([
107
+ // // crawl 1: regular fc.dev crawl (x credits)
108
+ // crawl({
109
+ // url: "https://firecrawl.dev",
110
+ // }),
111
+
112
+ // // crawl 2: fc.dev crawl with json (5y credits)
113
+ // crawl({
114
+ // url: "https://firecrawl.dev",
115
+ // scrapeOptions: {
116
+ // formats: ["json"],
117
+ // jsonOptions: {
118
+ // schema: {
119
+ // type: "object",
120
+ // properties: {
121
+ // four_word_summary: { type: "string" },
122
+ // },
123
+ // required: ["four_word_summary"],
124
+ // },
125
+ // },
126
+ // }
127
+ // })
128
+ // ]);
129
+
130
+ // expect(crawl1.success).toBe(true);
131
+ // expect(crawl2.success).toBe(true);
132
+
133
+ // // sum: x+5y credits
134
+
135
+ // await sleepForBatchBilling();
136
+
137
+ // const rc2 = (await creditUsage()).remaining_credits;
138
+
139
+ // if (crawl1.success && crawl2.success) {
140
+ // expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
141
+ // }
142
+ // }, 600000);
143
+
144
+ // it("bills map correctly", async () => {
145
+ // const rc1 = (await creditUsage()).remaining_credits;
146
+ // await map({ url: "https://firecrawl.dev" });
147
+ // await sleepForBatchBilling();
148
+ // const rc2 = (await creditUsage()).remaining_credits;
149
+ // expect(rc1 - rc2).toBe(1);
150
+ // }, 60000);
151
+
152
+ // it("bills search correctly", async () => {
153
+ // const rc1 = (await creditUsage()).remaining_credits;
154
+
155
+ // const results = await search({
156
+ // query: "firecrawl"
157
+ // });
158
+
159
+ // await sleepForBatchBilling();
160
+
161
+ // const rc2 = (await creditUsage()).remaining_credits;
162
+
163
+ // expect(rc1 - rc2).toBe(results.length);
164
+ // }, 60000);
165
+
166
+ // it("bills extract correctly", async () => {
167
+ // const rc1 = (await tokenUsage()).remaining_tokens;
168
+
169
+ // await extract({
170
+ // urls: ["https://firecrawl.dev"],
171
+ // schema: {
172
+ // "type": "object",
173
+ // "properties": {
174
+ // "is_open_source": {
175
+ // "type": "boolean"
176
+ // }
177
+ // },
178
+ // "required": [
179
+ // "is_open_source"
180
+ // ]
181
+ // },
182
+ // origin: "api-sdk",
183
+ // });
184
+
185
+ // await sleepForBatchBilling();
186
+
187
+ // const rc2 = (await tokenUsage()).remaining_tokens;
188
+
189
+ // expect(rc1 - rc2).toBe(305);
190
+ // }, 300000);
191
+ // }
192
+ // });
193
+
194
+ // temporarily disabled
195
+ it("is mocked", () => {
196
+ expect(true).toBe(true);
197
+ });
src/__tests__/snips/crawl.test.ts ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { crawl } from "./lib";
2
+
3
+ describe("Crawl tests", () => {
4
+ it.concurrent("works", async () => {
5
+ await crawl({
6
+ url: "https://firecrawl.dev",
7
+ limit: 10,
8
+ });
9
+ }, 120000);
10
+
11
+ it.concurrent("filters URLs properly", async () => {
12
+ const res = await crawl({
13
+ url: "https://firecrawl.dev/pricing",
14
+ includePaths: ["^/pricing$"],
15
+ limit: 10,
16
+ });
17
+
18
+ expect(res.success).toBe(true);
19
+ if (res.success) {
20
+ expect(res.completed).toBe(1);
21
+ expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
22
+ }
23
+ }, 120000);
24
+
25
+ it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
26
+ const res = await crawl({
27
+ url: "https://firecrawl.dev/pricing",
28
+ includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
29
+ regexOnFullURL: true,
30
+ limit: 10,
31
+ });
32
+
33
+ expect(res.success).toBe(true);
34
+ if (res.success) {
35
+ expect(res.completed).toBe(1);
36
+ expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
37
+ }
38
+ }, 120000);
39
+
40
+ // TEMP: Flaky
41
+ // it.concurrent("discovers URLs properly when origin is not included", async () => {
42
+ // const res = await crawl({
43
+ // url: "https://firecrawl.dev",
44
+ // includePaths: ["^/blog"],
45
+ // ignoreSitemap: true,
46
+ // limit: 10,
47
+ // });
48
+
49
+ // expect(res.success).toBe(true);
50
+ // if (res.success) {
51
+ // expect(res.data.length).toBeGreaterThan(1);
52
+ // for (const page of res.data) {
53
+ // expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
54
+ // }
55
+ // }
56
+ // }, 300000);
57
+
58
+ // TEMP: Flaky
59
+ // it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
60
+ // const res = await crawl({
61
+ // url: "https://firecrawl.dev",
62
+ // ignoreSitemap: true,
63
+ // maxDiscoveryDepth: 1,
64
+ // limit: 10,
65
+ // });
66
+
67
+ // expect(res.success).toBe(true);
68
+ // if (res.success) {
69
+ // expect(res.data.length).toBeGreaterThan(1);
70
+ // for (const page of res.data) {
71
+ // expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
72
+ // }
73
+ // }
74
+ // }, 300000);
75
+ });
src/__tests__/snips/extract.test.ts ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { extract } from "./lib";
2
+
3
+ describe("Extract tests", () => {
4
+ if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {
5
+ it.concurrent("works", async () => {
6
+ const res = await extract({
7
+ urls: ["https://firecrawl.dev"],
8
+ schema: {
9
+ "type": "object",
10
+ "properties": {
11
+ "company_mission": {
12
+ "type": "string"
13
+ },
14
+ "is_open_source": {
15
+ "type": "boolean"
16
+ }
17
+ },
18
+ "required": [
19
+ "company_mission",
20
+ "is_open_source"
21
+ ]
22
+ },
23
+ origin: "api-sdk",
24
+ });
25
+
26
+ expect(res.data).toHaveProperty("company_mission");
27
+ expect(typeof res.data.company_mission).toBe("string")
28
+ expect(res.data).toHaveProperty("is_open_source");
29
+ expect(typeof res.data.is_open_source).toBe("boolean");
30
+ expect(res.data.is_open_source).toBe(true);
31
+ }, 60000);
32
+
33
+ it.concurrent("works with unsupported JSON schema parameters", async () => {
34
+ const res = await extract({
35
+ urls: ["https://firecrawl.dev"],
36
+ schema: {
37
+ "type": "object",
38
+ "properties": {
39
+ "company_name": {
40
+ "type": "string",
41
+ "pattern": "^[a-zA-Z0-9]+$"
42
+ },
43
+ },
44
+ "required": [
45
+ "company_name"
46
+ ]
47
+ },
48
+ origin: "api-sdk",
49
+ });
50
+
51
+ expect(res.data).toHaveProperty("company_name");
52
+ expect(typeof res.data.company_name).toBe("string")
53
+ }, 60000);
54
+ } else {
55
+ it.concurrent("dummy test", () => {
56
+ expect(true).toBe(true);
57
+ });
58
+ }
59
+ });
src/__tests__/snips/lib.ts ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { configDotenv } from "dotenv";
2
+ configDotenv();
3
+
4
+ import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
5
+ import request from "supertest";
6
+
7
+ // =========================================
8
+ // Configuration
9
+ // =========================================
10
+
11
+ const TEST_URL = "http://127.0.0.1:3002";
12
+
13
+ // =========================================
14
+ // Scrape API
15
+ // =========================================
16
+
17
+ async function scrapeRaw(body: ScrapeRequestInput) {
18
+ return await request(TEST_URL)
19
+ .post("/v1/scrape")
20
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
21
+ .set("Content-Type", "application/json")
22
+ .send(body);
23
+ }
24
+
25
+ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
26
+ expect(response.statusCode).toBe(200);
27
+ expect(response.body.success).toBe(true);
28
+ expect(typeof response.body.data).toBe("object");
29
+ }
30
+
31
+ export async function scrape(body: ScrapeRequestInput): Promise<Document> {
32
+ const raw = await scrapeRaw(body);
33
+ expectScrapeToSucceed(raw);
34
+ return raw.body.data;
35
+ }
36
+
37
+ export async function scrapeStatusRaw(jobId: string) {
38
+ return await request(TEST_URL)
39
+ .get("/v1/scrape/" + encodeURIComponent(jobId))
40
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
41
+ .send();
42
+ }
43
+
44
+ export async function scrapeStatus(jobId: string): Promise<Document> {
45
+ const raw = await scrapeStatusRaw(jobId);
46
+ expect(raw.statusCode).toBe(200);
47
+ expect(raw.body.success).toBe(true);
48
+ expect(typeof raw.body.data).toBe("object");
49
+ expect(raw.body.data).not.toBeNull();
50
+ expect(raw.body.data).toBeDefined();
51
+ return raw.body.data;
52
+ }
53
+
54
+ // =========================================
55
+ // Crawl API
56
+ // =========================================
57
+
58
+ async function crawlStart(body: CrawlRequestInput) {
59
+ return await request(TEST_URL)
60
+ .post("/v1/crawl")
61
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
62
+ .set("Content-Type", "application/json")
63
+ .send(body);
64
+ }
65
+
66
+ async function crawlStatus(id: string) {
67
+ return await request(TEST_URL)
68
+ .get("/v1/crawl/" + encodeURIComponent(id))
69
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
70
+ .send();
71
+ }
72
+
73
+ function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
74
+ expect(response.statusCode).toBe(200);
75
+ expect(response.body.success).toBe(true);
76
+ expect(typeof response.body.id).toBe("string");
77
+ }
78
+
79
+ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
80
+ expect(response.statusCode).toBe(200);
81
+ expect(response.body.success).toBe(true);
82
+ expect(typeof response.body.status).toBe("string");
83
+ expect(response.body.status).toBe("completed");
84
+ expect(response.body).toHaveProperty("data");
85
+ expect(Array.isArray(response.body.data)).toBe(true);
86
+ expect(response.body.data.length).toBeGreaterThan(0);
87
+ }
88
+
89
+ export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
90
+ const cs = await crawlStart(body);
91
+ expectCrawlStartToSucceed(cs);
92
+
93
+ let x;
94
+
95
+ do {
96
+ x = await crawlStatus(cs.body.id);
97
+ expect(x.statusCode).toBe(200);
98
+ expect(typeof x.body.status).toBe("string");
99
+ } while (x.body.status === "scraping");
100
+
101
+ expectCrawlToSucceed(x);
102
+ return x.body;
103
+ }
104
+
105
+ // =========================================
106
+ // Batch Scrape API
107
+ // =========================================
108
+
109
+ async function batchScrapeStart(body: BatchScrapeRequestInput) {
110
+ return await request(TEST_URL)
111
+ .post("/v1/batch/scrape")
112
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
113
+ .set("Content-Type", "application/json")
114
+ .send(body);
115
+ }
116
+
117
+ async function batchScrapeStatus(id: string) {
118
+ return await request(TEST_URL)
119
+ .get("/v1/batch/scrape/" + encodeURIComponent(id))
120
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
121
+ .send();
122
+ }
123
+
124
+ function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
125
+ expect(response.statusCode).toBe(200);
126
+ expect(response.body.success).toBe(true);
127
+ expect(typeof response.body.id).toBe("string");
128
+ }
129
+
130
+ function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
131
+ expect(response.statusCode).toBe(200);
132
+ expect(response.body.success).toBe(true);
133
+ expect(typeof response.body.status).toBe("string");
134
+ expect(response.body.status).toBe("completed");
135
+ expect(response.body).toHaveProperty("data");
136
+ expect(Array.isArray(response.body.data)).toBe(true);
137
+ expect(response.body.data.length).toBeGreaterThan(0);
138
+ }
139
+
140
+ export async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
141
+ const bss = await batchScrapeStart(body);
142
+ expectBatchScrapeStartToSucceed(bss);
143
+
144
+ let x;
145
+
146
+ do {
147
+ x = await batchScrapeStatus(bss.body.id);
148
+ expect(x.statusCode).toBe(200);
149
+ expect(typeof x.body.status).toBe("string");
150
+ } while (x.body.status === "scraping");
151
+
152
+ expectBatchScrapeToSucceed(x);
153
+ return x;
154
+ }
155
+
156
+ // =========================================
157
+ // Map API
158
+ // =========================================
159
+
160
+ export async function map(body: MapRequestInput) {
161
+ return await request(TEST_URL)
162
+ .post("/v1/map")
163
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
164
+ .set("Content-Type", "application/json")
165
+ .send(body);
166
+ }
167
+
168
+ export function expectMapToSucceed(response: Awaited<ReturnType<typeof map>>) {
169
+ expect(response.statusCode).toBe(200);
170
+ expect(response.body.success).toBe(true);
171
+ expect(Array.isArray(response.body.links)).toBe(true);
172
+ expect(response.body.links.length).toBeGreaterThan(0);
173
+ }
174
+
175
+ // =========================================
176
+ // Extract API
177
+ // =========================================
178
+
179
+ async function extractStart(body: ExtractRequestInput) {
180
+ return await request(TEST_URL)
181
+ .post("/v1/extract")
182
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
183
+ .set("Content-Type", "application/json")
184
+ .send(body);
185
+ }
186
+
187
+ async function extractStatus(id: string) {
188
+ return await request(TEST_URL)
189
+ .get("/v1/extract/" + encodeURIComponent(id))
190
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
191
+ .send();
192
+ }
193
+
194
+
195
+ function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
196
+ expect(response.statusCode).toBe(200);
197
+ expect(response.body.success).toBe(true);
198
+ expect(typeof response.body.id).toBe("string");
199
+ }
200
+
201
+ function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
202
+ expect(response.statusCode).toBe(200);
203
+ expect(response.body.success).toBe(true);
204
+ expect(typeof response.body.status).toBe("string");
205
+ expect(response.body.status).toBe("completed");
206
+ expect(response.body).toHaveProperty("data");
207
+ }
208
+
209
+ export async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
210
+ const es = await extractStart(body);
211
+ expectExtractStartToSucceed(es);
212
+
213
+ let x;
214
+
215
+ do {
216
+ x = await extractStatus(es.body.id);
217
+ expect(x.statusCode).toBe(200);
218
+ expect(typeof x.body.status).toBe("string");
219
+ } while (x.body.status === "processing");
220
+
221
+ expectExtractToSucceed(x);
222
+ return x.body;
223
+ }
224
+
225
+ // =========================================
226
+ // Search API
227
+ // =========================================
228
+
229
+ async function searchRaw(body: SearchRequestInput) {
230
+ return await request(TEST_URL)
231
+ .post("/v1/search")
232
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
233
+ .set("Content-Type", "application/json")
234
+ .send(body);
235
+ }
236
+
237
+ function expectSearchToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
238
+ expect(response.statusCode).toBe(200);
239
+ expect(response.body.success).toBe(true);
240
+ expect(typeof response.body.data).toBe("object");
241
+ expect(Array.isArray(response.body.data)).toBe(true);
242
+ expect(response.body.data.length).toBeGreaterThan(0);
243
+ }
244
+
245
+ export async function search(body: SearchRequestInput): Promise<Document[]> {
246
+ const raw = await searchRaw(body);
247
+ expectSearchToSucceed(raw);
248
+ return raw.body.data;
249
+ }
250
+
251
+ // =========================================
252
+ // Billing API
253
+ // =========================================
254
+
255
+ export async function creditUsage(): Promise<{ remaining_credits: number }> {
256
+ const req = (await request(TEST_URL)
257
+ .get("/v1/team/credit-usage")
258
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
259
+ .set("Content-Type", "application/json"));
260
+
261
+ if (req.status !== 200) {
262
+ throw req.body;
263
+ }
264
+
265
+ return req.body.data;
266
+ }
267
+
268
+ export async function tokenUsage(): Promise<{ remaining_tokens: number }> {
269
+ return (await request(TEST_URL)
270
+ .get("/v1/team/token-usage")
271
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
272
+ .set("Content-Type", "application/json")).body.data;
273
+ }
src/__tests__/snips/map.test.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { expectMapToSucceed, map } from "./lib";
2
+
3
+ describe("Map tests", () => {
4
+ it.concurrent("basic map succeeds", async () => {
5
+ const response = await map({
6
+ url: "http://firecrawl.dev",
7
+ });
8
+
9
+ expectMapToSucceed(response);
10
+ }, 10000);
11
+
12
+ it.concurrent("times out properly", async () => {
13
+ const response = await map({
14
+ url: "http://firecrawl.dev",
15
+ timeout: 1
16
+ });
17
+
18
+ expect(response.statusCode).toBe(408);
19
+ expect(response.body.success).toBe(false);
20
+ expect(response.body.error).toBe("Request timed out");
21
+ }, 10000);
22
+
23
+ it.concurrent("handles query parameters correctly", async () => {
24
+ let response = await map({
25
+ url: "https://www.hfea.gov.uk",
26
+ sitemapOnly: true,
27
+ useMock: "map-query-params",
28
+ });
29
+
30
+ expect(response.statusCode).toBe(200);
31
+ expect(response.body.success).toBe(true);
32
+ expect(response.body.links.some(x => x.match(/^https:\/\/www\.hfea\.gov\.uk\/choose-a-clinic\/clinic-search\/results\/?\?options=\d+$/))).toBe(true);
33
+ }, 60000);
34
+ });
src/__tests__/snips/mocks/map-query-params.json ADDED
The diff for this file is too large to render. See raw diff
 
src/__tests__/snips/mocks/mocking-works-properly.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "time": 1735911273239,
4
+ "options": {
5
+ "url": "<fire-engine>/scrape",
6
+ "method": "POST",
7
+ "body": {
8
+ "url": "http://firecrawl.dev",
9
+ "engine": "chrome-cdp",
10
+ "instantReturn": true,
11
+ "skipTlsVerification": false,
12
+ "priority": 10,
13
+ "mobile": false,
14
+ "timeout": 15000
15
+ },
16
+ "headers": {},
17
+ "ignoreResponse": false,
18
+ "ignoreFailure": false,
19
+ "tryCount": 3
20
+ },
21
+ "result": {
22
+ "status": 200,
23
+ "headers": {},
24
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp\",\"processing\":true}"
25
+ }
26
+ },
27
+ {
28
+ "time": 1735911273354,
29
+ "options": {
30
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
31
+ "method": "GET",
32
+ "headers": {},
33
+ "ignoreResponse": false,
34
+ "ignoreFailure": false,
35
+ "tryCount": 1
36
+ },
37
+ "result": {
38
+ "status": 200,
39
+ "headers": {},
40
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"prioritized\",\"processing\":true}"
41
+ }
42
+ },
43
+ {
44
+ "time": 1735911273720,
45
+ "options": {
46
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
47
+ "method": "GET",
48
+ "headers": {},
49
+ "ignoreResponse": false,
50
+ "ignoreFailure": false,
51
+ "tryCount": 1
52
+ },
53
+ "result": {
54
+ "status": 200,
55
+ "headers": {},
56
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
57
+ }
58
+ },
59
+ {
60
+ "time": 1735911274092,
61
+ "options": {
62
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
63
+ "method": "GET",
64
+ "headers": {},
65
+ "ignoreResponse": false,
66
+ "ignoreFailure": false,
67
+ "tryCount": 1
68
+ },
69
+ "result": {
70
+ "status": 200,
71
+ "headers": {},
72
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
73
+ }
74
+ },
75
+ {
76
+ "time": 1735911274467,
77
+ "options": {
78
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
79
+ "method": "GET",
80
+ "headers": {},
81
+ "ignoreResponse": false,
82
+ "ignoreFailure": false,
83
+ "tryCount": 1
84
+ },
85
+ "result": {
86
+ "status": 200,
87
+ "headers": {},
88
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
89
+ }
90
+ },
91
+ {
92
+ "time": 1735911274947,
93
+ "options": {
94
+ "url": "<fire-engine>/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
95
+ "method": "GET",
96
+ "headers": {},
97
+ "ignoreResponse": false,
98
+ "ignoreFailure": false,
99
+ "tryCount": 1
100
+ },
101
+ "result": {
102
+ "status": 200,
103
+ "headers": {},
104
+ "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"completed\",\"processing\":false,\"timeTaken\":1.204,\"content\":\"<!DOCTYPE html><html lang=\\\"en\\\"><body><p>this is fake data coming from the mocking system!</p></body></html>\",\"url\":\"https://www.firecrawl.dev/\",\"screenshots\":[],\"actionContent\":[],\"pageStatusCode\":200,\"responseHeaders\":{\"X-DNS-Prefetch-Control\":\"off\",\"age\":\"0\",\"cache-control\":\"private, no-cache, no-store, max-age=0, must-revalidate\",\"content-encoding\":\"br\",\"content-type\":\"text/html; charset=utf-8\",\"date\":\"Fri, 03 Jan 2025 13:34:34 GMT\",\"link\":\"</_next/static/media/171883e03d2067b6-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/a34f9d1faa5f3315-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/c4c7b0ec92b72e30-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\"\",\"permissions-policy\":\"keyboard-map=(), attribution-reporting=(), run-ad-auction=(), private-state-token-redemption=(), private-state-token-issuance=(), join-ad-interest-group=(), idle-detection=(), compute-pressure=(), browsing-topics=()\",\"server\":\"Vercel\",\"strict-transport-security\":\"max-age=63072000\",\"vary\":\"RSC, Next-Router-State-Tree, Next-Router-Prefetch\",\"x-matched-path\":\"/\",\"x-powered-by\":\"Next.js\",\"x-vercel-cache\":\"MISS\",\"x-vercel-id\":\"iad1::iad1::bs88l-1735911273932-1f7bba7a8b45\"},\"invalidTlsCert\":false,\"file\":null}"
105
+ }
106
+ }
107
+ ]
src/__tests__/snips/scrape.test.ts ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { scrape, scrapeStatus } from "./lib";

// End-to-end tests for the scrape API. Suffixes in describe titles mark
// infrastructure dependencies: "f-e dependant" = requires fire-engine,
// "sb dependant" = requires the screenshot backend. Several groups are
// gated on env vars so the suite also runs against self-hosted setups.
describe("Scrape tests", () => {
  it.concurrent("mocking works properly", async () => {
    // depends on falsified mock mocking-works-properly
    // this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
    // that as its actual markdown output

    const response = await scrape({
      url: "http://firecrawl.dev",
      useMock: "mocking-works-properly",
    });

    expect(response.markdown).toBe(
      "this is fake data coming from the mocking system!",
    );
  }, 30000);

  it.concurrent("works", async () => {
    const response = await scrape({
      url: "http://firecrawl.dev"
    });

    expect(response.markdown).toContain("Firecrawl");
  }, 30000);

  it.concurrent("scrape status works", async () => {
    const response = await scrape({
      url: "http://firecrawl.dev"
    });

    expect(response.markdown).toContain("Firecrawl");

    // The status endpoint must replay the exact document that was scraped.
    const status = await scrapeStatus(response.metadata.scrapeId!);
    expect(JSON.stringify(status)).toBe(JSON.stringify(response));
  }, 60000);

  it.concurrent("handles non-UTF-8 encodings", async () => {
    // Shift-JIS page; verifies charset detection/transcoding.
    const response = await scrape({
      url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
    });

    expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
  }, 30000);

  if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
    it.concurrent("self-hosted proxy works", async () => {
      const response = await scrape({
        url: "https://icanhazip.com"
      });

      // icanhazip echoes the caller's IP; it must match the proxy host.
      expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
    }, 30000);
  }

  if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
    it.concurrent("waitFor works", async () => {
      const response = await scrape({
        url: "http://firecrawl.dev",
        waitFor: 2000,
      });

      expect(response.markdown).toContain("Firecrawl");
    }, 30000);
  }

  describe("JSON scrape support", () => {
    it.concurrent("returns parseable JSON", async () => {
      const response = await scrape({
        url: "https://jsonplaceholder.typicode.com/todos/1",
        formats: ["rawHtml"],
      });

      const obj = JSON.parse(response.rawHtml!);
      expect(obj.id).toBe(1);
    }, 30000);
  });

  if (!process.env.TEST_SUITE_SELF_HOSTED) {
    // describe("Ad blocking (f-e dependant)", () => {
    //   it.concurrent("blocks ads by default", async () => {
    //     const response = await scrape({
    //       url: "https://www.allrecipes.com/recipe/18185/yum/",
    //     });

    //     expect(response.markdown).not.toContain(".g.doubleclick.net/");
    //   }, 30000);

    //   it.concurrent("doesn't block ads if explicitly disabled", async () => {
    //     const response = await scrape({
    //       url: "https://www.allrecipes.com/recipe/18185/yum/",
    //       blockAds: false,
    //     });

    //     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
    //   }, 30000);
    // });

    describe("Change Tracking format", () => {
      it.concurrent("works", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();
      }, 30000);

      it.concurrent("includes git diff when requested", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["git-diff"]
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        // diff fields are only present when the page actually changed
        // since the previous scrape.
        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.diff).toBeDefined();
          expect(response.changeTracking?.diff?.text).toBeDefined();
          expect(response.changeTracking?.diff?.json).toBeDefined();
          expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array);
        }
      }, 30000);

      it.concurrent("includes structured output when requested", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["json"],
            prompt: "Summarize the changes between the previous and current content",
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.json).toBeDefined();
        }
      }, 30000);

      it.concurrent("supports schema-based extraction for change tracking", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["json"],
            schema: {
              type: "object",
              properties: {
                pricing: {
                  type: "object",
                  properties: {
                    amount: { type: "number" },
                    currency: { type: "string" }
                  }
                },
                features: {
                  type: "array",
                  items: { type: "string" }
                }
              }
            }
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        // Each schema field is reported as an { old, new } pair when changed.
        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.json).toBeDefined();
          if (response.changeTracking?.json.pricing) {
            expect(response.changeTracking?.json.pricing).toHaveProperty("old");
            expect(response.changeTracking?.json.pricing).toHaveProperty("new");
          }
          if (response.changeTracking?.json.features) {
            expect(response.changeTracking?.json.features).toHaveProperty("old");
            expect(response.changeTracking?.json.features).toHaveProperty("new");
          }
        }
      }, 30000);

      it.concurrent("supports both git-diff and structured modes together", async () => {
        const response = await scrape({
          url: "https://example.com",
          formats: ["markdown", "changeTracking"],
          changeTrackingOptions: {
            modes: ["git-diff", "json"],
            schema: {
              type: "object",
              properties: {
                summary: { type: "string" },
                changes: { type: "array", items: { type: "string" } }
              }
            }
          }
        });

        expect(response.changeTracking).toBeDefined();
        expect(response.changeTracking?.previousScrapeAt).not.toBeNull();

        if (response.changeTracking?.changeStatus === "changed") {
          expect(response.changeTracking?.diff).toBeDefined();
          expect(response.changeTracking?.diff?.text).toBeDefined();
          expect(response.changeTracking?.diff?.json).toBeDefined();

          expect(response.changeTracking?.json).toBeDefined();
          expect(response.changeTracking?.json).toHaveProperty("summary");
          expect(response.changeTracking?.json).toHaveProperty("changes");
        }
      }, 30000);
    });

    describe("Location API (f-e dependant)", () => {
      it.concurrent("works without specifying an explicit location", async () => {
        await scrape({
          url: "https://iplocation.com",
        });
      }, 30000);

      it.concurrent("works with country US", async () => {
        const response = await scrape({
          url: "https://iplocation.com",
          location: { country: "US" },
        });

        expect(response.markdown).toContain("| Country | United States |");
      }, 30000);
    });

    describe("Screenshot (f-e/sb dependant)", () => {
      it.concurrent("screenshot format works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["screenshot"]
        });

        expect(typeof response.screenshot).toBe("string");
      }, 30000);

      it.concurrent("screenshot@fullPage format works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["screenshot@fullPage"]
        });

        expect(typeof response.screenshot).toBe("string");
      }, 30000);
    });

    describe("Proxy API (f-e dependant)", () => {
      it.concurrent("undefined works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
        });
      }, 30000);

      it.concurrent("basic works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
          proxy: "basic",
        });
      }, 30000);

      it.concurrent("stealth works", async () => {
        await scrape({
          url: "http://firecrawl.dev",
          proxy: "stealth",
          timeout: 120000,
        });
      }, 130000);
    });

    // Temporarily disabled, too flaky
    // describe("PDF (f-e dependant)", () => {
    //   it.concurrent("works for PDFs behind anti-bot", async () => {
    //     const response = await scrape({
    //       url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
    //     });

    //     expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
    //   }, 60000);
    // });
  }

  if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {
    describe("JSON format", () => {
      it.concurrent("works", async () => {
        const response = await scrape({
          url: "http://firecrawl.dev",
          formats: ["json"],
          jsonOptions: {
            prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
            schema: {
              type: "object",
              properties: {
                company_mission: {
                  type: "string",
                },
                supports_sso: {
                  type: "boolean",
                },
                is_open_source: {
                  type: "boolean",
                },
              },
              required: ["company_mission", "supports_sso", "is_open_source"],
            },
          },
        });

        expect(response).toHaveProperty("json");
        expect(response.json).toHaveProperty("company_mission");
        expect(typeof response.json.company_mission).toBe("string");
        expect(response.json).toHaveProperty("supports_sso");
        expect(response.json.supports_sso).toBe(false);
        expect(typeof response.json.supports_sso).toBe("boolean");
        expect(response.json).toHaveProperty("is_open_source");
        expect(response.json.is_open_source).toBe(true);
        expect(typeof response.json.is_open_source).toBe("boolean");
      }, 30000);
    });
  }
});
src/__tests__/snips/search.test.ts ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { search } from "./lib";
2
+
3
+ describe("Search tests", () => {
4
+ it.concurrent("works", async () => {
5
+ await search({
6
+ query: "firecrawl"
7
+ });
8
+ }, 60000);
9
+
10
+ it.concurrent("works with scrape", async () => {
11
+ const res = await search({
12
+ query: "firecrawl",
13
+ limit: 5,
14
+ scrapeOptions: {
15
+ formats: ["markdown"],
16
+ },
17
+ });
18
+
19
+ for (const doc of res) {
20
+ expect(doc.markdown).toBeDefined();
21
+ }
22
+ }, 60000);
23
+ });
src/__tests__/snips/utils/collect-mocks.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const path = require("path");
const fs = require("fs");

// Gathers every per-request mock file written by scrapeURL into a single
// named fixture: node collect-mocks.js <fixture-name>
const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");

const contents = fs
  .readdirSync(mocksDirPath)
  .map((file) =>
    JSON.parse(fs.readFileSync(path.join(mocksDirPath, file), "utf8")),
  );

const outputPath = path.join(
  __dirname,
  "../mocks/" + process.argv[2] + ".json",
);
fs.writeFileSync(outputPath, JSON.stringify(contents, undefined, 4));
src/control.ts ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ // ! IN CASE OPENAI goes down, then activate the fallback -> true
2
+ export const is_fallback = false;
src/controllers/__tests__/crawl.test.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { crawlController } from "../v0/crawl";
2
+ import { Request, Response } from "express";
3
+ import { authenticateUser } from "../auth"; // Ensure this import is correct
4
+ import { createIdempotencyKey } from "../../services/idempotency/create";
5
+ import { validateIdempotencyKey } from "../../services/idempotency/validate";
6
+ import { v4 as uuidv4 } from "uuid";
7
+
8
+ jest.mock("../auth", () => ({
9
+ authenticateUser: jest.fn().mockResolvedValue({
10
+ success: true,
11
+ team_id: "team123",
12
+ error: null,
13
+ status: 200,
14
+ }),
15
+ reduce: jest.fn(),
16
+ }));
17
+ jest.mock("../../services/idempotency/validate");
18
+
19
+ describe("crawlController", () => {
20
+ it("should prevent duplicate requests using the same idempotency key", async () => {
21
+ const req = {
22
+ headers: {
23
+ "x-idempotency-key": await uuidv4(),
24
+ Authorization: `Bearer ${process.env.TEST_API_KEY}`,
25
+ },
26
+ body: {
27
+ url: "https://mendable.ai",
28
+ },
29
+ } as unknown as Request;
30
+ const res = {
31
+ status: jest.fn().mockReturnThis(),
32
+ json: jest.fn(),
33
+ } as unknown as Response;
34
+
35
+ // Mock the idempotency key validation to return false for the second call
36
+ (validateIdempotencyKey as jest.Mock)
37
+ .mockResolvedValueOnce(true)
38
+ .mockResolvedValueOnce(false);
39
+
40
+ // First request should succeed
41
+ await crawlController(req, res);
42
+ expect(res.status).not.toHaveBeenCalledWith(409);
43
+
44
+ // Second request with the same key should fail
45
+ await crawlController(req, res);
46
+ expect(res.status).toHaveBeenCalledWith(409);
47
+ expect(res.json).toHaveBeenCalledWith({
48
+ error: "Idempotency key already used",
49
+ });
50
+ });
51
+ });
src/controllers/auth.ts ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { parseApi } from "../lib/parseApi";
2
+ import { getRateLimiter } from "../services/rate-limiter";
3
+ import {
4
+ AuthResponse,
5
+ NotificationType,
6
+ RateLimiterMode,
7
+ } from "../types";
8
+ import { supabase_rr_service, supabase_service } from "../services/supabase";
9
+ import { withAuth } from "../lib/withAuth";
10
+ import { RateLimiterRedis } from "rate-limiter-flexible";
11
+ import { sendNotification } from "../services/notification/email_notification";
12
+ import { logger } from "../lib/logger";
13
+ import { redlock } from "../services/redlock";
14
+ import { deleteKey, getValue } from "../services/redis";
15
+ import { setValue } from "../services/redis";
16
+ import { validate } from "uuid";
17
+ import * as Sentry from "@sentry/node";
18
+ import { AuthCreditUsageChunk, AuthCreditUsageChunkFromTeam } from "./v1/types";
19
+ // const { data, error } = await supabase_service
20
+ // .from('api_keys')
21
+ // .select(`
22
+ // key,
23
+ // team_id,
24
+ // teams (
25
+ // subscriptions (
26
+ // price_id
27
+ // )
28
+ // )
29
+ // `)
30
+ // .eq('key', normalizedApi)
31
+ // .limit(1)
32
+ // .single();
33
+ function normalizedApiIsUuid(potentialUuid: string): boolean {
34
+ // Check if the string is a valid UUID
35
+ return validate(potentialUuid);
36
+ }
37
+
38
/**
 * Writes the cached ACUC (auth + credit usage chunk) for an API key, under a
 * distributed lock so concurrent writers do not clobber each other.
 *
 * @param api_key    raw API key the cache entry belongs to
 * @param is_extract selects the "extract" vs "scrape" cache variant
 * @param acuc       the new value, or an updater function that receives the
 *                   currently-cached value (parsed, possibly null) and returns
 *                   the value to store — returning null skips the write
 *
 * Errors (including lock acquisition failures) are logged, never thrown.
 */
export async function setCachedACUC(
  api_key: string,
  is_extract: boolean,
  acuc:
    | AuthCreditUsageChunk
    | null
    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null),
) {
  const cacheKeyACUC = `acuc_${api_key}_${is_extract ? "extract" : "scrape"}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        // Resolve the updater against whatever is currently cached.
        acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null"));

        if (acuc === null) {
          // Updater declined to write; still surface a lost lock.
          if (signal.aborted) {
            throw signal.error;
          }

          return;
        }
      }

      // Abort if the redlock lease was lost before we write.
      if (signal.aborted) {
        throw signal.error;
      }

      // Cache for 10 minutes. - mogery
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
}
74
+
75
+ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsageChunk = (team_id, is_extract) => ({
76
+ api_key: "preview",
77
+ team_id,
78
+ sub_id: "bypass",
79
+ sub_current_period_start: new Date().toISOString(),
80
+ sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(),
81
+ sub_user_id: "bypass",
82
+ price_id: "bypass",
83
+ rate_limits: {
84
+ crawl: 2,
85
+ scrape: 10,
86
+ extract: 10,
87
+ search: 5,
88
+ map: 5,
89
+ preview: 5,
90
+ crawlStatus: 500,
91
+ extractStatus: 500,
92
+ extractAgentPreview: 1,
93
+ scrapeAgentPreview: 5,
94
+ },
95
+ price_credits: 99999999,
96
+ credits_used: 0,
97
+ coupon_credits: 99999999,
98
+ adjusted_credits_used: 0,
99
+ remaining_credits: 99999999,
100
+ total_credits_sum: 99999999,
101
+ plan_priority: {
102
+ bucketLimit: 25,
103
+ planModifier: 0.1,
104
+ },
105
+ concurrency: is_extract ? 200 : 2,
106
+ is_extract,
107
+ });
108
+
109
+ const mockACUC: () => AuthCreditUsageChunk = () => ({
110
+ api_key: "bypass",
111
+ team_id: "bypass",
112
+ sub_id: "bypass",
113
+ sub_current_period_start: new Date().toISOString(),
114
+ sub_current_period_end: new Date(new Date().getTime() + 30 * 24 * 60 * 60 * 1000).toISOString(),
115
+ sub_user_id: "bypass",
116
+ price_id: "bypass",
117
+ rate_limits: {
118
+ crawl: 99999999,
119
+ scrape: 99999999,
120
+ extract: 99999999,
121
+ search: 99999999,
122
+ map: 99999999,
123
+ preview: 99999999,
124
+ crawlStatus: 99999999,
125
+ extractStatus: 99999999,
126
+ extractAgentPreview: 99999999,
127
+ scrapeAgentPreview: 99999999,
128
+ },
129
+ price_credits: 99999999,
130
+ credits_used: 0,
131
+ coupon_credits: 99999999,
132
+ adjusted_credits_used: 0,
133
+ remaining_credits: 99999999,
134
+ total_credits_sum: 99999999,
135
+ plan_priority: {
136
+ bucketLimit: 25,
137
+ planModifier: 0.1,
138
+ },
139
+ concurrency: 99999999,
140
+ is_extract: false,
141
+ });
142
+
143
+ export async function getACUC(
144
+ api_key: string,
145
+ cacheOnly = false,
146
+ useCache = true,
147
+ mode?: RateLimiterMode,
148
+ ): Promise<AuthCreditUsageChunk | null> {
149
+ let isExtract =
150
+ mode === RateLimiterMode.Extract ||
151
+ mode === RateLimiterMode.ExtractStatus;
152
+
153
+ if (api_key === process.env.PREVIEW_TOKEN) {
154
+ const acuc = mockPreviewACUC(api_key, isExtract);
155
+ acuc.is_extract = isExtract;
156
+ return acuc;
157
+ }
158
+
159
+ if (process.env.USE_DB_AUTHENTICATION !== "true") {
160
+ const acuc = mockACUC();
161
+ acuc.is_extract = isExtract;
162
+ return acuc;
163
+ }
164
+
165
+ const cacheKeyACUC = `acuc_${api_key}_${isExtract ? "extract" : "scrape"}`;
166
+
167
+ if (useCache) {
168
+ const cachedACUC = await getValue(cacheKeyACUC);
169
+ if (cachedACUC !== null) {
170
+ return JSON.parse(cachedACUC);
171
+ }
172
+ }
173
+
174
+ if (!cacheOnly) {
175
+ let data;
176
+ let error;
177
+ let retries = 0;
178
+ const maxRetries = 5;
179
+ while (retries < maxRetries) {
180
+ const client =
181
+ Math.random() > (2/3) ? supabase_rr_service : supabase_service;
182
+ ({ data, error } = await client.rpc(
183
+ "auth_credit_usage_chunk_30",
184
+ { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
185
+ { get: true },
186
+ ));
187
+
188
+ if (!error) {
189
+ break;
190
+ }
191
+
192
+ logger.warn(
193
+ `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
194
+ { error }
195
+ );
196
+ retries++;
197
+ if (retries === maxRetries) {
198
+ throw new Error(
199
+ "Failed to retrieve authentication and credit usage data after 3 attempts: " +
200
+ JSON.stringify(error),
201
+ );
202
+ }
203
+
204
+ // Wait for a short time before retrying
205
+ await new Promise((resolve) => setTimeout(resolve, 200));
206
+ }
207
+
208
+ const chunk: AuthCreditUsageChunk | null =
209
+ data.length === 0 ? null : data[0].team_id === null ? null : data[0];
210
+
211
+ // NOTE: Should we cache null chunks? - mogery
212
+ if (chunk !== null && useCache) {
213
+ setCachedACUC(api_key, isExtract, chunk);
214
+ }
215
+
216
+ return chunk ? { ...chunk, is_extract: isExtract } : null;
217
+ } else {
218
+ return null;
219
+ }
220
+ }
221
+
222
/**
 * Team-keyed twin of setCachedACUC: writes the cached ACUC for a team id
 * under a distributed lock.
 *
 * @param team_id    team the cache entry belongs to
 * @param is_extract selects the "extract" vs "scrape" cache variant
 * @param acuc       the new value, or an updater function receiving the
 *                   currently-cached value (parsed, possibly null);
 *                   returning null skips the write
 *
 * Errors are logged, never thrown.
 */
export async function setCachedACUCTeam(
  team_id: string,
  is_extract: boolean,
  acuc:
    | AuthCreditUsageChunkFromTeam
    | null
    | ((acuc: AuthCreditUsageChunkFromTeam) => AuthCreditUsageChunkFromTeam | null),
) {
  const cacheKeyACUC = `acuc_team_${team_id}_${is_extract ? "extract" : "scrape"}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        // Resolve the updater against whatever is currently cached.
        acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null"));

        if (acuc === null) {
          // Updater declined to write; still surface a lost lock.
          if (signal.aborted) {
            throw signal.error;
          }

          return;
        }
      }

      // Abort if the redlock lease was lost before we write.
      if (signal.aborted) {
        throw signal.error;
      }

      // Cache for 10 minutes. - mogery
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
}
258
+
259
+ export async function getACUCTeam(
260
+ team_id: string,
261
+ cacheOnly = false,
262
+ useCache = true,
263
+ mode?: RateLimiterMode,
264
+ ): Promise<AuthCreditUsageChunkFromTeam | null> {
265
+ let isExtract =
266
+ mode === RateLimiterMode.Extract ||
267
+ mode === RateLimiterMode.ExtractStatus;
268
+
269
+ if (team_id.startsWith("preview")) {
270
+ const acuc = mockPreviewACUC(team_id, isExtract);
271
+ return acuc;
272
+ }
273
+
274
+ if (process.env.USE_DB_AUTHENTICATION !== "true") {
275
+ const acuc = mockACUC();
276
+ acuc.is_extract = isExtract;
277
+ return acuc;
278
+ }
279
+
280
+ const cacheKeyACUC = `acuc_team_${team_id}_${isExtract ? "extract" : "scrape"}`;
281
+
282
+ if (useCache) {
283
+ const cachedACUC = await getValue(cacheKeyACUC);
284
+ if (cachedACUC !== null) {
285
+ return JSON.parse(cachedACUC);
286
+ }
287
+ }
288
+
289
+ if (!cacheOnly) {
290
+ let data;
291
+ let error;
292
+ let retries = 0;
293
+ const maxRetries = 5;
294
+
295
+ while (retries < maxRetries) {
296
+ const client =
297
+ Math.random() > (2/3) ? supabase_rr_service : supabase_service;
298
+ ({ data, error } = await client.rpc(
299
+ "auth_credit_usage_chunk_30_from_team",
300
+ { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
301
+ { get: true },
302
+ ));
303
+
304
+ if (!error) {
305
+ break;
306
+ }
307
+
308
+ logger.warn(
309
+ `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
310
+ { error }
311
+ );
312
+ retries++;
313
+ if (retries === maxRetries) {
314
+ throw new Error(
315
+ "Failed to retrieve authentication and credit usage data after 3 attempts: " +
316
+ JSON.stringify(error),
317
+ );
318
+ }
319
+
320
+ // Wait for a short time before retrying
321
+ await new Promise((resolve) => setTimeout(resolve, 200));
322
+ }
323
+
324
+ const chunk: AuthCreditUsageChunk | null =
325
+ data.length === 0 ? null : data[0].team_id === null ? null : data[0];
326
+
327
+ // NOTE: Should we cache null chunks? - mogery
328
+ if (chunk !== null && useCache) {
329
+ setCachedACUCTeam(team_id, isExtract, chunk);
330
+ }
331
+
332
+ return chunk ? { ...chunk, is_extract: isExtract } : null;
333
+ } else {
334
+ return null;
335
+ }
336
+ }
337
+
338
+ export async function clearACUC(api_key: string): Promise<void> {
339
+ // Delete cache for all rate limiter modes
340
+ const modes = [true, false];
341
+ await Promise.all(
342
+ modes.map(async (mode) => {
343
+ const cacheKey = `acuc_${api_key}_${mode ? "extract" : "scrape"}`;
344
+ await deleteKey(cacheKey);
345
+ }),
346
+ );
347
+
348
+ // Also clear the base cache key
349
+ await deleteKey(`acuc_${api_key}`);
350
+ }
351
+
352
+ export async function clearACUCTeam(team_id: string): Promise<void> {
353
+ // Delete cache for all rate limiter modes
354
+ const modes = [true, false];
355
+ await Promise.all(
356
+ modes.map(async (mode) => {
357
+ const cacheKey = `acuc_team_${team_id}_${mode ? "extract" : "scrape"}`;
358
+ await deleteKey(cacheKey);
359
+ }),
360
+ );
361
+
362
+ // Also clear the base cache key
363
+ await deleteKey(`acuc_team_${team_id}`);
364
+ }
365
+
366
/**
 * Express-facing authentication entry point. Wraps supaAuthenticateUser with
 * withAuth, supplying the bypass response used when full auth is not in
 * effect (see withAuth for the exact condition).
 */
export async function authenticateUser(
  req,
  res,
  mode?: RateLimiterMode,
): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser, {
    success: true,
    chunk: null,
    team_id: "bypass",
  })(req, res, mode);
}
377
+
378
/**
 * Full authentication + rate limiting for an incoming request.
 *
 * Token sources: the Authorization header, or (for websocket clients) the
 * sec-websocket-protocol header. Preview tokens are rate-limited per
 * IP+token; real API keys are validated as UUIDs, resolved to an ACUC via
 * getACUC, and rate-limited per team.
 *
 * @returns success with team_id/chunk, or a failure with an HTTP status
 *          (401 for bad/missing tokens, 429 when the rate limit is hit)
 */
export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode,
): Promise<AuthResponse> {
  const authHeader =
    req.headers.authorization ??
    (req.headers["sec-websocket-protocol"]
      ? `Bearer ${req.headers["sec-websocket-protocol"]}`
      : null);
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
  }
  const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
  if (!token) {
    return {
      success: false,
      error: "Unauthorized: Token missing",
      status: 401,
    };
  }

  // Client IP, preferring the preview/proxy headers over the socket address.
  const incomingIP = (req.headers["x-preview-ip"] || req.headers["x-forwarded-for"] ||
    req.socket.remoteAddress) as string;
  // Preview traffic is rate-limited per IP+token pair.
  const iptoken = incomingIP + token;

  let rateLimiter: RateLimiterRedis;
  let subscriptionData: { team_id: string} | null = null;
  let normalizedApi: string;

  let teamId: string | null = null;
  let priceId: string | null = null;
  let chunk: AuthCreditUsageChunk | null = null;
  // Hard-disabled legacy playground token.
  if (token == "this_is_just_a_preview_token") {
    throw new Error(
      "Unauthenticated Playground calls are temporarily disabled due to abuse. Please sign up.",
    );
  }
  if (token == process.env.PREVIEW_TOKEN) {
    // Preview token: pick a limiter by mode; no ACUC lookup is performed.
    if (mode == RateLimiterMode.CrawlStatus) {
      rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
    } else if (mode == RateLimiterMode.ExtractStatus) {
      rateLimiter = getRateLimiter(RateLimiterMode.ExtractStatus, token);
    } else {
      rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
    }
    teamId = `preview_${iptoken}`;
  } else {
    // Real API key: must be a UUID after normalization.
    normalizedApi = parseApi(token);
    if (!normalizedApiIsUuid(normalizedApi)) {
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    chunk = await getACUC(normalizedApi, false, true, mode);

    if (chunk === null) {
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    teamId = chunk.team_id;
    priceId = chunk.price_id;

    subscriptionData = {
      team_id: teamId,
    };
    // Rate limits come from the team's plan, keyed by the requested mode.
    rateLimiter = getRateLimiter(
      mode ?? RateLimiterMode.Crawl,
      chunk.rate_limits,
    );
  }

  // Consume rate-limit points against the team (or IP+token for previews).
  const team_endpoint_token =
    token === process.env.PREVIEW_TOKEN ? iptoken : teamId;

  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
    logger.error(`Rate limit exceeded: ${rateLimiterRes}`, {
      teamId,
      priceId,
      mode,
      rateLimits: chunk?.rate_limits,
      rateLimiterRes,
    });
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

    // We can only send a rate limit email every 7 days, send notification already has the date in between checking
    const startDate = new Date();
    const endDate = new Date();
    endDate.setDate(endDate.getDate() + 7);

    // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());

    return {
      success: false,
      error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }

  // Preview tokens succeed early for the allowed modes, with no chunk.
  if (
    token === process.env.PREVIEW_TOKEN &&
    (mode === RateLimiterMode.Scrape ||
      mode === RateLimiterMode.Preview ||
      mode === RateLimiterMode.Map ||
      mode === RateLimiterMode.Crawl ||
      mode === RateLimiterMode.CrawlStatus ||
      mode === RateLimiterMode.Extract ||
      mode === RateLimiterMode.Search)
  ) {
    return {
      success: true,
      team_id: `preview_${iptoken}`,
      chunk: null,
    };
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){
    //   return { success: true, team_id: "preview" };
    // }
    // if(process.env.ENV !== "production") {
    //   return { success: true, team_id: "preview" };
    // }

    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
  }

  return {
    success: true,
    team_id: teamId ?? undefined,
    chunk,
  };
}
src/controllers/v0/admin/acuc-cache-clear.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Request, Response } from "express";
2
+ import { supabase_service } from "../../../services/supabase";
3
+ import { clearACUC, clearACUCTeam } from "../../auth";
4
+ import { logger } from "../../../lib/logger";
5
+
6
+ export async function acucCacheClearController(req: Request, res: Response) {
7
+ try {
8
+ const team_id: string = req.body.team_id;
9
+
10
+ const keys = await supabase_service
11
+ .from("api_keys")
12
+ .select("*")
13
+ .eq("team_id", team_id);
14
+
15
+ await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
16
+ await clearACUCTeam(team_id);
17
+
18
+ logger.info(`ACUC cache cleared for team ${team_id}`);
19
+ res.json({ ok: true });
20
+ } catch (error) {
21
+ logger.error(`Error clearing ACUC cache via API route: ${error}`);
22
+ res.status(500).json({ error: "Internal server error" });
23
+ }
24
+ }