Spaces:
Paused
Paused
| import { scrape, scrapeStatus } from "./lib"; | |
| describe("Scrape tests", () => { | |
| it.concurrent("mocking works properly", async () => { | |
| // depends on falsified mock mocking-works-properly | |
| // this test will fail if mock is bypassed with real data -- firecrawl.dev will never have | |
| // that as its actual markdown output | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev", | |
| useMock: "mocking-works-properly", | |
| }); | |
| expect(response.markdown).toBe( | |
| "this is fake data coming from the mocking system!", | |
| ); | |
| }, 30000); | |
| it.concurrent("works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev" | |
| }); | |
| expect(response.markdown).toContain("Firecrawl"); | |
| }, 30000); | |
| it.concurrent("scrape status works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev" | |
| }); | |
| expect(response.markdown).toContain("Firecrawl"); | |
| const status = await scrapeStatus(response.metadata.scrapeId!); | |
| expect(JSON.stringify(status)).toBe(JSON.stringify(response)); | |
| }, 60000); | |
| it.concurrent("handles non-UTF-8 encodings", async () => { | |
| const response = await scrape({ | |
| url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html", | |
| }); | |
| expect(response.markdown).toContain("γ γ γ γ γ γ γ γ γ γ γ γ γ γ γ γ"); | |
| }, 30000); | |
| if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) { | |
| it.concurrent("self-hosted proxy works", async () => { | |
| const response = await scrape({ | |
| url: "https://icanhazip.com" | |
| }); | |
| expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]); | |
| }, 30000); | |
| } | |
| if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) { | |
| it.concurrent("waitFor works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev", | |
| waitFor: 2000, | |
| }); | |
| expect(response.markdown).toContain("Firecrawl"); | |
| }, 30000); | |
| } | |
| describe("JSON scrape support", () => { | |
| it.concurrent("returns parseable JSON", async () => { | |
| const response = await scrape({ | |
| url: "https://jsonplaceholder.typicode.com/todos/1", | |
| formats: ["rawHtml"], | |
| }); | |
| const obj = JSON.parse(response.rawHtml!); | |
| expect(obj.id).toBe(1); | |
| }, 30000); | |
| }); | |
| if (!process.env.TEST_SUITE_SELF_HOSTED) { | |
| // describe("Ad blocking (f-e dependant)", () => { | |
| // it.concurrent("blocks ads by default", async () => { | |
| // const response = await scrape({ | |
| // url: "https://www.allrecipes.com/recipe/18185/yum/", | |
| // }); | |
| // expect(response.markdown).not.toContain(".g.doubleclick.net/"); | |
| // }, 30000); | |
| // it.concurrent("doesn't block ads if explicitly disabled", async () => { | |
| // const response = await scrape({ | |
| // url: "https://www.allrecipes.com/recipe/18185/yum/", | |
| // blockAds: false, | |
| // }); | |
| // expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//); | |
| // }, 30000); | |
| // }); | |
| describe("Change Tracking format", () => { | |
| it.concurrent("works", async () => { | |
| const response = await scrape({ | |
| url: "https://example.com", | |
| formats: ["markdown", "changeTracking"], | |
| }); | |
| expect(response.changeTracking).toBeDefined(); | |
| expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); | |
| }, 30000); | |
| it.concurrent("includes git diff when requested", async () => { | |
| const response = await scrape({ | |
| url: "https://example.com", | |
| formats: ["markdown", "changeTracking"], | |
| changeTrackingOptions: { | |
| modes: ["git-diff"] | |
| } | |
| }); | |
| expect(response.changeTracking).toBeDefined(); | |
| expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); | |
| if (response.changeTracking?.changeStatus === "changed") { | |
| expect(response.changeTracking?.diff).toBeDefined(); | |
| expect(response.changeTracking?.diff?.text).toBeDefined(); | |
| expect(response.changeTracking?.diff?.json).toBeDefined(); | |
| expect(response.changeTracking?.diff?.json.files).toBeInstanceOf(Array); | |
| } | |
| }, 30000); | |
| it.concurrent("includes structured output when requested", async () => { | |
| const response = await scrape({ | |
| url: "https://example.com", | |
| formats: ["markdown", "changeTracking"], | |
| changeTrackingOptions: { | |
| modes: ["json"], | |
| prompt: "Summarize the changes between the previous and current content", | |
| } | |
| }); | |
| expect(response.changeTracking).toBeDefined(); | |
| expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); | |
| if (response.changeTracking?.changeStatus === "changed") { | |
| expect(response.changeTracking?.json).toBeDefined(); | |
| } | |
| }, 30000); | |
| it.concurrent("supports schema-based extraction for change tracking", async () => { | |
| const response = await scrape({ | |
| url: "https://example.com", | |
| formats: ["markdown", "changeTracking"], | |
| changeTrackingOptions: { | |
| modes: ["json"], | |
| schema: { | |
| type: "object", | |
| properties: { | |
| pricing: { | |
| type: "object", | |
| properties: { | |
| amount: { type: "number" }, | |
| currency: { type: "string" } | |
| } | |
| }, | |
| features: { | |
| type: "array", | |
| items: { type: "string" } | |
| } | |
| } | |
| } | |
| } | |
| }); | |
| expect(response.changeTracking).toBeDefined(); | |
| expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); | |
| if (response.changeTracking?.changeStatus === "changed") { | |
| expect(response.changeTracking?.json).toBeDefined(); | |
| if (response.changeTracking?.json.pricing) { | |
| expect(response.changeTracking?.json.pricing).toHaveProperty("old"); | |
| expect(response.changeTracking?.json.pricing).toHaveProperty("new"); | |
| } | |
| if (response.changeTracking?.json.features) { | |
| expect(response.changeTracking?.json.features).toHaveProperty("old"); | |
| expect(response.changeTracking?.json.features).toHaveProperty("new"); | |
| } | |
| } | |
| }, 30000); | |
| it.concurrent("supports both git-diff and structured modes together", async () => { | |
| const response = await scrape({ | |
| url: "https://example.com", | |
| formats: ["markdown", "changeTracking"], | |
| changeTrackingOptions: { | |
| modes: ["git-diff", "json"], | |
| schema: { | |
| type: "object", | |
| properties: { | |
| summary: { type: "string" }, | |
| changes: { type: "array", items: { type: "string" } } | |
| } | |
| } | |
| } | |
| }); | |
| expect(response.changeTracking).toBeDefined(); | |
| expect(response.changeTracking?.previousScrapeAt).not.toBeNull(); | |
| if (response.changeTracking?.changeStatus === "changed") { | |
| expect(response.changeTracking?.diff).toBeDefined(); | |
| expect(response.changeTracking?.diff?.text).toBeDefined(); | |
| expect(response.changeTracking?.diff?.json).toBeDefined(); | |
| expect(response.changeTracking?.json).toBeDefined(); | |
| expect(response.changeTracking?.json).toHaveProperty("summary"); | |
| expect(response.changeTracking?.json).toHaveProperty("changes"); | |
| } | |
| }, 30000); | |
| }); | |
| describe("Location API (f-e dependant)", () => { | |
| it.concurrent("works without specifying an explicit location", async () => { | |
| await scrape({ | |
| url: "https://iplocation.com", | |
| }); | |
| }, 30000); | |
| it.concurrent("works with country US", async () => { | |
| const response = await scrape({ | |
| url: "https://iplocation.com", | |
| location: { country: "US" }, | |
| }); | |
| expect(response.markdown).toContain("| Country | United States |"); | |
| }, 30000); | |
| }); | |
| describe("Screenshot (f-e/sb dependant)", () => { | |
| it.concurrent("screenshot format works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev", | |
| formats: ["screenshot"] | |
| }); | |
| expect(typeof response.screenshot).toBe("string"); | |
| }, 30000); | |
| it.concurrent("screenshot@fullPage format works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev", | |
| formats: ["screenshot@fullPage"] | |
| }); | |
| expect(typeof response.screenshot).toBe("string"); | |
| }, 30000); | |
| }); | |
| describe("Proxy API (f-e dependant)", () => { | |
| it.concurrent("undefined works", async () => { | |
| await scrape({ | |
| url: "http://firecrawl.dev", | |
| }); | |
| }, 30000); | |
| it.concurrent("basic works", async () => { | |
| await scrape({ | |
| url: "http://firecrawl.dev", | |
| proxy: "basic", | |
| }); | |
| }, 30000); | |
| it.concurrent("stealth works", async () => { | |
| await scrape({ | |
| url: "http://firecrawl.dev", | |
| proxy: "stealth", | |
| timeout: 120000, | |
| }); | |
| }, 130000); | |
| }); | |
| // Temporarily disabled, too flaky | |
| // describe("PDF (f-e dependant)", () => { | |
| // it.concurrent("works for PDFs behind anti-bot", async () => { | |
| // const response = await scrape({ | |
| // url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf" | |
| // }); | |
| // expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix"); | |
| // }, 60000); | |
| // }); | |
| } | |
| if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) { | |
| describe("JSON format", () => { | |
| it.concurrent("works", async () => { | |
| const response = await scrape({ | |
| url: "http://firecrawl.dev", | |
| formats: ["json"], | |
| jsonOptions: { | |
| prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.", | |
| schema: { | |
| type: "object", | |
| properties: { | |
| company_mission: { | |
| type: "string", | |
| }, | |
| supports_sso: { | |
| type: "boolean", | |
| }, | |
| is_open_source: { | |
| type: "boolean", | |
| }, | |
| }, | |
| required: ["company_mission", "supports_sso", "is_open_source"], | |
| }, | |
| }, | |
| }); | |
| expect(response).toHaveProperty("json"); | |
| expect(response.json).toHaveProperty("company_mission"); | |
| expect(typeof response.json.company_mission).toBe("string"); | |
| expect(response.json).toHaveProperty("supports_sso"); | |
| expect(response.json.supports_sso).toBe(false); | |
| expect(typeof response.json.supports_sso).toBe("boolean"); | |
| expect(response.json).toHaveProperty("is_open_source"); | |
| expect(response.json.is_open_source).toBe(true); | |
| expect(typeof response.json.is_open_source).toBe("boolean"); | |
| }, 30000); | |
| }); | |
| } | |
| }); | |