import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import {
  defaultCrawlPageOptions,
  defaultCrawlerOptions,
  defaultOrigin,
} from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../../src/lib/logger";
import {
  addCrawlJob,
  addCrawlJobs,
  crawlToCrawler,
  finishCrawlKickoff,
  lockURL,
  lockURLs,
  saveCrawl,
  StoredCrawl,
} from "../../../src/lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

/**
 * Finds the first entry of `patterns` that cannot be compiled by `RegExp`.
 *
 * @param patterns - Candidate list of regex sources; anything that is not an
 *   array is treated as "nothing to validate".
 * @returns The message of the error thrown by `new RegExp(...)` for the first
 *   invalid entry, or `null` when every entry compiles (or there is no array).
 */
function findInvalidRegexMessage(patterns: unknown): string | null {
  if (!Array.isArray(patterns)) {
    return null;
  }
  for (const pattern of patterns) {
    try {
      new RegExp(pattern);
    } catch (e) {
      return e instanceof Error ? e.message : String(e);
    }
  }
  return null;
}

/**
 * Express handler that kicks off a crawl from a legacy-format request body
 * (`url`, `crawlerOptions`, `pageOptions`, `origin`).
 *
 * Responses produced by this handler:
 * - `auth.status` with `{ error }` when `authenticateUser` fails.
 * - `409` when an `x-idempotency-key` header is present but fails validation.
 * - `400` for an invalid `includes`/`excludes` regex, a missing/non-string
 *   URL, or an "Invalid URL" error from `checkAndUpdateURL`.
 * - `402` when `checkTeamCredits` reports failure.
 * - `403` when `isUrlBlocked` returns true for the URL.
 * - `500` for any other thrown error (reported to Sentry and logged).
 * - Otherwise `{ jobId }`, where `jobId` is the id of the newly stored crawl.
 *
 * @param req - Incoming Express request.
 * @param res - Express response used to send the result.
 */
export async function crawlController(req: Request, res: Response) {
  try {
    const auth = await authenticateUser(req, res, RateLimiterMode.Crawl);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }

    const { team_id, chunk } = auth;

    // Fire-and-forget bookkeeping: a failure here is logged but must not fail
    // the request.
    redisConnection.sadd("teams_using_v0", team_id)
      .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // Awaited so that an asynchronous failure is caught by this try/catch
        // instead of escaping as an unhandled rejection. (If the helper is
        // synchronous, `await` is a no-op.)
        await createIdempotencyKey(req);
      } catch (error) {
        logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const crawlerOptions = {
      ...defaultCrawlerOptions,
      ...req.body.crawlerOptions,
    };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    // Reject the request early if any include/exclude pattern is not a valid
    // regular expression; `includes` is checked before `excludes`.
    const invalidPatternMessage =
      findInvalidRegexMessage(crawlerOptions.includes) ??
      findInvalidRegexMessage(crawlerOptions.excludes);
    if (invalidPatternMessage !== null) {
      return res.status(400).json({ error: invalidPatternMessage });
    }

    // Only the limit explicitly sent by the client is used for the credit
    // check; requests without one are checked against a single credit.
    const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
    const {
      success: creditsCheckSuccess,
      remainingCredits,
    } = await checkTeamCredits(chunk, team_id, limitCheck);

    if (!creditsCheckSuccess) {
      return res.status(402).json({
        error:
          "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com",
      });
    }

    // TODO: need to do this to v1
    crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

    // NOTE(review): `parse` throws a ZodError on invalid input, which is
    // handled by the outer catch and currently answered with status 500.
    let url = urlSchema.parse(req.body.url);
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    if (typeof url !== "string") {
      return res.status(400).json({ error: "URL must be a string" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }

    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error: BLOCKLISTED_URL_MESSAGE,
      });
    }

    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
    //   try {
    //     const a = new WebScraperDataProvider();
    //     await a.setOptions({
    //       jobId: uuidv4(),
    //       mode: "single_urls",
    //       urls: [url],
    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
    //       pageOptions: pageOptions,
    //     });

    //     const docs = await a.getDocuments(false, (progress) => {
    //       job.updateProgress({
    //         current: progress.current,
    //         total: progress.total,
    //         current_step: "SCRAPING",
    //         current_url: progress.currentDocumentUrl,
    //       });
    //     });
    //     return res.json({
    //       success: true,
    //       documents: docs,
    //     });
    //   } catch (error) {
    //     logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }

    const id = uuidv4();

    await logCrawl(id, team_id);

    const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
      pageOptions,
      undefined,
      undefined,
      team_id
    );
    internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter

    delete (scrapeOptions as any).timeout;

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      scrapeOptions,
      internalOptions,
      team_id,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    // Best effort: a failure to fetch robots.txt is deliberately ignored and
    // the crawl is stored without `robots`.
    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {}

    await saveCrawl(id, sc);

    await finishCrawlKickoff(id);

    // Same origin for every job of this crawl; resolved once up front.
    const origin = req.body.origin ?? defaultOrigin;

    const sitemap = sc.crawlerOptions.ignoreSitemap
      ? 0
      : await crawler.tryGetSitemap(async (urls) => {
          if (urls.length === 0) return;

          const jobPriority = await getJobPriority({
            team_id,
            basePriority: 21,
          });
          const jobs = urls.map((url) => {
            const uuid = uuidv4();
            return {
              name: uuid,
              data: {
                url,
                mode: "single_urls",
                crawlerOptions,
                scrapeOptions,
                internalOptions,
                team_id,
                origin,
                crawl_id: id,
                sitemapped: true,
              },
              opts: {
                jobId: uuid,
                priority: jobPriority,
              },
            };
          });

          await lockURLs(
            id,
            sc,
            jobs.map((x) => x.data.url),
          );
          await addCrawlJobs(
            id,
            jobs.map((x) => x.opts.jobId),
          );
          for (const job of jobs) {
            // add with sentry instrumentation
            // Forward the computed priority (previously an empty options
            // object was passed, so `opts.priority` was never used). This
            // mirrors how the seed job below passes `{ priority }`.
            // NOTE(review): confirm addScrapeJob honours `options.priority`.
            await addScrapeJob(
              job.data as any,
              { priority: job.opts.priority },
              job.opts.jobId,
            );
          }
        });

    // `sitemap === 0` covers both "sitemap ignored" and a 0 result from
    // tryGetSitemap (presumably "no sitemap URLs found" — verify against the
    // crawler); in that case the crawl is seeded with the origin URL itself.
    if (sitemap === 0) {
      await lockURL(id, sc, url);

      // Not needed, first one should be 15.
      // const jobPriority = await getJobPriority({team_id, basePriority: 10})

      const jobId = uuidv4();
      await addScrapeJob(
        {
          url,
          mode: "single_urls",
          crawlerOptions,
          scrapeOptions,
          internalOptions,
          team_id,
          origin,
          crawl_id: id,
        },
        {
          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
        },
        jobId,
      );
      await addCrawlJob(id, jobId);
    }

    res.json({ jobId: id });
  } catch (error) {
    Sentry.captureException(error);
    logger.error(error);
    return res.status(500).json({
      error: error instanceof ZodError ? "Invalid URL" : error.message,
    });
  }
}