import {
  Document as V0Document,
  ExtractorOptions,
  PageOptions,
} from "../../lib/entities";
import { Request, Response } from "express";
import {
  billTeam,
  checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
  fromLegacyCombo,
  fromLegacyScrapeOptions,
  toLegacyDocument,
  url as urlSchema,
} from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
  defaultPageOptions,
  defaultExtractorOptions,
  defaultTimeout,
  defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { getJobFromGCS } from "../../lib/gcs-jobs";
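
/**
 * Runs a single v0 scrape: validates the URL, enqueues a scrape job, waits
 * for the worker to finish it, and maps the result back to the legacy v0
 * Document shape.
 */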
export async function scrapeHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
): Promise<{
  success: boolean;
  error?: string;
  data?: V0Document | { url: string };
  returnCode: number;
}> {
  // urlSchema.parse throws a ZodError on malformed input, which the
  // controller's catch block maps to an "Invalid URL" response.
  const url = urlSchema.parse(req.body.url);
  if (typeof url !== "string") {
    return { success: false, error: "Url is required", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return {
      success: false,
      error: BLOCKLISTED_URL_MESSAGE,
      returnCode: 403,
    };
  }
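
  // Compute the queue priority for this team and translate the legacy v0
  // options into the v1 scrape/internal option shapes used by the queue.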
  const jobPriority = await getJobPriority({ team_id, basePriority: 10 });
  const { scrapeOptions, internalOptions } = fromLegacyCombo(
    pageOptions,
    extractorOptions,
    timeout,
    crawlerOptions,
    team_id,
  );
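
  // Enqueue the scrape as a single-URL job under the generated job ID.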
  await addScrapeJob(
    {
      url,
      mode: "single_urls",
      team_id,
      scrapeOptions,
      internalOptions,
      origin: req.body.origin ?? defaultOrigin,
      is_scrape: true,
    },
    {},
    jobId,
    jobPriority,
  );
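
  // Wait for the worker to finish the job, tracing the wait as a Sentry
  // span. Timeouts and known LLM-extraction failures become error
  // responses; anything else is rethrown.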
  let doc;
  const err = await Sentry.startSpan(
    {
      name: "Wait for job to finish",
      op: "bullmq.wait",
      attributes: { job: jobId },
    },
    async (span) => {
      try {
        doc = await waitForJob(jobId, timeout);
      } catch (e) {
        if (
          e instanceof Error &&
          (e.message.startsWith("Job wait") || e.message === "timeout")
        ) {
          span.setAttribute("timedOut", true);
          return {
            success: false,
            error: "Request timed out",
            returnCode: 408,
          };
        } else if (
          typeof e === "string" &&
          (e.includes("Error generating completions: ") ||
            e.includes("Invalid schema for function") ||
            e.includes(
              "LLM extraction did not match the extraction schema you provided.",
            ))
        ) {
          return {
            success: false,
            error: e,
            returnCode: 500,
          };
        } else {
          throw e;
        }
      }
      span.setAttribute("result", JSON.stringify(doc));
      return null;
    },
  );

  if (err !== null) {
    return err;
  }

  await getScrapeQueue().remove(jobId);

  if (!doc) {
    logger.error("Job completed but returned no document", { jobId });
    return {
      success: true,
      error: "No page found",
      returnCode: 200,
      data: doc,
    };
  }
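
  // Strip internal bookkeeping fields before returning the document.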
  delete doc.index;
  delete doc.provider;

  // Remove rawHtml if pageOptions.includeRawHtml is false and
  // extractorOptions.mode is llm-extraction-from-raw-html
  if (
    !pageOptions.includeRawHtml &&
    extractorOptions.mode === "llm-extraction-from-raw-html"
  ) {
    if (doc.rawHtml) {
      delete doc.rawHtml;
    }
  }

  if (!pageOptions.includeHtml) {
    if (doc.html) {
      delete doc.html;
    }
  }

  return {
    success: true,
    data: toLegacyDocument(doc, internalOptions),
    returnCode: 200,
  };
}
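
/**
 * Express handler for the v0 /scrape endpoint: authenticates the caller,
 * checks team credits, runs the scrape via scrapeHelper, bills any
 * LLM-extraction credits, and shapes the response per the page options.
 */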
export async function scrapeController(req: Request, res: Response) {
  try {
    let earlyReturn = false;
    // make sure to authenticate user first, Bearer <token>
    const auth = await authenticateUser(req, res, RateLimiterMode.Scrape);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }

    const { team_id, chunk } = auth;
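
    // Track v0 API usage per team (fire-and-forget; failures are only logged).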
    redisConnection
      .sadd("teams_using_v0", team_id)
      .catch((error) =>
        logger.error("Failed to add team to teams_using_v0", {
          error,
          team_id,
        }),
      );

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = {
      ...defaultExtractorOptions,
      ...req.body.extractorOptions,
    };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;

    if (extractorOptions.mode.includes("llm-extraction")) {
      if (
        typeof extractorOptions.extractionSchema !== "object" ||
        extractorOptions.extractionSchema === null
      ) {
        return res.status(400).json({
          error:
            "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
        });
      }
      // LLM extraction only needs the main content and gets a longer
      // default timeout.
      pageOptions.onlyMainContent = true;
      timeout = req.body.timeout ?? 90000;
    }

    // Verify the team has at least one credit before running the scrape.
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(chunk, team_id, 1);
      if (!creditsCheckSuccess) {
        earlyReturn = true;
        return res.status(402).json({
          error:
            "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing",
        });
      }
    } catch (error) {
      logger.error(error);
      earlyReturn = true;
      return res.status(500).json({
        error:
          "Error checking team credits. Please contact help@firecrawl.com for help.",
      });
    }
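
    // Run the scrape under a fresh job ID and measure wall-clock duration.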
    const jobId = uuidv4();
    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    const numTokens =
      result.data && (result.data as V0Document).markdown
        ? numTokensFromString(
            (result.data as V0Document).markdown!,
            "gpt-3.5-turbo",
          )
        : 0;
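
    // The base scrape credit is billed when the queue job completes; only
    // the extra LLM-extraction credits are billed here.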
    if (result.success) {
      let creditsToBeBilled = 1;
      const creditsPerLLMExtract = 4;

      if (extractorOptions.mode.includes("llm-extraction")) {
        creditsToBeBilled += creditsPerLLMExtract;
      }

      if (earlyReturn) {
        // Don't bill if we're early returning
        return;
      }
      if (creditsToBeBilled > 0) {
        // billing for doc done on queue end, bill only for llm extraction
        billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
          (error) => {
            logger.error(
              `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
              { error },
            );
            // Optionally, you could notify an admin or add to a retry queue here
          },
        );
      }
    }
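
    // Honor includeRawHtml / includeMarkdown flags before responding.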
    let doc = result.data;
    if (!pageOptions || !pageOptions.includeRawHtml) {
      if (doc && (doc as V0Document).rawHtml) {
        delete (doc as V0Document).rawHtml;
      }
    }

    if (pageOptions && pageOptions.includeExtract) {
      if (
        !pageOptions.includeMarkdown &&
        doc &&
        (doc as V0Document).markdown
      ) {
        delete (doc as V0Document).markdown;
      }
    }

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    logger.error("Scrape error occurred", { error });
    return res.status(500).json({
      error:
        error instanceof ZodError
          ? "Invalid URL"
          : typeof error === "string"
            ? error
            : error instanceof Error
              ? error.message
              : "Internal Server Error",
    });
  }
}