Spaces:
Paused
Paused
| import { Logger } from "winston"; | |
| import * as Sentry from "@sentry/node"; | |
| import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types"; | |
| import { logger as _logger } from "../../lib/logger"; | |
| import { | |
| buildFallbackList, | |
| Engine, | |
| EngineScrapeResult, | |
| FeatureFlag, | |
| scrapeURLWithEngine, | |
| } from "./engines"; | |
| import { parseMarkdown } from "../../lib/html-to-markdown"; | |
| import { | |
| ActionError, | |
| AddFeatureError, | |
| EngineError, | |
| NoEnginesLeftError, | |
| PDFAntibotError, | |
| RemoveFeatureError, | |
| SiteError, | |
| TimeoutError, | |
| UnsupportedFileError, | |
| } from "./error"; | |
| import { executeTransformers } from "./transformers"; | |
| import { LLMRefusalError } from "./transformers/llmExtract"; | |
| import { urlSpecificParams } from "./lib/urlSpecificParams"; | |
| import { loadMock, MockState } from "./lib/mock"; | |
// Outcome of a scrapeURL call: the scraped Document on success, or the causing
// error on failure. Both variants carry the accumulated scrape logs and the
// per-engine attempt tracker for diagnostics.
export type ScrapeUrlResponse = (
  | {
      success: true;
      document: Document;
    }
  | {
      success: false;
      error: any;
    }
) & {
  logs: any[];
  engines: EngineResultsTracker;
};
// All state required to perform a scrape, threaded through the engine and
// transformer layers. Treated as immutable except for `logs`, `featureFlags`,
// and `pdfPrefetch`, which the retry loop in scrapeURL mutates.
export type Meta = {
  id: string; // scrape ID, used for log correlation
  url: string;
  options: ScrapeOptions;
  internalOptions: InternalOptions;
  logger: Logger; // child logger pre-tagged with scrape ID and URL
  logs: any[]; // log entries accumulated while scraping
  featureFlags: Set<FeatureFlag>; // required engine features; input to buildFallbackList
  mock: MockState | null; // non-null when replaying recorded mock data
  pdfPrefetch: {
    filePath: string;
    url?: string;
    status: number;
  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
| function buildFeatureFlags( | |
| url: string, | |
| options: ScrapeOptions, | |
| internalOptions: InternalOptions, | |
| ): Set<FeatureFlag> { | |
| const flags: Set<FeatureFlag> = new Set(); | |
| if (options.actions !== undefined) { | |
| flags.add("actions"); | |
| } | |
| if (options.formats.includes("screenshot")) { | |
| flags.add("screenshot"); | |
| } | |
| if (options.formats.includes("screenshot@fullPage")) { | |
| flags.add("screenshot@fullScreen"); | |
| } | |
| if (options.waitFor !== 0) { | |
| flags.add("waitFor"); | |
| } | |
| if (internalOptions.atsv) { | |
| flags.add("atsv"); | |
| } | |
| if (options.location || options.geolocation) { | |
| flags.add("location"); | |
| } | |
| if (options.mobile) { | |
| flags.add("mobile"); | |
| } | |
| if (options.skipTlsVerification) { | |
| flags.add("skipTlsVerification"); | |
| } | |
| if (options.fastMode) { | |
| flags.add("useFastMode"); | |
| } | |
| if (options.proxy === "stealth") { | |
| flags.add("stealthProxy"); | |
| } | |
| const urlO = new URL(url); | |
| if (urlO.pathname.endsWith(".pdf")) { | |
| flags.add("pdf"); | |
| } | |
| if (urlO.pathname.endsWith(".docx")) { | |
| flags.add("docx"); | |
| } | |
| return flags; | |
| } | |
| // The meta object contains all required information to perform a scrape. | |
| // For example, the scrape ID, URL, options, feature flags, logs that occur while scraping. | |
| // The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required) | |
| // Having a meta object that is treated as immutable helps the code stay clean and easily tracable, | |
| // while also retaining the benefits that WebScraper had from its OOP design. | |
| async function buildMetaObject( | |
| id: string, | |
| url: string, | |
| options: ScrapeOptions, | |
| internalOptions: InternalOptions, | |
| ): Promise<Meta> { | |
| const specParams = | |
| urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; | |
| if (specParams !== undefined) { | |
| options = Object.assign(options, specParams.scrapeOptions); | |
| internalOptions = Object.assign( | |
| internalOptions, | |
| specParams.internalOptions, | |
| ); | |
| } | |
| const logger = _logger.child({ | |
| module: "ScrapeURL", | |
| scrapeId: id, | |
| scrapeURL: url, | |
| }); | |
| const logs: any[] = []; | |
| return { | |
| id, | |
| url, | |
| options, | |
| internalOptions, | |
| logger, | |
| logs, | |
| featureFlags: buildFeatureFlags(url, options, internalOptions), | |
| mock: | |
| options.useMock !== undefined | |
| ? await loadMock(options.useMock, _logger) | |
| : null, | |
| pdfPrefetch: undefined, | |
| }; | |
| } | |
// Options used by internal callers of scrapeURL that are not part of the
// public scrape API surface.
export type InternalOptions = {
  teamId: string; // team performing the scrape
  priority?: number; // Passed along to fire-engine
  forceEngine?: Engine | Engine[]; // bypass fallback selection; also disables feature-flag retry handling in scrapeURL
  atsv?: boolean; // anti-bot solver, beta
  v0CrawlOnlyUrls?: boolean;
  v0DisableJsDom?: boolean;
  useCache?: boolean;
  disableSmartWaitCache?: boolean; // Passed along to fire-engine
  isBackgroundIndex?: boolean;
  fromCache?: boolean; // Indicates if the document was retrieved from cache
  abort?: AbortSignal; // checked before each engine attempt in scrapeURLLoop
  urlInvisibleInCurrentCrawl?: boolean;
};
// Per-engine record of every attempt made during a scrape: the terminal state
// of each engine tried (error / success / timeout) plus wall-clock timing.
export type EngineResultsTracker = {
  [E in Engine]?: (
    | {
        state: "error";
        error: any;
        unexpected: boolean; // false for expected EngineError failures, true otherwise
      }
    | {
        state: "success";
        result: EngineScrapeResult & { markdown: string };
        factors: Record<string, boolean>; // quality signals (content length, status code, page error)
        unsupportedFeatures: Set<FeatureFlag>; // requested features this engine could not honor
      }
    | {
        state: "timeout";
      }
  ) & {
    startedAt: number; // epoch ms when the engine attempt began
    finishedAt: number; // epoch ms when the attempt ended
  };
};
// A successful engine result together with which engine produced it and which
// requested features that engine could not support.
export type EngineScrapeResultWithContext = {
  engine: Engine;
  unsupportedFeatures: Set<FeatureFlag>;
  result: EngineScrapeResult & { markdown: string };
};
| function safeguardCircularError<T>(error: T): T { | |
| if (typeof error === "object" && error !== null && (error as any).results) { | |
| const newError = structuredClone(error); | |
| delete (newError as any).results; | |
| return newError; | |
| } else { | |
| return error; | |
| } | |
| } | |
/**
 * Performs a single scrape attempt: walks the engine fallback list in order,
 * records each engine's outcome in an EngineResultsTracker, and returns the
 * first acceptable result after running it through the transformer pipeline.
 *
 * Feature-flag correction errors (AddFeatureError / RemoveFeatureError) and
 * fatal errors (SiteError, ActionError, UnsupportedFileError, PDFAntibotError,
 * TimeoutSignal, LLMRefusalError) are re-thrown for scrapeURL to handle.
 */
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);

  // TODO: handle sitemap data, see WebScraper/index.ts:280
  // TODO: ScrapeEvents

  const fallbackList = buildFallbackList(meta);

  const results: EngineResultsTracker = {};
  let result: EngineScrapeResultWithContext | null = null;

  // Per-engine time budget: an explicit timeout is split evenly across at most
  // the first two engines. With no explicit timeout, plain scrapes (no actions
  // and no LLM extraction) get a 120s default split the same way; otherwise
  // the engine runs with no time limit (timeToRun === undefined).
  const timeToRun =
    meta.options.timeout !== undefined
      ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
      : (!meta.options.actions && !meta.options.jsonOptions && !meta.options.extract)
        ? Math.round(120000 / Math.min(fallbackList.length, 2))
        : undefined;

  for (const { engine, unsupportedFeatures } of fallbackList) {
    // Bail out promptly if the caller aborted the scrape.
    meta.internalOptions.abort?.throwIfAborted();
    const startedAt = Date.now();
    try {
      meta.logger.info("Scraping via " + engine + "...");
      const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
      if (_engineResult.markdown === undefined) {
        // This engine did not emit Markdown directly — derive it from the HTML.
        _engineResult.markdown = await parseMarkdown(_engineResult.html);
      }
      const engineResult = _engineResult as EngineScrapeResult & {
        markdown: string;
      };

      // Success factors
      const isLongEnough = engineResult.markdown.length > 0;
      const isGoodStatusCode =
        (engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
        engineResult.statusCode === 304;
      const hasNoPageError = engineResult.error === undefined;

      results[engine] = {
        state: "success",
        result: engineResult,
        factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        unsupportedFeatures,
        startedAt,
        finishedAt: Date.now(),
      };

      // NOTE: TODO: what to do when status code is bad is tough...
      // we cannot just rely on text because error messages can be brief and not hit the limit
      // should we just use all the fallbacks and pick the one with the longest text? - mogery
      // Accept the result when it has content, or when the status code itself
      // signals a page-level error (another engine won't fix e.g. a 404).
      if (isLongEnough || !isGoodStatusCode) {
        meta.logger.info("Scrape via " + engine + " deemed successful.", {
          factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        });
        result = {
          engine,
          unsupportedFeatures,
          result: engineResult as EngineScrapeResult & { markdown: string },
        };
        break;
      }
    } catch (error) {
      if (error instanceof EngineError) {
        // Expected engine-level failure: record it and fall through to the
        // next engine in the fallback list.
        meta.logger.info("Engine " + engine + " could not scrape the page.", {
          error,
        });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: false,
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof TimeoutError) {
        // Engine exceeded its time budget: record and try the next engine.
        meta.logger.info("Engine " + engine + " timed out while scraping.", {
          error,
        });
        results[engine] = {
          state: "timeout",
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (
        error instanceof AddFeatureError ||
        error instanceof RemoveFeatureError
      ) {
        // Feature-flag corrections are handled by the retry loop in scrapeURL.
        throw error;
      } else if (error instanceof LLMRefusalError) {
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
        // Attach the tracker so the outer handler can include it in the response.
        error.results = results;
        meta.logger.warn("LLM refusal encountered", { error });
        throw error;
      } else if (error instanceof SiteError) {
        throw error;
      } else if (error instanceof ActionError) {
        throw error;
      } else if (error instanceof UnsupportedFileError) {
        throw error;
      } else if (error instanceof PDFAntibotError) {
        throw error;
      } else if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        // Truly unexpected failure: report it, record it, and keep trying the
        // remaining engines.
        Sentry.captureException(error);
        meta.logger.warn(
          "An unexpected error happened while scraping with " + engine + ".",
          { error },
        );
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
      }
    }
  }

  if (result === null) {
    // Every engine failed, timed out, or produced an unacceptable result.
    throw new NoEnginesLeftError(
      fallbackList.map((x) => x.engine),
      results,
    );
  }

  // Assemble the public Document from the winning engine's result.
  let document: Document = {
    markdown: result.result.markdown,
    rawHtml: result.result.html,
    screenshot: result.result.screenshot,
    actions: result.result.actions,
    metadata: {
      sourceURL: meta.url,
      url: result.result.url,
      statusCode: result.result.statusCode,
      error: result.result.error,
    },
  };

  if (result.unsupportedFeatures.size > 0) {
    // Surface a partial-scrape warning to the caller; append to any existing
    // warning rather than overwriting it.
    const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
    meta.logger.warn(warning, {
      engine: result.engine,
      unsupportedFeatures: result.unsupportedFeatures,
    });
    document.warning =
      document.warning !== undefined
        ? document.warning + " " + warning
        : warning;
  }

  document = await executeTransformers(meta, document);

  return {
    success: true,
    document,
    logs: meta.logs,
    engines: results,
  };
}
/**
 * Scrapes a single URL and resolves with a ScrapeUrlResponse on both success
 * and failure — the only error re-thrown to callers is TimeoutSignal.
 *
 * Wraps scrapeURLLoop in a retry loop: when a scrape attempt reports that the
 * feature-flag set was wrong (AddFeatureError, RemoveFeatureError, or
 * PDFAntibotError) and no engine is forced, the flags on the meta object are
 * adjusted and the loop runs again.
 */
export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Promise<ScrapeUrlResponse> {
  const meta = await buildMetaObject(id, url, options, internalOptions);
  try {
    while (true) {
      try {
        return await scrapeURLLoop(meta);
      } catch (error) {
        if (
          error instanceof AddFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // A scraper found the page needs more features than requested
          // (e.g. it turned out to be a PDF): widen the flags and retry.
          meta.logger.debug(
            "More feature flags requested by scraper: adding " +
              error.featureFlags.join(", "),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].concat(error.featureFlags),
          );
          if (error.pdfPrefetch) {
            meta.pdfPrefetch = error.pdfPrefetch;
          }
        } else if (
          error instanceof RemoveFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // The opposite case: flags were set that the page does not actually
          // need — drop them and retry.
          meta.logger.debug(
            "Incorrect feature flags reported by scraper: removing " +
              error.featureFlags.join(","),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].filter(
              (x) => !error.featureFlags.includes(x),
            ),
          );
        } else if (
          error instanceof PDFAntibotError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // PDF blocked by anti-bot. pdfPrefetch !== undefined means a
          // prefetch was already attempted (null = came back empty), so a
          // retry cannot help. Otherwise drop the "pdf" flag and retry.
          // NOTE(review): the log implies the chrome-cdp prefetch happens
          // downstream once "pdf" is removed — confirm in the engine layer.
          if (meta.pdfPrefetch !== undefined) {
            meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
            throw error;
          } else {
            meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
            meta.featureFlags = new Set(
              [...meta.featureFlags].filter(
                (x) => x !== "pdf",
              ),
            );
          }
        } else {
          throw error;
        }
      }
    }
  } catch (error) {
    // Map known failure modes to a structured failure response; anything
    // unrecognized is reported to Sentry. Only TimeoutSignal propagates.
    let results: EngineResultsTracker = {};

    if (error instanceof NoEnginesLeftError) {
      meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
      results = error.results;
    } else if (error instanceof LLMRefusalError) {
      meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
      results = error.results!; // set by scrapeURLLoop before rethrowing
    } else if (
      error instanceof Error &&
      error.message.includes("Invalid schema for response_format")
    ) {
      // TODO: seperate into custom error
      meta.logger.warn("scrapeURL: LLM schema error", { error });
      // TODO: results?
    } else if (error instanceof SiteError) {
      meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
    } else if (error instanceof ActionError) {
      meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
    } else if (error instanceof UnsupportedFileError) {
      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
        error,
      });
    } else if (error instanceof TimeoutSignal) {
      throw error;
    } else {
      Sentry.captureException(error);
      meta.logger.error("scrapeURL: Unexpected error happened", { error });
      // TODO: results?
    }

    return {
      success: false,
      error,
      logs: meta.logs,
      engines: results,
    };
  }
}