import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import {
  Document,
  ScrapeOptions,
  TimeoutSignal,
} from "../../controllers/v1/types";
import { logger as _logger } from "../../lib/logger";
import {
  buildFallbackList,
  Engine,
  EngineScrapeResult,
  FeatureFlag,
  scrapeURLWithEngine,
} from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import {
  ActionError,
  AddFeatureError,
  EngineError,
  NoEnginesLeftError,
  PDFAntibotError,
  RemoveFeatureError,
  SiteError,
  TimeoutError,
  UnsupportedFileError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
import { loadMock, MockState } from "./lib/mock";

/**
 * Outcome of a scrape: either a document or an error, always accompanied by
 * the collected logs and the per-engine results tracker.
 */
export type ScrapeUrlResponse = (
  | {
      success: true;
      document: Document;
    }
  | {
      success: false;
      error: any;
    }
) & {
  logs: any[];
  engines: EngineResultsTracker;
};

/** Everything a single scrape needs; see the comment above `buildMetaObject`. */
export type Meta = {
  id: string;
  url: string;
  options: ScrapeOptions;
  internalOptions: InternalOptions;
  logger: Logger;
  logs: any[];
  featureFlags: Set<FeatureFlag>;
  mock: MockState | null;
  pdfPrefetch:
    | {
        filePath: string;
        url?: string;
        status: number;
      }
    | null
    | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};

/**
 * Derives the initial set of feature flags for a scrape from the request
 * options, the internal options and the URL's path extension.
 *
 * @param url - URL to be scraped; must be parseable by `new URL()`.
 * @param options - Scrape options supplied by the caller.
 * @param internalOptions - Internal options (only `atsv` is read here).
 * @returns The set of flags implied by the inputs.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
function buildFeatureFlags(
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Set<FeatureFlag> {
  const flags: Set<FeatureFlag> = new Set();

  if (options.actions !== undefined) {
    flags.add("actions");
  }

  if (options.formats.includes("screenshot")) {
    flags.add("screenshot");
  }

  // The "screenshot@fullPage" format maps onto the "screenshot@fullScreen" flag.
  if (options.formats.includes("screenshot@fullPage")) {
    flags.add("screenshot@fullScreen");
  }

  if (options.waitFor !== 0) {
    flags.add("waitFor");
  }

  if (internalOptions.atsv) {
    flags.add("atsv");
  }

  if (options.location || options.geolocation) {
    flags.add("location");
  }

  if (options.mobile) {
    flags.add("mobile");
  }

  if (options.skipTlsVerification) {
    flags.add("skipTlsVerification");
  }

  if (options.fastMode) {
    flags.add("useFastMode");
  }

  if (options.proxy === "stealth") {
    flags.add("stealthProxy");
  }

  // File-type flags are decided by the (case-sensitive) path suffix only.
  const urlO = new URL(url);

  if (urlO.pathname.endsWith(".pdf")) {
    flags.add("pdf");
  }

  if (urlO.pathname.endsWith(".docx")) {
    flags.add("docx");
  }

  return flags;
}

// The meta object contains all required information to perform a scrape.
// For example, the scrape ID, URL, options, feature flags, logs that occur while scraping.
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
// Having a meta object that is treated as immutable helps the code stay clean and easily traceable,
// while also retaining the benefits that WebScraper had from its OOP design.
/**
 * Builds the `Meta` object for one scrape.
 *
 * If `urlSpecificParams` has an entry for the URL's hostname (with a leading
 * "www." stripped), its `scrapeOptions` / `internalOptions` are merged in via
 * `Object.assign`, which mutates the `options` / `internalOptions` objects
 * passed by the caller.
 *
 * @param id - Scrape ID; attached to the child logger as `scrapeId`.
 * @param url - URL to scrape.
 * @param options - Scrape options (may be mutated, see above).
 * @param internalOptions - Internal options (may be mutated, see above).
 * @returns The assembled meta object with `pdfPrefetch` left `undefined`.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
async function buildMetaObject(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Promise<Meta> {
  const specParams =
    urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
  if (specParams !== undefined) {
    options = Object.assign(options, specParams.scrapeOptions);
    internalOptions = Object.assign(
      internalOptions,
      specParams.internalOptions,
    );
  }

  const logger = _logger.child({
    module: "ScrapeURL",
    scrapeId: id,
    scrapeURL: url,
  });
  const logs: any[] = [];

  return {
    id,
    url,
    options,
    internalOptions,
    logger,
    logs,
    featureFlags: buildFeatureFlags(url, options, internalOptions),
    // Note: the mock is loaded with the module-level logger, not the child.
    mock:
      options.useMock !== undefined
        ? await loadMock(options.useMock, _logger)
        : null,
    pdfPrefetch: undefined,
  };
}

export type InternalOptions = {
  teamId: string;

  priority?: number; // Passed along to fire-engine
  forceEngine?: Engine | Engine[];
  atsv?: boolean; // anti-bot solver, beta

  v0CrawlOnlyUrls?: boolean;
  v0DisableJsDom?: boolean;
  useCache?: boolean;
  disableSmartWaitCache?: boolean; // Passed along to fire-engine
  isBackgroundIndex?: boolean;
  fromCache?: boolean; // Indicates if the document was retrieved from cache
  abort?: AbortSignal;
  urlInvisibleInCurrentCrawl?: boolean;
};

/** Per-engine record of what happened during a scrape attempt, with timing. */
export type EngineResultsTracker = {
  [E in Engine]?: (
    | {
        state: "error";
        error: any;
        unexpected: boolean;
      }
    | {
        state: "success";
        result: EngineScrapeResult & { markdown: string };
        factors: Record<string, boolean>;
        unsupportedFeatures: Set<FeatureFlag>;
      }
    | {
        state: "timeout";
      }
  ) & {
    startedAt: number;
    finishedAt: number;
  };
};

/** The winning engine's result together with the features it could not honor. */
export type EngineScrapeResultWithContext = {
  engine: Engine;
  unsupportedFeatures: Set<FeatureFlag>;
  result: EngineScrapeResult & { markdown: string };
};

/**
 * Returns a version of `error` that is safe to store inside a results tracker.
 *
 * When `error` is a non-null object with a truthy `results` property, a
 * `structuredClone` copy with `results` deleted is returned; any other value
 * is returned unchanged (same reference).
 *
 * @param error - Any thrown value.
 * @returns The value itself, or a clone of it without `results`.
 */
function safeguardCircularError<T>(error: T): T {
  if (typeof error === "object" && error !== null && (error as any).results) {
    const newError = structuredClone(error);
    delete (newError as any).results;
    return newError;
  } else {
    return error;
  }
}

/**
 * Runs one pass over the engine fallback list for `meta` and returns the first
 * result that is deemed successful, after running the transformers on it.
 *
 * @param meta - Scrape context; `meta.featureFlags` determines the fallback list.
 * @returns A successful `ScrapeUrlResponse`.
 * @throws NoEnginesLeftError when no engine produced an accepted result.
 * @throws AddFeatureError | RemoveFeatureError | LLMRefusalError | SiteError |
 *   ActionError | UnsupportedFileError | PDFAntibotError | TimeoutSignal
 *   rethrown as-is from the engine so the caller can react to them.
 */
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);

  // TODO: handle sitemap data, see WebScraper/index.ts:280
  // TODO: ScrapeEvents

  const fallbackList = buildFallbackList(meta);

  const results: EngineResultsTracker = {};
  let result: EngineScrapeResultWithContext | null = null;

  // Per-engine time budget: the explicit timeout (or 120s when no actions /
  // JSON / extract options are set) divided by min(#engines, 2). Otherwise no
  // budget is passed.
  const timeToRun =
    meta.options.timeout !== undefined
      ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
      : !meta.options.actions &&
          !meta.options.jsonOptions &&
          !meta.options.extract
        ? Math.round(120000 / Math.min(fallbackList.length, 2))
        : undefined;

  for (const { engine, unsupportedFeatures } of fallbackList) {
    meta.internalOptions.abort?.throwIfAborted();
    const startedAt = Date.now();
    try {
      meta.logger.info("Scraping via " + engine + "...");
      const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
      if (_engineResult.markdown === undefined) {
        // Some engines emit Markdown directly.
        _engineResult.markdown = await parseMarkdown(_engineResult.html);
      }
      const engineResult = _engineResult as EngineScrapeResult & {
        markdown: string;
      };

      // Success factors
      const isLongEnough = engineResult.markdown.length > 0;
      const isGoodStatusCode =
        (engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
        engineResult.statusCode === 304;
      const hasNoPageError = engineResult.error === undefined;

      results[engine] = {
        state: "success",
        result: engineResult,
        factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        unsupportedFeatures,
        startedAt,
        finishedAt: Date.now(),
      };

      // NOTE: TODO: what to do when status code is bad is tough...
      // we cannot just rely on text because error messages can be brief and not hit the limit
      // should we just use all the fallbacks and pick the one with the longest text? - mogery
      if (isLongEnough || !isGoodStatusCode) {
        meta.logger.info("Scrape via " + engine + " deemed successful.", {
          factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        });
        result = {
          engine,
          unsupportedFeatures,
          result: engineResult as EngineScrapeResult & { markdown: string },
        };
        break;
      }
    } catch (error) {
      if (error instanceof EngineError) {
        // Expected engine failure: record it and move on to the next engine.
        meta.logger.info("Engine " + engine + " could not scrape the page.", {
          error,
        });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: false,
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof TimeoutError) {
        meta.logger.info("Engine " + engine + " timed out while scraping.", {
          error,
        });
        results[engine] = {
          state: "timeout",
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (
        error instanceof AddFeatureError ||
        error instanceof RemoveFeatureError
      ) {
        throw error;
      } else if (error instanceof LLMRefusalError) {
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
        // NOTE(review): if `error.results` was unset before this point, the
        // tracker above holds `error` itself, so this assignment makes
        // `results` and `error` reference each other — confirm this is intended.
        error.results = results;
        meta.logger.warn("LLM refusal encountered", { error });
        throw error;
      } else if (error instanceof SiteError) {
        throw error;
      } else if (error instanceof ActionError) {
        throw error;
      } else if (error instanceof UnsupportedFileError) {
        throw error;
      } else if (error instanceof PDFAntibotError) {
        throw error;
      } else if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        // Unknown failure: report it, record it, and try the next engine.
        Sentry.captureException(error);
        meta.logger.warn(
          "An unexpected error happened while scraping with " + engine + ".",
          { error },
        );
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
      }
    }
  }

  if (result === null) {
    throw new NoEnginesLeftError(
      fallbackList.map((x) => x.engine),
      results,
    );
  }

  let document: Document = {
    markdown: result.result.markdown,
    rawHtml: result.result.html,
    screenshot: result.result.screenshot,
    actions: result.result.actions,
    metadata: {
      sourceURL: meta.url,
      url: result.result.url,
      statusCode: result.result.statusCode,
      error: result.result.error,
    },
  };

  if (result.unsupportedFeatures.size > 0) {
    const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
    meta.logger.warn(warning, {
      engine: result.engine,
      unsupportedFeatures: result.unsupportedFeatures,
    });
    document.warning =
      document.warning !== undefined
        ? document.warning + " " + warning
        : warning;
  }

  document = await executeTransformers(meta, document);

  return {
    success: true,
    document,
    logs: meta.logs,
    engines: results,
  };
}

/**
 * Scrapes `url`, retrying the engine loop whenever an engine asks for feature
 * flags to be added or removed (or a PDF needs to be prefetched), and converts
 * failures into a `success: false` response.
 *
 * Feature-flag and PDF-antibot retries only happen when
 * `internalOptions.forceEngine` is `undefined`; otherwise those errors are
 * treated like any other failure.
 *
 * @param id - Scrape ID.
 * @param url - URL to scrape.
 * @param options - Scrape options.
 * @param internalOptions - Internal options.
 * @returns The scrape response; failures are reported via `success: false`.
 * @throws TimeoutSignal is rethrown rather than wrapped in a response.
 *   Errors raised while building the meta object also propagate, since that
 *   step runs outside the `try` block.
 */
export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Promise<ScrapeUrlResponse> {
  const meta = await buildMetaObject(id, url, options, internalOptions);
  try {
    while (true) {
      try {
        return await scrapeURLLoop(meta);
      } catch (error) {
        if (
          error instanceof AddFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          meta.logger.debug(
            "More feature flags requested by scraper: adding " +
              error.featureFlags.join(", "),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].concat(error.featureFlags),
          );
          if (error.pdfPrefetch) {
            meta.pdfPrefetch = error.pdfPrefetch;
          }
        } else if (
          error instanceof RemoveFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          meta.logger.debug(
            "Incorrect feature flags reported by scraper: removing " +
              error.featureFlags.join(","),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].filter(
              (x) => !error.featureFlags.includes(x),
            ),
          );
        } else if (
          error instanceof PDFAntibotError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          if (meta.pdfPrefetch !== undefined) {
            // A prefetch was already attempted; do not retry again.
            meta.logger.error(
              "PDF was prefetched and still blocked by antibot, failing",
            );
            throw error;
          } else {
            // Drop the "pdf" flag so the next pass rebuilds the fallback list
            // without it.
            meta.logger.debug(
              "PDF was blocked by anti-bot, prefetching with chrome-cdp",
            );
            meta.featureFlags = new Set(
              [...meta.featureFlags].filter((x) => x !== "pdf"),
            );
          }
        } else {
          throw error;
        }
      }
    }
  } catch (error) {
    let results: EngineResultsTracker = {};

    if (error instanceof NoEnginesLeftError) {
      meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
      results = error.results;
    } else if (error instanceof LLMRefusalError) {
      meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
      results = error.results!;
    } else if (
      error instanceof Error &&
      error.message.includes("Invalid schema for response_format")
    ) {
      // TODO: separate into custom error
      meta.logger.warn("scrapeURL: LLM schema error", { error });
      // TODO: results?
    } else if (error instanceof SiteError) {
      meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
    } else if (error instanceof ActionError) {
      meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
    } else if (error instanceof UnsupportedFileError) {
      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
        error,
      });
    } else if (error instanceof TimeoutSignal) {
      throw error;
    } else {
      Sentry.captureException(error);
      meta.logger.error("scrapeURL: Unexpected error happened", { error });
      // TODO: results?
    }

    return {
      success: false,
      error,
      logs: meta.logs,
      engines: results,
    };
  }
}