Spaces:
Paused
Paused
| import { Logger } from "winston"; | |
| import * as Sentry from "@sentry/node"; | |
| import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types"; | |
| import { logger as _logger } from "../../lib/logger"; | |
| import { | |
| buildFallbackList, | |
| Engine, | |
| EngineScrapeResult, | |
| FeatureFlag, | |
| scrapeURLWithEngine, | |
| } from "./engines"; | |
| import { parseMarkdown } from "../../lib/html-to-markdown"; | |
| import { | |
| ActionError, | |
| AddFeatureError, | |
| EngineError, | |
| NoEnginesLeftError, | |
| PDFAntibotError, | |
| RemoveFeatureError, | |
| SiteError, | |
| TimeoutError, | |
| UnsupportedFileError, | |
| } from "./error"; | |
| import { executeTransformers } from "./transformers"; | |
| import { LLMRefusalError } from "./transformers/llmExtract"; | |
| import { urlSpecificParams } from "./lib/urlSpecificParams"; | |
| import { loadMock, MockState } from "./lib/mock"; | |
// Outcome of a scrapeURL call: the scraped Document on success, or the causing
// error on failure. Both variants carry the accumulated scrape logs and the
// per-engine attempt tracker for diagnostics.
export type ScrapeUrlResponse = (
  | {
      success: true;
      document: Document;
    }
  | {
      success: false;
      error: any;
    }
) & {
  logs: any[];
  engines: EngineResultsTracker;
};
// All state required to perform a scrape, threaded through the engine and
// transformer layers. Treated as immutable except for `logs`, `featureFlags`,
// and `pdfPrefetch`, which the retry loop in scrapeURL mutates.
export type Meta = {
  id: string; // scrape ID, used for log correlation
  url: string;
  options: ScrapeOptions;
  internalOptions: InternalOptions;
  logger: Logger; // child logger pre-tagged with scrape ID and URL
  logs: any[]; // log entries accumulated while scraping
  featureFlags: Set<FeatureFlag>; // required engine features; input to buildFallbackList
  mock: MockState | null; // non-null when replaying recorded mock data
  pdfPrefetch: {
    filePath: string;
    url?: string;
    status: number;
  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
| function buildFeatureFlags( | |
| url: string, | |
| options: ScrapeOptions, | |
| internalOptions: InternalOptions, | |
| ): Set<FeatureFlag> { | |
| const flags: Set<FeatureFlag> = new Set(); | |
| if (options.actions !== undefined) { | |
| flags.add("actions"); | |
| } | |
| if (options.formats.includes("screenshot")) { | |
| flags.add("screenshot"); | |
| } | |
| if (options.formats.includes("screenshot@fullPage")) { | |
| flags.add("screenshot@fullScreen"); | |
| } | |
| if (options.waitFor !== 0) { | |
| flags.add("waitFor"); | |
| } | |
| if (internalOptions.atsv) { | |
| flags.add("atsv"); | |
| } | |
| if (options.location || options.geolocation) { | |
| flags.add("location"); | |
| } | |
| if (options.mobile) { | |
| flags.add("mobile"); | |
| } | |
| if (options.skipTlsVerification) { | |
| flags.add("skipTlsVerification"); | |
| } | |
| if (options.fastMode) { | |
| flags.add("useFastMode"); | |
| } | |
| if (options.proxy === "stealth") { | |
| flags.add("stealthProxy"); | |
| } | |
| const urlO = new URL(url); | |
| if (urlO.pathname.endsWith(".pdf")) { | |
| flags.add("pdf"); | |
| } | |
| if (urlO.pathname.endsWith(".docx")) { | |
| flags.add("docx"); | |
| } | |
| return flags; | |
| } | |
| // The meta object contains all required information to perform a scrape. | |
| // For example, the scrape ID, URL, options, feature flags, logs that occur while scraping. | |
| // The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required) | |
| // Having a meta object that is treated as immutable helps the code stay clean and easily tracable, | |
| // while also retaining the benefits that WebScraper had from its OOP design. | |
| async function buildMetaObject( | |
| id: string, | |
| url: string, | |
| options: ScrapeOptions, | |
| internalOptions: InternalOptions, | |
| ): Promise<Meta> { | |
| const specParams = | |
| urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; | |
| if (specParams !== undefined) { | |
| options = Object.assign(options, specParams.scrapeOptions); | |
| internalOptions = Object.assign( | |
| internalOptions, | |
| specParams.internalOptions, | |
| ); | |
| } | |
| const logger = _logger.child({ | |
| module: "ScrapeURL", | |
| scrapeId: id, | |
| scrapeURL: url, | |
| }); | |
| const logs: any[] = []; | |
| return { | |
| id, | |
| url, | |
| options, | |
| internalOptions, | |
| logger, | |
| logs, | |
| featureFlags: buildFeatureFlags(url, options, internalOptions), | |
| mock: | |
| options.useMock !== undefined | |
| ? await loadMock(options.useMock, _logger) | |
| : null, | |
| pdfPrefetch: undefined, | |
| }; | |
| } | |
// Options used by internal callers of scrapeURL that are not part of the
// public scrape API surface.
export type InternalOptions = {
  teamId: string; // team performing the scrape
  priority?: number; // Passed along to fire-engine
  forceEngine?: Engine | Engine[]; // bypass fallback selection; also disables feature-flag retry handling in scrapeURL
  atsv?: boolean; // anti-bot solver, beta
  v0CrawlOnlyUrls?: boolean;
  v0DisableJsDom?: boolean;
  useCache?: boolean;
  disableSmartWaitCache?: boolean; // Passed along to fire-engine
  isBackgroundIndex?: boolean;
  fromCache?: boolean; // Indicates if the document was retrieved from cache
  abort?: AbortSignal; // checked before each engine attempt in scrapeURLLoop
  urlInvisibleInCurrentCrawl?: boolean;
};
// Per-engine record of every attempt made during a scrape: the terminal state
// of each engine tried (error / success / timeout) plus wall-clock timing.
export type EngineResultsTracker = {
  [E in Engine]?: (
    | {
        state: "error";
        error: any;
        unexpected: boolean; // false for expected EngineError failures, true otherwise
      }
    | {
        state: "success";
        result: EngineScrapeResult & { markdown: string };
        factors: Record<string, boolean>; // quality signals (content length, status code, page error)
        unsupportedFeatures: Set<FeatureFlag>; // requested features this engine could not honor
      }
    | {
        state: "timeout";
      }
  ) & {
    startedAt: number; // epoch ms when the engine attempt began
    finishedAt: number; // epoch ms when the attempt ended
  };
};
// A successful engine result together with which engine produced it and which
// requested features that engine could not support.
export type EngineScrapeResultWithContext = {
  engine: Engine;
  unsupportedFeatures: Set<FeatureFlag>;
  result: EngineScrapeResult & { markdown: string };
};
| function safeguardCircularError<T>(error: T): T { | |
| if (typeof error === "object" && error !== null && (error as any).results) { | |
| const newError = structuredClone(error); | |
| delete (newError as any).results; | |
| return newError; | |
| } else { | |
| return error; | |
| } | |
| } | |
/**
 * Performs a single scrape attempt: walks the engine fallback list in order,
 * records each engine's outcome in an EngineResultsTracker, and returns the
 * first acceptable result after running it through the transformer pipeline.
 *
 * Feature-flag correction errors (AddFeatureError / RemoveFeatureError) and
 * fatal errors (SiteError, ActionError, UnsupportedFileError, PDFAntibotError,
 * TimeoutSignal, LLMRefusalError) are re-thrown for scrapeURL to handle.
 */
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);

  // TODO: handle sitemap data, see WebScraper/index.ts:280
  // TODO: ScrapeEvents

  const fallbackList = buildFallbackList(meta);

  const results: EngineResultsTracker = {};
  let result: EngineScrapeResultWithContext | null = null;

  // Per-engine time budget: an explicit timeout is split evenly across at most
  // the first two engines. With no explicit timeout, plain scrapes (no actions
  // and no LLM extraction) get a 120s default split the same way; otherwise
  // the engine runs with no time limit (timeToRun === undefined).
  const timeToRun =
    meta.options.timeout !== undefined
      ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
      : (!meta.options.actions && !meta.options.jsonOptions && !meta.options.extract)
        ? Math.round(120000 / Math.min(fallbackList.length, 2))
        : undefined;

  for (const { engine, unsupportedFeatures } of fallbackList) {
    // Bail out promptly if the caller aborted the scrape.
    meta.internalOptions.abort?.throwIfAborted();
    const startedAt = Date.now();
    try {
      meta.logger.info("Scraping via " + engine + "...");
      const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
      if (_engineResult.markdown === undefined) {
        // This engine did not emit Markdown directly — derive it from the HTML.
        _engineResult.markdown = await parseMarkdown(_engineResult.html);
      }
      const engineResult = _engineResult as EngineScrapeResult & {
        markdown: string;
      };

      // Success factors
      const isLongEnough = engineResult.markdown.length > 0;
      const isGoodStatusCode =
        (engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
        engineResult.statusCode === 304;
      const hasNoPageError = engineResult.error === undefined;

      results[engine] = {
        state: "success",
        result: engineResult,
        factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        unsupportedFeatures,
        startedAt,
        finishedAt: Date.now(),
      };

      // NOTE: TODO: what to do when status code is bad is tough...
      // we cannot just rely on text because error messages can be brief and not hit the limit
      // should we just use all the fallbacks and pick the one with the longest text? - mogery
      // Accept the result when it has content, or when the status code itself
      // signals a page-level error (another engine won't fix e.g. a 404).
      if (isLongEnough || !isGoodStatusCode) {
        meta.logger.info("Scrape via " + engine + " deemed successful.", {
          factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        });
        result = {
          engine,
          unsupportedFeatures,
          result: engineResult as EngineScrapeResult & { markdown: string },
        };
        break;
      }
    } catch (error) {
      if (error instanceof EngineError) {
        // Expected engine-level failure: record it and fall through to the
        // next engine in the fallback list.
        meta.logger.info("Engine " + engine + " could not scrape the page.", {
          error,
        });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: false,
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof TimeoutError) {
        // Engine exceeded its time budget: record and try the next engine.
        meta.logger.info("Engine " + engine + " timed out while scraping.", {
          error,
        });
        results[engine] = {
          state: "timeout",
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (
        error instanceof AddFeatureError ||
        error instanceof RemoveFeatureError
      ) {
        // Feature-flag corrections are handled by the retry loop in scrapeURL.
        throw error;
      } else if (error instanceof LLMRefusalError) {
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
        // Attach the tracker so the outer handler can include it in the response.
        error.results = results;
        meta.logger.warn("LLM refusal encountered", { error });
        throw error;
      } else if (error instanceof SiteError) {
        throw error;
      } else if (error instanceof ActionError) {
        throw error;
      } else if (error instanceof UnsupportedFileError) {
        throw error;
      } else if (error instanceof PDFAntibotError) {
        throw error;
      } else if (error instanceof TimeoutSignal) {
        throw error;
      } else {
        // Truly unexpected failure: report it, record it, and keep trying the
        // remaining engines.
        Sentry.captureException(error);
        meta.logger.warn(
          "An unexpected error happened while scraping with " + engine + ".",
          { error },
        );
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
      }
    }
  }

  if (result === null) {
    // Every engine failed, timed out, or produced an unacceptable result.
    throw new NoEnginesLeftError(
      fallbackList.map((x) => x.engine),
      results,
    );
  }

  // Assemble the public Document from the winning engine's result.
  let document: Document = {
    markdown: result.result.markdown,
    rawHtml: result.result.html,
    screenshot: result.result.screenshot,
    actions: result.result.actions,
    metadata: {
      sourceURL: meta.url,
      url: result.result.url,
      statusCode: result.result.statusCode,
      error: result.result.error,
    },
  };

  if (result.unsupportedFeatures.size > 0) {
    // Surface a partial-scrape warning to the caller; append to any existing
    // warning rather than overwriting it.
    const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
    meta.logger.warn(warning, {
      engine: result.engine,
      unsupportedFeatures: result.unsupportedFeatures,
    });
    document.warning =
      document.warning !== undefined
        ? document.warning + " " + warning
        : warning;
  }

  document = await executeTransformers(meta, document);

  return {
    success: true,
    document,
    logs: meta.logs,
    engines: results,
  };
}
/**
 * Scrapes a single URL and resolves with a ScrapeUrlResponse on both success
 * and failure — the only error re-thrown to callers is TimeoutSignal.
 *
 * Wraps scrapeURLLoop in a retry loop: when a scrape attempt reports that the
 * feature-flag set was wrong (AddFeatureError, RemoveFeatureError, or
 * PDFAntibotError) and no engine is forced, the flags on the meta object are
 * adjusted and the loop runs again.
 */
export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions,
): Promise<ScrapeUrlResponse> {
  const meta = await buildMetaObject(id, url, options, internalOptions);
  try {
    while (true) {
      try {
        return await scrapeURLLoop(meta);
      } catch (error) {
        if (
          error instanceof AddFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // A scraper found the page needs more features than requested
          // (e.g. it turned out to be a PDF): widen the flags and retry.
          meta.logger.debug(
            "More feature flags requested by scraper: adding " +
              error.featureFlags.join(", "),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].concat(error.featureFlags),
          );
          if (error.pdfPrefetch) {
            meta.pdfPrefetch = error.pdfPrefetch;
          }
        } else if (
          error instanceof RemoveFeatureError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // The opposite case: flags were set that the page does not actually
          // need — drop them and retry.
          meta.logger.debug(
            "Incorrect feature flags reported by scraper: removing " +
              error.featureFlags.join(","),
            { error, existingFlags: meta.featureFlags },
          );
          meta.featureFlags = new Set(
            [...meta.featureFlags].filter(
              (x) => !error.featureFlags.includes(x),
            ),
          );
        } else if (
          error instanceof PDFAntibotError &&
          meta.internalOptions.forceEngine === undefined
        ) {
          // PDF blocked by anti-bot. pdfPrefetch !== undefined means a
          // prefetch was already attempted (null = came back empty), so a
          // retry cannot help. Otherwise drop the "pdf" flag and retry.
          // NOTE(review): the log implies the chrome-cdp prefetch happens
          // downstream once "pdf" is removed — confirm in the engine layer.
          if (meta.pdfPrefetch !== undefined) {
            meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
            throw error;
          } else {
            meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
            meta.featureFlags = new Set(
              [...meta.featureFlags].filter(
                (x) => x !== "pdf",
              ),
            );
          }
        } else {
          throw error;
        }
      }
    }
  } catch (error) {
    // Map known failure modes to a structured failure response; anything
    // unrecognized is reported to Sentry. Only TimeoutSignal propagates.
    let results: EngineResultsTracker = {};

    if (error instanceof NoEnginesLeftError) {
      meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
      results = error.results;
    } else if (error instanceof LLMRefusalError) {
      meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
      results = error.results!; // set by scrapeURLLoop before rethrowing
    } else if (
      error instanceof Error &&
      error.message.includes("Invalid schema for response_format")
    ) {
      // TODO: seperate into custom error
      meta.logger.warn("scrapeURL: LLM schema error", { error });
      // TODO: results?
    } else if (error instanceof SiteError) {
      meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
    } else if (error instanceof ActionError) {
      meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
    } else if (error instanceof UnsupportedFileError) {
      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
        error,
      });
    } else if (error instanceof TimeoutSignal) {
      throw error;
    } else {
      Sentry.captureException(error);
      meta.logger.error("scrapeURL: Unexpected error happened", { error });
      // TODO: results?
    }

    return {
      success: false,
      error,
      logs: meta.logs,
      engines: results,
    };
  }
}