Spaces:
Paused
Paused
| import { ScrapeActionContent } from "../../../lib/entities"; | |
| import { Meta } from ".."; | |
| import { scrapeDOCX } from "./docx"; | |
| import { | |
| scrapeURLWithFireEngineChromeCDP, | |
| scrapeURLWithFireEnginePlaywright, | |
| scrapeURLWithFireEngineTLSClient, | |
| } from "./fire-engine"; | |
| import { scrapePDF } from "./pdf"; | |
| import { scrapeURLWithScrapingBee } from "./scrapingbee"; | |
| import { scrapeURLWithFetch } from "./fetch"; | |
| import { scrapeURLWithPlaywright } from "./playwright"; | |
| import { scrapeCache } from "./cache"; | |
| export type Engine = | |
| | "fire-engine;chrome-cdp" | |
| | "fire-engine;playwright" | |
| | "fire-engine;tlsclient" | |
| | "scrapingbee" | |
| | "scrapingbeeLoad" | |
| | "playwright" | |
| | "fetch" | |
| | "pdf" | |
| | "docx" | |
| | "cache"; | |
| const useScrapingBee = | |
| process.env.SCRAPING_BEE_API_KEY !== "" && | |
| process.env.SCRAPING_BEE_API_KEY !== undefined; | |
| const useFireEngine = | |
| process.env.FIRE_ENGINE_BETA_URL !== "" && | |
| process.env.FIRE_ENGINE_BETA_URL !== undefined; | |
| const usePlaywright = | |
| process.env.PLAYWRIGHT_MICROSERVICE_URL !== "" && | |
| process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined; | |
| const useCache = | |
| process.env.CACHE_REDIS_URL !== "" && | |
| process.env.CACHE_REDIS_URL !== undefined; | |
| export const engines: Engine[] = [ | |
| ...(useCache ? ["cache" as const] : []), | |
| ...(useFireEngine | |
| ? [ | |
| "fire-engine;chrome-cdp" as const, | |
| "fire-engine;playwright" as const, | |
| "fire-engine;tlsclient" as const, | |
| ] | |
| : []), | |
| ...(useScrapingBee | |
| ? ["scrapingbee" as const, "scrapingbeeLoad" as const] | |
| : []), | |
| ...(usePlaywright ? ["playwright" as const] : []), | |
| "fetch", | |
| "pdf", | |
| "docx", | |
| ]; | |
| export const featureFlags = [ | |
| "actions", | |
| "waitFor", | |
| "screenshot", | |
| "screenshot@fullScreen", | |
| "pdf", | |
| "docx", | |
| "atsv", | |
| "location", | |
| "mobile", | |
| "skipTlsVerification", | |
| "useFastMode", | |
| "stealthProxy", | |
| ] as const; | |
| export type FeatureFlag = (typeof featureFlags)[number]; | |
| export const featureFlagOptions: { | |
| [F in FeatureFlag]: { | |
| priority: number; | |
| }; | |
| } = { | |
| actions: { priority: 20 }, | |
| waitFor: { priority: 1 }, | |
| screenshot: { priority: 10 }, | |
| "screenshot@fullScreen": { priority: 10 }, | |
| pdf: { priority: 100 }, | |
| docx: { priority: 100 }, | |
| atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not | |
| useFastMode: { priority: 90 }, | |
| location: { priority: 10 }, | |
| mobile: { priority: 10 }, | |
| skipTlsVerification: { priority: 10 }, | |
| stealthProxy: { priority: 20 }, | |
| } as const; | |
| export type EngineScrapeResult = { | |
| url: string; | |
| html: string; | |
| markdown?: string; | |
| statusCode: number; | |
| error?: string; | |
| screenshot?: string; | |
| actions?: { | |
| screenshots: string[]; | |
| scrapes: ScrapeActionContent[]; | |
| javascriptReturns: { | |
| type: string; | |
| value: unknown | |
| }[]; | |
| }; | |
| }; | |
| const engineHandlers: { | |
| [E in Engine]: ( | |
| meta: Meta, | |
| timeToRun: number | undefined, | |
| ) => Promise<EngineScrapeResult>; | |
| } = { | |
| cache: scrapeCache, | |
| "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, | |
| "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, | |
| "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, | |
| scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"), | |
| scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"), | |
| playwright: scrapeURLWithPlaywright, | |
| fetch: scrapeURLWithFetch, | |
| pdf: scrapePDF, | |
| docx: scrapeDOCX, | |
| }; | |
| export const engineOptions: { | |
| [E in Engine]: { | |
| // A list of feature flags the engine supports. | |
| features: { [F in FeatureFlag]: boolean }; | |
| // This defines the order of engines in general. The engine with the highest quality will be used the most. | |
| // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX | |
| quality: number; | |
| }; | |
| } = { | |
| cache: { | |
| features: { | |
| actions: false, | |
| waitFor: true, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: false, // TODO: figure this out | |
| docx: false, // TODO: figure this out | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: false, | |
| stealthProxy: false, | |
| }, | |
| quality: 1000, // cache should always be tried first | |
| }, | |
| "fire-engine;chrome-cdp": { | |
| features: { | |
| actions: true, | |
| waitFor: true, // through actions transform | |
| screenshot: true, // through actions transform | |
| "screenshot@fullScreen": true, // through actions transform | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: true, | |
| mobile: true, | |
| skipTlsVerification: true, | |
| useFastMode: false, | |
| stealthProxy: true, | |
| }, | |
| quality: 50, | |
| }, | |
| "fire-engine;playwright": { | |
| features: { | |
| actions: false, | |
| waitFor: true, | |
| screenshot: true, | |
| "screenshot@fullScreen": true, | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: false, | |
| stealthProxy: true, | |
| }, | |
| quality: 40, | |
| }, | |
| scrapingbee: { | |
| features: { | |
| actions: false, | |
| waitFor: true, | |
| screenshot: true, | |
| "screenshot@fullScreen": true, | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: false, | |
| stealthProxy: false, | |
| }, | |
| quality: 30, | |
| }, | |
| scrapingbeeLoad: { | |
| features: { | |
| actions: false, | |
| waitFor: true, | |
| screenshot: true, | |
| "screenshot@fullScreen": true, | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: false, | |
| stealthProxy: false, | |
| }, | |
| quality: 29, | |
| }, | |
| playwright: { | |
| features: { | |
| actions: false, | |
| waitFor: true, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: false, | |
| stealthProxy: false, | |
| }, | |
| quality: 20, | |
| }, | |
| "fire-engine;tlsclient": { | |
| features: { | |
| actions: false, | |
| waitFor: false, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: false, | |
| docx: false, | |
| atsv: true, | |
| location: true, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: true, | |
| stealthProxy: true, | |
| }, | |
| quality: 10, | |
| }, | |
| fetch: { | |
| features: { | |
| actions: false, | |
| waitFor: false, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: false, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: true, | |
| stealthProxy: false, | |
| }, | |
| quality: 5, | |
| }, | |
| pdf: { | |
| features: { | |
| actions: false, | |
| waitFor: false, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: true, | |
| docx: false, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: true, | |
| stealthProxy: true, // kinda... | |
| }, | |
| quality: -10, | |
| }, | |
| docx: { | |
| features: { | |
| actions: false, | |
| waitFor: false, | |
| screenshot: false, | |
| "screenshot@fullScreen": false, | |
| pdf: false, | |
| docx: true, | |
| atsv: false, | |
| location: false, | |
| mobile: false, | |
| skipTlsVerification: false, | |
| useFastMode: true, | |
| stealthProxy: true, // kinda... | |
| }, | |
| quality: -10, | |
| }, | |
| }; | |
| export function buildFallbackList(meta: Meta): { | |
| engine: Engine; | |
| unsupportedFeatures: Set<FeatureFlag>; | |
| }[] { | |
| const _engines: Engine[] = [ | |
| ...engines, | |
| // enable fire-engine in self-hosted testing environment when mocks are supplied | |
| ...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : []) | |
| ]; | |
| if (meta.internalOptions.useCache !== true) { | |
| const cacheIndex = _engines.indexOf("cache"); | |
| if (cacheIndex !== -1) { | |
| _engines.splice(cacheIndex, 1); | |
| } | |
| } else { | |
| meta.logger.debug("Cache engine enabled by useCache option"); | |
| } | |
| const prioritySum = [...meta.featureFlags].reduce( | |
| (a, x) => a + featureFlagOptions[x].priority, | |
| 0, | |
| ); | |
| const priorityThreshold = Math.floor(prioritySum / 2); | |
| let selectedEngines: { | |
| engine: Engine; | |
| supportScore: number; | |
| unsupportedFeatures: Set<FeatureFlag>; | |
| }[] = []; | |
| const currentEngines = | |
| meta.internalOptions.forceEngine !== undefined | |
| ? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine]) | |
| : _engines; | |
| for (const engine of currentEngines) { | |
| const supportedFlags = new Set([ | |
| ...Object.entries(engineOptions[engine].features) | |
| .filter( | |
| ([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true, | |
| ) | |
| .map(([k, _]) => k), | |
| ]); | |
| const supportScore = [...supportedFlags].reduce( | |
| (a, x) => a + featureFlagOptions[x].priority, | |
| 0, | |
| ); | |
| const unsupportedFeatures = new Set([...meta.featureFlags]); | |
| for (const flag of meta.featureFlags) { | |
| if (supportedFlags.has(flag)) { | |
| unsupportedFeatures.delete(flag); | |
| } | |
| } | |
| if (supportScore >= priorityThreshold) { | |
| selectedEngines.push({ engine, supportScore, unsupportedFeatures }); | |
| meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { | |
| supportScore, | |
| prioritySum, | |
| priorityThreshold, | |
| featureFlags: [...meta.featureFlags], | |
| unsupportedFeatures, | |
| }); | |
| } else { | |
| meta.logger.debug( | |
| `Engine ${engine} does not meet feature priority threshold`, | |
| { | |
| supportScore, | |
| prioritySum, | |
| priorityThreshold, | |
| featureFlags: [...meta.featureFlags], | |
| unsupportedFeatures, | |
| }, | |
| ); | |
| } | |
| } | |
| if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) { | |
| selectedEngines = selectedEngines.filter( | |
| (x) => engineOptions[x.engine].quality > 0, | |
| ); | |
| } | |
| if (meta.internalOptions.forceEngine === undefined) { // retain force engine order | |
| selectedEngines.sort( | |
| (a, b) => | |
| b.supportScore - a.supportScore || | |
| engineOptions[b.engine].quality - engineOptions[a.engine].quality, | |
| ); | |
| } | |
| return selectedEngines; | |
| } | |
| export async function scrapeURLWithEngine( | |
| meta: Meta, | |
| engine: Engine, | |
| timeToRun: number | undefined, | |
| ): Promise<EngineScrapeResult> { | |
| const fn = engineHandlers[engine]; | |
| const logger = meta.logger.child({ | |
| method: fn.name ?? "scrapeURLWithEngine", | |
| engine, | |
| }); | |
| const _meta = { | |
| ...meta, | |
| logger, | |
| }; | |
| return await fn(_meta, timeToRun); | |
| } | |