Echo-AI-official's picture
Upload 280 files
0e759d2 verified
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
import { logger as _logger } from "../../lib/logger";
import {
buildFallbackList,
Engine,
EngineScrapeResult,
FeatureFlag,
scrapeURLWithEngine,
} from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import {
ActionError,
AddFeatureError,
EngineError,
NoEnginesLeftError,
PDFAntibotError,
RemoveFeatureError,
SiteError,
TimeoutError,
UnsupportedFileError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
import { loadMock, MockState } from "./lib/mock";
export type ScrapeUrlResponse = (
| {
success: true;
document: Document;
}
| {
success: false;
error: any;
}
) & {
logs: any[];
engines: EngineResultsTracker;
};
export type Meta = {
id: string;
url: string;
options: ScrapeOptions;
internalOptions: InternalOptions;
logger: Logger;
logs: any[];
featureFlags: Set<FeatureFlag>;
mock: MockState | null;
pdfPrefetch: {
filePath: string;
url?: string;
status: number;
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
function buildFeatureFlags(
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions,
): Set<FeatureFlag> {
const flags: Set<FeatureFlag> = new Set();
if (options.actions !== undefined) {
flags.add("actions");
}
if (options.formats.includes("screenshot")) {
flags.add("screenshot");
}
if (options.formats.includes("screenshot@fullPage")) {
flags.add("screenshot@fullScreen");
}
if (options.waitFor !== 0) {
flags.add("waitFor");
}
if (internalOptions.atsv) {
flags.add("atsv");
}
if (options.location || options.geolocation) {
flags.add("location");
}
if (options.mobile) {
flags.add("mobile");
}
if (options.skipTlsVerification) {
flags.add("skipTlsVerification");
}
if (options.fastMode) {
flags.add("useFastMode");
}
if (options.proxy === "stealth") {
flags.add("stealthProxy");
}
const urlO = new URL(url);
if (urlO.pathname.endsWith(".pdf")) {
flags.add("pdf");
}
if (urlO.pathname.endsWith(".docx")) {
flags.add("docx");
}
return flags;
}
// The meta object contains all required information to perform a scrape.
// For example, the scrape ID, URL, options, feature flags, logs that occur while scraping.
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
// Having a meta object that is treated as immutable helps the code stay clean and easily tracable,
// while also retaining the benefits that WebScraper had from its OOP design.
async function buildMetaObject(
id: string,
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions,
): Promise<Meta> {
const specParams =
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
if (specParams !== undefined) {
options = Object.assign(options, specParams.scrapeOptions);
internalOptions = Object.assign(
internalOptions,
specParams.internalOptions,
);
}
const logger = _logger.child({
module: "ScrapeURL",
scrapeId: id,
scrapeURL: url,
});
const logs: any[] = [];
return {
id,
url,
options,
internalOptions,
logger,
logs,
featureFlags: buildFeatureFlags(url, options, internalOptions),
mock:
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
pdfPrefetch: undefined,
};
}
export type InternalOptions = {
teamId: string;
priority?: number; // Passed along to fire-engine
forceEngine?: Engine | Engine[];
atsv?: boolean; // anti-bot solver, beta
v0CrawlOnlyUrls?: boolean;
v0DisableJsDom?: boolean;
useCache?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
fromCache?: boolean; // Indicates if the document was retrieved from cache
abort?: AbortSignal;
urlInvisibleInCurrentCrawl?: boolean;
};
export type EngineResultsTracker = {
[E in Engine]?: (
| {
state: "error";
error: any;
unexpected: boolean;
}
| {
state: "success";
result: EngineScrapeResult & { markdown: string };
factors: Record<string, boolean>;
unsupportedFeatures: Set<FeatureFlag>;
}
| {
state: "timeout";
}
) & {
startedAt: number;
finishedAt: number;
};
};
export type EngineScrapeResultWithContext = {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
result: EngineScrapeResult & { markdown: string };
};
function safeguardCircularError<T>(error: T): T {
if (typeof error === "object" && error !== null && (error as any).results) {
const newError = structuredClone(error);
delete (newError as any).results;
return newError;
} else {
return error;
}
}
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
// TODO: handle sitemap data, see WebScraper/index.ts:280
// TODO: ScrapeEvents
const fallbackList = buildFallbackList(meta);
const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null;
const timeToRun =
meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: (!meta.options.actions && !meta.options.jsonOptions && !meta.options.extract)
? Math.round(120000 / Math.min(fallbackList.length, 2))
: undefined;
for (const { engine, unsupportedFeatures } of fallbackList) {
meta.internalOptions.abort?.throwIfAborted();
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html);
}
const engineResult = _engineResult as EngineScrapeResult & {
markdown: string;
};
// Success factors
const isLongEnough = engineResult.markdown.length > 0;
const isGoodStatusCode =
(engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
engineResult.statusCode === 304;
const hasNoPageError = engineResult.error === undefined;
results[engine] = {
state: "success",
result: engineResult,
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
unsupportedFeatures,
startedAt,
finishedAt: Date.now(),
};
// NOTE: TODO: what to do when status code is bad is tough...
// we cannot just rely on text because error messages can be brief and not hit the limit
// should we just use all the fallbacks and pick the one with the longest text? - mogery
if (isLongEnough || !isGoodStatusCode) {
meta.logger.info("Scrape via " + engine + " deemed successful.", {
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
});
result = {
engine,
unsupportedFeatures,
result: engineResult as EngineScrapeResult & { markdown: string },
};
break;
}
} catch (error) {
if (error instanceof EngineError) {
meta.logger.info("Engine " + engine + " could not scrape the page.", {
error,
});
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: false,
startedAt,
finishedAt: Date.now(),
};
} else if (error instanceof TimeoutError) {
meta.logger.info("Engine " + engine + " timed out while scraping.", {
error,
});
results[engine] = {
state: "timeout",
startedAt,
finishedAt: Date.now(),
};
} else if (
error instanceof AddFeatureError ||
error instanceof RemoveFeatureError
) {
throw error;
} else if (error instanceof LLMRefusalError) {
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now(),
};
error.results = results;
meta.logger.warn("LLM refusal encountered", { error });
throw error;
} else if (error instanceof SiteError) {
throw error;
} else if (error instanceof ActionError) {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
} else if (error instanceof PDFAntibotError) {
throw error;
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.warn(
"An unexpected error happened while scraping with " + engine + ".",
{ error },
);
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now(),
};
}
}
}
if (result === null) {
throw new NoEnginesLeftError(
fallbackList.map((x) => x.engine),
results,
);
}
let document: Document = {
markdown: result.result.markdown,
rawHtml: result.result.html,
screenshot: result.result.screenshot,
actions: result.result.actions,
metadata: {
sourceURL: meta.url,
url: result.result.url,
statusCode: result.result.statusCode,
error: result.result.error,
},
};
if (result.unsupportedFeatures.size > 0) {
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
meta.logger.warn(warning, {
engine: result.engine,
unsupportedFeatures: result.unsupportedFeatures,
});
document.warning =
document.warning !== undefined
? document.warning + " " + warning
: warning;
}
document = await executeTransformers(meta, document);
return {
success: true,
document,
logs: meta.logs,
engines: results,
};
}
export async function scrapeURL(
id: string,
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions,
): Promise<ScrapeUrlResponse> {
const meta = await buildMetaObject(id, url, options, internalOptions);
try {
while (true) {
try {
return await scrapeURLLoop(meta);
} catch (error) {
if (
error instanceof AddFeatureError &&
meta.internalOptions.forceEngine === undefined
) {
meta.logger.debug(
"More feature flags requested by scraper: adding " +
error.featureFlags.join(", "),
{ error, existingFlags: meta.featureFlags },
);
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags),
);
if (error.pdfPrefetch) {
meta.pdfPrefetch = error.pdfPrefetch;
}
} else if (
error instanceof RemoveFeatureError &&
meta.internalOptions.forceEngine === undefined
) {
meta.logger.debug(
"Incorrect feature flags reported by scraper: removing " +
error.featureFlags.join(","),
{ error, existingFlags: meta.featureFlags },
);
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => !error.featureFlags.includes(x),
),
);
} else if (
error instanceof PDFAntibotError &&
meta.internalOptions.forceEngine === undefined
) {
if (meta.pdfPrefetch !== undefined) {
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
throw error;
} else {
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => x !== "pdf",
),
);
}
} else {
throw error;
}
}
}
} catch (error) {
let results: EngineResultsTracker = {};
if (error instanceof NoEnginesLeftError) {
meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
results = error.results;
} else if (error instanceof LLMRefusalError) {
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
results = error.results!;
} else if (
error instanceof Error &&
error.message.includes("Invalid schema for response_format")
) {
// TODO: seperate into custom error
meta.logger.warn("scrapeURL: LLM schema error", { error });
// TODO: results?
} else if (error instanceof SiteError) {
meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
} else if (error instanceof ActionError) {
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
} else if (error instanceof UnsupportedFileError) {
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
error,
});
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });
// TODO: results?
}
return {
success: false,
error,
logs: meta.logs,
engines: results,
};
}
}