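// NOTE: the following header is residue from the hosting site's file viewer, not part of the program.
// Last commit: "fix: some invalid uriComponent case" (0e8308e, unverified) by nomagick.
// File size: 38.1 kB.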
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
AssertionFailureError, ParamValidationError, Defer,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
const pNormalizeUrl = import("@esm2cjs/normalize-url");
import { Crawled } from '../db/crawled';
import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { DomainBlockade } from '../db/domain-blockade';
import { DomainProfile } from '../db/domain-profile';
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl';
import { LmControl } from '../services/lm';
import { tryDecodeURIComponent } from '../utils/misc';
/**
 * Scrapping options used by the crawler host, layered on top of the base
 * `ScrappingOptions` understood by the scraping services.
 *
 * In this file, setting any of `targetSelector`, `removeSelector`, `withIframe`
 * or `withShadowDom` causes browser snapshots to be passed through
 * `jsdomControl.narrowSnapshot` before being yielded.
 */
export interface ExtraScrappingOptions extends ScrappingOptions {
/** Iframe handling switch. NOTE(review): the exact meaning of `'quoted'` is defined by the consumer — confirm there. */
withIframe?: boolean | 'quoted';
/** Shadow DOM handling switch; interpreted by the snapshot consumers. */
withShadowDom?: boolean;
/** Selector(s) forwarded from the request's `targetSelector`; also quoted in "No content available" errors. */
targetSelector?: string | string[];
/** Selector(s) forwarded from the request's `removeSelector`. */
removeSelector?: string | string[];
/** NOTE(review): not populated by `configure` in this file (it stores the flag in the async context instead) — confirm who reads it. */
keepImgDataUrl?: boolean;
/** Requested engine; compared against `ENGINE_TYPE` values when choosing between the direct and browser paths. */
engine?: string;
}
/**
 * Prototype shared by index objects so they can be rendered as plain text.
 *
 * `toString` walks the object's own enumerable entries and prints one
 * `[Label] value` line per entry, where the label is the key converted to
 * lower-case words with the first letter capitalised. An entry with an empty
 * key renders as a blank separator line. The output always ends with a newline.
 */
const indexProto = {
    toString(): string {
        const lines = _.toPairs(this).map(([key, value]) => {
            if (!key) {
                return '';
            }

            return `[${_.upperFirst(_.lowerCase(key))}] ${value}`;
        });

        return `${lines.join('\n')}\n`;
    }
};
@singleton()
export class CrawlerHost extends RPCHost {
// Child logger tagged with this class's name.
logger = this.globalLogger.child({ service: this.constructor.name });
// How long a cached snapshot record is kept before it expires (7 days); used for `expireAt` in setToCache.
cacheRetentionMs = 1000 * 3600 * 24 * 7;
// Default cache tolerance (1 hour) applied in cachedScrap when the request does not specify one.
cacheValidMs = 1000 * 3600;
// Validity window (4 hours) passed when signing screenshot/pageshot download URLs and when formatting snapshots.
urlValidMs = 1000 * 3600 * 4;
// Duration (1 hour) of a domain blockade created after an 'abuse' event.
abuseBlockMs = 1000 * 3600;
// Lifetime (30 days) of a domain profile record written by exploreDirectEngine.
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
/**
 * Receives the injected services and subscribes to two events emitted by
 * the puppeteer controller:
 *
 * - `'crawled'`: stores eligible snapshots in the cache and then updates the
 *   domain profile via `exploreDirectEngine`.
 * - `'abuse'`: records a temporary `DomainBlockade` for the offending host.
 *
 * Both listeners are async. Every awaited operation inside them is guarded so
 * that a failure is logged rather than surfacing as an unhandled rejection.
 */
constructor(
    protected globalLogger: Logger,
    protected puppeteerControl: PuppeteerControl,
    protected curlControl: CurlControl,
    protected lmControl: LmControl,
    protected jsdomControl: JSDomControl,
    protected snapshotFormatter: SnapshotFormatter,
    protected firebaseObjectStorage: FirebaseStorageBucketControl,
    protected rateLimitControl: RateLimitControl,
    protected threadLocal: AsyncContext,
    protected fbHealthCheck: FirebaseRoundTripChecker,
) {
    super(...arguments);

    puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => {
        // Nothing worth caching: no title and no PDFs.
        if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
            return;
        }
        if (options.cookies?.length) {
            // Potential privacy issue, don't cache if cookies are used
            return;
        }
        if (options.injectFrameScripts?.length || options.injectPageScripts?.length || options.viewport) {
            // Potentially mangled content, don't cache if scripts are injected
            return;
        }
        if (options.locale) {
            // Remember the locale so cache lookups can match on it later.
            Reflect.set(snapshot, 'locale', options.locale);
        }

        try {
            await this.setToCache(options.url, snapshot);
        } catch (err: any) {
            // Nobody awaits an event listener; without this guard a storage
            // failure would become an unhandled promise rejection.
            this.logger.warn(`Failed to cache snapshot of ${options.url}`, { err: marshalErrorLike(err) });
            return;
        }

        await this.exploreDirectEngine(snapshot).catch(() => undefined);
    });

    puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
        this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });

        await DomainBlockade.save(DomainBlockade.from({
            domain: abuseEvent.url.hostname.toLowerCase(),
            triggerReason: `${abuseEvent.reason}`,
            triggerUrl: abuseEvent.url.toString(),
            createdAt: new Date(),
            expireAt: new Date(Date.now() + this.abuseBlockMs),
        })).catch((err) => {
            this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
        });
    });
}
/**
 * Waits for `dependencyReady()` to settle, then emits `'ready'`.
 */
override async init() {
await this.dependencyReady();
this.emit('ready');
}
/**
 * Builds the landing "index" object returned when no target URL is given.
 *
 * The object inherits from `indexProto`, so it can be returned as-is or
 * rendered to text via string interpolation.
 *
 * @param user When provided, a separator entry plus the account identity and
 * remaining balance are appended after the static links.
 * @returns The index object.
 */
getIndex(user?: JinaEmbeddingsTokenAccount) {
    const index: Record<string, string | number | undefined> = Object.assign(
        Object.create(indexProto),
        {
            usage1: 'https://r.jina.ai/YOUR_URL',
            usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
            homepage: 'https://jina.ai/reader',
            sourceCode: 'https://github.com/jina-ai/reader',
        }
    );

    if (!user) {
        return index;
    }

    // The empty key is an own property on purpose: it renders as a blank line.
    Object.assign(index, {
        '': undefined,
        authenticatedAs: `${user.user_id} (${user.full_name})`,
        balanceLeft: user.wallet.total_balance,
    });

    return index;
}
/**
 * HTTP entry point of the crawler, registered twice through `CloudHTTPv2`
 * (once under the explicit name `crawl2`).
 *
 * Flow, as implemented below:
 * 1. Resolve the caller (`auth.solveUID()`) and pick the option set: GET requests
 *    use the header-only options, other methods use `crawlerOptionsParamsAllowed`.
 * 2. Resolve the target URL; when there is none, answer with the index
 *    (as an object for JSON-only clients, otherwise as plain text).
 * 3. Enforce balance and rate limits for authenticated callers, or an IP-based
 *    limit for anonymous ones, and register a `finally` hook that records
 *    `chargeAmount` unless the token budget was exceeded.
 * 4. For anonymous callers, reject domains with an unexpired blockade.
 * 5. Produce the response in one of three shapes depending on the `Accept`
 *    header: server-sent events, a formatted object (JSON), or plain text
 *    (optionally a 302 redirect to a screenshot/pageshot URL).
 *
 * @throws InsufficientBalanceError when the authenticated user's balance is not positive.
 * @throws SecurityCompromiseError when an anonymous caller targets a blocked domain.
 * @throws BudgetExceededError when the computed charge exceeds `tokenBudget` (non-SSE paths).
 * @throws AssertionFailureError when no snapshot was produced at all.
 * @throws ParamValidationError when a compound content format is requested over plain text.
 */
@CloudHTTPv2({
name: 'crawl2',
runtime: {
memory: '4GiB',
timeoutSeconds: 300,
concurrency: 22,
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
@CloudHTTPv2({
runtime: {
memory: '4GiB',
cpu: 2,
timeoutSeconds: 300,
concurrency: 10,
maxInstances: 1000,
minInstances: 1,
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
async crawl(
@RPCReflect() rpcReflect: RPCReflection,
@Ctx() ctx: {
req: Request,
res: Response,
},
auth: JinaEmbeddingsAuthDTO,
crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
crawlerOptionsParamsAllowed: CrawlerOptions,
) {
const uid = await auth.solveUID();
// Updated by whichever response path runs; read later by the `finally` hooks below.
let chargeAmount = 0;
const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
// Note req.url in express is actually unparsed `path`, e.g. `/some-path?abc`. Instead of a real url.
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions);
if (!targetUrl) {
// No URL to crawl: respond with the index instead.
const latestUser = uid ? await auth.assertUser() : undefined;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
return this.getIndex(latestUser);
}
return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
{ contentType: 'text/plain', envelope: null }
);
}
// Prevent circular crawling
this.puppeteerControl.circuitBreakerHosts.add(
ctx.req.hostname.toLowerCase()
);
if (uid) {
const user = await auth.assertUser();
if (!(user.wallet.total_balance > 0)) {
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
}
// Use the policy attached to the auth object if any; otherwise fall back to
// 1000 or 200 requests per 60 seconds depending on the user's speed level.
const rateLimitPolicy = auth.getRateLimits(rpcReflect.name.toUpperCase()) || [
parseInt(user.metadata?.speed_level) >= 2 ?
RateLimitDesc.from({
occurrence: 1000,
periodSeconds: 60
}) :
RateLimitDesc.from({
occurrence: 200,
periodSeconds: 60
})
];
const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(
rpcReflect, uid, [rpcReflect.name.toUpperCase()],
...rateLimitPolicy
);
rpcReflect.finally(() => {
// Requests that blew their token budget are not charged.
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
return;
}
if (chargeAmount) {
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
});
apiRoll.chargeAmount = chargeAmount;
}
});
} else if (ctx.req.ip) {
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, [rpcReflect.name.toUpperCase()],
[
// 20 requests per minute
new Date(Date.now() - 60 * 1000), 20
]
);
rpcReflect.finally(() => {
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
return;
}
if (chargeAmount) {
apiRoll._ref?.set({
chargeAmount,
}, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
}
});
}
if (!uid) {
// Anonymous callers may not crawl domains with an unexpired blockade.
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
.where('domain', '==', targetUrl.hostname.toLowerCase())
.where('expireAt', '>=', new Date())
.limit(1)
))[0];
if (blockade) {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
const crawlOpts = await this.configure(crawlerOptions);
// --- Server-sent events: stream every formatted snapshot as it arrives. ---
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
try {
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
if (!scrapped) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
sseStream.write({
event: 'data',
data: formatted,
});
// Stop streaming once a PDF snapshot has produced chargeable content.
if (chargeAmount && scrapped.pdfs?.length) {
break;
}
}
} catch (err: any) {
// Errors are reported in-band as an 'error' event rather than thrown.
this.logger.error(`Failed to crawl ${targetUrl}`, { err: marshalErrorLike(err) });
sseStream.write({
event: 'error',
data: marshalErrorLike(err),
});
}
sseStream.end();
return sseStream;
}
let lastScrapped;
// --- JSON: return the first acceptable snapshot, or the last one seen. ---
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
// Keep waiting while a selector wait is requested, or while the snapshot has neither
// parsed content with a title nor any PDFs.
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
// A PDF snapshot that yields nothing chargeable yet: wait for a later snapshot.
if (scrapped?.pdfs?.length && !chargeAmount) {
continue;
}
return formatted;
}
if (!lastScrapped) {
if (crawlOpts.targetSelector) {
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
}
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
// No early return happened: fall back to the final snapshot.
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
return formatted;
}
// --- Plain text (default). Compound formats cannot be expressed as plain text. ---
if (crawlerOptions.isRequestingCompoundContentFormat()) {
throw new ParamValidationError({
path: 'respondWith',
message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.`
});
}
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
// Screenshot/pageshot requests are answered with a redirect to the image URL when one exists.
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
if (!lastScrapped) {
if (crawlOpts.targetSelector) {
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
}
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
// No early return happened: fall back to the final snapshot.
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
/**
 * Determines which URL a request wants crawled.
 *
 * Sources are consulted in priority order: an uploaded PDF (mapped to a
 * synthetic `file://pdf.<hash>` URL), then the request path with its leading
 * character removed, then `crawlerOptions.url`.
 *
 * @param originPath Request path as received, including its leading character.
 * @param crawlerOptions Options that may carry a `pdf` payload or a `url`.
 * @returns The normalized URL, or `null` when no source provides one.
 * @throws ParamValidationError when the URL cannot be normalized/parsed or
 * uses a protocol other than `http:`, `https:` or `file:`.
 */
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
    const pathCandidate = originPath.slice(1);

    let rawUrl: string;
    if (crawlerOptions.pdf) {
        // A Blob is read into memory; anything else is treated as base64 text.
        const pdfBytes = crawlerOptions.pdf instanceof Blob
            ? Buffer.from(await crawlerOptions.pdf.arrayBuffer())
            : Buffer.from(crawlerOptions.pdf, 'base64');
        rawUrl = `file://pdf.${md5Hasher.hash(pdfBytes)}`;
    } else if (pathCandidate) {
        rawUrl = pathCandidate.trim();
    } else if (crawlerOptions.url) {
        rawUrl = crawlerOptions.url.trim();
    } else {
        return null;
    }

    const { default: normalizeUrl } = await pNormalizeUrl;

    let parsed: URL;
    try {
        const normalized = normalizeUrl(rawUrl, {
            stripWWW: false,
            removeTrailingSlash: false,
            removeSingleSlash: false,
            sortQueryParameters: false,
        });
        parsed = new URL(normalized);
    } catch (err) {
        throw new ParamValidationError({
            message: `${err}`,
            path: 'url'
        });
    }

    const allowedProtocols = ['http:', 'https:', 'file:'];
    if (allowedProtocols.includes(parsed.protocol)) {
        return parsed;
    }

    throw new ParamValidationError({
        message: `Invalid protocol ${parsed.protocol}`,
        path: 'url'
    });
}
/**
 * Computes the cache key for a URL.
 *
 * The fragment is dropped unless it starts with `#/`. The remaining URL is
 * lower-cased and hashed with `md5Hasher`.
 *
 * @param urlToCrawl URL to derive the digest from; it is not modified.
 * @returns The digest produced by `md5Hasher.hash`.
 */
getUrlDigest(urlToCrawl: URL) {
    const copy = new URL(urlToCrawl);
    const keepsFragment = copy.hash.startsWith('#/');
    if (!keepsFragment) {
        copy.hash = '';
    }

    return md5Hasher.hash(copy.toString().toLowerCase());
}
/**
 * Looks up the most recent cached snapshot for a URL.
 *
 * The newest `Crawled` record matching the URL digest is fetched, then its
 * snapshot JSON is downloaded and, where available, download URLs for the
 * screenshot and pageshot are signed. These three operations run concurrently.
 *
 * @param urlToCrawl URL whose cache entry is wanted.
 * @param cacheTolerance Maximum age in milliseconds for the entry to count as fresh.
 * @returns `undefined` when there is no record or when loading any part of it
 * fails. Otherwise the record's fields plus `isFresh` and a `snapshot` whose
 * binary `screenshot`/`pageshot` are cleared in favour of signed URLs.
 */
async queryCache(urlToCrawl: URL, cacheTolerance: number) {
    const digest = this.getUrlDigest(urlToCrawl);

    const lookup = Crawled.COLLECTION
        .where('urlPathDigest', '==', digest)
        .orderBy('createdAt', 'desc')
        .limit(1);
    const cache = (await Crawled.fromFirestoreQuery(lookup))?.[0];
    if (!cache) {
        return undefined;
    }

    const createdAtMs = cache.createdAt.valueOf();
    const age = Date.now() - createdAtMs;
    const stale = createdAtMs < (Date.now() - cacheTolerance);
    this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
        url: urlToCrawl, digest, age, stale, cacheTolerance
    });

    // Start all three operations before waiting on any of them.
    const snapshotTask = this.firebaseObjectStorage
        .downloadFile(`snapshots/${cache._id}`)
        .then((raw): PageSnapshot => JSON.parse(raw.toString('utf-8')));
    const screenshotTask = cache.screenshotAvailable
        ? this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs)
        : Promise.resolve(undefined);
    const pageshotTask = cache.pageshotAvailable
        ? this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs)
        : Promise.resolve(undefined);

    // Swallow cache errors: a broken entry is treated as a cache miss.
    const loaded = await Promise.all([snapshotTask, screenshotTask, pageshotTask]).catch(() => undefined);
    if (!loaded) {
        return undefined;
    }
    const [snapshot, screenshotUrl, pageshotUrl] = loaded;

    return {
        isFresh: !stale,
        ...cache,
        snapshot: {
            ...snapshot,
            screenshot: undefined,
            pageshot: undefined,
            screenshotUrl,
            pageshotUrl,
        } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
    };
}
/**
 * Persists a snapshot to the cache.
 *
 * The snapshot JSON (without binary screenshot/pageshot data) and the optional
 * screenshot and pageshot images are uploaded to object storage. Once every
 * upload has succeeded, a `Crawled` record describing what is available is
 * saved.
 *
 * The JSON upload runs concurrently with the image uploads, and all of them
 * are awaited together. This way a failure in one branch can never leave the
 * other branch's promise rejected without a handler.
 *
 * @param urlToCrawl URL the snapshot was taken for; determines the cache digest.
 * @param snapshot Snapshot to store.
 * @returns The result of `Crawled.save`, or `undefined` when saving the record
 * fails (the failure is logged).
 * @throws Whatever an upload rejects with; in that case no record is saved.
 */
async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
    const digest = this.getUrlDigest(urlToCrawl);

    this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
    const nowDate = new Date();

    const cache = Crawled.from({
        _id: randomUUID(),
        url: urlToCrawl.toString(),
        createdAt: nowDate,
        expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
        urlPathDigest: digest,
    });

    const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
        Buffer.from(
            JSON.stringify({
                ...snapshot,
                screenshot: undefined,
                pageshot: undefined,
            }),
            'utf-8'
        ),
        {
            metadata: {
                contentType: 'application/json',
            }
        }
    ).then((r) => {
        cache.snapshotAvailable = true;
        return r;
    });

    // Images are uploaded one after the other, as before, but as a single
    // promise that can be awaited together with the JSON upload.
    const savingOfShots = (async () => {
        if (snapshot.screenshot) {
            await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            cache.screenshotAvailable = true;
        }
        if (snapshot.pageshot) {
            await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            cache.pageshotAvailable = true;
        }
    })();

    // Promise.all attaches handlers to both promises immediately.
    await Promise.all([savingOfSnapshot, savingOfShots]);

    const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
        this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

        return undefined;
    });

    return r;
}
/**
 * Yields snapshots for a URL, choosing the pipeline from the requested format.
 *
 * When `respondWith` includes the ReaderLM format, the final snapshot is
 * obtained first and handed to the language-model service: with an
 * instruction and/or JSON schema when either is supplied, otherwise for a
 * Markdown conversion. Every other request delegates to `cachedScrap`.
 *
 * @param urlToCrawl URL to crawl.
 * @param crawlOpts Scrapping options; `engine` defaults to AUTO on the ReaderLM path.
 * @param crawlerOpts Request-level options.
 * @throws AssertionFailureError when the ReaderLM path gets a snapshot without HTML.
 */
async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
    // Disabled VLM path, kept for reference:
    // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
    //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
    //         ...crawlOpts, engine: ENGINE_TYPE.BROWSER
    //     }, crawlerOpts);

    //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);

    //     return;
    // }

    const wantsReaderLM = crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM);
    if (!wantsReaderLM) {
        yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);
        return;
    }

    const readerLMOpts = {
        ...crawlOpts,
        engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
    };
    const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, readerLMOpts, crawlerOpts);
    if (!finalAutoSnapshot?.html) {
        throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
    }

    const instruction = crawlerOpts?.instruction;
    const schema = crawlerOpts?.jsonSchema;
    if (!instruction && !schema) {
        yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
        return;
    }

    const jsonSchema = schema ? JSON.stringify(schema, undefined, 2) : undefined;
    yield* this.lmControl.readerLMFromSnapshot(instruction, jsonSchema, finalAutoSnapshot);
}
/**
 * Yields snapshots for a URL, trying the cheapest source first.
 *
 * Order of attempts, as implemented below:
 * 1. Caller-supplied HTML, wrapped in a synthetic snapshot.
 * 2. Caller-supplied PDF, embedded as a data URL in a synthetic snapshot.
 * 3. The direct engine, when explicitly requested.
 * 4. A fresh cache entry (when cache lookup is applicable and the entry also
 *    satisfies the screenshot and locale requirements).
 * 5. The direct engine, when the stored domain profile says it suffices and
 *    the request does not require a browser.
 * 6. The browser engine; if it fails and a stale cache entry exists, that
 *    entry is yielded instead (except for SecurityCompromiseError).
 *
 * @param urlToCrawl URL to crawl.
 * @param crawlOpts Scrapping options forwarded to the engines.
 * @param crawlerOpts Request-level options (inline html/pdf, cache policy, ...).
 */
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
if (crawlerOpts?.html) {
// Inline HTML: no network access needed.
const snapshot = {
href: urlToCrawl.toString(),
html: crawlerOpts.html,
title: '',
text: '',
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
return;
}
if (crawlerOpts?.pdf) {
// Inline PDF: wrap it in a minimal HTML page that embeds the PDF as a data URL.
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
const snapshot = {
href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
title: '',
text: '',
pdfs: [pdfDataUrl],
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
return;
}
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
// Direct engine explicitly requested: a single snapshot, no cache involved.
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
return;
}
let cache;
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
// Serve from cache only if it is fresh, has both images when screenshots are
// favored, and was captured with the same locale as requested.
if (cache?.isFresh &&
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
) {
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
}
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
const domainProfile = await DomainProfile.fromFirestore(digest);
if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
try {
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
// Expect downstream code to "break" here if it's satisfied with the direct engine
yield snapshot;
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
return;
}
} catch (err: any) {
// A direct-engine failure is not fatal: fall through to the browser engine.
this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
}
}
}
try {
// Options that need post-processing route each browser snapshot through narrowSnapshot.
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
}
return;
}
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
} catch (err: any) {
// `cache` can only be a stale (or otherwise unsuitable) entry at this point.
if (cache && !(err instanceof SecurityCompromiseError)) {
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
}
throw err;
}
}
/**
 * Estimates the token cost of a formatted page and records it on the page.
 *
 * Content tokens are counted once, or three times when the engine name
 * contains `lm`; the description is counted only when there is no content.
 * Text and HTML are added on top, and a screenshot adds a flat 765 tokens.
 * The total is written to `formatted.usage.tokens`.
 *
 * @param formatted Page to price; a falsy value costs 0 and is left untouched.
 * @param scrappingOptions Options whose `engine` selects the content multiplier.
 * @returns The total token amount.
 */
assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
    if (!formatted) {
        return 0;
    }

    let tokens = 0;

    if (formatted.content) {
        const contentTokens = estimateToken(formatted.content);
        const viaLanguageModel = scrappingOptions?.engine?.toLowerCase().includes('lm');
        tokens += viaLanguageModel ? contentTokens * 3 : contentTokens;
    } else if (formatted.description) {
        tokens += estimateToken(formatted.description);
    }

    for (const extra of [formatted.text, formatted.html]) {
        if (extra) {
            tokens += estimateToken(extra);
        }
    }

    const hasScreenshot = Boolean(formatted.screenshotUrl || formatted.screenshot);
    if (hasScreenshot) {
        // OpenAI image token count for 1024x1024 image
        tokens += 765;
    }

    Object.assign(formatted, { usage: { tokens } });

    return tokens;
}
/**
 * Scraps several URLs concurrently and yields progress as a shared array.
 *
 * One `cachedScrap` iterator is created per URL. The yielded `results` array
 * has one slot per URL (initially `undefined`) holding that URL's latest
 * snapshot. The same array instance is yielded every time and is mutated in
 * place. A failure for an individual URL is logged and leaves its slot as is.
 *
 * @param urls URLs to scrap.
 * @param options Scrapping options shared by all URLs.
 * @param crawlerOpts Request-level options shared by all URLs.
 */
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
// Signals "something changed"; replaced with a fresh deferred after every signal.
let nextDeferred = Defer();
let concluded = false;
const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
try {
for await (const x of it) {
results[idx] = x;
if (x) {
nextDeferred.resolve();
nextDeferred = Defer();
}
}
} catch (err: any) {
this.logger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) });
}
};
// Not awaited on purpose: the handlers run in the background and never reject
// (each one catches its own errors), so only the completion flag matters here.
Promise.all(
iterators.map((it, idx) => handler(it, idx))
).finally(() => {
concluded = true;
nextDeferred.resolve();
});
// Initial state: every slot is still undefined.
yield results;
try {
while (!concluded) {
await nextDeferred.promise;
yield results;
}
} finally {
// Runs when all iterators concluded or when the consumer stops early:
// ask every underlying generator to finish.
for (const x of iterators) {
x.return();
}
}
}
/**
 * Translates request-level crawler options into scrapping options.
 *
 * Formatting-related flags are stored in the async context (`threadLocal`),
 * and the returned object carries what the engines need. Script entries that
 * parse as URLs are fetched and replaced by the response text; other entries
 * are used verbatim, and empty results are dropped.
 *
 * @param opts Request-level crawler options.
 * @returns The scrapping options derived from `opts`.
 */
async configure(opts: CrawlerOptions) {
    this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
    this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
    this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
    this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
    this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
    this.threadLocal.set('userAgent', opts.userAgent);
    if (opts.timeout) {
        // The request specifies seconds; the context stores milliseconds.
        this.threadLocal.set('timeout', opts.timeout * 1000);
    }
    this.threadLocal.set('retainImages', opts.retainImages);
    this.threadLocal.set('noGfm', opts.noGfm);

    const wantsImageOutput = ['screenshot', 'pageshot'].some((format) => opts.respondWith.includes(format));

    const crawlOpts: ExtraScrappingOptions = {
        proxyUrl: opts.proxyUrl,
        cookies: opts.setCookies,
        favorScreenshot: wantsImageOutput,
        removeSelector: opts.removeSelector,
        targetSelector: opts.targetSelector,
        waitForSelector: opts.waitForSelector,
        overrideUserAgent: opts.userAgent,
        timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
        withIframe: opts.withIframe,
        withShadowDom: opts.withShadowDom,
        locale: opts.locale,
        referer: opts.referer,
        viewport: opts.viewport,
        engine: opts.engine,
    };

    if (opts.locale) {
        crawlOpts.extraHeaders ??= {};
        crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
    }

    if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
        crawlOpts.favorScreenshot = true;
    }

    // Resolves a list of script sources: URLs are downloaded, inline code is kept.
    const loadScripts = async (sources: string[]) => {
        const bodies = await Promise.all(
            sources.map((source) => {
                if (URL.canParse(source)) {
                    return fetch(source).then((r) => r.text());
                }

                return source;
            })
        );

        return bodies.filter(Boolean);
    };

    if (opts.injectFrameScript?.length) {
        crawlOpts.injectFrameScripts = await loadScripts(opts.injectFrameScript);
    }

    if (opts.injectPageScript?.length) {
        crawlOpts.injectPageScripts = await loadScripts(opts.injectPageScript);
    }

    return crawlOpts;
}
/**
 * Formats a snapshot according to the requested response format.
 *
 * For the ReaderLM and VLM formats the parsed text content is used directly
 * as both `content` and the non-enumerable `textRepresentation`. Every other
 * format is delegated to the snapshot formatter.
 *
 * @param crawlerOptions Request options; `base === 'final'` makes the
 * snapshot's own `href` the presumed URL instead of `nominalUrl`.
 * @param snapshot Snapshot to format.
 * @param nominalUrl URL originally requested.
 * @param urlValidMs Validity window forwarded to the snapshot formatter.
 * @returns The formatted page (or whatever the snapshot formatter returns).
 */
formatSnapshot(
    crawlerOptions: CrawlerOptions,
    snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    },
    nominalUrl?: URL,
    urlValidMs?: number
) {
    const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
    const { respondWith } = crawlerOptions;

    const isLmFormat = respondWith === CONTENT_FORMAT.READER_LM || respondWith === CONTENT_FORMAT.VLM;
    if (!isLmFormat) {
        return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
    }

    const textContent = snapshot.parsed?.textContent;
    const output: FormattedPage = {
        title: snapshot.title,
        content: textContent,
        url: presumedURL?.href || snapshot.href,
        [Symbol.dispose]: () => undefined,
    };

    // Non-enumerable, so it is available for text responses without being enumerated with the other fields.
    Object.defineProperty(output, 'textRepresentation', {
        value: textContent,
        enumerable: false,
    });

    return output;
}
/**
 * Drains `cachedScrap` and returns the last snapshot it produced.
 *
 * An error raised mid-iteration is ignored as long as at least one truthy
 * snapshot was received before it.
 *
 * @param url URL to crawl.
 * @param opts Scrapping options.
 * @param crawlerOptions Request-level options.
 * @returns The last snapshot yielded.
 * @throws The iteration error when no snapshot was obtained, or
 * AssertionFailureError when iteration finished without yielding one.
 */
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
    let latest;
    let failure;

    try {
        for await (const snapshot of this.cachedScrap(url, opts, crawlerOptions)) {
            latest = snapshot;
        }
    } catch (err) {
        failure = err;
    }

    if (latest) {
        return latest;
    }
    if (failure) {
        throw failure;
    }

    throw new AssertionFailureError(`No content available`);
}
/**
 * Crawls a URL and formats the result without the request-level machinery.
 *
 * Iteration stops one snapshot after the first snapshot with parsed content,
 * so the page gets one more chance to settle. If iteration fails after at
 * least one snapshot was seen, that snapshot is formatted instead of failing.
 *
 * @param mode Format passed to the snapshot formatter.
 * @param url URL to crawl.
 * @param opts Scrapping options; `minIntervalMs` is forced to 500.
 * @returns The formatted snapshot.
 * @throws The iteration error when nothing was captured before it, or
 * AssertionFailureError when no snapshot was produced.
 */
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
    const snapshots = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 });

    let latest;
    let contentSeen = false;
    try {
        for await (const snapshot of snapshots) {
            latest = snapshot;

            if (contentSeen) {
                break;
            }

            // After it's good enough, wait for next snapshot;
            contentSeen = Boolean(latest?.parsed?.content);
        }
    } catch (err) {
        if (!latest) {
            throw err;
        }
    }

    if (!latest) {
        throw new AssertionFailureError(`No content available`);
    }

    return this.snapshotFormatter.formatSnapshot(mode, latest, url, this.urlValidMs);
}
/**
 * Records which engine a site section needs, based on a browser snapshot.
 *
 * A snapshot whose HTML was modified by JavaScript marks the section as
 * needing the browser engine; otherwise the direct engine is recorded. Once a
 * profile says "browser" it is never changed here.
 *
 * @param knownSnapshot Snapshot used as evidence; its `href` identifies the section.
 */
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
    const realUrl = new URL(knownSnapshot.href);
    const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
    const existing = await DomainProfile.fromFirestore(digest);

    if (existing?.engine === ENGINE_TYPE.BROWSER) {
        // Mixed engine, always use browser
        return;
    }

    const origin = realUrl.origin.toLowerCase();
    const engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;

    if (existing) {
        // Refresh the existing profile and extend its lifetime.
        Object.assign(existing, {
            origin,
            triggerUrl: realUrl.href,
            path,
            engine,
            expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
        });
        await DomainProfile.save(existing);

        return;
    }

    const record = DomainProfile.from({
        _id: digest,
        origin,
        path,
        triggerUrl: realUrl.href,
        engine,
        createdAt: new Date(),
        expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
    });
    await DomainProfile.save(record);
}
/**
 * Derives the domain-profile key for a URL.
 *
 * The key is the lower-cased origin followed by the parent path, i.e. the
 * pathname with its last segment removed. When that parent path is empty,
 * the full pathname is used instead.
 *
 * @param url URL to derive the key from.
 * @returns The hashed key (`digest`) and the path that went into it.
 */
getDomainProfileUrlDigest(url: URL) {
    const { pathname } = url;

    // Everything before the last '/'; empty when the pathname contains no '/'.
    const lastSlash = pathname.lastIndexOf('/');
    const parentPath = lastSlash >= 0 ? pathname.slice(0, lastSlash) : '';
    const finalPath = parentPath || pathname;

    const digest = md5Hasher.hash(url.origin.toLocaleLowerCase() + finalPath);

    return { digest, path: finalPath };
}
}