diff --git a/.gitignore b/.gitignore
index b1b850be6634a94404231bba62280b2aa7938179..96d70870f6cd7169b51c6ef3fc002c70cbd6419e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,7 +69,7 @@ node_modules/
toy*.ts
.DS_Store
-build/
+# build/ # Commented out for HF deployment - need pre-built files
.firebase-emu/
*.log
.DS_Store
diff --git a/build/api/crawler.js b/build/api/crawler.js
new file mode 100644
index 0000000000000000000000000000000000000000..0c0dcab31e244433eef6aa957beff4a75420e088
--- /dev/null
+++ b/build/api/crawler.js
@@ -0,0 +1,1170 @@
+"use strict";
+var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
+ var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
+ if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
+ else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
+ return c > 3 && r && Object.defineProperty(target, key, r), r;
+};
+var __metadata = (this && this.__metadata) || function (k, v) {
+ if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
+};
+var __param = (this && this.__param) || function (paramIndex, decorator) {
+ return function (target, key) { decorator(target, key, paramIndex); }
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+var _a, _b, _c, _d, _e, _f;
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.CrawlerHost = void 0;
+const tsyringe_1 = require("tsyringe");
+const url_1 = require("url");
+const crypto_1 = require("crypto");
+const lodash_1 = __importDefault(require("lodash"));
+const civ_rpc_1 = require("civkit/civ-rpc");
+const lang_1 = require("civkit/lang");
+const defer_1 = require("civkit/defer");
+const decorators_1 = require("civkit/decorators");
+const fancy_file_1 = require("civkit/fancy-file");
+const crawler_options_1 = require("../dto/crawler-options");
+const crawled_1 = require("../db/crawled");
+const domain_blockade_1 = require("../db/domain-blockade");
+const transform_server_event_stream_1 = require("../lib/transform-server-event-stream");
+const puppeteer_1 = require("../services/puppeteer");
+const jsdom_1 = require("../services/jsdom");
+const snapshot_formatter_1 = require("../services/snapshot-formatter");
+const curl_1 = require("../services/curl");
+const lm_1 = require("../services/lm");
+const misc_1 = require("../utils/misc");
+const cf_browser_rendering_1 = require("../services/cf-browser-rendering");
+const logger_1 = require("../services/logger");
+const rate_limit_1 = require("../shared/services/rate-limit");
+const async_context_1 = require("../services/async-context");
+const registry_1 = require("../services/registry");
+const errors_1 = require("../services/errors");
+const openai_1 = require("../shared/utils/openai");
+const proxy_provider_1 = require("../shared/services/proxy-provider");
+const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket");
+const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth");
+const robots_text_1 = require("../services/robots-text");
+const temp_file_1 = require("../services/temp-file");
+const misc_2 = require("../services/misc");
+const http_1 = require("civkit/http");
+const geoip_1 = require("../services/geoip");
+const indexProto = {
+ toString: function () {
+ return (0, lodash_1.default)(this)
+ .toPairs()
+ .map(([k, v]) => k ? `[${lodash_1.default.upperFirst(lodash_1.default.lowerCase(k))}] ${v}` : '')
+ .value()
+ .join('\n') + '\n';
+ }
+};
+let CrawlerHost = class CrawlerHost extends civ_rpc_1.RPCHost {
+ constructor(globalLogger, puppeteerControl, curlControl, cfBrowserRendering, proxyProvider, lmControl, jsdomControl, snapshotFormatter, firebaseObjectStorage, rateLimitControl, threadLocal, robotsTxtService, tempFileManager, geoIpService, miscService) {
+ super(...arguments);
+ this.globalLogger = globalLogger;
+ this.puppeteerControl = puppeteerControl;
+ this.curlControl = curlControl;
+ this.cfBrowserRendering = cfBrowserRendering;
+ this.proxyProvider = proxyProvider;
+ this.lmControl = lmControl;
+ this.jsdomControl = jsdomControl;
+ this.snapshotFormatter = snapshotFormatter;
+ this.firebaseObjectStorage = firebaseObjectStorage;
+ this.rateLimitControl = rateLimitControl;
+ this.threadLocal = threadLocal;
+ this.robotsTxtService = robotsTxtService;
+ this.tempFileManager = tempFileManager;
+ this.geoIpService = geoIpService;
+ this.miscService = miscService;
+ this.logger = this.globalLogger.child({ service: this.constructor.name });
+ this.cacheRetentionMs = 1000 * 3600 * 24 * 7;
+ this.cacheValidMs = 1000 * 3600;
+ this.urlValidMs = 1000 * 3600 * 4;
+ this.abuseBlockMs = 1000 * 3600;
+ this.domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
+ this.batchedCaches = [];
+ this.proxyIterMap = new WeakMap();
+ puppeteerControl.on('crawled', async (snapshot, options) => {
+ if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
+ return;
+ }
+ if (options.cookies?.length || options.private) {
+ // Potential privacy issue, dont cache if cookies are used
+ return;
+ }
+ if (options.injectFrameScripts?.length || options.injectPageScripts?.length || options.viewport) {
+ // Potentially mangeled content, dont cache if scripts are injected
+ return;
+ }
+ if (snapshot.isIntermediate) {
+ return;
+ }
+ if (!snapshot.lastMutationIdle) {
+ // Never reached mutationIdle, presumably too short timeout
+ return;
+ }
+ if (options.locale) {
+ Reflect.set(snapshot, 'locale', options.locale);
+ }
+ const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
+ if (analyzed.tokens < 200) {
+ // Does not contain enough content
+ if (snapshot.status !== 200) {
+ return;
+ }
+ if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile')) {
+ return;
+ }
+ }
+ await this.setToCache(options.url, snapshot);
+ });
+ puppeteerControl.on('abuse', async (abuseEvent) => {
+ this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
+ await domain_blockade_1.DomainBlockade.save(domain_blockade_1.DomainBlockade.from({
+ domain: abuseEvent.url.hostname.toLowerCase(),
+ triggerReason: `${abuseEvent.reason}`,
+ triggerUrl: abuseEvent.url.toString(),
+ createdAt: new Date(),
+ expireAt: new Date(Date.now() + this.abuseBlockMs),
+ })).catch((err) => {
+ this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: (0, lang_1.marshalErrorLike)(err) });
+ });
+ });
+ setInterval(() => {
+ const thisBatch = this.batchedCaches;
+ this.batchedCaches = [];
+ if (!thisBatch.length) {
+ return;
+ }
+ const batch = crawled_1.Crawled.DB.batch();
+ for (const x of thisBatch) {
+ batch.set(crawled_1.Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true });
+ }
+ batch.commit()
+ .then(() => {
+ this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
+ })
+ .catch((err) => {
+ this.logger.warn(`Failed to save cache in batch`, { err });
+ });
+ }, 1000 * 10 + Math.round(1000 * Math.random())).unref();
+ }
+ async init() {
+ await this.dependencyReady();
+ if (this.puppeteerControl.effectiveUA) {
+ this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA);
+ }
+ this.emit('ready');
+ }
+ async getIndex(auth) {
+ const indexObject = Object.create(indexProto);
+ Object.assign(indexObject, {
+ usage1: 'https://r.jina.ai/YOUR_URL',
+ usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
+ homepage: 'https://jina.ai/reader',
+ });
+ await auth?.solveUID();
+ if (auth && auth.user) {
+ indexObject[''] = undefined;
+ indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
+ indexObject.balanceLeft = auth.user.wallet.total_balance;
+ }
+ return indexObject;
+ }
+ async getIndexCtrl(ctx, auth) {
+ const indexObject = await this.getIndex(auth);
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
+ return indexObject;
+ }
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
+ }
+ async crawl(rpcReflect, ctx, auth, crawlerOptionsHeaderOnly, crawlerOptionsParamsAllowed) {
+ const uid = await auth.solveUID();
+ let chargeAmount = 0;
+ const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
+ const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth);
+ // Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion
+ const targetUrl = await this.getTargetUrl((0, misc_1.tryDecodeURIComponent)(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions);
+ if (!targetUrl) {
+ return await this.getIndex(auth);
+ }
+ // Prevent circular crawling
+ this.puppeteerControl.circuitBreakerHosts.add(ctx.hostname.toLowerCase());
+ if (uid) {
+ const user = await auth.assertUser();
+ if (!(user.wallet.total_balance > 0)) {
+ throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
+ }
+ const rateLimitPolicy = auth.getRateLimits('CRAWL') || [
+ parseInt(user.metadata?.speed_level) >= 2 ?
+ rate_limit_1.RateLimitDesc.from({
+ occurrence: 5000,
+ periodSeconds: 60
+ }) :
+ rate_limit_1.RateLimitDesc.from({
+ occurrence: 500,
+ periodSeconds: 60
+ })
+ ];
+ const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'], ...rateLimitPolicy);
+ rpcReflect.finally(() => {
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+ return;
+ }
+ if (chargeAmount) {
+ auth.reportUsage(chargeAmount, `reader-crawl`).catch((err) => {
+ this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) });
+ });
+ apiRoll.chargeAmount = chargeAmount;
+ }
+ });
+ }
+ else if (ctx.ip) {
+ const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, ['CRAWL'], [
+ // 20 requests per minute
+ new Date(Date.now() - 60 * 1000), 20
+ ]);
+ rpcReflect.finally(() => {
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+ return;
+ }
+ apiRoll.chargeAmount = chargeAmount;
+ });
+ }
+ if (!uid) {
+ // Enforce no proxy is allocated for anonymous users due to abuse.
+ crawlerOptions.proxy = 'none';
+ const blockade = (await domain_blockade_1.DomainBlockade.fromFirestoreQuery(domain_blockade_1.DomainBlockade.COLLECTION
+ .where('domain', '==', targetUrl.hostname.toLowerCase())
+ .where('expireAt', '>=', new Date())
+ .limit(1)))[0];
+ if (blockade) {
+ throw new errors_1.SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+ }
+ }
+ const crawlOpts = await this.configure(crawlerOptions);
+ this.logger.info(`Accepting request from ${uid || ctx.ip}`, { opts: crawlerOptions });
+ if (crawlerOptions.robotsTxt) {
+ await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
+ }
+ if (rpcReflect.signal.aborted) {
+ return;
+ }
+ if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
+ const sseStream = new transform_server_event_stream_1.OutputServerEventStream();
+ rpcReflect.return(sseStream);
+ try {
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
+ if (!scrapped) {
+ continue;
+ }
+ if (rpcReflect.signal.aborted) {
+ break;
+ }
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
+ chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
+ sseStream.write({
+ event: 'data',
+ data: formatted,
+ });
+ if (chargeAmount && scrapped.pdfs?.length) {
+ break;
+ }
+ }
+ }
+ catch (err) {
+ this.logger.error(`Failed to crawl ${targetUrl}`, { err: (0, lang_1.marshalErrorLike)(err) });
+ sseStream.write({
+ event: 'error',
+ data: (0, lang_1.marshalErrorLike)(err),
+ });
+ }
+ sseStream.end();
+ return sseStream;
+ }
+ let lastScrapped;
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
+ try {
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
+ lastScrapped = scrapped;
+ if (rpcReflect.signal.aborted) {
+ break;
+ }
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
+ continue;
+ }
+ if (!scrapped.title) {
+ continue;
+ }
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
+ chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
+ if (scrapped?.pdfs?.length && !chargeAmount) {
+ continue;
+ }
+ return formatted;
+ }
+ }
+ catch (err) {
+ if (!lastScrapped) {
+ throw err;
+ }
+ }
+ if (!lastScrapped) {
+ if (crawlOpts.targetSelector) {
+ throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
+ }
+ throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`);
+ }
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
+ chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
+ return formatted;
+ }
+ if (crawlerOptions.isRequestingCompoundContentFormat()) {
+ throw new civ_rpc_1.ParamValidationError({
+ path: 'respondWith',
+ message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.`
+ });
+ }
+ try {
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
+ lastScrapped = scrapped;
+ if (rpcReflect.signal.aborted) {
+ break;
+ }
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
+ continue;
+ }
+ if (!scrapped.title) {
+ continue;
+ }
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
+ chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } });
+ }
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } });
+ }
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
+ }
+ }
+ catch (err) {
+ if (!lastScrapped) {
+ throw err;
+ }
+ }
+ if (!lastScrapped) {
+ if (crawlOpts.targetSelector) {
+ throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
+ }
+ throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`);
+ }
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
+ chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } });
+ }
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } });
+ }
+ return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
+ }
+ async getTargetUrl(originPath, crawlerOptions) {
+ let url = '';
+ const targetUrlFromGet = originPath.slice(1);
+ if (crawlerOptions.pdf) {
+ const pdfFile = crawlerOptions.pdf;
+ const identifier = pdfFile instanceof fancy_file_1.FancyFile ? (await pdfFile.sha256Sum) : (0, crypto_1.randomUUID)();
+ url = `blob://pdf/${identifier}`;
+ crawlerOptions.url ??= url;
+ }
+ else if (targetUrlFromGet) {
+ url = targetUrlFromGet.trim();
+ }
+ else if (crawlerOptions.url) {
+ url = crawlerOptions.url.trim();
+ }
+ if (!url) {
+ throw new civ_rpc_1.ParamValidationError({
+ message: 'No URL provided',
+ path: 'url'
+ });
+ }
+ const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url);
+ if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) {
+ throw new errors_1.SecurityCompromiseError({
+ message: `Circular hostname: ${safeURL.protocol}`,
+ path: 'url'
+ });
+ }
+ crawlerOptions._hintIps = ips;
+ return safeURL;
+ }
+ getUrlDigest(urlToCrawl) {
+ const normalizedURL = new URL(urlToCrawl);
+ if (!normalizedURL.hash.startsWith('#/')) {
+ normalizedURL.hash = '';
+ }
+ const normalizedUrl = normalizedURL.toString().toLowerCase();
+ const digest = snapshot_formatter_1.md5Hasher.hash(normalizedUrl.toString());
+ return digest;
+ }
+ async *queryCache(urlToCrawl, cacheTolerance) {
+ const digest = this.getUrlDigest(urlToCrawl);
+ const cache = (await (crawled_1.Crawled.fromFirestoreQuery(crawled_1.Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)).catch((err) => {
+ this.logger.warn(`Failed to query cache, unknown issue`, { err });
+ // https://github.com/grpc/grpc-node/issues/2647
+ // https://github.com/googleapis/nodejs-firestore/issues/1023
+ // https://github.com/googleapis/nodejs-firestore/issues/1023
+ return undefined;
+ })))?.[0];
+ yield cache;
+ if (!cache) {
+ return;
+ }
+ const age = Date.now() - cache.createdAt.valueOf();
+ const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+ url: urlToCrawl, digest, age, stale, cacheTolerance
+ });
+ let snapshot;
+ let screenshotUrl;
+ let pageshotUrl;
+ const preparations = [
+ this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
+ snapshot = JSON.parse(r.toString('utf-8'));
+ }),
+ cache.screenshotAvailable ?
+ this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
+ screenshotUrl = r;
+ }) :
+ Promise.resolve(undefined),
+ cache.pageshotAvailable ?
+ this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
+ pageshotUrl = r;
+ }) :
+ Promise.resolve(undefined)
+ ];
+ try {
+ await Promise.all(preparations);
+ }
+ catch (_err) {
+ // Swallow cache errors.
+ return undefined;
+ }
+ yield {
+ isFresh: !stale,
+ ...cache,
+ snapshot: {
+ ...snapshot,
+ screenshot: undefined,
+ pageshot: undefined,
+ screenshotUrl,
+ pageshotUrl,
+ }
+ };
+ }
+ async setToCache(urlToCrawl, snapshot) {
+ const digest = this.getUrlDigest(urlToCrawl);
+ this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
+ const nowDate = new Date();
+ const cache = crawled_1.Crawled.from({
+ _id: (0, crypto_1.randomUUID)(),
+ url: urlToCrawl.toString(),
+ createdAt: nowDate,
+ expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
+ htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
+ urlPathDigest: digest,
+ });
+ const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`, Buffer.from(JSON.stringify({
+ ...snapshot,
+ screenshot: undefined,
+ pageshot: undefined,
+ }), 'utf-8'), {
+ metadata: {
+ contentType: 'application/json',
+ }
+ }).then((r) => {
+ cache.snapshotAvailable = true;
+ return r;
+ });
+ if (snapshot.screenshot) {
+ await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
+ metadata: {
+ contentType: 'image/png',
+ }
+ });
+ cache.screenshotAvailable = true;
+ }
+ if (snapshot.pageshot) {
+ await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
+ metadata: {
+ contentType: 'image/png',
+ }
+ });
+ cache.pageshotAvailable = true;
+ }
+ await savingOfSnapshot;
+ this.batchedCaches.push(cache);
+ // const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
+ // this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
+ // return undefined;
+ // });
+ return cache;
+ }
+ async *iterSnapshots(urlToCrawl, crawlOpts, crawlerOpts) {
+ // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
+ // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
+ // ...crawlOpts, engine: ENGINE_TYPE.BROWSER
+ // }, crawlerOpts);
+ // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
+ // return;
+ // }
+ if (crawlerOpts?.respondWith.includes(crawler_options_1.CONTENT_FORMAT.READER_LM)) {
+ const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
+ ...crawlOpts,
+ engine: crawlOpts?.engine || crawler_options_1.ENGINE_TYPE.AUTO,
+ }, crawler_options_1.CrawlerOptions.from({
+ ...crawlerOpts,
+ respondWith: 'html',
+ }));
+ if (!finalAutoSnapshot?.html) {
+ throw new civ_rpc_1.AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
+ }
+ if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
+ const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
+ yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
+ return;
+ }
+ try {
+ yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
+ }
+ catch (err) {
+ if (err instanceof http_1.HTTPServiceError && err.status === 429) {
+ throw new errors_1.ServiceNodeResourceDrainError(`Reader LM is at capacity, please try again later.`);
+ }
+ throw err;
+ }
+ return;
+ }
+ yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);
+ }
+ async *cachedScrap(urlToCrawl, crawlOpts, crawlerOpts) {
+ if (crawlerOpts?.html) {
+ const snapshot = {
+ href: urlToCrawl.toString(),
+ html: crawlerOpts.html,
+ title: '',
+ text: '',
+ };
+ yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
+ return;
+ }
+ if (crawlerOpts?.pdf) {
+ const pdfFile = crawlerOpts.pdf instanceof fancy_file_1.FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
+ const pdfLocalPath = (0, url_1.pathToFileURL)((await pdfFile.filePath));
+ const snapshot = {
+ href: urlToCrawl.toString(),
+ html: `