web_reader / build /cloud-functions /adaptive-crawler.js
Mohammad Shahid
Include pre-built files for HF deployment
f316cce
"use strict";
// Compiler-emitted decorator helper. Reuses `this.__decorate` when one already exists,
// otherwise defines the fallback below.
// `decorators` is applied to `target` (class decoration, fewer than 3 arguments) or to the
// member `key` of `target` (3 or more arguments); the possibly replaced value is returned.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
// c: number of arguments actually passed. r: the value being decorated - the target itself
// for fewer than 3 arguments, otherwise `desc` (looked up from `target[key]` when `desc` is null).
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
// Delegate to Reflect.decorate when the runtime provides it.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
// Otherwise walk the decorators from last to first; a truthy return value replaces `r`.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
// With more than 3 arguments and a truthy result, (re)define the property; always return `r`.
return c > 3 && r && Object.defineProperty(target, key, r), r;
};
/**
 * Compiler-emitted metadata helper. Reuses `this.__metadata` when one already exists.
 *
 * @param {*} k - Metadata key.
 * @param {*} v - Metadata value.
 * @returns {*} Whatever `Reflect.metadata(k, v)` returns when that function is available,
 *   otherwise `undefined`.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const reflectHasMetadata = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return reflectHasMetadata ? Reflect.metadata(k, v) : undefined;
};
/**
 * Compiler-emitted parameter-decorator helper. Reuses `this.__param` when one already exists.
 *
 * @param {number} paramIndex - Position of the decorated parameter.
 * @param {Function} decorator - Called as `decorator(target, key, paramIndex)`.
 * @returns {Function} A `(target, key)` function that forwards to `decorator` and returns nothing.
 */
var __param = (this && this.__param) || function (paramIndex, decorator) {
    function applyToParameter(target, key) {
        decorator(target, key, paramIndex);
    }
    return applyToParameter;
};
/**
 * Compiler-emitted interop helper. Reuses `this.__importDefault` when one already exists.
 *
 * @param {*} mod - A loaded module value.
 * @returns {*} `mod` itself when it is truthy and flagged `__esModule`; otherwise a wrapper
 *   object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Module-level alias for the class; assigned from a static initialiser inside the class body
// and again when the class decorators are applied at the bottom of the file.
var AdaptiveCrawlerHost_1;
// Scratch variables used by the generated "design:paramtypes" metadata expressions below.
var _a, _b, _c, _d, _e, _f;
// Mark this CommonJS module as transpiled from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export up front; the real value is assigned after the class definition.
exports.AdaptiveCrawlerHost = void 0;
const civkit_1 = require("civkit");
const tsyringe_1 = require("tsyringe");
const shared_1 = require("../shared");
const jina_embeddings_auth_1 = require("../shared/dto/jina-embeddings-auth");
const robots_parser_1 = __importDefault(require("robots-parser"));
const xmldom_1 = require("@xmldom/xmldom");
const adaptive_crawler_options_1 = require("../dto/adaptive-crawler-options");
const crawler_options_1 = require("../dto/crawler-options");
const adaptive_crawl_task_1 = require("../db/adaptive-crawl-task");
const functions_1 = require("firebase-admin/functions");
const get_function_url_1 = require("../utils/get-function-url");
// Shared hasher built with the 'md5' and 'hex' arguments; used below to derive task digests
// and per-URL cache file names.
const md5Hasher = new civkit_1.HashManager('md5', 'hex');
/**
 * Strips the fragment (`#...`) from a URL string.
 *
 * @param {string} url - The URL to normalise.
 * @returns {string} The serialised URL without its fragment, or the input unchanged when it
 *   cannot be parsed as a URL.
 */
const removeURLHash = (url) => {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Not parseable: hand the caller back exactly what it passed in.
        return url;
    }
    parsed.hash = '';
    return parsed.toString();
};
let AdaptiveCrawlerHost = class AdaptiveCrawlerHost extends civkit_1.RPCHost {
// Expose the class through the module-level alias so methods can reach static members via
// AdaptiveCrawlerHost_1.
static { AdaptiveCrawlerHost_1 = this; }
// Name passed to taskQueue(), getFunctionUrl() and the CloudTaskV2 registration for
// singleCrawlQueue.
static { this.__singleCrawlQueueName = 'singleCrawlQueue'; }
constructor(globalLogger, firebaseObjectStorage) {
super(...arguments);
this.globalLogger = globalLogger;
this.firebaseObjectStorage = firebaseObjectStorage;
this.logger = this.globalLogger.child({ service: this.constructor.name });
// Actual cache storage (gcp buckets) exists for 7 days, so here we need to select a time < 7 days.
this.cacheExpiry = 3 * 1000 * 60 * 60 * 24;
}
async init() {
await this.dependencyReady();
this.emit('ready');
}
async adaptiveCrawl(rpcReflect, ctx, auth, crawlerOptions, adaptiveCrawlerOptions) {
this.logger.debug({
adaptiveCrawlerOptions,
crawlerOptions,
});
const uid = await auth.solveUID();
const { useSitemap, maxPages } = adaptiveCrawlerOptions;
let tmpUrl = ctx.req.url.slice(1)?.trim();
if (!tmpUrl) {
tmpUrl = crawlerOptions.url?.trim() ?? '';
}
const targetUrl = new URL(tmpUrl);
if (!targetUrl) {
const latestUser = uid ? await auth.assertUser() : undefined;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
return this.getIndex(latestUser);
}
return (0, civkit_1.assignTransferProtocolMeta)(`${this.getIndex(latestUser)}`, { contentType: 'text/plain', envelope: null });
}
const meta = {
targetUrl: targetUrl.toString(),
useSitemap,
maxPages,
};
const digest = md5Hasher.hash(JSON.stringify(meta));
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
const existing = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest);
if (existing?.createdAt) {
if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
return { taskId: shortDigest };
}
else {
this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
}
}
await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
_id: shortDigest,
status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PENDING,
statusText: 'Pending',
meta,
createdAt: new Date(),
urls: [],
processed: {},
failed: {},
});
let urls = [];
if (useSitemap) {
urls = await this.crawlUrlsFromSitemap(targetUrl, maxPages);
}
if (urls.length > 0) {
await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING,
statusText: `Processing 0/${urls.length}`,
urls,
});
const promises = [];
for (const url of urls) {
promises.push((0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
shortDigest, url, token: auth.bearerToken, meta
}, {
dispatchDeadlineSeconds: 1800,
uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
}));
}
;
await Promise.all(promises);
}
else {
meta.useSitemap = false;
await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
urls: [targetUrl.toString()],
});
await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
shortDigest, url: targetUrl.toString(), token: auth.bearerToken, meta
}, {
dispatchDeadlineSeconds: 1800,
uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
});
}
return { taskId: shortDigest };
}
async adaptiveCrawlStatus(rpcReflect, ctx, auth, taskId, urls = []) {
if (!taskId) {
throw new civkit_1.ParamValidationError('taskId is required');
}
const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(taskId);
if (!state) {
throw new civkit_1.AssertionFailureError('The task does not exist');
}
if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) {
throw new civkit_1.AssertionFailureError('The task has expired');
}
if (urls.length) {
const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
if (urls.includes(url)) {
const raw = await this.firebaseObjectStorage.downloadFile(cachePath);
state.processed[url] = JSON.parse(raw.toString('utf-8'));
}
});
await Promise.all(promises);
}
return state;
}
async singleCrawlQueue(shortDigest, url, token, meta) {
const error = {
reason: ''
};
const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest);
if (state?.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
return;
}
try {
url = removeURLHash(url);
}
catch (e) {
error.reason = `Failed to parse url: ${url}`;
}
this.logger.debug(shortDigest, url, meta);
const cachePath = `adaptive-crawl-task/${shortDigest}/${md5Hasher.hash(url)}`;
if (!error.reason) {
const result = meta.useSitemap
? await this.handleSingleCrawl(shortDigest, url, token, cachePath)
: await this.handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath);
if (!result) {
return;
}
error.reason = result.error.reason;
}
await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
const state = await transaction.get(ref);
const data = state.data();
if (error.reason) {
data.failed[url] = error;
}
else {
data.processed[url] = cachePath;
}
const status = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length
? adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED : adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING;
const statusText = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length
? `Completed ${Object.keys(data.processed).length} Succeeded, ${Object.keys(data.failed).length} Failed`
: `Processing ${Object.keys(data.processed).length + Object.keys(data.failed).length}/${data.urls.length}`;
const payload = {
status,
statusText,
processed: data.processed,
failed: data.failed,
};
if (status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
payload.finishedAt = new Date();
payload.duration = new Date().getTime() - data.createdAt.toDate().getTime();
}
transaction.update(ref, payload);
});
}
async handleSingleCrawl(shortDigest, url, token, cachePath) {
const error = {
reason: ''
};
const response = await fetch('https://r.jina.ai', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${token}`,
'Accept': 'application/json',
},
body: JSON.stringify({ url })
});
if (!response.ok) {
error.reason = `Failed to crawl ${url}, ${response.statusText}`;
}
else {
const json = await response.json();
await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), {
metadata: {
contentType: 'application/json',
}
});
}
return {
error,
};
}
async handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath) {
const error = {
reason: ''
};
const response = await fetch('https://r.jina.ai', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${token}`,
'Accept': 'application/json',
'X-With-Links-Summary': 'true',
},
body: JSON.stringify({ url })
});
if (!response.ok) {
error.reason = `Failed to crawl ${url}, ${response.statusText}`;
}
else {
const json = await response.json();
await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), {
metadata: {
contentType: 'application/json',
}
});
const title = json.data.title;
const description = json.data.description;
const links = json.data.links;
const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
for (const url of relevantUrls) {
let abortContinue = false;
let abortBreak = false;
await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
const state = await transaction.get(ref);
const data = state.data();
if (data.urls.includes(url)) {
this.logger.debug('Recursive CONTINUE', data);
abortContinue = true;
return;
}
const urls = [
...data.urls,
url
];
if (urls.length > meta.maxPages || data.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
this.logger.debug('Recursive BREAK', data);
abortBreak = true;
return;
}
transaction.update(ref, { urls });
});
if (abortContinue) {
continue;
}
if (abortBreak) {
break;
}
await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
shortDigest, url, token, meta
}, {
dispatchDeadlineSeconds: 1800,
uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
});
}
;
}
return {
error,
};
}
async getRelevantUrls(token, { title, description, links }) {
const invalidSuffix = [
'.zip',
'.docx',
'.pptx',
'.xlsx',
];
const validLinks = Object.entries(links)
.map(([title, link]) => link)
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
let query = '';
if (!description) {
query += title;
}
else {
query += `TITLE: ${title}; DESCRIPTION: ${description}`;
}
const data = {
model: 'jina-reranker-v2-base-multilingual',
query,
top_n: 15,
documents: validLinks,
};
const response = await fetch('https://api.jina.ai/v1/rerank', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${token}`
},
body: JSON.stringify(data)
});
const json = (await response.json());
const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
}
/**
 * Placeholder for the service index/usage payload.
 *
 * The whole implementation is commented out, so this currently returns `undefined` and the
 * `user` argument is ignored.
 */
getIndex(user) {
// TODO: the usage information below needs to be updated
// const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
// Object.assign(indexObject, {
// usage1: 'https://r.jina.ai/YOUR_URL',
// usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
// homepage: 'https://jina.ai/reader',
// sourceCode: 'https://github.com/jina-ai/reader',
// });
// if (user) {
// indexObject[''] = undefined;
// indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
// indexObject.balanceLeft = user.wallet.total_balance;
// }
// return indexObject;
}
async crawlUrlsFromSitemap(url, maxPages) {
const sitemapsFromRobotsTxt = await this.getSitemapsFromRobotsTxt(url);
const initialSitemaps = [];
if (sitemapsFromRobotsTxt === null) {
initialSitemaps.push(`${url.origin}/sitemap.xml`);
}
else {
initialSitemaps.push(...sitemapsFromRobotsTxt);
}
const allUrls = new Set();
const processedSitemaps = new Set();
const fetchSitemapUrls = async (sitemapUrl) => {
sitemapUrl = sitemapUrl.trim();
if (processedSitemaps.has(sitemapUrl)) {
return;
}
processedSitemaps.add(sitemapUrl);
try {
const response = await fetch(sitemapUrl);
const sitemapContent = await response.text();
const parser = new xmldom_1.DOMParser();
const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml');
// handle normal sitemap
const urlElements = xmlDoc.getElementsByTagName('url');
for (let i = 0; i < urlElements.length; i++) {
const locElement = urlElements[i].getElementsByTagName('loc')[0];
if (locElement) {
const loc = locElement.textContent?.trim() || '';
if (loc.startsWith(url.origin) && !loc.endsWith('.xml')) {
allUrls.add(removeURLHash(loc));
}
if (allUrls.size >= maxPages) {
return;
}
}
}
// handle sitemap index
const sitemapElements = xmlDoc.getElementsByTagName('sitemap');
for (let i = 0; i < sitemapElements.length; i++) {
const locElement = sitemapElements[i].getElementsByTagName('loc')[0];
if (locElement) {
await fetchSitemapUrls(locElement.textContent?.trim() || '');
if (allUrls.size >= maxPages) {
return;
}
}
}
}
catch (error) {
this.logger.error(`Error fetching sitemap ${sitemapUrl}:`, error);
}
};
for (const sitemapUrl of initialSitemaps) {
await fetchSitemapUrls(sitemapUrl);
if (allUrls.size >= maxPages) {
break;
}
}
const urlsToProcess = Array.from(allUrls).slice(0, maxPages);
return urlsToProcess;
}
async getSitemapsFromRobotsTxt(url) {
const hostname = url.origin;
const robotsUrl = `${hostname}/robots.txt`;
const response = await fetch(robotsUrl);
if (response.status === 404) {
return null;
}
const robotsTxt = await response.text();
if (robotsTxt.length) {
const robot = (0, robots_parser_1.default)(robotsUrl, robotsTxt);
return robot.getSitemaps();
}
return null;
}
};
// Publish the class on the module exports (reassigned after class decoration at the end of the file).
exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost;
// Generated decorator wiring for `adaptiveCrawl`: applies shared_1.CloudHTTPv2 with the runtime
// options below, decorates parameter 0 with shared_1.RPCReflect() and parameter 1 with
// shared_1.Ctx(), and attaches the emitted "design:*" type metadata.
__decorate([
(0, shared_1.CloudHTTPv2)({
runtime: {
memory: '1GiB',
timeoutSeconds: 300,
concurrency: 22,
},
tags: ['Crawler'],
httpMethod: ['post', 'get'],
returnType: [String],
}),
__param(0, (0, shared_1.RPCReflect)()),
__param(1, (0, shared_1.Ctx)()),
__metadata("design:type", Function),
__metadata("design:paramtypes", [typeof (_c = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _c : Object, Object, typeof (_d = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _d : Object, crawler_options_1.CrawlerOptions,
adaptive_crawler_options_1.AdaptiveCrawlerOptions]),
__metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "adaptiveCrawl", null);
// Generated decorator wiring for `adaptiveCrawlStatus`: applies shared_1.CloudHTTPv2 with the
// runtime options below, decorates parameters 0 and 1 with RPCReflect() / Ctx(), binds
// parameter 3 to Param('taskId') and parameter 4 to Param('urls'), and attaches the emitted
// "design:*" type metadata.
__decorate([
(0, shared_1.CloudHTTPv2)({
runtime: {
memory: '1GiB',
timeoutSeconds: 300,
concurrency: 22,
},
tags: ['Crawler'],
httpMethod: ['post', 'get'],
returnType: adaptive_crawl_task_1.AdaptiveCrawlTask,
}),
__param(0, (0, shared_1.RPCReflect)()),
__param(1, (0, shared_1.Ctx)()),
__param(3, (0, shared_1.Param)('taskId')),
__param(4, (0, shared_1.Param)('urls')),
__metadata("design:type", Function),
__metadata("design:paramtypes", [typeof (_e = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _e : Object, Object, typeof (_f = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _f : Object, String, Array]),
__metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "adaptiveCrawlStatus", null);
// Generated decorator wiring for `singleCrawlQueue`: applies shared_1.CloudTaskV2 under the
// class's __singleCrawlQueueName with the runtime, retry and rate-limit options below, and
// binds parameters 0-3 to Param('shortDigest'), Param('url'), Param('token') and Param('meta').
__decorate([
(0, shared_1.CloudTaskV2)({
name: AdaptiveCrawlerHost.__singleCrawlQueueName,
runtime: {
cpu: 1,
memory: '1GiB',
timeoutSeconds: 3600,
concurrency: 2,
maxInstances: 200,
retryConfig: {
maxAttempts: 3,
minBackoffSeconds: 60,
},
rateLimits: {
maxConcurrentDispatches: 150,
maxDispatchesPerSecond: 5,
},
}
}),
__param(0, (0, shared_1.Param)('shortDigest')),
__param(1, (0, shared_1.Param)('url')),
__param(2, (0, shared_1.Param)('token')),
__param(3, (0, shared_1.Param)('meta')),
__metadata("design:type", Function),
__metadata("design:paramtypes", [String, String, String, Object]),
__metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "singleCrawlQueue", null);
// Apply the class-level decorators (tsyringe singleton() plus constructor parameter type
// metadata) and store the result in the export, the local binding and the module-level alias.
exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost = AdaptiveCrawlerHost_1 = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [typeof (_a = typeof shared_1.Logger !== "undefined" && shared_1.Logger) === "function" ? _a : Object, typeof (_b = typeof shared_1.FirebaseStorageBucketControl !== "undefined" && shared_1.FirebaseStorageBucketControl) === "function" ? _b : Object])
], AdaptiveCrawlerHost);
//# sourceMappingURL=adaptive-crawler.js.map