"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __param = (this && this.__param) || function (paramIndex, decorator) { return function (target, key) { decorator(target, key, paramIndex); } }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; var AdaptiveCrawlerHost_1; var _a, _b, _c, _d, _e, _f; Object.defineProperty(exports, "__esModule", { value: true }); exports.AdaptiveCrawlerHost = void 0; const civkit_1 = require("civkit"); const tsyringe_1 = require("tsyringe"); const shared_1 = require("../shared"); const jina_embeddings_auth_1 = require("../shared/dto/jina-embeddings-auth"); const robots_parser_1 = __importDefault(require("robots-parser")); const xmldom_1 = require("@xmldom/xmldom"); const adaptive_crawler_options_1 = require("../dto/adaptive-crawler-options"); const crawler_options_1 = require("../dto/crawler-options"); const adaptive_crawl_task_1 = require("../db/adaptive-crawl-task"); const functions_1 = require("firebase-admin/functions"); const get_function_url_1 = require("../utils/get-function-url"); const md5Hasher = new civkit_1.HashManager('md5', 'hex'); const removeURLHash = (url) => { try { const o = new URL(url); o.hash = ''; return o.toString(); } catch (e) { return url; } }; let AdaptiveCrawlerHost = class AdaptiveCrawlerHost extends civkit_1.RPCHost { static { AdaptiveCrawlerHost_1 = this; } static { this.__singleCrawlQueueName = 'singleCrawlQueue'; } constructor(globalLogger, firebaseObjectStorage) { super(...arguments); this.globalLogger = globalLogger; this.firebaseObjectStorage = firebaseObjectStorage; this.logger = this.globalLogger.child({ service: this.constructor.name }); // Actual cache storage (gcp buckets) exists for 7 days, so here we need to select a time < 7 days. this.cacheExpiry = 3 * 1000 * 60 * 60 * 24; } async init() { await this.dependencyReady(); this.emit('ready'); } async adaptiveCrawl(rpcReflect, ctx, auth, crawlerOptions, adaptiveCrawlerOptions) { this.logger.debug({ adaptiveCrawlerOptions, crawlerOptions, }); const uid = await auth.solveUID(); const { useSitemap, maxPages } = adaptiveCrawlerOptions; let tmpUrl = ctx.req.url.slice(1)?.trim(); if (!tmpUrl) { tmpUrl = crawlerOptions.url?.trim() ?? ''; } const targetUrl = new URL(tmpUrl); if (!targetUrl) { const latestUser = uid ? await auth.assertUser() : undefined; if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { return this.getIndex(latestUser); } return (0, civkit_1.assignTransferProtocolMeta)(`${this.getIndex(latestUser)}`, { contentType: 'text/plain', envelope: null }); } const meta = { targetUrl: targetUrl.toString(), useSitemap, maxPages, }; const digest = md5Hasher.hash(JSON.stringify(meta)); const shortDigest = Buffer.from(digest, 'hex').toString('base64url'); const existing = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest); if (existing?.createdAt) { if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) { this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`); return { taskId: shortDigest }; } else { this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`); } } await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({ _id: shortDigest, status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PENDING, statusText: 'Pending', meta, createdAt: new Date(), urls: [], processed: {}, failed: {}, }); let urls = []; if (useSitemap) { urls = await this.crawlUrlsFromSitemap(targetUrl, maxPages); } if (urls.length > 0) { await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({ status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING, statusText: `Processing 0/${urls.length}`, urls, }); const promises = []; for (const url of urls) { promises.push((0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({ shortDigest, url, token: auth.bearerToken, meta }, { dispatchDeadlineSeconds: 1800, uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName), })); } ; await Promise.all(promises); } else { meta.useSitemap = false; await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({ urls: [targetUrl.toString()], }); await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({ shortDigest, url: targetUrl.toString(), token: auth.bearerToken, meta }, { dispatchDeadlineSeconds: 1800, uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName), }); } return { taskId: shortDigest }; } async adaptiveCrawlStatus(rpcReflect, ctx, auth, taskId, urls = []) { if (!taskId) { throw new civkit_1.ParamValidationError('taskId is required'); } const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(taskId); if (!state) { throw new civkit_1.AssertionFailureError('The task does not exist'); } if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) { throw new civkit_1.AssertionFailureError('The task has expired'); } if (urls.length) { const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => { if (urls.includes(url)) { const raw = await this.firebaseObjectStorage.downloadFile(cachePath); state.processed[url] = JSON.parse(raw.toString('utf-8')); } }); await Promise.all(promises); } return state; } async singleCrawlQueue(shortDigest, url, token, meta) { const error = { reason: '' }; const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest); if (state?.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) { return; } try { url = removeURLHash(url); } catch (e) { error.reason = `Failed to parse url: ${url}`; } this.logger.debug(shortDigest, url, meta); const cachePath = `adaptive-crawl-task/${shortDigest}/${md5Hasher.hash(url)}`; if (!error.reason) { const result = meta.useSitemap ? await this.handleSingleCrawl(shortDigest, url, token, cachePath) : await this.handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath); if (!result) { return; } error.reason = result.error.reason; } await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => { const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest); const state = await transaction.get(ref); const data = state.data(); if (error.reason) { data.failed[url] = error; } else { data.processed[url] = cachePath; } const status = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length ? adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED : adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING; const statusText = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length ? `Completed ${Object.keys(data.processed).length} Succeeded, ${Object.keys(data.failed).length} Failed` : `Processing ${Object.keys(data.processed).length + Object.keys(data.failed).length}/${data.urls.length}`; const payload = { status, statusText, processed: data.processed, failed: data.failed, }; if (status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) { payload.finishedAt = new Date(); payload.duration = new Date().getTime() - data.createdAt.toDate().getTime(); } transaction.update(ref, payload); }); } async handleSingleCrawl(shortDigest, url, token, cachePath) { const error = { reason: '' }; const response = await fetch('https://r.jina.ai', { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}`, 'Accept': 'application/json', }, body: JSON.stringify({ url }) }); if (!response.ok) { error.reason = `Failed to crawl ${url}, ${response.statusText}`; } else { const json = await response.json(); await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), { metadata: { contentType: 'application/json', } }); } return { error, }; } async handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath) { const error = { reason: '' }; const response = await fetch('https://r.jina.ai', { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}`, 'Accept': 'application/json', 'X-With-Links-Summary': 'true', }, body: JSON.stringify({ url }) }); if (!response.ok) { error.reason = `Failed to crawl ${url}, ${response.statusText}`; } else { const json = await response.json(); await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), { metadata: { contentType: 'application/json', } }); const title = json.data.title; const description = json.data.description; const links = json.data.links; const relevantUrls = await this.getRelevantUrls(token, { title, description, links }); this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`); for (const url of relevantUrls) { let abortContinue = false; let abortBreak = false; await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => { const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest); const state = await transaction.get(ref); const data = state.data(); if (data.urls.includes(url)) { this.logger.debug('Recursive CONTINUE', data); abortContinue = true; return; } const urls = [ ...data.urls, url ]; if (urls.length > meta.maxPages || data.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) { this.logger.debug('Recursive BREAK', data); abortBreak = true; return; } transaction.update(ref, { urls }); }); if (abortContinue) { continue; } if (abortBreak) { break; } await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({ shortDigest, url, token, meta }, { dispatchDeadlineSeconds: 1800, uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName), }); } ; } return { error, }; } async getRelevantUrls(token, { title, description, links }) { const invalidSuffix = [ '.zip', '.docx', '.pptx', '.xlsx', ]; const validLinks = Object.entries(links) .map(([title, link]) => link) .filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix))); let query = ''; if (!description) { query += title; } else { query += `TITLE: ${title}; DESCRIPTION: ${description}`; } const data = { model: 'jina-reranker-v2-base-multilingual', query, top_n: 15, documents: validLinks, }; const response = await fetch('https://api.jina.ai/v1/rerank', { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` }, body: JSON.stringify(data) }); const json = (await response.json()); const highestRelevanceScore = json.results[0]?.relevance_score ?? 0; return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text)); } getIndex(user) { // TODO: 需要更新使用方式 // const indexObject: Record = Object.create(indexProto); // Object.assign(indexObject, { // usage1: 'https://r.jina.ai/YOUR_URL', // usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', // homepage: 'https://jina.ai/reader', // sourceCode: 'https://github.com/jina-ai/reader', // }); // if (user) { // indexObject[''] = undefined; // indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`; // indexObject.balanceLeft = user.wallet.total_balance; // } // return indexObject; } async crawlUrlsFromSitemap(url, maxPages) { const sitemapsFromRobotsTxt = await this.getSitemapsFromRobotsTxt(url); const initialSitemaps = []; if (sitemapsFromRobotsTxt === null) { initialSitemaps.push(`${url.origin}/sitemap.xml`); } else { initialSitemaps.push(...sitemapsFromRobotsTxt); } const allUrls = new Set(); const processedSitemaps = new Set(); const fetchSitemapUrls = async (sitemapUrl) => { sitemapUrl = sitemapUrl.trim(); if (processedSitemaps.has(sitemapUrl)) { return; } processedSitemaps.add(sitemapUrl); try { const response = await fetch(sitemapUrl); const sitemapContent = await response.text(); const parser = new xmldom_1.DOMParser(); const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml'); // handle normal sitemap const urlElements = xmlDoc.getElementsByTagName('url'); for (let i = 0; i < urlElements.length; i++) { const locElement = urlElements[i].getElementsByTagName('loc')[0]; if (locElement) { const loc = locElement.textContent?.trim() || ''; if (loc.startsWith(url.origin) && !loc.endsWith('.xml')) { allUrls.add(removeURLHash(loc)); } if (allUrls.size >= maxPages) { return; } } } // handle sitemap index const sitemapElements = xmlDoc.getElementsByTagName('sitemap'); for (let i = 0; i < sitemapElements.length; i++) { const locElement = sitemapElements[i].getElementsByTagName('loc')[0]; if (locElement) { await fetchSitemapUrls(locElement.textContent?.trim() || ''); if (allUrls.size >= maxPages) { return; } } } } catch (error) { this.logger.error(`Error fetching sitemap ${sitemapUrl}:`, error); } }; for (const sitemapUrl of initialSitemaps) { await fetchSitemapUrls(sitemapUrl); if (allUrls.size >= maxPages) { break; } } const urlsToProcess = Array.from(allUrls).slice(0, maxPages); return urlsToProcess; } async getSitemapsFromRobotsTxt(url) { const hostname = url.origin; const robotsUrl = `${hostname}/robots.txt`; const response = await fetch(robotsUrl); if (response.status === 404) { return null; } const robotsTxt = await response.text(); if (robotsTxt.length) { const robot = (0, robots_parser_1.default)(robotsUrl, robotsTxt); return robot.getSitemaps(); } return null; } }; exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost; __decorate([ (0, shared_1.CloudHTTPv2)({ runtime: { memory: '1GiB', timeoutSeconds: 300, concurrency: 22, }, tags: ['Crawler'], httpMethod: ['post', 'get'], returnType: [String], }), __param(0, (0, shared_1.RPCReflect)()), __param(1, (0, shared_1.Ctx)()), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_c = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _c : Object, Object, typeof (_d = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _d : Object, crawler_options_1.CrawlerOptions, adaptive_crawler_options_1.AdaptiveCrawlerOptions]), __metadata("design:returntype", Promise) ], AdaptiveCrawlerHost.prototype, "adaptiveCrawl", null); __decorate([ (0, shared_1.CloudHTTPv2)({ runtime: { memory: '1GiB', timeoutSeconds: 300, concurrency: 22, }, tags: ['Crawler'], httpMethod: ['post', 'get'], returnType: adaptive_crawl_task_1.AdaptiveCrawlTask, }), __param(0, (0, shared_1.RPCReflect)()), __param(1, (0, shared_1.Ctx)()), __param(3, (0, shared_1.Param)('taskId')), __param(4, (0, shared_1.Param)('urls')), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_e = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _e : Object, Object, typeof (_f = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _f : Object, String, Array]), __metadata("design:returntype", Promise) ], AdaptiveCrawlerHost.prototype, "adaptiveCrawlStatus", null); __decorate([ (0, shared_1.CloudTaskV2)({ name: AdaptiveCrawlerHost.__singleCrawlQueueName, runtime: { cpu: 1, memory: '1GiB', timeoutSeconds: 3600, concurrency: 2, maxInstances: 200, retryConfig: { maxAttempts: 3, minBackoffSeconds: 60, }, rateLimits: { maxConcurrentDispatches: 150, maxDispatchesPerSecond: 5, }, } }), __param(0, (0, shared_1.Param)('shortDigest')), __param(1, (0, shared_1.Param)('url')), __param(2, (0, shared_1.Param)('token')), __param(3, (0, shared_1.Param)('meta')), __metadata("design:type", Function), __metadata("design:paramtypes", [String, String, String, Object]), __metadata("design:returntype", Promise) ], AdaptiveCrawlerHost.prototype, "singleCrawlQueue", null); exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost = AdaptiveCrawlerHost_1 = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [typeof (_a = typeof shared_1.Logger !== "undefined" && shared_1.Logger) === "function" ? _a : Object, typeof (_b = typeof shared_1.FirebaseStorageBucketControl !== "undefined" && shared_1.FirebaseStorageBucketControl) === "function" ? _b : Object]) ], AdaptiveCrawlerHost); //# sourceMappingURL=adaptive-crawler.js.map