// NOTE(review): removed page-extraction artifacts ("Spaces:", "Build error" banners
// and table-cell pipe characters) that were not part of the compiled source.
;
// TypeScript emit helper: applies an array of decorators to a class, method,
// accessor, or property, walking the decorators right-to-left.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    // c: argument count distinguishes class decoration (c < 3) from member decoration;
    // r: the value being decorated (the class itself, or the property descriptor);
    // d: the current decorator in the right-to-left walk.
    var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
    // Prefer the Reflect Metadata polyfill's implementation when one is installed.
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
    else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
    // For member decorations (c > 3), install the possibly-replaced descriptor.
    return c > 3 && r && Object.defineProperty(target, key, r), r;
};
// TypeScript emit helper: records design-time type metadata through the Reflect
// Metadata API when a polyfill has installed it; otherwise a silent no-op.
var __metadata = (this && this.__metadata) || function (k, v) {
    if (typeof Reflect !== "object" || typeof Reflect.metadata !== "function") {
        return undefined;
    }
    return Reflect.metadata(k, v);
};
// TypeScript emit helper: adapts a parameter decorator so it can run in the same
// pipeline as member decorators, closing over the parameter's index.
var __param = (this && this.__param) || ((paramIndex, decorator) => (target, key) => {
    decorator(target, key, paramIndex);
});
// TypeScript emit helper: normalizes a CommonJS module for `import x from ...`
// interop — genuine ES modules pass through, anything else is wrapped as the default.
var __importDefault = (this && this.__importDefault) || ((mod) => {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
});
// Forward reference to the decorated class; assigned inside the class's static block
// so static members can be referenced before decoration completes.
var AdaptiveCrawlerHost_1;
// Slots caching design-time type lookups for the __metadata calls at the bottom of the file.
var _a, _b, _c, _d, _e, _f;
Object.defineProperty(exports, "__esModule", { value: true });
exports.AdaptiveCrawlerHost = void 0;
const civkit_1 = require("civkit");
const tsyringe_1 = require("tsyringe");
const shared_1 = require("../shared");
const jina_embeddings_auth_1 = require("../shared/dto/jina-embeddings-auth");
const robots_parser_1 = __importDefault(require("robots-parser"));
const xmldom_1 = require("@xmldom/xmldom");
const adaptive_crawler_options_1 = require("../dto/adaptive-crawler-options");
const crawler_options_1 = require("../dto/crawler-options");
const adaptive_crawl_task_1 = require("../db/adaptive-crawl-task");
const functions_1 = require("firebase-admin/functions");
const get_function_url_1 = require("../utils/get-function-url");
// Hasher used to derive the short task digest and the per-url cache file names.
const md5Hasher = new civkit_1.HashManager('md5', 'hex');
// Strip the fragment (`#...`) from a URL string.
// Best-effort: a string that cannot be parsed as a URL is returned unchanged.
const removeURLHash = (url) => {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch (_err) {
        return url; // not a parsable absolute URL — leave as-is
    }
    parsed.hash = '';
    return parsed.toString();
};
/**
 * RPC host coordinating "adaptive crawl" tasks: an HTTP endpoint starts a task
 * (sitemap-driven fan-out, or recursive link-following bounded by maxPages),
 * a Cloud Tasks queue handler crawls individual urls, and Firestore holds the
 * shared task state. Crawl results are cached in a GCS bucket.
 */
let AdaptiveCrawlerHost = class AdaptiveCrawlerHost extends civkit_1.RPCHost {
    static { AdaptiveCrawlerHost_1 = this; }
    // Cloud Tasks queue shared by the enqueue calls below and the @CloudTaskV2 handler.
    static { this.__singleCrawlQueueName = 'singleCrawlQueue'; }
    constructor(globalLogger, firebaseObjectStorage) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.firebaseObjectStorage = firebaseObjectStorage;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
        // Actual cache storage (gcp buckets) exists for 7 days, so here we need to select a time < 7 days.
        this.cacheExpiry = 3 * 1000 * 60 * 60 * 24;
    }
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * HTTP entry point: start (or reuse) an adaptive crawl for a target URL.
     * The task id is a digest of the crawl parameters, so identical requests
     * within the cache window are deduplicated onto the same task.
     * Returns `{ taskId }`, or the index payload when no target url was given.
     */
    async adaptiveCrawl(rpcReflect, ctx, auth, crawlerOptions, adaptiveCrawlerOptions) {
        this.logger.debug({
            adaptiveCrawlerOptions,
            crawlerOptions,
        });
        const uid = await auth.solveUID();
        const { useSitemap, maxPages } = adaptiveCrawlerOptions;
        // Target may arrive as the request path (`/https://example.com`) or in the body.
        let tmpUrl = ctx.req.url.slice(1)?.trim();
        if (!tmpUrl) {
            tmpUrl = crawlerOptions.url?.trim() ?? '';
        }
        // FIX: `new URL()` throws on invalid/empty input and never returns a falsy
        // value, so the original `if (!targetUrl)` index branch was unreachable and
        // a bare GET crashed with an uncaught TypeError. Parse defensively so the
        // index-page fallback actually runs.
        let targetUrl;
        try {
            targetUrl = new URL(tmpUrl);
        }
        catch (e) {
            targetUrl = undefined;
        }
        if (!targetUrl) {
            const latestUser = uid ? await auth.assertUser() : undefined;
            if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
                return this.getIndex(latestUser);
            }
            return (0, civkit_1.assignTransferProtocolMeta)(`${this.getIndex(latestUser)}`, { contentType: 'text/plain', envelope: null });
        }
        const meta = {
            targetUrl: targetUrl.toString(),
            useSitemap,
            maxPages,
        };
        // Task id: md5 of the parameters, shortened to base64url.
        const digest = md5Hasher.hash(JSON.stringify(meta));
        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
        const existing = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest);
        if (existing?.createdAt) {
            if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
                this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
                return { taskId: shortDigest };
            }
            else {
                this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
            }
        }
        await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
            _id: shortDigest,
            status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PENDING,
            statusText: 'Pending',
            meta,
            createdAt: new Date(),
            urls: [],
            processed: {},
            failed: {},
        });
        let urls = [];
        if (useSitemap) {
            urls = await this.crawlUrlsFromSitemap(targetUrl, maxPages);
        }
        if (urls.length > 0) {
            // Sitemap mode: the full url list is known up front — fan out one task per url.
            await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
                status: adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING,
                statusText: `Processing 0/${urls.length}`,
                urls,
            });
            const promises = [];
            for (const url of urls) {
                promises.push((0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
                    shortDigest, url, token: auth.bearerToken, meta
                }, {
                    dispatchDeadlineSeconds: 1800,
                    uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
                }));
            }
            await Promise.all(promises);
        }
        else {
            // No sitemap (or none usable): crawl recursively starting from the target.
            meta.useSitemap = false;
            await adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
                urls: [targetUrl.toString()],
            });
            await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
                shortDigest, url: targetUrl.toString(), token: auth.bearerToken, meta
            }, {
                dispatchDeadlineSeconds: 1800,
                uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
            });
        }
        return { taskId: shortDigest };
    }
    /**
     * HTTP entry point: report a task's status. When `urls` are given, the
     * cached crawl results for those urls are inlined into `processed`
     * (replacing the stored cache paths).
     */
    async adaptiveCrawlStatus(rpcReflect, ctx, auth, taskId, urls = []) {
        if (!taskId) {
            throw new civkit_1.ParamValidationError('taskId is required');
        }
        const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(taskId);
        if (!state) {
            throw new civkit_1.AssertionFailureError('The task does not exist');
        }
        if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) {
            throw new civkit_1.AssertionFailureError('The task has expired');
        }
        if (urls.length) {
            const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
                if (urls.includes(url)) {
                    const raw = await this.firebaseObjectStorage.downloadFile(cachePath);
                    state.processed[url] = JSON.parse(raw.toString('utf-8'));
                }
            });
            await Promise.all(promises);
        }
        return state;
    }
    /**
     * Cloud Task handler: crawl one url, persist the result to the cache bucket,
     * and update the task's progress inside a Firestore transaction.
     */
    async singleCrawlQueue(shortDigest, url, token, meta) {
        const error = {
            reason: ''
        };
        const state = await adaptive_crawl_task_1.AdaptiveCrawlTask.fromFirestore(shortDigest);
        if (state?.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
            return; // task already finished (e.g. maxPages reached) — drop this work item
        }
        // FIX: removeURLHash() swallows parse errors internally and never throws, so
        // the original try/catch around it could never set the failure reason.
        // Validate the url explicitly so unparsable urls are recorded as failed.
        url = removeURLHash(url);
        try {
            new URL(url);
        }
        catch (e) {
            error.reason = `Failed to parse url: ${url}`;
        }
        this.logger.debug(shortDigest, url, meta);
        const cachePath = `adaptive-crawl-task/${shortDigest}/${md5Hasher.hash(url)}`;
        if (!error.reason) {
            const result = meta.useSitemap
                ? await this.handleSingleCrawl(shortDigest, url, token, cachePath)
                : await this.handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath);
            if (!result) {
                return;
            }
            error.reason = result.error.reason;
        }
        await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
            const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
            const state = await transaction.get(ref);
            const data = state.data();
            if (error.reason) {
                data.failed[url] = error;
            }
            else {
                data.processed[url] = cachePath;
            }
            // Hoisted: the original recomputed these key counts four times.
            const doneCount = Object.keys(data.processed).length + Object.keys(data.failed).length;
            const finished = doneCount >= data.urls.length;
            const status = finished
                ? adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED : adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.PROCESSING;
            const statusText = finished
                ? `Completed ${Object.keys(data.processed).length} Succeeded, ${Object.keys(data.failed).length} Failed`
                : `Processing ${doneCount}/${data.urls.length}`;
            const payload = {
                status,
                statusText,
                processed: data.processed,
                failed: data.failed,
            };
            if (status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
                payload.finishedAt = new Date();
                payload.duration = new Date().getTime() - data.createdAt.toDate().getTime();
            }
            transaction.update(ref, payload);
        });
    }
    // Crawl a single url via the reader service and cache the JSON result.
    // Returns `{ error }`; `error.reason` is non-empty on failure.
    async handleSingleCrawl(shortDigest, url, token, cachePath) {
        const error = {
            reason: ''
        };
        const response = await fetch('https://r.jina.ai', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${token}`,
                'Accept': 'application/json',
            },
            body: JSON.stringify({ url })
        });
        if (!response.ok) {
            error.reason = `Failed to crawl ${url}, ${response.statusText}`;
        }
        else {
            const json = await response.json();
            await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), {
                metadata: {
                    contentType: 'application/json',
                }
            });
        }
        return {
            error,
        };
    }
    // Crawl a single url, then rerank its outgoing links and enqueue the most
    // relevant ones (bounded by meta.maxPages, deduplicated via the task doc).
    async handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath) {
        const error = {
            reason: ''
        };
        const response = await fetch('https://r.jina.ai', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${token}`,
                'Accept': 'application/json',
                'X-With-Links-Summary': 'true',
            },
            body: JSON.stringify({ url })
        });
        if (!response.ok) {
            error.reason = `Failed to crawl ${url}, ${response.statusText}`;
        }
        else {
            const json = await response.json();
            await this.firebaseObjectStorage.saveFile(cachePath, Buffer.from(JSON.stringify(json), 'utf-8'), {
                metadata: {
                    contentType: 'application/json',
                }
            });
            const title = json.data.title;
            const description = json.data.description;
            const links = json.data.links;
            const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
            this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
            for (const url of relevantUrls) {
                let abortContinue = false;
                let abortBreak = false;
                // The url list lives on the task doc; claim each new url transactionally
                // so concurrent tasks neither duplicate work nor exceed maxPages.
                await adaptive_crawl_task_1.AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
                    const ref = adaptive_crawl_task_1.AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
                    const state = await transaction.get(ref);
                    const data = state.data();
                    if (data.urls.includes(url)) {
                        this.logger.debug('Recursive CONTINUE', data);
                        abortContinue = true;
                        return;
                    }
                    const urls = [
                        ...data.urls,
                        url
                    ];
                    if (urls.length > meta.maxPages || data.status === adaptive_crawl_task_1.AdaptiveCrawlTaskStatus.COMPLETED) {
                        this.logger.debug('Recursive BREAK', data);
                        abortBreak = true;
                        return;
                    }
                    transaction.update(ref, { urls });
                });
                if (abortContinue) {
                    continue;
                }
                if (abortBreak) {
                    break;
                }
                await (0, functions_1.getFunctions)().taskQueue(AdaptiveCrawlerHost_1.__singleCrawlQueueName).enqueue({
                    shortDigest, url, token, meta
                }, {
                    dispatchDeadlineSeconds: 1800,
                    uri: await (0, get_function_url_1.getFunctionUrl)(AdaptiveCrawlerHost_1.__singleCrawlQueueName),
                });
            }
        }
        return {
            error,
        };
    }
    // Rerank a page's outgoing links against its title/description and return
    // the most relevant ones (top 15, score above max(0.6 * best, 0.1)).
    async getRelevantUrls(token, { title, description, links }) {
        // Skip document/archive downloads the reader cannot usefully crawl.
        const invalidSuffix = [
            '.zip',
            '.docx',
            '.pptx',
            '.xlsx',
        ];
        const validLinks = Object.values(links)
            .filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
        let query = '';
        if (!description) {
            query += title;
        }
        else {
            query += `TITLE: ${title}; DESCRIPTION: ${description}`;
        }
        const data = {
            model: 'jina-reranker-v2-base-multilingual',
            query,
            top_n: 15,
            documents: validLinks,
        };
        const response = await fetch('https://api.jina.ai/v1/rerank', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${token}`
            },
            body: JSON.stringify(data)
        });
        const json = (await response.json());
        // FIX: guard against an error payload (no `results` array) from the rerank
        // API, which previously crashed the task with a TypeError.
        const results = Array.isArray(json.results) ? json.results : [];
        const highestRelevanceScore = results[0]?.relevance_score ?? 0;
        return results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
    }
    // Placeholder for the landing/index payload served when no target url is given.
    // Currently returns undefined; the intended implementation is sketched below.
    getIndex(user) {
        // TODO: the way this is used needs updating (translated from the original Chinese note).
        // const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
        // Object.assign(indexObject, {
        //     usage1: 'https://r.jina.ai/YOUR_URL',
        //     usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
        //     homepage: 'https://jina.ai/reader',
        //     sourceCode: 'https://github.com/jina-ai/reader',
        // });
        // if (user) {
        //     indexObject[''] = undefined;
        //     indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
        //     indexObject.balanceLeft = user.wallet.total_balance;
        // }
        // return indexObject;
    }
    /**
     * Collect up to `maxPages` page urls for `url`'s origin by walking its
     * sitemaps: those declared in robots.txt, falling back to /sitemap.xml,
     * recursing into sitemap indexes. Fragments are stripped; entries outside
     * the origin or ending in .xml are skipped.
     */
    async crawlUrlsFromSitemap(url, maxPages) {
        const sitemapsFromRobotsTxt = await this.getSitemapsFromRobotsTxt(url);
        const initialSitemaps = [];
        if (sitemapsFromRobotsTxt === null) {
            initialSitemaps.push(`${url.origin}/sitemap.xml`);
        }
        else {
            initialSitemaps.push(...sitemapsFromRobotsTxt);
        }
        const allUrls = new Set();
        const processedSitemaps = new Set();
        const fetchSitemapUrls = async (sitemapUrl) => {
            sitemapUrl = sitemapUrl.trim();
            if (processedSitemaps.has(sitemapUrl)) {
                return; // already visited — sitemap indexes can reference each other
            }
            processedSitemaps.add(sitemapUrl);
            try {
                const response = await fetch(sitemapUrl);
                const sitemapContent = await response.text();
                const parser = new xmldom_1.DOMParser();
                const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml');
                // handle normal sitemap (<url><loc>...</loc></url> entries)
                const urlElements = xmlDoc.getElementsByTagName('url');
                for (let i = 0; i < urlElements.length; i++) {
                    const locElement = urlElements[i].getElementsByTagName('loc')[0];
                    if (locElement) {
                        const loc = locElement.textContent?.trim() || '';
                        if (loc.startsWith(url.origin) && !loc.endsWith('.xml')) {
                            allUrls.add(removeURLHash(loc));
                        }
                        if (allUrls.size >= maxPages) {
                            return;
                        }
                    }
                }
                // handle sitemap index (recurse into child sitemaps)
                const sitemapElements = xmlDoc.getElementsByTagName('sitemap');
                for (let i = 0; i < sitemapElements.length; i++) {
                    const locElement = sitemapElements[i].getElementsByTagName('loc')[0];
                    if (locElement) {
                        await fetchSitemapUrls(locElement.textContent?.trim() || '');
                        if (allUrls.size >= maxPages) {
                            return;
                        }
                    }
                }
            }
            catch (error) {
                // Best-effort: a broken sitemap should not fail the whole crawl.
                this.logger.error(`Error fetching sitemap ${sitemapUrl}:`, error);
            }
        };
        for (const sitemapUrl of initialSitemaps) {
            await fetchSitemapUrls(sitemapUrl);
            if (allUrls.size >= maxPages) {
                break;
            }
        }
        const urlsToProcess = Array.from(allUrls).slice(0, maxPages);
        return urlsToProcess;
    }
    /**
     * Read sitemap declarations from `<origin>/robots.txt`.
     * Returns null when robots.txt is absent (404) or empty, signalling the
     * caller to fall back to the conventional /sitemap.xml location.
     */
    async getSitemapsFromRobotsTxt(url) {
        const hostname = url.origin;
        const robotsUrl = `${hostname}/robots.txt`;
        const response = await fetch(robotsUrl);
        if (response.status === 404) {
            return null;
        }
        const robotsTxt = await response.text();
        if (robotsTxt.length) {
            const robot = (0, robots_parser_1.default)(robotsUrl, robotsTxt);
            return robot.getSitemaps();
        }
        return null;
    }
};
exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost;
// --- compiler-emitted decorator wiring (TypeScript experimental decorators) ---
// Expose `adaptiveCrawl` as an HTTP endpoint (GET/POST) returning plain text.
__decorate([
    (0, shared_1.CloudHTTPv2)({
        runtime: {
            memory: '1GiB',
            timeoutSeconds: 300,
            concurrency: 22,
        },
        tags: ['Crawler'],
        httpMethod: ['post', 'get'],
        returnType: [String],
    }),
    __param(0, (0, shared_1.RPCReflect)()),
    __param(1, (0, shared_1.Ctx)()),
    __metadata("design:type", Function),
    __metadata("design:paramtypes", [typeof (_c = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _c : Object, Object, typeof (_d = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _d : Object, crawler_options_1.CrawlerOptions,
        adaptive_crawler_options_1.AdaptiveCrawlerOptions]),
    __metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "adaptiveCrawl", null);
// Expose `adaptiveCrawlStatus` as an HTTP endpoint (GET/POST) returning a task document.
__decorate([
    (0, shared_1.CloudHTTPv2)({
        runtime: {
            memory: '1GiB',
            timeoutSeconds: 300,
            concurrency: 22,
        },
        tags: ['Crawler'],
        httpMethod: ['post', 'get'],
        returnType: adaptive_crawl_task_1.AdaptiveCrawlTask,
    }),
    __param(0, (0, shared_1.RPCReflect)()),
    __param(1, (0, shared_1.Ctx)()),
    __param(3, (0, shared_1.Param)('taskId')),
    __param(4, (0, shared_1.Param)('urls')),
    __metadata("design:type", Function),
    __metadata("design:paramtypes", [typeof (_e = typeof civkit_1.RPCReflection !== "undefined" && civkit_1.RPCReflection) === "function" ? _e : Object, Object, typeof (_f = typeof jina_embeddings_auth_1.JinaEmbeddingsAuthDTO !== "undefined" && jina_embeddings_auth_1.JinaEmbeddingsAuthDTO) === "function" ? _f : Object, String, Array]),
    __metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "adaptiveCrawlStatus", null);
// Register `singleCrawlQueue` as the Cloud Tasks queue handler (3 attempts, rate-limited).
__decorate([
    (0, shared_1.CloudTaskV2)({
        name: AdaptiveCrawlerHost.__singleCrawlQueueName,
        runtime: {
            cpu: 1,
            memory: '1GiB',
            timeoutSeconds: 3600,
            concurrency: 2,
            maxInstances: 200,
            retryConfig: {
                maxAttempts: 3,
                minBackoffSeconds: 60,
            },
            rateLimits: {
                maxConcurrentDispatches: 150,
                maxDispatchesPerSecond: 5,
            },
        }
    }),
    __param(0, (0, shared_1.Param)('shortDigest')),
    __param(1, (0, shared_1.Param)('url')),
    __param(2, (0, shared_1.Param)('token')),
    __param(3, (0, shared_1.Param)('meta')),
    __metadata("design:type", Function),
    __metadata("design:paramtypes", [String, String, String, Object]),
    __metadata("design:returntype", Promise)
], AdaptiveCrawlerHost.prototype, "singleCrawlQueue", null);
// Class-level decoration: register as a tsyringe singleton with constructor metadata.
exports.AdaptiveCrawlerHost = AdaptiveCrawlerHost = AdaptiveCrawlerHost_1 = __decorate([
    (0, tsyringe_1.singleton)(),
    __metadata("design:paramtypes", [typeof (_a = typeof shared_1.Logger !== "undefined" && shared_1.Logger) === "function" ? _a : Object, typeof (_b = typeof shared_1.FirebaseStorageBucketControl !== "undefined" && shared_1.FirebaseStorageBucketControl) === "function" ? _b : Object])
], AdaptiveCrawlerHost);
//# sourceMappingURL=adaptive-crawler.js.map