Spaces:
Build error
Build error
fix: improve search responsiveness
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -28,6 +28,18 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
| 28 |
targetSelector?: string;
|
| 29 |
}
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
@singleton()
|
| 32 |
export class CrawlerHost extends RPCHost {
|
| 33 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
|
@@ -123,7 +135,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 123 |
|
| 124 |
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
| 125 |
screenshotUrl?: string;
|
| 126 |
-
}, nominalUrl?: URL)
|
| 127 |
if (mode === 'screenshot') {
|
| 128 |
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
| 129 |
const fid = `instant-screenshots/${randomUUID()}`;
|
|
@@ -140,7 +152,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 140 |
toString() {
|
| 141 |
return this.screenshotUrl;
|
| 142 |
}
|
| 143 |
-
};
|
| 144 |
}
|
| 145 |
if (mode === 'html') {
|
| 146 |
return {
|
|
@@ -148,7 +160,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 148 |
toString() {
|
| 149 |
return this.html;
|
| 150 |
}
|
| 151 |
-
};
|
| 152 |
}
|
| 153 |
if (mode === 'text') {
|
| 154 |
return {
|
|
@@ -156,7 +168,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 156 |
toString() {
|
| 157 |
return this.text;
|
| 158 |
}
|
| 159 |
-
};
|
| 160 |
}
|
| 161 |
|
| 162 |
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
|
@@ -272,7 +284,7 @@ ${this.content}
|
|
| 272 |
}
|
| 273 |
};
|
| 274 |
|
| 275 |
-
return formatted;
|
| 276 |
}
|
| 277 |
|
| 278 |
@CloudHTTPv2({
|
|
|
|
| 28 |
targetSelector?: string;
|
| 29 |
}
|
| 30 |
|
| 31 |
+
/**
 * Result shape that `CrawlerHost.formatSnapshot` casts its return values to.
 *
 * Only the fields relevant to the requested mode are populated, so every
 * data field is optional; `toString` is the one member every variant carries.
 */
export interface FormattedPage {
    /** Address of the page; rendered as "URL Source" in search output. */
    url?: string;
    /** Page title; rendered as "Title" in search output. */
    title?: string;
    /** Rendered as "Published Time" in search output when present. */
    publishedTime?: string;

    /** Body rendered under "Markdown Content" in search output. */
    content?: string;
    /** Payload returned by the `html` mode's `toString`. */
    html?: string;
    /** Payload returned by the `text` mode's `toString`. */
    text?: string;
    /** Payload returned by the `screenshot` mode's `toString`. */
    screenshotUrl?: string;

    /** Plain-text rendering of the page for the mode that produced it. */
    toString: () => string;
}
|
| 42 |
+
|
| 43 |
@singleton()
|
| 44 |
export class CrawlerHost extends RPCHost {
|
| 45 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
|
|
|
| 135 |
|
| 136 |
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
| 137 |
screenshotUrl?: string;
|
| 138 |
+
}, nominalUrl?: URL){
|
| 139 |
if (mode === 'screenshot') {
|
| 140 |
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
| 141 |
const fid = `instant-screenshots/${randomUUID()}`;
|
|
|
|
| 152 |
toString() {
|
| 153 |
return this.screenshotUrl;
|
| 154 |
}
|
| 155 |
+
} as FormattedPage;
|
| 156 |
}
|
| 157 |
if (mode === 'html') {
|
| 158 |
return {
|
|
|
|
| 160 |
toString() {
|
| 161 |
return this.html;
|
| 162 |
}
|
| 163 |
+
} as FormattedPage;
|
| 164 |
}
|
| 165 |
if (mode === 'text') {
|
| 166 |
return {
|
|
|
|
| 168 |
toString() {
|
| 169 |
return this.text;
|
| 170 |
}
|
| 171 |
+
} as FormattedPage;
|
| 172 |
}
|
| 173 |
|
| 174 |
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
|
|
|
| 284 |
}
|
| 285 |
};
|
| 286 |
|
| 287 |
+
return formatted as FormattedPage;
|
| 288 |
}
|
| 289 |
|
| 290 |
@CloudHTTPv2({
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -12,7 +12,7 @@ import { ScrappingOptions } from '../services/puppeteer';
|
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 14 |
import { BraveSearchService } from '../services/brave-search';
|
| 15 |
-
import { CrawlerHost } from './crawler';
|
| 16 |
import { CookieParam } from 'puppeteer';
|
| 17 |
|
| 18 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
@@ -31,6 +31,8 @@ export class SearcherHost extends RPCHost {
|
|
| 31 |
|
| 32 |
reasonableDelayMs = 10_000;
|
| 33 |
|
|
|
|
|
|
|
| 34 |
constructor(
|
| 35 |
protected globalLogger: Logger,
|
| 36 |
protected rateLimitControl: RateLimitControl,
|
|
@@ -63,7 +65,7 @@ export class SearcherHost extends RPCHost {
|
|
| 63 |
runtime: {
|
| 64 |
memory: '8GiB',
|
| 65 |
timeoutSeconds: 300,
|
| 66 |
-
concurrency:
|
| 67 |
maxInstances: 200,
|
| 68 |
},
|
| 69 |
openapi: {
|
|
@@ -154,7 +156,7 @@ export class SearcherHost extends RPCHost {
|
|
| 154 |
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
| 155 |
}
|
| 156 |
|
| 157 |
-
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
| 158 |
[
|
| 159 |
// 40 requests per minute
|
| 160 |
new Date(Date.now() - 60 * 1000), 40
|
|
@@ -163,19 +165,29 @@ export class SearcherHost extends RPCHost {
|
|
| 163 |
|
| 164 |
rpcReflect.finally(() => {
|
| 165 |
if (chargeAmount) {
|
| 166 |
-
auth.reportUsage(chargeAmount, 'reader-
|
| 167 |
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
| 168 |
});
|
|
|
|
|
|
|
|
|
|
| 169 |
}
|
| 170 |
});
|
| 171 |
} else if (ctx.req.ip) {
|
| 172 |
this.threadLocal.set('ip', ctx.req.ip);
|
| 173 |
-
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
| 174 |
[
|
| 175 |
// 5 requests per minute
|
| 176 |
new Date(Date.now() - 60 * 1000), 5
|
| 177 |
]
|
| 178 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
}
|
| 180 |
|
| 181 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
|
@@ -211,7 +223,7 @@ export class SearcherHost extends RPCHost {
|
|
| 211 |
const searchQuery = noSlashPath;
|
| 212 |
const r = await this.cachedWebSearch({
|
| 213 |
q: searchQuery,
|
| 214 |
-
count:
|
| 215 |
}, noCache);
|
| 216 |
|
| 217 |
const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
|
|
@@ -262,7 +274,7 @@ export class SearcherHost extends RPCHost {
|
|
| 262 |
for await (const scrapped of it) {
|
| 263 |
lastScrapped = scrapped;
|
| 264 |
|
| 265 |
-
if (!this.
|
| 266 |
continue;
|
| 267 |
}
|
| 268 |
clearTimeout(earlyReturnTimer);
|
|
@@ -296,7 +308,7 @@ export class SearcherHost extends RPCHost {
|
|
| 296 |
for await (const scrapped of it) {
|
| 297 |
lastScrapped = scrapped;
|
| 298 |
|
| 299 |
-
if (!this.
|
| 300 |
continue;
|
| 301 |
}
|
| 302 |
|
|
@@ -331,50 +343,68 @@ export class SearcherHost extends RPCHost {
|
|
| 331 |
const mapped = scrapped.map((x, i) => {
|
| 332 |
const upstreamSearchResult = searchResults[i];
|
| 333 |
if (!x || (!x.parsed && mode !== 'markdown')) {
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
[${i + 1}] URL Source: ${this.url}
|
| 339 |
-
[${i + 1}] Description: ${this.description}
|
| 340 |
-
`;
|
| 341 |
-
}
|
| 342 |
-
return `[${i + 1}] No content available for ${this.url}`;
|
| 343 |
-
}
|
| 344 |
};
|
| 345 |
-
const r = Object.create(p);
|
| 346 |
-
r.url = upstreamSearchResult.url;
|
| 347 |
-
r.title = upstreamSearchResult.title;
|
| 348 |
-
r.description = upstreamSearchResult.description;
|
| 349 |
-
|
| 350 |
-
return r;
|
| 351 |
}
|
| 352 |
return this.crawler.formatSnapshot(mode, x, urls[i]);
|
| 353 |
});
|
| 354 |
|
| 355 |
-
const resultArray = await Promise.all(mapped);
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
}
|
| 363 |
|
| 364 |
-
return `[${i + 1}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
| 366 |
[${i + 1}] Markdown Content:
|
| 367 |
${this.content}
|
| 368 |
`;
|
| 369 |
-
};
|
| 370 |
}
|
| 371 |
-
}
|
| 372 |
-
resultArray.toString = function () {
|
| 373 |
-
return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
|
| 374 |
};
|
|
|
|
| 375 |
|
| 376 |
-
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
| 378 |
}
|
| 379 |
|
| 380 |
getChargeAmount(formatted: any[]) {
|
|
@@ -383,17 +413,16 @@ ${this.content}
|
|
| 383 |
);
|
| 384 |
}
|
| 385 |
|
| 386 |
-
|
| 387 |
-
return
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
);
|
| 397 |
}
|
| 398 |
|
| 399 |
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
|
|
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 14 |
import { BraveSearchService } from '../services/brave-search';
|
| 15 |
+
import { CrawlerHost, FormattedPage } from './crawler';
|
| 16 |
import { CookieParam } from 'puppeteer';
|
| 17 |
|
| 18 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
|
|
| 31 |
|
| 32 |
reasonableDelayMs = 10_000;
|
| 33 |
|
| 34 |
+
targetResultCount = 5;
|
| 35 |
+
|
| 36 |
constructor(
|
| 37 |
protected globalLogger: Logger,
|
| 38 |
protected rateLimitControl: RateLimitControl,
|
|
|
|
| 65 |
runtime: {
|
| 66 |
memory: '8GiB',
|
| 67 |
timeoutSeconds: 300,
|
| 68 |
+
concurrency: 4,
|
| 69 |
maxInstances: 200,
|
| 70 |
},
|
| 71 |
openapi: {
|
|
|
|
| 156 |
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
| 157 |
}
|
| 158 |
|
| 159 |
+
const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
| 160 |
[
|
| 161 |
// 40 requests per minute
|
| 162 |
new Date(Date.now() - 60 * 1000), 40
|
|
|
|
| 165 |
|
| 166 |
rpcReflect.finally(() => {
|
| 167 |
if (chargeAmount) {
|
| 168 |
+
auth.reportUsage(chargeAmount, 'reader-search').catch((err) => {
|
| 169 |
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
| 170 |
});
|
| 171 |
+
apiRoll._ref?.set({
|
| 172 |
+
chargeAmount,
|
| 173 |
+
}, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
|
| 174 |
}
|
| 175 |
});
|
| 176 |
} else if (ctx.req.ip) {
|
| 177 |
this.threadLocal.set('ip', ctx.req.ip);
|
| 178 |
+
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
| 179 |
[
|
| 180 |
// 5 requests per minute
|
| 181 |
new Date(Date.now() - 60 * 1000), 5
|
| 182 |
]
|
| 183 |
);
|
| 184 |
+
rpcReflect.finally(() => {
|
| 185 |
+
if (chargeAmount) {
|
| 186 |
+
apiRoll._ref?.set({
|
| 187 |
+
chargeAmount,
|
| 188 |
+
}, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
|
| 189 |
+
}
|
| 190 |
+
});
|
| 191 |
}
|
| 192 |
|
| 193 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
|
|
|
| 223 |
const searchQuery = noSlashPath;
|
| 224 |
const r = await this.cachedWebSearch({
|
| 225 |
q: searchQuery,
|
| 226 |
+
count: 10
|
| 227 |
}, noCache);
|
| 228 |
|
| 229 |
const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
|
|
|
|
| 274 |
for await (const scrapped of it) {
|
| 275 |
lastScrapped = scrapped;
|
| 276 |
|
| 277 |
+
if (!this.searchResultsQualified(scrapped)) {
|
| 278 |
continue;
|
| 279 |
}
|
| 280 |
clearTimeout(earlyReturnTimer);
|
|
|
|
| 308 |
for await (const scrapped of it) {
|
| 309 |
lastScrapped = scrapped;
|
| 310 |
|
| 311 |
+
if (!this.searchResultsQualified(scrapped)) {
|
| 312 |
continue;
|
| 313 |
}
|
| 314 |
|
|
|
|
| 343 |
const mapped = scrapped.map((x, i) => {
|
| 344 |
const upstreamSearchResult = searchResults[i];
|
| 345 |
if (!x || (!x.parsed && mode !== 'markdown')) {
|
| 346 |
+
return {
|
| 347 |
+
url: upstreamSearchResult.url,
|
| 348 |
+
title: upstreamSearchResult.title,
|
| 349 |
+
description: upstreamSearchResult.description,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
return this.crawler.formatSnapshot(mode, x, urls[i]);
|
| 353 |
});
|
| 354 |
|
| 355 |
+
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
| 356 |
+
|
| 357 |
+
yield this.reOrganizeSearchResults(resultArray);
|
| 358 |
+
}
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
/**
 * Cut a batch of formatted search hits down to at most `targetResultCount`
 * entries and attach the numbered plain-text renderers used when the batch
 * (or a single entry) is stringified.
 *
 * Pages accepted by `pageQualified` are preferred. When fewer of them exist
 * than `targetResultCount`, the earliest unqualified pages fill the remaining
 * slots. Surviving pages keep their relative order from `searchResults`.
 *
 * @param searchResults - Formatted pages in their incoming order.
 * @returns A new array of shallow copies of the kept pages. Each copy has a
 *   `toString` that renders it with its 1-based position, and the array's own
 *   `toString` joins those renderings with blank lines.
 */
reOrganizeSearchResults(searchResults: FormattedPage[]) {
    const [qualifiedPages, unqualifiedPages] = _.partition(searchResults, (x) => this.pageQualified(x));
    const acceptSet = new Set(qualifiedPages);

    // Top up with unqualified pages only while there is still room.
    const openSlots = Math.max(this.targetResultCount - qualifiedPages.length, 0);
    for (const x of unqualifiedPages.slice(0, openSlots)) {
        acceptSet.add(x);
    }

    // Filtering the original list (rather than concatenating the two
    // partitions) is what preserves the incoming order.
    const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, this.targetResultCount);

    const resultArray = filtered.map((x, i) => {
        const tag = `[${i + 1}]`;

        return {
            ...x,
            toString(this: FormattedPage & { description?: string }) {
                // A `description` marks an entry built from the upstream search
                // hit instead of a formatted snapshot.
                if (this.description) {
                    if (this.title) {
                        return [
                            `${tag} Title: ${this.title}`,
                            `${tag} URL Source: ${this.url}`,
                            `${tag} Description: ${this.description}`,
                            '',
                        ].join('\n');
                    }

                    return `${tag} No content available for ${this.url}`;
                }

                const mixins: string[] = [];
                if (this.publishedTime) {
                    mixins.push(`${tag} Published Time: ${this.publishedTime}`);
                }

                return [
                    `${tag} Title: ${this.title}`,
                    `${tag} URL Source: ${this.url}`,
                    ...mixins,
                    `${tag} Markdown Content:`,
                    `${this.content}`,
                    '',
                ].join('\n');
            }
        };
    });

    // Every entry is an object literal created just above, so no
    // missing-entry placeholder is needed here.
    resultArray.toString = function () {
        return this.map((x) => x.toString()).join('\n\n').trimEnd() + '\n';
    };

    return resultArray;
}
|
| 409 |
|
| 410 |
getChargeAmount(formatted: any[]) {
|
|
|
|
| 413 |
);
|
| 414 |
}
|
| 415 |
|
| 416 |
+
/**
 * Report whether a formatted page carries a usable payload.
 *
 * A page qualifies when it has both a title and content, or when it has a
 * screenshot URL, text, or HTML.
 *
 * @param formattedPage - Page to inspect.
 * @returns The first truthy payload field when the page qualifies, otherwise
 *   a falsy value; callers are expected to use the result for its truthiness.
 */
pageQualified(formattedPage: FormattedPage) {
    // `&&` binds tighter than `||`, so the title only gates `content`.
    const titledContent = formattedPage.title && formattedPage.content;
    if (titledContent) {
        return titledContent;
    }

    return formattedPage.screenshotUrl || formattedPage.text || formattedPage.html;
}
|
| 423 |
+
|
| 424 |
+
/**
 * Report whether a batch of results is complete enough to be used.
 *
 * @param results - Formatted pages gathered so far.
 * @returns `true` only when every page passes `pageQualified` and the batch
 *   holds at least `targetResultCount` pages.
 */
searchResultsQualified(results: FormattedPage[]) {
    const everyPageUsable = _.every(results, (page) => this.pageQualified(page));
    if (!everyPageUsable) {
        return false;
    }

    return results.length >= this.targetResultCount;
}
|
| 427 |
|
| 428 |
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -99,7 +99,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 99 |
return page.browser().connected && !page.isClosed();
|
| 100 |
}
|
| 101 |
}, {
|
| 102 |
-
max: Math.max(1 + Math.floor(os.totalmem() / (
|
| 103 |
min: 1,
|
| 104 |
acquireTimeoutMillis: 60_000,
|
| 105 |
testOnBorrow: true,
|
|
|
|
| 99 |
return page.browser().connected && !page.isClosed();
|
| 100 |
}
|
| 101 |
}, {
|
| 102 |
+
max: Math.max(1 + Math.floor(os.totalmem() / (256 * 1024 * 1024)), 16),
|
| 103 |
min: 1,
|
| 104 |
acquireTimeoutMillis: 60_000,
|
| 105 |
testOnBorrow: true,
|