nomagick committed on
Commit
72e1c46
·
unverified ·
1 Parent(s): e100b25

fix: improve search responsiveness

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -28,6 +28,18 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
28
  targetSelector?: string;
29
  }
30
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  @singleton()
32
  export class CrawlerHost extends RPCHost {
33
  logger = this.globalLogger.child({ service: this.constructor.name });
@@ -123,7 +135,7 @@ export class CrawlerHost extends RPCHost {
123
 
124
  async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
125
  screenshotUrl?: string;
126
- }, nominalUrl?: URL) {
127
  if (mode === 'screenshot') {
128
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
129
  const fid = `instant-screenshots/${randomUUID()}`;
@@ -140,7 +152,7 @@ export class CrawlerHost extends RPCHost {
140
  toString() {
141
  return this.screenshotUrl;
142
  }
143
- };
144
  }
145
  if (mode === 'html') {
146
  return {
@@ -148,7 +160,7 @@ export class CrawlerHost extends RPCHost {
148
  toString() {
149
  return this.html;
150
  }
151
- };
152
  }
153
  if (mode === 'text') {
154
  return {
@@ -156,7 +168,7 @@ export class CrawlerHost extends RPCHost {
156
  toString() {
157
  return this.text;
158
  }
159
- };
160
  }
161
 
162
  const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
@@ -272,7 +284,7 @@ ${this.content}
272
  }
273
  };
274
 
275
- return formatted;
276
  }
277
 
278
  @CloudHTTPv2({
 
28
  targetSelector?: string;
29
  }
30
 
31
/**
 * A page after it has been formatted for a response.
 *
 * Every field is optional because which ones are populated depends on how the
 * entry was produced: the screenshot/html/text branches of `formatSnapshot`
 * return objects whose `toString()` yields `screenshotUrl`, `html` or `text`
 * respectively, while the searcher builds placeholder entries that carry only
 * the upstream `url`, `title` and `description`.
 */
export interface FormattedPage {
    title?: string;
    url?: string;
    /**
     * Summary taken from the upstream search result. The searcher sets it on
     * placeholder entries and checks it when rendering an entry as text.
     */
    description?: string;
    content?: string;
    publishedTime?: string;
    html?: string;
    text?: string;
    screenshotUrl?: string;

    /** Plain-text rendering of the page. */
    toString: () => string;
}
42
+
43
  @singleton()
44
  export class CrawlerHost extends RPCHost {
45
  logger = this.globalLogger.child({ service: this.constructor.name });
 
135
 
136
  async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
137
  screenshotUrl?: string;
138
+ }, nominalUrl?: URL){
139
  if (mode === 'screenshot') {
140
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
141
  const fid = `instant-screenshots/${randomUUID()}`;
 
152
  toString() {
153
  return this.screenshotUrl;
154
  }
155
+ } as FormattedPage;
156
  }
157
  if (mode === 'html') {
158
  return {
 
160
  toString() {
161
  return this.html;
162
  }
163
+ } as FormattedPage;
164
  }
165
  if (mode === 'text') {
166
  return {
 
168
  toString() {
169
  return this.text;
170
  }
171
+ } as FormattedPage;
172
  }
173
 
174
  const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
 
284
  }
285
  };
286
 
287
+ return formatted as FormattedPage;
288
  }
289
 
290
  @CloudHTTPv2({
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -12,7 +12,7 @@ import { ScrappingOptions } from '../services/puppeteer';
12
  import { Request, Response } from 'express';
13
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
14
  import { BraveSearchService } from '../services/brave-search';
15
- import { CrawlerHost } from './crawler';
16
  import { CookieParam } from 'puppeteer';
17
 
18
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
@@ -31,6 +31,8 @@ export class SearcherHost extends RPCHost {
31
 
32
  reasonableDelayMs = 10_000;
33
 
 
 
34
  constructor(
35
  protected globalLogger: Logger,
36
  protected rateLimitControl: RateLimitControl,
@@ -63,7 +65,7 @@ export class SearcherHost extends RPCHost {
63
  runtime: {
64
  memory: '8GiB',
65
  timeoutSeconds: 300,
66
- concurrency: 8,
67
  maxInstances: 200,
68
  },
69
  openapi: {
@@ -154,7 +156,7 @@ export class SearcherHost extends RPCHost {
154
  throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
155
  }
156
 
157
- await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
158
  [
159
  // 40 requests per minute
160
  new Date(Date.now() - 60 * 1000), 40
@@ -163,19 +165,29 @@ export class SearcherHost extends RPCHost {
163
 
164
  rpcReflect.finally(() => {
165
  if (chargeAmount) {
166
- auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
167
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
168
  });
 
 
 
169
  }
170
  });
171
  } else if (ctx.req.ip) {
172
  this.threadLocal.set('ip', ctx.req.ip);
173
- await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
174
  [
175
  // 5 requests per minute
176
  new Date(Date.now() - 60 * 1000), 5
177
  ]
178
  );
 
 
 
 
 
 
 
179
  }
180
 
181
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
@@ -211,7 +223,7 @@ export class SearcherHost extends RPCHost {
211
  const searchQuery = noSlashPath;
212
  const r = await this.cachedWebSearch({
213
  q: searchQuery,
214
- count: 5
215
  }, noCache);
216
 
217
  const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
@@ -262,7 +274,7 @@ export class SearcherHost extends RPCHost {
262
  for await (const scrapped of it) {
263
  lastScrapped = scrapped;
264
 
265
- if (!this.qualified(scrapped)) {
266
  continue;
267
  }
268
  clearTimeout(earlyReturnTimer);
@@ -296,7 +308,7 @@ export class SearcherHost extends RPCHost {
296
  for await (const scrapped of it) {
297
  lastScrapped = scrapped;
298
 
299
- if (!this.qualified(scrapped)) {
300
  continue;
301
  }
302
 
@@ -331,50 +343,68 @@ export class SearcherHost extends RPCHost {
331
  const mapped = scrapped.map((x, i) => {
332
  const upstreamSearchResult = searchResults[i];
333
  if (!x || (!x.parsed && mode !== 'markdown')) {
334
- const p = {
335
- toString(this: any) {
336
- if (this.title && this.description) {
337
- return `[${i + 1}] Title: ${this.title}
338
- [${i + 1}] URL Source: ${this.url}
339
- [${i + 1}] Description: ${this.description}
340
- `;
341
- }
342
- return `[${i + 1}] No content available for ${this.url}`;
343
- }
344
  };
345
- const r = Object.create(p);
346
- r.url = upstreamSearchResult.url;
347
- r.title = upstreamSearchResult.title;
348
- r.description = upstreamSearchResult.description;
349
-
350
- return r;
351
  }
352
  return this.crawler.formatSnapshot(mode, x, urls[i]);
353
  });
354
 
355
- const resultArray = await Promise.all(mapped);
356
- for (const [i, result] of resultArray.entries()) {
357
- if (result && typeof result === 'object' && Object.hasOwn(result, 'toString')) {
358
- result.toString = function (this: any) {
359
- const mixins = [];
360
- if (this.publishedTime) {
361
- mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  }
363
 
364
- return `[${i + 1}] Title: ${this.title}
 
 
 
 
 
 
 
 
365
  [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
366
  [${i + 1}] Markdown Content:
367
  ${this.content}
368
  `;
369
- };
370
  }
371
- }
372
- resultArray.toString = function () {
373
- return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
374
  };
 
375
 
376
- yield resultArray;
377
- }
 
 
 
378
  }
379
 
380
  getChargeAmount(formatted: any[]) {
@@ -383,17 +413,16 @@ ${this.content}
383
  );
384
  }
385
 
386
- qualified(scrapped: any[]) {
387
- return _.every(scrapped, (x) =>
388
- (x as any)?.title &&
389
- (
390
- (x as any).content ||
391
- (x as any).screenShotUrl ||
392
- (x as any).screenshot ||
393
- (x as any).text ||
394
- (x as any).html
395
- )
396
- );
397
  }
398
 
399
  async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
 
12
  import { Request, Response } from 'express';
13
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
14
  import { BraveSearchService } from '../services/brave-search';
15
+ import { CrawlerHost, FormattedPage } from './crawler';
16
  import { CookieParam } from 'puppeteer';
17
 
18
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
 
31
 
32
  reasonableDelayMs = 10_000;
33
 
34
+ targetResultCount = 5;
35
+
36
  constructor(
37
  protected globalLogger: Logger,
38
  protected rateLimitControl: RateLimitControl,
 
65
  runtime: {
66
  memory: '8GiB',
67
  timeoutSeconds: 300,
68
+ concurrency: 4,
69
  maxInstances: 200,
70
  },
71
  openapi: {
 
156
  throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
157
  }
158
 
159
+ const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
160
  [
161
  // 40 requests per minute
162
  new Date(Date.now() - 60 * 1000), 40
 
165
 
166
  rpcReflect.finally(() => {
167
  if (chargeAmount) {
168
+ auth.reportUsage(chargeAmount, 'reader-search').catch((err) => {
169
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
170
  });
171
+ apiRoll._ref?.set({
172
+ chargeAmount,
173
+ }, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
174
  }
175
  });
176
  } else if (ctx.req.ip) {
177
  this.threadLocal.set('ip', ctx.req.ip);
178
+ const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
179
  [
180
  // 5 requests per minute
181
  new Date(Date.now() - 60 * 1000), 5
182
  ]
183
  );
184
+ rpcReflect.finally(() => {
185
+ if (chargeAmount) {
186
+ apiRoll._ref?.set({
187
+ chargeAmount,
188
+ }, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err }));
189
+ }
190
+ });
191
  }
192
 
193
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
 
223
  const searchQuery = noSlashPath;
224
  const r = await this.cachedWebSearch({
225
  q: searchQuery,
226
+ count: 10
227
  }, noCache);
228
 
229
  const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
 
274
  for await (const scrapped of it) {
275
  lastScrapped = scrapped;
276
 
277
+ if (!this.searchResultsQualified(scrapped)) {
278
  continue;
279
  }
280
  clearTimeout(earlyReturnTimer);
 
308
  for await (const scrapped of it) {
309
  lastScrapped = scrapped;
310
 
311
+ if (!this.searchResultsQualified(scrapped)) {
312
  continue;
313
  }
314
 
 
343
  const mapped = scrapped.map((x, i) => {
344
  const upstreamSearchResult = searchResults[i];
345
  if (!x || (!x.parsed && mode !== 'markdown')) {
346
+ return {
347
+ url: upstreamSearchResult.url,
348
+ title: upstreamSearchResult.title,
349
+ description: upstreamSearchResult.description,
 
 
 
 
 
 
350
  };
 
 
 
 
 
 
351
  }
352
  return this.crawler.formatSnapshot(mode, x, urls[i]);
353
  });
354
 
355
+ const resultArray = await Promise.all(mapped) as FormattedPage[];
356
+
357
+ yield this.reOrganizeSearchResults(resultArray);
358
+ }
359
+ }
360
+
361
/**
 * Trims a list of formatted pages down to at most `targetResultCount`
 * entries and attaches numbered plain-text renderers.
 *
 * Selection: every page accepted by `pageQualified` is kept; if there are
 * fewer of those than `targetResultCount`, the shortfall is filled with the
 * first unqualified pages. The relative order of `searchResults` is kept.
 *
 * Rendering: each kept entry is shallow-copied and given a `toString()` that
 * prefixes every line with its 1-based position. Entries that carry a
 * `description` are rendered as a title/URL/description summary (or a
 * "No content available" line when they have no title); all other entries
 * are rendered with their content.
 *
 * @param searchResults formatted pages, in upstream result order
 * @returns a new array whose `toString()` joins the rendered entries
 */
reOrganizeSearchResults(searchResults: FormattedPage[]) {
    const [qualifiedPages, unqualifiedPages] = _.partition(searchResults, (x) => this.pageQualified(x));
    const acceptSet = new Set(qualifiedPages);

    // Top up with unqualified pages only when there are too few qualified ones.
    const shortfall = Math.max(this.targetResultCount - qualifiedPages.length, 0);
    for (const x of unqualifiedPages.slice(0, shortfall)) {
        acceptSet.add(x);
    }

    // Filtering the original list (rather than concatenating the partitions)
    // keeps the upstream ordering intact.
    const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, this.targetResultCount);

    const resultArray = filtered.map((x, i) => {
        return {
            ...x,
            toString(this: any) {
                if (this.description) {
                    if (this.title) {
                        return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}
[${i + 1}] Description: ${this.description}
`;
                    }

                    return `[${i + 1}] No content available for ${this.url}`;
                }

                const mixins: string[] = [];
                if (this.publishedTime) {
                    mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
                }

                return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] Markdown Content:
${this.content}
`;
            }
        };
    });

    resultArray.toString = function () {
        // Entries are always objects created by the map above; the guard only
        // protects against later mutation and must not dereference the
        // missing element itself.
        return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available`).join('\n\n').trimEnd() + '\n';
    };

    return resultArray;
}
409
 
410
  getChargeAmount(formatted: any[]) {
 
413
  );
414
  }
415
 
416
/**
 * Tells whether a formatted page is complete enough to be served.
 *
 * A page qualifies when it has a title AND at least one kind of body:
 * `content`, `screenshotUrl`, `text` or `html`. The alternatives are grouped
 * explicitly because `&&` binds tighter than `||`; without the parentheses a
 * page with no title but with any of the later fields would be accepted.
 *
 * @param formattedPage the page to inspect
 * @returns a truthy value when the page qualifies, a falsy value otherwise
 */
pageQualified(formattedPage: FormattedPage) {
    return formattedPage.title && (
        formattedPage.content ||
        formattedPage.screenshotUrl ||
        formattedPage.text ||
        formattedPage.html
    );
}
423
+
424
/**
 * Tells whether a batch of results is ready to be returned as-is.
 *
 * The batch is ready when every page in it passes `pageQualified` and it
 * holds at least `targetResultCount` pages.
 *
 * @param results formatted pages of the current batch
 * @returns `true` when the batch needs no further waiting
 */
searchResultsQualified(results: FormattedPage[]) {
    const everyPageQualified = _.every(results, (page) => this.pageQualified(page));

    return everyPageQualified && results.length >= this.targetResultCount;
}
427
 
428
  async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
backend/functions/src/services/puppeteer.ts CHANGED
@@ -99,7 +99,7 @@ export class PuppeteerControl extends AsyncService {
99
  return page.browser().connected && !page.isClosed();
100
  }
101
  }, {
102
- max: Math.max(1 + Math.floor(os.totalmem() / (384 * 1024 * 1024)), 16),
103
  min: 1,
104
  acquireTimeoutMillis: 60_000,
105
  testOnBorrow: true,
 
99
  return page.browser().connected && !page.isClosed();
100
  }
101
  }, {
102
+ max: Math.max(1 + Math.floor(os.totalmem() / (256 * 1024 * 1024)), 16),
103
  min: 1,
104
  acquireTimeoutMillis: 60_000,
105
  testOnBorrow: true,