Aaron Ji nomagick committed on
Commit
60e67db
·
unverified ·
1 Parent(s): 2c0d020

fix: fallback logic (#1182)

Browse files

* fix: fallback logic

* fix: fallback logic

* fix: fallback logic

* chore: cleanup

* chore: cleanup

* chore: adjust fallback logic

* chore: adjust fallback logic

* chore: cleanup

* chore: cleanup

* tweak: fallback mech

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

Files changed (2) hide show
  1. src/api/searcher-serper.ts +52 -49
  2. src/api/serp.ts +30 -7
src/api/searcher-serper.ts CHANGED
@@ -286,30 +286,13 @@ export class SearcherHost extends RPCHost {
286
  page,
287
  };
288
 
289
- const { response: r, query: successQuery, tryTimes } = await this.searchWithFallback(
290
  searchParams, fallback, crawlerOptions.noCache
291
  );
292
  chargeAmountScaler *= tryTimes;
293
 
294
  fallbackQuery = successQuery !== searchQuery ? successQuery : undefined;
295
 
296
- let results;
297
- switch (variant) {
298
- case 'images': {
299
- results = (r as SerperImageSearchResponse).images;
300
- break;
301
- }
302
- case 'news': {
303
- results = (r as SerperNewsSearchResponse).news;
304
- break;
305
- }
306
- case 'web':
307
- default: {
308
- results = (r as SerperWebSearchResponse).organic;
309
- break;
310
- }
311
- }
312
-
313
  if (!results.length) {
314
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
315
  }
@@ -323,7 +306,7 @@ export class SearcherHost extends RPCHost {
323
  const targetResultCount = crawlWithoutContent ? count : count + 2;
324
  const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
325
  trimmedResults.toString = function () {
326
- let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
327
  if (fallbackQuery) {
328
  r = `Fallback query: ${fallbackQuery}\n\n${r}`;
329
  }
@@ -515,52 +498,72 @@ export class SearcherHost extends RPCHost {
515
  params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
516
  useFallback: boolean = false,
517
  noCache: boolean = false
518
- ): Promise<{ response: SerperSearchResponse; query: string; tryTimes: number }> {
519
  // Try original query first
520
  const originalQuery = params.q;
521
- const response = await this.cachedSearch(params, noCache);
522
 
523
  // Extract results based on variant
524
- let results: any[] = [];
525
  let tryTimes = 1;
526
- switch (params.variant) {
527
- case 'images': results = (response as SerperImageSearchResponse).images; break;
528
- case 'news': results = (response as SerperNewsSearchResponse).news; break;
529
- case 'web': default: results = (response as SerperWebSearchResponse).organic; break;
530
- }
531
-
532
- // Return early if we got results or fallback is disabled
533
- if (results.length > 0 || !useFallback) {
534
- return { response, query: originalQuery, tryTimes };
535
  }
536
 
537
- // Try with progressively shorter queries
538
- const terms = originalQuery.trim().split(/\s+/);
539
 
540
  this.logger.info(`No results for "${originalQuery}", trying fallback queries`);
541
- const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(originalQuery);
542
 
543
- while (terms.length > 1) {
544
- containsRTL ? terms.shift() : terms.pop(); // Remove last term
545
- const shortenedQuery = terms.join(' ');
546
 
547
- const fallbackParams = { ...params, q: shortenedQuery };
548
- const fallbackResponse = await this.cachedSearch(fallbackParams, noCache);
549
-
550
- let fallbackResults: any[] = [];
551
- switch (params.variant) {
552
- case 'images': fallbackResults = (fallbackResponse as SerperImageSearchResponse).images; break;
553
- case 'news': fallbackResults = (fallbackResponse as SerperNewsSearchResponse).news; break;
554
- case 'web': default: fallbackResults = (fallbackResponse as SerperWebSearchResponse).organic; break;
555
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
 
557
- tryTimes++;
558
  if (fallbackResults.length > 0) {
559
- return { response: fallbackResponse, query: shortenedQuery, tryTimes };
560
  }
561
  }
562
 
563
- return { response, query: originalQuery, tryTimes };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  }
565
 
566
  async *fetchSearchResults(
@@ -651,7 +654,7 @@ export class SearcherHost extends RPCHost {
651
  metadata.fallback = fallbackQuery;
652
  }
653
 
654
- assignMeta(formatted, metadata);
655
 
656
  return final;
657
  }
 
286
  page,
287
  };
288
 
289
+ const { results, query: successQuery, tryTimes } = await this.searchWithFallback(
290
  searchParams, fallback, crawlerOptions.noCache
291
  );
292
  chargeAmountScaler *= tryTimes;
293
 
294
  fallbackQuery = successQuery !== searchQuery ? successQuery : undefined;
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  if (!results.length) {
297
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
298
  }
 
306
  const targetResultCount = crawlWithoutContent ? count : count + 2;
307
  const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
308
  trimmedResults.toString = function () {
309
+ let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
310
  if (fallbackQuery) {
311
  r = `Fallback query: ${fallbackQuery}\n\n${r}`;
312
  }
 
498
  params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
499
  useFallback: boolean = false,
500
  noCache: boolean = false
501
+ ) {
502
  // Try original query first
503
  const originalQuery = params.q;
504
+ const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(originalQuery);
505
 
506
  // Extract results based on variant
 
507
  let tryTimes = 1;
508
+ const results = await this.doSearch(params, noCache);
509
+ if (results.length && !useFallback) {
510
+ return { results, query: params.q, tryTimes };
 
 
 
 
 
 
511
  }
512
 
513
+ let queryTerms = originalQuery.split(/\s+/);
514
+ const lastResort = containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2);
515
 
516
  this.logger.info(`No results for "${originalQuery}", trying fallback queries`);
 
517
 
518
+ let terms: string[] = [];
519
+ // fallback n times
520
+ const n = 4;
521
 
522
+ while (tryTimes <= n) {
523
+ const delta = Math.ceil(queryTerms.length / n) * tryTimes;
524
+ terms = containsRTL ? queryTerms.slice(0, queryTerms.length - delta) : queryTerms.slice(delta);
525
+ const query = terms.join(' ');
526
+ if (!query) {
527
+ break;
 
 
528
  }
529
+ tryTimes += 1;
530
+ this.logger.info(`Retrying search with fallback query: "${query}"`);
531
+ const fallbackParams = { ...params, q: query };
532
+ const fallbackResults = await this.doSearch(fallbackParams, noCache);
533
+ if (fallbackResults.length > 0) {
534
+ return { results: fallbackResults, query: fallbackParams.q, tryTimes };
535
+ }
536
+ }
537
+
538
+ if (terms.length > lastResort.length) {
539
+ const query = lastResort.join(' ');
540
+ this.logger.info(`Retrying search with fallback query: "${query}"`);
541
+ const fallbackParams = { ...params, q: query };
542
+ tryTimes += 1;
543
+ const fallbackResults = await this.doSearch(fallbackParams, noCache);
544
 
 
545
  if (fallbackResults.length > 0) {
546
+ return { results: fallbackResults, query, tryTimes };
547
  }
548
  }
549
 
550
+ return { results, query: originalQuery, tryTimes };
551
+ }
552
+
553
+ async doSearch(
554
+ params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
555
+ noCache: boolean = false,
556
+ ) {
557
+ const response = await this.cachedSearch(params, noCache);
558
+
559
+ let results = [];
560
+ switch (params.variant) {
561
+ case 'images': results = (response as SerperImageSearchResponse).images; break;
562
+ case 'news': results = (response as SerperNewsSearchResponse).news; break;
563
+ case 'web': default: results = (response as SerperWebSearchResponse).organic; break;
564
+ }
565
+
566
+ return results;
567
  }
568
 
569
  async *fetchSearchResults(
 
654
  metadata.fallback = fallbackQuery;
655
  }
656
 
657
+ assignMeta(formatted, metadata);
658
 
659
  return final;
660
  }
src/api/serp.ts CHANGED
@@ -275,6 +275,8 @@ export class SerpHost extends RPCHost {
275
  }
276
 
277
  let realQuery = q;
 
 
278
  let results = await this.cachedSearch(variant, {
279
  provider: searchEngine,
280
  q,
@@ -289,14 +291,21 @@ export class SerpHost extends RPCHost {
289
  if (fallback && !results?.length && (!page || page === 1)) {
290
  let tryTimes = 1;
291
  const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
292
- let terms = q.split(/\s+/g).filter((x) => !!x);
293
- terms = containsRTL ? terms.slice(10) : terms.slice(0, 10); // don't try to fallback on more than 10 terms
294
- while (terms.length > 1) {
295
- containsRTL ? terms.shift() : terms.pop(); // reduce the query by one term at a time
296
- realQuery = terms.join(' ').trim();
297
- if (!realQuery) {
 
 
298
  break;
299
  }
 
 
 
 
 
300
  this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
301
  results = await this.cachedSearch(variant, {
302
  provider: searchEngine,
@@ -306,11 +315,25 @@ export class SerpHost extends RPCHost {
306
  hl,
307
  location,
308
  }, crawlerOptions);
309
- tryTimes += 1;
310
  if (results?.length) {
311
  break;
312
  }
313
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  chargeAmountScaler *= tryTimes;
315
  }
316
 
 
275
  }
276
 
277
  let realQuery = q;
278
+ let queryTerms = q.split(/\s+/g).filter((x) => !!x);
279
+
280
  let results = await this.cachedSearch(variant, {
281
  provider: searchEngine,
282
  q,
 
291
  if (fallback && !results?.length && (!page || page === 1)) {
292
  let tryTimes = 1;
293
  const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
294
+ const lastResort = (containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2)).join(' ');
295
+ const n = 4;
296
+ let terms: string[] = [];
297
+ while (tryTimes <= n) {
298
+ const delta = Math.ceil(queryTerms.length / n) * tryTimes;
299
+ terms = containsRTL ? queryTerms.slice(0, queryTerms.length - delta) : queryTerms.slice(delta);
300
+ const query = terms.join(' ');
301
+ if (!query) {
302
  break;
303
  }
304
+ if (realQuery === query) {
305
+ continue;
306
+ }
307
+ tryTimes += 1;
308
+ realQuery = query;
309
  this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
310
  results = await this.cachedSearch(variant, {
311
  provider: searchEngine,
 
315
  hl,
316
  location,
317
  }, crawlerOptions);
 
318
  if (results?.length) {
319
  break;
320
  }
321
  }
322
+
323
+ if (!results?.length && realQuery.length > lastResort.length) {
324
+ realQuery = lastResort;
325
+ this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
326
+ tryTimes += 1;
327
+ results = await this.cachedSearch(variant, {
328
+ provider: searchEngine,
329
+ q: realQuery,
330
+ num,
331
+ gl,
332
+ hl,
333
+ location,
334
+ }, crawlerOptions);
335
+ }
336
+
337
  chargeAmountScaler *= tryTimes;
338
  }
339