mapleeit committed on
Commit
a44d9a2
·
1 Parent(s): af282ee

feat(adaptive-crawler): optimize relevance detection

Browse files
backend/functions/src/cloud-functions/adaptive-crawler.ts CHANGED
@@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost {
362
 
363
  const title = json.data.title;
364
  const description = json.data.description;
365
- const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
366
  const links = json.data.links as Record<string, string>;
367
 
368
- const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
369
  this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
370
 
371
  for (const url of relevantUrls) {
@@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost {
418
  }
419
 
420
  async getRelevantUrls(token: string, {
421
- query, links
422
  }: {
423
- query: string;
 
424
  links: Record<string, string>;
425
  }) {
426
  const invalidSuffix = [
@@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
434
  .map(([title, link]) => link)
435
  .filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
436
 
 
 
 
 
 
 
 
437
  const data = {
438
  model: 'jina-reranker-v2-base-multilingual',
439
  query,
@@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost {
460
  }[];
461
  };
462
 
463
- return json.results.filter(r => r.relevance_score > 0.3).map(r => removeURLHash(r.document.text));
 
464
  }
465
 
466
  getIndex(user?: JinaEmbeddingsTokenAccount) {
 
362
 
363
  const title = json.data.title;
364
  const description = json.data.description;
 
365
  const links = json.data.links as Record<string, string>;
366
 
367
+ const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
368
  this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
369
 
370
  for (const url of relevantUrls) {
 
417
  }
418
 
419
  async getRelevantUrls(token: string, {
420
+ title, description, links
421
  }: {
422
+ title: string;
423
+ description: string;
424
  links: Record<string, string>;
425
  }) {
426
  const invalidSuffix = [
 
434
  .map(([title, link]) => link)
435
  .filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
436
 
437
+ let query = '';
438
+ if (!description) {
439
+ query += title;
440
+ } else {
441
+ query += `TITLE: ${title}; DESCRIPTION: ${description}`;
442
+ }
443
+
444
  const data = {
445
  model: 'jina-reranker-v2-base-multilingual',
446
  query,
 
467
  }[];
468
  };
469
 
470
+ const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
471
+ return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
472
  }
473
 
474
  getIndex(user?: JinaEmbeddingsTokenAccount) {