nomagick committed on
Commit
6f37e5d
·
unverified ·
1 Parent(s): ee008eb

feat: x-remove-selector

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -27,7 +27,8 @@ import { DomainBlockade } from '../db/domain-blockade';
27
  const md5Hasher = new HashManager('md5', 'hex');
28
 
29
  export interface ExtraScrappingOptions extends ScrappingOptions {
30
- targetSelector?: string;
 
31
  }
32
 
33
  export interface FormattedPage {
@@ -131,12 +132,15 @@ export class CrawlerHost extends RPCHost {
131
  return indexObject;
132
  }
133
 
134
- getTurndown(noRules?: boolean | string) {
 
 
 
135
  const turnDownService = new TurndownService({
136
  codeBlockStyle: 'fenced',
137
  preformattedCode: true,
138
  } as any);
139
- if (!noRules) {
140
  turnDownService.addRule('remove-irrelevant', {
141
  filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
142
  replacement: () => ''
@@ -177,7 +181,14 @@ export class CrawlerHost extends RPCHost {
177
  if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
178
 
179
  const fixedContent = content.replace(/\s+/g, ' ').trim();
180
- const fixedHref = href.replace(/\s+/g, '').trim();
 
 
 
 
 
 
 
181
 
182
  return `[${fixedContent}](${fixedHref}${title || ''})`;
183
  }
@@ -317,7 +328,7 @@ export class CrawlerHost extends RPCHost {
317
  }
318
 
319
  const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
320
- let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
321
  for (const plugin of this.turnDownPlugins) {
322
  turnDownService = turnDownService.use(plugin);
323
  }
@@ -380,7 +391,7 @@ export class CrawlerHost extends RPCHost {
380
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
381
  } catch (err) {
382
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
383
- const vanillaTurnDownService = this.getTurndown();
384
  try {
385
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
386
  } catch (err2) {
@@ -397,7 +408,7 @@ export class CrawlerHost extends RPCHost {
397
  contentText = turnDownService.turndown(snapshot.html);
398
  } catch (err) {
399
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
400
- const vanillaTurnDownService = this.getTurndown();
401
  try {
402
  contentText = vanillaTurnDownService.turndown(snapshot.html);
403
  } catch (err2) {
@@ -799,22 +810,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
799
  return r;
800
  }
801
 
802
- async * cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
803
  let cache;
804
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
805
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
806
  }
807
 
808
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
809
- yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
810
 
811
  return;
812
  }
813
 
814
  try {
815
- if (crawlOpts?.targetSelector) {
816
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
817
- yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
818
  }
819
 
820
  return;
@@ -824,7 +835,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
824
  } catch (err: any) {
825
  if (cache) {
826
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
827
- yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
828
  return;
829
  }
830
  throw err;
@@ -853,7 +864,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
853
  }
854
 
855
 
856
- async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
857
  const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
858
 
859
  const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
@@ -910,8 +921,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
910
  proxyUrl: opts.proxyUrl,
911
  cookies: opts.setCookies,
912
  favorScreenshot: opts.respondWith === 'screenshot',
913
- waitForSelector: opts.waitForSelector,
914
  targetSelector: opts.targetSelector,
 
915
  overrideUserAgent: opts.userAgent,
916
  timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
917
  };
 
27
  const md5Hasher = new HashManager('md5', 'hex');
28
 
29
  export interface ExtraScrappingOptions extends ScrappingOptions {
30
+ targetSelector?: string | string[];
31
+ removeSelector?: string | string[];
32
  }
33
 
34
  export interface FormattedPage {
 
132
  return indexObject;
133
  }
134
 
135
+ getTurndown(options?: {
136
+ noRules?: boolean | string,
137
+ url?: string | URL;
138
+ }) {
139
  const turnDownService = new TurndownService({
140
  codeBlockStyle: 'fenced',
141
  preformattedCode: true,
142
  } as any);
143
+ if (!options?.noRules) {
144
  turnDownService.addRule('remove-irrelevant', {
145
  filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
146
  replacement: () => ''
 
181
  if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
182
 
183
  const fixedContent = content.replace(/\s+/g, ' ').trim();
184
+ let fixedHref = href.replace(/\s+/g, '').trim();
185
+ if (options?.url) {
186
+ try {
187
+ fixedHref = new URL(fixedHref, options.url).toString();
188
+ } catch (_err) {
189
+ void 0;
190
+ }
191
+ }
192
 
193
  return `[${fixedContent}](${fixedHref}${title || ''})`;
194
  }
 
328
  }
329
 
330
  const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
331
+ let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
332
  for (const plugin of this.turnDownPlugins) {
333
  turnDownService = turnDownService.use(plugin);
334
  }
 
391
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
392
  } catch (err) {
393
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
394
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
395
  try {
396
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
397
  } catch (err2) {
 
408
  contentText = turnDownService.turndown(snapshot.html);
409
  } catch (err) {
410
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
411
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
412
  try {
413
  contentText = vanillaTurnDownService.turndown(snapshot.html);
414
  } catch (err2) {
 
810
  return r;
811
  }
812
 
813
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
814
  let cache;
815
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
816
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
817
  }
818
 
819
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
820
+ yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
821
 
822
  return;
823
  }
824
 
825
  try {
826
+ if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
827
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
828
+ yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
829
  }
830
 
831
  return;
 
835
  } catch (err: any) {
836
  if (cache) {
837
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
838
+ yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
839
  return;
840
  }
841
  throw err;
 
864
  }
865
 
866
 
867
+ async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
868
  const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
869
 
870
  const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
 
921
  proxyUrl: opts.proxyUrl,
922
  cookies: opts.setCookies,
923
  favorScreenshot: opts.respondWith === 'screenshot',
924
+ removeSelector: opts.removeSelector,
925
  targetSelector: opts.targetSelector,
926
+ waitForSelector: opts.waitForSelector,
927
  overrideUserAgent: opts.userAgent,
928
  timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
929
  };
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -8,11 +8,10 @@ import { singleton } from 'tsyringe';
8
  import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
- import { ScrappingOptions } from '../services/puppeteer';
12
  import { Request, Response } from 'express';
13
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
14
  import { BraveSearchService } from '../services/brave-search';
15
- import { CrawlerHost, FormattedPage } from './crawler';
16
  import { CookieParam } from 'puppeteer';
17
 
18
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
@@ -304,7 +303,7 @@ export class SearcherHost extends RPCHost {
304
  async *fetchSearchResults(
305
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
306
  searchResults?: WebSearchResult[],
307
- options?: ScrappingOptions,
308
  pageCacheTolerance?: number
309
  ) {
310
  if (!searchResults) {
 
8
  import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
 
11
  import { Request, Response } from 'express';
12
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
13
  import { BraveSearchService } from '../services/brave-search';
14
+ import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
15
  import { CookieParam } from 'puppeteer';
16
 
17
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
 
303
  async *fetchSearchResults(
304
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
305
  searchResults?: WebSearchResult[],
306
+ options?: ExtraScrappingOptions,
307
  pageCacheTolerance?: number
308
  ) {
309
  if (!searchResults) {
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -53,6 +53,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
53
  in: 'header',
54
  schema: { type: 'string' }
55
  },
 
 
 
 
 
 
 
56
  'X-Proxy-Url': {
57
  description: `Specifies your custom proxy if you prefer to use one.\n\n` +
58
  `Supported protocols: \n` +
@@ -130,11 +137,14 @@ export class CrawlerOptions extends AutoCastable {
130
  @Prop()
131
  cacheTolerance?: number;
132
 
133
- @Prop()
134
- targetSelector?: string;
135
 
136
- @Prop()
137
- waitForSelector?: string;
 
 
 
138
 
139
  @Prop({
140
  arrayOf: String,
@@ -193,15 +203,17 @@ export class CrawlerOptions extends AutoCastable {
193
  instance.timeout = timeoutSeconds;
194
  }
195
 
196
- const targetSelector = ctx?.req.get('x-target-selector');
 
 
197
  instance.targetSelector ??= targetSelector;
198
- const waitForSelector = ctx?.req.get('x-wait-for-selector');
199
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
200
  const overrideUserAgent = ctx?.req.get('x-user-agent');
201
  instance.userAgent ??= overrideUserAgent;
202
 
203
  const cookies: CookieParam[] = [];
204
- const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
205
  if (Array.isArray(setCookieHeaders)) {
206
  for (const setCookie of setCookieHeaders) {
207
  cookies.push({
 
53
  in: 'header',
54
  schema: { type: 'string' }
55
  },
56
+ 'X-Remove-Selector': {
57
+ description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
58
+ 'Example `X-Remove-Selector: nav`'
59
+ ,
60
+ in: 'header',
61
+ schema: { type: 'string' }
62
+ },
63
  'X-Proxy-Url': {
64
  description: `Specifies your custom proxy if you prefer to use one.\n\n` +
65
  `Supported protocols: \n` +
 
137
  @Prop()
138
  cacheTolerance?: number;
139
 
140
+ @Prop({ arrayOf: String })
141
+ targetSelector?: string | string[];
142
 
143
+ @Prop({ arrayOf: String })
144
+ waitForSelector?: string | string[];
145
+
146
+ @Prop({ arrayOf: String })
147
+ removeSelector?: string | string[];
148
 
149
  @Prop({
150
  arrayOf: String,
 
203
  instance.timeout = timeoutSeconds;
204
  }
205
 
206
+ const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
207
+ instance.removeSelector ??= removeSelector;
208
+ const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
209
  instance.targetSelector ??= targetSelector;
210
+ const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
211
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
212
  const overrideUserAgent = ctx?.req.get('x-user-agent');
213
  instance.userAgent ??= overrideUserAgent;
214
 
215
  const cookies: CookieParam[] = [];
216
+ const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
217
  if (Array.isArray(setCookieHeaders)) {
218
  for (const setCookie of setCookieHeaders) {
219
  cookies.push({
backend/functions/src/services/puppeteer.ts CHANGED
@@ -63,7 +63,7 @@ export interface ScrappingOptions {
63
  proxyUrl?: string;
64
  cookies?: CookieParam[];
65
  favorScreenshot?: boolean;
66
- waitForSelector?: string;
67
  minIntervalMs?: number;
68
  overrideUserAgent?: string;
69
  timeoutMs?: number;
@@ -483,7 +483,8 @@ document.addEventListener('load', handlePageLoad);
483
  );
484
  });
485
  if (options?.waitForSelector) {
486
- page.waitForSelector(options.waitForSelector)
 
487
  .then(async () => {
488
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
489
  screenshot = await page.screenshot();
@@ -547,8 +548,11 @@ document.addEventListener('load', handlePageLoad);
547
  return true;
548
  }
549
 
550
- narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
551
- if (!targetSelect) {
 
 
 
552
  return snapshot;
553
  }
554
  if (!snapshot?.html) {
@@ -556,26 +560,68 @@ document.addEventListener('load', handlePageLoad);
556
  }
557
 
558
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
559
- const elem = jsdom.window.document.querySelector(targetSelect);
560
 
561
- if (!elem) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  return snapshot;
563
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
- const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole });
566
  let parsed;
567
  try {
568
- parsed = new Readability(selectedJsDom.window.document).parse();
569
  } catch (err: any) {
570
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
571
  }
572
 
573
  // No innerText in jsdom
574
  // https://github.com/jsdom/jsdom/issues/1245
575
- const textContent = elem.textContent;
576
  const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
577
 
578
- const imageTags = Array.from(elem.querySelectorAll('img[src],img[data-src]'))
579
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
580
  .flat()
581
  .map((x) => {
@@ -592,7 +638,7 @@ document.addEventListener('load', handlePageLoad);
592
  const r = {
593
  ...snapshot,
594
  parsed,
595
- html: elem.outerHTML,
596
  text: cleanedText,
597
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
598
  } as PageSnapshot;
 
63
  proxyUrl?: string;
64
  cookies?: CookieParam[];
65
  favorScreenshot?: boolean;
66
+ waitForSelector?: string | string[];
67
  minIntervalMs?: number;
68
  overrideUserAgent?: string;
69
  timeoutMs?: number;
 
483
  );
484
  });
485
  if (options?.waitForSelector) {
486
+ const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
487
+ waitPromise
488
  .then(async () => {
489
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
490
  screenshot = await page.screenshot();
 
548
  return true;
549
  }
550
 
551
+ narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
552
+ targetSelector?: string | string[];
553
+ removeSelector?: string | string[];
554
+ }): PageSnapshot | undefined {
555
+ if (!options?.targetSelector && !options?.removeSelector) {
556
  return snapshot;
557
  }
558
  if (!snapshot?.html) {
 
560
  }
561
 
562
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
563
+ const allNodes: Node[] = [];
564
 
565
+ if (Array.isArray(options.removeSelector)) {
566
+ for (const rl of options.removeSelector) {
567
+ jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
568
+ }
569
+ } else if (options.removeSelector) {
570
+ jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
571
+ }
572
+
573
+ if (Array.isArray(options.targetSelector)) {
574
+ for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
575
+ x.forEach((el) => {
576
+ if (!allNodes.includes(el)) {
577
+ allNodes.push(el);
578
+ }
579
+ });
580
+ }
581
+ } else if (options.targetSelector) {
582
+ jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
583
+ if (!allNodes.includes(el)) {
584
+ allNodes.push(el);
585
+ }
586
+ });
587
+ } else {
588
+ allNodes.push(jsdom.window.document);
589
+ }
590
+
591
+ if (!allNodes.length) {
592
  return snapshot;
593
  }
594
+ const textChunks: string[] = [];
595
+ let rootDoc: Document;
596
+ if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
597
+ rootDoc = allNodes[0] as any;
598
+ if (rootDoc.body.textContent) {
599
+ textChunks.push(rootDoc.body.textContent);
600
+ }
601
+ } else {
602
+ rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
603
+ for (const n of allNodes) {
604
+ rootDoc.body.appendChild(n);
605
+ rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
606
+ if (n.textContent) {
607
+ textChunks.push(n.textContent);
608
+ }
609
+ }
610
+ }
611
 
 
612
  let parsed;
613
  try {
614
+ parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
615
  } catch (err: any) {
616
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
617
  }
618
 
619
  // No innerText in jsdom
620
  // https://github.com/jsdom/jsdom/issues/1245
621
+ const textContent = textChunks.join('\n\n');
622
  const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
623
 
624
+ const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
625
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
626
  .flat()
627
  .map((x) => {
 
638
  const r = {
639
  ...snapshot,
640
  parsed,
641
+ html: rootDoc.documentElement.outerHTML,
642
  text: cleanedText,
643
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
644
  } as PageSnapshot;
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 5939c7091985706bebe7d1d83591430426b292c8
 
1
+ Subproject commit b30155da82ea8e311faab58bb5a360e829547ea0