nomagick committed on
Commit
3020d58
·
unverified ·
1 Parent(s): da48d0e

fix: catch jsdom errors

Browse files
package-lock.json CHANGED
@@ -17,7 +17,7 @@
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
- "civkit": "^0.8.4-9d62ed1",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
@@ -4095,9 +4095,9 @@
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
- "version": "0.8.4-9d62ed1",
4099
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9d62ed1.tgz",
4100
- "integrity": "sha512-uDmUCjsISAVrJvandGCUm7zTseDhAKISaPwYev73s2VGwZsvG8K/pF4ErSKWp54soNA96RSamyrkVDayqEpHmQ==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
 
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.8.4-31171c2",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
+ "version": "0.8.4-31171c2",
4099
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz",
4100
+ "integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
package.json CHANGED
@@ -25,7 +25,7 @@
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
- "civkit": "^0.8.4-9d62ed1",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
 
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
+ "civkit": "^0.8.4-31171c2",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
src/api/crawler.ts CHANGED
@@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost {
960
  proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
961
  private: Boolean(opts.doNotTrack),
962
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
 
964
  if (opts.locale) {
965
  crawlOpts.extraHeaders ??= {};
 
960
  proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
961
  private: Boolean(opts.doNotTrack),
962
  };
963
+ if (crawlOpts.targetSelector?.length) {
964
+ if (typeof crawlOpts.targetSelector === 'string') {
965
+ crawlOpts.targetSelector = [crawlOpts.targetSelector];
966
+ }
967
+ for (const s of crawlOpts.targetSelector) {
968
+ for (const e of s.split(',').map((x)=> x.trim())) {
969
+ if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
970
+ throw new ParamValidationError({
971
+ message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
972
+ path: 'targetSelector'
973
+ });
974
+ }
975
+ }
976
+ }
977
+ }
978
 
979
  if (opts.locale) {
980
  crawlOpts.extraHeaders ??= {};
src/dto/crawler-options.ts CHANGED
@@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable {
436
  instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
437
  const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
438
  instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
439
- instance.targetSelector = filterSelector(instance.targetSelector);
440
  const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
441
  instance.userAgent ??= overrideUserAgent;
442
 
@@ -590,21 +589,4 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
590
 
591
  return instance;
592
  }
593
- }
594
-
595
- function filterSelector(s?: string | string[]) {
596
- if (!s) {
597
- return s;
598
- }
599
- const sr = Array.isArray(s) ? s : [s];
600
- const selectors = sr.filter((i) => {
601
- const innerSelectors = i.split(',').map((s) => s.trim());
602
- const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
603
- if (someViolation) {
604
- return false;
605
- }
606
- return true;
607
- });
608
-
609
- return selectors;
610
- };
 
436
  instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
437
  const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
438
  instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
 
439
  const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
440
  instance.userAgent ??= overrideUserAgent;
441
 
 
589
 
590
  return instance;
591
  }
592
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/services/jsdom.ts CHANGED
@@ -1,5 +1,4 @@
1
  import { container, singleton } from 'tsyringe';
2
- import { AsyncService, marshalErrorLike } from 'civkit';
3
  import { GlobalLogger } from './logger';
4
  import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
5
  import { Readability } from '@mozilla/readability';
@@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded';
8
  import type { ExtraScrappingOptions } from '../api/crawler';
9
  import { tailwindClasses } from '../utils/tailwind-classes';
10
  import { countGPTToken } from '../shared/utils/openai';
 
 
11
 
12
  const pLinkedom = import('linkedom');
13
 
@@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService {
38
  return snapshot;
39
  }
40
 
41
- // SideLoad contains native objects that cannot go through thread boundaries.
42
- return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
 
 
 
 
 
 
 
 
 
43
  }
44
 
45
  @Threaded()
@@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService {
151
  try {
152
  parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
153
  } catch (err: any) {
154
- this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
155
  }
156
 
157
  const imgSet = new Set<string>();
 
1
  import { container, singleton } from 'tsyringe';
 
2
  import { GlobalLogger } from './logger';
3
  import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
4
  import { Readability } from '@mozilla/readability';
 
7
  import type { ExtraScrappingOptions } from '../api/crawler';
8
  import { tailwindClasses } from '../utils/tailwind-classes';
9
  import { countGPTToken } from '../shared/utils/openai';
10
+ import { AsyncService } from 'civkit/async-service';
11
+ import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc';
12
 
13
  const pLinkedom = import('linkedom');
14
 
 
39
  return snapshot;
40
  }
41
 
42
+ try {
43
+ // SideLoad contains native objects that cannot go through thread boundaries.
44
+ return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
45
+ } catch (err: any) {
46
+ this.logger.warn(`Error narrowing snapshot`, { err });
47
+ if (err instanceof ApplicationError) {
48
+ throw err;
49
+ }
50
+
51
+ throw new AssertionFailureError(`Failed to process the page: ${err?.message}`);
52
+ }
53
  }
54
 
55
  @Threaded()
 
161
  try {
162
  parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
163
  } catch (err: any) {
164
+ this.logger.warn(`Failed to parse selected element`, { err });
165
  }
166
 
167
  const imgSet = new Set<string>();