Spaces:
Build error
Build error
fix: catch jsdom errors
Browse files
- package-lock.json +4 -4
- package.json +1 -1
- src/api/crawler.ts +15 -0
- src/dto/crawler-options.ts +1 -19
- src/services/jsdom.ts +14 -4
package-lock.json
CHANGED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
-
"civkit": "^0.8.4-
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
@@ -4095,9 +4095,9 @@
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
-
"version": "0.8.4-
|
| 4099 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-
|
| 4100 |
-
"integrity": "sha512-
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
|
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
+
"civkit": "^0.8.4-31171c2",
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
+
"version": "0.8.4-31171c2",
|
| 4099 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz",
|
| 4100 |
+
"integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==",
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
package.json
CHANGED
|
@@ -25,7 +25,7 @@
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
-
"civkit": "^0.8.4-
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
|
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
+
"civkit": "^0.8.4-31171c2",
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
src/api/crawler.ts
CHANGED
|
@@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost {
|
|
| 960 |
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
| 961 |
private: Boolean(opts.doNotTrack),
|
| 962 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
|
| 964 |
if (opts.locale) {
|
| 965 |
crawlOpts.extraHeaders ??= {};
|
|
|
|
| 960 |
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
| 961 |
private: Boolean(opts.doNotTrack),
|
| 962 |
};
|
| 963 |
+
if (crawlOpts.targetSelector?.length) {
|
| 964 |
+
if (typeof crawlOpts.targetSelector === 'string') {
|
| 965 |
+
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
| 966 |
+
}
|
| 967 |
+
for (const s of crawlOpts.targetSelector) {
|
| 968 |
+
for (const e of s.split(',').map((x)=> x.trim())) {
|
| 969 |
+
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
| 970 |
+
throw new ParamValidationError({
|
| 971 |
+
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
| 972 |
+
path: 'targetSelector'
|
| 973 |
+
});
|
| 974 |
+
}
|
| 975 |
+
}
|
| 976 |
+
}
|
| 977 |
+
}
|
| 978 |
|
| 979 |
if (opts.locale) {
|
| 980 |
crawlOpts.extraHeaders ??= {};
|
src/dto/crawler-options.ts
CHANGED
|
@@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 436 |
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
| 437 |
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
| 438 |
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
| 439 |
-
instance.targetSelector = filterSelector(instance.targetSelector);
|
| 440 |
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
| 441 |
instance.userAgent ??= overrideUserAgent;
|
| 442 |
|
|
@@ -590,21 +589,4 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
|
| 590 |
|
| 591 |
return instance;
|
| 592 |
}
|
| 593 |
-
}
|
| 594 |
-
|
| 595 |
-
function filterSelector(s?: string | string[]) {
|
| 596 |
-
if (!s) {
|
| 597 |
-
return s;
|
| 598 |
-
}
|
| 599 |
-
const sr = Array.isArray(s) ? s : [s];
|
| 600 |
-
const selectors = sr.filter((i) => {
|
| 601 |
-
const innerSelectors = i.split(',').map((s) => s.trim());
|
| 602 |
-
const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
|
| 603 |
-
if (someViolation) {
|
| 604 |
-
return false;
|
| 605 |
-
}
|
| 606 |
-
return true;
|
| 607 |
-
});
|
| 608 |
-
|
| 609 |
-
return selectors;
|
| 610 |
-
};
|
|
|
|
| 436 |
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
| 437 |
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
| 438 |
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
|
|
|
| 439 |
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
| 440 |
instance.userAgent ??= overrideUserAgent;
|
| 441 |
|
|
|
|
| 589 |
|
| 590 |
return instance;
|
| 591 |
}
|
| 592 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/services/jsdom.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
-
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
import { GlobalLogger } from './logger';
|
| 4 |
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
| 5 |
import { Readability } from '@mozilla/readability';
|
|
@@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded';
|
|
| 8 |
import type { ExtraScrappingOptions } from '../api/crawler';
|
| 9 |
import { tailwindClasses } from '../utils/tailwind-classes';
|
| 10 |
import { countGPTToken } from '../shared/utils/openai';
|
|
|
|
|
|
|
| 11 |
|
| 12 |
const pLinkedom = import('linkedom');
|
| 13 |
|
|
@@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService {
|
|
| 38 |
return snapshot;
|
| 39 |
}
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
@Threaded()
|
|
@@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService {
|
|
| 151 |
try {
|
| 152 |
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
| 153 |
} catch (err: any) {
|
| 154 |
-
this.logger.warn(`Failed to parse selected element`, { err
|
| 155 |
}
|
| 156 |
|
| 157 |
const imgSet = new Set<string>();
|
|
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
|
|
|
| 2 |
import { GlobalLogger } from './logger';
|
| 3 |
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
| 4 |
import { Readability } from '@mozilla/readability';
|
|
|
|
| 7 |
import type { ExtraScrappingOptions } from '../api/crawler';
|
| 8 |
import { tailwindClasses } from '../utils/tailwind-classes';
|
| 9 |
import { countGPTToken } from '../shared/utils/openai';
|
| 10 |
+
import { AsyncService } from 'civkit/async-service';
|
| 11 |
+
import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc';
|
| 12 |
|
| 13 |
const pLinkedom = import('linkedom');
|
| 14 |
|
|
|
|
| 39 |
return snapshot;
|
| 40 |
}
|
| 41 |
|
| 42 |
+
try {
|
| 43 |
+
// SideLoad contains native objects that cannot go through thread boundaries.
|
| 44 |
+
return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
| 45 |
+
} catch (err: any) {
|
| 46 |
+
this.logger.warn(`Error narrowing snapshot`, { err });
|
| 47 |
+
if (err instanceof ApplicationError) {
|
| 48 |
+
throw err;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
throw new AssertionFailureError(`Failed to process the page: ${err?.message}`);
|
| 52 |
+
}
|
| 53 |
}
|
| 54 |
|
| 55 |
@Threaded()
|
|
|
|
| 161 |
try {
|
| 162 |
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
| 163 |
} catch (err: any) {
|
| 164 |
+
this.logger.warn(`Failed to parse selected element`, { err });
|
| 165 |
}
|
| 166 |
|
| 167 |
const imgSet = new Set<string>();
|