Spaces:
Build error
Build error
fix: guard invalid domain names
Browse files
- package-lock.json +4 -4
- package.json +1 -1
- src/api/crawler.ts +17 -2
- src/services/puppeteer.ts +6 -6
package-lock.json
CHANGED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
-
"civkit": "^0.8.4-
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
@@ -4095,9 +4095,9 @@
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
-
"version": "0.8.4-
|
| 4099 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-
|
| 4100 |
-
"integrity": "sha512-
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
|
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
+
"civkit": "^0.8.4-bc8ef5e",
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
+
"version": "0.8.4-bc8ef5e",
|
| 4099 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
|
| 4100 |
+
"integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
package.json
CHANGED
|
@@ -25,7 +25,7 @@
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
-
"civkit": "^0.8.4-
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
|
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
+
"civkit": "^0.8.4-bc8ef5e",
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
src/api/crawler.ts
CHANGED
|
@@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
|
|
| 42 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 43 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 44 |
import { RobotsTxtService } from '../services/robots-text';
|
|
|
|
|
|
|
| 45 |
|
| 46 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 47 |
withIframe?: boolean | 'quoted';
|
|
@@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 465 |
const targetUrlFromGet = originPath.slice(1);
|
| 466 |
if (crawlerOptions.pdf) {
|
| 467 |
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
| 468 |
-
url = `
|
| 469 |
} else if (targetUrlFromGet) {
|
| 470 |
url = targetUrlFromGet.trim();
|
| 471 |
} else if (crawlerOptions.url) {
|
|
@@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 495 |
});
|
| 496 |
}
|
| 497 |
|
| 498 |
-
if (!['http:', 'https:', '
|
| 499 |
throw new ParamValidationError({
|
| 500 |
message: `Invalid protocol ${result.protocol}`,
|
| 501 |
path: 'url'
|
| 502 |
});
|
| 503 |
}
|
| 504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
return result;
|
| 506 |
}
|
| 507 |
|
|
|
|
| 42 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 43 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 44 |
import { RobotsTxtService } from '../services/robots-text';
|
| 45 |
+
import { lookup } from 'dns/promises';
|
| 46 |
+
import { isIP } from 'net';
|
| 47 |
|
| 48 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 49 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 467 |
const targetUrlFromGet = originPath.slice(1);
|
| 468 |
if (crawlerOptions.pdf) {
|
| 469 |
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
| 470 |
+
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
|
| 471 |
} else if (targetUrlFromGet) {
|
| 472 |
url = targetUrlFromGet.trim();
|
| 473 |
} else if (crawlerOptions.url) {
|
|
|
|
| 497 |
});
|
| 498 |
}
|
| 499 |
|
| 500 |
+
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
|
| 501 |
throw new ParamValidationError({
|
| 502 |
message: `Invalid protocol ${result.protocol}`,
|
| 503 |
path: 'url'
|
| 504 |
});
|
| 505 |
}
|
| 506 |
|
| 507 |
+
if (!isIP(result.hostname)) {
|
| 508 |
+
await lookup(result.hostname).catch((err) => {
|
| 509 |
+
if (err.code === 'ENOTFOUND') {
|
| 510 |
+
return Promise.reject(new ParamValidationError({
|
| 511 |
+
message: `Domain '${result.hostname}' could not be resolved`,
|
| 512 |
+
path: 'url'
|
| 513 |
+
}));
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
return;
|
| 517 |
+
});
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
return result;
|
| 521 |
}
|
| 522 |
|
src/services/puppeteer.ts
CHANGED
|
@@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
|
|
| 605 |
}
|
| 606 |
|
| 607 |
const parsedUrl = new URL(requestUrl);
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
const tldParsed = tldExtract(requestUrl);
|
| 613 |
domainSet.add(tldParsed.domain);
|
|
|
|
|
|
|
| 614 |
}
|
| 615 |
-
} catch (err) {
|
| 616 |
-
return req.abort('blockedbyclient', 1000);
|
| 617 |
}
|
| 618 |
|
| 619 |
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
|
|
|
| 605 |
}
|
| 606 |
|
| 607 |
const parsedUrl = new URL(requestUrl);
|
| 608 |
+
if (isIP(parsedUrl.hostname)) {
|
| 609 |
+
domainSet.add(parsedUrl.hostname);
|
| 610 |
+
} else {
|
| 611 |
+
try {
|
| 612 |
const tldParsed = tldExtract(requestUrl);
|
| 613 |
domainSet.add(tldParsed.domain);
|
| 614 |
+
} catch (_err) {
|
| 615 |
+
domainSet.add(parsedUrl.hostname);
|
| 616 |
}
|
|
|
|
|
|
|
| 617 |
}
|
| 618 |
|
| 619 |
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|