nomagick committed on
Commit
4ca627c
·
unverified ·
1 Parent(s): 4830ff5

fix: guard invalid domain names

Browse files
package-lock.json CHANGED
@@ -17,7 +17,7 @@
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
- "civkit": "^0.8.4-6ed9027",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
@@ -4095,9 +4095,9 @@
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
- "version": "0.8.4-6ed9027",
4099
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
4100
- "integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
 
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.8.4-bc8ef5e",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
+ "version": "0.8.4-bc8ef5e",
4099
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
4100
+ "integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
package.json CHANGED
@@ -25,7 +25,7 @@
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
- "civkit": "^0.8.4-6ed9027",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
 
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
+ "civkit": "^0.8.4-bc8ef5e",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
src/api/crawler.ts CHANGED
@@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
42
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
43
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
44
  import { RobotsTxtService } from '../services/robots-text';
 
 
45
 
46
  export interface ExtraScrappingOptions extends ScrappingOptions {
47
  withIframe?: boolean | 'quoted';
@@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
465
  const targetUrlFromGet = originPath.slice(1);
466
  if (crawlerOptions.pdf) {
467
  const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
468
- url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
469
  } else if (targetUrlFromGet) {
470
  url = targetUrlFromGet.trim();
471
  } else if (crawlerOptions.url) {
@@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
495
  });
496
  }
497
 
498
- if (!['http:', 'https:', 'file:'].includes(result.protocol)) {
499
  throw new ParamValidationError({
500
  message: `Invalid protocol ${result.protocol}`,
501
  path: 'url'
502
  });
503
  }
504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
505
  return result;
506
  }
507
 
 
42
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
43
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
44
  import { RobotsTxtService } from '../services/robots-text';
45
+ import { lookup } from 'dns/promises';
46
+ import { isIP } from 'net';
47
 
48
  export interface ExtraScrappingOptions extends ScrappingOptions {
49
  withIframe?: boolean | 'quoted';
 
467
  const targetUrlFromGet = originPath.slice(1);
468
  if (crawlerOptions.pdf) {
469
  const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
470
+ url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
471
  } else if (targetUrlFromGet) {
472
  url = targetUrlFromGet.trim();
473
  } else if (crawlerOptions.url) {
 
497
  });
498
  }
499
 
500
+ if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
501
  throw new ParamValidationError({
502
  message: `Invalid protocol ${result.protocol}`,
503
  path: 'url'
504
  });
505
  }
506
 
507
+ if (!isIP(result.hostname)) {
508
+ await lookup(result.hostname).catch((err) => {
509
+ if (err.code === 'ENOTFOUND') {
510
+ return Promise.reject(new ParamValidationError({
511
+ message: `Domain '${result.hostname}' could not be resolved`,
512
+ path: 'url'
513
+ }));
514
+ }
515
+
516
+ return;
517
+ });
518
+ }
519
+
520
  return result;
521
  }
522
 
src/services/puppeteer.ts CHANGED
@@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
605
  }
606
 
607
  const parsedUrl = new URL(requestUrl);
608
- try {
609
- if (isIP(parsedUrl.hostname)) {
610
- domainSet.add(parsedUrl.hostname);
611
- } else {
612
  const tldParsed = tldExtract(requestUrl);
613
  domainSet.add(tldParsed.domain);
 
 
614
  }
615
- } catch (err) {
616
- return req.abort('blockedbyclient', 1000);
617
  }
618
 
619
  if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
 
605
  }
606
 
607
  const parsedUrl = new URL(requestUrl);
608
+ if (isIP(parsedUrl.hostname)) {
609
+ domainSet.add(parsedUrl.hostname);
610
+ } else {
611
+ try {
612
  const tldParsed = tldExtract(requestUrl);
613
  domainSet.add(tldParsed.domain);
614
+ } catch (_err) {
615
+ domainSet.add(parsedUrl.hostname);
616
  }
 
 
617
  }
618
 
619
  if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {