nomagick committed on
Commit
e2a187d
·
unverified ·
1 Parent(s): 67d4a9f

fix: crawling IP url

Browse files
backend/functions/src/services/puppeteer.ts CHANGED
@@ -13,6 +13,7 @@ import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
  import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
14
  import { TimeoutError } from 'puppeteer';
15
  import _ from 'lodash';
 
16
  const tldExtract = require('tld-extract');
17
 
18
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -570,15 +571,19 @@ export class PuppeteerControl extends AsyncService {
570
  if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
571
  return req.abort('blockedbyclient', 1000);
572
  }
 
 
573
  try {
574
- const tldParsed = tldExtract(requestUrl);
575
- domainSet.add(tldParsed.domain);
 
 
 
 
576
  } catch (err) {
577
  return req.abort('blockedbyclient', 1000);
578
  }
579
 
580
- const parsedUrl = new URL(requestUrl);
581
-
582
  if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
583
  page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
584
  return req.abort('blockedbyclient', 1000);
 
13
  import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
14
  import { TimeoutError } from 'puppeteer';
15
  import _ from 'lodash';
16
+ import { isIP } from 'net';
17
  const tldExtract = require('tld-extract');
18
 
19
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
 
571
  if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
572
  return req.abort('blockedbyclient', 1000);
573
  }
574
+
575
+ const parsedUrl = new URL(requestUrl);
576
  try {
577
+ if (isIP(parsedUrl.hostname)) {
578
+ domainSet.add(parsedUrl.hostname);
579
+ } else {
580
+ const tldParsed = tldExtract(requestUrl);
581
+ domainSet.add(tldParsed.domain);
582
+ }
583
  } catch (err) {
584
  return req.abort('blockedbyclient', 1000);
585
  }
586
 
 
 
587
  if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
588
  page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
589
  return req.abort('blockedbyclient', 1000);