nomagick committed on
Commit
607407f
·
unverified ·
1 Parent(s): 94170db

fix: pdf detection

Browse files
backend/functions/src/services/puppeteer.ts CHANGED
@@ -11,6 +11,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
11
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
12
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
13
  import { TimeoutError } from 'puppeteer';
 
14
  const tldExtract = require('tld-extract');
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -114,13 +115,6 @@ function briefImgs(elem) {
114
  };
115
  });
116
  }
117
- function briefPDFs() {
118
- const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
119
-
120
- return pdfTags.map((x)=> {
121
- return x.src === 'about:blank' ? document.location.href : x.src;
122
- });
123
- }
124
  function getMaxDepthAndCountUsingTreeWalker(root) {
125
  let maxDepth = 0;
126
  let currentDepth = 0;
@@ -178,7 +172,6 @@ function giveSnapshot(stopActiveSnapshot) {
178
  text: document.body?.innerText,
179
  parsed: parsed,
180
  imgs: [],
181
- pdfs: briefPDFs(),
182
  maxElemDepth: domAnalysis.maxDepth,
183
  elemCount: domAnalysis.elementCount,
184
  };
@@ -324,7 +317,7 @@ export class PuppeteerControl extends AsyncService {
324
  }
325
  t0 ??= Date.now();
326
  const requestUrl = req.url();
327
- if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
328
  return req.abort('blockedbyclient', 1000);
329
  }
330
  const tldParsed = tldExtract(requestUrl);
@@ -469,7 +462,19 @@ document.addEventListener('load', handlePageLoad);
469
  let snapshot: PageSnapshot | undefined;
470
  let screenshot: Buffer | undefined;
471
  let pageshot: Buffer | undefined;
 
472
  const page = await this.getNextPage();
 
 
 
 
 
 
 
 
 
 
 
473
  const sn = this.snMap.get(page);
474
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
475
 
@@ -619,7 +624,7 @@ document.addEventListener('load', handlePageLoad);
619
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
620
  this.emit(
621
  'crawled',
622
- { ...snapshot, screenshot, pageshot },
623
  { ...options, url: parsedUrl }
624
  );
625
  }
@@ -672,7 +677,7 @@ document.addEventListener('load', handlePageLoad);
672
  }
673
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
674
  }
675
- yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
676
  break;
677
  }
678
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
@@ -681,7 +686,7 @@ document.addEventListener('load', handlePageLoad);
681
  lastHTML = snapshot.html;
682
  }
683
  if (snapshot || screenshot) {
684
- yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
685
  }
686
  if (error) {
687
  throw error;
 
11
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
12
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
13
  import { TimeoutError } from 'puppeteer';
14
+ import _ from 'lodash';
15
  const tldExtract = require('tld-extract');
16
 
17
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
 
115
  };
116
  });
117
  }
 
 
 
 
 
 
 
118
  function getMaxDepthAndCountUsingTreeWalker(root) {
119
  let maxDepth = 0;
120
  let currentDepth = 0;
 
172
  text: document.body?.innerText,
173
  parsed: parsed,
174
  imgs: [],
 
175
  maxElemDepth: domAnalysis.maxDepth,
176
  elemCount: domAnalysis.elementCount,
177
  };
 
317
  }
318
  t0 ??= Date.now();
319
  const requestUrl = req.url();
320
+ if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
321
  return req.abort('blockedbyclient', 1000);
322
  }
323
  const tldParsed = tldExtract(requestUrl);
 
462
  let snapshot: PageSnapshot | undefined;
463
  let screenshot: Buffer | undefined;
464
  let pageshot: Buffer | undefined;
465
+ const pdfUrls: string[] = [];
466
  const page = await this.getNextPage();
467
+ page.on('response', (resp) => {
468
+ if (!resp.ok()) {
469
+ return;
470
+ }
471
+ const headers = resp.headers();
472
+ const url = resp.url();
473
+ const contentType = headers['content-type'];
474
+ if (contentType?.toLowerCase().includes('application/pdf')) {
475
+ pdfUrls.push(url);
476
+ }
477
+ });
478
  const sn = this.snMap.get(page);
479
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
480
 
 
624
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
625
  this.emit(
626
  'crawled',
627
+ { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
628
  { ...options, url: parsedUrl }
629
  );
630
  }
 
677
  }
678
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
679
  }
680
+ yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
681
  break;
682
  }
683
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
 
686
  lastHTML = snapshot.html;
687
  }
688
  if (snapshot || screenshot) {
689
+ yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
690
  }
691
  if (error) {
692
  throw error;
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 4af413f68207157d099ee99a2c056298b833dcd1
 
1
+ Subproject commit d2b0fbf184b4c77e80e8d1dd36b3f4d1807e0e09