Spaces:
Build error
Build error
fix: pdf detection
Browse files
- backend/functions/src/services/puppeteer.ts +17 -12
- thinapps-shared +1 -1
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -11,6 +11,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
| 11 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
|
|
|
| 14 |
const tldExtract = require('tld-extract');
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
@@ -114,13 +115,6 @@ function briefImgs(elem) {
|
|
| 114 |
};
|
| 115 |
});
|
| 116 |
}
|
| 117 |
-
function briefPDFs() {
|
| 118 |
-
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
| 119 |
-
|
| 120 |
-
return pdfTags.map((x)=> {
|
| 121 |
-
return x.src === 'about:blank' ? document.location.href : x.src;
|
| 122 |
-
});
|
| 123 |
-
}
|
| 124 |
function getMaxDepthAndCountUsingTreeWalker(root) {
|
| 125 |
let maxDepth = 0;
|
| 126 |
let currentDepth = 0;
|
|
@@ -178,7 +172,6 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 178 |
text: document.body?.innerText,
|
| 179 |
parsed: parsed,
|
| 180 |
imgs: [],
|
| 181 |
-
pdfs: briefPDFs(),
|
| 182 |
maxElemDepth: domAnalysis.maxDepth,
|
| 183 |
elemCount: domAnalysis.elementCount,
|
| 184 |
};
|
|
@@ -324,7 +317,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 324 |
}
|
| 325 |
t0 ??= Date.now();
|
| 326 |
const requestUrl = req.url();
|
| 327 |
-
if (!requestUrl.startsWith(
|
| 328 |
return req.abort('blockedbyclient', 1000);
|
| 329 |
}
|
| 330 |
const tldParsed = tldExtract(requestUrl);
|
|
@@ -469,7 +462,19 @@ document.addEventListener('load', handlePageLoad);
|
|
| 469 |
let snapshot: PageSnapshot | undefined;
|
| 470 |
let screenshot: Buffer | undefined;
|
| 471 |
let pageshot: Buffer | undefined;
|
|
|
|
| 472 |
const page = await this.getNextPage();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
const sn = this.snMap.get(page);
|
| 474 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 475 |
|
|
@@ -619,7 +624,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 619 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 620 |
this.emit(
|
| 621 |
'crawled',
|
| 622 |
-
{ ...snapshot, screenshot, pageshot },
|
| 623 |
{ ...options, url: parsedUrl }
|
| 624 |
);
|
| 625 |
}
|
|
@@ -672,7 +677,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 672 |
}
|
| 673 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 674 |
}
|
| 675 |
-
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
| 676 |
break;
|
| 677 |
}
|
| 678 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
|
@@ -681,7 +686,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 681 |
lastHTML = snapshot.html;
|
| 682 |
}
|
| 683 |
if (snapshot || screenshot) {
|
| 684 |
-
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
| 685 |
}
|
| 686 |
if (error) {
|
| 687 |
throw error;
|
|
|
|
| 11 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
| 14 |
+
import _ from 'lodash';
|
| 15 |
const tldExtract = require('tld-extract');
|
| 16 |
|
| 17 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
|
|
| 115 |
};
|
| 116 |
});
|
| 117 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
function getMaxDepthAndCountUsingTreeWalker(root) {
|
| 119 |
let maxDepth = 0;
|
| 120 |
let currentDepth = 0;
|
|
|
|
| 172 |
text: document.body?.innerText,
|
| 173 |
parsed: parsed,
|
| 174 |
imgs: [],
|
|
|
|
| 175 |
maxElemDepth: domAnalysis.maxDepth,
|
| 176 |
elemCount: domAnalysis.elementCount,
|
| 177 |
};
|
|
|
|
| 317 |
}
|
| 318 |
t0 ??= Date.now();
|
| 319 |
const requestUrl = req.url();
|
| 320 |
+
if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
|
| 321 |
return req.abort('blockedbyclient', 1000);
|
| 322 |
}
|
| 323 |
const tldParsed = tldExtract(requestUrl);
|
|
|
|
| 462 |
let snapshot: PageSnapshot | undefined;
|
| 463 |
let screenshot: Buffer | undefined;
|
| 464 |
let pageshot: Buffer | undefined;
|
| 465 |
+
const pdfUrls: string[] = [];
|
| 466 |
const page = await this.getNextPage();
|
| 467 |
+
page.on('response', (resp) => {
|
| 468 |
+
if (!resp.ok()) {
|
| 469 |
+
return;
|
| 470 |
+
}
|
| 471 |
+
const headers = resp.headers();
|
| 472 |
+
const url = resp.url();
|
| 473 |
+
const contentType = headers['content-type'];
|
| 474 |
+
if (contentType?.toLowerCase().includes('application/pdf')) {
|
| 475 |
+
pdfUrls.push(url);
|
| 476 |
+
}
|
| 477 |
+
});
|
| 478 |
const sn = this.snMap.get(page);
|
| 479 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 480 |
|
|
|
|
| 624 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 625 |
this.emit(
|
| 626 |
'crawled',
|
| 627 |
+
{ ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
|
| 628 |
{ ...options, url: parsedUrl }
|
| 629 |
);
|
| 630 |
}
|
|
|
|
| 677 |
}
|
| 678 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 679 |
}
|
| 680 |
+
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
| 681 |
break;
|
| 682 |
}
|
| 683 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
|
|
|
| 686 |
lastHTML = snapshot.html;
|
| 687 |
}
|
| 688 |
if (snapshot || screenshot) {
|
| 689 |
+
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
| 690 |
}
|
| 691 |
if (error) {
|
| 692 |
throw error;
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit d2b0fbf184b4c77e80e8d1dd36b3f4d1807e0e09
|