Spaces:
Build error
Build error
feat: with-iframe and full-page screenshot
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -28,6 +28,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
|
| 28 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 29 |
|
| 30 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
|
|
| 31 |
targetSelector?: string | string[];
|
| 32 |
removeSelector?: string | string[];
|
| 33 |
keepImgDataUrl?: boolean;
|
|
@@ -907,7 +908,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 907 |
}
|
| 908 |
|
| 909 |
try {
|
| 910 |
-
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
|
| 911 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 912 |
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
| 913 |
}
|
|
@@ -1011,6 +1012,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1011 |
waitForSelector: opts.waitForSelector,
|
| 1012 |
overrideUserAgent: opts.userAgent,
|
| 1013 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
|
|
|
| 1014 |
};
|
| 1015 |
|
| 1016 |
return crawlOpts;
|
|
|
|
| 28 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 29 |
|
| 30 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 31 |
+
withIframe?: boolean;
|
| 32 |
targetSelector?: string | string[];
|
| 33 |
removeSelector?: string | string[];
|
| 34 |
keepImgDataUrl?: boolean;
|
|
|
|
| 908 |
}
|
| 909 |
|
| 910 |
try {
|
| 911 |
+
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
| 912 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 913 |
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
| 914 |
}
|
|
|
|
| 1012 |
waitForSelector: opts.waitForSelector,
|
| 1013 |
overrideUserAgent: opts.userAgent,
|
| 1014 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 1015 |
+
withIframe: opts.withIframe,
|
| 1016 |
};
|
| 1017 |
|
| 1018 |
return crawlOpts;
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -164,6 +164,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 164 |
})
|
| 165 |
keepImgDataUrl!: boolean;
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
@Prop({
|
| 168 |
arrayOf: String,
|
| 169 |
})
|
|
@@ -238,6 +243,13 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 238 |
if (keepImgDataUrl !== undefined) {
|
| 239 |
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
| 240 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
const cookies: CookieParam[] = [];
|
| 243 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
|
|
|
| 164 |
})
|
| 165 |
keepImgDataUrl!: boolean;
|
| 166 |
|
| 167 |
+
@Prop({
|
| 168 |
+
default: false,
|
| 169 |
+
})
|
| 170 |
+
withIframe!: boolean;
|
| 171 |
+
|
| 172 |
@Prop({
|
| 173 |
arrayOf: String,
|
| 174 |
})
|
|
|
|
| 243 |
if (keepImgDataUrl !== undefined) {
|
| 244 |
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
| 245 |
}
|
| 246 |
+
const withIframe = ctx?.req.get('x-with-iframe');
|
| 247 |
+
if (withIframe !== undefined) {
|
| 248 |
+
instance.withIframe = Boolean(withIframe);
|
| 249 |
+
}
|
| 250 |
+
if (instance.withIframe) {
|
| 251 |
+
instance.timeout ??= null;
|
| 252 |
+
}
|
| 253 |
|
| 254 |
const cookies: CookieParam[] = [];
|
| 255 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -54,6 +54,7 @@ export interface PageSnapshot {
|
|
| 54 |
imgs?: ImgBrief[];
|
| 55 |
pdfs?: string[];
|
| 56 |
maxElemDepth?: number;
|
|
|
|
| 57 |
}
|
| 58 |
|
| 59 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
@@ -88,6 +89,100 @@ puppeteer.use(puppeteerPageProxy({
|
|
| 88 |
interceptResolutionPriority: 1,
|
| 89 |
}));
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
@singleton()
|
| 92 |
export class PuppeteerControl extends AsyncService {
|
| 93 |
|
|
@@ -206,98 +301,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 206 |
}
|
| 207 |
page.emit('snapshot', snapshot);
|
| 208 |
}));
|
| 209 |
-
preparations.push(page.evaluateOnNewDocument(
|
| 210 |
-
preparations.push(page.evaluateOnNewDocument(`
|
| 211 |
-
function briefImgs(elem) {
|
| 212 |
-
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
| 213 |
-
|
| 214 |
-
return imageTags.map((x)=> {
|
| 215 |
-
let linkPreferredSrc = x.src;
|
| 216 |
-
if (linkPreferredSrc.startsWith('data:')) {
|
| 217 |
-
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
|
| 218 |
-
linkPreferredSrc = x.dataset.src;
|
| 219 |
-
}
|
| 220 |
-
}
|
| 221 |
-
|
| 222 |
-
return {
|
| 223 |
-
src: new URL(linkPreferredSrc, document.location.href).toString(),
|
| 224 |
-
loaded: x.complete,
|
| 225 |
-
width: x.width,
|
| 226 |
-
height: x.height,
|
| 227 |
-
naturalWidth: x.naturalWidth,
|
| 228 |
-
naturalHeight: x.naturalHeight,
|
| 229 |
-
alt: x.alt || x.title,
|
| 230 |
-
};
|
| 231 |
-
});
|
| 232 |
-
}
|
| 233 |
-
function briefPDFs() {
|
| 234 |
-
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
| 235 |
-
|
| 236 |
-
return pdfTags.map((x)=> {
|
| 237 |
-
return x.src === 'about:blank' ? document.location.href : x.src;
|
| 238 |
-
});
|
| 239 |
-
}
|
| 240 |
-
function getMaxDepthUsingTreeWalker(root) {
|
| 241 |
-
let maxDepth = 0;
|
| 242 |
-
let currentDepth = 0;
|
| 243 |
-
|
| 244 |
-
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
|
| 245 |
-
|
| 246 |
-
while (true) {
|
| 247 |
-
maxDepth = Math.max(maxDepth, currentDepth);
|
| 248 |
-
|
| 249 |
-
if (treeWalker.firstChild()) {
|
| 250 |
-
currentDepth++;
|
| 251 |
-
} else {
|
| 252 |
-
while (!treeWalker.nextSibling() && currentDepth > 0) {
|
| 253 |
-
treeWalker.parentNode();
|
| 254 |
-
currentDepth--;
|
| 255 |
-
}
|
| 256 |
-
|
| 257 |
-
if (currentDepth <= 0) {
|
| 258 |
-
break;
|
| 259 |
-
}
|
| 260 |
-
}
|
| 261 |
-
}
|
| 262 |
-
|
| 263 |
-
return maxDepth + 1;
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
function giveSnapshot(stopActiveSnapshot) {
|
| 267 |
-
if (stopActiveSnapshot) {
|
| 268 |
-
window.haltSnapshot = true;
|
| 269 |
-
}
|
| 270 |
-
let parsed;
|
| 271 |
-
try {
|
| 272 |
-
parsed = new Readability(document.cloneNode(true)).parse();
|
| 273 |
-
} catch (err) {
|
| 274 |
-
void 0;
|
| 275 |
-
}
|
| 276 |
-
|
| 277 |
-
const r = {
|
| 278 |
-
title: document.title,
|
| 279 |
-
href: document.location.href,
|
| 280 |
-
html: document.documentElement?.outerHTML,
|
| 281 |
-
text: document.body?.innerText,
|
| 282 |
-
parsed: parsed,
|
| 283 |
-
imgs: [],
|
| 284 |
-
pdfs: briefPDFs(),
|
| 285 |
-
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
|
| 286 |
-
};
|
| 287 |
-
if (parsed && parsed.content) {
|
| 288 |
-
const elem = document.createElement('div');
|
| 289 |
-
elem.innerHTML = parsed.content;
|
| 290 |
-
r.imgs = briefImgs(elem);
|
| 291 |
-
} else {
|
| 292 |
-
const allImgs = briefImgs();
|
| 293 |
-
if (allImgs.length === 1) {
|
| 294 |
-
r.imgs = allImgs;
|
| 295 |
-
}
|
| 296 |
-
}
|
| 297 |
-
|
| 298 |
-
return r;
|
| 299 |
-
}
|
| 300 |
-
`));
|
| 301 |
preparations.push(page.setRequestInterception(true));
|
| 302 |
|
| 303 |
await Promise.all(preparations);
|
|
@@ -523,8 +527,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 523 |
}
|
| 524 |
}
|
| 525 |
try {
|
|
|
|
| 526 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 527 |
-
screenshot = await page.screenshot();
|
|
|
|
|
|
|
|
|
|
| 528 |
} catch (err: any) {
|
| 529 |
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
| 530 |
if (stuff instanceof Error) {
|
|
@@ -542,8 +550,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 542 |
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
| 543 |
const salvaged = await this.salvage(url, page);
|
| 544 |
if (salvaged) {
|
|
|
|
| 545 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 546 |
-
screenshot = await page.screenshot();
|
|
|
|
|
|
|
|
|
|
| 547 |
}
|
| 548 |
}
|
| 549 |
} catch (err: any) {
|
|
@@ -572,8 +584,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 572 |
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
| 573 |
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
| 574 |
.then(async () => {
|
|
|
|
| 575 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 576 |
-
screenshot = await page.screenshot();
|
|
|
|
|
|
|
|
|
|
| 577 |
finalized = true;
|
| 578 |
})
|
| 579 |
.catch((err) => {
|
|
@@ -607,7 +623,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 607 |
break;
|
| 608 |
}
|
| 609 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 610 |
-
screenshot = await page.screenshot();
|
| 611 |
lastHTML = snapshot.html;
|
| 612 |
}
|
| 613 |
if (snapshot || screenshot) {
|
|
@@ -649,9 +665,30 @@ document.addEventListener('load', handlePageLoad);
|
|
| 649 |
return true;
|
| 650 |
}
|
| 651 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 653 |
targetSelector?: string | string[];
|
| 654 |
removeSelector?: string | string[];
|
|
|
|
| 655 |
}): PageSnapshot | undefined {
|
| 656 |
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
| 657 |
return snapshot;
|
|
@@ -662,9 +699,25 @@ document.addEventListener('load', handlePageLoad);
|
|
| 662 |
|
| 663 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 664 |
const allNodes: Node[] = [];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
|
| 666 |
if (Array.isArray(options?.removeSelector)) {
|
| 667 |
-
for (const rl of options.removeSelector) {
|
| 668 |
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 669 |
}
|
| 670 |
} else if (options?.removeSelector) {
|
|
@@ -672,7 +725,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 672 |
}
|
| 673 |
|
| 674 |
if (Array.isArray(options?.targetSelector)) {
|
| 675 |
-
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 676 |
x.forEach((el) => {
|
| 677 |
if (!allNodes.includes(el)) {
|
| 678 |
allNodes.push(el);
|
|
|
|
| 54 |
imgs?: ImgBrief[];
|
| 55 |
pdfs?: string[];
|
| 56 |
maxElemDepth?: number;
|
| 57 |
+
childFrames?: PageSnapshot[];
|
| 58 |
}
|
| 59 |
|
| 60 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
|
|
| 89 |
interceptResolutionPriority: 1,
|
| 90 |
}));
|
| 91 |
|
| 92 |
+
/**
 * Browser-side helper script.
 *
 * The string is handed to `page.evaluateOnNewDocument()` and to
 * `frame.evaluate()`, so everything inside it executes in the page context,
 * not in Node. It bundles the Readability source (`READABILITY_JS`) and
 * defines four global functions:
 *
 *  - `briefImgs(elem?)`   summarises `<img>` elements under `elem`
 *                         (or the whole document when omitted).
 *  - `briefPDFs()`        lists the sources of embedded PDF viewers.
 *  - `getMaxDepthUsingTreeWalker(root)` returns the deepest element nesting
 *                         level below `root`, counting `root` as level 1.
 *  - `giveSnapshot(stopActiveSnapshot?)` builds the serialisable snapshot
 *                         object (title, href, html, text, parsed, imgs,
 *                         pdfs, maxElemDepth).
 *
 * NOTE: the body is a template literal; do not use backticks or `${` inside
 * the script text itself.
 */
const SCRIPT_TO_INJECT_INTO_FRAME = `
${READABILITY_JS}

function briefImgs(elem) {
    const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));

    return imageTags.map((x)=> {
        let linkPreferredSrc = x.src;
        // Lazy-loaded images either carry a data: placeholder in src or no src
        // at all (x.src is then an empty string). In both cases prefer data-src,
        // otherwise an empty src would resolve to the page URL itself.
        if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
            if (typeof x.dataset?.src === 'string' && x.dataset.src && !x.dataset.src.startsWith('data:')) {
                linkPreferredSrc = x.dataset.src;
            }
        }

        let resolvedSrc = linkPreferredSrc;
        try {
            resolvedSrc = new URL(linkPreferredSrc, document.location.href).toString();
        } catch (err) {
            // Malformed URL: keep the raw value instead of aborting the whole snapshot.
            void 0;
        }

        return {
            src: resolvedSrc,
            loaded: x.complete,
            width: x.width,
            height: x.height,
            naturalWidth: x.naturalWidth,
            naturalHeight: x.naturalHeight,
            alt: x.alt || x.title,
        };
    });
}
function briefPDFs() {
    const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));

    return pdfTags.map((x)=> {
        // An embed pointing at about:blank is reported as the document's own URL.
        return x.src === 'about:blank' ? document.location.href : x.src;
    });
}
function getMaxDepthUsingTreeWalker(root) {
    let maxDepth = 0;
    let currentDepth = 0;

    const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);

    while (true) {
        maxDepth = Math.max(maxDepth, currentDepth);

        if (treeWalker.firstChild()) {
            currentDepth++;
        } else {
            // No children: climb until a next sibling exists or we are back at the root.
            while (!treeWalker.nextSibling() && currentDepth > 0) {
                treeWalker.parentNode();
                currentDepth--;
            }

            if (currentDepth <= 0) {
                break;
            }
        }
    }

    return maxDepth + 1;
}

function giveSnapshot(stopActiveSnapshot) {
    if (stopActiveSnapshot) {
        window.haltSnapshot = true;
    }
    let parsed;
    try {
        // Readability mutates the document it is given, hence the deep clone.
        parsed = new Readability(document.cloneNode(true)).parse();
    } catch (err) {
        // Best effort: a snapshot without parsed content is still returned.
        void 0;
    }

    const r = {
        title: document.title,
        href: document.location.href,
        html: document.documentElement?.outerHTML,
        text: document.body?.innerText,
        parsed: parsed,
        imgs: [],
        pdfs: briefPDFs(),
        maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
    };
    if (parsed && parsed.content) {
        // Only report images that survived Readability extraction.
        const elem = document.createElement('div');
        elem.innerHTML = parsed.content;
        r.imgs = briefImgs(elem);
    } else {
        // Without parsed content, images are reported only when the page holds exactly one.
        const allImgs = briefImgs();
        if (allImgs.length === 1) {
            r.imgs = allImgs;
        }
    }

    return r;
}
`;
|
| 185 |
+
|
| 186 |
@singleton()
|
| 187 |
export class PuppeteerControl extends AsyncService {
|
| 188 |
|
|
|
|
| 301 |
}
|
| 302 |
page.emit('snapshot', snapshot);
|
| 303 |
}));
|
| 304 |
+
preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
preparations.push(page.setRequestInterception(true));
|
| 306 |
|
| 307 |
await Promise.all(preparations);
|
|
|
|
| 527 |
}
|
| 528 |
}
|
| 529 |
try {
|
| 530 |
+
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 531 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 532 |
+
screenshot = await page.screenshot({ fullPage: true });
|
| 533 |
+
if (snapshot) {
|
| 534 |
+
snapshot.childFrames = await pSubFrameSnapshots;
|
| 535 |
+
}
|
| 536 |
} catch (err: any) {
|
| 537 |
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
| 538 |
if (stuff instanceof Error) {
|
|
|
|
| 550 |
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
| 551 |
const salvaged = await this.salvage(url, page);
|
| 552 |
if (salvaged) {
|
| 553 |
+
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 554 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 555 |
+
screenshot = await page.screenshot({ fullPage: true });
|
| 556 |
+
if (snapshot) {
|
| 557 |
+
snapshot.childFrames = await pSubFrameSnapshots;
|
| 558 |
+
}
|
| 559 |
}
|
| 560 |
}
|
| 561 |
} catch (err: any) {
|
|
|
|
| 584 |
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
| 585 |
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
| 586 |
.then(async () => {
|
| 587 |
+
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 588 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 589 |
+
screenshot = await page.screenshot({ fullPage: true });
|
| 590 |
+
if (snapshot) {
|
| 591 |
+
snapshot.childFrames = await pSubFrameSnapshots;
|
| 592 |
+
}
|
| 593 |
finalized = true;
|
| 594 |
})
|
| 595 |
.catch((err) => {
|
|
|
|
| 623 |
break;
|
| 624 |
}
|
| 625 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 626 |
+
screenshot = await page.screenshot({ fullPage: true });
|
| 627 |
lastHTML = snapshot.html;
|
| 628 |
}
|
| 629 |
if (snapshot || screenshot) {
|
|
|
|
| 665 |
return true;
|
| 666 |
}
|
| 667 |
|
| 668 |
+
/**
 * Snapshots every direct child frame of the page's main frame.
 *
 * Frames with an empty URL or `about:blank` are skipped. For each remaining
 * frame the helper script is evaluated and `giveSnapshot()` is called inside
 * the frame. A frame that fails is logged and left out, so one broken iframe
 * never rejects the whole call. Only `mainFrame().childFrames()` is visited;
 * frames nested deeper are not.
 *
 * @param page Puppeteer page whose child frames are captured.
 * @returns Snapshots of the frames that could be captured; possibly empty.
 */
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
    const childFrames = page.mainFrame().childFrames();
    const results = await Promise.all(childFrames.map(async (frame): Promise<PageSnapshot | undefined> => {
        const thisUrl = frame.url();
        if (!thisUrl || thisUrl === 'about:blank') {
            return undefined;
        }
        try {
            await frame.evaluate(SCRIPT_TO_INJECT_INTO_FRAME);

            return await frame.evaluate(`giveSnapshot()`) as PageSnapshot;
        } catch (err: any) {
            // Marshal the error like the other log sites in this file do.
            this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err: marshalErrorLike(err) });
            return undefined;
        }
    }));

    // Type-guard filter: drops skipped/failed frames and narrows the element type.
    return results.filter((x): x is PageSnapshot => Boolean(x));
}
|
| 687 |
+
|
| 688 |
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 689 |
targetSelector?: string | string[];
|
| 690 |
removeSelector?: string | string[];
|
| 691 |
+
withIframe?: boolean;
|
| 692 |
}): PageSnapshot | undefined {
|
| 693 |
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
| 694 |
return snapshot;
|
|
|
|
| 699 |
|
| 700 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 701 |
const allNodes: Node[] = [];
|
| 702 |
+
if (options?.withIframe) {
|
| 703 |
+
jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
|
| 704 |
+
const src = x.getAttribute('src');
|
| 705 |
+
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
|
| 706 |
+
if (thisSnapshot?.html) {
|
| 707 |
+
x.innerHTML = thisSnapshot.html;
|
| 708 |
+
x.querySelectorAll('script, style').forEach((s) => s.remove());
|
| 709 |
+
x.querySelectorAll('[src]').forEach((el) => {
|
| 710 |
+
el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
|
| 711 |
+
});
|
| 712 |
+
x.querySelectorAll('[href]').forEach((el) => {
|
| 713 |
+
el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
|
| 714 |
+
});
|
| 715 |
+
}
|
| 716 |
+
});
|
| 717 |
+
}
|
| 718 |
|
| 719 |
if (Array.isArray(options?.removeSelector)) {
|
| 720 |
+
for (const rl of options!.removeSelector) {
|
| 721 |
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 722 |
}
|
| 723 |
} else if (options?.removeSelector) {
|
|
|
|
| 725 |
}
|
| 726 |
|
| 727 |
if (Array.isArray(options?.targetSelector)) {
|
| 728 |
+
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 729 |
x.forEach((el) => {
|
| 730 |
if (!allNodes.includes(el)) {
|
| 731 |
allNodes.push(el);
|