nomagick committed on
Commit
77c8480
·
unverified ·
1 Parent(s): e4d46e7

feat: with-iframe and full-page screenshot

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -28,6 +28,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
28
  const md5Hasher = new HashManager('md5', 'hex');
29
 
30
  export interface ExtraScrappingOptions extends ScrappingOptions {
 
31
  targetSelector?: string | string[];
32
  removeSelector?: string | string[];
33
  keepImgDataUrl?: boolean;
@@ -907,7 +908,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
907
  }
908
 
909
  try {
910
- if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
911
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
912
  yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
913
  }
@@ -1011,6 +1012,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
1011
  waitForSelector: opts.waitForSelector,
1012
  overrideUserAgent: opts.userAgent,
1013
  timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
 
1014
  };
1015
 
1016
  return crawlOpts;
 
28
  const md5Hasher = new HashManager('md5', 'hex');
29
 
30
  export interface ExtraScrappingOptions extends ScrappingOptions {
31
+ withIframe?: boolean;
32
  targetSelector?: string | string[];
33
  removeSelector?: string | string[];
34
  keepImgDataUrl?: boolean;
 
908
  }
909
 
910
  try {
911
+ if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
912
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
913
  yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
914
  }
 
1012
  waitForSelector: opts.waitForSelector,
1013
  overrideUserAgent: opts.userAgent,
1014
  timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
1015
+ withIframe: opts.withIframe,
1016
  };
1017
 
1018
  return crawlOpts;
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -164,6 +164,11 @@ export class CrawlerOptions extends AutoCastable {
164
  })
165
  keepImgDataUrl!: boolean;
166
 
 
 
 
 
 
167
  @Prop({
168
  arrayOf: String,
169
  })
@@ -238,6 +243,13 @@ export class CrawlerOptions extends AutoCastable {
238
  if (keepImgDataUrl !== undefined) {
239
  instance.keepImgDataUrl = Boolean(keepImgDataUrl);
240
  }
 
 
 
 
 
 
 
241
 
242
  const cookies: CookieParam[] = [];
243
  const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
 
164
  })
165
  keepImgDataUrl!: boolean;
166
 
167
+ @Prop({
168
+ default: false,
169
+ })
170
+ withIframe!: boolean;
171
+
172
  @Prop({
173
  arrayOf: String,
174
  })
 
243
  if (keepImgDataUrl !== undefined) {
244
  instance.keepImgDataUrl = Boolean(keepImgDataUrl);
245
  }
246
+ const withIframe = ctx?.req.get('x-with-iframe');
247
+ if (withIframe !== undefined) {
248
+ instance.withIframe = Boolean(withIframe);
249
+ }
250
+ if (instance.withIframe) {
251
+ instance.timeout ??= null;
252
+ }
253
 
254
  const cookies: CookieParam[] = [];
255
  const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
backend/functions/src/services/puppeteer.ts CHANGED
@@ -54,6 +54,7 @@ export interface PageSnapshot {
54
  imgs?: ImgBrief[];
55
  pdfs?: string[];
56
  maxElemDepth?: number;
 
57
  }
58
 
59
  export interface ExtendedSnapshot extends PageSnapshot {
@@ -88,6 +89,100 @@ puppeteer.use(puppeteerPageProxy({
88
  interceptResolutionPriority: 1,
89
  }));
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  @singleton()
92
  export class PuppeteerControl extends AsyncService {
93
 
@@ -206,98 +301,7 @@ export class PuppeteerControl extends AsyncService {
206
  }
207
  page.emit('snapshot', snapshot);
208
  }));
209
- preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
210
- preparations.push(page.evaluateOnNewDocument(`
211
- function briefImgs(elem) {
212
- const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
213
-
214
- return imageTags.map((x)=> {
215
- let linkPreferredSrc = x.src;
216
- if (linkPreferredSrc.startsWith('data:')) {
217
- if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
218
- linkPreferredSrc = x.dataset.src;
219
- }
220
- }
221
-
222
- return {
223
- src: new URL(linkPreferredSrc, document.location.href).toString(),
224
- loaded: x.complete,
225
- width: x.width,
226
- height: x.height,
227
- naturalWidth: x.naturalWidth,
228
- naturalHeight: x.naturalHeight,
229
- alt: x.alt || x.title,
230
- };
231
- });
232
- }
233
- function briefPDFs() {
234
- const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
235
-
236
- return pdfTags.map((x)=> {
237
- return x.src === 'about:blank' ? document.location.href : x.src;
238
- });
239
- }
240
- function getMaxDepthUsingTreeWalker(root) {
241
- let maxDepth = 0;
242
- let currentDepth = 0;
243
-
244
- const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
245
-
246
- while (true) {
247
- maxDepth = Math.max(maxDepth, currentDepth);
248
-
249
- if (treeWalker.firstChild()) {
250
- currentDepth++;
251
- } else {
252
- while (!treeWalker.nextSibling() && currentDepth > 0) {
253
- treeWalker.parentNode();
254
- currentDepth--;
255
- }
256
-
257
- if (currentDepth <= 0) {
258
- break;
259
- }
260
- }
261
- }
262
-
263
- return maxDepth + 1;
264
- }
265
-
266
- function giveSnapshot(stopActiveSnapshot) {
267
- if (stopActiveSnapshot) {
268
- window.haltSnapshot = true;
269
- }
270
- let parsed;
271
- try {
272
- parsed = new Readability(document.cloneNode(true)).parse();
273
- } catch (err) {
274
- void 0;
275
- }
276
-
277
- const r = {
278
- title: document.title,
279
- href: document.location.href,
280
- html: document.documentElement?.outerHTML,
281
- text: document.body?.innerText,
282
- parsed: parsed,
283
- imgs: [],
284
- pdfs: briefPDFs(),
285
- maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
286
- };
287
- if (parsed && parsed.content) {
288
- const elem = document.createElement('div');
289
- elem.innerHTML = parsed.content;
290
- r.imgs = briefImgs(elem);
291
- } else {
292
- const allImgs = briefImgs();
293
- if (allImgs.length === 1) {
294
- r.imgs = allImgs;
295
- }
296
- }
297
-
298
- return r;
299
- }
300
- `));
301
  preparations.push(page.setRequestInterception(true));
302
 
303
  await Promise.all(preparations);
@@ -523,8 +527,12 @@ document.addEventListener('load', handlePageLoad);
523
  }
524
  }
525
  try {
 
526
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
527
- screenshot = await page.screenshot();
 
 
 
528
  } catch (err: any) {
529
  this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
530
  if (stuff instanceof Error) {
@@ -542,8 +550,12 @@ document.addEventListener('load', handlePageLoad);
542
  if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
543
  const salvaged = await this.salvage(url, page);
544
  if (salvaged) {
 
545
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
546
- screenshot = await page.screenshot();
 
 
 
547
  }
548
  }
549
  } catch (err: any) {
@@ -572,8 +584,12 @@ document.addEventListener('load', handlePageLoad);
572
  Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
573
  page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
574
  .then(async () => {
 
575
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
576
- screenshot = await page.screenshot();
 
 
 
577
  finalized = true;
578
  })
579
  .catch((err) => {
@@ -607,7 +623,7 @@ document.addEventListener('load', handlePageLoad);
607
  break;
608
  }
609
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
610
- screenshot = await page.screenshot();
611
  lastHTML = snapshot.html;
612
  }
613
  if (snapshot || screenshot) {
@@ -649,9 +665,30 @@ document.addEventListener('load', handlePageLoad);
649
  return true;
650
  }
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
653
  targetSelector?: string | string[];
654
  removeSelector?: string | string[];
 
655
  }): PageSnapshot | undefined {
656
  if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
657
  return snapshot;
@@ -662,9 +699,25 @@ document.addEventListener('load', handlePageLoad);
662
 
663
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
664
  const allNodes: Node[] = [];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
  if (Array.isArray(options?.removeSelector)) {
667
- for (const rl of options.removeSelector) {
668
  jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
669
  }
670
  } else if (options?.removeSelector) {
@@ -672,7 +725,7 @@ document.addEventListener('load', handlePageLoad);
672
  }
673
 
674
  if (Array.isArray(options?.targetSelector)) {
675
- for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
676
  x.forEach((el) => {
677
  if (!allNodes.includes(el)) {
678
  allNodes.push(el);
 
54
  imgs?: ImgBrief[];
55
  pdfs?: string[];
56
  maxElemDepth?: number;
57
+ childFrames?: PageSnapshot[];
58
  }
59
 
60
  export interface ExtendedSnapshot extends PageSnapshot {
 
89
  interceptResolutionPriority: 1,
90
  }));
91
 
92
+ const SCRIPT_TO_INJECT_INTO_FRAME = `
93
+ ${READABILITY_JS}
94
+
95
+ function briefImgs(elem) {
96
+ const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
97
+
98
+ return imageTags.map((x)=> {
99
+ let linkPreferredSrc = x.src;
100
+ if (linkPreferredSrc.startsWith('data:')) {
101
+ if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
102
+ linkPreferredSrc = x.dataset.src;
103
+ }
104
+ }
105
+
106
+ return {
107
+ src: new URL(linkPreferredSrc, document.location.href).toString(),
108
+ loaded: x.complete,
109
+ width: x.width,
110
+ height: x.height,
111
+ naturalWidth: x.naturalWidth,
112
+ naturalHeight: x.naturalHeight,
113
+ alt: x.alt || x.title,
114
+ };
115
+ });
116
+ }
117
+ function briefPDFs() {
118
+ const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
119
+
120
+ return pdfTags.map((x)=> {
121
+ return x.src === 'about:blank' ? document.location.href : x.src;
122
+ });
123
+ }
124
+ function getMaxDepthUsingTreeWalker(root) {
125
+ let maxDepth = 0;
126
+ let currentDepth = 0;
127
+
128
+ const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
129
+
130
+ while (true) {
131
+ maxDepth = Math.max(maxDepth, currentDepth);
132
+
133
+ if (treeWalker.firstChild()) {
134
+ currentDepth++;
135
+ } else {
136
+ while (!treeWalker.nextSibling() && currentDepth > 0) {
137
+ treeWalker.parentNode();
138
+ currentDepth--;
139
+ }
140
+
141
+ if (currentDepth <= 0) {
142
+ break;
143
+ }
144
+ }
145
+ }
146
+
147
+ return maxDepth + 1;
148
+ }
149
+
150
+ function giveSnapshot(stopActiveSnapshot) {
151
+ if (stopActiveSnapshot) {
152
+ window.haltSnapshot = true;
153
+ }
154
+ let parsed;
155
+ try {
156
+ parsed = new Readability(document.cloneNode(true)).parse();
157
+ } catch (err) {
158
+ void 0;
159
+ }
160
+
161
+ const r = {
162
+ title: document.title,
163
+ href: document.location.href,
164
+ html: document.documentElement?.outerHTML,
165
+ text: document.body?.innerText,
166
+ parsed: parsed,
167
+ imgs: [],
168
+ pdfs: briefPDFs(),
169
+ maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
170
+ };
171
+ if (parsed && parsed.content) {
172
+ const elem = document.createElement('div');
173
+ elem.innerHTML = parsed.content;
174
+ r.imgs = briefImgs(elem);
175
+ } else {
176
+ const allImgs = briefImgs();
177
+ if (allImgs.length === 1) {
178
+ r.imgs = allImgs;
179
+ }
180
+ }
181
+
182
+ return r;
183
+ }
184
+ `;
185
+
186
  @singleton()
187
  export class PuppeteerControl extends AsyncService {
188
 
 
301
  }
302
  page.emit('snapshot', snapshot);
303
  }));
304
+ preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  preparations.push(page.setRequestInterception(true));
306
 
307
  await Promise.all(preparations);
 
527
  }
528
  }
529
  try {
530
+ const pSubFrameSnapshots = this.snapshotChildFrames(page);
531
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
532
+ screenshot = await page.screenshot({ fullPage: true });
533
+ if (snapshot) {
534
+ snapshot.childFrames = await pSubFrameSnapshots;
535
+ }
536
  } catch (err: any) {
537
  this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
538
  if (stuff instanceof Error) {
 
550
  if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
551
  const salvaged = await this.salvage(url, page);
552
  if (salvaged) {
553
+ const pSubFrameSnapshots = this.snapshotChildFrames(page);
554
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
555
+ screenshot = await page.screenshot({ fullPage: true });
556
+ if (snapshot) {
557
+ snapshot.childFrames = await pSubFrameSnapshots;
558
+ }
559
  }
560
  }
561
  } catch (err: any) {
 
584
  Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
585
  page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
586
  .then(async () => {
587
+ const pSubFrameSnapshots = this.snapshotChildFrames(page);
588
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
589
+ screenshot = await page.screenshot({ fullPage: true });
590
+ if (snapshot) {
591
+ snapshot.childFrames = await pSubFrameSnapshots;
592
+ }
593
  finalized = true;
594
  })
595
  .catch((err) => {
 
623
  break;
624
  }
625
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
626
+ screenshot = await page.screenshot({ fullPage: true });
627
  lastHTML = snapshot.html;
628
  }
629
  if (snapshot || screenshot) {
 
665
  return true;
666
  }
667
 
668
+ async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
669
+ const childFrames = page.mainFrame().childFrames();
670
+ const r = await Promise.all(childFrames.map(async (x) => {
671
+ const thisUrl = x.url();
672
+ if (!thisUrl || thisUrl === 'about:blank') {
673
+ return undefined;
674
+ }
675
+ try {
676
+ await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME);
677
+
678
+ return await x.evaluate(`giveSnapshot()`);
679
+ } catch (err) {
680
+ this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err });
681
+ return undefined;
682
+ }
683
+ })) as PageSnapshot[];
684
+
685
+ return r.filter(Boolean);
686
+ }
687
+
688
  narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
689
  targetSelector?: string | string[];
690
  removeSelector?: string | string[];
691
+ withIframe?: boolean;
692
  }): PageSnapshot | undefined {
693
  if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
694
  return snapshot;
 
699
 
700
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
701
  const allNodes: Node[] = [];
702
+ if (options?.withIframe) {
703
+ jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
704
+ const src = x.getAttribute('src');
705
+ const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
706
+ if (thisSnapshot?.html) {
707
+ x.innerHTML = thisSnapshot.html;
708
+ x.querySelectorAll('script, style').forEach((s) => s.remove());
709
+ x.querySelectorAll('[src]').forEach((el) => {
710
+ el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
711
+ });
712
+ x.querySelectorAll('[href]').forEach((el) => {
713
+ el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
714
+ });
715
+ }
716
+ });
717
+ }
718
 
719
  if (Array.isArray(options?.removeSelector)) {
720
+ for (const rl of options!.removeSelector) {
721
  jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
722
  }
723
  } else if (options?.removeSelector) {
 
725
  }
726
 
727
  if (Array.isArray(options?.targetSelector)) {
728
+ for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
729
  x.forEach((el) => {
730
  if (!allNodes.includes(el)) {
731
  allNodes.push(el);