nomagick committed on
Commit
4e5aff3
·
unverified ·
1 Parent(s): 0f23979

debug: log jsdom and turndown operations

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -251,9 +251,10 @@ export class CrawlerHost extends RPCHost {
251
  }
252
 
253
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
254
- const inferred = this.jsdomControl.inferSnapshot(snapshot);
255
  const mixin: any = {};
256
  if (this.threadLocal.get('withImagesSummary')) {
 
257
  const imageSummary = {} as { [k: string]: string; };
258
  const imageIdxTrack = new Map<string, number[]>();
259
 
@@ -278,6 +279,7 @@ export class CrawlerHost extends RPCHost {
278
  .value();
279
  }
280
  if (this.threadLocal.get('withLinksSummary')) {
 
281
  mixin.links = _.invert(inferred.links || {});
282
  }
283
 
@@ -384,8 +386,8 @@ export class CrawlerHost extends RPCHost {
384
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
385
  if (mode !== 'markdown' && snapshot.parsed?.content) {
386
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
387
- const par1 = turnDownService.turndown(jsDomElementOfHTML);
388
- const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : '';
389
 
390
  // If Readability did its job
391
  if (par2.length >= 0.3 * par1.length) {
@@ -469,12 +471,12 @@ export class CrawlerHost extends RPCHost {
469
 
470
  if (toBeTurnedToMd) {
471
  try {
472
- contentText = turnDownService.turndown(toBeTurnedToMd).trim();
473
  } catch (err) {
474
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
475
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
476
  try {
477
- contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
478
  } catch (err2) {
479
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
480
  }
@@ -486,12 +488,12 @@ export class CrawlerHost extends RPCHost {
486
  && toBeTurnedToMd !== jsDomElementOfHTML
487
  ) {
488
  try {
489
- contentText = turnDownService.turndown(snapshot.html);
490
  } catch (err) {
491
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
492
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
493
  try {
494
- contentText = vanillaTurnDownService.turndown(snapshot.html);
495
  } catch (err2) {
496
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
497
  }
@@ -959,7 +961,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
959
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
960
  }
961
 
962
- if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
963
  yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
964
 
965
  return;
 
251
  }
252
 
253
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
254
+ let inferred;
255
  const mixin: any = {};
256
  if (this.threadLocal.get('withImagesSummary')) {
257
+ inferred ??= this.jsdomControl.inferSnapshot(snapshot);
258
  const imageSummary = {} as { [k: string]: string; };
259
  const imageIdxTrack = new Map<string, number[]>();
260
 
 
279
  .value();
280
  }
281
  if (this.threadLocal.get('withLinksSummary')) {
282
+ inferred ??= this.jsdomControl.inferSnapshot(snapshot);
283
  mixin.links = _.invert(inferred.links || {});
284
  }
285
 
 
386
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
387
  if (mode !== 'markdown' && snapshot.parsed?.content) {
388
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
389
+ const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
390
+ const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
391
 
392
  // If Readability did its job
393
  if (par2.length >= 0.3 * par1.length) {
 
471
 
472
  if (toBeTurnedToMd) {
473
  try {
474
+ contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
475
  } catch (err) {
476
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
477
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
478
  try {
479
+ contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
480
  } catch (err2) {
481
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
482
  }
 
488
  && toBeTurnedToMd !== jsDomElementOfHTML
489
  ) {
490
  try {
491
+ contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
492
  } catch (err) {
493
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
494
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
495
  try {
496
+ contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
497
  } catch (err2) {
498
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
499
  }
 
961
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
962
  }
963
 
964
+ if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable)))) {
965
  yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
966
 
967
  return;
backend/functions/src/services/jsdom.ts CHANGED
@@ -4,6 +4,7 @@ import { Logger } from '../shared/services/logger';
4
  import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
5
  import { JSDOM, VirtualConsole } from 'jsdom';
6
  import { Readability } from '@mozilla/readability';
 
7
 
8
  const virtualConsole = new VirtualConsole();
9
  virtualConsole.on('error', () => void 0);
@@ -35,7 +36,7 @@ export class JSDomControl extends AsyncService {
35
  if (!snapshot?.html) {
36
  return snapshot;
37
  }
38
-
39
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
40
  const allNodes: Node[] = [];
41
  if (options?.withIframe) {
@@ -137,10 +138,16 @@ export class JSDomControl extends AsyncService {
137
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
138
  } as PageSnapshot;
139
 
 
 
 
 
 
140
  return r;
141
  }
142
 
143
  inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
 
144
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
145
  try {
146
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
@@ -191,6 +198,11 @@ export class JSDomControl extends AsyncService {
191
  void 0;
192
  }
193
 
 
 
 
 
 
194
  return extendedSnapshot;
195
  }
196
 
@@ -199,6 +211,19 @@ export class JSDomControl extends AsyncService {
199
 
200
  return parsed.window.document.documentElement;
201
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  }
203
 
204
  const jsdomControl = container.resolve(JSDomControl);
 
4
  import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
5
  import { JSDOM, VirtualConsole } from 'jsdom';
6
  import { Readability } from '@mozilla/readability';
7
+ import TurndownService from 'turndown';
8
 
9
  const virtualConsole = new VirtualConsole();
10
  virtualConsole.on('error', () => void 0);
 
36
  if (!snapshot?.html) {
37
  return snapshot;
38
  }
39
+ const t0 = Date.now();
40
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
41
  const allNodes: Node[] = [];
42
  if (options?.withIframe) {
 
138
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
139
  } as PageSnapshot;
140
 
141
+ const dt = Date.now() - t0;
142
+ if (dt > 1000) {
143
+ this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt });
144
+ }
145
+
146
  return r;
147
  }
148
 
149
  inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
150
+ const t0 = Date.now();
151
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
152
  try {
153
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
 
198
  void 0;
199
  }
200
 
201
+ const dt = Date.now() - t0;
202
+ if (dt > 1000) {
203
+ this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt });
204
+ }
205
+
206
  return extendedSnapshot;
207
  }
208
 
 
211
 
212
  return parsed.window.document.documentElement;
213
  }
214
+
215
+ runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) {
216
+ const t0 = Date.now();
217
+
218
+ try {
219
+ return turndownService.turndown(html);
220
+ } finally {
221
+ const dt = Date.now() - t0;
222
+ if (dt > 1000) {
223
+ this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt });
224
+ }
225
+ }
226
+ }
227
  }
228
 
229
  const jsdomControl = container.resolve(JSDomControl);