Spaces:
Build error
Build error
debug: log jsdom and turndown operations
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -251,9 +251,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 251 |
}
|
| 252 |
|
| 253 |
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 254 |
-
|
| 255 |
const mixin: any = {};
|
| 256 |
if (this.threadLocal.get('withImagesSummary')) {
|
|
|
|
| 257 |
const imageSummary = {} as { [k: string]: string; };
|
| 258 |
const imageIdxTrack = new Map<string, number[]>();
|
| 259 |
|
|
@@ -278,6 +279,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 278 |
.value();
|
| 279 |
}
|
| 280 |
if (this.threadLocal.get('withLinksSummary')) {
|
|
|
|
| 281 |
mixin.links = _.invert(inferred.links || {});
|
| 282 |
}
|
| 283 |
|
|
@@ -384,8 +386,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 384 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 385 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 386 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 387 |
-
const par1 =
|
| 388 |
-
const par2 = snapshot.parsed.content ?
|
| 389 |
|
| 390 |
// If Readability did its job
|
| 391 |
if (par2.length >= 0.3 * par1.length) {
|
|
@@ -469,12 +471,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 469 |
|
| 470 |
if (toBeTurnedToMd) {
|
| 471 |
try {
|
| 472 |
-
contentText =
|
| 473 |
} catch (err) {
|
| 474 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 475 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 476 |
try {
|
| 477 |
-
contentText =
|
| 478 |
} catch (err2) {
|
| 479 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 480 |
}
|
|
@@ -486,12 +488,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 486 |
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 487 |
) {
|
| 488 |
try {
|
| 489 |
-
contentText =
|
| 490 |
} catch (err) {
|
| 491 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 492 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 493 |
try {
|
| 494 |
-
contentText =
|
| 495 |
} catch (err2) {
|
| 496 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 497 |
}
|
|
@@ -959,7 +961,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 959 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 960 |
}
|
| 961 |
|
| 962 |
-
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache
|
| 963 |
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 964 |
|
| 965 |
return;
|
|
|
|
| 251 |
}
|
| 252 |
|
| 253 |
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 254 |
+
let inferred;
|
| 255 |
const mixin: any = {};
|
| 256 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 257 |
+
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 258 |
const imageSummary = {} as { [k: string]: string; };
|
| 259 |
const imageIdxTrack = new Map<string, number[]>();
|
| 260 |
|
|
|
|
| 279 |
.value();
|
| 280 |
}
|
| 281 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 282 |
+
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 283 |
mixin.links = _.invert(inferred.links || {});
|
| 284 |
}
|
| 285 |
|
|
|
|
| 386 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 387 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 388 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 389 |
+
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
| 390 |
+
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
| 391 |
|
| 392 |
// If Readability did its job
|
| 393 |
if (par2.length >= 0.3 * par1.length) {
|
|
|
|
| 471 |
|
| 472 |
if (toBeTurnedToMd) {
|
| 473 |
try {
|
| 474 |
+
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 475 |
} catch (err) {
|
| 476 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 477 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 478 |
try {
|
| 479 |
+
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 480 |
} catch (err2) {
|
| 481 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 482 |
}
|
|
|
|
| 488 |
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 489 |
) {
|
| 490 |
try {
|
| 491 |
+
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
| 492 |
} catch (err) {
|
| 493 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 494 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 495 |
try {
|
| 496 |
+
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
| 497 |
} catch (err2) {
|
| 498 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 499 |
}
|
|
|
|
| 961 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 962 |
}
|
| 963 |
|
| 964 |
+
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable)))) {
|
| 965 |
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 966 |
|
| 967 |
return;
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { Logger } from '../shared/services/logger';
|
|
| 4 |
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
| 5 |
import { JSDOM, VirtualConsole } from 'jsdom';
|
| 6 |
import { Readability } from '@mozilla/readability';
|
|
|
|
| 7 |
|
| 8 |
const virtualConsole = new VirtualConsole();
|
| 9 |
virtualConsole.on('error', () => void 0);
|
|
@@ -35,7 +36,7 @@ export class JSDomControl extends AsyncService {
|
|
| 35 |
if (!snapshot?.html) {
|
| 36 |
return snapshot;
|
| 37 |
}
|
| 38 |
-
|
| 39 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 40 |
const allNodes: Node[] = [];
|
| 41 |
if (options?.withIframe) {
|
|
@@ -137,10 +138,16 @@ export class JSDomControl extends AsyncService {
|
|
| 137 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 138 |
} as PageSnapshot;
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
return r;
|
| 141 |
}
|
| 142 |
|
| 143 |
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
|
|
|
| 144 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 145 |
try {
|
| 146 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
|
@@ -191,6 +198,11 @@ export class JSDomControl extends AsyncService {
|
|
| 191 |
void 0;
|
| 192 |
}
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
return extendedSnapshot;
|
| 195 |
}
|
| 196 |
|
|
@@ -199,6 +211,19 @@ export class JSDomControl extends AsyncService {
|
|
| 199 |
|
| 200 |
return parsed.window.document.documentElement;
|
| 201 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
}
|
| 203 |
|
| 204 |
const jsdomControl = container.resolve(JSDomControl);
|
|
|
|
| 4 |
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
| 5 |
import { JSDOM, VirtualConsole } from 'jsdom';
|
| 6 |
import { Readability } from '@mozilla/readability';
|
| 7 |
+
import TurndownService from 'turndown';
|
| 8 |
|
| 9 |
const virtualConsole = new VirtualConsole();
|
| 10 |
virtualConsole.on('error', () => void 0);
|
|
|
|
| 36 |
if (!snapshot?.html) {
|
| 37 |
return snapshot;
|
| 38 |
}
|
| 39 |
+
const t0 = Date.now();
|
| 40 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 41 |
const allNodes: Node[] = [];
|
| 42 |
if (options?.withIframe) {
|
|
|
|
| 138 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 139 |
} as PageSnapshot;
|
| 140 |
|
| 141 |
+
const dt = Date.now() - t0;
|
| 142 |
+
if (dt > 1000) {
|
| 143 |
+
this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt });
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
return r;
|
| 147 |
}
|
| 148 |
|
| 149 |
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
| 150 |
+
const t0 = Date.now();
|
| 151 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 152 |
try {
|
| 153 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
|
|
|
| 198 |
void 0;
|
| 199 |
}
|
| 200 |
|
| 201 |
+
const dt = Date.now() - t0;
|
| 202 |
+
if (dt > 1000) {
|
| 203 |
+
this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt });
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
return extendedSnapshot;
|
| 207 |
}
|
| 208 |
|
|
|
|
| 211 |
|
| 212 |
return parsed.window.document.documentElement;
|
| 213 |
}
|
| 214 |
+
|
| 215 |
+
/**
 * Runs `turndownService.turndown(html)` and reports slow conversions.
 *
 * The call is timed with `Date.now()`; if it takes more than 1000 ms a
 * warning is logged with the elapsed time (`dt`). The log entry carries no
 * URL or other identifier of the input.
 *
 * @param turndownService - Turndown instance used to perform the conversion.
 * @param html - DOM node or HTML string handed to `turndown` unchanged.
 * @returns Whatever `turndownService.turndown(html)` returns.
 * @throws Rethrows anything `turndown` throws: there is no `catch`, only a
 *   `finally` for the timing check.
 */
runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) {
|
| 216 |
+
const t0 = Date.now();
|
| 217 |
+
|
| 218 |
+
try {
|
| 219 |
+
return turndownService.turndown(html);
|
| 220 |
+
// `finally` runs whether turndown returned or threw, so failed conversions are timed too.
} finally {
|
| 221 |
+
const dt = Date.now() - t0;
|
| 222 |
+
// Only conversions slower than one second are logged.
if (dt > 1000) {
|
| 223 |
+
this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt });
|
| 224 |
+
}
|
| 225 |
+
}
|
| 226 |
+
}
|
| 227 |
}
|
| 228 |
|
| 229 |
const jsdomControl = container.resolve(JSDomControl);
|