Spaces:
Build error
Build error
fix: html rebasing with <base> tag
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 394 |
|
| 395 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 396 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 397 |
-
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 398 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 399 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 400 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
@@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 402 |
|
| 403 |
// If Readability did its job
|
| 404 |
if (par2.length >= 0.3 * par1.length) {
|
| 405 |
-
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
| 406 |
if (snapshot.parsed.content) {
|
| 407 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 408 |
}
|
|
@@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 440 |
|
| 441 |
let src;
|
| 442 |
try {
|
| 443 |
-
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
| 444 |
} catch (_err) {
|
| 445 |
void 0;
|
| 446 |
}
|
|
@@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 485 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 486 |
} catch (err) {
|
| 487 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 488 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 489 |
try {
|
| 490 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 491 |
} catch (err2) {
|
|
@@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 502 |
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
| 503 |
} catch (err) {
|
| 504 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 505 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 506 |
try {
|
| 507 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
| 508 |
} catch (err2) {
|
|
|
|
| 394 |
|
| 395 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 396 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 397 |
+
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 398 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 399 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 400 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
|
|
| 402 |
|
| 403 |
// If Readability did its job
|
| 404 |
if (par2.length >= 0.3 * par1.length) {
|
| 405 |
+
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 406 |
if (snapshot.parsed.content) {
|
| 407 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 408 |
}
|
|
|
|
| 440 |
|
| 441 |
let src;
|
| 442 |
try {
|
| 443 |
+
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
| 444 |
} catch (_err) {
|
| 445 |
void 0;
|
| 446 |
}
|
|
|
|
| 485 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 486 |
} catch (err) {
|
| 487 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 488 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 489 |
try {
|
| 490 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 491 |
} catch (err2) {
|
|
|
|
| 502 |
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
| 503 |
} catch (err) {
|
| 504 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 505 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 506 |
try {
|
| 507 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
| 508 |
} catch (err2) {
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
|
|
| 121 |
.flat()
|
| 122 |
.map((x) => {
|
| 123 |
try {
|
| 124 |
-
return new URL(x, snapshot.href).toString();
|
| 125 |
} catch (err) {
|
| 126 |
return null;
|
| 127 |
}
|
|
@@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
|
|
| 160 |
return undefined;
|
| 161 |
}
|
| 162 |
try {
|
| 163 |
-
const parsed = new URL(href, snapshot.href);
|
| 164 |
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
| 165 |
return undefined;
|
| 166 |
}
|
|
@@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
|
|
| 188 |
}
|
| 189 |
|
| 190 |
return {
|
| 191 |
-
src: new URL(linkPreferredSrc, snapshot.href).toString(),
|
| 192 |
width: parseInt(x.getAttribute('width') || '0'),
|
| 193 |
height: parseInt(x.getAttribute('height') || '0'),
|
| 194 |
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
|
|
|
| 121 |
.flat()
|
| 122 |
.map((x) => {
|
| 123 |
try {
|
| 124 |
+
return new URL(x, snapshot.rebase || snapshot.href).toString();
|
| 125 |
} catch (err) {
|
| 126 |
return null;
|
| 127 |
}
|
|
|
|
| 160 |
return undefined;
|
| 161 |
}
|
| 162 |
try {
|
| 163 |
+
const parsed = new URL(href, snapshot.rebase || snapshot.href);
|
| 164 |
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
| 165 |
return undefined;
|
| 166 |
}
|
|
|
|
| 188 |
}
|
| 189 |
|
| 190 |
return {
|
| 191 |
+
src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
|
| 192 |
width: parseInt(x.getAttribute('width') || '0'),
|
| 193 |
height: parseInt(x.getAttribute('height') || '0'),
|
| 194 |
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -42,6 +42,7 @@ export interface ReadabilityParsed {
|
|
| 42 |
export interface PageSnapshot {
|
| 43 |
title: string;
|
| 44 |
href: string;
|
|
|
|
| 45 |
html: string;
|
| 46 |
text: string;
|
| 47 |
parsed?: Partial<ReadabilityParsed> | null;
|
|
@@ -101,7 +102,7 @@ function briefImgs(elem) {
|
|
| 101 |
}
|
| 102 |
|
| 103 |
return {
|
| 104 |
-
src: new URL(linkPreferredSrc, document.location.href).toString(),
|
| 105 |
loaded: x.complete,
|
| 106 |
width: x.width,
|
| 107 |
height: x.height,
|
|
@@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 179 |
maxElemDepth: domAnalysis.maxDepth,
|
| 180 |
elemCount: domAnalysis.elementCount,
|
| 181 |
};
|
|
|
|
|
|
|
|
|
|
| 182 |
if (parsed && parsed.content) {
|
| 183 |
const elem = document.createElement('div');
|
| 184 |
elem.innerHTML = parsed.content;
|
|
|
|
| 42 |
export interface PageSnapshot {
|
| 43 |
title: string;
|
| 44 |
href: string;
|
| 45 |
+
rebase?: string;
|
| 46 |
html: string;
|
| 47 |
text: string;
|
| 48 |
parsed?: Partial<ReadabilityParsed> | null;
|
|
|
|
| 102 |
}
|
| 103 |
|
| 104 |
return {
|
| 105 |
+
src: new URL(linkPreferredSrc, document.baseURI).toString(),
|
| 106 |
loaded: x.complete,
|
| 107 |
width: x.width,
|
| 108 |
height: x.height,
|
|
|
|
| 180 |
maxElemDepth: domAnalysis.maxDepth,
|
| 181 |
elemCount: domAnalysis.elementCount,
|
| 182 |
};
|
| 183 |
+
if (document.baseURI !== r.href) {
|
| 184 |
+
r.rebase = document.baseURI;
|
| 185 |
+
}
|
| 186 |
if (parsed && parsed.content) {
|
| 187 |
const elem = document.createElement('div');
|
| 188 |
elem.innerHTML = parsed.content;
|