nomagick committed on
Commit
7af2bde
·
unverified ·
1 Parent(s): 40e9185

fix: html rebasing with <base> tag

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
394
 
395
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
396
  let toBeTurnedToMd = jsDomElementOfHTML;
397
- let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
398
  if (mode !== 'markdown' && snapshot.parsed?.content) {
399
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
400
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
@@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
402
 
403
  // If Readability did its job
404
  if (par2.length >= 0.3 * par1.length) {
405
- turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
406
  if (snapshot.parsed.content) {
407
  toBeTurnedToMd = jsDomElementOfParsed;
408
  }
@@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
440
 
441
  let src;
442
  try {
443
- src = new URL(linkPreferredSrc, nominalUrl).toString();
444
  } catch (_err) {
445
  void 0;
446
  }
@@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
485
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
486
  } catch (err) {
487
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
488
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
489
  try {
490
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
491
  } catch (err2) {
@@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
502
  contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
503
  } catch (err) {
504
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
505
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
506
  try {
507
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
508
  } catch (err2) {
 
394
 
395
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
396
  let toBeTurnedToMd = jsDomElementOfHTML;
397
+ let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
398
  if (mode !== 'markdown' && snapshot.parsed?.content) {
399
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
400
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
 
402
 
403
  // If Readability did its job
404
  if (par2.length >= 0.3 * par1.length) {
405
+ turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
406
  if (snapshot.parsed.content) {
407
  toBeTurnedToMd = jsDomElementOfParsed;
408
  }
 
440
 
441
  let src;
442
  try {
443
+ src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
444
  } catch (_err) {
445
  void 0;
446
  }
 
485
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
486
  } catch (err) {
487
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
488
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
489
  try {
490
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
491
  } catch (err2) {
 
502
  contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
503
  } catch (err) {
504
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
505
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
506
  try {
507
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
508
  } catch (err2) {
backend/functions/src/services/jsdom.ts CHANGED
@@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
121
  .flat()
122
  .map((x) => {
123
  try {
124
- return new URL(x, snapshot.href).toString();
125
  } catch (err) {
126
  return null;
127
  }
@@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
160
  return undefined;
161
  }
162
  try {
163
- const parsed = new URL(href, snapshot.href);
164
  if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
165
  return undefined;
166
  }
@@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
188
  }
189
 
190
  return {
191
- src: new URL(linkPreferredSrc, snapshot.href).toString(),
192
  width: parseInt(x.getAttribute('width') || '0'),
193
  height: parseInt(x.getAttribute('height') || '0'),
194
  alt: x.getAttribute('alt') || x.getAttribute('title'),
 
121
  .flat()
122
  .map((x) => {
123
  try {
124
+ return new URL(x, snapshot.rebase || snapshot.href).toString();
125
  } catch (err) {
126
  return null;
127
  }
 
160
  return undefined;
161
  }
162
  try {
163
+ const parsed = new URL(href, snapshot.rebase || snapshot.href);
164
  if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
165
  return undefined;
166
  }
 
188
  }
189
 
190
  return {
191
+ src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
192
  width: parseInt(x.getAttribute('width') || '0'),
193
  height: parseInt(x.getAttribute('height') || '0'),
194
  alt: x.getAttribute('alt') || x.getAttribute('title'),
backend/functions/src/services/puppeteer.ts CHANGED
@@ -42,6 +42,7 @@ export interface ReadabilityParsed {
42
  export interface PageSnapshot {
43
  title: string;
44
  href: string;
 
45
  html: string;
46
  text: string;
47
  parsed?: Partial<ReadabilityParsed> | null;
@@ -101,7 +102,7 @@ function briefImgs(elem) {
101
  }
102
 
103
  return {
104
- src: new URL(linkPreferredSrc, document.location.href).toString(),
105
  loaded: x.complete,
106
  width: x.width,
107
  height: x.height,
@@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
179
  maxElemDepth: domAnalysis.maxDepth,
180
  elemCount: domAnalysis.elementCount,
181
  };
 
 
 
182
  if (parsed && parsed.content) {
183
  const elem = document.createElement('div');
184
  elem.innerHTML = parsed.content;
 
42
  export interface PageSnapshot {
43
  title: string;
44
  href: string;
45
+ rebase?: string;
46
  html: string;
47
  text: string;
48
  parsed?: Partial<ReadabilityParsed> | null;
 
102
  }
103
 
104
  return {
105
+ src: new URL(linkPreferredSrc, document.baseURI).toString(),
106
  loaded: x.complete,
107
  width: x.width,
108
  height: x.height,
 
180
  maxElemDepth: domAnalysis.maxDepth,
181
  elemCount: domAnalysis.elementCount,
182
  };
183
+ if (document.baseURI !== r.href) {
184
+ r.rebase = document.baseURI;
185
+ }
186
  if (parsed && parsed.content) {
187
  const elem = document.createElement('div');
188
  elem.innerHTML = parsed.content;