nomagick committed on
Commit
16cabca
·
unverified ·
1 Parent(s): 2b29679

feat: opt out gfm/table

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -688,6 +688,7 @@ export class CrawlerHost extends RPCHost {
688
  this.threadLocal.set('timeout', opts.timeout * 1000);
689
  }
690
  this.threadLocal.set('retainImages', opts.retainImages);
 
691
 
692
  const crawlOpts: ExtraScrappingOptions = {
693
  proxyUrl: opts.proxyUrl,
 
688
  this.threadLocal.set('timeout', opts.timeout * 1000);
689
  }
690
  this.threadLocal.set('retainImages', opts.retainImages);
691
+ this.threadLocal.set('noGfm', opts.noGfm);
692
 
693
  const crawlOpts: ExtraScrappingOptions = {
694
  proxyUrl: opts.proxyUrl,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -203,6 +203,11 @@ export class CrawlerOptions extends AutoCastable {
203
  })
204
  noCache!: boolean;
205
 
 
 
 
 
 
206
  @Prop()
207
  cacheTolerance?: number;
208
 
@@ -318,6 +323,11 @@ export class CrawlerOptions extends AutoCastable {
318
  instance.cacheTolerance = cacheTolerance;
319
  }
320
 
 
 
 
 
 
321
  let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
322
  if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
323
  instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
 
203
  })
204
  noCache!: boolean;
205
 
206
+ @Prop({
207
+ default: false,
208
+ })
209
+ noGfm!: string | boolean;
210
+
211
  @Prop()
212
  cacheTolerance?: number;
213
 
 
323
  instance.cacheTolerance = cacheTolerance;
324
  }
325
 
326
+ const noGfm = ctx?.req.get('x-no-gfm');
327
+ if (noGfm) {
328
+ instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
329
+ }
330
+
331
  let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
332
  if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
333
  instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -1,7 +1,7 @@
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
  import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
4
- import TurndownService, { Rule } from 'turndown';
5
  import { Logger } from '../shared/services/logger';
6
  import { PageSnapshot } from './puppeteer';
7
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
@@ -44,12 +44,15 @@ export interface FormattedPage {
44
 
45
  export const md5Hasher = new HashManager('md5', 'hex');
46
 
 
 
47
  @singleton()
48
  export class SnapshotFormatter extends AsyncService {
49
 
50
  logger = this.globalLogger.child({ service: this.constructor.name });
51
 
52
- turnDownPlugins = [require('turndown-plugin-gfm').tables, require('turndown-plugin-gfm').strikethrough];
 
53
 
54
  constructor(
55
  protected globalLogger: Logger,
@@ -182,9 +185,10 @@ export class SnapshotFormatter extends AsyncService {
182
  }
183
 
184
  const urlToAltMap: { [k: string]: string | undefined; } = {};
 
185
  const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
186
  let imgIdx = 0;
187
- const customRules = {
188
  'img-retention': {
189
  filter: 'img',
190
  replacement: (_content: string, node: HTMLElement) => {
@@ -255,10 +259,16 @@ export class SnapshotFormatter extends AsyncService {
255
  }
256
  } as Rule
257
  };
 
 
 
 
 
 
258
 
259
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
260
  let toBeTurnedToMd = jsDomElementOfHTML;
261
- let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
262
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
263
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
264
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
@@ -266,15 +276,15 @@ export class SnapshotFormatter extends AsyncService {
266
 
267
  // If Readability did its job
268
  if (par2.length >= 0.3 * par1.length) {
269
- turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
270
  if (snapshot.parsed.content) {
271
  toBeTurnedToMd = jsDomElementOfParsed;
272
  }
273
  }
274
  }
275
 
276
- for (const plugin of this.turnDownPlugins) {
277
- turnDownService = turnDownService.use(plugin);
278
  }
279
 
280
  // _p is the special suffix for withGeneratedAlt
@@ -297,7 +307,7 @@ export class SnapshotFormatter extends AsyncService {
297
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
298
  } catch (err) {
299
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
300
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
301
  try {
302
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
303
  } catch (err2) {
@@ -315,7 +325,7 @@ export class SnapshotFormatter extends AsyncService {
315
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
316
  } catch (err) {
317
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
318
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
319
  try {
320
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
321
  } catch (err2) {
@@ -465,11 +475,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
465
  imgDataUrlToObjectUrl?: boolean;
466
  removeImages?: boolean | 'src';
467
  customRules?: { [k: string]: Rule; };
 
468
  }) {
469
  const turnDownService = new TurndownService({
470
  codeBlockStyle: 'fenced',
471
  preformattedCode: true,
472
  } as any);
 
 
 
473
  if (!options?.noRules) {
474
  turnDownService.addRule('remove-irrelevant', {
475
  filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
@@ -586,7 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
586
  return true;
587
  }
588
 
589
- if (content.includes('<table') && content.includes('</table>')) {
590
  if (node?.textContent && content.length > node.textContent.length * 0.8) {
591
  return true;
592
  }
 
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
  import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
4
+ import TurndownService, { Filter, Rule } from 'turndown';
5
  import { Logger } from '../shared/services/logger';
6
  import { PageSnapshot } from './puppeteer';
7
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
 
44
 
45
  export const md5Hasher = new HashManager('md5', 'hex');
46
 
47
+ const gfmPlugin = require('turndown-plugin-gfm');
48
+
49
  @singleton()
50
  export class SnapshotFormatter extends AsyncService {
51
 
52
  logger = this.globalLogger.child({ service: this.constructor.name });
53
 
54
+ gfmPlugin = gfmPlugin.gfm;
55
+ gfmNoTable = [gfmPlugin.highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems];
56
 
57
  constructor(
58
  protected globalLogger: Logger,
 
185
  }
186
 
187
  const urlToAltMap: { [k: string]: string | undefined; } = {};
188
+ const noGFMOpts = this.threadLocal.get('noGfm');
189
  const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
190
  let imgIdx = 0;
191
+ const customRules: { [k: string]: Rule; } = {
192
  'img-retention': {
193
  filter: 'img',
194
  replacement: (_content: string, node: HTMLElement) => {
 
259
  }
260
  } as Rule
261
  };
262
+ const optsMixin = {
263
+ url: snapshot.rebase || nominalUrl,
264
+ customRules,
265
+ customKeep: noGFMOpts === 'table' ? 'table' : undefined,
266
+ imgDataUrlToObjectUrl,
267
+ } as const;
268
 
269
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
270
  let toBeTurnedToMd = jsDomElementOfHTML;
271
+ let turnDownService = this.getTurndown({ ...optsMixin });
272
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
273
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
274
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
 
276
 
277
  // If Readability did its job
278
  if (par2.length >= 0.3 * par1.length) {
279
+ turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
280
  if (snapshot.parsed.content) {
281
  toBeTurnedToMd = jsDomElementOfParsed;
282
  }
283
  }
284
  }
285
 
286
+ if (!noGFMOpts) {
287
+ turnDownService = turnDownService.use(noGFMOpts === 'table' ? this.gfmNoTable : this.gfmPlugin);
288
  }
289
 
290
  // _p is the special suffix for withGeneratedAlt
 
307
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
308
  } catch (err) {
309
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
310
+ const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
311
  try {
312
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
313
  } catch (err2) {
 
325
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
326
  } catch (err) {
327
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
328
+ const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
329
  try {
330
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
331
  } catch (err2) {
 
475
  imgDataUrlToObjectUrl?: boolean;
476
  removeImages?: boolean | 'src';
477
  customRules?: { [k: string]: Rule; };
478
+ customKeep?: Filter
479
  }) {
480
  const turnDownService = new TurndownService({
481
  codeBlockStyle: 'fenced',
482
  preformattedCode: true,
483
  } as any);
484
+ if (options?.customKeep) {
485
+ turnDownService.keep(options.customKeep);
486
+ }
487
  if (!options?.noRules) {
488
  turnDownService.addRule('remove-irrelevant', {
489
  filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
 
600
  return true;
601
  }
602
 
603
+ if (!this.threadLocal.get('noGfm') && content.includes('<table') && content.includes('</table>')) {
604
  if (node?.textContent && content.length > node.textContent.length * 0.8) {
605
  return true;
606
  }