Spaces:
Build error
Build error
feat: opt out gfm/table
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -688,6 +688,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 688 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
| 689 |
}
|
| 690 |
this.threadLocal.set('retainImages', opts.retainImages);
|
|
|
|
| 691 |
|
| 692 |
const crawlOpts: ExtraScrappingOptions = {
|
| 693 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 688 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
| 689 |
}
|
| 690 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 691 |
+
this.threadLocal.set('noGfm', opts.noGfm);
|
| 692 |
|
| 693 |
const crawlOpts: ExtraScrappingOptions = {
|
| 694 |
proxyUrl: opts.proxyUrl,
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -203,6 +203,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 203 |
})
|
| 204 |
noCache!: boolean;
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
@Prop()
|
| 207 |
cacheTolerance?: number;
|
| 208 |
|
|
@@ -318,6 +323,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 318 |
instance.cacheTolerance = cacheTolerance;
|
| 319 |
}
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
| 322 |
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
| 323 |
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
|
|
|
| 203 |
})
|
| 204 |
noCache!: boolean;
|
| 205 |
|
| 206 |
+
@Prop({
|
| 207 |
+
default: false,
|
| 208 |
+
})
|
| 209 |
+
noGfm!: string | boolean;
|
| 210 |
+
|
| 211 |
@Prop()
|
| 212 |
cacheTolerance?: number;
|
| 213 |
|
|
|
|
| 323 |
instance.cacheTolerance = cacheTolerance;
|
| 324 |
}
|
| 325 |
|
| 326 |
+
const noGfm = ctx?.req.get('x-no-gfm');
|
| 327 |
+
if (noGfm) {
|
| 328 |
+
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
| 332 |
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
| 333 |
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import { randomUUID } from 'crypto';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
-
import TurndownService, { Rule } from 'turndown';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
| 7 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
|
@@ -44,12 +44,15 @@ export interface FormattedPage {
|
|
| 44 |
|
| 45 |
export const md5Hasher = new HashManager('md5', 'hex');
|
| 46 |
|
|
|
|
|
|
|
| 47 |
@singleton()
|
| 48 |
export class SnapshotFormatter extends AsyncService {
|
| 49 |
|
| 50 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 51 |
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
constructor(
|
| 55 |
protected globalLogger: Logger,
|
|
@@ -182,9 +185,10 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 182 |
}
|
| 183 |
|
| 184 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
|
|
|
| 185 |
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
|
| 186 |
let imgIdx = 0;
|
| 187 |
-
const customRules = {
|
| 188 |
'img-retention': {
|
| 189 |
filter: 'img',
|
| 190 |
replacement: (_content: string, node: HTMLElement) => {
|
|
@@ -255,10 +259,16 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 255 |
}
|
| 256 |
} as Rule
|
| 257 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 260 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 261 |
-
let turnDownService = this.getTurndown({
|
| 262 |
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
| 263 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 264 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
@@ -266,15 +276,15 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 266 |
|
| 267 |
// If Readability did its job
|
| 268 |
if (par2.length >= 0.3 * par1.length) {
|
| 269 |
-
turnDownService = this.getTurndown({ noRules: true,
|
| 270 |
if (snapshot.parsed.content) {
|
| 271 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 272 |
}
|
| 273 |
}
|
| 274 |
}
|
| 275 |
|
| 276 |
-
|
| 277 |
-
turnDownService = turnDownService.use(
|
| 278 |
}
|
| 279 |
|
| 280 |
// _p is the special suffix for withGeneratedAlt
|
|
@@ -297,7 +307,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 297 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 298 |
} catch (err) {
|
| 299 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 300 |
-
const vanillaTurnDownService = this.getTurndown({
|
| 301 |
try {
|
| 302 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 303 |
} catch (err2) {
|
|
@@ -315,7 +325,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 315 |
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
| 316 |
} catch (err) {
|
| 317 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 318 |
-
const vanillaTurnDownService = this.getTurndown({
|
| 319 |
try {
|
| 320 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
| 321 |
} catch (err2) {
|
|
@@ -465,11 +475,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 465 |
imgDataUrlToObjectUrl?: boolean;
|
| 466 |
removeImages?: boolean | 'src';
|
| 467 |
customRules?: { [k: string]: Rule; };
|
|
|
|
| 468 |
}) {
|
| 469 |
const turnDownService = new TurndownService({
|
| 470 |
codeBlockStyle: 'fenced',
|
| 471 |
preformattedCode: true,
|
| 472 |
} as any);
|
|
|
|
|
|
|
|
|
|
| 473 |
if (!options?.noRules) {
|
| 474 |
turnDownService.addRule('remove-irrelevant', {
|
| 475 |
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
|
|
@@ -586,7 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 586 |
return true;
|
| 587 |
}
|
| 588 |
|
| 589 |
-
if (content.includes('<table') && content.includes('</table>')) {
|
| 590 |
if (node?.textContent && content.length > node.textContent.length * 0.8) {
|
| 591 |
return true;
|
| 592 |
}
|
|
|
|
| 1 |
import { randomUUID } from 'crypto';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
+
import TurndownService, { Filter, Rule } from 'turndown';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
| 7 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
|
|
|
| 44 |
|
| 45 |
export const md5Hasher = new HashManager('md5', 'hex');
|
| 46 |
|
| 47 |
+
const gfmPlugin = require('turndown-plugin-gfm');
|
| 48 |
+
|
| 49 |
@singleton()
|
| 50 |
export class SnapshotFormatter extends AsyncService {
|
| 51 |
|
| 52 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 53 |
|
| 54 |
+
gfmPlugin = gfmPlugin.gfm;
|
| 55 |
+
gfmNoTable = [gfmPlugin.highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems];
|
| 56 |
|
| 57 |
constructor(
|
| 58 |
protected globalLogger: Logger,
|
|
|
|
| 185 |
}
|
| 186 |
|
| 187 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 188 |
+
const noGFMOpts = this.threadLocal.get('noGfm');
|
| 189 |
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
|
| 190 |
let imgIdx = 0;
|
| 191 |
+
const customRules: { [k: string]: Rule; } = {
|
| 192 |
'img-retention': {
|
| 193 |
filter: 'img',
|
| 194 |
replacement: (_content: string, node: HTMLElement) => {
|
|
|
|
| 259 |
}
|
| 260 |
} as Rule
|
| 261 |
};
|
| 262 |
+
const optsMixin = {
|
| 263 |
+
url: snapshot.rebase || nominalUrl,
|
| 264 |
+
customRules,
|
| 265 |
+
customKeep: noGFMOpts === 'table' ? 'table' : undefined,
|
| 266 |
+
imgDataUrlToObjectUrl,
|
| 267 |
+
} as const;
|
| 268 |
|
| 269 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 270 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 271 |
+
let turnDownService = this.getTurndown({ ...optsMixin });
|
| 272 |
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
| 273 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 274 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
|
|
| 276 |
|
| 277 |
// If Readability did its job
|
| 278 |
if (par2.length >= 0.3 * par1.length) {
|
| 279 |
+
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
| 280 |
if (snapshot.parsed.content) {
|
| 281 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 282 |
}
|
| 283 |
}
|
| 284 |
}
|
| 285 |
|
| 286 |
+
// Choose the GFM plugin set from the `noGfm` opt-out flag read above:
//   - 'table'            -> apply every GFM plugin except the table one
//   - other truthy value -> apply no GFM plugin at all
//   - falsy (default)    -> apply the full GFM plugin
// The 'table' case must be tested before the falsy check: 'table' is a
// truthy string, so nesting it under `!noGFMOpts` made it unreachable and
// silently disabled all GFM plugins instead of only the table plugin.
if (noGFMOpts === 'table') {
    turnDownService = turnDownService.use(this.gfmNoTable);
} else if (!noGFMOpts) {
    turnDownService = turnDownService.use(this.gfmPlugin);
}
|
| 289 |
|
| 290 |
// _p is the special suffix for withGeneratedAlt
|
|
|
|
| 307 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 308 |
} catch (err) {
|
| 309 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 310 |
+
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 311 |
try {
|
| 312 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 313 |
} catch (err2) {
|
|
|
|
| 325 |
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
| 326 |
} catch (err) {
|
| 327 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 328 |
+
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 329 |
try {
|
| 330 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
| 331 |
} catch (err2) {
|
|
|
|
| 475 |
imgDataUrlToObjectUrl?: boolean;
|
| 476 |
removeImages?: boolean | 'src';
|
| 477 |
customRules?: { [k: string]: Rule; };
|
| 478 |
+
customKeep?: Filter
|
| 479 |
}) {
|
| 480 |
const turnDownService = new TurndownService({
|
| 481 |
codeBlockStyle: 'fenced',
|
| 482 |
preformattedCode: true,
|
| 483 |
} as any);
|
| 484 |
+
if (options?.customKeep) {
|
| 485 |
+
turnDownService.keep(options.customKeep);
|
| 486 |
+
}
|
| 487 |
if (!options?.noRules) {
|
| 488 |
turnDownService.addRule('remove-irrelevant', {
|
| 489 |
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
|
|
|
|
| 600 |
return true;
|
| 601 |
}
|
| 602 |
|
| 603 |
+
if (!this.threadLocal.get('noGfm') && content.includes('<table') && content.includes('</table>')) {
|
| 604 |
if (node?.textContent && content.length > node.textContent.length * 0.8) {
|
| 605 |
return true;
|
| 606 |
}
|