Spaces:
Build error
Build error
feat: x-remove-selector
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -27,7 +27,8 @@ import { DomainBlockade } from '../db/domain-blockade';
|
|
| 27 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 28 |
|
| 29 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 30 |
-
targetSelector?: string;
|
|
|
|
| 31 |
}
|
| 32 |
|
| 33 |
export interface FormattedPage {
|
|
@@ -131,12 +132,15 @@ export class CrawlerHost extends RPCHost {
|
|
| 131 |
return indexObject;
|
| 132 |
}
|
| 133 |
|
| 134 |
-
getTurndown(
|
|
|
|
|
|
|
|
|
|
| 135 |
const turnDownService = new TurndownService({
|
| 136 |
codeBlockStyle: 'fenced',
|
| 137 |
preformattedCode: true,
|
| 138 |
} as any);
|
| 139 |
-
if (!noRules) {
|
| 140 |
turnDownService.addRule('remove-irrelevant', {
|
| 141 |
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
|
| 142 |
replacement: () => ''
|
|
@@ -177,7 +181,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 177 |
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
| 178 |
|
| 179 |
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
| 183 |
}
|
|
@@ -317,7 +328,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 317 |
}
|
| 318 |
|
| 319 |
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
| 320 |
-
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown(
|
| 321 |
for (const plugin of this.turnDownPlugins) {
|
| 322 |
turnDownService = turnDownService.use(plugin);
|
| 323 |
}
|
|
@@ -380,7 +391,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 380 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 381 |
} catch (err) {
|
| 382 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 383 |
-
const vanillaTurnDownService = this.getTurndown();
|
| 384 |
try {
|
| 385 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 386 |
} catch (err2) {
|
|
@@ -397,7 +408,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 397 |
contentText = turnDownService.turndown(snapshot.html);
|
| 398 |
} catch (err) {
|
| 399 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 400 |
-
const vanillaTurnDownService = this.getTurndown();
|
| 401 |
try {
|
| 402 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 403 |
} catch (err2) {
|
|
@@ -799,22 +810,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 799 |
return r;
|
| 800 |
}
|
| 801 |
|
| 802 |
-
async *
|
| 803 |
let cache;
|
| 804 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 805 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 806 |
}
|
| 807 |
|
| 808 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 809 |
-
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts
|
| 810 |
|
| 811 |
return;
|
| 812 |
}
|
| 813 |
|
| 814 |
try {
|
| 815 |
-
if (crawlOpts?.targetSelector) {
|
| 816 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 817 |
-
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts
|
| 818 |
}
|
| 819 |
|
| 820 |
return;
|
|
@@ -824,7 +835,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 824 |
} catch (err: any) {
|
| 825 |
if (cache) {
|
| 826 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 827 |
-
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts
|
| 828 |
return;
|
| 829 |
}
|
| 830 |
throw err;
|
|
@@ -853,7 +864,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 853 |
}
|
| 854 |
|
| 855 |
|
| 856 |
-
async *
|
| 857 |
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
| 858 |
|
| 859 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
|
@@ -910,8 +921,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 910 |
proxyUrl: opts.proxyUrl,
|
| 911 |
cookies: opts.setCookies,
|
| 912 |
favorScreenshot: opts.respondWith === 'screenshot',
|
| 913 |
-
|
| 914 |
targetSelector: opts.targetSelector,
|
|
|
|
| 915 |
overrideUserAgent: opts.userAgent,
|
| 916 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 917 |
};
|
|
|
|
| 27 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 28 |
|
| 29 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 30 |
+
targetSelector?: string | string[];
|
| 31 |
+
removeSelector?: string | string[];
|
| 32 |
}
|
| 33 |
|
| 34 |
export interface FormattedPage {
|
|
|
|
| 132 |
return indexObject;
|
| 133 |
}
|
| 134 |
|
| 135 |
+
getTurndown(options?: {
|
| 136 |
+
noRules?: boolean | string,
|
| 137 |
+
url?: string | URL;
|
| 138 |
+
}) {
|
| 139 |
const turnDownService = new TurndownService({
|
| 140 |
codeBlockStyle: 'fenced',
|
| 141 |
preformattedCode: true,
|
| 142 |
} as any);
|
| 143 |
+
if (!options?.noRules) {
|
| 144 |
turnDownService.addRule('remove-irrelevant', {
|
| 145 |
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
|
| 146 |
replacement: () => ''
|
|
|
|
| 181 |
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
| 182 |
|
| 183 |
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
| 184 |
+
let fixedHref = href.replace(/\s+/g, '').trim();
|
| 185 |
+
if (options?.url) {
|
| 186 |
+
try {
|
| 187 |
+
fixedHref = new URL(fixedHref, options.url).toString();
|
| 188 |
+
} catch (_err) {
|
| 189 |
+
void 0;
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
|
| 193 |
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
| 194 |
}
|
|
|
|
| 328 |
}
|
| 329 |
|
| 330 |
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
| 331 |
+
let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
|
| 332 |
for (const plugin of this.turnDownPlugins) {
|
| 333 |
turnDownService = turnDownService.use(plugin);
|
| 334 |
}
|
|
|
|
| 391 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 392 |
} catch (err) {
|
| 393 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 394 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
| 395 |
try {
|
| 396 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 397 |
} catch (err2) {
|
|
|
|
| 408 |
contentText = turnDownService.turndown(snapshot.html);
|
| 409 |
} catch (err) {
|
| 410 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 411 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
| 412 |
try {
|
| 413 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 414 |
} catch (err2) {
|
|
|
|
| 810 |
return r;
|
| 811 |
}
|
| 812 |
|
| 813 |
+
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
| 814 |
let cache;
|
| 815 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 816 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 817 |
}
|
| 818 |
|
| 819 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 820 |
+
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 821 |
|
| 822 |
return;
|
| 823 |
}
|
| 824 |
|
| 825 |
try {
|
| 826 |
+
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
|
| 827 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 828 |
+
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
| 829 |
}
|
| 830 |
|
| 831 |
return;
|
|
|
|
| 835 |
} catch (err: any) {
|
| 836 |
if (cache) {
|
| 837 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 838 |
+
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 839 |
return;
|
| 840 |
}
|
| 841 |
throw err;
|
|
|
|
| 864 |
}
|
| 865 |
|
| 866 |
|
| 867 |
+
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
|
| 868 |
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
| 869 |
|
| 870 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
|
|
|
| 921 |
proxyUrl: opts.proxyUrl,
|
| 922 |
cookies: opts.setCookies,
|
| 923 |
favorScreenshot: opts.respondWith === 'screenshot',
|
| 924 |
+
removeSelector: opts.removeSelector,
|
| 925 |
targetSelector: opts.targetSelector,
|
| 926 |
+
waitForSelector: opts.waitForSelector,
|
| 927 |
overrideUserAgent: opts.userAgent,
|
| 928 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 929 |
};
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -8,11 +8,10 @@ import { singleton } from 'tsyringe';
|
|
| 8 |
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
-
import { ScrappingOptions } from '../services/puppeteer';
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 14 |
import { BraveSearchService } from '../services/brave-search';
|
| 15 |
-
import { CrawlerHost, FormattedPage } from './crawler';
|
| 16 |
import { CookieParam } from 'puppeteer';
|
| 17 |
|
| 18 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
@@ -304,7 +303,7 @@ export class SearcherHost extends RPCHost {
|
|
| 304 |
async *fetchSearchResults(
|
| 305 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 306 |
searchResults?: WebSearchResult[],
|
| 307 |
-
options?:
|
| 308 |
pageCacheTolerance?: number
|
| 309 |
) {
|
| 310 |
if (!searchResults) {
|
|
|
|
| 8 |
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
|
|
|
| 11 |
import { Request, Response } from 'express';
|
| 12 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 13 |
import { BraveSearchService } from '../services/brave-search';
|
| 14 |
+
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
|
| 15 |
import { CookieParam } from 'puppeteer';
|
| 16 |
|
| 17 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
|
|
| 303 |
async *fetchSearchResults(
|
| 304 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 305 |
searchResults?: WebSearchResult[],
|
| 306 |
+
options?: ExtraScrappingOptions,
|
| 307 |
pageCacheTolerance?: number
|
| 308 |
) {
|
| 309 |
if (!searchResults) {
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -53,6 +53,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 53 |
in: 'header',
|
| 54 |
schema: { type: 'string' }
|
| 55 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
'X-Proxy-Url': {
|
| 57 |
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
| 58 |
`Supported protocols: \n` +
|
|
@@ -130,11 +137,14 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 130 |
@Prop()
|
| 131 |
cacheTolerance?: number;
|
| 132 |
|
| 133 |
-
@Prop()
|
| 134 |
-
targetSelector?: string;
|
| 135 |
|
| 136 |
-
@Prop()
|
| 137 |
-
waitForSelector?: string;
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
@Prop({
|
| 140 |
arrayOf: String,
|
|
@@ -193,15 +203,17 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 193 |
instance.timeout = timeoutSeconds;
|
| 194 |
}
|
| 195 |
|
| 196 |
-
const
|
|
|
|
|
|
|
| 197 |
instance.targetSelector ??= targetSelector;
|
| 198 |
-
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
| 199 |
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
| 200 |
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
| 201 |
instance.userAgent ??= overrideUserAgent;
|
| 202 |
|
| 203 |
const cookies: CookieParam[] = [];
|
| 204 |
-
const setCookieHeaders = ctx?.req.
|
| 205 |
if (Array.isArray(setCookieHeaders)) {
|
| 206 |
for (const setCookie of setCookieHeaders) {
|
| 207 |
cookies.push({
|
|
|
|
| 53 |
in: 'header',
|
| 54 |
schema: { type: 'string' }
|
| 55 |
},
|
| 56 |
+
'X-Remove-Selector': {
|
| 57 |
+
description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
|
| 58 |
+
'Example `X-Remove-Selector: nav`'
|
| 59 |
+
,
|
| 60 |
+
in: 'header',
|
| 61 |
+
schema: { type: 'string' }
|
| 62 |
+
},
|
| 63 |
'X-Proxy-Url': {
|
| 64 |
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
| 65 |
`Supported protocols: \n` +
|
|
|
|
| 137 |
@Prop()
|
| 138 |
cacheTolerance?: number;
|
| 139 |
|
| 140 |
+
@Prop({ arrayOf: String })
|
| 141 |
+
targetSelector?: string | string[];
|
| 142 |
|
| 143 |
+
@Prop({ arrayOf: String })
|
| 144 |
+
waitForSelector?: string | string[];
|
| 145 |
+
|
| 146 |
+
@Prop({ arrayOf: String })
|
| 147 |
+
removeSelector?: string | string[];
|
| 148 |
|
| 149 |
@Prop({
|
| 150 |
arrayOf: String,
|
|
|
|
| 203 |
instance.timeout = timeoutSeconds;
|
| 204 |
}
|
| 205 |
|
| 206 |
+
const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
|
| 207 |
+
instance.removeSelector ??= removeSelector;
|
| 208 |
+
const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
|
| 209 |
instance.targetSelector ??= targetSelector;
|
| 210 |
+
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
|
| 211 |
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
| 212 |
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
| 213 |
instance.userAgent ??= overrideUserAgent;
|
| 214 |
|
| 215 |
const cookies: CookieParam[] = [];
|
| 216 |
+
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
| 217 |
if (Array.isArray(setCookieHeaders)) {
|
| 218 |
for (const setCookie of setCookieHeaders) {
|
| 219 |
cookies.push({
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -63,7 +63,7 @@ export interface ScrappingOptions {
|
|
| 63 |
proxyUrl?: string;
|
| 64 |
cookies?: CookieParam[];
|
| 65 |
favorScreenshot?: boolean;
|
| 66 |
-
waitForSelector?: string;
|
| 67 |
minIntervalMs?: number;
|
| 68 |
overrideUserAgent?: string;
|
| 69 |
timeoutMs?: number;
|
|
@@ -483,7 +483,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 483 |
);
|
| 484 |
});
|
| 485 |
if (options?.waitForSelector) {
|
| 486 |
-
page.waitForSelector(options.waitForSelector)
|
|
|
|
| 487 |
.then(async () => {
|
| 488 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 489 |
screenshot = await page.screenshot();
|
|
@@ -547,8 +548,11 @@ document.addEventListener('load', handlePageLoad);
|
|
| 547 |
return true;
|
| 548 |
}
|
| 549 |
|
| 550 |
-
narrowSnapshot(snapshot: PageSnapshot | undefined,
|
| 551 |
-
|
|
|
|
|
|
|
|
|
|
| 552 |
return snapshot;
|
| 553 |
}
|
| 554 |
if (!snapshot?.html) {
|
|
@@ -556,26 +560,68 @@ document.addEventListener('load', handlePageLoad);
|
|
| 556 |
}
|
| 557 |
|
| 558 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 559 |
-
const
|
| 560 |
|
| 561 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
return snapshot;
|
| 563 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
|
| 565 |
-
const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole });
|
| 566 |
let parsed;
|
| 567 |
try {
|
| 568 |
-
parsed = new Readability(
|
| 569 |
} catch (err: any) {
|
| 570 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 571 |
}
|
| 572 |
|
| 573 |
// No innerText in jsdom
|
| 574 |
// https://github.com/jsdom/jsdom/issues/1245
|
| 575 |
-
const textContent =
|
| 576 |
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
| 577 |
|
| 578 |
-
const imageTags = Array.from(
|
| 579 |
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
| 580 |
.flat()
|
| 581 |
.map((x) => {
|
|
@@ -592,7 +638,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 592 |
const r = {
|
| 593 |
...snapshot,
|
| 594 |
parsed,
|
| 595 |
-
html:
|
| 596 |
text: cleanedText,
|
| 597 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 598 |
} as PageSnapshot;
|
|
|
|
| 63 |
proxyUrl?: string;
|
| 64 |
cookies?: CookieParam[];
|
| 65 |
favorScreenshot?: boolean;
|
| 66 |
+
waitForSelector?: string | string[];
|
| 67 |
minIntervalMs?: number;
|
| 68 |
overrideUserAgent?: string;
|
| 69 |
timeoutMs?: number;
|
|
|
|
| 483 |
);
|
| 484 |
});
|
| 485 |
if (options?.waitForSelector) {
|
| 486 |
+
const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
|
| 487 |
+
waitPromise
|
| 488 |
.then(async () => {
|
| 489 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 490 |
screenshot = await page.screenshot();
|
|
|
|
| 548 |
return true;
|
| 549 |
}
|
| 550 |
|
| 551 |
+
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 552 |
+
targetSelector?: string | string[];
|
| 553 |
+
removeSelector?: string | string[];
|
| 554 |
+
}): PageSnapshot | undefined {
|
| 555 |
+
if (!options?.targetSelector && !options?.removeSelector) {
|
| 556 |
return snapshot;
|
| 557 |
}
|
| 558 |
if (!snapshot?.html) {
|
|
|
|
| 560 |
}
|
| 561 |
|
| 562 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 563 |
+
const allNodes: Node[] = [];
|
| 564 |
|
| 565 |
+
if (Array.isArray(options.removeSelector)) {
|
| 566 |
+
for (const rl of options.removeSelector) {
|
| 567 |
+
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 568 |
+
}
|
| 569 |
+
} else if (options.removeSelector) {
|
| 570 |
+
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
if (Array.isArray(options.targetSelector)) {
|
| 574 |
+
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 575 |
+
x.forEach((el) => {
|
| 576 |
+
if (!allNodes.includes(el)) {
|
| 577 |
+
allNodes.push(el);
|
| 578 |
+
}
|
| 579 |
+
});
|
| 580 |
+
}
|
| 581 |
+
} else if (options.targetSelector) {
|
| 582 |
+
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 583 |
+
if (!allNodes.includes(el)) {
|
| 584 |
+
allNodes.push(el);
|
| 585 |
+
}
|
| 586 |
+
});
|
| 587 |
+
} else {
|
| 588 |
+
allNodes.push(jsdom.window.document);
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
if (!allNodes.length) {
|
| 592 |
return snapshot;
|
| 593 |
}
|
| 594 |
+
const textChunks: string[] = [];
|
| 595 |
+
let rootDoc: Document;
|
| 596 |
+
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 597 |
+
rootDoc = allNodes[0] as any;
|
| 598 |
+
if (rootDoc.body.textContent) {
|
| 599 |
+
textChunks.push(rootDoc.body.textContent);
|
| 600 |
+
}
|
| 601 |
+
} else {
|
| 602 |
+
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
|
| 603 |
+
for (const n of allNodes) {
|
| 604 |
+
rootDoc.body.appendChild(n);
|
| 605 |
+
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 606 |
+
if (n.textContent) {
|
| 607 |
+
textChunks.push(n.textContent);
|
| 608 |
+
}
|
| 609 |
+
}
|
| 610 |
+
}
|
| 611 |
|
|
|
|
| 612 |
let parsed;
|
| 613 |
try {
|
| 614 |
+
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
| 615 |
} catch (err: any) {
|
| 616 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 617 |
}
|
| 618 |
|
| 619 |
// No innerText in jsdom
|
| 620 |
// https://github.com/jsdom/jsdom/issues/1245
|
| 621 |
+
const textContent = textChunks.join('\n\n');
|
| 622 |
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
| 623 |
|
| 624 |
+
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
| 625 |
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
| 626 |
.flat()
|
| 627 |
.map((x) => {
|
|
|
|
| 638 |
const r = {
|
| 639 |
...snapshot,
|
| 640 |
parsed,
|
| 641 |
+
html: rootDoc.documentElement.outerHTML,
|
| 642 |
text: cleanedText,
|
| 643 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 644 |
} as PageSnapshot;
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit b30155da82ea8e311faab58bb5a360e829547ea0
|