Spaces:
Build error
Build error
fix: image in summary
Browse files
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
-
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
|
@@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService {
|
|
| 144 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 145 |
}
|
| 146 |
|
| 147 |
-
const
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
.map((x) =>
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
}
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
-
const imageSet = new Set(imageTags);
|
| 160 |
const r = {
|
| 161 |
...snapshot,
|
| 162 |
title: snapshot.title || jsdom.window.document.title,
|
|
@@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService {
|
|
| 165 |
parsed,
|
| 166 |
html: rootDoc.documentElement.outerHTML,
|
| 167 |
text: textChunks.join('\n'),
|
| 168 |
-
imgs: snapshot.imgs?.filter((x) =>
|
| 169 |
} as PageSnapshot;
|
| 170 |
|
| 171 |
const dt = Date.now() - t0;
|
|
@@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService {
|
|
| 283 |
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
| 284 |
}
|
| 285 |
|
| 286 |
-
jsdom.window.document.querySelectorAll('*').forEach((x)=> {
|
| 287 |
const attrs = x.getAttributeNames();
|
| 288 |
for (const attr of attrs) {
|
| 289 |
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
|
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
+
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
|
|
|
| 144 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 145 |
}
|
| 146 |
|
| 147 |
+
const imgSet = new Set<string>();
|
| 148 |
+
const rebuiltImgs: ImgBrief[] = [];
|
| 149 |
+
Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
| 150 |
+
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
|
| 151 |
+
.forEach(([u1, u2, alt]) => {
|
| 152 |
+
if (u1) {
|
| 153 |
+
try {
|
| 154 |
+
const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
|
| 155 |
+
imgSet.add(u1Txt);
|
| 156 |
+
} catch (err) {
|
| 157 |
+
// void 0;
|
| 158 |
+
}
|
| 159 |
}
|
| 160 |
+
if (u2) {
|
| 161 |
+
try {
|
| 162 |
+
const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
|
| 163 |
+
imgSet.add(u2Txt);
|
| 164 |
+
} catch (err) {
|
| 165 |
+
// void 0;
|
| 166 |
+
}
|
| 167 |
+
}
|
| 168 |
+
rebuiltImgs.push({
|
| 169 |
+
src: u1 || u2,
|
| 170 |
+
alt
|
| 171 |
+
});
|
| 172 |
+
});
|
| 173 |
|
|
|
|
| 174 |
const r = {
|
| 175 |
...snapshot,
|
| 176 |
title: snapshot.title || jsdom.window.document.title,
|
|
|
|
| 179 |
parsed,
|
| 180 |
html: rootDoc.documentElement.outerHTML,
|
| 181 |
text: textChunks.join('\n'),
|
| 182 |
+
imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [],
|
| 183 |
} as PageSnapshot;
|
| 184 |
|
| 185 |
const dt = Date.now() - t0;
|
|
|
|
| 297 |
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
| 298 |
}
|
| 299 |
|
| 300 |
+
jsdom.window.document.querySelectorAll('*').forEach((x) => {
|
| 301 |
const attrs = x.getAttributeNames();
|
| 302 |
for (const attr of attrs) {
|
| 303 |
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 231 |
if (imageRetention === 'alt') {
|
| 232 |
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
| 233 |
}
|
| 234 |
-
let
|
|
|
|
| 235 |
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
|
| 236 |
if (!linkPreferredSrc && maybeSrcSet) {
|
| 237 |
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
|
|
@@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 252 |
if (!src) {
|
| 253 |
return '';
|
| 254 |
}
|
| 255 |
-
const mapped = urlToAltMap[
|
| 256 |
const imgSerial = ++imgIdx;
|
| 257 |
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
| 258 |
idxArr.push(imgSerial);
|
|
@@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 303 |
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
| 304 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 305 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
|
|
| 306 |
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
| 307 |
|
| 308 |
// If Readability did its job
|
| 309 |
if (par2.length >= 0.3 * par1.length) {
|
| 310 |
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
|
|
|
| 311 |
if (snapshot.parsed.content) {
|
| 312 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 313 |
}
|
|
@@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 336 |
if (toBeTurnedToMd) {
|
| 337 |
try {
|
| 338 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
|
|
|
| 339 |
} catch (err) {
|
| 340 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 341 |
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 342 |
try {
|
| 343 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
|
|
|
| 344 |
} catch (err2) {
|
| 345 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 346 |
}
|
|
@@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 354 |
toBeTurnedToMd = jsDomElementOfHTML;
|
| 355 |
try {
|
| 356 |
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
|
|
|
| 357 |
} catch (err) {
|
| 358 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 359 |
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 360 |
try {
|
| 361 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
|
|
|
| 362 |
} catch (err2) {
|
| 363 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 364 |
}
|
|
@@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 393 |
.toPairs()
|
| 394 |
.map(
|
| 395 |
([url, alt], i) => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
| 397 |
}
|
| 398 |
).fromPairs()
|
|
|
|
| 231 |
if (imageRetention === 'alt') {
|
| 232 |
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
| 233 |
}
|
| 234 |
+
let originalSrc = (node.getAttribute('src') || '').trim();
|
| 235 |
+
let linkPreferredSrc = originalSrc;
|
| 236 |
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
|
| 237 |
if (!linkPreferredSrc && maybeSrcSet) {
|
| 238 |
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
|
|
|
|
| 253 |
if (!src) {
|
| 254 |
return '';
|
| 255 |
}
|
| 256 |
+
const mapped = urlToAltMap[originalSrc];
|
| 257 |
const imgSerial = ++imgIdx;
|
| 258 |
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
| 259 |
idxArr.push(imgSerial);
|
|
|
|
| 304 |
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
| 305 |
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 306 |
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
| 307 |
+
imgIdx = 0;
|
| 308 |
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
| 309 |
|
| 310 |
// If Readability did its job
|
| 311 |
if (par2.length >= 0.3 * par1.length) {
|
| 312 |
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
| 313 |
+
imgIdx = 0;
|
| 314 |
if (snapshot.parsed.content) {
|
| 315 |
toBeTurnedToMd = jsDomElementOfParsed;
|
| 316 |
}
|
|
|
|
| 339 |
if (toBeTurnedToMd) {
|
| 340 |
try {
|
| 341 |
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 342 |
+
imgIdx = 0;
|
| 343 |
} catch (err) {
|
| 344 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 345 |
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 346 |
try {
|
| 347 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 348 |
+
imgIdx = 0;
|
| 349 |
} catch (err2) {
|
| 350 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 351 |
}
|
|
|
|
| 359 |
toBeTurnedToMd = jsDomElementOfHTML;
|
| 360 |
try {
|
| 361 |
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
| 362 |
+
imgIdx = 0;
|
| 363 |
} catch (err) {
|
| 364 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 365 |
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
| 366 |
try {
|
| 367 |
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
| 368 |
+
imgIdx = 0;
|
| 369 |
} catch (err2) {
|
| 370 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 371 |
}
|
|
|
|
| 400 |
.toPairs()
|
| 401 |
.map(
|
| 402 |
([url, alt], i) => {
|
| 403 |
+
if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
|
| 404 |
+
const refUrl = new URL(formatted.url!);
|
| 405 |
+
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
|
| 406 |
+
|
| 407 |
+
url = mappedUrl.toString();
|
| 408 |
+
}
|
| 409 |
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
| 410 |
}
|
| 411 |
).fromPairs()
|