nomagick committed on
Commit
008dcba
·
unverified ·
1 Parent(s): fc2824b

fix: image in summary

Browse files
backend/functions/src/services/jsdom.ts CHANGED
@@ -1,7 +1,7 @@
1
  import { container, singleton } from 'tsyringe';
2
  import { AsyncService, marshalErrorLike } from 'civkit';
3
  import { Logger } from '../shared/services/logger';
4
- import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
5
  import { Readability } from '@mozilla/readability';
6
  import TurndownService from 'turndown';
7
  import { Threaded } from '../shared/services/threaded';
@@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService {
144
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
145
  }
146
 
147
- const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
148
- .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
149
- .flat()
150
- .map((x) => {
151
- try {
152
- return new URL(x, snapshot.rebase || snapshot.href).toString();
153
- } catch (err) {
154
- return null;
 
 
 
 
155
  }
156
- })
157
- .filter(Boolean);
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- const imageSet = new Set(imageTags);
160
  const r = {
161
  ...snapshot,
162
  title: snapshot.title || jsdom.window.document.title,
@@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService {
165
  parsed,
166
  html: rootDoc.documentElement.outerHTML,
167
  text: textChunks.join('\n'),
168
- imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
169
  } as PageSnapshot;
170
 
171
  const dt = Date.now() - t0;
@@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService {
283
  currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
284
  }
285
 
286
- jsdom.window.document.querySelectorAll('*').forEach((x)=> {
287
  const attrs = x.getAttributeNames();
288
  for (const attr of attrs) {
289
  if (attr.startsWith('data-') || attr.startsWith('aria-')) {
 
1
  import { container, singleton } from 'tsyringe';
2
  import { AsyncService, marshalErrorLike } from 'civkit';
3
  import { Logger } from '../shared/services/logger';
4
+ import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
5
  import { Readability } from '@mozilla/readability';
6
  import TurndownService from 'turndown';
7
  import { Threaded } from '../shared/services/threaded';
 
144
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
145
  }
146
 
147
+ const imgSet = new Set<string>();
148
+ const rebuiltImgs: ImgBrief[] = [];
149
+ Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
150
+ .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
151
+ .forEach(([u1, u2, alt]) => {
152
+ if (u1) {
153
+ try {
154
+ const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
155
+ imgSet.add(u1Txt);
156
+ } catch (err) {
157
+ // void 0;
158
+ }
159
  }
160
+ if (u2) {
161
+ try {
162
+ const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
163
+ imgSet.add(u2Txt);
164
+ } catch (err) {
165
+ // void 0;
166
+ }
167
+ }
168
+ rebuiltImgs.push({
169
+ src: u1 || u2,
170
+ alt
171
+ });
172
+ });
173
 
 
174
  const r = {
175
  ...snapshot,
176
  title: snapshot.title || jsdom.window.document.title,
 
179
  parsed,
180
  html: rootDoc.documentElement.outerHTML,
181
  text: textChunks.join('\n'),
182
+ imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [],
183
  } as PageSnapshot;
184
 
185
  const dt = Date.now() - t0;
 
297
  currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
298
  }
299
 
300
+ jsdom.window.document.querySelectorAll('*').forEach((x) => {
301
  const attrs = x.getAttributeNames();
302
  for (const attr of attrs) {
303
  if (attr.startsWith('data-') || attr.startsWith('aria-')) {
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService {
231
  if (imageRetention === 'alt') {
232
  return alt ? `(Image ${++imgIdx}: ${alt})` : '';
233
  }
234
- let linkPreferredSrc = (node.getAttribute('src') || '').trim();
 
235
  const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
236
  if (!linkPreferredSrc && maybeSrcSet) {
237
  linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
@@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService {
252
  if (!src) {
253
  return '';
254
  }
255
- const mapped = urlToAltMap[src];
256
  const imgSerial = ++imgIdx;
257
  const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
258
  idxArr.push(imgSerial);
@@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService {
303
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
304
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
305
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
 
306
  const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
307
 
308
  // If Readability did its job
309
  if (par2.length >= 0.3 * par1.length) {
310
  turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
 
311
  if (snapshot.parsed.content) {
312
  toBeTurnedToMd = jsDomElementOfParsed;
313
  }
@@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService {
336
  if (toBeTurnedToMd) {
337
  try {
338
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
 
339
  } catch (err) {
340
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
341
  const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
342
  try {
343
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
 
344
  } catch (err2) {
345
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
346
  }
@@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService {
354
  toBeTurnedToMd = jsDomElementOfHTML;
355
  try {
356
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
 
357
  } catch (err) {
358
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
359
  const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
360
  try {
361
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
 
362
  } catch (err2) {
363
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
364
  }
@@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService {
393
  .toPairs()
394
  .map(
395
  ([url, alt], i) => {
 
 
 
 
 
 
396
  return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
397
  }
398
  ).fromPairs()
 
231
  if (imageRetention === 'alt') {
232
  return alt ? `(Image ${++imgIdx}: ${alt})` : '';
233
  }
234
+ let originalSrc = (node.getAttribute('src') || '').trim();
235
+ let linkPreferredSrc = originalSrc;
236
  const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
237
  if (!linkPreferredSrc && maybeSrcSet) {
238
  linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
 
253
  if (!src) {
254
  return '';
255
  }
256
+ const mapped = urlToAltMap[originalSrc];
257
  const imgSerial = ++imgIdx;
258
  const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
259
  idxArr.push(imgSerial);
 
304
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
305
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
306
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
307
+ imgIdx = 0;
308
  const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
309
 
310
  // If Readability did its job
311
  if (par2.length >= 0.3 * par1.length) {
312
  turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
313
+ imgIdx = 0;
314
  if (snapshot.parsed.content) {
315
  toBeTurnedToMd = jsDomElementOfParsed;
316
  }
 
339
  if (toBeTurnedToMd) {
340
  try {
341
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
342
+ imgIdx = 0;
343
  } catch (err) {
344
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
345
  const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
346
  try {
347
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
348
+ imgIdx = 0;
349
  } catch (err2) {
350
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
351
  }
 
359
  toBeTurnedToMd = jsDomElementOfHTML;
360
  try {
361
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
362
+ imgIdx = 0;
363
  } catch (err) {
364
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
365
  const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
366
  try {
367
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
368
+ imgIdx = 0;
369
  } catch (err2) {
370
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
371
  }
 
400
  .toPairs()
401
  .map(
402
  ([url, alt], i) => {
403
+ if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
404
+ const refUrl = new URL(formatted.url!);
405
+ const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
406
+
407
+ url = mappedUrl.toString();
408
+ }
409
  return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
410
  }
411
  ).fromPairs()