nomagick committed on
Commit
62dc75f
·
unverified ·
1 Parent(s): 8cfd0d6

fix: consider image data-src and make generated alt text optional (#50)

Browse files

* fix: image src and alt

* fix

* docs: doc about x-with-generated-alt

* fix: deps

README.md CHANGED
@@ -72,6 +72,7 @@ As you have already seen above, one can control the behavior of the Reader API u
72
  - `x-respond-with: screenshot` returns the URL of the webpage's screenshot
73
  - You can specify a proxy server via the `x-proxy-url` header.
74
  - You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
 
75
 
76
  ### JSON mode (super early beta)
77
 
 
72
  - `x-respond-with: screenshot` returns the URL of the webpage's screenshot
73
  - You can specify a proxy server via the `x-proxy-url` header.
74
  - You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
75
+ - You can enable the alt-text generation feature via the `x-with-generated-alt` header.
76
 
77
  ### JSON mode (super early beta)
78
 
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
  AssertionFailureError, ParamValidationError,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
- import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -41,6 +41,7 @@ export class CrawlerHost extends RPCHost {
41
  protected altTextService: AltTextService,
42
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
43
  protected rateLimitControl: RateLimitControl,
 
44
  ) {
45
  super(...arguments);
46
 
@@ -123,8 +124,8 @@ export class CrawlerHost extends RPCHost {
123
  turnDownService = turnDownService.use(plugin);
124
  }
125
  const urlToAltMap: { [k: string]: string | undefined; } = {};
126
- if (snapshot.imgs?.length) {
127
- const tasks = (snapshot.imgs || []).map(async (x) => {
128
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
129
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
130
  return undefined;
@@ -140,7 +141,15 @@ export class CrawlerHost extends RPCHost {
140
  turnDownService.addRule('img-generated-alt', {
141
  filter: 'img',
142
  replacement: (_content, node) => {
143
- const src = (node.getAttribute('src') || '').trim();
 
 
 
 
 
 
 
 
144
  const alt = cleanAttribute(node.getAttribute('alt'));
145
  if (!src) {
146
  return '';
@@ -285,6 +294,11 @@ ${this.content}
285
  in: 'header',
286
  schema: { type: 'string' }
287
  },
 
 
 
 
 
288
  }
289
  }
290
  },
@@ -365,6 +379,7 @@ ${authMixin}`,
365
  }
366
 
367
  const customMode = ctx.req.get('x-respond-with') || 'default';
 
368
  const noCache = Boolean(ctx.req.get('x-no-cache'));
369
  const cookies: CookieParam[] = [];
370
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
@@ -381,6 +396,7 @@ ${authMixin}`,
381
  domain: urlToCrawl.hostname,
382
  });
383
  }
 
384
 
385
  const crawlOpts: ScrappingOptions = {
386
  proxyUrl: ctx.req.get('x-proxy-url'),
 
5
  AssertionFailureError, ParamValidationError,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
+ import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 
41
  protected altTextService: AltTextService,
42
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
43
  protected rateLimitControl: RateLimitControl,
44
+ protected threadLocal: AsyncContext,
45
  ) {
46
  super(...arguments);
47
 
 
124
  turnDownService = turnDownService.use(plugin);
125
  }
126
  const urlToAltMap: { [k: string]: string | undefined; } = {};
127
+ if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
128
+ const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
129
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
130
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
131
  return undefined;
 
141
  turnDownService.addRule('img-generated-alt', {
142
  filter: 'img',
143
  replacement: (_content, node) => {
144
+ let linkPreferredSrc = (node.getAttribute('src') || '').trim();
145
+ if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
146
+ const dataSrc = (node.getAttribute('data-src') || '').trim();
147
+ if (dataSrc && !dataSrc.startsWith('data:')) {
148
+ linkPreferredSrc = dataSrc;
149
+ }
150
+ }
151
+
152
+ const src = linkPreferredSrc;
153
  const alt = cleanAttribute(node.getAttribute('alt'));
154
  if (!src) {
155
  return '';
 
294
  in: 'header',
295
  schema: { type: 'string' }
296
  },
297
+ 'X-With-Generated-Alt': {
298
+ description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
299
+ in: 'header',
300
+ schema: { type: 'string' }
301
+ },
302
  }
303
  }
304
  },
 
379
  }
380
 
381
  const customMode = ctx.req.get('x-respond-with') || 'default';
382
+ const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
383
  const noCache = Boolean(ctx.req.get('x-no-cache'));
384
  const cookies: CookieParam[] = [];
385
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
 
396
  domain: urlToCrawl.hostname,
397
  });
398
  }
399
+ this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
400
 
401
  const crawlOpts: ScrappingOptions = {
402
  proxyUrl: ctx.req.get('x-proxy-url'),
backend/functions/src/services/alt-text.ts CHANGED
@@ -12,6 +12,7 @@ const md5Hasher = new HashManager('md5', 'hex');
12
  @singleton()
13
  export class AltTextService extends AsyncService {
14
 
 
15
  logger = this.globalLogger.child({ service: this.constructor.name });
16
 
17
  constructor(
@@ -48,7 +49,7 @@ export class AltTextService extends AsyncService {
48
  if (!imgBrief.src) {
49
  return undefined;
50
  }
51
- if (imgBrief.alt) {
52
  return imgBrief.alt;
53
  }
54
  const digest = md5Hasher.hash(imgBrief.src);
 
12
  @singleton()
13
  export class AltTextService extends AsyncService {
14
 
15
+ altsToIgnore = 'image,img,photo,picture,pic,alt,figure,fig'.split(',');
16
  logger = this.globalLogger.child({ service: this.constructor.name });
17
 
18
  constructor(
 
49
  if (!imgBrief.src) {
50
  return undefined;
51
  }
52
+ if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) {
53
  return imgBrief.alt;
54
  }
55
  const digest = md5Hasher.hash(imgBrief.src);
backend/functions/src/services/puppeteer.ts CHANGED
@@ -193,17 +193,26 @@ export class PuppeteerControl extends AsyncService {
193
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
194
  preparations.push(page.evaluateOnNewDocument(`
195
  function briefImgs(elem) {
196
- const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
197
-
198
- return imageTags.map((x)=> ({
199
- src: x.src,
200
- loaded: x.complete,
201
- width: x.width,
202
- height: x.height,
203
- naturalWidth: x.naturalWidth,
204
- naturalHeight: x.naturalHeight,
205
- alt: x.alt || x.title,
206
- }));
 
 
 
 
 
 
 
 
 
207
  }
208
  function giveSnapshot() {
209
  let parsed;
 
193
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
194
  preparations.push(page.evaluateOnNewDocument(`
195
  function briefImgs(elem) {
196
+ const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
197
+
198
+ return imageTags.map((x)=> {
199
+ let linkPreferredSrc = x.src;
200
+ if (linkPreferredSrc.startsWith('data:')) {
201
+ if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
202
+ linkPreferredSrc = x.dataset.src;
203
+ }
204
+ }
205
+
206
+ return {
207
+ src: linkPreferredSrc,
208
+ loaded: x.complete,
209
+ width: x.width,
210
+ height: x.height,
211
+ naturalWidth: x.naturalWidth,
212
+ naturalHeight: x.naturalHeight,
213
+ alt: x.alt || x.title,
214
+ };
215
+ });
216
  }
217
  function giveSnapshot() {
218
  let parsed;