Spaces:
Build error
Build error
fix: consider image data-src and make generated alt text optional (#50)
Browse files
* fix: image src and alt
* fix
* docs: doc about x-with-generated-alt
* fix: deps
README.md
CHANGED
|
@@ -72,6 +72,7 @@ As you have already seen above, one can control the behavior of the Reader API u
|
|
| 72 |
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
| 73 |
- You can specify a proxy server via the `x-proxy-url` header.
|
| 74 |
- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
|
|
|
|
| 75 |
|
| 76 |
### JSON mode (super early beta)
|
| 77 |
|
|
|
|
| 72 |
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
| 73 |
- You can specify a proxy server via the `x-proxy-url` header.
|
| 74 |
- You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
|
| 75 |
+
- You can enable the alt-text generation feature via the `x-with-generated-alt` header.
|
| 76 |
|
| 77 |
### JSON mode (super early beta)
|
| 78 |
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
| 5 |
AssertionFailureError, ParamValidationError,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
-
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
@@ -41,6 +41,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 41 |
protected altTextService: AltTextService,
|
| 42 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 43 |
protected rateLimitControl: RateLimitControl,
|
|
|
|
| 44 |
) {
|
| 45 |
super(...arguments);
|
| 46 |
|
|
@@ -123,8 +124,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 123 |
turnDownService = turnDownService.use(plugin);
|
| 124 |
}
|
| 125 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 126 |
-
if (snapshot.imgs?.length) {
|
| 127 |
-
const tasks = (snapshot.imgs || []).map(async (x) => {
|
| 128 |
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 129 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 130 |
return undefined;
|
|
@@ -140,7 +141,15 @@ export class CrawlerHost extends RPCHost {
|
|
| 140 |
turnDownService.addRule('img-generated-alt', {
|
| 141 |
filter: 'img',
|
| 142 |
replacement: (_content, node) => {
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 145 |
if (!src) {
|
| 146 |
return '';
|
|
@@ -285,6 +294,11 @@ ${this.content}
|
|
| 285 |
in: 'header',
|
| 286 |
schema: { type: 'string' }
|
| 287 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
}
|
| 289 |
}
|
| 290 |
},
|
|
@@ -365,6 +379,7 @@ ${authMixin}`,
|
|
| 365 |
}
|
| 366 |
|
| 367 |
const customMode = ctx.req.get('x-respond-with') || 'default';
|
|
|
|
| 368 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 369 |
const cookies: CookieParam[] = [];
|
| 370 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
|
@@ -381,6 +396,7 @@ ${authMixin}`,
|
|
| 381 |
domain: urlToCrawl.hostname,
|
| 382 |
});
|
| 383 |
}
|
|
|
|
| 384 |
|
| 385 |
const crawlOpts: ScrappingOptions = {
|
| 386 |
proxyUrl: ctx.req.get('x-proxy-url'),
|
|
|
|
| 5 |
AssertionFailureError, ParamValidationError,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
+
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
|
|
| 41 |
protected altTextService: AltTextService,
|
| 42 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 43 |
protected rateLimitControl: RateLimitControl,
|
| 44 |
+
protected threadLocal: AsyncContext,
|
| 45 |
) {
|
| 46 |
super(...arguments);
|
| 47 |
|
|
|
|
| 124 |
turnDownService = turnDownService.use(plugin);
|
| 125 |
}
|
| 126 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 127 |
+
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
| 128 |
+
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 129 |
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 130 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 131 |
return undefined;
|
|
|
|
| 141 |
turnDownService.addRule('img-generated-alt', {
|
| 142 |
filter: 'img',
|
| 143 |
replacement: (_content, node) => {
|
| 144 |
+
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
| 145 |
+
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
| 146 |
+
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
| 147 |
+
if (dataSrc && !dataSrc.startsWith('data:')) {
|
| 148 |
+
linkPreferredSrc = dataSrc;
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
const src = linkPreferredSrc;
|
| 153 |
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 154 |
if (!src) {
|
| 155 |
return '';
|
|
|
|
| 294 |
in: 'header',
|
| 295 |
schema: { type: 'string' }
|
| 296 |
},
|
| 297 |
+
'X-With-Generated-Alt': {
|
| 298 |
+
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
|
| 299 |
+
in: 'header',
|
| 300 |
+
schema: { type: 'string' }
|
| 301 |
+
},
|
| 302 |
}
|
| 303 |
}
|
| 304 |
},
|
|
|
|
| 379 |
}
|
| 380 |
|
| 381 |
const customMode = ctx.req.get('x-respond-with') || 'default';
|
| 382 |
+
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 383 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 384 |
const cookies: CookieParam[] = [];
|
| 385 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
|
|
|
| 396 |
domain: urlToCrawl.hostname,
|
| 397 |
});
|
| 398 |
}
|
| 399 |
+
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 400 |
|
| 401 |
const crawlOpts: ScrappingOptions = {
|
| 402 |
proxyUrl: ctx.req.get('x-proxy-url'),
|
backend/functions/src/services/alt-text.ts
CHANGED
|
@@ -12,6 +12,7 @@ const md5Hasher = new HashManager('md5', 'hex');
|
|
| 12 |
@singleton()
|
| 13 |
export class AltTextService extends AsyncService {
|
| 14 |
|
|
|
|
| 15 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 16 |
|
| 17 |
constructor(
|
|
@@ -48,7 +49,7 @@ export class AltTextService extends AsyncService {
|
|
| 48 |
if (!imgBrief.src) {
|
| 49 |
return undefined;
|
| 50 |
}
|
| 51 |
-
if (imgBrief.alt) {
|
| 52 |
return imgBrief.alt;
|
| 53 |
}
|
| 54 |
const digest = md5Hasher.hash(imgBrief.src);
|
|
|
|
| 12 |
@singleton()
|
| 13 |
export class AltTextService extends AsyncService {
|
| 14 |
|
| 15 |
+
altsToIgnore = 'image,img,photo,picture,pic,alt,figure,fig'.split(',');
|
| 16 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 17 |
|
| 18 |
constructor(
|
|
|
|
| 49 |
if (!imgBrief.src) {
|
| 50 |
return undefined;
|
| 51 |
}
|
| 52 |
+
if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) {
|
| 53 |
return imgBrief.alt;
|
| 54 |
}
|
| 55 |
const digest = md5Hasher.hash(imgBrief.src);
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -193,17 +193,26 @@ export class PuppeteerControl extends AsyncService {
|
|
| 193 |
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
| 194 |
preparations.push(page.evaluateOnNewDocument(`
|
| 195 |
function briefImgs(elem) {
|
| 196 |
-
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
|
| 197 |
-
|
| 198 |
-
return imageTags.map((x)=>
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
}
|
| 208 |
function giveSnapshot() {
|
| 209 |
let parsed;
|
|
|
|
| 193 |
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
| 194 |
preparations.push(page.evaluateOnNewDocument(`
|
| 195 |
function briefImgs(elem) {
|
| 196 |
+
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
| 197 |
+
|
| 198 |
+
return imageTags.map((x)=> {
|
| 199 |
+
let linkPreferredSrc = x.src;
|
| 200 |
+
if (linkPreferredSrc.startsWith('data:')) {
|
| 201 |
+
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
|
| 202 |
+
linkPreferredSrc = x.dataset.src;
|
| 203 |
+
}
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
return {
|
| 207 |
+
src: linkPreferredSrc,
|
| 208 |
+
loaded: x.complete,
|
| 209 |
+
width: x.width,
|
| 210 |
+
height: x.height,
|
| 211 |
+
naturalWidth: x.naturalWidth,
|
| 212 |
+
naturalHeight: x.naturalHeight,
|
| 213 |
+
alt: x.alt || x.title,
|
| 214 |
+
};
|
| 215 |
+
});
|
| 216 |
}
|
| 217 |
function giveSnapshot() {
|
| 218 |
let parsed;
|