Spaces:
Build error
Build error
fix: give expireAt for image cache
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -53,8 +53,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 53 |
|
| 54 |
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
| 55 |
|
| 56 |
-
imageShortUrlPrefix?: string;
|
| 57 |
-
|
| 58 |
constructor(
|
| 59 |
protected globalLogger: Logger,
|
| 60 |
protected puppeteerControl: PuppeteerControl,
|
|
@@ -78,13 +76,13 @@ export class CrawlerHost extends RPCHost {
|
|
| 78 |
|
| 79 |
let contentText = '';
|
| 80 |
if (toBeTurnedToMd) {
|
| 81 |
-
const urlToAltMap: { [k: string]:
|
| 82 |
const tasks = (snapshot.imgs || []).map(async (x) => {
|
| 83 |
-
const r = await this.altTextService.
|
| 84 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 85 |
return undefined;
|
| 86 |
});
|
| 87 |
-
if (r) {
|
| 88 |
urlToAltMap[x.src.trim()] = r;
|
| 89 |
}
|
| 90 |
});
|
|
@@ -103,7 +101,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 103 |
const mapped = urlToAltMap[src];
|
| 104 |
imgIdx++;
|
| 105 |
if (mapped) {
|
| 106 |
-
return ``;
|
| 109 |
}
|
|
@@ -115,7 +113,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 115 |
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
| 116 |
contentText = turnDownService.turndown(snapshot.html);
|
| 117 |
}
|
| 118 |
-
if (!contentText || (contentText.startsWith('<')
|
| 119 |
contentText = snapshot.text;
|
| 120 |
}
|
| 121 |
|
|
|
|
| 53 |
|
| 54 |
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
| 55 |
|
|
|
|
|
|
|
| 56 |
constructor(
|
| 57 |
protected globalLogger: Logger,
|
| 58 |
protected puppeteerControl: PuppeteerControl,
|
|
|
|
| 76 |
|
| 77 |
let contentText = '';
|
| 78 |
if (toBeTurnedToMd) {
|
| 79 |
+
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 80 |
const tasks = (snapshot.imgs || []).map(async (x) => {
|
| 81 |
+
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 82 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 83 |
return undefined;
|
| 84 |
});
|
| 85 |
+
if (r && x.src) {
|
| 86 |
urlToAltMap[x.src.trim()] = r;
|
| 87 |
}
|
| 88 |
});
|
|
|
|
| 101 |
const mapped = urlToAltMap[src];
|
| 102 |
imgIdx++;
|
| 103 |
if (mapped) {
|
| 104 |
+
return ``;
|
| 105 |
}
|
| 106 |
return ``;
|
| 107 |
}
|
|
|
|
| 113 |
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
| 114 |
contentText = turnDownService.turndown(snapshot.html);
|
| 115 |
}
|
| 116 |
+
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
| 117 |
contentText = snapshot.text;
|
| 118 |
}
|
| 119 |
|
backend/functions/src/services/alt-text.ts
CHANGED
|
@@ -44,32 +44,33 @@ export class AltTextService extends AsyncService {
|
|
| 44 |
}
|
| 45 |
}
|
| 46 |
|
| 47 |
-
async
|
| 48 |
if (!imgBrief.src) {
|
| 49 |
return undefined;
|
| 50 |
}
|
|
|
|
|
|
|
|
|
|
| 51 |
const digest = md5Hasher.hash(imgBrief.src);
|
| 52 |
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
| 53 |
|
| 54 |
const existing = await ImgAlt.fromFirestore(shortDigest);
|
| 55 |
|
| 56 |
-
if (existing
|
| 57 |
-
return
|
| 58 |
-
shortDigest,
|
| 59 |
-
alt: existing.generatedAlt,
|
| 60 |
-
};
|
| 61 |
}
|
| 62 |
|
| 63 |
-
let generatedCaption;
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
}
|
| 69 |
-
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
| 70 |
-
}
|
| 71 |
}
|
| 72 |
|
|
|
|
|
|
|
|
|
|
| 73 |
await ImgAlt.COLLECTION.doc(shortDigest).set(
|
| 74 |
{
|
| 75 |
_id: shortDigest,
|
|
@@ -79,13 +80,11 @@ export class AltTextService extends AsyncService {
|
|
| 79 |
urlDigest: digest,
|
| 80 |
originalAlt: imgBrief.alt || '',
|
| 81 |
generatedAlt: generatedCaption || '',
|
| 82 |
-
createdAt: new Date()
|
|
|
|
| 83 |
}, { merge: true }
|
| 84 |
);
|
| 85 |
|
| 86 |
-
return
|
| 87 |
-
shortDigest,
|
| 88 |
-
alt: generatedCaption,
|
| 89 |
-
};
|
| 90 |
}
|
| 91 |
}
|
|
|
|
| 44 |
}
|
| 45 |
}
|
| 46 |
|
| 47 |
+
async getAltText(imgBrief: ImgBrief) {
|
| 48 |
if (!imgBrief.src) {
|
| 49 |
return undefined;
|
| 50 |
}
|
| 51 |
+
if (imgBrief.alt) {
|
| 52 |
+
return imgBrief.alt;
|
| 53 |
+
}
|
| 54 |
const digest = md5Hasher.hash(imgBrief.src);
|
| 55 |
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
| 56 |
|
| 57 |
const existing = await ImgAlt.fromFirestore(shortDigest);
|
| 58 |
|
| 59 |
+
if (existing) {
|
| 60 |
+
return existing.generatedAlt || existing.originalAlt || '';
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
|
| 63 |
+
let generatedCaption = '';
|
| 64 |
|
| 65 |
+
try {
|
| 66 |
+
generatedCaption = await this.caption(imgBrief.src);
|
| 67 |
+
} catch (err) {
|
| 68 |
+
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
|
|
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
+
// Don't try again until the next day
|
| 72 |
+
const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
|
| 73 |
+
|
| 74 |
await ImgAlt.COLLECTION.doc(shortDigest).set(
|
| 75 |
{
|
| 76 |
_id: shortDigest,
|
|
|
|
| 80 |
urlDigest: digest,
|
| 81 |
originalAlt: imgBrief.alt || '',
|
| 82 |
generatedAlt: generatedCaption || '',
|
| 83 |
+
createdAt: new Date(),
|
| 84 |
+
...expireMixin
|
| 85 |
}, { merge: true }
|
| 86 |
);
|
| 87 |
|
| 88 |
+
return generatedCaption;
|
|
|
|
|
|
|
|
|
|
| 89 |
}
|
| 90 |
}
|