nomagick committed on
Commit
a471a61
·
unverified ·
1 Parent(s): 3b0e020

fix: do img filtering in node instead of browser

Browse files
package-lock.json CHANGED
@@ -17,7 +17,7 @@
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
- "civkit": "^0.9.0-848ef4e",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
@@ -4003,9 +4003,9 @@
4003
  }
4004
  },
4005
  "node_modules/civkit": {
4006
- "version": "0.9.0-848ef4e",
4007
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-848ef4e.tgz",
4008
- "integrity": "sha512-yxk5AKaiZSN4ntlwybVHYgUer402CSw06KzN7wvfaYra9evZkZ7MiFHGULqMnY7657k3CH0WV4n6jGfRj1Vpvw==",
4009
  "license": "AGPL",
4010
  "dependencies": {
4011
  "lodash": "^4.17.21",
 
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.9.0-2570394",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
4003
  }
4004
  },
4005
  "node_modules/civkit": {
4006
+ "version": "0.9.0-2570394",
4007
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-2570394.tgz",
4008
+ "integrity": "sha512-w77agnElTEP6g+l66KhX1Ib9z7JXbR3FaR5/2yTUPIPjm32qsWkmKRvv0mZ83IcMSSmTjF9LxboYAliyTx7cIA==",
4009
  "license": "AGPL",
4010
  "dependencies": {
4011
  "lodash": "^4.17.21",
package.json CHANGED
@@ -26,7 +26,7 @@
26
  "axios": "^1.3.3",
27
  "bcrypt": "^5.1.0",
28
  "busboy": "^1.6.0",
29
- "civkit": "^0.9.0-848ef4e",
30
  "core-js": "^3.37.1",
31
  "cors": "^2.8.5",
32
  "dayjs": "^1.11.9",
 
26
  "axios": "^1.3.3",
27
  "bcrypt": "^5.1.0",
28
  "busboy": "^1.6.0",
29
+ "civkit": "^0.9.0-2570394",
30
  "core-js": "^3.37.1",
31
  "cors": "^2.8.5",
32
  "dayjs": "^1.11.9",
src/api/crawler.ts CHANGED
@@ -1069,7 +1069,6 @@ export class CrawlerHost extends RPCHost {
1069
  title: snapshot.title,
1070
  content: snapshot.parsed?.textContent,
1071
  url: presumedURL?.href || snapshot.href,
1072
- [Symbol.dispose]: () => undefined,
1073
  };
1074
 
1075
  Object.defineProperty(output, 'textRepresentation', {
 
1069
  title: snapshot.title,
1070
  content: snapshot.parsed?.textContent,
1071
  url: presumedURL?.href || snapshot.href,
 
1072
  };
1073
 
1074
  Object.defineProperty(output, 'textRepresentation', {
src/services/alt-text.ts CHANGED
@@ -33,8 +33,11 @@ export class AltTextService extends AsyncService {
33
  try {
34
  const img = await this.canvasService.loadImage(url);
35
  const contentTypeHint = Reflect.get(img, 'contentType');
 
 
 
36
  if (Math.min(img.naturalHeight, img.naturalWidth) < 64) {
37
- throw new AssertionFailureError({ message: `Image is too small to generate alt text for url ${url}` });
38
  }
39
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
40
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
@@ -63,6 +66,32 @@ export class AltTextService extends AsyncService {
63
  }
64
  const digest = md5Hasher.hash(imgBrief.src);
65
  const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  const existing = await ImgAlt.fromFirestore(shortDigest);
68
 
@@ -102,4 +131,4 @@ export class AltTextService extends AsyncService {
102
 
103
  return generatedCaption;
104
  }
105
- }
 
33
  try {
34
  const img = await this.canvasService.loadImage(url);
35
  const contentTypeHint = Reflect.get(img, 'contentType');
36
+ if (Math.min(img.naturalHeight, img.naturalWidth) <= 1) {
37
+ return `A ${img.naturalWidth}x${img.naturalHeight} image, likely be a tacker probe`;
38
+ }
39
  if (Math.min(img.naturalHeight, img.naturalWidth) < 64) {
40
+ return `A ${img.naturalWidth}x${img.naturalHeight} small image, likely a logo, icon or avatar`;
41
  }
42
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
43
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
 
66
  }
67
  const digest = md5Hasher.hash(imgBrief.src);
68
  const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
69
+ let dims: number[] = [];
70
+ do {
71
+ if (imgBrief.loaded) {
72
+ if (imgBrief.naturalWidth && imgBrief.naturalHeight) {
73
+ if (Math.min(imgBrief.naturalWidth, imgBrief.naturalHeight) < 64) {
74
+ dims = [imgBrief.naturalWidth, imgBrief.naturalHeight];
75
+ break;
76
+ }
77
+ }
78
+ }
79
+
80
+ if (imgBrief.width && imgBrief.height) {
81
+ if (Math.min(imgBrief.width, imgBrief.height) < 64) {
82
+ dims = [imgBrief.width, imgBrief.height];
83
+ break;
84
+ }
85
+ }
86
+
87
+ } while (false);
88
+
89
+ if (Math.min(...dims) <= 1) {
90
+ return `A ${dims[0]}x${dims[1]} image, likely be a tacker probe`;
91
+ }
92
+ if (Math.min(...dims) < 64) {
93
+ return `A ${dims[0]}x${dims[1]} small image, likely a logo, icon or avatar`;
94
+ }
95
 
96
  const existing = await ImgAlt.fromFirestore(shortDigest);
97
 
 
131
 
132
  return generatedCaption;
133
  }
134
+ };
src/services/puppeteer.ts CHANGED
@@ -407,18 +407,7 @@ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
407
  if (document.baseURI !== r.href) {
408
  r.rebase = document.baseURI;
409
  }
410
- r.imgs = briefImgs().filter((x)=> {
411
- if (x.complete) {
412
- if (Math.min(x.width, x.height, x.naturalWidth, x.naturalHeight) < 64) {
413
- return false;
414
- }
415
- }
416
- const m = Math.min(x.width, x.height);
417
- if (m && m < 64) {
418
- return false;
419
- }
420
- return true;
421
- });
422
 
423
  return r;
424
  }
 
407
  if (document.baseURI !== r.href) {
408
  r.rebase = document.baseURI;
409
  }
410
+ r.imgs = briefImgs();
 
 
 
 
 
 
 
 
 
 
 
411
 
412
  return r;
413
  }
src/services/snapshot-formatter.ts CHANGED
@@ -43,7 +43,7 @@ export interface FormattedPage {
43
 
44
  textRepresentation?: string;
45
 
46
- [Symbol.dispose]: () => void;
47
  }
48
 
49
  export const md5Hasher = new HashManager('md5', 'hex');
@@ -199,7 +199,6 @@ export class SnapshotFormatter extends AsyncService {
199
  description: (snapshot.description || '').trim(),
200
  url: nominalUrl?.toString() || snapshot.href?.trim(),
201
  publishedTime: snapshot.parsed?.publishedTime || undefined,
202
- [Symbol.dispose]: () => { },
203
  };
204
 
205
  Object.assign(f, formatted);
@@ -395,7 +394,6 @@ export class SnapshotFormatter extends AsyncService {
395
  url: nominalUrl?.toString() || snapshot.href?.trim(),
396
  content: contentText,
397
  publishedTime: snapshot.parsed?.publishedTime || undefined,
398
- [Symbol.dispose]: () => { },
399
  };
400
 
401
  if (snapshot.status) {
 
43
 
44
  textRepresentation?: string;
45
 
46
+ [Symbol.dispose]?: () => void;
47
  }
48
 
49
  export const md5Hasher = new HashManager('md5', 'hex');
 
199
  description: (snapshot.description || '').trim(),
200
  url: nominalUrl?.toString() || snapshot.href?.trim(),
201
  publishedTime: snapshot.parsed?.publishedTime || undefined,
 
202
  };
203
 
204
  Object.assign(f, formatted);
 
394
  url: nominalUrl?.toString() || snapshot.href?.trim(),
395
  content: contentText,
396
  publishedTime: snapshot.parsed?.publishedTime || undefined,
 
397
  };
398
 
399
  if (snapshot.status) {