nomagick committed on
Commit
f6c89e8
·
unverified ·
1 Parent(s): deb0b6d

fix: pdf upload in multipart

Browse files
backend/functions/package-lock.json CHANGED
@@ -15,13 +15,14 @@
15
  "archiver": "^6.0.1",
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
 
18
  "civkit": "^0.8.2-2eddf1b",
19
  "core-js": "^3.37.1",
20
  "cors": "^2.8.5",
21
  "dayjs": "^1.11.9",
22
  "express": "^4.19.2",
23
  "firebase-admin": "^12.1.0",
24
- "firebase-functions": "^6.1.0",
25
  "htmlparser2": "^9.0.0",
26
  "jose": "^5.1.0",
27
  "langdetect": "^0.2.1",
@@ -48,6 +49,7 @@
48
  "devDependencies": {
49
  "@types/archiver": "^5.3.4",
50
  "@types/bcrypt": "^5.0.0",
 
51
  "@types/cors": "^2.8.17",
52
  "@types/generic-pool": "^3.8.1",
53
  "@types/node": "^20.14.13",
@@ -2135,6 +2137,16 @@
2135
  "@types/node": "*"
2136
  }
2137
  },
 
 
 
 
 
 
 
 
 
 
2138
  "node_modules/@types/cacheable-request": {
2139
  "version": "6.0.3",
2140
  "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
@@ -3540,7 +3552,6 @@
3540
  "version": "1.6.0",
3541
  "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
3542
  "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
3543
- "optional": true,
3544
  "dependencies": {
3545
  "streamsearch": "^1.1.0"
3546
  },
@@ -5539,9 +5550,9 @@
5539
  }
5540
  },
5541
  "node_modules/firebase-functions": {
5542
- "version": "6.1.0",
5543
- "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
5544
- "integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
5545
  "license": "MIT",
5546
  "dependencies": {
5547
  "@types/cors": "^2.8.5",
@@ -5557,7 +5568,7 @@
5557
  "node": ">=14.10.0"
5558
  },
5559
  "peerDependencies": {
5560
- "firebase-admin": "^11.10.0 || ^12.0.0"
5561
  }
5562
  },
5563
  "node_modules/firebase-functions-test": {
@@ -10960,7 +10971,6 @@
10960
  "version": "1.1.0",
10961
  "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
10962
  "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
10963
- "optional": true,
10964
  "engines": {
10965
  "node": ">=10.0.0"
10966
  }
 
15
  "archiver": "^6.0.1",
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
+ "busboy": "^1.6.0",
19
  "civkit": "^0.8.2-2eddf1b",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
23
  "express": "^4.19.2",
24
  "firebase-admin": "^12.1.0",
25
+ "firebase-functions": "^6.1.1",
26
  "htmlparser2": "^9.0.0",
27
  "jose": "^5.1.0",
28
  "langdetect": "^0.2.1",
 
49
  "devDependencies": {
50
  "@types/archiver": "^5.3.4",
51
  "@types/bcrypt": "^5.0.0",
52
+ "@types/busboy": "^1.5.4",
53
  "@types/cors": "^2.8.17",
54
  "@types/generic-pool": "^3.8.1",
55
  "@types/node": "^20.14.13",
 
2137
  "@types/node": "*"
2138
  }
2139
  },
2140
+ "node_modules/@types/busboy": {
2141
+ "version": "1.5.4",
2142
+ "resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.4.tgz",
2143
+ "integrity": "sha512-kG7WrUuAKK0NoyxfQHsVE6j1m01s6kMma64E+OZenQABMQyTJop1DumUWcLwAQ2JzpefU7PDYoRDKl8uZosFjw==",
2144
+ "dev": true,
2145
+ "license": "MIT",
2146
+ "dependencies": {
2147
+ "@types/node": "*"
2148
+ }
2149
+ },
2150
  "node_modules/@types/cacheable-request": {
2151
  "version": "6.0.3",
2152
  "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
 
3552
  "version": "1.6.0",
3553
  "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
3554
  "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
 
3555
  "dependencies": {
3556
  "streamsearch": "^1.1.0"
3557
  },
 
5550
  }
5551
  },
5552
  "node_modules/firebase-functions": {
5553
+ "version": "6.1.1",
5554
+ "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.1.tgz",
5555
+ "integrity": "sha512-q+4zsQhX04YJUz6hqaiH/j5kixljPj0PMxkm8KN3juYp3I4NC6CZ4qfy5JRfwvV8VfXM2KkJrZuyJtLyZr97aw==",
5556
  "license": "MIT",
5557
  "dependencies": {
5558
  "@types/cors": "^2.8.5",
 
5568
  "node": ">=14.10.0"
5569
  },
5570
  "peerDependencies": {
5571
+ "firebase-admin": "^11.10.0 || ^12.0.0 || ^13.0.0"
5572
  }
5573
  },
5574
  "node_modules/firebase-functions-test": {
 
10971
  "version": "1.1.0",
10972
  "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
10973
  "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
 
10974
  "engines": {
10975
  "node": ">=10.0.0"
10976
  }
backend/functions/package.json CHANGED
@@ -35,13 +35,14 @@
35
  "archiver": "^6.0.1",
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
 
38
  "civkit": "^0.8.2-2eddf1b",
39
  "core-js": "^3.37.1",
40
  "cors": "^2.8.5",
41
  "dayjs": "^1.11.9",
42
  "express": "^4.19.2",
43
  "firebase-admin": "^12.1.0",
44
- "firebase-functions": "^6.1.0",
45
  "htmlparser2": "^9.0.0",
46
  "jose": "^5.1.0",
47
  "langdetect": "^0.2.1",
@@ -68,6 +69,7 @@
68
  "devDependencies": {
69
  "@types/archiver": "^5.3.4",
70
  "@types/bcrypt": "^5.0.0",
 
71
  "@types/cors": "^2.8.17",
72
  "@types/generic-pool": "^3.8.1",
73
  "@types/node": "^20.14.13",
 
35
  "archiver": "^6.0.1",
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
+ "busboy": "^1.6.0",
39
  "civkit": "^0.8.2-2eddf1b",
40
  "core-js": "^3.37.1",
41
  "cors": "^2.8.5",
42
  "dayjs": "^1.11.9",
43
  "express": "^4.19.2",
44
  "firebase-admin": "^12.1.0",
45
+ "firebase-functions": "^6.1.1",
46
  "htmlparser2": "^9.0.0",
47
  "jose": "^5.1.0",
48
  "langdetect": "^0.2.1",
 
69
  "devDependencies": {
70
  "@types/archiver": "^5.3.4",
71
  "@types/bcrypt": "^5.0.0",
72
+ "@types/busboy": "^1.5.4",
73
  "@types/cors": "^2.8.17",
74
  "@types/generic-pool": "^3.8.1",
75
  "@types/node": "^20.14.13",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -374,7 +374,8 @@ export class CrawlerHost extends RPCHost {
374
 
375
  const targetUrlFromGet = originPath.slice(1);
376
  if (crawlerOptions.pdf) {
377
- url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`;
 
378
  } else if (targetUrlFromGet) {
379
  url = targetUrlFromGet.trim();
380
  } else if (crawlerOptions.url) {
@@ -552,7 +553,9 @@ export class CrawlerHost extends RPCHost {
552
  }
553
 
554
  if (crawlerOpts?.pdf) {
555
- const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
 
 
556
  const fakeSnapshot = {
557
  href: urlToCrawl.toString(),
558
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
 
374
 
375
  const targetUrlFromGet = originPath.slice(1);
376
  if (crawlerOptions.pdf) {
377
+ const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
378
+ url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
379
  } else if (targetUrlFromGet) {
380
  url = targetUrlFromGet.trim();
381
  } else if (crawlerOptions.url) {
 
553
  }
554
 
555
  if (crawlerOpts?.pdf) {
556
+
557
+ const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
558
+ const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
559
  const fakeSnapshot = {
560
  href: urlToCrawl.toString(),
561
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -171,8 +171,9 @@ export class CrawlerOptions extends AutoCastable {
171
 
172
  @Prop({
173
  desc: 'Base64 encoded PDF.',
 
174
  })
175
- pdf?: string;
176
 
177
  @Prop({
178
  default: CONTENT_FORMAT.CONTENT,
 
171
 
172
  @Prop({
173
  desc: 'Base64 encoded PDF.',
174
+ type: [File, String]
175
  })
176
+ pdf?: File | string;
177
 
178
  @Prop({
179
  default: CONTENT_FORMAT.CONTENT,
backend/functions/src/services/pdf-extract.ts CHANGED
@@ -64,23 +64,25 @@ export class PDFExtractor extends AsyncService {
64
  }
65
 
66
  isDataUrl(url: string) {
67
- return /^data:.+\/(.+);base64,(.*)$/.test(url);
68
  }
69
 
70
  parseDataUrl(url: string) {
71
- const matches = url.match(/^data:.+\/(.+);base64,(.*)$/);
72
- if (!matches || matches.length !== 3) {
 
 
73
  throw new Error('Invalid data URL');
74
  }
75
 
76
- if (matches[1] !== 'pdf') {
77
  throw new Error('Invalid data URL type');
78
  }
79
 
80
  return {
81
- type: matches[1],
82
- data: matches[2]
83
- }
84
  }
85
 
86
  async extract(url: string | URL) {
@@ -88,9 +90,9 @@ export class PDFExtractor extends AsyncService {
88
 
89
  if (typeof url === 'string' && this.isDataUrl(url)) {
90
  const { data } = this.parseDataUrl(url);
91
-
92
  loadingTask = this.pdfjs.getDocument({
93
- data: atob(decodeURIComponent(data)),
94
  disableFontFace: true,
95
  verbosity: 0
96
  });
 
64
  }
65
 
66
  isDataUrl(url: string) {
67
+ return url.startsWith('data:');
68
  }
69
 
70
  parseDataUrl(url: string) {
71
+ const protocol = url.slice(0, url.indexOf(':'));
72
+ const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';'));
73
+ const data = url.slice(url.indexOf(',') + 1);
74
+ if (protocol !== 'data' || !data) {
75
  throw new Error('Invalid data URL');
76
  }
77
 
78
+ if (contentType !== 'application/pdf') {
79
  throw new Error('Invalid data URL type');
80
  }
81
 
82
  return {
83
+ type: contentType,
84
+ data: data
85
+ };
86
  }
87
 
88
  async extract(url: string | URL) {
 
90
 
91
  if (typeof url === 'string' && this.isDataUrl(url)) {
92
  const { data } = this.parseDataUrl(url);
93
+ const binary = Uint8Array.from(Buffer.from(data, 'base64'));
94
  loadingTask = this.pdfjs.getDocument({
95
+ data: binary,
96
  disableFontFace: true,
97
  verbosity: 0
98
  });
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 296fe56d235c08978eda384d8fcddbacdd6f7863
 
1
+ Subproject commit a90669ca91d2c8cb470e75bf2cdfa06812e5ba7a