Spaces:
Build error
Build error
fix: pdf upload in multipart
Browse files
backend/functions/package-lock.json
CHANGED
|
@@ -15,13 +15,14 @@
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
|
|
|
| 18 |
"civkit": "^0.8.2-2eddf1b",
|
| 19 |
"core-js": "^3.37.1",
|
| 20 |
"cors": "^2.8.5",
|
| 21 |
"dayjs": "^1.11.9",
|
| 22 |
"express": "^4.19.2",
|
| 23 |
"firebase-admin": "^12.1.0",
|
| 24 |
-
"firebase-functions": "^6.1.
|
| 25 |
"htmlparser2": "^9.0.0",
|
| 26 |
"jose": "^5.1.0",
|
| 27 |
"langdetect": "^0.2.1",
|
|
@@ -48,6 +49,7 @@
|
|
| 48 |
"devDependencies": {
|
| 49 |
"@types/archiver": "^5.3.4",
|
| 50 |
"@types/bcrypt": "^5.0.0",
|
|
|
|
| 51 |
"@types/cors": "^2.8.17",
|
| 52 |
"@types/generic-pool": "^3.8.1",
|
| 53 |
"@types/node": "^20.14.13",
|
|
@@ -2135,6 +2137,16 @@
|
|
| 2135 |
"@types/node": "*"
|
| 2136 |
}
|
| 2137 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2138 |
"node_modules/@types/cacheable-request": {
|
| 2139 |
"version": "6.0.3",
|
| 2140 |
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
|
|
@@ -3540,7 +3552,6 @@
|
|
| 3540 |
"version": "1.6.0",
|
| 3541 |
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
|
| 3542 |
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
|
| 3543 |
-
"optional": true,
|
| 3544 |
"dependencies": {
|
| 3545 |
"streamsearch": "^1.1.0"
|
| 3546 |
},
|
|
@@ -5539,9 +5550,9 @@
|
|
| 5539 |
}
|
| 5540 |
},
|
| 5541 |
"node_modules/firebase-functions": {
|
| 5542 |
-
"version": "6.1.
|
| 5543 |
-
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.
|
| 5544 |
-
"integrity": "sha512-
|
| 5545 |
"license": "MIT",
|
| 5546 |
"dependencies": {
|
| 5547 |
"@types/cors": "^2.8.5",
|
|
@@ -5557,7 +5568,7 @@
|
|
| 5557 |
"node": ">=14.10.0"
|
| 5558 |
},
|
| 5559 |
"peerDependencies": {
|
| 5560 |
-
"firebase-admin": "^11.10.0 || ^12.0.0"
|
| 5561 |
}
|
| 5562 |
},
|
| 5563 |
"node_modules/firebase-functions-test": {
|
|
@@ -10960,7 +10971,6 @@
|
|
| 10960 |
"version": "1.1.0",
|
| 10961 |
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
|
| 10962 |
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
|
| 10963 |
-
"optional": true,
|
| 10964 |
"engines": {
|
| 10965 |
"node": ">=10.0.0"
|
| 10966 |
}
|
|
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
+
"busboy": "^1.6.0",
|
| 19 |
"civkit": "^0.8.2-2eddf1b",
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
| 23 |
"express": "^4.19.2",
|
| 24 |
"firebase-admin": "^12.1.0",
|
| 25 |
+
"firebase-functions": "^6.1.1",
|
| 26 |
"htmlparser2": "^9.0.0",
|
| 27 |
"jose": "^5.1.0",
|
| 28 |
"langdetect": "^0.2.1",
|
|
|
|
| 49 |
"devDependencies": {
|
| 50 |
"@types/archiver": "^5.3.4",
|
| 51 |
"@types/bcrypt": "^5.0.0",
|
| 52 |
+
"@types/busboy": "^1.5.4",
|
| 53 |
"@types/cors": "^2.8.17",
|
| 54 |
"@types/generic-pool": "^3.8.1",
|
| 55 |
"@types/node": "^20.14.13",
|
|
|
|
| 2137 |
"@types/node": "*"
|
| 2138 |
}
|
| 2139 |
},
|
| 2140 |
+
"node_modules/@types/busboy": {
|
| 2141 |
+
"version": "1.5.4",
|
| 2142 |
+
"resolved": "https://registry.npmjs.org/@types/busboy/-/busboy-1.5.4.tgz",
|
| 2143 |
+
"integrity": "sha512-kG7WrUuAKK0NoyxfQHsVE6j1m01s6kMma64E+OZenQABMQyTJop1DumUWcLwAQ2JzpefU7PDYoRDKl8uZosFjw==",
|
| 2144 |
+
"dev": true,
|
| 2145 |
+
"license": "MIT",
|
| 2146 |
+
"dependencies": {
|
| 2147 |
+
"@types/node": "*"
|
| 2148 |
+
}
|
| 2149 |
+
},
|
| 2150 |
"node_modules/@types/cacheable-request": {
|
| 2151 |
"version": "6.0.3",
|
| 2152 |
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
|
|
|
|
| 3552 |
"version": "1.6.0",
|
| 3553 |
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
|
| 3554 |
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
|
|
|
|
| 3555 |
"dependencies": {
|
| 3556 |
"streamsearch": "^1.1.0"
|
| 3557 |
},
|
|
|
|
| 5550 |
}
|
| 5551 |
},
|
| 5552 |
"node_modules/firebase-functions": {
|
| 5553 |
+
"version": "6.1.1",
|
| 5554 |
+
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.1.tgz",
|
| 5555 |
+
"integrity": "sha512-q+4zsQhX04YJUz6hqaiH/j5kixljPj0PMxkm8KN3juYp3I4NC6CZ4qfy5JRfwvV8VfXM2KkJrZuyJtLyZr97aw==",
|
| 5556 |
"license": "MIT",
|
| 5557 |
"dependencies": {
|
| 5558 |
"@types/cors": "^2.8.5",
|
|
|
|
| 5568 |
"node": ">=14.10.0"
|
| 5569 |
},
|
| 5570 |
"peerDependencies": {
|
| 5571 |
+
"firebase-admin": "^11.10.0 || ^12.0.0 || ^13.0.0"
|
| 5572 |
}
|
| 5573 |
},
|
| 5574 |
"node_modules/firebase-functions-test": {
|
|
|
|
| 10971 |
"version": "1.1.0",
|
| 10972 |
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
|
| 10973 |
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
|
|
|
|
| 10974 |
"engines": {
|
| 10975 |
"node": ">=10.0.0"
|
| 10976 |
}
|
backend/functions/package.json
CHANGED
|
@@ -35,13 +35,14 @@
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
|
|
|
| 38 |
"civkit": "^0.8.2-2eddf1b",
|
| 39 |
"core-js": "^3.37.1",
|
| 40 |
"cors": "^2.8.5",
|
| 41 |
"dayjs": "^1.11.9",
|
| 42 |
"express": "^4.19.2",
|
| 43 |
"firebase-admin": "^12.1.0",
|
| 44 |
-
"firebase-functions": "^6.1.
|
| 45 |
"htmlparser2": "^9.0.0",
|
| 46 |
"jose": "^5.1.0",
|
| 47 |
"langdetect": "^0.2.1",
|
|
@@ -68,6 +69,7 @@
|
|
| 68 |
"devDependencies": {
|
| 69 |
"@types/archiver": "^5.3.4",
|
| 70 |
"@types/bcrypt": "^5.0.0",
|
|
|
|
| 71 |
"@types/cors": "^2.8.17",
|
| 72 |
"@types/generic-pool": "^3.8.1",
|
| 73 |
"@types/node": "^20.14.13",
|
|
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
+
"busboy": "^1.6.0",
|
| 39 |
"civkit": "^0.8.2-2eddf1b",
|
| 40 |
"core-js": "^3.37.1",
|
| 41 |
"cors": "^2.8.5",
|
| 42 |
"dayjs": "^1.11.9",
|
| 43 |
"express": "^4.19.2",
|
| 44 |
"firebase-admin": "^12.1.0",
|
| 45 |
+
"firebase-functions": "^6.1.1",
|
| 46 |
"htmlparser2": "^9.0.0",
|
| 47 |
"jose": "^5.1.0",
|
| 48 |
"langdetect": "^0.2.1",
|
|
|
|
| 69 |
"devDependencies": {
|
| 70 |
"@types/archiver": "^5.3.4",
|
| 71 |
"@types/bcrypt": "^5.0.0",
|
| 72 |
+
"@types/busboy": "^1.5.4",
|
| 73 |
"@types/cors": "^2.8.17",
|
| 74 |
"@types/generic-pool": "^3.8.1",
|
| 75 |
"@types/node": "^20.14.13",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -374,7 +374,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 374 |
|
| 375 |
const targetUrlFromGet = originPath.slice(1);
|
| 376 |
if (crawlerOptions.pdf) {
|
| 377 |
-
|
|
|
|
| 378 |
} else if (targetUrlFromGet) {
|
| 379 |
url = targetUrlFromGet.trim();
|
| 380 |
} else if (crawlerOptions.url) {
|
|
@@ -552,7 +553,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 552 |
}
|
| 553 |
|
| 554 |
if (crawlerOpts?.pdf) {
|
| 555 |
-
|
|
|
|
|
|
|
| 556 |
const fakeSnapshot = {
|
| 557 |
href: urlToCrawl.toString(),
|
| 558 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
|
|
|
| 374 |
|
| 375 |
const targetUrlFromGet = originPath.slice(1);
|
| 376 |
if (crawlerOptions.pdf) {
|
| 377 |
+
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
| 378 |
+
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
|
| 379 |
} else if (targetUrlFromGet) {
|
| 380 |
url = targetUrlFromGet.trim();
|
| 381 |
} else if (crawlerOptions.url) {
|
|
|
|
| 553 |
}
|
| 554 |
|
| 555 |
if (crawlerOpts?.pdf) {
|
| 556 |
+
|
| 557 |
+
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
| 558 |
+
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
| 559 |
const fakeSnapshot = {
|
| 560 |
href: urlToCrawl.toString(),
|
| 561 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -171,8 +171,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 171 |
|
| 172 |
@Prop({
|
| 173 |
desc: 'Base64 encoded PDF.',
|
|
|
|
| 174 |
})
|
| 175 |
-
pdf?: string;
|
| 176 |
|
| 177 |
@Prop({
|
| 178 |
default: CONTENT_FORMAT.CONTENT,
|
|
|
|
| 171 |
|
| 172 |
@Prop({
|
| 173 |
desc: 'Base64 encoded PDF.',
|
| 174 |
+
type: [File, String]
|
| 175 |
})
|
| 176 |
+
pdf?: File | string;
|
| 177 |
|
| 178 |
@Prop({
|
| 179 |
default: CONTENT_FORMAT.CONTENT,
|
backend/functions/src/services/pdf-extract.ts
CHANGED
|
@@ -64,23 +64,25 @@ export class PDFExtractor extends AsyncService {
|
|
| 64 |
}
|
| 65 |
|
| 66 |
isDataUrl(url: string) {
|
| 67 |
-
return
|
| 68 |
}
|
| 69 |
|
| 70 |
parseDataUrl(url: string) {
|
| 71 |
-
const
|
| 72 |
-
|
|
|
|
|
|
|
| 73 |
throw new Error('Invalid data URL');
|
| 74 |
}
|
| 75 |
|
| 76 |
-
if (
|
| 77 |
throw new Error('Invalid data URL type');
|
| 78 |
}
|
| 79 |
|
| 80 |
return {
|
| 81 |
-
type:
|
| 82 |
-
data:
|
| 83 |
-
}
|
| 84 |
}
|
| 85 |
|
| 86 |
async extract(url: string | URL) {
|
|
@@ -88,9 +90,9 @@ export class PDFExtractor extends AsyncService {
|
|
| 88 |
|
| 89 |
if (typeof url === 'string' && this.isDataUrl(url)) {
|
| 90 |
const { data } = this.parseDataUrl(url);
|
| 91 |
-
|
| 92 |
loadingTask = this.pdfjs.getDocument({
|
| 93 |
-
data:
|
| 94 |
disableFontFace: true,
|
| 95 |
verbosity: 0
|
| 96 |
});
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
isDataUrl(url: string) {
|
| 67 |
+
return url.startsWith('data:');
|
| 68 |
}
|
| 69 |
|
| 70 |
parseDataUrl(url: string) {
|
| 71 |
+
const protocol = url.slice(0, url.indexOf(':'));
|
| 72 |
+
const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';'));
|
| 73 |
+
const data = url.slice(url.indexOf(',') + 1);
|
| 74 |
+
if (protocol !== 'data' || !data) {
|
| 75 |
throw new Error('Invalid data URL');
|
| 76 |
}
|
| 77 |
|
| 78 |
+
if (contentType !== 'application/pdf') {
|
| 79 |
throw new Error('Invalid data URL type');
|
| 80 |
}
|
| 81 |
|
| 82 |
return {
|
| 83 |
+
type: contentType,
|
| 84 |
+
data: data
|
| 85 |
+
};
|
| 86 |
}
|
| 87 |
|
| 88 |
async extract(url: string | URL) {
|
|
|
|
| 90 |
|
| 91 |
if (typeof url === 'string' && this.isDataUrl(url)) {
|
| 92 |
const { data } = this.parseDataUrl(url);
|
| 93 |
+
const binary = Uint8Array.from(Buffer.from(data, 'base64'));
|
| 94 |
loadingTask = this.pdfjs.getDocument({
|
| 95 |
+
data: binary,
|
| 96 |
disableFontFace: true,
|
| 97 |
verbosity: 0
|
| 98 |
});
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit a90669ca91d2c8cb470e75bf2cdfa06812e5ba7a
|