nomagick committed on
Commit
0f36fe8
·
unverified ·
1 Parent(s): 6a58de5

fix: compressed response from curl

Browse files
backend/functions/src/services/curl.ts CHANGED
@@ -10,6 +10,7 @@ import { AssertionFailureError, FancyFile } from 'civkit';
10
  import { TempFileManager } from '../shared';
11
  import { readFile } from 'fs/promises';
12
  import { pathToFileURL } from 'url';
 
13
 
14
  @singleton()
15
  export class CurlControl extends AsyncService {
@@ -59,6 +60,7 @@ export class CurlControl extends AsyncService {
59
  text: '',
60
  } as PageSnapshot;
61
 
 
62
  const result = await new Promise<{
63
  statusCode: number,
64
  data?: FancyFile,
@@ -102,14 +104,20 @@ export class CurlControl extends AsyncService {
102
  });
103
  curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
104
  let status = -1;
105
- let contentType = '';
106
  curl.on('stream', (stream, statusCode, headers) => {
107
  status = statusCode;
108
  outerLoop:
109
  for (const headerVec of headers) {
110
  for (const [k, v] of Object.entries(headerVec)) {
111
- if (k.toLowerCase() === 'content-type') {
 
112
  contentType = v.toLowerCase();
 
 
 
 
 
113
  break outerLoop;
114
  }
115
  }
@@ -130,6 +138,30 @@ export class CurlControl extends AsyncService {
130
  return;
131
  }
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  const fpath = this.tempFileManager.alloc();
134
  const fancyFile = FancyFile.auto(stream, fpath);
135
  this.tempFileManager.bindPathTo(fancyFile, fpath);
@@ -147,8 +179,13 @@ export class CurlControl extends AsyncService {
147
  throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
148
  }
149
 
 
 
 
 
 
150
  if (result.data) {
151
- const mimeType: string = await result.data.mimeType;
152
  if (mimeType.startsWith('text/html')) {
153
  if ((await result.data.size) > 1024 * 1024 * 32) {
154
  throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
 
10
  import { TempFileManager } from '../shared';
11
  import { readFile } from 'fs/promises';
12
  import { pathToFileURL } from 'url';
13
+ import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
14
 
15
  @singleton()
16
  export class CurlControl extends AsyncService {
 
60
  text: '',
61
  } as PageSnapshot;
62
 
63
+ let contentType = '';
64
  const result = await new Promise<{
65
  statusCode: number,
66
  data?: FancyFile,
 
104
  });
105
  curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
106
  let status = -1;
107
+ let contentEncoding = '';
108
  curl.on('stream', (stream, statusCode, headers) => {
109
  status = statusCode;
110
  outerLoop:
111
  for (const headerVec of headers) {
112
  for (const [k, v] of Object.entries(headerVec)) {
113
+ const kl = k.toLowerCase();
114
+ if (kl === 'content-type') {
115
  contentType = v.toLowerCase();
116
+ }
117
+ if (kl === 'content-encoding') {
118
+ contentEncoding = v.toLowerCase();
119
+ }
120
+ if (contentType && contentEncoding) {
121
  break outerLoop;
122
  }
123
  }
 
138
  return;
139
  }
140
 
141
+ switch (contentEncoding) {
142
+ case 'gzip': {
143
+ const decompressed = createGunzip();
144
+ stream.pipe(decompressed);
145
+ stream = decompressed;
146
+ break;
147
+ }
148
+ case 'deflate': {
149
+ const decompressed = createInflate();
150
+ stream.pipe(decompressed);
151
+ stream = decompressed;
152
+ break;
153
+ }
154
+ case 'br': {
155
+ const decompressed = createBrotliDecompress();
156
+ stream.pipe(decompressed);
157
+ stream = decompressed;
158
+ break;
159
+ }
160
+ default: {
161
+ break;
162
+ }
163
+ }
164
+
165
  const fpath = this.tempFileManager.alloc();
166
  const fancyFile = FancyFile.auto(stream, fpath);
167
  this.tempFileManager.bindPathTo(fancyFile, fpath);
 
179
  throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
180
  }
181
 
182
+ if (contentType === 'application/octet-stream') {
183
  // Content declared as binary is treated the same as unknown.
184
+ contentType = '';
185
+ }
186
+
187
  if (result.data) {
188
+ const mimeType: string = contentType || await result.data.mimeType;
189
  if (mimeType.startsWith('text/html')) {
190
  if ((await result.data.size) > 1024 * 1024 * 32) {
191
  throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);