nomagick committed on
Commit
9a514cd
·
unverified ·
1 Parent(s): 7e6c2fc

fix: cap browser request freq to avoid block from google

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
753
  event: 'data',
754
  data: formatted,
755
  });
 
 
 
756
  }
757
  } catch (err: any) {
758
  this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
@@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
781
  if (crawlerOptions.timeout === undefined) {
782
  return formatted;
783
  }
 
 
 
 
784
  }
785
 
786
  if (!lastScrapped) {
 
753
  event: 'data',
754
  data: formatted,
755
  });
756
+ if (chargeAmount && scrapped.pdfs?.length) {
757
+ break;
758
+ }
759
  }
760
  } catch (err: any) {
761
  this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
 
784
  if (crawlerOptions.timeout === undefined) {
785
  return formatted;
786
  }
787
+
788
+ if (chargeAmount && scrapped.pdfs?.length) {
789
+ return formatted;
790
+ }
791
  }
792
 
793
  if (!lastScrapped) {
backend/functions/src/services/puppeteer.ts CHANGED
@@ -1,7 +1,7 @@
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
- import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
  import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
@@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService {
208
  logger = this.globalLogger.child({ service: this.constructor.name });
209
 
210
  private __healthCheckInterval?: NodeJS.Timeout;
 
211
 
212
  __loadedPage: Page[] = [];
213
 
@@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService {
216
  livePages = new Set<Page>();
217
  lastPageCratedAt: number = 0;
218
 
 
 
 
 
219
  circuitBreakerHosts: Set<string> = new Set();
220
 
221
  constructor(
@@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService {
239
  clearInterval(this.__healthCheckInterval);
240
  this.__healthCheckInterval = undefined;
241
  }
 
 
 
 
242
  await this.dependencyReady();
243
 
244
  if (this.browser) {
@@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService {
267
 
268
  this.emit('ready');
269
 
270
- this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
271
  this.newPage().then((r) => this.__loadedPage.push(r));
272
  }
273
 
@@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService {
301
  this.logger.warn(`Browser killed`);
302
  }
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  async newPage() {
305
  await this.serviceReady();
306
  const dedicatedContext = await this.browser.createBrowserContext();
@@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService {
330
  let t0: number | undefined;
331
  let halt = false;
332
 
333
- page.on('request', (req) => {
334
  reqCounter++;
335
  if (halt) {
336
  return req.abort('blockedbyclient', 1000);
@@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService {
379
  return req.abort('blockedbyclient', 1000);
380
  }
381
 
 
 
 
 
 
 
 
 
 
382
  const continueArgs = req.continueRequestOverrides
383
  ? [req.continueRequestOverrides(), 0] as const
384
  : [];
@@ -483,16 +516,16 @@ document.addEventListener('load', handlePageLoad);
483
 
484
  await page.evaluateOnNewDocument(() => {
485
  Object.defineProperty(navigator, "language", {
486
- get: function() {
487
  return options?.locale;
488
  }
489
  });
490
  Object.defineProperty(navigator, "languages", {
491
- get: function() {
492
  return [options?.locale];
493
  }
494
  });
495
- })
496
  }
497
 
498
  if (options?.proxyUrl) {
 
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
+ import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
  import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
 
208
  logger = this.globalLogger.child({ service: this.constructor.name });
209
 
210
  private __healthCheckInterval?: NodeJS.Timeout;
211
+ private __reqCapInterval?: NodeJS.Timeout;
212
 
213
  __loadedPage: Page[] = [];
214
 
 
217
  livePages = new Set<Page>();
218
  lastPageCratedAt: number = 0;
219
 
220
+ rpsCap: number = 300;
221
+ lastReqSentAt: number = 0;
222
+ requestDeferredQueue: Deferred<boolean>[] = [];
223
+
224
  circuitBreakerHosts: Set<string> = new Set();
225
 
226
  constructor(
 
244
  clearInterval(this.__healthCheckInterval);
245
  this.__healthCheckInterval = undefined;
246
  }
247
+ if (this.__reqCapInterval) {
248
+ clearInterval(this.__reqCapInterval);
249
+ this.__reqCapInterval = undefined;
250
+ }
251
  await this.dependencyReady();
252
 
253
  if (this.browser) {
 
276
 
277
  this.emit('ready');
278
 
279
+ this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
280
  this.newPage().then((r) => this.__loadedPage.push(r));
281
  }
282
 
 
310
  this.logger.warn(`Browser killed`);
311
  }
312
 
313
+ reqCapRoutine() {
314
+ const now = Date.now();
315
+ const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap);
316
+ this.requestDeferredQueue.splice(0, numToPass).forEach((x) => x.resolve(true));
317
+ this.lastReqSentAt = now;
318
+ if (!this.requestDeferredQueue.length) {
319
+ if (this.__reqCapInterval) {
320
+ clearInterval(this.__reqCapInterval);
321
+ this.__reqCapInterval = undefined;
322
+ }
323
+ } else if (!this.__reqCapInterval) {
324
+ this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
325
+ }
326
+ }
327
+
328
  async newPage() {
329
  await this.serviceReady();
330
  const dedicatedContext = await this.browser.createBrowserContext();
 
354
  let t0: number | undefined;
355
  let halt = false;
356
 
357
+ page.on('request', async (req) => {
358
  reqCounter++;
359
  if (halt) {
360
  return req.abort('blockedbyclient', 1000);
 
403
  return req.abort('blockedbyclient', 1000);
404
  }
405
 
406
+ const d = Defer();
407
+ this.requestDeferredQueue.push(d);
408
+ process.nextTick(() => this.reqCapRoutine());
409
+ await d.promise;
410
+
411
+ if (req.isInterceptResolutionHandled()) {
412
+ return;
413
+ };
414
+
415
  const continueArgs = req.continueRequestOverrides
416
  ? [req.continueRequestOverrides(), 0] as const
417
  : [];
 
516
 
517
  await page.evaluateOnNewDocument(() => {
518
  Object.defineProperty(navigator, "language", {
519
+ get: function () {
520
  return options?.locale;
521
  }
522
  });
523
  Object.defineProperty(navigator, "languages", {
524
+ get: function () {
525
  return [options?.locale];
526
  }
527
  });
528
+ });
529
  }
530
 
531
  if (options?.proxyUrl) {
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d
 
1
+ Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b