nomagick committed on
Commit
df71c9a
·
unverified ·
1 Parent(s): ba8ab88

fix: stop using pool

Browse files
backend/functions/package-lock.json CHANGED
@@ -20,7 +20,6 @@
20
  "express": "^4.19.2",
21
  "firebase-admin": "^12.1.0",
22
  "firebase-functions": "^4.9.0",
23
- "generic-pool": "^3.9.0",
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
26
  "jsdom": "^24.0.0",
@@ -5796,6 +5795,7 @@
5796
  "version": "3.9.0",
5797
  "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
5798
  "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
 
5799
  "engines": {
5800
  "node": ">= 4"
5801
  }
 
20
  "express": "^4.19.2",
21
  "firebase-admin": "^12.1.0",
22
  "firebase-functions": "^4.9.0",
 
23
  "htmlparser2": "^9.0.0",
24
  "jose": "^5.1.0",
25
  "jsdom": "^24.0.0",
 
5795
  "version": "3.9.0",
5796
  "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
5797
  "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
5798
+ "devOptional": true,
5799
  "engines": {
5800
  "node": ">= 4"
5801
  }
backend/functions/package.json CHANGED
@@ -40,7 +40,6 @@
40
  "express": "^4.19.2",
41
  "firebase-admin": "^12.1.0",
42
  "firebase-functions": "^4.9.0",
43
- "generic-pool": "^3.9.0",
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
46
  "jsdom": "^24.0.0",
 
40
  "express": "^4.19.2",
41
  "firebase-admin": "^12.1.0",
42
  "firebase-functions": "^4.9.0",
 
43
  "htmlparser2": "^9.0.0",
44
  "jose": "^5.1.0",
45
  "jsdom": "^24.0.0",
backend/functions/src/services/puppeteer.ts CHANGED
@@ -1,7 +1,6 @@
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
- import genericPool from 'generic-pool';
5
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
6
  import { Logger } from '../shared/services/logger';
7
  import { JSDOM } from 'jsdom';
@@ -76,43 +75,32 @@ puppeteer.use(puppeteerPageProxy({
76
  @singleton()
77
  export class PuppeteerControl extends AsyncService {
78
 
 
79
  browser!: Browser;
80
  logger = this.globalLogger.child({ service: this.constructor.name });
81
 
82
- pagePool = genericPool.createPool({
83
- create: async () => {
84
- const page = await this.newPage();
85
- return page;
86
- },
87
- destroy: async (page) => {
88
- await Promise.race([
89
- (async () => {
90
- const ctx = page.browserContext();
91
- await page.close();
92
- await ctx.close();
93
- })(), delay(5000)
94
- ]).catch((err) => {
95
- this.logger.error(`Failed to destroy page`, { err: marshalErrorLike(err) });
96
- });
97
- },
98
- validate: async (page) => {
99
- return page.browser().connected && !page.isClosed();
100
- }
101
- }, {
102
- max: Math.max(1 + Math.floor(os.totalmem() / (256 * 1024 * 1024)), 16),
103
- min: 1,
104
- acquireTimeoutMillis: 60_000,
105
- testOnBorrow: true,
106
- testOnReturn: true,
107
- autostart: false,
108
- priorityRange: 3
109
- });
110
-
111
  private __healthCheckInterval?: NodeJS.Timeout;
112
 
 
 
 
 
 
 
 
113
  constructor(protected globalLogger: Logger) {
114
  super(...arguments);
115
- this.setMaxListeners(2 * this.pagePool.max + 1);
 
 
 
 
 
 
 
 
 
 
116
  }
117
 
118
  override async init() {
@@ -121,8 +109,6 @@ export class PuppeteerControl extends AsyncService {
121
  this.__healthCheckInterval = undefined;
122
  }
123
  await this.dependencyReady();
124
- this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max });
125
- this.pagePool.start();
126
 
127
  if (this.browser) {
128
  if (this.browser.connected) {
@@ -151,24 +137,33 @@ export class PuppeteerControl extends AsyncService {
151
  this.emit('ready');
152
 
153
  this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
 
154
  }
155
 
156
  @maxConcurrency(1)
157
  async healthCheck() {
158
- this.pagePool.max += 1;
159
- const healthyPage = await this.pagePool.acquire(3).catch((err) => {
 
 
 
160
  this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
161
  return null;
162
  });
163
- this.pagePool.max -= 1;
164
 
165
  if (healthyPage) {
166
- this.pagePool.release(healthyPage);
 
 
 
 
 
 
 
167
  return;
168
  }
169
 
170
  this.logger.warn(`Trying to clean up...`);
171
- await this.pagePool.clear();
172
  this.browser.process()?.kill('SIGKILL');
173
  Reflect.deleteProperty(this, 'browser');
174
  this.emit('crippled');
@@ -178,7 +173,7 @@ export class PuppeteerControl extends AsyncService {
178
  async newPage() {
179
  await this.serviceReady();
180
  const dedicatedContext = await this.browser.createBrowserContext();
181
-
182
  const page = await dedicatedContext.newPage();
183
  const preparations = [];
184
 
@@ -300,18 +295,72 @@ document.addEventListener('readystatechange', handlePageLoad);
300
  document.addEventListener('load', handlePageLoad);
301
  `);
302
 
 
 
 
 
 
303
  return page;
304
  }
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
307
  // parsedUrl.search = '';
308
  const url = parsedUrl.toString();
309
 
310
- this.logger.info(`Scraping ${url}`, { url });
311
  let snapshot: PageSnapshot | undefined;
312
  let screenshot: Buffer | undefined;
313
-
314
- const page = await this.pagePool.acquire();
 
315
  if (options?.proxyUrl) {
316
  await page.useProxy(options.proxyUrl);
317
  }
@@ -342,7 +391,7 @@ document.addEventListener('load', handlePageLoad);
342
 
343
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
344
  .catch((err) => {
345
- this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
346
  return Promise.reject(new AssertionFailureError({
347
  message: `Failed to goto ${url}: ${err}`,
348
  cause: err,
@@ -362,7 +411,7 @@ document.addEventListener('load', handlePageLoad);
362
  }
363
  }
364
  finalized = true;
365
- this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
366
  this.emit(
367
  'crawled',
368
  { ...snapshot, screenshot },
@@ -378,7 +427,7 @@ document.addEventListener('load', handlePageLoad);
378
  nextSnapshotDeferred.resolve(snapshot);
379
  })
380
  .catch((err) => {
381
- this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
382
  });
383
  }
384
 
@@ -401,9 +450,7 @@ document.addEventListener('load', handlePageLoad);
401
  } finally {
402
  gotoPromise.finally(() => {
403
  page.off('snapshot', hdl);
404
- this.pagePool.destroy(page).catch((err) => {
405
- this.logger.warn(`Failed to destroy page`, { err: marshalErrorLike(err) });
406
- });
407
  });
408
  nextSnapshotDeferred.resolve();
409
  }
 
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
 
4
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
  import { JSDOM } from 'jsdom';
 
75
  @singleton()
76
  export class PuppeteerControl extends AsyncService {
77
 
78
+ _sn = 0;
79
  browser!: Browser;
80
  logger = this.globalLogger.child({ service: this.constructor.name });
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  private __healthCheckInterval?: NodeJS.Timeout;
83
 
84
+ __loadedPage: Page[] = [];
85
+
86
+ finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
87
+ snMap = new WeakMap<Page, number>();
88
+ livePages = new Set<Page>();
89
+ lastPageCratedAt: number = 0;
90
+
91
  constructor(protected globalLogger: Logger) {
92
  super(...arguments);
93
+ this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
94
+
95
+ this.on('crippled', () => {
96
+ this.__loadedPage.length = 0;
97
+ this.livePages.clear();
98
+ });
99
+ }
100
+
101
/**
 * Logs a one-line status summary: how many pages are tracked as alive and how
 * many are sitting in the idle (pre-loaded) list, each with its serial numbers.
 */
briefPages() {
    // Serial numbers are numbers; the default Array#sort compares them as strings
    // (so 10 would be listed before 2), hence the explicit numeric comparator.
    // Array#sort never passes `undefined` to the comparator; `?? 0` only satisfies the type checker.
    const formatSerials = (pages: Iterable<Page>) =>
        Array.from(pages, (page) => this.snMap.get(page))
            .sort((a, b) => (a ?? 0) - (b ?? 0))
            .join(', ');

    this.logger.info(`Status: ${this.livePages.size} pages alive: ${formatSerials(this.livePages)}; ${this.__loadedPage.length} idle pages: ${formatSerials(this.__loadedPage)}`);
    this.logger.info(``);
}
105
 
106
  override async init() {
 
109
  this.__healthCheckInterval = undefined;
110
  }
111
  await this.dependencyReady();
 
 
112
 
113
  if (this.browser) {
114
  if (this.browser.connected) {
 
137
  this.emit('ready');
138
 
139
  this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
140
+ this.newPage().then((r) => this.__loadedPage.push(r));
141
  }
142
 
143
  @maxConcurrency(1)
144
  async healthCheck() {
145
+ if (Date.now() - this.lastPageCratedAt <= 10_000) {
146
+ this.briefPages();
147
+ return;
148
+ }
149
+ const healthyPage = await this.newPage().catch((err) => {
150
  this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
151
  return null;
152
  });
 
153
 
154
  if (healthyPage) {
155
+ this.__loadedPage.push(healthyPage);
156
+
157
+ if (this.__loadedPage.length > 3) {
158
+ this.ditchPage(this.__loadedPage.shift()!);
159
+ }
160
+
161
+ this.briefPages();
162
+
163
  return;
164
  }
165
 
166
  this.logger.warn(`Trying to clean up...`);
 
167
  this.browser.process()?.kill('SIGKILL');
168
  Reflect.deleteProperty(this, 'browser');
169
  this.emit('crippled');
 
173
  async newPage() {
174
  await this.serviceReady();
175
  const dedicatedContext = await this.browser.createBrowserContext();
176
+ const sn = this._sn++;
177
  const page = await dedicatedContext.newPage();
178
  const preparations = [];
179
 
 
295
  document.addEventListener('load', handlePageLoad);
296
  `);
297
 
298
+ this.snMap.set(page, sn);
299
+ this.logger.warn(`Page ${sn} created.`);
300
+ this.lastPageCratedAt = Date.now();
301
+ this.livePages.add(page);
302
+
303
  return page;
304
  }
305
 
306
/**
 * Hands out a page for the caller to use.
 *
 * Takes the oldest entry of the idle list when one exists, otherwise creates a
 * page on demand. When taking from the idle list leaves at most one entry, a
 * replacement is created in the background. Every page handed out gets a
 * five-minute timer that ditches it; `ditchPage` clears that timer.
 *
 * @returns The page to use.
 */
async getNextPage() {
    const preloaded = this.__loadedPage.shift();

    // Top the idle list back up without making the caller wait for it.
    if (preloaded && this.__loadedPage.length <= 1) {
        void this.newPage()
            .then((fresh) => this.__loadedPage.push(fresh))
            .catch((err) => {
                this.logger.warn(`Failed to load new page ahead of time`, { err: marshalErrorLike(err) });
            });
    }

    const page = preloaded ?? (await this.newPage());

    // Hard upper bound on how long a handed-out page may stay open.
    const finalizer = setTimeout(() => {
        this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(page)}...`);
        this.ditchPage(page);
    }, 300 * 1000);
    this.finalizerMap.set(page, finalizer);

    return page;
}
332
+
333
/**
 * Disposes of a page: cancels its pending finalizer timer, stops tracking it,
 * and closes the page together with its browser context.
 *
 * Closing is raced against a 5 second delay so a hung close cannot block the
 * caller indefinitely; a failure to close is logged rather than thrown.
 *
 * @param page - The page to dispose of. Passing an already-closed page is a no-op
 *               apart from the bookkeeping cleanup.
 */
async ditchPage(page: Page) {
    const finalizer = this.finalizerMap.get(page);
    if (finalizer !== undefined) {
        clearTimeout(finalizer);
        this.finalizerMap.delete(page);
    }

    // Untrack before the closed-check: otherwise a page that was closed
    // elsewhere would be kept in `livePages` for good.
    this.livePages.delete(page);

    if (page.isClosed()) {
        return;
    }

    const sn = this.snMap.get(page);
    this.logger.info(`Closing page ${sn}`);

    const closePageAndContext = async () => {
        // Grab the context first; it is closed after the page itself.
        const ctx = page.browserContext();
        await page.close();
        await ctx.close();
    };

    await Promise.race([closePageAndContext(), delay(5000)]).catch((err) => {
        this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
    });
}
354
+
355
  async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
356
  // parsedUrl.search = '';
357
  const url = parsedUrl.toString();
358
 
 
359
  let snapshot: PageSnapshot | undefined;
360
  let screenshot: Buffer | undefined;
361
+ const page = await this.getNextPage();
362
+ const sn = this.snMap.get(page);
363
+ this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
364
  if (options?.proxyUrl) {
365
  await page.useProxy(options.proxyUrl);
366
  }
 
391
 
392
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
393
  .catch((err) => {
394
+ this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
395
  return Promise.reject(new AssertionFailureError({
396
  message: `Failed to goto ${url}: ${err}`,
397
  cause: err,
 
411
  }
412
  }
413
  finalized = true;
414
+ this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
415
  this.emit(
416
  'crawled',
417
  { ...snapshot, screenshot },
 
427
  nextSnapshotDeferred.resolve(snapshot);
428
  })
429
  .catch((err) => {
430
+ this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
431
  });
432
  }
433
 
 
450
  } finally {
451
  gotoPromise.finally(() => {
452
  page.off('snapshot', hdl);
453
+ this.ditchPage(page);
 
 
454
  });
455
  nextSnapshotDeferred.resolve();
456
  }