nomagick committed on
Commit
50ed9cc
·
unverified ·
1 Parent(s): 8a2b095

feat: fallback to google archive (#16)

Browse files

* feat: fallback to google archive

* fix

backend/functions/src/services/puppeteer.ts CHANGED
@@ -1,6 +1,6 @@
1
  import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
2
  import { container, singleton } from 'tsyringe';
3
- import type { Browser } from 'puppeteer';
4
  import { Logger } from '../shared/services/logger';
5
  import genericPool from 'generic-pool';
6
  import os from 'os';
@@ -93,7 +93,6 @@ export class PuppeteerControl extends AsyncService {
93
  }
94
  }
95
  this.browser = await puppeteer.launch({
96
- headless: true,
97
  timeout: 10_000
98
  }).catch((err: any) => {
99
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
@@ -266,6 +265,16 @@ function giveSnapshot() {
266
  quality: 85,
267
  });
268
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
 
 
 
 
 
 
 
 
 
 
269
  this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
270
  const nowDate = new Date();
271
  Crawled.save(
@@ -299,6 +308,27 @@ function giveSnapshot() {
299
  });
300
  }
301
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  }
303
 
304
  const puppeteerControl = container.resolve(PuppeteerControl);
 
1
  import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
2
  import { container, singleton } from 'tsyringe';
3
+ import type { Browser, Page } from 'puppeteer';
4
  import { Logger } from '../shared/services/logger';
5
  import genericPool from 'generic-pool';
6
  import os from 'os';
 
93
  }
94
  }
95
  this.browser = await puppeteer.launch({
 
96
  timeout: 10_000
97
  }).catch((err: any) => {
98
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
 
265
  quality: 85,
266
  });
267
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
268
+ if (!snapshot.title || !snapshot.parsed?.content) {
269
+ const salvaged = await this.salvage(url, page);
270
+ if (salvaged) {
271
+ screenshot = await page.screenshot({
272
+ type: 'jpeg',
273
+ quality: 85,
274
+ });
275
+ snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
276
+ }
277
+ }
278
  this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
279
  const nowDate = new Date();
280
  Crawled.save(
 
308
  });
309
  }
310
  }
311
+
312
/**
 * Best-effort fallback that tries to load a cached copy of `url` into `page`.
 *
 * The Google web-cache URL for `url` is first probed with a plain `fetch`;
 * only when that probe answers with an OK status is `page` navigated to the
 * cached copy. Every failure mode is logged and reported through the return
 * value instead of being thrown, so the caller can keep whatever snapshot it
 * already has.
 *
 * @param url - Address of the original page to look up in the cache.
 * @param page - Puppeteer page that gets navigated to the cached copy.
 * @returns `true` once navigation to the cached copy was attempted (even if
 *   it timed out or only partially succeeded), `null` when no cached copy
 *   could be reached.
 */
async salvage(url: string, page: Page): Promise<true | null> {
    this.logger.info(`Salvaging ${url}`);
    const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;

    // fetch() rejects on network-level failures (DNS, connection reset, ...).
    // Treat that the same as "no cached copy" rather than letting it escape.
    const resp = await fetch(googleArchiveUrl, {
        headers: {
            'User-Agent': `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
        }
    }).catch((err) => {
        this.logger.warn(`Salvation probe failed for url: ${url}`, { err: marshalErrorLike(err), url });
        return null;
    });
    if (!resp) {
        return null;
    }

    // Only the status code matters here; drop the body so the connection is released.
    resp.body?.cancel().catch(() => void 0);
    if (!resp.ok) {
        this.logger.warn(`No salvation found for url: ${url}`, { status: resp.status, url });
        return null;
    }

    // Navigation problems (e.g. the 15s timeout) are logged but not fatal:
    // the page may still have loaded enough to be worth snapshotting.
    await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
        this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
    });

    return true;
}
332
  }
333
 
334
  const puppeteerControl = container.resolve(PuppeteerControl);