Spaces:
Build error
Build error
feat: fallback to google archive (#16)
Browse files
* feat: fallback to google archive
* fix
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
-
import type { Browser } from 'puppeteer';
|
| 4 |
import { Logger } from '../shared/services/logger';
|
| 5 |
import genericPool from 'generic-pool';
|
| 6 |
import os from 'os';
|
|
@@ -93,7 +93,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 93 |
}
|
| 94 |
}
|
| 95 |
this.browser = await puppeteer.launch({
|
| 96 |
-
headless: true,
|
| 97 |
timeout: 10_000
|
| 98 |
}).catch((err: any) => {
|
| 99 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
|
@@ -266,6 +265,16 @@ function giveSnapshot() {
|
|
| 266 |
quality: 85,
|
| 267 |
});
|
| 268 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
|
| 270 |
const nowDate = new Date();
|
| 271 |
Crawled.save(
|
|
@@ -299,6 +308,27 @@ function giveSnapshot() {
|
|
| 299 |
});
|
| 300 |
}
|
| 301 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
|
| 304 |
// Obtain the PuppeteerControl instance from the tsyringe dependency-injection container.
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 1 |
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import type { Browser, Page } from 'puppeteer';
|
| 4 |
import { Logger } from '../shared/services/logger';
|
| 5 |
import genericPool from 'generic-pool';
|
| 6 |
import os from 'os';
|
|
|
|
| 93 |
}
|
| 94 |
}
|
| 95 |
this.browser = await puppeteer.launch({
|
|
|
|
| 96 |
timeout: 10_000
|
| 97 |
}).catch((err: any) => {
|
| 98 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
|
|
|
| 265 |
quality: 85,
|
| 266 |
});
|
| 267 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 268 |
+
if (!snapshot.title || !snapshot.parsed?.content) {
|
| 269 |
+
const salvaged = await this.salvage(url, page);
|
| 270 |
+
if (salvaged) {
|
| 271 |
+
screenshot = await page.screenshot({
|
| 272 |
+
type: 'jpeg',
|
| 273 |
+
quality: 85,
|
| 274 |
+
});
|
| 275 |
+
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 276 |
+
}
|
| 277 |
+
}
|
| 278 |
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
|
| 279 |
const nowDate = new Date();
|
| 280 |
Crawled.save(
|
|
|
|
| 308 |
});
|
| 309 |
}
|
| 310 |
}
|
| 311 |
+
|
| 312 |
+
/**
 * Best-effort fallback for pages that produced an empty snapshot: points
 * `page` at the Google web-cache copy of `url` so the caller can take the
 * screenshot and snapshot again.
 *
 * The cache URL is first probed with a plain `fetch`; the browser page is
 * only navigated when that probe answers with an OK status.
 *
 * @param url - Original URL whose cached copy should be looked up.
 * @param page - Puppeteer page that gets navigated to the cached copy.
 * @returns `true` once navigation to the cached copy has been attempted
 *   (even if that navigation timed out or failed part-way), or `null` when
 *   the probe request failed or returned a non-OK status.
 */
async salvage(url: string, page: Page) {
    this.logger.info(`Salvaging ${url}`);
    const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;

    // `fetch` rejects on network-level failures (DNS, connection reset, ...).
    // Salvaging is optional, so such a failure is logged and reported as
    // "nothing to salvage" instead of propagating into the caller.
    const resp = await fetch(googleArchiveUrl, {
        headers: {
            'User-Agent': `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
        }
    }).catch((err) => {
        this.logger.warn(`Salvation probe failed for url: ${url}`, { err: marshalErrorLike(err), url });

        return null;
    });
    if (!resp) {
        return null;
    }

    // Only the status is needed; drop the body so it is not downloaded.
    resp.body?.cancel().catch(() => void 0);
    if (!resp.ok) {
        this.logger.warn(`No salvation found for url: ${url}`, { status: resp.status, url });

        return null;
    }

    // A failed or timed-out navigation is tolerated on purpose: the page may
    // still hold usable content, so the caller is told to retry the snapshot.
    await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
        this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
    });

    return true;
}
|
| 332 |
}
|
| 333 |
|
| 334 |
// Obtain the PuppeteerControl instance from the tsyringe dependency-injection container.
const puppeteerControl = container.resolve(PuppeteerControl);
|