Spaces:
Build error
Build error
feat: warn on non 200 response
Browse files
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
|
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
|
| 7 |
-
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
| 8 |
import puppeteer from 'puppeteer-extra';
|
| 9 |
|
| 10 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
@@ -42,11 +42,13 @@ export interface ReadabilityParsed {
|
|
| 42 |
|
| 43 |
export interface PageSnapshot {
|
| 44 |
title: string;
|
| 45 |
-
description: string;
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
| 49 |
text: string;
|
|
|
|
|
|
|
| 50 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 51 |
screenshot?: Buffer;
|
| 52 |
pageshot?: Buffer;
|
|
@@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 287 |
await this.serviceReady();
|
| 288 |
const dedicatedContext = await this.browser.createBrowserContext();
|
| 289 |
const sn = this._sn++;
|
| 290 |
-
let page
|
| 291 |
try {
|
| 292 |
page = await dedicatedContext.newPage();
|
| 293 |
} catch (err: any) {
|
|
@@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 471 |
let screenshot: Buffer | undefined;
|
| 472 |
let pageshot: Buffer | undefined;
|
| 473 |
const pdfUrls: string[] = [];
|
|
|
|
| 474 |
const page = await this.getNextPage();
|
| 475 |
page.on('response', (resp) => {
|
|
|
|
|
|
|
|
|
|
| 476 |
if (!resp.ok()) {
|
| 477 |
return;
|
| 478 |
}
|
|
@@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 638 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 639 |
this.emit(
|
| 640 |
'crawled',
|
| 641 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
{ ...options, url: parsedUrl }
|
| 643 |
);
|
| 644 |
}
|
|
@@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 691 |
}
|
| 692 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 693 |
}
|
| 694 |
-
yield {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
break;
|
| 696 |
}
|
| 697 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
|
@@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 700 |
lastHTML = snapshot.html;
|
| 701 |
}
|
| 702 |
if (snapshot || screenshot) {
|
| 703 |
-
yield {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
}
|
| 705 |
if (error) {
|
| 706 |
throw error;
|
|
|
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
|
| 7 |
+
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
|
| 8 |
import puppeteer from 'puppeteer-extra';
|
| 9 |
|
| 10 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
|
|
| 42 |
|
| 43 |
export interface PageSnapshot {
|
| 44 |
title: string;
|
| 45 |
+
description?: string;
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
| 49 |
text: string;
|
| 50 |
+
status?: number;
|
| 51 |
+
statusText?: string;
|
| 52 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 53 |
screenshot?: Buffer;
|
| 54 |
pageshot?: Buffer;
|
|
|
|
| 289 |
await this.serviceReady();
|
| 290 |
const dedicatedContext = await this.browser.createBrowserContext();
|
| 291 |
const sn = this._sn++;
|
| 292 |
+
let page;
|
| 293 |
try {
|
| 294 |
page = await dedicatedContext.newPage();
|
| 295 |
} catch (err: any) {
|
|
|
|
| 473 |
let screenshot: Buffer | undefined;
|
| 474 |
let pageshot: Buffer | undefined;
|
| 475 |
const pdfUrls: string[] = [];
|
| 476 |
+
let navigationResponse: HTTPResponse | undefined;
|
| 477 |
const page = await this.getNextPage();
|
| 478 |
page.on('response', (resp) => {
|
| 479 |
+
if (resp.request().isNavigationRequest()) {
|
| 480 |
+
navigationResponse = resp;
|
| 481 |
+
}
|
| 482 |
if (!resp.ok()) {
|
| 483 |
return;
|
| 484 |
}
|
|
|
|
| 644 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 645 |
this.emit(
|
| 646 |
'crawled',
|
| 647 |
+
{
|
| 648 |
+
...snapshot,
|
| 649 |
+
status: navigationResponse?.status(),
|
| 650 |
+
statusText: navigationResponse?.statusText(),
|
| 651 |
+
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
| 652 |
+
},
|
| 653 |
{ ...options, url: parsedUrl }
|
| 654 |
);
|
| 655 |
}
|
|
|
|
| 702 |
}
|
| 703 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 704 |
}
|
| 705 |
+
yield {
|
| 706 |
+
...snapshot,
|
| 707 |
+
status: navigationResponse?.status(),
|
| 708 |
+
statusText: navigationResponse?.statusText(),
|
| 709 |
+
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
| 710 |
+
} as PageSnapshot;
|
| 711 |
break;
|
| 712 |
}
|
| 713 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
|
|
|
| 716 |
lastHTML = snapshot.html;
|
| 717 |
}
|
| 718 |
if (snapshot || screenshot) {
|
| 719 |
+
yield {
|
| 720 |
+
...snapshot,
|
| 721 |
+
status: navigationResponse?.status(),
|
| 722 |
+
statusText: navigationResponse?.statusText(),
|
| 723 |
+
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
| 724 |
+
} as PageSnapshot;
|
| 725 |
}
|
| 726 |
if (error) {
|
| 727 |
throw error;
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
|
|
| 12 |
import { PDFExtractor } from './pdf-extract';
|
| 13 |
import { cleanAttribute } from '../utils/misc';
|
| 14 |
import _ from 'lodash';
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
export interface FormattedPage {
|
|
@@ -28,6 +29,7 @@ export interface FormattedPage {
|
|
| 28 |
pageshot?: Buffer;
|
| 29 |
links?: { [k: string]: string; };
|
| 30 |
images?: { [k: string]: string; };
|
|
|
|
| 31 |
usage?: {
|
| 32 |
total_tokens?: number;
|
| 33 |
totalTokens?: number;
|
|
@@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 323 |
[Symbol.dispose]: () => { },
|
| 324 |
};
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 327 |
formatted.images =
|
| 328 |
_(imageSummary)
|
|
@@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 369 |
suffixMixins.push(linkSummaryChunks.join('\n'));
|
| 370 |
}
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
return `Title: ${this.title}
|
| 373 |
|
| 374 |
URL Source: ${this.url}
|
|
@@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 418 |
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 419 |
mixin.links = _.invert(inferred.links || {});
|
| 420 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
return mixin;
|
| 423 |
}
|
|
|
|
| 12 |
import { PDFExtractor } from './pdf-extract';
|
| 13 |
import { cleanAttribute } from '../utils/misc';
|
| 14 |
import _ from 'lodash';
|
| 15 |
+
import { STATUS_CODES } from 'http';
|
| 16 |
|
| 17 |
|
| 18 |
export interface FormattedPage {
|
|
|
|
| 29 |
pageshot?: Buffer;
|
| 30 |
links?: { [k: string]: string; };
|
| 31 |
images?: { [k: string]: string; };
|
| 32 |
+
warning?: string;
|
| 33 |
usage?: {
|
| 34 |
total_tokens?: number;
|
| 35 |
totalTokens?: number;
|
|
|
|
| 325 |
[Symbol.dispose]: () => { },
|
| 326 |
};
|
| 327 |
|
| 328 |
+
// Attach a warning when the snapshot carries an HTTP status outside the
// 2xx success range. A missing (or zero) status is skipped entirely.
if (snapshot.status) {
    const code = snapshot.status;
    // Only 200-299 counts as success. The previous test
    // (`n < 100 || n >= 100` with n = code - 200) was true for every
    // status, so ordinary 200 responses were flagged as errors too.
    if (code < 200 || code >= 300) {
        // Prefer the reason phrase recorded on the snapshot; fall back to
        // Node's standard phrase for the code when it is empty.
        const text = snapshot.statusText || STATUS_CODES[code];
        formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
| 336 |
+
|
| 337 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 338 |
formatted.images =
|
| 339 |
_(imageSummary)
|
|
|
|
| 380 |
suffixMixins.push(linkSummaryChunks.join('\n'));
|
| 381 |
}
|
| 382 |
|
| 383 |
+
if (this.warning) {
|
| 384 |
+
mixins.push(`Warning: ${this.warning}`);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
return `Title: ${this.title}
|
| 388 |
|
| 389 |
URL Source: ${this.url}
|
|
|
|
| 433 |
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 434 |
mixin.links = _.invert(inferred.links || {});
|
| 435 |
}
|
| 436 |
+
// Attach a warning to the mixin when the snapshot carries an HTTP status
// outside the 2xx success range. A missing (or zero) status is skipped.
if (snapshot.status) {
    const code = snapshot.status;
    // Only 200-299 counts as success. The previous test
    // (`n < 100 || n >= 100` with n = code - 200) was true for every
    // status, so ordinary 200 responses were flagged as errors too.
    if (code < 200 || code >= 300) {
        // Prefer the reason phrase recorded on the snapshot; fall back to
        // Node's standard phrase for the code when it is empty.
        const text = snapshot.statusText || STATUS_CODES[code];
        mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
| 444 |
|
| 445 |
return mixin;
|
| 446 |
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
|