nomagick committed on
Commit
6e05ea2
·
unverified ·
1 Parent(s): 6147a28

feat: warn on non 200 response

Browse files
backend/functions/src/services/puppeteer.ts CHANGED
@@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
4
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
- import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
8
  import puppeteer from 'puppeteer-extra';
9
 
10
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
@@ -42,11 +42,13 @@ export interface ReadabilityParsed {
42
 
43
  export interface PageSnapshot {
44
  title: string;
45
- description: string;
46
  href: string;
47
  rebase?: string;
48
  html: string;
49
  text: string;
 
 
50
  parsed?: Partial<ReadabilityParsed> | null;
51
  screenshot?: Buffer;
52
  pageshot?: Buffer;
@@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
287
  await this.serviceReady();
288
  const dedicatedContext = await this.browser.createBrowserContext();
289
  const sn = this._sn++;
290
- let page
291
  try {
292
  page = await dedicatedContext.newPage();
293
  } catch (err: any) {
@@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
471
  let screenshot: Buffer | undefined;
472
  let pageshot: Buffer | undefined;
473
  const pdfUrls: string[] = [];
 
474
  const page = await this.getNextPage();
475
  page.on('response', (resp) => {
 
 
 
476
  if (!resp.ok()) {
477
  return;
478
  }
@@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
638
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
639
  this.emit(
640
  'crawled',
641
- { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
 
 
 
 
 
642
  { ...options, url: parsedUrl }
643
  );
644
  }
@@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
691
  }
692
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
693
  }
694
- yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
 
 
 
 
 
695
  break;
696
  }
697
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
@@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
700
  lastHTML = snapshot.html;
701
  }
702
  if (snapshot || screenshot) {
703
- yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
 
 
 
 
 
704
  }
705
  if (error) {
706
  throw error;
 
4
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
+ import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
8
  import puppeteer from 'puppeteer-extra';
9
 
10
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
 
42
 
43
  export interface PageSnapshot {
44
  title: string;
45
+ description?: string;
46
  href: string;
47
  rebase?: string;
48
  html: string;
49
  text: string;
50
+ status?: number;
51
+ statusText?: string;
52
  parsed?: Partial<ReadabilityParsed> | null;
53
  screenshot?: Buffer;
54
  pageshot?: Buffer;
 
289
  await this.serviceReady();
290
  const dedicatedContext = await this.browser.createBrowserContext();
291
  const sn = this._sn++;
292
+ let page;
293
  try {
294
  page = await dedicatedContext.newPage();
295
  } catch (err: any) {
 
473
  let screenshot: Buffer | undefined;
474
  let pageshot: Buffer | undefined;
475
  const pdfUrls: string[] = [];
476
+ let navigationResponse: HTTPResponse | undefined;
477
  const page = await this.getNextPage();
478
  page.on('response', (resp) => {
479
+ if (resp.request().isNavigationRequest()) {
480
+ navigationResponse = resp;
481
+ }
482
  if (!resp.ok()) {
483
  return;
484
  }
 
644
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
645
  this.emit(
646
  'crawled',
647
+ {
648
+ ...snapshot,
649
+ status: navigationResponse?.status(),
650
+ statusText: navigationResponse?.statusText(),
651
+ pdfs: _.uniq(pdfUrls), screenshot, pageshot,
652
+ },
653
  { ...options, url: parsedUrl }
654
  );
655
  }
 
702
  }
703
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
704
  }
705
+ yield {
706
+ ...snapshot,
707
+ status: navigationResponse?.status(),
708
+ statusText: navigationResponse?.statusText(),
709
+ pdfs: _.uniq(pdfUrls), screenshot, pageshot
710
+ } as PageSnapshot;
711
  break;
712
  }
713
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
 
716
  lastHTML = snapshot.html;
717
  }
718
  if (snapshot || screenshot) {
719
+ yield {
720
+ ...snapshot,
721
+ status: navigationResponse?.status(),
722
+ statusText: navigationResponse?.statusText(),
723
+ pdfs: _.uniq(pdfUrls), screenshot, pageshot
724
+ } as PageSnapshot;
725
  }
726
  if (error) {
727
  throw error;
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
12
  import { PDFExtractor } from './pdf-extract';
13
  import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
 
15
 
16
 
17
  export interface FormattedPage {
@@ -28,6 +29,7 @@ export interface FormattedPage {
28
  pageshot?: Buffer;
29
  links?: { [k: string]: string; };
30
  images?: { [k: string]: string; };
 
31
  usage?: {
32
  total_tokens?: number;
33
  totalTokens?: number;
@@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
323
  [Symbol.dispose]: () => { },
324
  };
325
 
 
 
 
 
 
 
 
 
 
326
  if (this.threadLocal.get('withImagesSummary')) {
327
  formatted.images =
328
  _(imageSummary)
@@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
369
  suffixMixins.push(linkSummaryChunks.join('\n'));
370
  }
371
 
 
 
 
 
372
  return `Title: ${this.title}
373
 
374
  URL Source: ${this.url}
@@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
418
  inferred ??= this.jsdomControl.inferSnapshot(snapshot);
419
  mixin.links = _.invert(inferred.links || {});
420
  }
 
 
 
 
 
 
 
 
421
 
422
  return mixin;
423
  }
 
12
  import { PDFExtractor } from './pdf-extract';
13
  import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
15
+ import { STATUS_CODES } from 'http';
16
 
17
 
18
  export interface FormattedPage {
 
29
  pageshot?: Buffer;
30
  links?: { [k: string]: string; };
31
  images?: { [k: string]: string; };
32
+ warning?: string;
33
  usage?: {
34
  total_tokens?: number;
35
  totalTokens?: number;
 
325
  [Symbol.dispose]: () => { },
326
  };
327
 
328
+ if (snapshot.status) { // Attach a warning when the navigation response was not a 2xx success.
329
+ const code = snapshot.status;
330
+ const n = code - 200; // Offset from 200: the 2xx range maps to 0..99.
331
+ if (n < 0 || n >= 100) { // Was `n < 100 || n >= 100`, which is always true and warned on 200 as well.
332
+ const text = snapshot.statusText || STATUS_CODES[code]; // `||` on purpose: an empty statusText falls back to the standard reason phrase.
333
+ formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
334
+ }
335
+ }
336
+
337
  if (this.threadLocal.get('withImagesSummary')) {
338
  formatted.images =
339
  _(imageSummary)
 
380
  suffixMixins.push(linkSummaryChunks.join('\n'));
381
  }
382
 
383
+ if (this.warning) {
384
+ mixins.push(`Warning: ${this.warning}`);
385
+ }
386
+
387
  return `Title: ${this.title}
388
 
389
  URL Source: ${this.url}
 
433
  inferred ??= this.jsdomControl.inferSnapshot(snapshot);
434
  mixin.links = _.invert(inferred.links || {});
435
  }
436
+ if (snapshot.status) { // Attach a warning to the mixin when the navigation response was not a 2xx success.
437
+ const code = snapshot.status;
438
+ const n = code - 200; // Offset from 200: the 2xx range maps to 0..99.
439
+ if (n < 0 || n >= 100) { // Was `n < 100 || n >= 100`, which is always true and warned on 200 as well.
440
+ const text = snapshot.statusText || STATUS_CODES[code]; // `||` on purpose: an empty statusText falls back to the standard reason phrase.
441
+ mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
442
+ }
443
+ }
444
 
445
  return mixin;
446
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit d287049d46781bff2032b02a2bd4322239145c95
 
1
+ Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2