nomagick committed on
Commit
a211366
·
unverified ·
1 Parent(s): 6e36f0a

fix: expose publishedTime if possible

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -123,12 +123,18 @@ export class CrawlerHost extends RPCHost {
123
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
124
  url: nominalUrl || snapshot.href?.trim(),
125
  content: cleanText,
 
126
 
127
  toString() {
 
 
 
 
 
128
  return `Title: ${this.title}
129
 
130
  URL Source: ${this.url}
131
-
132
  Markdown Content:
133
  ${this.content}
134
  `;
 
123
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
124
  url: nominalUrl || snapshot.href?.trim(),
125
  content: cleanText,
126
+ publishedTime: snapshot.parsed?.publishedTime || undefined,
127
 
128
  toString() {
129
+ const mixins = [];
130
+ if (this.publishedTime) {
131
+ mixins.push(`Published Time: ${this.publishedTime}`);
132
+ }
133
+
134
  return `Title: ${this.title}
135
 
136
  URL Source: ${this.url}
137
+ ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
138
  Markdown Content:
139
  ${this.content}
140
  `;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -20,23 +20,25 @@ export interface ImgBrief {
20
  alt?: string;
21
  }
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  export interface PageSnapshot {
24
  title: string;
25
  href: string;
26
  html: string;
27
  text: string;
28
- parsed?: {
29
- title: string;
30
- content: string;
31
- textContent: string;
32
- length: number;
33
- excerpt: string;
34
- byline: string;
35
- dir: string;
36
- siteName: string;
37
- lang: string;
38
- publishedTime: string;
39
- } | null;
40
  screenshot?: Buffer;
41
  imgs?: ImgBrief[];
42
  }
@@ -121,7 +123,7 @@ export class PuppeteerControl extends AsyncService {
121
  // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
122
  // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
123
  preparations.push(page.setBypassCSP(true));
124
- preparations.push(page.setViewport({ width: 1920, height: 1080 }));
125
  preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
126
  page.emit('snapshot', snapshot);
127
  }));
@@ -262,7 +264,7 @@ function giveSnapshot() {
262
  }
263
  screenshot = await page.screenshot({
264
  type: 'jpeg',
265
- quality: 85,
266
  });
267
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
268
  if (!snapshot.title || !snapshot.parsed?.content) {
@@ -270,7 +272,7 @@ function giveSnapshot() {
270
  if (salvaged) {
271
  screenshot = await page.screenshot({
272
  type: 'jpeg',
273
- quality: 85,
274
  });
275
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
276
  }
 
20
  alt?: string;
21
  }
22
 
23
+ export interface ReadabilityParsed {
24
+ title: string;
25
+ content: string;
26
+ textContent: string;
27
+ length: number;
28
+ excerpt: string;
29
+ byline: string;
30
+ dir: string;
31
+ siteName: string;
32
+ lang: string;
33
+ publishedTime: string;
34
+ }
35
+
36
  export interface PageSnapshot {
37
  title: string;
38
  href: string;
39
  html: string;
40
  text: string;
41
+ parsed?: Partial<ReadabilityParsed> | null;
 
 
 
 
 
 
 
 
 
 
 
42
  screenshot?: Buffer;
43
  imgs?: ImgBrief[];
44
  }
 
123
  // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
124
  // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
125
  preparations.push(page.setBypassCSP(true));
126
+ preparations.push(page.setViewport({ width: 1024, height: 1024 }));
127
  preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
128
  page.emit('snapshot', snapshot);
129
  }));
 
264
  }
265
  screenshot = await page.screenshot({
266
  type: 'jpeg',
267
+ quality: 75,
268
  });
269
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
270
  if (!snapshot.title || !snapshot.parsed?.content) {
 
272
  if (salvaged) {
273
  screenshot = await page.screenshot({
274
  type: 'jpeg',
275
+ quality: 75,
276
  });
277
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
278
  }