Spaces:
Build error
Build error
fix: expose publishedTime if possible
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -123,12 +123,18 @@ export class CrawlerHost extends RPCHost {
|
|
| 123 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 124 |
url: nominalUrl || snapshot.href?.trim(),
|
| 125 |
content: cleanText,
|
|
|
|
| 126 |
|
| 127 |
toString() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
return `Title: ${this.title}
|
| 129 |
|
| 130 |
URL Source: ${this.url}
|
| 131 |
-
|
| 132 |
Markdown Content:
|
| 133 |
${this.content}
|
| 134 |
`;
|
|
|
|
| 123 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 124 |
url: nominalUrl || snapshot.href?.trim(),
|
| 125 |
content: cleanText,
|
| 126 |
+
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
| 127 |
|
| 128 |
toString() {
|
| 129 |
+
const mixins = [];
|
| 130 |
+
if (this.publishedTime) {
|
| 131 |
+
mixins.push(`Published Time: ${this.publishedTime}`);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
return `Title: ${this.title}
|
| 135 |
|
| 136 |
URL Source: ${this.url}
|
| 137 |
+
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
|
| 138 |
Markdown Content:
|
| 139 |
${this.content}
|
| 140 |
`;
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -20,23 +20,25 @@ export interface ImgBrief {
|
|
| 20 |
alt?: string;
|
| 21 |
}
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
export interface PageSnapshot {
|
| 24 |
title: string;
|
| 25 |
href: string;
|
| 26 |
html: string;
|
| 27 |
text: string;
|
| 28 |
-
parsed?:
|
| 29 |
-
title: string;
|
| 30 |
-
content: string;
|
| 31 |
-
textContent: string;
|
| 32 |
-
length: number;
|
| 33 |
-
excerpt: string;
|
| 34 |
-
byline: string;
|
| 35 |
-
dir: string;
|
| 36 |
-
siteName: string;
|
| 37 |
-
lang: string;
|
| 38 |
-
publishedTime: string;
|
| 39 |
-
} | null;
|
| 40 |
screenshot?: Buffer;
|
| 41 |
imgs?: ImgBrief[];
|
| 42 |
}
|
|
@@ -121,7 +123,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 121 |
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
| 122 |
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
| 123 |
preparations.push(page.setBypassCSP(true));
|
| 124 |
-
preparations.push(page.setViewport({ width:
|
| 125 |
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 126 |
page.emit('snapshot', snapshot);
|
| 127 |
}));
|
|
@@ -262,7 +264,7 @@ function giveSnapshot() {
|
|
| 262 |
}
|
| 263 |
screenshot = await page.screenshot({
|
| 264 |
type: 'jpeg',
|
| 265 |
-
quality:
|
| 266 |
});
|
| 267 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 268 |
if (!snapshot.title || !snapshot.parsed?.content) {
|
|
@@ -270,7 +272,7 @@ function giveSnapshot() {
|
|
| 270 |
if (salvaged) {
|
| 271 |
screenshot = await page.screenshot({
|
| 272 |
type: 'jpeg',
|
| 273 |
-
quality:
|
| 274 |
});
|
| 275 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 276 |
}
|
|
|
|
| 20 |
alt?: string;
|
| 21 |
}
|
| 22 |
|
| 23 |
+
export interface ReadabilityParsed {
|
| 24 |
+
title: string;
|
| 25 |
+
content: string;
|
| 26 |
+
textContent: string;
|
| 27 |
+
length: number;
|
| 28 |
+
excerpt: string;
|
| 29 |
+
byline: string;
|
| 30 |
+
dir: string;
|
| 31 |
+
siteName: string;
|
| 32 |
+
lang: string;
|
| 33 |
+
publishedTime: string;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
export interface PageSnapshot {
|
| 37 |
title: string;
|
| 38 |
href: string;
|
| 39 |
html: string;
|
| 40 |
text: string;
|
| 41 |
+
parsed?: Partial<ReadabilityParsed> | null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
screenshot?: Buffer;
|
| 43 |
imgs?: ImgBrief[];
|
| 44 |
}
|
|
|
|
| 123 |
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
| 124 |
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
| 125 |
preparations.push(page.setBypassCSP(true));
|
| 126 |
+
preparations.push(page.setViewport({ width: 1024, height: 1024 }));
|
| 127 |
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 128 |
page.emit('snapshot', snapshot);
|
| 129 |
}));
|
|
|
|
| 264 |
}
|
| 265 |
screenshot = await page.screenshot({
|
| 266 |
type: 'jpeg',
|
| 267 |
+
quality: 75,
|
| 268 |
});
|
| 269 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 270 |
if (!snapshot.title || !snapshot.parsed?.content) {
|
|
|
|
| 272 |
if (salvaged) {
|
| 273 |
screenshot = await page.screenshot({
|
| 274 |
type: 'jpeg',
|
| 275 |
+
quality: 75,
|
| 276 |
});
|
| 277 |
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 278 |
}
|