Spaces:
Build error
Build error
fix: favor nominal url over real url
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -67,7 +67,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 67 |
this.emit('ready');
|
| 68 |
}
|
| 69 |
|
| 70 |
-
async formatSnapshot(snapshot: PageSnapshot) {
|
| 71 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 72 |
let turnDownService = new TurndownService();
|
| 73 |
for (const plugin of this.turnDownPlugins) {
|
|
@@ -121,7 +121,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 121 |
|
| 122 |
const formatted = {
|
| 123 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 124 |
-
url: snapshot.href?.trim(),
|
| 125 |
content: cleanText,
|
| 126 |
|
| 127 |
toString() {
|
|
@@ -188,7 +188,7 @@ ${this.content}
|
|
| 188 |
continue;
|
| 189 |
}
|
| 190 |
|
| 191 |
-
const formatted = await this.formatSnapshot(scrapped);
|
| 192 |
|
| 193 |
if (scrapped.screenshot && screenshotEnabled) {
|
| 194 |
sseStream.write({
|
|
@@ -223,7 +223,7 @@ ${this.content}
|
|
| 223 |
continue;
|
| 224 |
}
|
| 225 |
|
| 226 |
-
const formatted = await this.formatSnapshot(scrapped);
|
| 227 |
|
| 228 |
return formatted;
|
| 229 |
}
|
|
@@ -232,7 +232,7 @@ ${this.content}
|
|
| 232 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 233 |
}
|
| 234 |
|
| 235 |
-
return await this.formatSnapshot(lastScrapped);
|
| 236 |
}
|
| 237 |
|
| 238 |
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
|
@@ -241,7 +241,7 @@ ${this.content}
|
|
| 241 |
continue;
|
| 242 |
}
|
| 243 |
|
| 244 |
-
const formatted = await this.formatSnapshot(scrapped);
|
| 245 |
|
| 246 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 247 |
}
|
|
@@ -250,7 +250,7 @@ ${this.content}
|
|
| 250 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 251 |
}
|
| 252 |
|
| 253 |
-
return `${await this.formatSnapshot(lastScrapped)}`;
|
| 254 |
}
|
| 255 |
|
| 256 |
|
|
|
|
| 67 |
this.emit('ready');
|
| 68 |
}
|
| 69 |
|
| 70 |
+
async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) {
|
| 71 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 72 |
let turnDownService = new TurndownService();
|
| 73 |
for (const plugin of this.turnDownPlugins) {
|
|
|
|
| 121 |
|
| 122 |
const formatted = {
|
| 123 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 124 |
+
url: nominalUrl || snapshot.href?.trim(),
|
| 125 |
content: cleanText,
|
| 126 |
|
| 127 |
toString() {
|
|
|
|
| 188 |
continue;
|
| 189 |
}
|
| 190 |
|
| 191 |
+
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
| 192 |
|
| 193 |
if (scrapped.screenshot && screenshotEnabled) {
|
| 194 |
sseStream.write({
|
|
|
|
| 223 |
continue;
|
| 224 |
}
|
| 225 |
|
| 226 |
+
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
| 227 |
|
| 228 |
return formatted;
|
| 229 |
}
|
|
|
|
| 232 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 233 |
}
|
| 234 |
|
| 235 |
+
return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString());
|
| 236 |
}
|
| 237 |
|
| 238 |
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
|
|
|
| 241 |
continue;
|
| 242 |
}
|
| 243 |
|
| 244 |
+
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
| 245 |
|
| 246 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 247 |
}
|
|
|
|
| 250 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 251 |
}
|
| 252 |
|
| 253 |
+
return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`;
|
| 254 |
}
|
| 255 |
|
| 256 |
|