nomagick committed on
Commit
11a5a90
·
unverified ·
1 Parent(s): bda7e76

fix: favor nominal url over real url

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -67,7 +67,7 @@ export class CrawlerHost extends RPCHost {
67
  this.emit('ready');
68
  }
69
 
70
- async formatSnapshot(snapshot: PageSnapshot) {
71
  const toBeTurnedToMd = snapshot.parsed?.content;
72
  let turnDownService = new TurndownService();
73
  for (const plugin of this.turnDownPlugins) {
@@ -121,7 +121,7 @@ export class CrawlerHost extends RPCHost {
121
 
122
  const formatted = {
123
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
124
- url: snapshot.href?.trim(),
125
  content: cleanText,
126
 
127
  toString() {
@@ -188,7 +188,7 @@ ${this.content}
188
  continue;
189
  }
190
 
191
- const formatted = await this.formatSnapshot(scrapped);
192
 
193
  if (scrapped.screenshot && screenshotEnabled) {
194
  sseStream.write({
@@ -223,7 +223,7 @@ ${this.content}
223
  continue;
224
  }
225
 
226
- const formatted = await this.formatSnapshot(scrapped);
227
 
228
  return formatted;
229
  }
@@ -232,7 +232,7 @@ ${this.content}
232
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
233
  }
234
 
235
- return await this.formatSnapshot(lastScrapped);
236
  }
237
 
238
  for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
@@ -241,7 +241,7 @@ ${this.content}
241
  continue;
242
  }
243
 
244
- const formatted = await this.formatSnapshot(scrapped);
245
 
246
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
247
  }
@@ -250,7 +250,7 @@ ${this.content}
250
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
251
  }
252
 
253
- return `${await this.formatSnapshot(lastScrapped)}`;
254
  }
255
 
256
 
 
67
  this.emit('ready');
68
  }
69
 
70
+ async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) {
71
  const toBeTurnedToMd = snapshot.parsed?.content;
72
  let turnDownService = new TurndownService();
73
  for (const plugin of this.turnDownPlugins) {
 
121
 
122
  const formatted = {
123
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
124
+ url: nominalUrl || snapshot.href?.trim(),
125
  content: cleanText,
126
 
127
  toString() {
 
188
  continue;
189
  }
190
 
191
+ const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
192
 
193
  if (scrapped.screenshot && screenshotEnabled) {
194
  sseStream.write({
 
223
  continue;
224
  }
225
 
226
+ const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
227
 
228
  return formatted;
229
  }
 
232
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
233
  }
234
 
235
+ return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString());
236
  }
237
 
238
  for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
 
241
  continue;
242
  }
243
 
244
+ const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
245
 
246
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
247
  }
 
250
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
251
  }
252
 
253
+ return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`;
254
  }
255
 
256