hanxiao committed on
Commit
c1743db
·
1 Parent(s): b29a569

chore: clean code

Browse files
backend/functions/package.json CHANGED
@@ -26,6 +26,7 @@
26
  },
27
  "main": "build/index.js",
28
  "dependencies": {
 
29
  "@google-cloud/translate": "^8.2.0",
30
  "@mozilla/readability": "^0.5.0",
31
  "@napi-rs/canvas": "^0.1.44",
 
26
  },
27
  "main": "build/index.js",
28
  "dependencies": {
29
+ "@esm2cjs/normalize-url": "^8.0.0",
30
  "@google-cloud/translate": "^8.2.0",
31
  "@mozilla/readability": "^0.5.0",
32
  "@napi-rs/canvas": "^0.1.44",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -5,6 +5,7 @@ import _ from 'lodash';
5
  import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
6
  import TurnDownService from 'turndown';
7
  import { Request, Response } from 'express';
 
8
 
9
 
10
  @singleton()
@@ -57,11 +58,8 @@ ${contentText.trim()}
57
  res: Response,
58
  },
59
  ) {
60
- const url = new URL(ctx.req.url, `${ctx.req.protocol}://${ctx.req.headers.host}`);
61
- const rawPath = url.pathname.split('/').filter(Boolean);
62
- const host = rawPath.shift();
63
- const urlToCrawl = new URL(`${ctx.req.protocol}://${host}/${rawPath.join('/')}`);
64
- urlToCrawl.search = url.search;
65
 
66
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
67
  const sseStream = new OutputServerEventStream();
@@ -88,7 +86,7 @@ ${contentText.trim()}
88
  });
89
  }
90
  } catch (err: any) {
91
- this.logger.error(`Failed to crawl ${url}`, { err: marshalErrorLike(err) });
92
  sseStream.write({
93
  event: 'error',
94
  data: marshalErrorLike(err),
 
5
  import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
6
  import TurnDownService from 'turndown';
7
  import { Request, Response } from 'express';
8
+ import normalizeUrl from "@esm2cjs/normalize-url";
9
 
10
 
11
  @singleton()
 
58
  res: Response,
59
  },
60
  ) {
61
+ const noSlashURL = ctx.req.url.slice(1);
62
+ const urlToCrawl = new URL(normalizeUrl(noSlashURL));
 
 
 
63
 
64
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
65
  const sseStream = new OutputServerEventStream();
 
86
  });
87
  }
88
  } catch (err: any) {
89
+ this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) });
90
  sseStream.write({
91
  event: 'error',
92
  data: marshalErrorLike(err),