hanxiao committed on
Commit
ef23d81
·
1 Parent(s): c7c039a

feat: clean broken markdown

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -7,6 +7,40 @@ import TurnDownService from 'turndown';
7
  import { Request, Response } from 'express';
8
  import normalizeUrl from "@esm2cjs/normalize-url";
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  @singleton()
12
  export class CrawlerHost extends RPCHost {
@@ -34,10 +68,12 @@ export class CrawlerHost extends RPCHost {
34
 
35
  const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
36
 
 
 
37
  const formatted = {
38
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
39
  url: snapshot.href?.trim(),
40
- content: contentText.trim(),
41
 
42
  toString() {
43
  return `Title: ${this.title}
 
7
  import { Request, Response } from 'express';
8
  import normalizeUrl from "@esm2cjs/normalize-url";
9
 
10
/**
 * Repairs inline Markdown links and images whose pieces were spread across
 * several lines or padded with stray whitespace, then squeezes runs of
 * newlines.
 *
 * What is normalized:
 *  - Link text / image alt text: whitespace runs collapse to one space and
 *    the ends are trimmed.
 *  - Link / image destination: all whitespace is removed (a URL split across
 *    lines is stitched back together). A trailing quoted title
 *    (`"..."` or `'...'`) is kept, separated from the destination by a
 *    single space.
 *  - Whitespace between `]` and `(` is dropped, so `[text] (url)` is treated
 *    as a link, as in the previous implementation.
 *  - Three or more consecutive newlines become exactly two (i.e. at most one
 *    blank line between blocks).
 *
 * Link text may contain inline images and one level of balanced `[...]`, but
 * a match never crosses an unbalanced bracket. This keeps footnote markers
 * (`[1]`) and task-list boxes (`[ ]`) from being glued to the next link
 * further down the document.
 *
 * @param markdown Markdown text to clean up.
 * @returns The cleaned Markdown; an empty string for empty or missing input.
 */
function tidyMarkdown(markdown: string): string {
    // The visible caller may hand over the result of an optional chain, so
    // tolerate a missing value instead of throwing on `.replace`.
    if (!markdown) {
        return '';
    }

    const collapseSpaces = (value: string): string => value.replace(/\s+/g, ' ').trim();

    // Normalizes the part between the parentheses: `destination` or
    // `destination "title"`.
    const tidyTarget = (target: string): string => {
        const trimmed = target.trim();
        const titled = /^([\s\S]*?)\s+("[^"]*"|'[^']*')$/.exec(trimmed);
        if (titled) {
            const [, destination = '', title = ''] = titled;
            return `${destination.replace(/\s+/g, '')} ${collapseSpaces(title)}`;
        }
        return trimmed.replace(/\s+/g, '');
    };

    // One inline image: ![alt](target)
    const imagePattern = /!\[([^\[\]]*)\]\s*\(\s*([^)]*?)\s*\)/g;

    // One inline link: [text](target). `text` is built only from complete
    // inline images, balanced `[...]` pairs, and non-bracket characters, so
    // the lazy match is bounded by the nearest unbalanced bracket.
    const linkPattern = /\[((?:!\[[^\[\]]*\]\s*\([^)]*\)|\[[^\[\]]*\]|[^\[\]])*?)\]\s*\(\s*([^)]+?)\s*\)/g;

    const relinked = markdown.replace(linkPattern, (_match: string, text: string, target: string) => {
        // Tidy images nested in the link text first so their destinations are
        // already whitespace-free before the surrounding text is collapsed.
        const textWithImages = text.replace(
            imagePattern,
            (_image: string, alt: string, imageTarget: string) => `![${collapseSpaces(alt)}](${tidyTarget(imageTarget)})`
        );
        return `[${collapseSpaces(textWithImages)}](${tidyTarget(target)})`;
    });

    // Collapse 3+ consecutive newlines into two, leaving at most one blank line.
    return relinked.replace(/\n{3,}/g, '\n\n');
}
44
 
45
  @singleton()
46
  export class CrawlerHost extends RPCHost {
 
68
 
69
  const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
70
 
71
+ const cleanText = tidyMarkdown(contentText).trim();
72
+
73
  const formatted = {
74
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
75
  url: snapshot.href?.trim(),
76
+ content: cleanText,
77
 
78
  toString() {
79
  return `Title: ${this.title}