Spaces:

bonesmasher
/

web_reader

Build error

App Files Files Community

hanxiao committed on Apr 14, 2024

Commit

ef23d81

1 Parent(s): c7c039a

feat: clean broken markdown

Browse files

Files changed (1) hide show

backend/functions/src/cloud-functions/crawler.ts +37 -1

backend/functions/src/cloud-functions/crawler.ts CHANGED Viewed

@@ -7,6 +7,40 @@ import TurnDownService from 'turndown';
 import { Request, Response } from 'express';
 import normalizeUrl from "@esm2cjs/normalize-url";
 @singleton()
 export class CrawlerHost extends RPCHost {
@@ -34,10 +68,12 @@ export class CrawlerHost extends RPCHost {
         const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
         const formatted = {
             title: (snapshot.parsed?.title || snapshot.title || '').trim(),
             url: snapshot.href?.trim(),
-            content: contentText.trim(),
             toString() {
                 return `Title: ${this.title}

 import { Request, Response } from 'express';
 import normalizeUrl from "@esm2cjs/normalize-url";
/**
 * Repairs Markdown links that were emitted with their text or destination
 * spread over several lines or padded with stray whitespace, then collapses
 * long runs of newlines.
 *
 * Link text is only matched inside a single `[...]` pair that contains no
 * other square brackets. This keeps an unrelated bracket (a footnote marker
 * such as `[1]`, a task-list checkbox `[ ]`) from being paired with the next
 * link further down the document, which would squash everything in between
 * onto one line.
 *
 * NOTE(review): destinations are matched up to the first `)`, so URLs that
 * themselves contain a closing parenthesis are not handled.
 *
 * @param markdown Markdown text to clean up.
 * @returns The cleaned Markdown; an empty string for empty input.
 */
function tidyMarkdown(markdown: string): string {
    if (!markdown) {
        return '';
    }

    // Collapse every whitespace run (newlines included) into a single space.
    const squeeze = (value: string): string => value.replace(/\s+/g, ' ').trim();

    // Remove whitespace from a link destination, but keep an optional trailing
    // quoted title (`url "title"`) separated from the URL by one space.
    const tidyDestination = (raw: string): string => {
        const trimmed = raw.trim();
        const title = /\s+("[^"]*"|'[^']*')$/.exec(trimmed);
        if (title === null) {
            return trimmed.replace(/\s+/g, '');
        }
        const href = trimmed.slice(0, title.index).replace(/\s+/g, '');
        return `${href} ${squeeze(title[1] ?? '')}`;
    };

    // Pass 1: plain `[text](url)` links, possibly broken across lines. This
    // also normalizes the `[alt](src)` part of images, so the next pass sees
    // single-line image destinations.
    let tidied = markdown.replace(
        /\[\s*([^\[\]]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string): string =>
            `[${squeeze(text)}](${tidyDestination(url)})`,
    );

    // Pass 2: links that wrap an optional trailing image,
    // i.e. `[text ![alt](src)](url)`, possibly broken across lines.
    tidied = tidied.replace(
        /\[\s*([^\[\]!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (
            _match: string,
            text: string,
            alt: string | undefined,
            imgUrl: string | undefined,
            linkUrl: string,
        ): string => {
            const label = squeeze(text);
            const target = tidyDestination(linkUrl);
            const imageTarget = imgUrl ? tidyDestination(imgUrl) : '';
            if (!imageTarget) {
                return `[${label}](${target})`;
            }
            const image = `![${alt ? squeeze(alt) : ''}](${imageTarget})`;
            // Avoid a stray leading space when the link holds only an image.
            return label ? `[${label} ${image}](${target})` : `[${image}](${target})`;
        },
    );

    // Pass 3: collapse three or more consecutive newlines into two,
    // i.e. at most one blank line between blocks.
    return tidied.replace(/\n{3,}/g, '\n\n');
}
 @singleton()
 export class CrawlerHost extends RPCHost {
         const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
+        const cleanText = tidyMarkdown(contentText).trim();
         const formatted = {
             title: (snapshot.parsed?.title || snapshot.title || '').trim(),
             url: snapshot.href?.trim(),
+            content: cleanText,
             toString() {
                 return `Title: ${this.title}