Spaces:

bonesmasher
/

web_reader

Build error

App Files Community

nomagick committed on May 9, 2024

Commit

36bf5d9

unverified ·

1 Parent(s): 59f807c

fix: remove tidyMarkdown at all

Browse files

Files changed (2) hide show

backend/functions/src/cloud-functions/crawler.ts +1 -2
backend/functions/src/utils/markdown.ts +29 -24

backend/functions/src/cloud-functions/crawler.ts CHANGED Viewed

@@ -16,7 +16,6 @@ import TurndownService from 'turndown';
 import { parseString as parseSetCookieString } from 'set-cookie-parser';
 import type { CookieParam } from 'puppeteer';
 import { Crawled } from '../db/crawled';
-import { tidyMarkdown } from '../utils/markdown';
 import { cleanAttribute } from '../utils/misc';
 import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
@@ -198,7 +197,7 @@ export class CrawlerHost extends RPCHost {
             contentText = snapshot.text;
         }
-        const cleanText = tidyMarkdown(contentText || '').trim();
         const formatted = {
             title: (snapshot.parsed?.title || snapshot.title || '').trim(),

 import { parseString as parseSetCookieString } from 'set-cookie-parser';
 import type { CookieParam } from 'puppeteer';
 import { Crawled } from '../db/crawled';
 import { cleanAttribute } from '../utils/misc';
 import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
             contentText = snapshot.text;
         }
+        const cleanText = (contentText || '').trim();
         const formatted = {
             title: (snapshot.parsed?.title || snapshot.title || '').trim(),

backend/functions/src/utils/markdown.ts CHANGED Viewed

@@ -1,34 +1,39 @@
 export function tidyMarkdown(markdown: string): string {
-    const lines = markdown.split('\n');
-    const processedLines = lines.map((line) => {
-        // Handle complex broken links with text and optional images
-        line = line.replace(/\[\s*([^\]\n!]*?)\s*(?:!\[([^\]]*)\]\((.*?)\))?\s*\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
-            text = text.replace(/\s+/g, ' ').trim();
-            alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
-            imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
-            linkUrl = linkUrl.replace(/\s+/g, '').trim();
-            if (imgUrl) {
-                return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
-            } else {
-                return `[${text}](${linkUrl})`;
-            }
-        });
-        // Normalize regular links that may be broken across lines
-        line = line.replace(/\[\s*([^\]\n]+)\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, url) => {
-            text = text.replace(/\s+/g, ' ').trim();
-            url = url.replace(/\s+/g, '').trim();
-            return `[${text}](${url})`;
-        });
-        return line;
     });
-    // Join the processed lines back together
-    let normalizedMarkdown = processedLines.join('\n');
-    // Replace more than two consecutive empty lines with exactly two empty lines
     normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
     return normalizedMarkdown.trim();
 }

 export function tidyMarkdown(markdown: string): string {
+    // Step 1: Handle complex broken links with text and optional images spread across multiple lines
+    let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
+        // Remove internal new lines and excessive spaces within the text
+        text = text.replace(/\s+/g, ' ').trim();
+        url = url.replace(/\s+/g, '').trim();
+        return `[${text}](${url})`;
+    });
+    normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
+        // Normalize by removing excessive spaces and new lines
+        text = text.replace(/\s+/g, ' ').trim();
+        alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
+        imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
+        linkUrl = linkUrl.replace(/\s+/g, '').trim();
+        if (imgUrl) {
+            return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
+        } else {
+            return `[${text}](${linkUrl})`;
+        }
     });
+    // Step 2: Normalize regular links that may be broken across lines
+    normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => {
+        text = text.replace(/\s+/g, ' ').trim();
+        url = url.replace(/\s+/g, '').trim();
+        return `[${text}](${url})`;
+    });
+    // Step 3: Replace more than two consecutive empty lines with exactly two empty lines
     normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
+    // Step 4: Remove leading spaces from each line
+    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
     return normalizedMarkdown.trim();
 }