nomagick committed on
Commit
36bf5d9
·
unverified ·
1 Parent(s): 59f807c

fix: remove tidyMarkdown entirely

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -16,7 +16,6 @@ import TurndownService from 'turndown';
16
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
17
  import type { CookieParam } from 'puppeteer';
18
  import { Crawled } from '../db/crawled';
19
- import { tidyMarkdown } from '../utils/markdown';
20
  import { cleanAttribute } from '../utils/misc';
21
  import { randomUUID } from 'crypto';
22
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
@@ -198,7 +197,7 @@ export class CrawlerHost extends RPCHost {
198
  contentText = snapshot.text;
199
  }
200
 
201
- const cleanText = tidyMarkdown(contentText || '').trim();
202
 
203
  const formatted = {
204
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
 
16
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
17
  import type { CookieParam } from 'puppeteer';
18
  import { Crawled } from '../db/crawled';
 
19
  import { cleanAttribute } from '../utils/misc';
20
  import { randomUUID } from 'crypto';
21
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 
197
  contentText = snapshot.text;
198
  }
199
 
200
+ const cleanText = (contentText || '').trim();
201
 
202
  const formatted = {
203
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
backend/functions/src/utils/markdown.ts CHANGED
@@ -1,34 +1,39 @@
 
1
  export function tidyMarkdown(markdown: string): string {
2
- const lines = markdown.split('\n');
3
- const processedLines = lines.map((line) => {
4
- // Handle complex broken links with text and optional images
5
- line = line.replace(/\[\s*([^\]\n!]*?)\s*(?:!\[([^\]]*)\]\((.*?)\))?\s*\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
6
- text = text.replace(/\s+/g, ' ').trim();
7
- alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
8
- imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
9
- linkUrl = linkUrl.replace(/\s+/g, '').trim();
10
- if (imgUrl) {
11
- return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
12
- } else {
13
- return `[${text}](${linkUrl})`;
14
- }
15
- });
16
 
17
- // Normalize regular links that may be broken across lines
18
- line = line.replace(/\[\s*([^\]\n]+)\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, url) => {
19
- text = text.replace(/\s+/g, ' ').trim();
20
- url = url.replace(/\s+/g, '').trim();
21
- return `[${text}](${url})`;
22
- });
 
23
 
24
- return line;
 
 
 
 
 
 
 
 
 
 
25
  });
26
 
27
- // Join the processed lines back together
28
- let normalizedMarkdown = processedLines.join('\n');
 
 
 
 
29
 
30
- // Replace more than two consecutive empty lines with exactly two empty lines
31
  normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
32
 
 
 
 
33
  return normalizedMarkdown.trim();
34
  }
 
1
+
2
  export function tidyMarkdown(markdown: string): string {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ // Step 1: Handle complex broken links with text and optional images spread across multiple lines
5
+ let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
6
+ // Remove internal new lines and excessive spaces within the text
7
+ text = text.replace(/\s+/g, ' ').trim();
8
+ url = url.replace(/\s+/g, '').trim();
9
+ return `[${text}](${url})`;
10
+ });
11
 
12
+ normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
13
+ // Normalize by removing excessive spaces and new lines
14
+ text = text.replace(/\s+/g, ' ').trim();
15
+ alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
16
+ imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
17
+ linkUrl = linkUrl.replace(/\s+/g, '').trim();
18
+ if (imgUrl) {
19
+ return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
20
+ } else {
21
+ return `[${text}](${linkUrl})`;
22
+ }
23
  });
24
 
25
+ // Step 2: Normalize regular links that may be broken across lines
26
+ normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => {
27
+ text = text.replace(/\s+/g, ' ').trim();
28
+ url = url.replace(/\s+/g, '').trim();
29
+ return `[${text}](${url})`;
30
+ });
31
 
32
+ // Step 3: Replace more than two consecutive empty lines with exactly two empty lines
33
  normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
34
 
35
+ // Step 4: Remove leading spaces from each line
36
+ normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
37
+
38
  return normalizedMarkdown.trim();
39
  }