Spaces:
Build error
Build error
fix: remove tidyMarkdown at all
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -16,7 +16,6 @@ import TurndownService from 'turndown';
|
|
| 16 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 17 |
import type { CookieParam } from 'puppeteer';
|
| 18 |
import { Crawled } from '../db/crawled';
|
| 19 |
-
import { tidyMarkdown } from '../utils/markdown';
|
| 20 |
import { cleanAttribute } from '../utils/misc';
|
| 21 |
import { randomUUID } from 'crypto';
|
| 22 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
|
@@ -198,7 +197,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 198 |
contentText = snapshot.text;
|
| 199 |
}
|
| 200 |
|
| 201 |
-
const cleanText =
|
| 202 |
|
| 203 |
const formatted = {
|
| 204 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
|
|
|
| 16 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 17 |
import type { CookieParam } from 'puppeteer';
|
| 18 |
import { Crawled } from '../db/crawled';
|
|
|
|
| 19 |
import { cleanAttribute } from '../utils/misc';
|
| 20 |
import { randomUUID } from 'crypto';
|
| 21 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
|
|
|
| 197 |
contentText = snapshot.text;
|
| 198 |
}
|
| 199 |
|
| 200 |
+
const cleanText = (contentText || '').trim();
|
| 201 |
|
| 202 |
const formatted = {
|
| 203 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
backend/functions/src/utils/markdown.ts
CHANGED
|
@@ -1,34 +1,39 @@
|
|
|
|
|
| 1 |
export function tidyMarkdown(markdown: string): string {
|
| 2 |
-
const lines = markdown.split('\n');
|
| 3 |
-
const processedLines = lines.map((line) => {
|
| 4 |
-
// Handle complex broken links with text and optional images
|
| 5 |
-
line = line.replace(/\[\s*([^\]\n!]*?)\s*(?:!\[([^\]]*)\]\((.*?)\))?\s*\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
|
| 6 |
-
text = text.replace(/\s+/g, ' ').trim();
|
| 7 |
-
alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
|
| 8 |
-
imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
|
| 9 |
-
linkUrl = linkUrl.replace(/\s+/g, '').trim();
|
| 10 |
-
if (imgUrl) {
|
| 11 |
-
return `[${text} ](${linkUrl})`;
|
| 12 |
-
} else {
|
| 13 |
-
return `[${text}](${linkUrl})`;
|
| 14 |
-
}
|
| 15 |
-
});
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
});
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
});
|
| 26 |
|
| 27 |
-
//
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
// Replace more than two consecutive empty lines with exactly two empty lines
|
| 31 |
normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
return normalizedMarkdown.trim();
|
| 34 |
}
|
|
|
|
| 1 |
+
|
| 2 |
/**
 * Repairs Markdown links whose label or URL was broken up by stray whitespace
 * or line breaks, then normalizes blank lines and leading indentation.
 *
 * Passes, in order:
 *  1. Single-line link labels: collapse whitespace in the label, strip it from the URL.
 *  2. Links whose label may wrap an image (`[text ![alt](img)](url)`), possibly
 *     spread over several lines: rebuild them on one line, keeping the image.
 *  3. Any remaining link whose label spans multiple lines.
 *  4. Collapse runs of three or more newlines into exactly two.
 *  5. Strip leading spaces/tabs from every line.
 *
 * NOTE: all whitespace inside a link destination is removed, so a destination
 * carrying a quoted title (`(url "title")`) is fused into one token. The final
 * pass also removes the indentation of every line, including lines that were
 * indented on purpose (nested lists, indented code).
 *
 * @param markdown - Raw Markdown text to clean up.
 * @returns The normalized Markdown, trimmed at both ends.
 */
export function tidyMarkdown(markdown: string): string {
    // Collapse every whitespace run (including new lines) to a single space.
    const collapseSpaces = (value: string): string => value.replace(/\s+/g, ' ').trim();
    // URLs cannot contain raw whitespace, so drop it entirely.
    const stripSpaces = (value: string): string => value.replace(/\s+/g, '').trim();

    // Step 1: Handle broken links whose label sits on a single line
    let normalizedMarkdown = markdown.replace(
        /\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string) => `[${collapseSpaces(text)}](${stripSpaces(url)})`
    );

    // Step 2: Handle links with text and an optional image spread across multiple lines
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, alt: string | undefined, imgUrl: string | undefined, linkUrl: string) => {
            const cleanText = collapseSpaces(text);
            const cleanAlt = alt ? collapseSpaces(alt) : '';
            const cleanImgUrl = imgUrl ? stripSpaces(imgUrl) : '';
            const cleanLinkUrl = stripSpaces(linkUrl);

            if (cleanImgUrl) {
                // Keep the embedded image inside the link label instead of dropping it;
                // join with a space only when there is text in front of the image.
                const image = `![${cleanAlt}](${cleanImgUrl})`;
                const label = cleanText ? `${cleanText} ${image}` : image;
                return `[${label}](${cleanLinkUrl})`;
            }
            return `[${cleanText}](${cleanLinkUrl})`;
        }
    );

    // Step 3: Normalize regular links that may be broken across lines
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, url: string) => `[${collapseSpaces(text)}](${stripSpaces(url)})`
    );

    // Step 4: Replace more than two consecutive empty lines with exactly two empty lines
    normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');

    // Step 5: Remove leading spaces from each line
    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');

    return normalizedMarkdown.trim();
}
|