File size: 4,208 Bytes
fc69895
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/*
 * Copyright 2023 Vercel, Inc.
 * Adapted from: https://github.com/vercel/streamdown/blob/main/packages/streamdown/lib/parse-blocks.tsx
 */

import { Lexer } from "marked";

/**
 * HTML void elements: they never have a closing tag, so they must not be
 * treated as the start of a multi-token HTML block.
 */
const VOID_HTML_ELEMENTS: ReadonlySet<string> = new Set([
	"area",
	"base",
	"br",
	"col",
	"embed",
	"hr",
	"img",
	"input",
	"link",
	"meta",
	"param",
	"source",
	"track",
	"wbr",
]);

/**
 * Matches a footnote reference (`[^1]`) or definition (`[^1]:`).
 * The label length is bounded ({1,200}) to keep matching cost predictable.
 */
const FOOTNOTE_MARKER = /\[\^[^\]\s]{1,200}\]/;

/** First opening tag in a block: `<name` followed by whitespace or `>`. */
const OPENING_TAG = /<(\w+)[\s>]/;

/** First closing tag in a block: `</name>`. */
const CLOSING_TAG = /<\/(\w+)>/;

/** Every `$$` display-math delimiter. */
const MATH_DELIMITER = /\$\$/g;

/** Counts the `$$` delimiters contained in `block`. */
function countMathDelimiters(block: string): number {
	return (block.match(MATH_DELIMITER) ?? []).length;
}

/** True when `block` begins with `$$` and holds an odd number of `$$` (math left open). */
function startsUnclosedMath(block: string): boolean {
	return (
		block.trimStart().startsWith("$$") &&
		countMathDelimiters(block) % 2 === 1
	);
}

/**
 * True when `block` can close a math block opened earlier: it is either a
 * lone `$$`, or it ends with its only `$$` and does not start with one.
 */
function closesMath(block: string): boolean {
	const trimmed = block.trim();
	if (trimmed === "$$") {
		return true;
	}
	return (
		trimmed.endsWith("$$") &&
		!trimmed.startsWith("$$") &&
		countMathDelimiters(block) === 1
	);
}

/**
 * Returns the name of the first opening tag in `block` when that tag is left
 * open by the block, or `undefined` when nothing needs tracking.
 *
 * A tag is not considered open when it is a void element (`<img>`, `<br>`, ...),
 * is written self-closed (`<Foo />`), or has a matching `</name>` in the block.
 */
function findUnclosedOpeningTag(block: string): string | undefined {
	const match = OPENING_TAG.exec(block);
	const tagName = match?.[1];
	if (match === null || tagName === undefined) {
		return undefined;
	}

	// Void elements can never be closed; tracking them would merge the rest
	// of the document into this block.
	if (VOID_HTML_ELEMENTS.has(tagName.toLowerCase())) {
		return undefined;
	}

	// Self-closed syntax: the opening tag itself ends in "/>".
	const tagEnd = block.indexOf(">", match.index);
	if (tagEnd > 0 && block.charAt(tagEnd - 1) === "/") {
		return undefined;
	}

	if (block.includes(`</${tagName}>`)) {
		return undefined;
	}

	return tagName;
}

/**
 * Parses markdown into independent blocks for efficient memoization during streaming.
 * Blocks are split at natural boundaries while keeping related content together.
 *
 * - A document containing any footnote reference or definition is returned as
 *   a single block, so references and definitions stay in the same tree.
 * - Tokens that follow an unclosed block-level HTML opening tag are appended
 *   to that block until an HTML token whose first closing tag matches it.
 * - A `$$` math block split across tokens is re-joined into one block.
 *
 * @param markdown - Raw markdown source.
 * @returns The raw text of each block, in document order.
 */
export function parseMarkdownIntoBlocks(markdown: string): string[] {
	if (FOOTNOTE_MARKER.test(markdown)) {
		return [markdown];
	}

	const tokens = Lexer.lex(markdown, { gfm: true });

	const mergedBlocks: string[] = [];
	// Names of HTML tags that were opened but not yet closed.
	const htmlStack: string[] = [];

	for (const token of tokens) {
		const currentBlock = token.raw;

		// Inside an open HTML block: everything belongs to the block that opened it.
		if (htmlStack.length > 0) {
			mergedBlocks[mergedBlocks.length - 1] += currentBlock;

			// NOTE(review): only the first closing tag of the token is inspected,
			// and nested tags with the same name are not counted.
			if (token.type === "html") {
				const closingTag = currentBlock.match(CLOSING_TAG)?.[1];
				if (
					closingTag !== undefined &&
					htmlStack[htmlStack.length - 1] === closingTag
				) {
					htmlStack.pop();
				}
			}
			continue;
		}

		// A block-level HTML token may open a tag that later tokens close.
		if (token.type === "html" && token.block) {
			const unclosedTag = findUnclosedOpeningTag(currentBlock);
			if (unclosedTag !== undefined) {
				htmlStack.push(unclosedTag);
			}
		}

		// Re-join display math that the lexer split across tokens.
		const previousBlock = mergedBlocks.at(-1);
		if (
			previousBlock &&
			startsUnclosedMath(previousBlock) &&
			closesMath(currentBlock)
		) {
			mergedBlocks[mergedBlocks.length - 1] = previousBlock + currentBlock;
			continue;
		}

		mergedBlocks.push(currentBlock);
	}

	return mergedBlocks;
}