chat-ui / src /lib /utils /parseIncompleteMarkdown.ts
coyotte508
A new start
fc69895
/*
* Copyright 2023 Vercel, Inc.
* Source: https://github.com/vercel/streamdown/blob/main/packages/streamdown/lib/parse-incomplete-markdown.ts
*/
const linkImagePattern = /(!?\[)([^\]]*?)$/;
const boldPattern = /(\*\*)([^*]*?)$/;
const italicPattern = /(__)([^_]*?)$/;
const boldItalicPattern = /(\*\*\*)([^*]*?)$/;
const singleAsteriskPattern = /(\*)([^*]*?)$/;
const singleUnderscorePattern = /(_)([^_]*?)$/;
const inlineCodePattern = /(`)([^`]*?)$/;
const strikethroughPattern = /(~~)([^~]*?)$/;
// Helper function to check if we have a complete code block
const hasCompleteCodeBlock = (text: string): boolean => {
const tripleBackticks = (text.match(/```/g) || []).length;
return tripleBackticks > 0 && tripleBackticks % 2 === 0 && text.includes("\n");
};
// Returns the start index of the currently open fenced code block, or -1 if none
const getOpenCodeFenceIndex = (text: string): number => {
let openFenceIndex = -1;
let inFence = false;
for (const match of text.matchAll(/```/g)) {
const index = match.index ?? -1;
if (index === -1) {
continue;
}
if (inFence) {
// This fence closes the current block
inFence = false;
openFenceIndex = -1;
} else {
// This fence opens a new block
inFence = true;
openFenceIndex = index;
}
}
return openFenceIndex;
};
// Handles incomplete links and images by preserving them with a special marker
const handleIncompleteLinksAndImages = (text: string): string => {
// First check for incomplete URLs: [text](partial-url or ![text](partial-url without closing )
// Pattern: !?[text](url-without-closing-paren at end of string
const incompleteLinkUrlPattern = /(!?)\[([^\]]+)\]\(([^)]+)$/;
const incompleteLinkUrlMatch = text.match(incompleteLinkUrlPattern);
if (incompleteLinkUrlMatch) {
const isImage = incompleteLinkUrlMatch[1] === "!";
const linkText = incompleteLinkUrlMatch[2];
const partialUrl = incompleteLinkUrlMatch[3];
// Find the start position of this link/image pattern
const matchStart = text.lastIndexOf(`${isImage ? "!" : ""}[${linkText}](${partialUrl}`);
const beforeLink = text.substring(0, matchStart);
if (isImage) {
// For images with incomplete URLs, remove them entirely
return beforeLink;
}
// For links with incomplete URLs, replace the URL with placeholder and close it
return `${beforeLink}[${linkText}](streamdown:incomplete-link)`;
}
// Then check for incomplete link text: [partial-text without closing ]
const linkMatch = text.match(linkImagePattern);
if (linkMatch) {
const isImage = linkMatch[1].startsWith("!");
// For images, we still remove them as they can't show skeleton
if (isImage) {
const startIndex = text.lastIndexOf(linkMatch[1]);
return text.substring(0, startIndex);
}
// For links, preserve the text and close the link with a
// special placeholder URL that indicates it's incomplete
return `${text}](streamdown:incomplete-link)`;
}
return text;
};
// Completes incomplete bold formatting (**)
const handleIncompleteBold = (text: string): string => {
// Don't process if inside a complete code block
if (hasCompleteCodeBlock(text)) {
return text;
}
const boldMatch = text.match(boldPattern);
if (boldMatch) {
// Don't close if there's no meaningful content after the opening markers
// boldMatch[2] contains the content after **
// Check if content is only whitespace or other emphasis markers
const contentAfterMarker = boldMatch[2];
if (!contentAfterMarker || /^[\s_~*`]*$/.test(contentAfterMarker)) {
return text;
}
// Check if the bold marker is in a list item context
// Find the position of the matched bold marker
const markerIndex = text.lastIndexOf(boldMatch[1]);
// Don't process if the marker is inside an incomplete code block
const openFenceIndex = getOpenCodeFenceIndex(text);
if (openFenceIndex !== -1 && markerIndex > openFenceIndex) {
return text;
}
const beforeMarker = text.substring(0, markerIndex);
const lastNewlineBeforeMarker = beforeMarker.lastIndexOf("\n");
const lineStart = lastNewlineBeforeMarker === -1 ? 0 : lastNewlineBeforeMarker + 1;
const lineBeforeMarker = text.substring(lineStart, markerIndex);
// Check if this line is a list item with just the bold marker
if (/^[\s]*[-*+][\s]+$/.test(lineBeforeMarker)) {
// This is a list item with just emphasis markers
// Check if content after marker spans multiple lines
const hasNewlineInContent = contentAfterMarker.includes("\n");
if (hasNewlineInContent) {
// Don't complete if the content spans to another line
return text;
}
}
const asteriskPairs = (text.match(/\*\*/g) || []).length;
if (asteriskPairs % 2 === 1) {
return `${text}**`;
}
}
return text;
};
// Completes incomplete italic formatting with double underscores (__)
const handleIncompleteDoubleUnderscoreItalic = (text: string): string => {
// Don't process if inside a complete code block
if (hasCompleteCodeBlock(text)) {
return text;
}
const italicMatch = text.match(italicPattern);
if (italicMatch) {
// Don't close if there's no meaningful content after the opening markers
// italicMatch[2] contains the content after __
// Check if content is only whitespace or other emphasis markers
const contentAfterMarker = italicMatch[2];
if (!contentAfterMarker || /^[\s_~*`]*$/.test(contentAfterMarker)) {
return text;
}
// Check if the underscore marker is in a list item context
// Find the position of the matched underscore marker
const markerIndex = text.lastIndexOf(italicMatch[1]);
// Don't process if the marker is inside an incomplete code block
const openFenceIndex = getOpenCodeFenceIndex(text);
if (openFenceIndex !== -1 && markerIndex > openFenceIndex) {
return text;
}
const beforeMarker = text.substring(0, markerIndex);
const lastNewlineBeforeMarker = beforeMarker.lastIndexOf("\n");
const lineStart = lastNewlineBeforeMarker === -1 ? 0 : lastNewlineBeforeMarker + 1;
const lineBeforeMarker = text.substring(lineStart, markerIndex);
// Check if this line is a list item with just the underscore marker
if (/^[\s]*[-*+][\s]+$/.test(lineBeforeMarker)) {
// This is a list item with just emphasis markers
// Check if content after marker spans multiple lines
const hasNewlineInContent = contentAfterMarker.includes("\n");
if (hasNewlineInContent) {
// Don't complete if the content spans to another line
return text;
}
}
const underscorePairs = (text.match(/__/g) || []).length;
if (underscorePairs % 2 === 1) {
return `${text}__`;
}
}
return text;
};
// Counts single asterisks that are not part of double asterisks, not escaped, and not list markers
const countSingleAsterisks = (text: string): number => {
return text.split("").reduce((acc, char, index) => {
if (char === "*") {
const prevChar = text[index - 1];
const nextChar = text[index + 1];
// Skip if escaped with backslash
if (prevChar === "\\") {
return acc;
}
// Check if this is a list marker (asterisk at start of line followed by space)
// Look backwards to find the start of the current line
let lineStartIndex = index;
for (let i = index - 1; i >= 0; i--) {
if (text[i] === "\n") {
lineStartIndex = i + 1;
break;
}
if (i === 0) {
lineStartIndex = 0;
break;
}
}
// Check if this asterisk is at the beginning of a line (with optional whitespace)
const beforeAsterisk = text.substring(lineStartIndex, index);
if (beforeAsterisk.trim() === "" && (nextChar === " " || nextChar === "\t")) {
// This is likely a list marker, don't count it
return acc;
}
if (prevChar !== "*" && nextChar !== "*") {
return acc + 1;
}
}
return acc;
}, 0);
};
// Completes incomplete italic formatting with single asterisks (*)
const handleIncompleteSingleAsteriskItalic = (text: string): string => {
// Don't process if inside a complete code block
if (hasCompleteCodeBlock(text)) {
return text;
}
const singleAsteriskMatch = text.match(singleAsteriskPattern);
if (singleAsteriskMatch) {
// Find the first single asterisk position (not part of **)
let firstSingleAsteriskIndex = -1;
for (let i = 0; i < text.length; i++) {
if (text[i] === "*" && text[i - 1] !== "*" && text[i + 1] !== "*") {
firstSingleAsteriskIndex = i;
break;
}
}
if (firstSingleAsteriskIndex === -1) {
return text;
}
// Don't process if the marker is inside an incomplete code block
const openFenceIndex = getOpenCodeFenceIndex(text);
if (openFenceIndex !== -1 && firstSingleAsteriskIndex > openFenceIndex) {
return text;
}
// Get content after the first single asterisk
const contentAfterFirstAsterisk = text.substring(firstSingleAsteriskIndex + 1);
// Check if there's meaningful content after the asterisk
// Don't close if content is only whitespace or emphasis markers
if (!contentAfterFirstAsterisk || /^[\s_~*`]*$/.test(contentAfterFirstAsterisk)) {
return text;
}
const singleAsterisks = countSingleAsterisks(text);
if (singleAsterisks % 2 === 1) {
return `${text}*`;
}
}
return text;
};
// Check if a position is within a math block (between $ or $$)
const isWithinMathBlock = (text: string, position: number): boolean => {
// Count dollar signs before this position
let inInlineMath = false;
let inBlockMath = false;
for (let i = 0; i < text.length && i < position; i++) {
// Skip escaped dollar signs
if (text[i] === "\\" && text[i + 1] === "$") {
i++; // Skip the next character
continue;
}
if (text[i] === "$") {
// Check for block math ($$)
if (text[i + 1] === "$") {
inBlockMath = !inBlockMath;
i++; // Skip the second $
inInlineMath = false; // Block math takes precedence
} else if (!inBlockMath) {
// Only toggle inline math if not in block math
inInlineMath = !inInlineMath;
}
}
}
return inInlineMath || inBlockMath;
};
// Counts single underscores that are not part of double underscores, not escaped, and not in math blocks
const countSingleUnderscores = (text: string): number => {
return text.split("").reduce((acc, char, index) => {
if (char === "_") {
const prevChar = text[index - 1];
const nextChar = text[index + 1];
// Skip if escaped with backslash
if (prevChar === "\\") {
return acc;
}
// Skip if within math block
if (isWithinMathBlock(text, index)) {
return acc;
}
// Skip if underscore is word-internal (between word characters)
if (
prevChar &&
nextChar &&
/[\p{L}\p{N}_]/u.test(prevChar) &&
/[\p{L}\p{N}_]/u.test(nextChar)
) {
return acc;
}
if (prevChar !== "_" && nextChar !== "_") {
return acc + 1;
}
}
return acc;
}, 0);
};
// Completes incomplete italic formatting with single underscores (_)
const handleIncompleteSingleUnderscoreItalic = (text: string): string => {
// Don't process if inside a complete code block
if (hasCompleteCodeBlock(text)) {
return text;
}
const singleUnderscoreMatch = text.match(singleUnderscorePattern);
if (singleUnderscoreMatch) {
// Find the first single underscore position (not part of __ and not word-internal)
let firstSingleUnderscoreIndex = -1;
for (let i = 0; i < text.length; i++) {
if (
text[i] === "_" &&
text[i - 1] !== "_" &&
text[i + 1] !== "_" &&
text[i - 1] !== "\\" &&
!isWithinMathBlock(text, i)
) {
// Check if underscore is word-internal (between word characters)
const prevChar = i > 0 ? text[i - 1] : "";
const nextChar = i < text.length - 1 ? text[i + 1] : "";
if (
prevChar &&
nextChar &&
/[\p{L}\p{N}_]/u.test(prevChar) &&
/[\p{L}\p{N}_]/u.test(nextChar)
) {
continue;
}
firstSingleUnderscoreIndex = i;
break;
}
}
if (firstSingleUnderscoreIndex === -1) {
return text;
}
// Don't process if the marker is inside an incomplete code block
const openFenceIndex = getOpenCodeFenceIndex(text);
if (openFenceIndex !== -1 && firstSingleUnderscoreIndex > openFenceIndex) {
return text;
}
// Get content after the first single underscore
const contentAfterFirstUnderscore = text.substring(firstSingleUnderscoreIndex + 1);
// Check if there's meaningful content after the underscore
// Don't close if content is only whitespace or emphasis markers
if (!contentAfterFirstUnderscore || /^[\s_~*`]*$/.test(contentAfterFirstUnderscore)) {
return text;
}
const singleUnderscores = countSingleUnderscores(text);
if (singleUnderscores % 2 === 1) {
// If text ends with newline(s), insert underscore before them
const trailingNewlineMatch = text.match(/\n+$/);
if (trailingNewlineMatch) {
const textBeforeNewlines = text.slice(0, -trailingNewlineMatch[0].length);
return `${textBeforeNewlines}_${trailingNewlineMatch[0]}`;
}
return `${text}_`;
}
}
return text;
};
// Checks if a backtick at position i is part of a triple backtick sequence
const isPartOfTripleBacktick = (text: string, i: number): boolean => {
const isTripleStart = text.substring(i, i + 3) === "```";
const isTripleMiddle = i > 0 && text.substring(i - 1, i + 2) === "```";
const isTripleEnd = i > 1 && text.substring(i - 2, i + 1) === "```";
return isTripleStart || isTripleMiddle || isTripleEnd;
};
// Counts single backticks that are not part of triple backticks
const countSingleBackticks = (text: string): number => {
let count = 0;
for (let i = 0; i < text.length; i++) {
if (text[i] === "`" && !isPartOfTripleBacktick(text, i)) {
count++;
}
}
return count;
};
// Completes incomplete inline code formatting (`)
// Avoids completing if inside an incomplete code block
const handleIncompleteInlineCode = (text: string): string => {
// Check if we have inline triple backticks (starts with ``` and should end with ```)
// This pattern should ONLY match truly inline code (no newlines)
// Examples: ```code``` or ```python code```
const inlineTripleBacktickMatch = text.match(/^```[^`\n]*```?$/);
if (inlineTripleBacktickMatch && !text.includes("\n")) {
// Check if it ends with exactly 2 backticks (incomplete)
if (text.endsWith("``") && !text.endsWith("```")) {
return `${text}\``;
}
// Already complete inline triple backticks
return text;
}
// Check if we're inside a multi-line code block (complete or incomplete)
const allTripleBackticks = (text.match(/```/g) || []).length;
const insideIncompleteCodeBlock = allTripleBackticks % 2 === 1;
// Don't modify text if we have complete multi-line code blocks (even pairs of ```)
if (allTripleBackticks > 0 && allTripleBackticks % 2 === 0 && text.includes("\n")) {
// We have complete multi-line code blocks, don't add any backticks
return text;
}
// Special case: if text ends with ```\n (triple backticks followed by newline)
// This is actually a complete code block, not incomplete
if (text.endsWith("```\n") || text.endsWith("```")) {
// Count all triple backticks - if even, it's complete
if (allTripleBackticks % 2 === 0) {
return text;
}
}
const inlineCodeMatch = text.match(inlineCodePattern);
if (inlineCodeMatch && !insideIncompleteCodeBlock) {
// Don't close if there's no meaningful content after the opening marker
// inlineCodeMatch[2] contains the content after `
// Check if content is only whitespace or other emphasis markers
const contentAfterMarker = inlineCodeMatch[2];
if (!contentAfterMarker || /^[\s_~*`]*$/.test(contentAfterMarker)) {
return text;
}
const singleBacktickCount = countSingleBackticks(text);
if (singleBacktickCount % 2 === 1) {
return `${text}\``;
}
}
return text;
};
// Completes incomplete strikethrough formatting (~~)
const handleIncompleteStrikethrough = (text: string): string => {
const strikethroughMatch = text.match(strikethroughPattern);
if (strikethroughMatch) {
// Don't close if there's no meaningful content after the opening markers
// strikethroughMatch[2] contains the content after ~~
// Check if content is only whitespace or other emphasis markers
const contentAfterMarker = strikethroughMatch[2];
if (!contentAfterMarker || /^[\s_~*`]*$/.test(contentAfterMarker)) {
return text;
}
const tildePairs = (text.match(/~~/g) || []).length;
if (tildePairs % 2 === 1) {
return `${text}~~`;
}
}
return text;
};
// Counts single dollar signs that are not part of double dollar signs and not escaped
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const _countSingleDollarSigns = (text: string): number => {
return text.split("").reduce((acc, char, index) => {
if (char === "$") {
const prevChar = text[index - 1];
const nextChar = text[index + 1];
// Skip if escaped with backslash
if (prevChar === "\\") {
return acc;
}
if (prevChar !== "$" && nextChar !== "$") {
return acc + 1;
}
}
return acc;
}, 0);
};
// Completes incomplete block KaTeX formatting ($$)
const handleIncompleteBlockKatex = (text: string): string => {
// Count all $$ pairs in the text
const dollarPairs = (text.match(/\$\$/g) || []).length;
// If we have an even number of $$, the block is complete
if (dollarPairs % 2 === 0) {
return text;
}
// If we have an odd number, add closing $$
// Check if this looks like a multi-line math block (contains newlines after opening $$)
const firstDollarIndex = text.indexOf("$$");
const hasNewlineAfterStart =
firstDollarIndex !== -1 && text.indexOf("\n", firstDollarIndex) !== -1;
// For multi-line blocks, add newline before closing $$ if not present
if (hasNewlineAfterStart && !text.endsWith("\n")) {
return `${text}\n$$`;
}
// For inline blocks or when already ending with newline, just add $$
return `${text}$$`;
};
// Counts triple asterisks that are not part of quadruple or more asterisks
const countTripleAsterisks = (text: string): number => {
let count = 0;
const matches = text.match(/\*+/g) || [];
for (const match of matches) {
// Count how many complete triple asterisks are in this sequence
const asteriskCount = match.length;
if (asteriskCount >= 3) {
// Each group of exactly 3 asterisks counts as one triple asterisk marker
count += Math.floor(asteriskCount / 3);
}
}
return count;
};
// Completes incomplete bold-italic formatting (***)
const handleIncompleteBoldItalic = (text: string): string => {
// Don't process if inside a complete code block
if (hasCompleteCodeBlock(text)) {
return text;
}
// Don't process if text is only asterisks and has 4 or more consecutive asterisks
// This prevents cases like **** from being treated as incomplete ***
if (/^\*{4,}$/.test(text)) {
return text;
}
const boldItalicMatch = text.match(boldItalicPattern);
if (boldItalicMatch) {
// Don't close if there's no meaningful content after the opening markers
// boldItalicMatch[2] contains the content after ***
// Check if content is only whitespace or other emphasis markers
const contentAfterMarker = boldItalicMatch[2];
if (!contentAfterMarker || /^[\s_~*`]*$/.test(contentAfterMarker)) {
return text;
}
// Find the position of the matched bold-italic marker
const markerIndex = text.lastIndexOf(boldItalicMatch[1]);
// Don't process if the marker is inside an incomplete code block
const openFenceIndex = getOpenCodeFenceIndex(text);
if (openFenceIndex !== -1 && markerIndex > openFenceIndex) {
return text;
}
const tripleAsteriskCount = countTripleAsterisks(text);
if (tripleAsteriskCount % 2 === 1) {
return `${text}***`;
}
}
return text;
};
// Parses markdown text and removes incomplete tokens to prevent partial rendering
export const parseIncompleteMarkdown = (text: string): string => {
if (!text || typeof text !== "string") {
return text;
}
let result = text;
// Handle incomplete links and images first
const processedResult = handleIncompleteLinksAndImages(result);
// If we added an incomplete link marker, don't process other formatting
// as the content inside the link should be preserved as-is
if (processedResult.endsWith("](streamdown:incomplete-link)")) {
return processedResult;
}
result = processedResult;
// Handle various formatting completions
// Handle triple asterisks first (most specific)
result = handleIncompleteBoldItalic(result);
result = handleIncompleteBold(result);
result = handleIncompleteDoubleUnderscoreItalic(result);
result = handleIncompleteSingleAsteriskItalic(result);
result = handleIncompleteSingleUnderscoreItalic(result);
result = handleIncompleteInlineCode(result);
result = handleIncompleteStrikethrough(result);
// Handle KaTeX formatting (only block math with $$)
result = handleIncompleteBlockKatex(result);
// Note: We don't handle inline KaTeX with single $ as they're likely currency symbols
return result;
};