CrashOverrideX's picture
Add files using upload-large-folder tool
31dd200 verified
import {
CODE_BLOCK_REGEXP,
LATEX_MATH_AND_CODE_PATTERN,
LATEX_LINEBREAK_REGEXP,
MHCHEM_PATTERN_MAP
} from '$lib/constants/latex-protection';
/**
* Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, avoiding dollar signs
* that appear to be part of monetary values or identifiers.
*
* This function processes the input line by line and skips `$` sequences that are likely
* part of money amounts (e.g., `$5`, `$100.99`) or code-like tokens (e.g., `var$`, `$var`).
* Valid LaTeX inline math is replaced with a placeholder like `<<LATEX_0>>`, and the
* actual LaTeX content is stored in the provided `latexExpressions` array.
*
* @param content - The input text potentially containing LaTeX expressions.
* @param latexExpressions - An array used to collect extracted LaTeX expressions.
* @returns The processed string with LaTeX replaced by placeholders.
*/
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
if (!content.includes('$')) {
return content;
}
return content
.split('\n')
.map((line) => {
if (line.indexOf('$') == -1) {
return line;
}
let processedLine = '';
let currentPosition = 0;
while (currentPosition < line.length) {
const openDollarIndex = line.indexOf('$', currentPosition);
if (openDollarIndex == -1) {
processedLine += line.slice(currentPosition);
break;
}
// Is there a next $-sign?
const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);
if (closeDollarIndex == -1) {
processedLine += line.slice(currentPosition);
break;
}
const charBeforeOpen = openDollarIndex > 0 ? line[openDollarIndex - 1] : '';
const charAfterOpen = line[openDollarIndex + 1];
const charBeforeClose =
openDollarIndex + 1 < closeDollarIndex ? line[closeDollarIndex - 1] : '';
const charAfterClose = closeDollarIndex + 1 < line.length ? line[closeDollarIndex + 1] : '';
let shouldSkipAsNonLatex = false;
if (closeDollarIndex == currentPosition + 1) {
// No content
shouldSkipAsNonLatex = true;
}
if (/[A-Za-z0-9_$-]/.test(charBeforeOpen)) {
// Character, digit, $, _ or - before first '$', no TeX.
shouldSkipAsNonLatex = true;
}
if (
/[0-9]/.test(charAfterOpen) &&
(/[A-Za-z0-9_$-]/.test(charAfterClose) || ' ' == charBeforeClose)
) {
// First $ seems to belong to an amount.
shouldSkipAsNonLatex = true;
}
if (shouldSkipAsNonLatex) {
processedLine += line.slice(currentPosition, openDollarIndex + 1);
currentPosition = openDollarIndex + 1;
continue;
}
// Treat as LaTeX
processedLine += line.slice(currentPosition, openDollarIndex);
const latexContent = line.slice(openDollarIndex, closeDollarIndex + 1);
latexExpressions.push(latexContent);
processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
currentPosition = closeDollarIndex + 1;
}
return processedLine;
})
.join('\n');
}
function escapeBrackets(text: string): string {
return text.replace(
LATEX_MATH_AND_CODE_PATTERN,
(
match: string,
codeBlock: string | undefined,
squareBracket: string | undefined,
roundBracket: string | undefined
): string => {
if (codeBlock != null) {
return codeBlock;
} else if (squareBracket != null) {
return `$$${squareBracket}$$`;
} else if (roundBracket != null) {
return `$${roundBracket}$`;
}
return match;
}
);
}
// Escape $\\ce{...} → $\\ce{...} but with proper handling
function escapeMhchem(text: string): string {
return MHCHEM_PATTERN_MAP.reduce((result, [pattern, replacement]) => {
return result.replace(pattern, replacement);
}, text);
}
const doEscapeMhchem = false;
/**
* Preprocesses markdown content to safely handle LaTeX math expressions while protecting
* against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
*
* This function:
* - Protects code blocks (```) and inline code (`...`)
* - Safeguards block and inline LaTeX: \(...\), \[...\], $$...$$, and selective $...$
* - Escapes standalone dollar signs before numbers (e.g., $5 → \$5) to prevent misinterpretation
* - Restores protected LaTeX and code blocks after processing
* - Converts \(...\) → $...$ and \[...\] → $$...$$ for compatibility with math renderers
* - Applies additional escaping for brackets and mhchem syntax if needed
*
* @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
* @returns The preprocessed string with properly escaped and normalized LaTeX.
*
* @example
* preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
* // → "Price: $10. The equation is $x^2$."
*/
export function preprocessLaTeX(content: string): string {
// See also:
// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly
// Store the structure so we can restore it later
const blockquoteMarkers: Map<number, string> = new Map();
const lines = content.split('\n');
const processedLines = lines.map((line, index) => {
const match = line.match(/^(>\s*)/);
if (match) {
blockquoteMarkers.set(index, match[1]);
return line.slice(match[1].length);
}
return line;
});
content = processedLines.join('\n');
// Step 1: Protect code blocks
const codeBlocks: string[] = [];
content = content.replace(CODE_BLOCK_REGEXP, (match) => {
codeBlocks.push(match);
return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
});
// Step 2: Protect existing LaTeX expressions
const latexExpressions: string[] = [];
// Match \S...\[...\] and protect them and insert a line-break.
content = content.replace(/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g, (match, group1, group2, group3) => {
// Check if there are characters following the formula (display-formula in a table-cell?)
if (group1.endsWith('\\')) {
return match; // Backslash before \[, do nothing.
}
const hasSuffix = /\S/.test(group3);
let optBreak;
if (hasSuffix) {
latexExpressions.push(`\\(${group2.trim()}\\)`); // Convert into inline.
optBreak = '';
} else {
latexExpressions.push(`\\[${group2}\\]`);
optBreak = '\n';
}
return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
});
// Match \(...\), \[...\], $$...$$ and protect them
content = content.replace(
/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
(match) => {
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
);
// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
content = maskInlineLaTeX(content, latexExpressions);
// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
// (Now that inline math is protected, this will only escape dollars not already protected)
content = content.replace(/\$(?=\d)/g, '\\$');
// Step 4: Restore protected LaTeX expressions (they are valid)
content = content.replace(/<<LATEX_(\d+)>>/g, (_, index) => {
let expr = latexExpressions[parseInt(index)];
const match = expr.match(LATEX_LINEBREAK_REGEXP);
if (match) {
// Katex: The $$-delimiters should be in their own line
// if there are \\-line-breaks.
const formula = match[1];
const prefix = formula.startsWith('\n') ? '' : '\n';
const suffix = formula.endsWith('\n') ? '' : '\n';
expr = '$$' + prefix + formula + suffix + '$$';
}
return expr;
});
// Step 5: Apply additional escaping functions (brackets and mhchem)
// This must happen BEFORE restoring code blocks to avoid affecting code content
content = escapeBrackets(content);
if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
content = escapeMhchem(content);
}
// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
// This must happen BEFORE restoring code blocks to avoid affecting code content
content = content
// Using the look‑behind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g.
// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
.replace(
// Using the look‑behind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g. `\\[4pt]`.
/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
(_, content: string) => {
return `$$${content}$$`;
}
);
// Step 7: Restore code blocks
// This happens AFTER all LaTeX conversions to preserve code content
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
return codeBlocks[parseInt(index)];
});
// Step 8: Restore blockquote markers
if (blockquoteMarkers.size > 0) {
const finalLines = content.split('\n');
const restoredLines = finalLines.map((line, index) => {
const marker = blockquoteMarkers.get(index);
return marker ? marker + line : line;
});
content = restoredLines.join('\n');
}
return content;
}