// File size: 4,790 Bytes (f0743f4)
// Pre-compile all regular expressions for better performance
// All patterns below carry the `g` flag, so they keep state in `lastIndex` when used with
// `exec()`; preprocessLaTeX resets `lastIndex` to 0 before each exec loop.

// Matches the opening of an inline mhchem `\ce` command: `$\ce{`
// (dollar sign, a single backslash, "ce", opening brace).
const MHCHEM_CE_REGEX = /\$\\ce\{/g;
// Matches the opening of an inline mhchem `\pu` command: `$\pu{`.
const MHCHEM_PU_REGEX = /\$\\pu\{/g;
// Matches a complete single-dollar span whose command already has a doubled backslash:
// `$\\ce{...}$`. The body may not contain `}`, so arguments with nested braces do not match.
const MHCHEM_CE_ESCAPED_REGEX = /\$\\\\ce\{[^}]*\}\$/g;
// Same as MHCHEM_CE_ESCAPED_REGEX, for `$\\pu{...}$`.
const MHCHEM_PU_ESCAPED_REGEX = /\$\\\\pu\{[^}]*\}\$/g;
// Matches only the `$` character of something that looks like a money amount. The `$` must
// not be preceded by `\` or `$`, must not be followed by `$`, and must be followed by:
// digits, optional `,ddd` thousands groups, an optional decimal part, an optional K/M/B
// suffix (either case), and then whitespace, end of input, or any non-alphanumeric character.
// The amount itself is only checked by a lookahead and is not consumed.
const CURRENCY_REGEX =
/(?<![\\$])\$(?!\$)(?=\d+(?:,\d{3})*(?:\.\d+)?(?:[KMBkmb])?(?:\s|$|[^a-zA-Z\d]))/g;
// Matches a single-dollar math span `$...$` on one line and captures its body in group 1.
// Opening `$`: not preceded by `\`, not followed by `$`.
// Body: one or more characters, matched lazily, none of which is a newline or an unescaped `$`
// (`\$` is allowed inside the body).
// Closing `$`: not preceded by `\` or by a backtick, and not followed by `$`.
const SINGLE_DOLLAR_REGEX = /(?<!\\)\$(?!\$)((?:[^$\n]|\\[$])+?)(?<!\\)(?<!`)\$(?!\$)/g;
/**
 * Rewrites inline mhchem notation (`\ce{...}` / `\pu{...}`) so that its backslash is doubled
 * and, where the whole span can be matched, its single `$` delimiters become `$$`.
 *
 * The work happens in two stages, in this order:
 * 1. every `$\ce{` / `$\pu{` opening gets a second backslash (`$\\ce{` / `$\\pu{`);
 * 2. every complete `$\\ce{...}$` / `$\\pu{...}$` span is wrapped in one more pair of dollars.
 *
 * @param text - The input text containing potential mhchem notation
 * @returns The text with mhchem commands escaped and re-delimited
 */
function escapeMhchem(text: string): string {
  // Replacer used for stage 2: surrounds the matched span with an extra `$` on each side.
  const wrapInDollars = (span: string): string => '$' + span + '$';

  // Stage 1: double the backslash of each mhchem command opening.
  const withEscapedCommands = text
    .replace(MHCHEM_CE_REGEX, '$\\\\ce{')
    .replace(MHCHEM_PU_REGEX, '$\\\\pu{');

  // Stage 2: promote the single-dollar spans produced above to double-dollar spans.
  return withEscapedCommands
    .replace(MHCHEM_CE_ESCAPED_REGEX, (span) => wrapInDollars(span))
    .replace(MHCHEM_PU_ESCAPED_REGEX, (span) => wrapInDollars(span));
}
/**
 * Scans the content once and collects the index ranges occupied by Markdown code.
 *
 * Two kinds of regions are recognised:
 * - fenced blocks delimited by three backticks: the range runs from the first backtick of
 *   the opening fence to the last backtick of the closing fence;
 * - inline spans delimited by single backticks: the range runs from the opening backtick
 *   to the closing backtick.
 *
 * Single backticks inside an open fence are ignored. A fence or span that is never closed
 * produces no region.
 *
 * The returned list is sorted by start index and contains no overlapping ranges, which is
 * what a binary search over it requires. An inline span that was opened before a fenced
 * block and closed after it swallows that block into a single merged range.
 *
 * @param content The content to analyze
 * @returns Sorted, non-overlapping `[start, end]` pairs; both indices are inclusive
 */
function findCodeBlockRegions(content: string): Array<[number, number]> {
  const regions: Array<[number, number]> = [];
  let inlineStart = -1;
  let multilineStart = -1;

  for (let i = 0; i < content.length; i++) {
    if (content[i] !== '`') continue;

    if (content.startsWith('```', i)) {
      // A fence either opens a new block or closes the one currently open.
      if (multilineStart === -1) {
        multilineStart = i;
      } else {
        regions.push([multilineStart, i + 2]);
        multilineStart = -1;
      }
      i += 2; // Skip the remaining two backticks of the fence
    } else if (multilineStart === -1) {
      // Single backtick outside of any fence: toggles an inline span.
      if (inlineStart === -1) {
        inlineStart = i;
      } else {
        regions.push([inlineStart, i]);
        inlineStart = -1;
      }
    }
  }

  // Regions are pushed when they CLOSE, so an inline span that encloses a fenced block is
  // pushed after it and overlaps it. Normalise to a sorted, disjoint list.
  regions.sort((a, b) => a[0] - b[0]);
  const merged: Array<[number, number]> = [];
  for (const [start, end] of regions) {
    const last = merged[merged.length - 1];
    if (last !== undefined && start <= last[1]) {
      last[1] = Math.max(last[1], end);
    } else {
      merged.push([start, end]);
    }
  }
  return merged;
}
/**
 * Binary-searches the region list to decide whether an index falls inside a code region.
 * Region bounds are inclusive on both ends. The search is only meaningful when the
 * regions are sorted by start index and do not overlap.
 *
 * @param position The position to check
 * @param codeRegions Array of `[start, end]` code block regions
 * @returns True if position is inside a code block
 */
function isInCodeBlock(position: number, codeRegions: Array<[number, number]>): boolean {
  for (let lo = 0, hi = codeRegions.length - 1; lo <= hi; ) {
    // Midpoint of the closed interval [lo, hi], rounded down.
    const probe = lo + Math.floor((hi - lo) / 2);
    const [regionStart, regionEnd] = codeRegions[probe];

    const insideRegion = position >= regionStart && position <= regionEnd;
    if (insideRegion) {
      return true;
    }

    if (position < regionStart) {
      // Target lies before this region: continue in the lower half.
      hi = probe - 1;
    } else {
      // Target lies after this region: continue in the upper half.
      lo = probe + 1;
    }
  }
  return false;
}
/**
 * Preprocesses LaTeX content by escaping currency indicators and converting single dollar
 * math delimiters to double dollars. Optimized for high-frequency execution.
 *
 * Steps, in order:
 * 1. return the input untouched when it contains no `$`;
 * 2. run the mhchem escaping when a `\ce{` or `\pu{` command is present;
 * 3. escape `$` signs that look like currency (`$` becomes `\$`), skipping code regions;
 * 4. rewrite `$...$` spans as `$$...$$`, skipping spans that start inside a code region.
 *
 * @param content The input string containing LaTeX expressions.
 * @returns The processed string with escaped currency indicators and converted math delimiters.
 */
export function preprocessLaTeX(content: string): string {
  // Early return for the most common case: nothing to do without a dollar sign
  if (!content.includes('$')) return content;

  // Process mhchem first (usually rare, so check if needed)
  let processed = content;
  if (content.includes('\\ce{') || content.includes('\\pu{')) {
    processed = escapeMhchem(content);
  }

  // Code regions are index-based, so they are only valid for the exact string they were
  // computed from.
  let codeRegions = findCodeBlockRegions(processed);
  let match: RegExpExecArray | null;

  // First pass: escape currency dollar signs
  const currencyParts: string[] = [];
  let lastIndex = 0;
  // The regex is a shared module-level object with the `g` flag; reset its cursor
  CURRENCY_REGEX.lastIndex = 0;
  while ((match = CURRENCY_REGEX.exec(processed)) !== null) {
    if (!isInCodeBlock(match.index, codeRegions)) {
      currencyParts.push(processed.substring(lastIndex, match.index), '\\$');
      lastIndex = match.index + 1; // the match is exactly the one `$` character
    }
  }
  if (currencyParts.length > 0) {
    currencyParts.push(processed.substring(lastIndex));
    processed = currencyParts.join('');
    // Every inserted backslash shifts all following characters by one, so the regions
    // computed above no longer line up with `processed`. Recompute them; otherwise the
    // second pass tests shifted indices against stale ranges and may rewrite `$...$`
    // that sits inside code.
    codeRegions = findCodeBlockRegions(processed);
  }

  // Second pass: convert single dollar delimiters to double dollars
  const mathParts: string[] = [];
  lastIndex = 0;
  // Reset the shared regex cursor before reuse
  SINGLE_DOLLAR_REGEX.lastIndex = 0;
  while ((match = SINGLE_DOLLAR_REGEX.exec(processed)) !== null) {
    if (!isInCodeBlock(match.index, codeRegions)) {
      mathParts.push(processed.substring(lastIndex, match.index));
      mathParts.push(`$$${match[1]}$$`);
      lastIndex = match.index + match[0].length;
    }
  }
  mathParts.push(processed.substring(lastIndex));
  return mathParts.join('');
}
|