Spaces:
Running
Running
| import { readFileSync, writeFileSync, existsSync, readdirSync } from 'fs'; | |
| import { join, dirname } from 'path'; | |
| import { fileURLToPath } from 'url'; | |
// ES modules do not provide __filename/__dirname, so derive both from this module's URL.
| const __filename = fileURLToPath(import.meta.url); | |
| const __dirname = dirname(__filename); | |
| /** | |
| * Post-processor for cleaning Markdown content from LaTeX conversion | |
| * Each function handles a specific type of cleanup for maintainability | |
| */ | |
/**
 * Strip low-level TeX grouping primitives that break KaTeX.
 *
 * The compound forms are deleted first; the bare `\bgroup` / `\egroup`
 * passes that follow then remove whatever is left.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with the grouping commands removed
 */
function removeTexGroupingCommands(content) {
  console.log(' π§Ή Removing TeX grouping commands...');

  // Order matters: longer sequences must go before their bare suffixes.
  const groupingPatterns = [
    /\\mathopen\{\}\\mathclose\\bgroup/g,
    /\\aftergroup\\egroup/g,
    /\\bgroup/g,
    /\\egroup/g,
  ];

  return groupingPatterns.reduce((text, pattern) => text.replace(pattern, ''), content);
}
/**
 * Simplify `\left[ ... \right]` constructions to plain `[ ... ]`.
 *
 * `\left` and `\right` must stay balanced, so the delimiters are paired with a
 * stack before anything is rewritten. A `\left[` is only simplified when its
 * partner is `\right]` (or when it has no partner at all); mixed pairs such as
 * the half-open interval `\left[0, 1\right)` are left untouched, because
 * dropping just one half would leave an orphaned `\right)`.
 *
 * Whitespace directly after a simplified `[` and directly before a simplified
 * `]` is removed.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function simplifyLatexDelimiters(content) {
  console.log(' π§ Simplifying LaTeX delimiters...');

  // `(?![A-Za-z])` keeps \leftarrow, \rightarrow, ... from being read as \left / \right.
  const delimiterPattern = /\\(left|right)(?![A-Za-z])\s*(\\[A-Za-z]+|\\[^A-Za-z]|[^\\\s])/g;
  const tokens = [...content.matchAll(delimiterPattern)];
  const isLeftSquare = (token) => token[0] === '\\left[';
  const isRightSquare = (token) => token[0] === '\\right]';

  // Pair every \right with the most recent unmatched \left.
  const toSimplify = new Set();
  const openStack = [];
  tokens.forEach((token, index) => {
    if (token[1] === 'left') {
      openStack.push(index);
      return;
    }
    const openIndex = openStack.pop();
    if (openIndex === undefined) {
      // Stray \right]: already unbalanced, a plain bracket is the best we can do.
      if (isRightSquare(token)) {
        toSimplify.add(index);
      }
      return;
    }
    if (isLeftSquare(tokens[openIndex]) && isRightSquare(token)) {
      toSimplify.add(openIndex);
      toSimplify.add(index);
    }
  });
  // Stray \left[ without any partner.
  for (const index of openStack) {
    if (isLeftSquare(tokens[index])) {
      toSimplify.add(index);
    }
  }

  // Rebuild the text, rewriting only the selected tokens.
  const whitespaceRun = /\s*/y;
  let result = '';
  let cursor = 0;
  tokens.forEach((token, index) => {
    if (!toSimplify.has(index)) {
      return;
    }
    const tokenEnd = token.index + token[0].length;
    if (token[1] === 'left') {
      result += `${content.slice(cursor, token.index)}[`;
      whitespaceRun.lastIndex = tokenEnd;
      whitespaceRun.exec(content);
      cursor = whitespaceRun.lastIndex;
    } else {
      result += `${content.slice(cursor, token.index).trimEnd()}]`;
      cursor = tokenEnd;
    }
  });

  return result + content.slice(cursor);
}
/**
 * Drop `\label{...}` commands left over from the LaTeX source.
 *
 * Runs in two passes: lines that consist solely of a label (plus surrounding
 * whitespace) are emptied first, then every remaining inline label is removed.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content without `\label{...}` commands
 */
function removeOrphanedLabels(content) {
  console.log(' π·οΈ Removing orphaned labels...');

  const labelOnlyLine = /^\s*\\label\{[^}]+\}\s*$/gm;
  const inlineLabel = /\\label\{[^}]+\}/g;

  const withoutLabelLines = content.replace(labelOnlyLine, '');
  return withoutLabelLines.replace(inlineLabel, '');
}
/**
 * Fix KaTeX-incompatible math commands
 *
 * Currently rewrites `\hdots` to `\ldots`. The pattern refuses to match when a
 * letter follows, so longer control words that merely start with `\hdots`
 * (for example `\hdotsfor`) are not mangled.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixMathCommands(content) {
  console.log(' π Fixing KaTeX-incompatible math commands...');

  // [pattern, replacement] pairs; add more math command fixes here as needed.
  const commandFixes = [
    [/\\hdots(?![A-Za-z])/g, '\\ldots'],
  ];

  return commandFixes.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    content
  );
}
/**
 * Find the `}` that closes a group whose `{` has already been consumed.
 * Backslash-escaped characters (`\{`, `\}`, `\\`) do not affect nesting.
 * @param {string} text - Text to scan
 * @param {number} from - Index of the first character inside the group
 * @returns {number} - Index of the matching `}`, or -1 if the group never closes
 */
function findClosingBraceIndex(text, from) {
  let depth = 1;
  for (let i = from; i < text.length; i++) {
    const ch = text[i];
    if (ch === '\\') {
      i++; // skip the escaped character
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

/**
 * Convert LaTeX matrix commands to KaTeX-compatible environments
 *
 * Rewrites `\pmatrix{...}`, `\bmatrix{...}` and `\vmatrix{...}` into the
 * corresponding `\begin{...}...\end{...}` environment, one row per line.
 * The argument is located by brace matching, so arbitrarily nested groups
 * (e.g. `\frac{a_{1}}{2}`) are handled. A command whose brace never closes is
 * left untouched.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed matrix commands
 */
function fixMatrixCommands(content) {
  console.log(' π’ Converting matrix commands to KaTeX format...');
  let fixedCount = 0;
  let result = content;

  for (const name of ['pmatrix', 'bmatrix', 'vmatrix']) {
    const marker = `\\${name}{`;
    let output = '';
    let cursor = 0;

    for (;;) {
      const start = result.indexOf(marker, cursor);
      if (start === -1) {
        break;
      }
      const bodyStart = start + marker.length;
      const bodyEnd = findClosingBraceIndex(result, bodyStart);
      if (bodyEnd === -1) {
        // Unbalanced braces: copy the command through unchanged and keep scanning.
        output += result.slice(cursor, bodyStart);
        cursor = bodyStart;
        continue;
      }

      // Rows are separated by `\\`; empty rows are dropped.
      const rows = result
        .slice(bodyStart, bodyEnd)
        .split('\\\\')
        .map((row) => row.trim())
        .filter((row) => row);

      output += result.slice(cursor, start);
      output += `\\begin{${name}}\n${rows.join(' \\\\\n')}\n\\end{${name}}`;
      cursor = bodyEnd + 1;
      fixedCount++;
    }

    result = output + result.slice(cursor);
  }

  if (fixedCount > 0) {
    console.log(` β Fixed ${fixedCount} matrix command(s)`);
  }
  return result;
}
/**
 * Fix Unicode characters that break MDX/JSX parsing
 *
 * - Inside `$...$` and `$$...$$` spans every middle dot (U+00B7) becomes
 *   `\cdot`; a separating space is added when a letter follows so the result
 *   is not read as a longer control word (`\cdotb`).
 * - Everywhere: curly quotes become straight quotes, the ellipsis character
 *   becomes `...`, an en dash becomes `-` and an em dash becomes `--`.
 *
 * All non-ASCII characters are written as `\u` escapes so the patterns do not
 * depend on the encoding this file is saved or served in.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixUnicodeIssues(content) {
  console.log(' π Fixing Unicode characters for MDX compatibility...');

  // Matching whole spans left-to-right pairs each opening `$` with its own
  // closing `$`, so text between two separate formulas is never touched.
  const mathSpan = /\$\$[^$]+\$\$|\$[^$]+\$/g;

  return content
    .replace(mathSpan, (span) =>
      span
        .replace(/\u00B7(?=[A-Za-z])/g, '\\cdot ')
        .replace(/\u00B7/g, '\\cdot')
    )
    .replace(/[\u201C\u201D]/g, '"') // Curly double quotes to regular quotes
    .replace(/[\u2018\u2019]/g, "'") // Curly single quotes to regular apostrophes
    .replace(/\u2026/g, '...') // Ellipsis to three dots
    .replace(/\u2013/g, '-') // En dash to hyphen
    .replace(/\u2014/g, '--'); // Em dash to double hyphen
}
/**
 * Fix multiline math expressions for MDX compatibility
 *
 * 1. Inline math (`$...$` on a single line) that contains a `\\` row break and
 *    at least one math operator is promoted to a `$$` display block, with one
 *    row per line.
 * 2. Display blocks whose `$$` fences sit on their own lines are re-emitted
 *    with a newline before and after the block.
 *
 * Existing `$$...$$` spans are consumed by the first pattern and returned
 * unchanged, so the inner `$...$` of a single-line display formula is never
 * mistaken for inline math (which would produce `$$$ ... $$$`).
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixMultilineMath(content) {
  console.log(' π Fixing multiline math expressions for MDX...');
  return content
    .replace(/\$\$[^$]*\$\$|\$([^$\n]+)\$/g, (match, inlineMath) => {
      // `inlineMath` is undefined when the display-math alternative matched.
      if (inlineMath === undefined) {
        return match;
      }
      // Only convert if it contains a row break and actual math operators.
      if (!inlineMath.includes('\\\\') || !/[=+\-*/^_{}]/.test(inlineMath)) {
        return match;
      }
      const cleanedMath = inlineMath
        .trim()
        .replace(/\s*\\\\\s*/g, '\\\\\n ');
      return `$$\n${cleanedMath}\n$$`;
    })
    // Ensure display math blocks are properly separated
    .replace(/\$\$\s*\n\s*([^$]+?)\s*\n\s*\$\$/g, (match, mathContent) => {
      return `\n$$\n${mathContent.trim()}\n$$\n`;
    });
}
/**
 * Inject code snippets into empty code blocks
 *
 * An "empty code block" is an opening fence with a language tag on the same
 * line, followed only by whitespace and the closing fence. For each one, the
 * first file in `<inputDir>/snippets` whose extension matches the language is
 * read and placed inside the block. Blocks with no matching or readable
 * snippet are left unchanged.
 *
 * The language tag must sit on the fence line itself, so an unlabelled block
 * whose only content is a single word is not mistaken for an empty block.
 *
 * @param {string} content - Markdown content
 * @param {string} inputDir - Directory containing the LaTeX source and snippets
 * @returns {string} - Content with injected code snippets
 */
function injectCodeSnippets(content, inputDir = null) {
  console.log(' π» Injecting code snippets...');
  if (!inputDir) {
    console.log(' β οΈ No input directory provided, skipping code injection');
    return content;
  }

  const snippetsDir = join(inputDir, 'snippets');
  if (!existsSync(snippetsDir)) {
    console.log(' β οΈ Snippets directory not found, skipping code injection');
    return content;
  }

  // Get all available snippet files
  let availableSnippets = [];
  try {
    availableSnippets = readdirSync(snippetsDir);
    console.log(` π Found ${availableSnippets.length} snippet file(s): ${availableSnippets.join(', ')}`);
  } catch (error) {
    console.log(` β Error reading snippets directory: ${error.message}`);
    return content;
  }

  // Language names whose file extension differs from the name itself.
  // A Map (not a plain object) so tags like "constructor" cannot hit prototype keys.
  const extensionByLanguage = new Map([
    ['python', 'py'],
    ['javascript', 'js'],
    ['typescript', 'ts'],
    ['bash', 'sh'],
    ['shell', 'sh'],
  ]);

  // Only spaces/tabs may separate the fence from its language tag.
  const emptyCodeBlockPattern = /```[ \t]*(\w+)[ \t]*\r?\n\s*```/g;
  let injectionCount = 0;

  const processedContent = content.replace(emptyCodeBlockPattern, (match, language) => {
    const fileExtension = extensionByLanguage.get(language) ?? language;

    // Use the first matching file (could be made smarter with context analysis)
    const selectedFile = availableSnippets.find((file) => file.endsWith(`.${fileExtension}`));
    if (selectedFile === undefined) {
      console.log(` β οΈ No ${language} snippet found (looking for .${fileExtension})`);
      return match;
    }

    try {
      const snippetContent = readFileSync(join(snippetsDir, selectedFile), 'utf8');
      injectionCount++;
      console.log(` β Injected: ${selectedFile}`);
      return `\`\`\`${language}\n${snippetContent.trim()}\n\`\`\``;
    } catch (error) {
      console.log(` β Error reading ${selectedFile}: ${error.message}`);
      return match;
    }
  });

  if (injectionCount > 0) {
    console.log(` π Injected ${injectionCount} code snippet(s)`);
  }
  return processedContent;
}
/**
 * Fix all attributes that still contain colons (href, data-reference, id)
 *
 * Every colon inside a matching attribute value is replaced with a hyphen,
 * so `fig:a:b` becomes `fig-a-b`. `href` values that are clearly external
 * (`scheme://...`, `mailto:`, `tel:`, `data:`) are left untouched, because
 * rewriting their colon would break the link.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixAllAttributes(content) {
  console.log(' π Fixing all attributes with colons...');
  const externalHref = /^(?:[a-z][a-z0-9+.-]*:\/\/|mailto:|tel:|data:)/i;
  let fixedCount = 0;

  // Rewrites one attribute kind; `isExempt` lets a value opt out of rewriting.
  const rewriteAttribute = (text, pattern, name, isExempt = () => false) =>
    text.replace(pattern, (match, value) => {
      if (isExempt(value)) {
        return match;
      }
      fixedCount++;
      return `${name}="${value.replace(/:/g, '-')}"`;
    });

  let result = rewriteAttribute(
    content,
    /href="([^"]*:[^"]*)"/g,
    'href',
    (value) => externalHref.test(value)
  );
  result = rewriteAttribute(result, /data-reference="([^"]*:[^"]*)"/g, 'data-reference');
  // id attributes (like in Figure components)
  result = rewriteAttribute(result, /id="([^"]*:[^"]*)"/g, 'id');

  if (fixedCount > 0) {
    console.log(` β Fixed ${fixedCount} attribute(s) with colons`);
  }
  return result;
}
/**
 * Fix link text content that still contains colons
 *
 * Targets links of the form `<a ...>[text:content]</a>` and replaces every
 * colon inside the square brackets with a hyphen. The bracketed text may not
 * contain `]`, so a match can never run past the end of its own link.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixLinkTextContent(content) {
  console.log(' π Fixing link text content with colons...');
  let fixedCount = 0;

  const cleanedContent = content.replace(
    /<a([^>]*)>\[([^\]]*:[^\]]*)\]<\/a>/g,
    (match, attributes, linkText) => {
      fixedCount++;
      return `<a${attributes}>[${linkText.replace(/:/g, '-')}]</a>`;
    }
  );

  if (fixedCount > 0) {
    console.log(` β Fixed ${fixedCount} link text(s) with colons`);
  }
  return cleanedContent;
}
/**
 * Turn `%%ALIGN_ANCHOR_ID{...}%%` markers into HTML anchor spans.
 *
 * A marker is recognised only as the first line of a "``` math" fenced block.
 * It is removed from the block and an empty `<span id="...">` carrying the
 * same id is emitted in front of the fence, separated by a blank line.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with converted anchor spans
 */
function convertAlignAnchors(content) {
  console.log(' π·οΈ Converting align anchor markers to HTML spans...');

  const anchoredMathBlock = /``` math\n%%ALIGN_ANCHOR_ID\{([^}]+)\}%%\n([\s\S]*?)\n```/g;
  const fence = '```';
  let convertedCount = 0;

  const toSpanAndBlock = (_whole, anchorId, mathContent) => {
    convertedCount += 1;
    const anchorSpan = `<span id="${anchorId}" style="position: absolute;"></span>`;
    // span, blank line, then the math block without its marker line
    return [anchorSpan, '', `${fence} math`, mathContent, fence].join('\n');
  };

  const converted = content.replace(anchoredMathBlock, toSpanAndBlock);

  if (convertedCount > 0) {
    console.log(` β Converted ${convertedCount} align anchor marker(s) to spans`);
  }
  return converted;
}
/**
 * Run the complete clean-up pipeline over Markdown produced by Pandoc.
 *
 * The clean-up steps are applied in a fixed order, each receiving the output
 * of the previous one. Code snippets are injected last, and only when an
 * input directory is supplied.
 *
 * @param {string} content - Raw Markdown content from Pandoc
 * @param {string} inputDir - Optional: Directory containing LaTeX source for code injection
 * @returns {string} - Cleaned Markdown content
 */
export function postProcessMarkdown(content, inputDir = null) {
  console.log('π§ Post-processing for KaTeX compatibility...');

  // Order is significant; keep it in sync with the expectations of each step.
  const cleanupSteps = [
    removeTexGroupingCommands,
    simplifyLatexDelimiters,
    removeOrphanedLabels,
    convertAlignAnchors,
    fixMathCommands,
    fixMatrixCommands,
    fixUnicodeIssues,
    fixMultilineMath,
    fixAllAttributes,
    fixLinkTextContent,
  ];
  const cleaned = cleanupSteps.reduce((text, step) => step(text), content);

  return inputDir ? injectCodeSnippets(cleaned, inputDir) : cleaned;
}
| /** | |
| * CLI interface for standalone usage | |
| * | |
| * Reads `process.argv` and recognises `--input=PATH`, `--output=PATH`, | |
| * `--verbose` and `--help` / `-h`; any other argument is ignored. | |
| * `--help` / `-h` prints the usage text and exits the process with code 0. | |
| * | |
| * @returns {{input: string, output: string, verbose: boolean}} Parsed options; | |
| * `input` defaults to `output/main.md` under this script's directory and | |
| * `output` defaults to the value of `input` (in-place processing). | |
| */ | |
| function parseArgs() { | |
| const args = process.argv.slice(2); | |
| const config = { | |
| input: join(__dirname, 'output', 'main.md'), | |
| output: null, // Will default to input if not specified | |
| verbose: false, | |
| }; | |
| for (const arg of args) { | |
| if (arg.startsWith('--input=')) { | |
| config.input = arg.substring('--input='.length); | |
| } else if (arg.startsWith('--output=')) { | |
| config.output = arg.substring('--output='.length); | |
| } else if (arg === '--verbose') { | |
| config.verbose = true; | |
| } else if (arg === '--help' || arg === '-h') { | |
| console.log(` | |
| π§ Markdown Post-Processor | |
| Usage: | |
| node post-processor.mjs [options] | |
| Options: | |
| --input=PATH Input Markdown file (default: output/main.md) | |
| --output=PATH Output file (default: overwrites input) | |
| --verbose Verbose output | |
| --help, -h Show this help | |
| Examples: | |
| # Process main.md in-place | |
| node post-processor.mjs | |
| # Process with custom paths | |
| node post-processor.mjs --input=raw.md --output=clean.md | |
| `); | |
| process.exit(0); | |
| } | |
| } | |
| // Default output to input if not specified | |
| if (!config.output) { | |
| config.output = config.input; | |
| } | |
| return config; | |
| } | |
/**
 * CLI entry point: read the input file, post-process it and write the result.
 *
 * Any error is reported on stderr and terminates the process with exit code 1.
 * With `--verbose`, the line counts before and after processing are printed.
 */
function main() {
  const { input, output, verbose } = parseArgs();

  console.log('π§ Markdown Post-Processor');
  console.log(`π Input: ${input}`);
  console.log(`π Output: ${output}`);

  try {
    const rawMarkdown = readFileSync(input, 'utf8');
    const cleanedMarkdown = postProcessMarkdown(rawMarkdown);
    writeFileSync(output, cleanedMarkdown);
    console.log(`β Post-processing completed: ${output}`);

    // Show stats if verbose
    if (!verbose) {
      return;
    }
    const countLines = (text) => text.split('\n').length;
    console.log(`π Lines: ${countLines(rawMarkdown)} β ${countLines(cleanedMarkdown)}`);
  } catch (error) {
    console.error('β Post-processing failed:');
    console.error(error.message);
    process.exit(1);
  }
}
// Run CLI if called directly.
// Compare file-system paths instead of hand-building a `file://` string:
// `import.meta.url` is percent-encoded (a space becomes `%20`) and uses
// `file:///C:/...` on Windows, so the string comparison silently fails for
// such paths and the CLI would never start.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  main();
}