import { readFileSync, readdirSync, statSync } from 'fs';
import { join, relative, dirname } from 'path';
import { fileURLToPath } from 'url';

/**
 * Extract HtmlEmbed, Image components and tables from MDX/Markdown content.
 * Simple utility to find visual elements and their props.
 *
 * NOTE(review): this file was recovered from a copy in which every
 * `<...>`-shaped span had been stripped. The tag patterns (`<HtmlEmbed`,
 * `<Image`, `<Wide>`) and the HTML emitted by markdownToHtml were rebuilt from
 * the surrounding comments and code; confirm them against the upstream file.
 */

/** Literal that opens an embed tag; the tag scanner starts right after it. */
const EMBED_TAG_OPEN = '<HtmlEmbed';

/** Sub-directory name that the loaders treat as demo content. */
const DEMO_DIR_NAME = 'demo';

/**
 * Simple Markdown to HTML converter for table cells.
 * Handles: links, bold, italic, inline code, strikethrough, checkboxes.
 *
 * @param {string} md - Markdown source of one table cell.
 * @returns {string} HTML string; '' for empty/nullish input.
 */
function markdownToHtml(md) {
  if (!md) return '';

  let html = md;

  // Escape HTML entities first, unless the cell already contains an HTML tag
  // (already-converted content must not be escaped twice).
  // NOTE(review): the original "looks like HTML" test was lost; a generic
  // tag test is used here.
  if (!/<\/?[a-z][^>]*>/i.test(html)) {
    html = html
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  // Links: [text](url)
  // NOTE(review): any extra anchor attributes of the original are unknown.
  html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2">$1</a>');

  // Bold: **text** or __text__
  html = html.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
  html = html.replace(/__([^_]+)__/g, '<strong>$1</strong>');

  // Italic: *text* or _text_ (but not inside words)
  html = html.replace(/(?<![*\w])\*([^*]+)\*(?![*\w])/g, '<em>$1</em>');
  html = html.replace(/(?<![_\w])_([^_]+)_(?![_\w])/g, '<em>$1</em>');

  // Inline code: `code`
  html = html.replace(/`([^`]+)`/g, '<code>$1</code>');

  // Strikethrough: ~~text~~
  html = html.replace(/~~([^~]+)~~/g, '<del>$1</del>');

  // Checkboxes (common in tables)
  html = html.replace(/\[x\]/gi, '✅');
  html = html.replace(/\[ \]/g, '❌');

  return html;
}

/**
 * Helper to extract a string attribute from tag content (multiline values
 * supported). Tries, in order: attr={`...`}, attr='...', attr="...".
 *
 * @param {string} attrName - Attribute name (matched case-insensitively).
 * @param {string} tagContent - Full text of the tag.
 * @returns {string|undefined} Trimmed value, or undefined when absent.
 */
function extractAttrFromTag(attrName, tagContent) {
  // The lookbehind stops a longer name from satisfying a shorter one
  // (e.g. `gridId="x"` must not be read as `id="x"`).
  const prefix = `(?<![\\w-])${attrName}\\s*=\\s*`;
  const valuePatterns = [
    '\\{`([\\s\\S]*?)`\\}', // JSX template string
    "'([\\s\\S]*?)'", // single quotes
    '"([\\s\\S]*?)"', // double quotes
  ];

  for (const valuePattern of valuePatterns) {
    const found = tagContent.match(new RegExp(prefix + valuePattern, 'i'));
    if (found) return found[1].trim();
  }
  return undefined;
}

/**
 * Locate every <Wide>...</Wide> block.
 *
 * @param {string} content - MDX source.
 * @returns {{start: number, end: number}[]} Half-open index ranges.
 */
function findWideBlocks(content) {
  const widePattern = /<Wide\b[^>]*>[\s\S]*?<\/Wide>/gi;
  return Array.from(content.matchAll(widePattern), (found) => ({
    start: found.index,
    end: found.index + found[0].length,
  }));
}

/**
 * Check if position is inside a Wide component.
 *
 * @param {string} content - MDX source.
 * @param {number} position - Character index into content.
 * @returns {boolean}
 */
function isPositionInsideWide(content, position) {
  return findWideBlocks(content).some(
    (block) => position >= block.start && position < block.end,
  );
}

/**
 * Read one embed tag starting at tagStart, up to and including the first `/>`
 * that is outside every string literal and outside every `{...}` expression.
 *
 * @param {string} content - MDX source.
 * @param {number} tagStart - Index of the `<` that opens the tag.
 * @returns {string} The tag text; runs to the end of content when no closing
 *   `/>` is found.
 */
function scanEmbedTag(content, tagStart) {
  let stringDelim = null;
  let braceDepth = 0;

  for (let pos = tagStart + EMBED_TAG_OPEN.length; pos < content.length; pos++) {
    const char = content[pos];
    const isEscaped = content[pos - 1] === '\\';

    if (stringDelim !== null) {
      // Inside a string only the matching, unescaped delimiter matters.
      if (char === stringDelim && !isEscaped) stringDelim = null;
      continue;
    }
    if ((char === '`' || char === '"' || char === "'") && !isEscaped) {
      stringDelim = char;
      continue;
    }

    // Track JSX expression braces (config={{...}}, data={{...}}, etc.)
    if (char === '{') {
      braceDepth++;
    } else if (char === '}') {
      braceDepth--;
    } else if (char === '/' && braceDepth === 0 && content[pos + 1] === '>') {
      return content.substring(tagStart, pos + 2);
    }
  }
  return content.substring(tagStart);
}

/**
 * Recovery pass for a tag that contains `config={{` but no `}}`: re-scan from
 * the config object, skipping string literals (and `${...}` inside template
 * strings), and end the tag at the `/>` that follows the balanced close of
 * `{{`. If that fails, fall back to the first `}} />` after the tag.
 *
 * @param {string} content - MDX source.
 * @param {number} tagStart - Index where the tag starts in content.
 * @param {string} tag - Tag text found by the primary scan.
 * @returns {string} Possibly extended tag text (tag itself when nothing applies).
 */
function extendTagPastConfig(content, tagStart, tag) {
  const configMarker = 'config={{';
  const configStart = tag.indexOf(configMarker);
  if (configStart < 0 || tag.includes('}}')) return tag;

  let braceCount = 2; // We're inside {{
  let pos = tagStart + configStart + configMarker.length;

  while (pos < content.length) {
    const char = content[pos];

    if ((char === '"' || char === "'" || char === '`') && content[pos - 1] !== '\\') {
      // Skip the entire string so braces inside it are not counted.
      const stringDelim = char;
      pos++;
      while (pos < content.length) {
        if (content[pos] === stringDelim && content[pos - 1] !== '\\') break;
        if (stringDelim === '`' && content[pos] === '$' && content[pos + 1] === '{') {
          // Skip ${...} without counting its braces.
          pos += 2;
          let innerBraces = 1;
          while (pos < content.length && innerBraces > 0) {
            if (content[pos] === '{') innerBraces++;
            if (content[pos] === '}') innerBraces--;
            pos++;
          }
          continue;
        }
        pos++;
      }
      pos++; // Skip the closing quote
      continue;
    }

    if (char === '{') braceCount++;
    if (char === '}') {
      braceCount--;
      if (braceCount === 0) {
        // Found the closing }} - now look for /> after optional whitespace.
        pos++;
        while (pos < content.length && /\s/.test(content[pos])) pos++;
        if (content[pos] === '/' && content[pos + 1] === '>') {
          return content.substring(tagStart, pos + 2);
        }
      }
    }
    pos++;
  }

  // Fallback: try to find a `}} />` pattern after the tag.
  const tagEnd = tagStart + tag.length;
  const endPattern = content.substring(tagEnd).match(/\}\}\s*\/>/);
  if (endPattern) {
    return content.substring(tagStart, tagEnd + endPattern.index + endPattern[0].length);
  }
  return tag;
}

/**
 * Find every HtmlEmbed tag in content.
 *
 * @param {string} content - MDX source.
 * @returns {{position: number, tag: string}[]} Tags in order of appearance.
 */
function findEmbedTags(content) {
  // NOTE(review): the original pattern was lost; it is known only to start
  // with `<HtmlEmbed` (the scan offset of 10 matches that literal's length).
  const embedPattern = /<HtmlEmbed\b/gi;
  return Array.from(content.matchAll(embedPattern), (found) => ({
    position: found.index,
    tag: extendTagPastConfig(content, found.index, scanEmbedTag(content, found.index)),
  }));
}

/**
 * Reduce a tag to its attribute skeleton: string values become `""` and brace
 * expressions become `{}` (or `{false}` when the expression is exactly
 * `false`). Used so boolean-prop detection cannot match words that merely
 * appear inside attribute values.
 *
 * @param {string} tag - Full tag text.
 * @returns {string} Tag text without attribute values.
 */
function stripAttributeValues(tag) {
  let bare = '';
  let stringDelim = null;
  let braceDepth = 0;
  let braceStart = -1;

  for (let i = 0; i < tag.length; i++) {
    const char = tag[i];
    const isEscaped = tag[i - 1] === '\\';

    if (stringDelim !== null) {
      if (char === stringDelim && !isEscaped) {
        stringDelim = null;
        if (braceDepth === 0) bare += '""';
      }
      continue;
    }
    if ((char === '`' || char === '"' || char === "'") && !isEscaped) {
      stringDelim = char;
      continue;
    }
    if (char === '{') {
      if (braceDepth === 0) braceStart = i;
      braceDepth++;
      continue;
    }
    if (char === '}') {
      if (braceDepth > 0) {
        braceDepth--;
        if (braceDepth === 0) {
          const expression = tag.slice(braceStart + 1, i).trim();
          bare += expression === 'false' ? '{false}' : '{}';
        }
      }
      continue;
    }
    if (braceDepth === 0) bare += char;
  }
  return bare;
}

/**
 * Test whether a boolean prop is present on a tag skeleton.
 * `name`, `name={true}` and `name="..."` count as set; `name={false}` does not.
 *
 * @param {string} bareTag - Output of stripAttributeValues.
 * @param {string} propName - Prop name (matched case-insensitively).
 * @returns {boolean}
 */
function hasBooleanProp(bareTag, propName) {
  const propPattern = new RegExp(
    `(?<![\\w-])${propName}(?![\\w-])(?!\\s*=\\s*\\{false\\})`,
    'i',
  );
  return propPattern.test(bareTag);
}

/**
 * Return the text between `{{` and its balanced closing brace, ignoring braces
 * that sit inside string literals.
 *
 * @param {string} tag - Full tag text.
 * @param {number} startPos - Index just after the `{{`.
 * @returns {string|null} Trimmed object body, or null when braces never balance.
 */
function sliceJsxObjectBody(tag, startPos) {
  let braceCount = 1; // Start at 1 because we're inside the object's `{`
  let stringDelim = null;

  for (let pos = startPos; pos < tag.length; pos++) {
    const char = tag[pos];
    const isEscaped = tag[pos - 1] === '\\';

    if (stringDelim !== null) {
      // Everything inside a string (including `${...}` in template strings)
      // is skipped, so it cannot unbalance the count.
      if (char === stringDelim && !isEscaped) stringDelim = null;
      continue;
    }
    if (char === '`' || ((char === '"' || char === "'") && !isEscaped)) {
      stringDelim = char;
      continue;
    }
    if (char === '{') {
      braceCount++;
    } else if (char === '}') {
      braceCount--;
      if (braceCount === 0) return tag.substring(startPos, pos).trim();
    }
  }
  return null;
}

/**
 * Turn the body of a JSX object literal into a value.
 * First evaluates it as JavaScript; if that throws, rewrites it into JSON
 * (quote keys, single -> double quotes, drop trailing commas) and parses that.
 *
 * @param {string} objectBody - Text between the outer `{` and `}`.
 * @returns {object|null} Parsed object, or null when both approaches fail
 *   (a warning is logged in that case).
 */
function evaluateObjectLiteral(objectBody) {
  try {
    // SECURITY: this executes text taken from the MDX files. That is only
    // acceptable because the content is the project's own source, processed
    // at build time. Never feed untrusted content through this path.
    return new Function(`return ({${objectBody}})`)();
  } catch {
    let jsonStr = `{${objectBody}}`;

    // Quote unquoted keys
    for (let pass = 0; pass < 5; pass++) {
      jsonStr = jsonStr.replace(/([{,\[\s])([a-zA-Z_$][a-zA-Z0-9_$]*)\s*:/g, '$1"$2":');
      jsonStr = jsonStr.replace(/^([a-zA-Z_$][a-zA-Z0-9_$]*)\s*:/gm, '"$1":');
    }
    // Replace single quotes with double quotes
    jsonStr = jsonStr.replace(/'/g, '"');
    // Remove trailing commas
    jsonStr = jsonStr.replace(/,\s*([}\]])/g, '$1');

    try {
      return JSON.parse(jsonStr);
    } catch (jsonError) {
      // Both methods failed - log a warning but don't throw.
      console.warn('[extract-embeds] Config parsing failed:', jsonError.message);
      return null;
    }
  }
}

/**
 * Read the `config` prop of an embed tag.
 * Supports the JSX object form `config={{ ... }}` and, as a fallback, a string
 * form (`config="..."`, `config='...'`, config={`...`}) which is parsed as
 * JSON when possible and otherwise returned as the raw string.
 *
 * @param {string} tag - Full tag text.
 * @returns {object|string|null} Parsed config, or null when absent/unparseable.
 */
function parseEmbedConfig(tag) {
  let config = null;

  const jsxConfigMatch = tag.match(/(?<![\w-])config\s*=\s*\{\{/i);
  if (jsxConfigMatch) {
    const bodyStart = tag.indexOf('{{', jsxConfigMatch.index) + 2;
    const objectBody = sliceJsxObjectBody(tag, bodyStart);
    if (objectBody !== null) config = evaluateObjectLiteral(objectBody);
  }

  if (!config) {
    const configAttr = extractAttrFromTag('config', tag);
    if (configAttr) {
      try {
        config = JSON.parse(configAttr);
      } catch {
        // Keep as string if not valid JSON
        config = configAttr;
      }
    }
  }
  return config;
}

/**
 * Parse all props of one embed tag.
 *
 * @param {string} tag - Full tag text.
 * @returns {object|null} Props, or null when the required `src` is missing.
 */
function parseEmbedProps(tag) {
  const src = extractAttrFromTag('src', tag);
  if (!src) return null;

  const bareTag = stripAttributeValues(tag);
  return {
    src,
    title: extractAttrFromTag('title', tag),
    desc: extractAttrFromTag('desc', tag),
    id: extractAttrFromTag('id', tag),
    data: extractAttrFromTag('data', tag),
    frameless: hasBooleanProp(bareTag, 'frameless'),
    wideAttr: hasBooleanProp(bareTag, 'wide'),
    skipGallery: hasBooleanProp(bareTag, 'skipGallery'),
    config: parseEmbedConfig(tag),
  };
}

/**
 * Find every self-closing Image tag whose `src` is a JSX expression.
 *
 * @param {string} content - MDX source.
 * @returns {{position: number, src: string, alt: string, caption: (string|null), id: (string|null)}[]}
 */
function findImages(content) {
  const imagePattern = /<Image\b[^>]*\/>/gi;
  const images = [];

  for (const found of content.matchAll(imagePattern)) {
    const tag = found[0];
    // src is a variable reference like {myImage}
    const srcMatch = tag.match(/src\s*=\s*\{([^}]+)\}/i);
    if (!srcMatch) continue;

    images.push({
      position: found.index,
      src: srcMatch[1].trim(),
      // `||` on purpose: an empty alt/caption/id falls back like a missing one.
      alt: extractAttrFromTag('alt', tag) || 'Image',
      caption: extractAttrFromTag('caption', tag) || null,
      id: extractAttrFromTag('id', tag) || null,
    });
  }
  return images;
}

/**
 * Split one Markdown table row into HTML cells.
 * The outer pipes are removed and the row is split on unescaped pipes, so an
 * empty cell stays in place instead of shifting the columns after it.
 *
 * @param {string} row - One table line, e.g. `| a |  | c |`.
 * @returns {string[]} Cell contents converted with markdownToHtml.
 */
function splitTableRow(row) {
  const inner = row.trim().replace(/^\|/, '').replace(/(?<!\\)\|$/, '');
  return inner
    .split(/(?<!\\)\|/)
    .map((cell) => markdownToHtml(cell.trim().replace(/\\\|/g, '|')));
}

/**
 * Find Markdown tables (header row, separator row, at least one data row).
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} content - MDX/Markdown source.
 * @returns {{position: number, raw: string, headers: string[], rows: string[][]}[]}
 */
function findTables(content) {
  const tablePattern = /(\|[^\n]+\|\r?\n\|[-:\s|]+\|\r?\n(?:\|[^\n]+\|\r?\n?)+)/g;
  const tables = [];

  for (const found of content.matchAll(tablePattern)) {
    const raw = found[1].trim();
    const lines = raw.split(/\r?\n/).filter((line) => line.trim());
    if (lines.length < 3) continue;

    tables.push({
      position: found.index,
      raw,
      headers: splitTableRow(lines[0]),
      // Index 1 is the separator row.
      rows: lines.slice(2).map((line) => splitTableRow(line)),
    });
  }
  return tables;
}

/**
 * Extract Image components from MDX content.
 *
 * @param {string} content - MDX source.
 * @returns {{type: 'image', src: string, alt: string, caption: (string|null), id: (string|null)}[]}
 */
export function extractImages(content) {
  return findImages(content).map(({ src, alt, caption, id }) => ({
    type: 'image',
    src,
    alt,
    caption,
    id,
  }));
}

/**
 * Extract markdown tables from content.
 *
 * @param {string} content - MDX/Markdown source.
 * @returns {{type: 'table', id: string, headers: string[], rows: string[][], raw: string}[]}
 *   Tables in order of appearance; ids are `table-0`, `table-1`, ...
 */
export function extractTables(content) {
  return findTables(content).map(({ headers, rows, raw }, tableIndex) => ({
    type: 'table',
    id: `table-${tableIndex}`,
    headers,
    rows,
    raw,
  }));
}

/**
 * Extract HtmlEmbed components and their props from MDX content.
 * An embed is `wide` when it has the `wide` prop or sits inside a Wide block.
 * Tags without a `src` are skipped.
 *
 * @param {string} content - MDX source.
 * @returns {{src: string, title: (string|undefined), desc: (string|undefined),
 *   id: (string|undefined), frameless: boolean, data: (string|undefined),
 *   config: (object|string|null), wide: boolean, skipGallery: boolean}[]}
 */
export function extractHtmlEmbeds(content) {
  const wideBlocks = findWideBlocks(content);
  const isInsideWide = (index) =>
    wideBlocks.some((block) => index >= block.start && index < block.end);

  const embeds = [];
  for (const { position, tag } of findEmbedTags(content)) {
    const props = parseEmbedProps(tag);
    if (!props) continue;

    const { src, title, desc, id, frameless, data, config, wideAttr, skipGallery } = props;
    embeds.push({
      src,
      title,
      desc,
      id,
      frameless,
      data,
      config,
      wide: isInsideWide(position) || wideAttr,
      skipGallery,
    });
  }
  return embeds;
}

/**
 * Recursively find all MDX files in a directory.
 * Skips `demo` directories by default.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [baseDir=dir] - Root of the scan (passed through unchanged).
 * @param {string[]} [files=[]] - Accumulator; also the return value.
 * @param {boolean} [skipDemo=true] - Whether to skip directories named `demo`.
 * @returns {string[]} Paths of the `.mdx` files found.
 */
function findMdxFiles(dir, baseDir = dir, files = [], skipDemo = true) {
  for (const entry of readdirSync(dir)) {
    const fullPath = join(dir, entry);

    if (statSync(fullPath).isDirectory()) {
      if (skipDemo && entry === DEMO_DIR_NAME) continue;
      findMdxFiles(fullPath, baseDir, files, skipDemo);
    } else if (entry.endsWith('.mdx')) {
      files.push(fullPath);
    }
  }
  return files;
}

/**
 * Parse imports and chapter usage order from article.mdx.
 *
 * @param {string} articleContent - Source of article.mdx.
 * @param {string} contentDir - Directory the import paths are relative to.
 * @returns {string[]} Chapter file paths in order of first use, without repeats.
 */
function parseArticleChapters(articleContent, contentDir) {
  // Component name -> file path, for `import Name from "./chapters/..."`.
  const chapterMap = new Map();
  const importPattern = /import\s+(\w+)\s+from\s+["'](\.\/chapters\/[^"']+)["']/g;
  for (const [, componentName, importPath] of articleContent.matchAll(importPattern)) {
    chapterMap.set(componentName, join(contentDir, importPath));
  }

  // Chapter usage order: self-closing tags without props (e.g. <Introduction />).
  // A Set keeps first-insertion order and drops repeats.
  const chapterOrder = new Set();
  const usagePattern = /<(\w+)\s*\/>/g;
  for (const [, componentName] of articleContent.matchAll(usagePattern)) {
    const chapterPath = chapterMap.get(componentName);
    if (chapterPath !== undefined) chapterOrder.add(chapterPath);
  }
  return [...chapterOrder];
}

/**
 * Resolve the content directory relative to this module.
 * Tries `../content`, then `../../src/content`, then `../../../src/content`,
 * and returns the first that exists (the last candidate when none does).
 *
 * @returns {string} Path of the content directory.
 */
function resolveContentDir() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const candidates = ['../content', '../../src/content', '../../../src/content'].map(
    (relativeDir) => join(moduleDir, relativeDir),
  );
  const existing = candidates.find((dir) => statSync(dir, { throwIfNoEntry: false }));
  return existing ?? candidates[candidates.length - 1];
}

/**
 * Run an extractor on file content and tag each result with its source file.
 *
 * @param {string} fileContent - Source of the file.
 * @param {string} filePath - Path of the file.
 * @param {string} contentDir - Content root, used to build the relative path.
 * @param {(content: string) => object[]} extract - Extractor to run.
 * @returns {object[]} Items with `sourceFile` set to `content/<relative path>`.
 */
function extractWithSource(fileContent, filePath, contentDir, extract) {
  const sourceFile = `content/${relative(contentDir, filePath)}`;
  const items = extract(fileContent);
  for (const item of items) item.sourceFile = sourceFile;
  return items;
}

/**
 * Whether a file sits under a `demo` directory inside the content root.
 * Only the path below contentDir is inspected, so a checkout that itself
 * lives under some `demo` directory is unaffected.
 *
 * @param {string} contentDir - Content root.
 * @param {string} filePath - File to test.
 * @returns {boolean}
 */
function isDemoPath(contentDir, filePath) {
  return relative(contentDir, filePath).split(/[\\/]/).includes(DEMO_DIR_NAME);
}

/**
 * Remove embeds with a repeated `src`, keeping the FIRST occurrence.
 *
 * @param {{src: string}[]} embeds - Embeds in order of appearance.
 * @returns {{src: string}[]} Embeds with unique `src`, order preserved.
 */
function dedupeEmbedsBySrc(embeds) {
  // Map#set on an existing key replaces the value, so guard with has().
  const firstBySrc = new Map();
  for (const embed of embeds) {
    if (!firstBySrc.has(embed.src)) firstBySrc.set(embed.src, embed);
  }
  return [...firstBySrc.values()];
}

/**
 * Load and extract embeds from MDX content files, following article structure:
 * article.mdx first, then its chapters in usage order, then any other MDX file
 * (demo directories skipped). When article.mdx cannot be processed, every MDX
 * file under the content directory is scanned instead. Errors are logged, not
 * thrown.
 *
 * @returns {object[]} Embeds (see extractHtmlEmbeds) with `sourceFile` added,
 *   de-duplicated by `src`.
 */
export function loadEmbedsFromMDX() {
  const contentDir = resolveContentDir();
  const articleFile = join(contentDir, 'article.mdx');
  const allEmbeds = [];

  const collectFrom = (filePath, errorLabel) => {
    try {
      const fileContent = readFileSync(filePath, 'utf-8');
      allEmbeds.push(...extractWithSource(fileContent, filePath, contentDir, extractHtmlEmbeds));
    } catch (error) {
      console.error(`${errorLabel} ${filePath}:`, error);
    }
  };

  try {
    const articleContent = readFileSync(articleFile, 'utf-8');
    allEmbeds.push(
      ...extractWithSource(articleContent, articleFile, contentDir, extractHtmlEmbeds),
    );

    const chapterOrder = parseArticleChapters(articleContent, contentDir);
    for (const chapterPath of chapterOrder) {
      collectFrom(chapterPath, 'Error reading chapter');
    }

    // Also include any other MDX files not in chapters (for completeness).
    const processedFiles = new Set([articleFile, ...chapterOrder]);
    for (const filePath of findMdxFiles(contentDir, contentDir, [], true)) {
      if (!processedFiles.has(filePath)) collectFrom(filePath, 'Error reading');
    }
  } catch (error) {
    console.error('Error processing article:', error);

    // Fallback: scan every MDX file. The scan itself is guarded so a missing
    // content directory yields an empty result instead of an exception.
    let mdxFiles = [];
    try {
      mdxFiles = findMdxFiles(contentDir, contentDir, [], true);
    } catch (scanError) {
      console.error(`Error scanning ${contentDir}:`, scanError);
    }
    for (const filePath of mdxFiles) {
      collectFrom(filePath, 'Error reading');
    }
  }

  return dedupeEmbedsBySrc(allEmbeds);
}

/**
 * Extract all visual elements from content with their position.
 *
 * @param {string} content - MDX source.
 * @returns {object[]} Items of type 'embed', 'image' or 'table', each with a
 *   `position` (character index), sorted by position (order of appearance).
 */
function extractAllVisualsWithPosition(content) {
  const visuals = [];

  // HtmlEmbeds with position and ALL props
  const wideBlocks = findWideBlocks(content);
  for (const { position, tag } of findEmbedTags(content)) {
    const props = parseEmbedProps(tag);
    if (!props) continue;

    const { src, title, desc, id, data, frameless, config, wideAttr, skipGallery } = props;
    const isInsideWide = wideBlocks.some(
      (block) => position >= block.start && position < block.end,
    );
    visuals.push({
      type: 'embed',
      position,
      src,
      title,
      desc,
      id,
      data,
      frameless,
      config,
      wide: isInsideWide || wideAttr,
      skipGallery,
    });
  }

  // Images with position
  for (const { position, src, alt, caption } of findImages(content)) {
    visuals.push({ type: 'image', position, src, alt, caption });
  }

  // Tables with position
  findTables(content).forEach(({ position, headers, rows }, tableIndex) => {
    visuals.push({ type: 'table', position, id: `table-${tableIndex}`, headers, rows });
  });

  // Sort by position (order of appearance)
  visuals.sort((a, b) => a.position - b.position);
  return visuals;
}

/**
 * Load all visual elements (embeds, images, tables) from MDX content files:
 * article.mdx, then its chapters in usage order, then any other MDX file.
 * Files under a `demo` directory of the content root are skipped. All
 * occurrences are kept (no de-duplication). Errors are logged, not thrown.
 *
 * @returns {object[]} Visuals in order of appearance, each with `sourceFile`.
 */
export function loadAllVisualsFromMDX() {
  const contentDir = resolveContentDir();
  const articleFile = join(contentDir, 'article.mdx');
  const allVisuals = [];

  const collectFrom = (filePath, errorLabel) => {
    try {
      const fileContent = readFileSync(filePath, 'utf-8');
      allVisuals.push(
        ...extractWithSource(fileContent, filePath, contentDir, extractAllVisualsWithPosition),
      );
    } catch (error) {
      console.error(`${errorLabel} ${filePath}:`, error);
    }
  };

  try {
    const articleContent = readFileSync(articleFile, 'utf-8');
    allVisuals.push(
      ...extractWithSource(articleContent, articleFile, contentDir, extractAllVisualsWithPosition),
    );

    const chapterOrder = parseArticleChapters(articleContent, contentDir);
    for (const chapterPath of chapterOrder) {
      if (isDemoPath(contentDir, chapterPath)) continue;
      collectFrom(chapterPath, 'Error reading chapter');
    }

    // Process other MDX files (not in demo)
    const processedFiles = new Set([articleFile, ...chapterOrder]);
    for (const filePath of findMdxFiles(contentDir, contentDir, [], true)) {
      if (processedFiles.has(filePath) || isDemoPath(contentDir, filePath)) continue;
      collectFrom(filePath, 'Error reading');
    }
  } catch (error) {
    console.error('Error processing article:', error);
  }

  return allVisuals;
}