tfrere's picture
tfrere HF Staff
fix: table parser handles pipes inside cells (union types, etc.)
7b4b8f5
import { readFileSync, readdirSync, statSync } from 'fs';
import { join, relative, dirname } from 'path';
import { fileURLToPath } from 'url';
import { createHash } from 'crypto';
/**
* Extract HtmlEmbed, Image components and tables from MDX/Markdown content
* Simple utility to find visual elements and their props
*/
/**
* Parse image import statements from MDX content.
* Returns a Map of variable name → filename (just the basename).
* e.g. import placeholder from '../../assets/image/placeholder.png'
* → Map { 'placeholder' => 'placeholder.png' }
*/
function parseImageImports(content) {
const importMap = new Map();
const importPattern = /import\s+(\w+)\s+from\s+["']([^"']+)["']/g;
let match;
while ((match = importPattern.exec(content)) !== null) {
const varName = match[1];
const importPath = match[2];
// Extract just the filename from the path
const filename = importPath.split('/').pop();
if (filename && /\.(png|jpe?g|gif|webp|svg)$/i.test(filename)) {
importMap.set(varName, filename);
}
}
return importMap;
}
/**
* Strip fenced code blocks (``` ... ```) from content.
* Replaces code block content with whitespace of equal length
* to preserve character positions for downstream extraction.
*/
function stripCodeBlocks(content) {
return content.replace(/```[\s\S]*?```/g, (match) => ' '.repeat(match.length));
}
/**
* Simple Markdown to HTML converter for table cells
* Handles: links, bold, italic, code, strikethrough
*/
function markdownToHtml(md) {
if (!md) return '';
let html = md;
// Escape HTML entities first (but not for already-converted content)
// Skip if it already looks like HTML
if (!html.includes('<a ') && !html.includes('<strong>')) {
html = html
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;');
}
// Links: [text](url)
html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank" rel="noopener">$1</a>');
// Bold: **text** or __text__
html = html.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
html = html.replace(/__([^_]+)__/g, '<strong>$1</strong>');
// Italic: *text* or _text_ (but not inside words)
html = html.replace(/(?<![*_])\*([^*]+)\*(?![*_])/g, '<em>$1</em>');
html = html.replace(/(?<![*_])_([^_]+)_(?![*_])/g, '<em>$1</em>');
// Inline code: `code`
html = html.replace(/`([^`]+)`/g, '<code>$1</code>');
// Strikethrough: ~~text~~
html = html.replace(/~~([^~]+)~~/g, '<del>$1</del>');
// Checkboxes (common in tables)
html = html.replace(/\[x\]/gi, '✅');
html = html.replace(/\[ \]/g, '❌');
return html;
}
/**
* Extract Image components from MDX content
*/
export function extractImages(content) {
const images = [];
// Match <Image ... /> components
const imagePattern = /<Image[^>]*\/>/gi;
let match;
while ((match = imagePattern.exec(content)) !== null) {
const tag = match[0];
// Extract src attribute (variable reference like {myImage})
const srcMatch = tag.match(/src\s*=\s*\{([^}]+)\}/i);
const src = srcMatch ? srcMatch[1].trim() : null;
// Extract alt
const altMatch = tag.match(/alt\s*=\s*["']([^"']+)["']/i);
const alt = altMatch ? altMatch[1] : 'Image';
// Extract caption
const captionMatch = tag.match(/caption\s*=\s*["']([^"']+)["']/i) ||
tag.match(/caption\s*=\s*\{`([^`]+)`\}/i);
const caption = captionMatch ? captionMatch[1] : null;
// Extract id
const idMatch = tag.match(/id\s*=\s*["']([^"']+)["']/i);
const id = idMatch ? idMatch[1] : null;
// Extract skipGallery
const skipGallery = /\bskipGallery\b/i.test(tag);
if (src) {
images.push({
type: 'image',
src,
alt,
caption,
id,
skipGallery
});
}
}
return images;
}
/**
* Split a markdown table row on pipe characters, respecting backtick spans.
* Pipes inside `inline code` are treated as literal text, not separators.
* Returns the array of trimmed, non-empty cell strings.
*/
function splitTableRow(row) {
const cells = [];
let current = '';
let inBacktick = false;
for (let i = 0; i < row.length; i++) {
const ch = row[i];
if (ch === '`') {
inBacktick = !inBacktick;
current += ch;
} else if (ch === '|' && !inBacktick) {
cells.push(current);
current = '';
} else {
current += ch;
}
}
cells.push(current);
return cells;
}
/**
* Parse a markdown table row into exactly `expectedCols` cells.
* First splits respecting backticks (splitTableRow), then if the row
* has too many cells (e.g. unescaped | in values), merges overflow
* cells back together to match the expected column count.
*/
function parseTableRow(row, expectedCols) {
let cells = splitTableRow(row).filter(c => c.trim());
if (cells.length <= expectedCols) return cells;
// Too many cells: merge overflow into the last "overflowing" column.
// Strategy: keep first (expectedCols - 1) cells, merge the rest into one,
// then take the last cell separately (it's usually the description).
// This handles: | val | No | 'a' | 'b' | 'c' | Description |
// → [val, No, 'a' | 'b' | 'c', Description]
const head = cells.slice(0, expectedCols - 1);
const tail = cells.slice(expectedCols - 1);
// The last element is the final column; everything in between is the overflowing column
if (tail.length > 1) {
const lastCell = tail.pop();
const merged = tail.join(' | ');
return [...head, merged, lastCell].slice(0, expectedCols);
}
return cells.slice(0, expectedCols);
}
/**
* Extract markdown tables from content
*/
export function extractTables(content) {
const tables = [];
// Match markdown tables (lines starting with |)
// A table has at least a header row, separator row, and one data row
const tablePattern = /(\|[^\n]+\|\n\|[-:\s|]+\|\n(?:\|[^\n]+\|\n?)+)/g;
let match;
let tableIndex = 0;
while ((match = tablePattern.exec(content)) !== null) {
const tableContent = match[1].trim();
const rows = tableContent.split('\n').filter(row => row.trim());
if (rows.length >= 3) {
// Parse header
const headerRow = rows[0];
const headers = splitTableRow(headerRow)
.filter(cell => cell.trim())
.map(cell => markdownToHtml(cell.trim()));
const expectedCols = headers.length;
// Parse data rows (skip separator at index 1)
// Use parseTableRow to handle overflow pipes (e.g. union types)
const dataRows = rows.slice(2).map(row => {
return parseTableRow(row, expectedCols)
.map(cell => markdownToHtml(cell.trim()));
});
tables.push({
type: 'table',
id: `table-${tableIndex++}`,
headers,
rows: dataRows,
raw: tableContent
});
}
}
return tables;
}
export function extractHtmlEmbeds(rawContent) {
const embeds = [];
// Strip code blocks to avoid extracting components from code examples
const content = stripCodeBlocks(rawContent);
// First, find all Wide components and mark their content
// Pattern to match <Wide>...</Wide> blocks
const widePattern = /<Wide[\s\S]*?>([\s\S]*?)<\/Wide>/gi;
const wideBlocks = [];
let wideMatch;
while ((wideMatch = widePattern.exec(content)) !== null) {
wideBlocks.push({
start: wideMatch.index,
end: wideMatch.index + wideMatch[0].length,
content: wideMatch[0]
});
}
// Helper to check if an embed is inside a Wide block
const isInsideWide = (embedStartIndex) => {
return wideBlocks.some(block =>
embedStartIndex >= block.start && embedStartIndex < block.end
);
};
// Pattern to match HtmlEmbed opening tags
const embedPattern = /<HtmlEmbed/gi;
let embedMatch;
while ((embedMatch = embedPattern.exec(content)) !== null) {
const matchIndex = embedMatch.index;
// Manually find the closing /> while respecting string boundaries
let pos = matchIndex + 10; // After "<HtmlEmbed"
let match = '<HtmlEmbed';
let inString = false;
let stringDelim = null;
let inJSXBraces = 0; // Track depth in JSX expressions like config={{...}}
while (pos < content.length) {
const char = content[pos];
const prevChar = pos > 0 ? content[pos - 1] : '';
match += char;
// Track string boundaries (template strings, single, double quotes)
if (!inString) {
if ((char === '`' || char === '"' || char === "'") && prevChar !== '\\') {
inString = true;
stringDelim = char;
}
} else {
if (char === stringDelim && prevChar !== '\\') {
inString = false;
stringDelim = null;
}
}
// Track JSX expression braces (for config={{...}}, data={{...}}, etc.)
if (!inString) {
if (char === '{') {
inJSXBraces++;
} else if (char === '}') {
inJSXBraces--;
}
}
// Check for closing /> - only valid if not in string AND all JSX braces are closed
if (!inString && inJSXBraces === 0 && char === '/' && pos + 1 < content.length && content[pos + 1] === '>') {
match += '>';
break;
}
pos++;
}
// If config={{ is present, we need to find the real closing after }}
// Check if config={{ is there but the match doesn't include the full config (doesn't end with }})
if (match.includes('config={{') && !match.includes('}}')) {
// The match was cut off at the first /> it found (probably in desc)
// We need to find the real closing after }}
// Find where config={{ starts
const configStart = match.indexOf('config={{');
if (configStart >= 0) {
// Look for the matching }} after this, starting from the content
let braceCount = 2; // We're inside {{
let pos = matchIndex + configStart + 9; // After "config={{"
let foundEnd = false;
while (pos < content.length) {
const char = content[pos];
const prevChar = pos > 0 ? content[pos - 1] : '';
// Track strings to avoid counting braces inside strings
if ((char === '"' || char === "'" || char === '`') && prevChar !== '\\') {
// We hit a string delimiter - skip the entire string
const stringDelim = char;
pos++;
while (pos < content.length) {
if (content[pos] === stringDelim && content[pos - 1] !== '\\') {
break;
}
// Handle template string ${...}
if (stringDelim === '`' && content[pos] === '$' && pos + 1 < content.length && content[pos + 1] === '{') {
// Skip ${...} without counting
pos += 2;
let innerBraces = 1;
while (pos < content.length && innerBraces > 0) {
if (content[pos] === '{') innerBraces++;
if (content[pos] === '}') innerBraces--;
pos++;
}
continue;
}
pos++;
}
pos++; // Skip the closing quote
continue;
}
if (char === '{') braceCount++;
if (char === '}') {
braceCount--;
if (braceCount === 0) {
// Found the closing }}
// Now look for /> after optional whitespace
pos++;
while (pos < content.length && /\s/.test(content[pos])) {
pos++;
}
if (pos < content.length && content[pos] === '/' && pos + 1 < content.length && content[pos + 1] === '>') {
// Found the real closing
match = content.substring(matchIndex, pos + 2);
foundEnd = true;
break;
}
}
}
pos++;
}
if (!foundEnd) {
// Fallback: try to find }} /> pattern after match
const after = content.substring(matchIndex + match.length);
const endPattern = after.match(/\}\}\s*\/>/);
if (endPattern) {
match = content.substring(matchIndex, matchIndex + match.length + endPattern.index + endPattern[0].length);
}
}
}
}
// Helper function to extract attribute value supporting multiline
const extractAttr = (attrName, content) => {
// Try JSX template strings first: desc={`...`}
const templateMatch = content.match(new RegExp(`${attrName}\\s*=\\s*\\{\`([\\s\\S]*?)\`\\}`, 'i'));
if (templateMatch) return templateMatch[1].trim();
// Try single quotes: desc='...'
const singleQuoteMatch = content.match(new RegExp(`${attrName}\\s*=\\s*'([\\s\\S]*?)'`, 'i'));
if (singleQuoteMatch) return singleQuoteMatch[1].trim();
// Try double quotes: desc="..."
const doubleQuoteMatch = content.match(new RegExp(`${attrName}\\s*=\\s*"([\\s\\S]*?)"`, 'i'));
if (doubleQuoteMatch) return doubleQuoteMatch[1].trim();
return undefined;
};
// Extract src attribute (required)
const src = extractAttr('src', match);
if (!src) continue;
// Extract optional attributes
const title = extractAttr('title', match);
const desc = extractAttr('desc', match);
const id = extractAttr('id', match);
const data = extractAttr('data', match);
const frameless = /\bframeless\b/i.test(match);
const wideAttr = /\bwide\b/i.test(match);
const skipGallery = /\bskipGallery\b/i.test(match);
// Extract config attribute - JSX object format: config={{ ... }}
let config = null;
// Pattern to match config={{ ... }} with balanced braces
const jsxConfigRegex = /config\s*=\s*\{\{/i;
const jsxConfigMatch = match.match(jsxConfigRegex);
if (jsxConfigMatch) {
try {
// Find the start position after config={{
const configStart = jsxConfigMatch.index;
const startPos = match.indexOf('{{', configStart) + 2;
// Find matching closing braces with better handling
let braceCount = 1; // Start at 1 because we're inside {{
let inString = false;
let stringChar = null;
let pos = startPos;
for (; pos < match.length; pos++) {
const char = match[pos];
const prevChar = pos > 0 ? match[pos - 1] : '';
const nextChar = pos < match.length - 1 ? match[pos + 1] : '';
// Handle string literals - check for template strings too
if (!inString) {
if (char === '`') {
inString = true;
stringChar = '`';
} else if (char === '"' && prevChar !== '\\') {
inString = true;
stringChar = '"';
} else if (char === "'" && prevChar !== '\\') {
inString = true;
stringChar = "'";
}
} else {
// Check for end of string
if (char === stringChar && prevChar !== '\\') {
inString = false;
stringChar = null;
}
// Template strings can contain ${...} - handle that
if (stringChar === '`' && char === '$' && nextChar === '{') {
// Skip the ${ but don't count it as a brace yet
pos++; // Skip $
braceCount++; // Count the { we're about to see
continue;
}
}
if (!inString) {
if (char === '{') {
braceCount++;
} else if (char === '}') {
braceCount--;
if (braceCount === 0) {
// Found matching closing }}
break;
}
}
}
}
if (braceCount !== 0) {
throw new Error(`Unbalanced braces: braceCount=${braceCount}`);
}
// Extract the JSX object content
let jsxContent = match.substring(startPos, pos).trim();
// Instead of converting to JSON, evaluate the JavaScript object directly
// This is safer in a build context (not in browser)
try {
// Wrap in parentheses and braces to make it a valid expression
const jsCode = `({${jsxContent}})`;
// Use Function constructor to safely evaluate (no access to local scope)
// This is safe because we're in Node.js build time, not browser runtime
config = new Function('return ' + jsCode)();
} catch (evalError) {
// If eval fails, try the JSON approach as fallback
// Fallback: try JSON parsing
let jsonStr = jsxContent;
// Add braces around the content
jsonStr = '{' + jsonStr + '}';
// Quote unquoted keys
for (let pass = 0; pass < 5; pass++) {
jsonStr = jsonStr.replace(/([{,\[\s])([a-zA-Z_$][a-zA-Z0-9_$]*)\s*:/g, '$1"$2":');
jsonStr = jsonStr.replace(/^([a-zA-Z_$][a-zA-Z0-9_$]*)\s*:/gm, '"$1":');
}
// Replace single quotes with double quotes
jsonStr = jsonStr.replace(/'/g, '"');
// Remove trailing commas
jsonStr = jsonStr.replace(/,\s*([}\]])/g, '$1');
try {
config = JSON.parse(jsonStr);
} catch (jsonError) {
// Both methods failed - log warning but don't throw
console.warn('[extract-embeds] Config parsing failed:', jsonError.message);
}
}
} catch (e) {
// If parsing fails, keep config as null
// Component will handle missing config
}
}
// Fallback: try standard attribute extraction (for string-based config)
if (!config) {
const configAttr = extractAttr('config', match);
if (configAttr) {
try {
config = JSON.parse(configAttr);
} catch (e) {
// Keep as string if not valid JSON
config = configAttr;
}
}
}
// Check if this embed is inside a Wide component OR has wide prop
const isWide = isInsideWide(matchIndex) || wideAttr;
embeds.push({
src,
title,
desc,
id,
frameless,
data,
config,
wide: isWide,
skipGallery
});
}
return embeds;
}
/**
* Recursively find all MDX files in a directory
* Skips demo chapters by default to avoid missing embeds
*/
function findMdxFiles(dir, baseDir = dir, files = [], skipDemo = true) {
const entries = readdirSync(dir);
for (const entry of entries) {
const fullPath = join(dir, entry);
const stat = statSync(fullPath);
if (stat.isDirectory()) {
// Skip demo directory if skipDemo is true
if (skipDemo && entry === 'demo') {
continue;
}
findMdxFiles(fullPath, baseDir, files, skipDemo);
} else if (entry.endsWith('.mdx')) {
files.push(fullPath);
}
}
return files;
}
/**
* Parse imports and chapter usage order from article.mdx
*/
function parseArticleChapters(articleContent, contentDir) {
const chapterMap = new Map(); // Component name -> file path
const chapterOrder = []; // Ordered list of file paths
// Extract import statements
const importPattern = /import\s+(\w+)\s+from\s+["'](.\/chapters\/[^"']+)["']/g;
let match;
while ((match = importPattern.exec(articleContent)) !== null) {
const [, componentName, importPath] = match;
const fullPath = join(contentDir, importPath);
chapterMap.set(componentName, fullPath);
}
// Extract chapter usage order (e.g., <Introduction />)
const usagePattern = /<(\w+)\s*\/>/g;
while ((match = usagePattern.exec(articleContent)) !== null) {
const componentName = match[1];
if (chapterMap.has(componentName)) {
const chapterPath = chapterMap.get(componentName);
if (!chapterOrder.includes(chapterPath)) {
chapterOrder.push(chapterPath);
}
}
}
return chapterOrder;
}
/**
* Build a unique identity key for an embed.
*
* Strategy (in priority order):
* 1. `id` — if the author gave an explicit id, it's unique by convention.
* 2. `src` + deterministic hash of (config, data) — same template with
* different parameters produces different keys.
* 3. `src` alone — for embeds with no config/data (unique HTML file).
*
* This allows the same generic template (e.g. d3-line-chart.html) to appear
* multiple times when each instance carries a different config, while still
* deduplicating true duplicates (same src + same config that appear in both
* article.mdx and a chapter).
*/
function embedKey(embed) {
if (embed.id) return `id:${embed.id}`;
const hasConfig = embed.config != null;
const hasData = embed.data != null;
if (!hasConfig && !hasData) return `src:${embed.src}`;
// Deterministic hash of the variable parts
const payload = JSON.stringify({ config: embed.config ?? null, data: embed.data ?? null });
const hash = createHash('sha1').update(payload).digest('hex').slice(0, 10);
return `src:${embed.src}#${hash}`;
}
/**
* Load and extract embeds from MDX content files, following article structure
*/
export function loadEmbedsFromMDX() {
// Get absolute path to content directory
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Try to resolve content directory - works in both dev and build
let contentDir = join(__dirname, '../content');
if (!statSync(contentDir, { throwIfNoEntry: false })) {
contentDir = join(__dirname, '../../src/content');
}
if (!statSync(contentDir, { throwIfNoEntry: false })) {
contentDir = join(__dirname, '../../../src/content');
}
const allEmbeds = [];
const articleFile = join(contentDir, 'article.mdx');
try {
// Read main article file
const articleContent = readFileSync(articleFile, 'utf-8');
// Extract embeds from main article first
const articleEmbeds = extractHtmlEmbeds(articleContent);
articleEmbeds.forEach(embed => {
embed.sourceFile = 'content/article.mdx';
});
allEmbeds.push(...articleEmbeds);
// Parse chapter order from article
const chapterOrder = parseArticleChapters(articleContent, contentDir);
// Extract embeds from chapters in order
for (const chapterPath of chapterOrder) {
try {
const chapterContent = readFileSync(chapterPath, 'utf-8');
const embeds = extractHtmlEmbeds(chapterContent);
// Add source file info (relative path)
const relativePath = relative(contentDir, chapterPath);
embeds.forEach(embed => {
embed.sourceFile = `content/${relativePath}`;
});
allEmbeds.push(...embeds);
} catch (error) {
console.error(`Error reading chapter ${chapterPath}:`, error);
}
}
// Also include any other MDX files not in chapters (for completeness)
const allMdxFiles = findMdxFiles(contentDir, contentDir, [], false);
const processedFiles = new Set([articleFile, ...chapterOrder]);
for (const filePath of allMdxFiles) {
if (!processedFiles.has(filePath)) {
try {
const rawContent = readFileSync(filePath, 'utf-8');
const embeds = extractHtmlEmbeds(rawContent);
const relativePath = relative(contentDir, filePath);
embeds.forEach(embed => {
embed.sourceFile = `content/${relativePath}`;
});
allEmbeds.push(...embeds);
} catch (error) {
console.error(`Error reading ${filePath}:`, error);
}
}
}
} catch (error) {
console.error('Error processing article:', error);
// Fallback to old behavior if article.mdx can't be read
const mdxFiles = findMdxFiles(contentDir, contentDir, [], false);
for (const filePath of mdxFiles) {
try {
const rawContent = readFileSync(filePath, 'utf-8');
const embeds = extractHtmlEmbeds(rawContent);
const relativePath = relative(contentDir, filePath);
embeds.forEach(embed => {
embed.sourceFile = `content/${relativePath}`;
});
allEmbeds.push(...embeds);
} catch (err) {
console.error(`Error reading ${filePath}:`, err);
}
}
}
// Remove true duplicates (same identity) keeping first occurrence (= order of appearance).
// Identity = id (if set), or src + hash(config, data). This means the same generic
// template with different configs produces distinct entries.
const seen = new Map();
const uniqueEmbeds = [];
for (const embed of allEmbeds) {
const key = embedKey(embed);
if (!seen.has(key)) {
seen.set(key, true);
uniqueEmbeds.push(embed);
}
}
return uniqueEmbeds;
}
/**
* Helper to extract attribute from tag content
*/
function extractAttrFromTag(attrName, tagContent) {
// Try JSX template strings first: attr={`...`}
const templateMatch = tagContent.match(new RegExp(`${attrName}\\s*=\\s*\\{\`([\\s\\S]*?)\`\\}`, 'i'));
if (templateMatch) return templateMatch[1].trim();
// Try single quotes: attr='...'
const singleQuoteMatch = tagContent.match(new RegExp(`${attrName}\\s*=\\s*'([\\s\\S]*?)'`, 'i'));
if (singleQuoteMatch) return singleQuoteMatch[1].trim();
// Try double quotes: attr="..."
const doubleQuoteMatch = tagContent.match(new RegExp(`${attrName}\\s*=\\s*"([\\s\\S]*?)"`, 'i'));
if (doubleQuoteMatch) return doubleQuoteMatch[1].trim();
return undefined;
}
/**
* Check if position is inside a Wide component
*/
function isPositionInsideWide(content, position) {
const widePattern = /<Wide[\s\S]*?>([\s\S]*?)<\/Wide>/gi;
let match;
while ((match = widePattern.exec(content)) !== null) {
if (position >= match.index && position < match.index + match[0].length) {
return true;
}
}
return false;
}
/**
* Extract all visual elements from content with their position
* Returns sorted by position (order of appearance)
*/
function extractAllVisualsWithPosition(rawContent) {
const visuals = [];
// Parse image imports before stripping code blocks (imports are never in code blocks)
const imageImports = parseImageImports(rawContent);
// Strip code blocks to avoid extracting components from code examples
const content = stripCodeBlocks(rawContent);
// Extract HtmlEmbeds with position and ALL props
const embedPattern = /<HtmlEmbed/gi;
let match;
while ((match = embedPattern.exec(content)) !== null) {
const position = match.index;
// Find the end of this tag
let pos = position + 10;
let tagContent = '<HtmlEmbed';
let inString = false;
let stringDelim = null;
let inJSXBraces = 0;
while (pos < content.length) {
const char = content[pos];
const prevChar = pos > 0 ? content[pos - 1] : '';
tagContent += char;
if (!inString) {
if ((char === '`' || char === '"' || char === "'") && prevChar !== '\\') {
inString = true;
stringDelim = char;
}
} else {
if (char === stringDelim && prevChar !== '\\') {
inString = false;
stringDelim = null;
}
}
if (!inString) {
if (char === '{') inJSXBraces++;
else if (char === '}') inJSXBraces--;
}
if (!inString && inJSXBraces === 0 && char === '/' && pos + 1 < content.length && content[pos + 1] === '>') {
tagContent += '>';
break;
}
pos++;
}
// Extract all props
const src = extractAttrFromTag('src', tagContent);
if (src) {
const title = extractAttrFromTag('title', tagContent);
const desc = extractAttrFromTag('desc', tagContent);
const id = extractAttrFromTag('id', tagContent);
const data = extractAttrFromTag('data', tagContent);
const frameless = /\bframeless\b/i.test(tagContent);
const wideAttr = /\bwide\b/i.test(tagContent);
const skipGallery = /\bskipGallery\b/i.test(tagContent);
// Parse config if present
let config = null;
const jsxConfigMatch = tagContent.match(/config\s*=\s*\{\{/i);
if (jsxConfigMatch) {
try {
const configStart = tagContent.indexOf('{{', jsxConfigMatch.index) + 2;
let braceCount = 1;
let configEnd = configStart;
for (let i = configStart; i < tagContent.length && braceCount > 0; i++) {
if (tagContent[i] === '{') braceCount++;
if (tagContent[i] === '}') braceCount--;
if (braceCount === 0) configEnd = i;
}
const jsxContent = tagContent.substring(configStart, configEnd).trim();
config = new Function('return ({' + jsxContent + '})')();
} catch (e) {
// Config parsing failed, keep null
}
}
const isWide = isPositionInsideWide(content, position) || wideAttr;
visuals.push({
type: 'embed',
position,
src,
title,
desc,
id,
data,
frameless,
config,
wide: isWide,
skipGallery
});
}
}
// Find all Stack blocks to detect grouped images
const stackBlocks = [];
const stackPattern = /<Stack([\s\S]*?)>([\s\S]*?)<\/Stack>/gi;
while ((match = stackPattern.exec(content)) !== null) {
const stackAttrs = match[1];
const stackContent = match[2];
const stackStart = match.index;
const stackEnd = stackStart + match[0].length;
// Check if this Stack contains <Image> components
const innerImages = [];
const innerImagePattern = /<Image([^>]*)\/?>/gi;
let imgMatch;
while ((imgMatch = innerImagePattern.exec(stackContent)) !== null) {
const tag = imgMatch[0];
const srcM = tag.match(/src\s*=\s*\{([^}]+)\}/i);
if (srcM) {
const varName = srcM[1].trim();
const altM = tag.match(/alt\s*=\s*["']([^"']+)["']/i);
const captionM = tag.match(/caption\s*=\s*["']([^"']+)["']/i);
const imgSkipGallery = /\bskipGallery\b/i.test(tag);
innerImages.push({
src: varName,
resolvedFilename: imageImports.get(varName) || null,
alt: altM ? altM[1] : 'Image',
caption: captionM ? captionM[1] : null,
skipGallery: imgSkipGallery,
});
}
}
if (innerImages.length > 0) {
// Extract Stack layout/gap props
const layoutM = stackAttrs.match(/layout\s*=\s*["']([^"']+)["']/i);
const gapM = stackAttrs.match(/gap\s*=\s*["']([^"']+)["']/i);
// If ALL images have skipGallery, the whole stack is skipped
const allSkipped = innerImages.every(img => img.skipGallery);
stackBlocks.push({ start: stackStart, end: stackEnd });
visuals.push({
type: 'stack',
position: stackStart,
images: innerImages,
layout: layoutM ? layoutM[1] : '2-column',
gap: gapM ? gapM[1] : 'medium',
skipGallery: allSkipped,
});
}
}
// Helper to check if position is inside a Stack block
const isInsideStack = (pos) => {
return stackBlocks.some(b => pos >= b.start && pos < b.end);
};
// Extract standalone Images (not inside Stack)
const imagePattern = /<Image[^>]*\/>/gi;
while ((match = imagePattern.exec(content)) !== null) {
// Skip images already captured inside a Stack
if (isInsideStack(match.index)) continue;
const srcMatch = match[0].match(/src\s*=\s*\{([^}]+)\}/i);
if (srcMatch) {
const varName = srcMatch[1].trim();
const altMatch = match[0].match(/alt\s*=\s*["']([^"']+)["']/i);
const captionMatch = match[0].match(/caption\s*=\s*["']([^"']+)["']/i);
const skipGallery = /\bskipGallery\b/i.test(match[0]);
const resolvedFilename = imageImports.get(varName) || null;
visuals.push({
type: 'image',
position: match.index,
src: varName,
resolvedFilename,
alt: altMatch ? altMatch[1] : 'Image',
caption: captionMatch ? captionMatch[1] : null,
skipGallery,
});
}
}
// Extract Tables with position
const tablePattern = /(\|[^\n]+\|\n\|[-:\s|]+\|\n(?:\|[^\n]+\|\n?)+)/g;
let tableIndex = 0;
while ((match = tablePattern.exec(content)) !== null) {
const tableContent = match[1].trim();
const rows = tableContent.split('\n').filter(row => row.trim());
if (rows.length >= 3) {
const headerRow = rows[0];
const headers = splitTableRow(headerRow)
.filter(cell => cell.trim())
.map(cell => markdownToHtml(cell.trim()));
const expectedCols = headers.length;
const dataRows = rows.slice(2).map(row => {
return parseTableRow(row, expectedCols)
.map(cell => markdownToHtml(cell.trim()));
});
visuals.push({
type: 'table',
position: match.index,
id: `table-${tableIndex++}`,
headers,
rows: dataRows,
});
}
}
// Sort by position (order of appearance)
visuals.sort((a, b) => a.position - b.position);
return visuals;
}
/**
* Load all visual elements (embeds, images, tables) from MDX content files
* Returns them in order of appearance in the article
*/
export function loadAllVisualsFromMDX() {
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
let contentDir = join(__dirname, '../content');
if (!statSync(contentDir, { throwIfNoEntry: false })) {
contentDir = join(__dirname, '../../src/content');
}
if (!statSync(contentDir, { throwIfNoEntry: false })) {
contentDir = join(__dirname, '../../../src/content');
}
const allVisuals = [];
const articleFile = join(contentDir, 'article.mdx');
try {
const articleContent = readFileSync(articleFile, 'utf-8');
// Extract all visual elements from article IN ORDER (with all props)
const articleVisuals = extractAllVisualsWithPosition(articleContent);
articleVisuals.forEach(item => {
item.sourceFile = 'content/article.mdx';
});
allVisuals.push(...articleVisuals);
// Parse chapter order and extract from chapters
const chapterOrder = parseArticleChapters(articleContent, contentDir);
for (const chapterPath of chapterOrder) {
try {
const chapterContent = readFileSync(chapterPath, 'utf-8');
// Extract all visuals IN ORDER from this chapter (with all props)
const chapterVisuals = extractAllVisualsWithPosition(chapterContent);
const relativePath = relative(contentDir, chapterPath);
chapterVisuals.forEach(item => {
item.sourceFile = `content/${relativePath}`;
});
allVisuals.push(...chapterVisuals);
} catch (error) {
console.error(`Error reading chapter ${chapterPath}:`, error);
}
}
// Process other MDX files not already handled
const allMdxFiles = findMdxFiles(contentDir, contentDir, [], false);
const processedFiles = new Set([articleFile, ...chapterOrder]);
for (const filePath of allMdxFiles) {
if (!processedFiles.has(filePath)) {
try {
const rawContent = readFileSync(filePath, 'utf-8');
const fileVisuals = extractAllVisualsWithPosition(rawContent);
const relativePath = relative(contentDir, filePath);
fileVisuals.forEach(item => {
item.sourceFile = `content/${relativePath}`;
});
allVisuals.push(...fileVisuals);
} catch (error) {
console.error(`Error reading ${filePath}:`, error);
}
}
}
} catch (error) {
console.error('Error processing article:', error);
}
// Keep all occurrences (no deduplication)
// Duplicates will be numbered in dataviz.astro (e.g., d3-line-chart, d3-line-chart-2)
return allVisuals;
}